"""Scrape the Père Fouras riddles from fan-fortboyard.fr into ``riddles.json``.

Running this module downloads one page per year (2007 through 2022),
extracts every numbered riddle together with its answer, and writes the
collected records to ``riddles.json`` in the current working directory.
"""

import json

import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://www.fan-fortboyard.fr/pages/fanzone/enigmes-du-pere-fouras/'

# Seconds to wait for the server before giving up on a page; without a
# timeout a single stalled connection would block the whole run.
REQUEST_TIMEOUT = 30

# Accumulates one dict per riddle across all calls to scrape_year_data().
riddles = []


def scrape_year_data(scrap_year):
    """Download the riddle page for *scrap_year* and append its riddles.

    Each riddle found is appended to the module-level ``riddles`` list as a
    dict with the keys ``riddle``, ``answer``, ``count`` and ``year``.

    A page that cannot be fetched (transport error or non-200 status) is
    reported on stdout and skipped; nothing is raised for it.

    Args:
        scrap_year: Year whose page should be scraped. Years after 2019 use
            the ``enigmes-<year>.html`` page name, earlier years use
            ``<year>.html``.

    Returns:
        None. Results are delivered through the ``riddles`` list.
    """
    if scrap_year > 2019:
        url = f'{BASE_URL}enigmes-{scrap_year}.html'
    else:
        url = f'{BASE_URL}{scrap_year}.html'

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f'Failed to retrieve data for year {scrap_year}: {exc}')
        return

    if response.status_code != 200:
        print(f'Failed to retrieve data for year {scrap_year}: {response.status_code}')
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    for paragraph in soup.find_all('p'):
        # A riddle paragraph opens with a <span> holding a <strong> whose
        # text is the riddle number; anything else is skipped.
        span = paragraph.find('span')
        strong_tag = span.find('strong') if span is not None else None
        if strong_tag is None:
            continue
        riddle_count = strong_tag.text.strip()
        if not riddle_count.isdigit():
            continue

        # The answer is taken from the last <span> carrying a style attribute.
        styled_spans = paragraph.find_all('span', style=True)
        if not styled_spans:
            print(f'No answer found : {scrap_year}-{riddle_count}')
            continue
        answer_text = styled_spans[-1].get_text(strip=True)

        # Turn line-break tags into newline characters in the serialized
        # markup, then re-parse it to extract plain text.
        # NOTE(review): the two tag literals were lost in the source as
        # received (they appeared as raw line breaks); '<br/>' and '<br>'
        # are the assumed originals -- confirm against the upstream file.
        markup = paragraph.decode_contents()
        markup = markup.replace('<br/>', '\n').replace('<br>', '\n')
        riddle_text = BeautifulSoup(markup, 'html.parser').get_text(strip=True)

        # Drop everything up to and including the first ')' (presumably the
        # "<number>)" prefix), then cut the text at the answer marker.
        riddle_text = riddle_text.split(')', 1)[-1].strip()
        riddle_text = riddle_text.split('RÉPONSE', 1)[0].strip()

        riddles.append({
            'riddle': riddle_text,
            'answer': answer_text,
            'count': riddle_count,
            'year': scrap_year,
        })


for year in range(2007, 2023):
    scrape_year_data(year)

# ensure_ascii=False keeps accented characters readable in the output file.
with open('riddles.json', 'w', encoding='utf-8') as json_file:
    json.dump(riddles, json_file, indent=4, ensure_ascii=False)