added riddle scrapper v1

This commit is contained in:
Anselme FRANÇOIS 2025-01-23 16:53:57 +01:00
parent eaf92abff0
commit 929b9af254

47
riddle_scrapper.py Normal file
View File

@ -0,0 +1,47 @@
```python
import requests
from bs4 import BeautifulSoup
import json
# Module-level accumulator: scrape_year_data() appends one dict per riddle
# (keys 'riddle', 'answer', 'count', 'year'); dumped to riddles.json at the end.
riddles = []
def scrape_year_data(scrap_year):
    """Scrape one year's riddle page and append its riddles to ``riddles``.

    Downloads the page for ``scrap_year`` and inspects every ``<p>`` element.
    A paragraph is treated as a riddle when its first ``<span>`` contains a
    ``<strong>`` whose stripped text consists only of digits (the riddle
    number). For each such paragraph a dict with the keys ``'riddle'``,
    ``'answer'``, ``'count'`` and ``'year'`` is appended to the module-level
    ``riddles`` list.

    Args:
        scrap_year: Year to scrape. It is compared against 2019 to select the
            URL pattern and is stored unchanged under the ``'year'`` key.

    Returns:
        None. Results are appended to ``riddles``; a non-200 response or a
        riddle paragraph without a styled ``<span>`` is reported via ``print``.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests.get``
            (connection errors, or no response within the timeout).
    """
    base_url = 'https://www.fan-fortboyard.fr/pages/fanzone/enigmes-du-pere-fouras/'
    # Pages for years after 2019 carry an "enigmes-" prefix in the file name.
    if scrap_year > 2019:
        url = f'{base_url}enigmes-{scrap_year}.html'
    else:
        url = f'{base_url}{scrap_year}.html'

    # Bound the wait so an unresponsive server cannot hang the whole run.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        # Use the function's own argument here: the previous code read a
        # global named `year`, which only exists when the caller happens to
        # loop with that exact variable name.
        print(f'Failed to retrieve data for year {scrap_year}: {response.status_code}')
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    for paragraph in soup.find_all('p'):
        # Only paragraphs whose first <span> holds a <strong> are candidates.
        span = paragraph.find('span')
        strong_tag = span.find('strong') if span else None
        if not strong_tag:
            continue

        # The <strong> text must be purely numeric to count as a riddle number.
        riddle_count = strong_tag.text.strip()
        if not riddle_count.isdigit():
            continue

        # The answer is read from the last <span> that has a style attribute.
        # NOTE(review): the source's indentation was lost; this "no answer"
        # branch is paired with the styled-span check -- confirm.
        answer_spans = paragraph.find_all('span', style=True)
        if not answer_spans:
            print(f'No answer found : {scrap_year}-{riddle_count}')
            continue
        answer_text = answer_spans[-1].get_text(strip=True)

        # Turn <br> tags into newlines, then flatten the markup to plain text.
        riddle_text = paragraph.decode_contents()
        riddle_text = riddle_text.replace('<br/>', '\n').replace('<br>', '\n')
        riddle_text = BeautifulSoup(riddle_text, 'html.parser').get_text(strip=True)
        # Drop everything up to and including the first ')'
        # (presumably the "N)" numbering prefix -- verify against the pages).
        riddle_text = riddle_text.split(')', 1)[-1].strip()
        # Keep only the text that precedes the "RÉPONSE" marker, if present.
        if "RÉPONSE" in riddle_text:
            riddle_text = riddle_text.split("RÉPONSE", 1)[0].strip()

        riddles.append({
            'riddle': riddle_text,
            'answer': answer_text,
            'count': riddle_count,
            'year': scrap_year,
        })
# Scrape every yearly page from 2007 through 2022 (the range end is exclusive);
# each call appends what it finds to the module-level `riddles` list.
for year in range(2007, 2023):
    scrape_year_data(year)

# Write the collected riddles as 4-space-indented JSON, keeping non-ASCII
# characters literal instead of \u-escaping them.
with open('riddles.json', mode='w', encoding='utf-8') as output:
    output.write(json.dumps(riddles, ensure_ascii=False, indent=4))
```