47 lines
2.1 KiB
Python
47 lines
2.1 KiB
Python
```python
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import json
|
|
|
|
# Module-level accumulator filled by scrape_year_data(): one dict per riddle
# with the keys 'riddle', 'answer', 'count' and 'year'.
riddles = []
|
|
|
|
def scrape_year_data(scrap_year):
    """Scrape one year's Père Fouras riddles and append them to ``riddles``.

    Fetches the fan-fortboyard.fr riddle page for ``scrap_year`` and scans
    every ``<p>`` element. A paragraph is treated as a riddle when its first
    ``<span>`` contains a ``<strong>`` whose stripped text is all digits (the
    riddle number). The answer is the text of the last ``<span>`` in the
    paragraph that carries a ``style`` attribute. For each riddle a dict with
    the keys ``'riddle'``, ``'answer'``, ``'count'`` and ``'year'`` is
    appended to the module-level ``riddles`` list.

    Failures are reported with ``print`` rather than raised, so one bad year
    does not stop a multi-year run.

    Args:
        scrap_year: Year to scrape (compared with ``>``, so expected to be an
            int). Years after 2019 use the ``enigmes-<year>.html`` page name,
            earlier years use ``<year>.html``.

    Returns:
        None. Results are appended to ``riddles`` as a side effect.
    """
    base_url = 'https://www.fan-fortboyard.fr/pages/fanzone/enigmes-du-pere-fouras/'
    # The site switched its page naming scheme after the 2019 pages.
    if scrap_year > 2019:
        url = f'{base_url}enigmes-{scrap_year}.html'
    else:
        url = f'{base_url}{scrap_year}.html'

    try:
        # Without a timeout, requests may block forever on a stalled server.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f'Failed to retrieve data for year {scrap_year}: {exc}')
        return

    if response.status_code != 200:
        print(f'Failed to retrieve data for year {scrap_year}: {response.status_code}')
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    for paragraph in soup.find_all('p'):
        span = paragraph.find('span')
        strong_tag = span.find('strong') if span else None
        if not strong_tag:
            continue

        # Only paragraphs that open with a numeric riddle number are riddles.
        riddle_count = strong_tag.text.strip()
        if not riddle_count.isdigit():
            continue

        styled_spans = paragraph.find_all('span', style=True)
        if not styled_spans:
            print(f'No answer found : {scrap_year}-{riddle_count}')
            continue
        answer_text = styled_spans[-1].get_text(strip=True)

        # Turn <br> tags into newlines before flattening the markup to text.
        riddle_html = paragraph.decode_contents()
        riddle_html = riddle_html.replace('<br/>', '\n').replace('<br>', '\n')
        riddle_text = BeautifulSoup(riddle_html, 'html.parser').get_text(strip=True)

        # Drop everything up to the first ')' (the leading number marker).
        riddle_text = riddle_text.split(')', 1)[-1].strip()
        # Cut the answer section off the riddle body when it is present.
        if "RÉPONSE" in riddle_text:
            riddle_text = riddle_text.split("RÉPONSE", 1)[0].strip()

        riddles.append({
            'riddle': riddle_text,
            'answer': answer_text,
            'count': riddle_count,
            'year': scrap_year,
        })
|
|
|
|
# Scrape every year from 2007 through 2022; scrape_year_data() appends its
# results to the module-level `riddles` list.
for year in range(2007, 2023):
    scrape_year_data(year)

# Write the collected riddles as indented UTF-8 JSON, keeping accented
# characters literal rather than \u-escaped.
with open('riddles.json', mode='w', encoding='utf-8') as json_file:
    serialized = json.dumps(riddles, indent=4, ensure_ascii=False)
    json_file.write(serialized)
|
|
``` |