import json
import logging
import re
from typing import Any, Dict

import requests
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

output_dir = './fandom-circuit-pages'


def load_json_file(file_path: str) -> Dict[str, Any]:
    """Load and parse a JSON file."""
    try:
        with open(file_path, 'r') as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        logger.error(f"Error reading {file_path}: {str(e)}")
        return {}


def remove_header_from_child(child):
    """Remove the heading that contains `child` plus everything that follows it."""
    if child:
        # Get the heading ancestor of the span
        h_parent = child.find_parent(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        if h_parent:
            # Remove all elements after the heading
            current = h_parent.next_sibling
            while current:
                next_element = current.next_sibling
                # NavigableString nodes have no decompose(), so fall back to extract()
                if hasattr(current, 'decompose'):
                    current.decompose()
                else:
                    current.extract()
                current = next_element
            # Remove the heading itself
            h_parent.decompose()


def clean_fandom_page(soup):
    # Find the main content div
    main_content = soup.find('div', {'id': 'content', 'class': 'page-content'})
    if main_content:
        # Remove unwanted elements
        for element in main_content.find_all(['script', 'style', 'nav']):
            element.decompose()

        # Replace complex image thumbnails with simple img tags
        for figure in main_content.find_all('figure', class_='thumb'):
            # Get the image link and alt text
            a_tag = figure.find('a', class_='mw-file-description')
            if a_tag and a_tag.get('href'):
                img_url = a_tag.get('href')
                # Remove the /revision/latest?cb=XXXXXXXX part using regex
                img_url = re.sub(r'/revision/latest\?cb=[0-9]+', '', img_url)
                img_tag = figure.find('img')
                alt_text = img_tag.get('alt', '') if img_tag else ''
                # Get the caption if available
                caption_p = figure.find('p', class_='caption')
                caption_text = caption_p.get_text(strip=True) if caption_p else ''
                # Create a new img tag
                new_img = soup.new_tag('img')
                new_img['src'] = img_url
                new_img['alt'] = alt_text
                new_img['title'] = caption_text
                # Replace the figure with the new img tag
                figure.replace_with(new_img)

        # Remove edit section links
        for edit_section in main_content.find_all('span', class_='mw-editsection'):
            edit_section.decompose()

        # Remove the statistics and "Notes" sections, located via their heading span ids
        remove_header_from_child(main_content.find('span', {'id': 'Grand_Prix_Winners_at_Silverstone'}))
        remove_header_from_child(main_content.find('span', {'id': 'Per_Grand_Prix'}))
        remove_header_from_child(main_content.find('span', {'id': 'Multiple_winners'}))
        remove_header_from_child(main_content.find('span', {'id': 'By_Drivers'}))
        remove_header_from_child(main_content.find('span', {'id': 'By_Constructors'}))
        remove_header_from_child(main_content.find('span', {'id': 'By_Engine_Suppliers'}))
        remove_header_from_child(main_content.find('span', {'id': 'Wins_by_Country'}))
        remove_header_from_child(main_content.find('span', {'id': 'Drivers'}))
        remove_header_from_child(main_content.find('span', {'id': 'Constructors'}))
        remove_header_from_child(main_content.find('span', {'id': 'Notes'}))

        # Remove reference wrappers, wikitables, the table of contents, and aside elements
        for refs in main_content.find_all('div', class_='mw-references-wrap'):
            refs.decompose()
        for table in main_content.find_all('table', class_='wikitable'):
            table.decompose()
        for toc in main_content.find_all('div', {'id': 'toc'}):
            toc.decompose()
        for aside in main_content.find_all('aside'):
            aside.decompose()

        # Remove comment sections
        for comments in main_content.find_all('div', class_='comments-wrapper'):
            comments.decompose()

    return main_content


# Load existing data
existing_data = load_json_file('circuits/circuits.json')

for country, localities in existing_data.items():
    for locality, circuits in localities.items():
        for circuit_name, circuit_info in circuits.items():
            existing_circuit = existing_data.get(country, {}).get(locality, {}).get(circuit_name, {})
            fandom_url = circuit_info['urls']['fandom']
            if fandom_url is not None:
                response = requests.get(
                    fandom_url,
                    allow_redirects=True,
                    timeout=5,
                    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'},
                )
                soup = BeautifulSoup(response.content, 'html.parser')
                cleaned_content = clean_fandom_page(soup)

                # Create a new clean HTML document
                clean_html = BeautifulSoup("""