|
import json
import logging
import os
import re
from typing import Any, Dict

import requests
from bs4 import BeautifulSoup
-
# Module-level logging: INFO and above to the root handler.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Directory where the cleaned per-circuit HTML pages are written.
output_dir = './fandom-circuit-pages'
-
def load_json_file(file_path: str) -> Dict[str, Any]:
    """Load and parse a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON object, or an empty dict when the file is
        missing or contains invalid JSON (the failure is logged).
    """
    try:
        # Explicit encoding: JSON files are UTF-8 (RFC 8259); relying on
        # the platform default encoding can corrupt non-ASCII content.
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        logger.error(f"Error reading {file_path}: {str(e)}")
        return {}
-
def remove_header_from_child(child):
    """Delete a whole page section, given an element inside its heading.

    Walks up from *child* to the nearest h1-h6 ancestor, removes every
    sibling that follows that heading in document order, then removes the
    heading itself. Does nothing when *child* is falsy (e.g. a failed
    ``find``) or has no heading ancestor.
    """
    if not child:
        return
    heading = child.find_parent(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    if not heading:
        return
    # Drop everything after the heading. Grab the next sibling before
    # decomposing, since decompose() detaches the node from the tree.
    sibling = heading.next_sibling
    while sibling:
        following = sibling.next_sibling
        sibling.decompose()
        sibling = following
    # Finally drop the heading itself.
    heading.decompose()
-
# Ids of the heading <span>s whose whole sections are stripped from pages
# (statistics tables, winner lists, notes/references).
_SECTION_IDS = (
    'Grand_Prix_Winners_at_Silverstone',
    'Per_Grand_Prix',
    'Multiple_winners',
    'By_Drivers',
    'By_Constructors',
    'By_Engine_Suppliers',
    'Wins_by_Country',
    'Drivers',
    'Constructors',
    'Notes',
)


def _simplify_thumbnails(soup, main_content):
    """Replace each <figure class="thumb"> with a plain <img> tag.

    Figures without a usable image link are left untouched.
    """
    for figure in main_content.find_all('figure', class_='thumb'):
        a_tag = figure.find('a', class_='mw-file-description')
        if not a_tag:
            continue
        img_url = a_tag.get('href')
        if not img_url:
            # Anchor without an href: re.sub(None) would raise TypeError,
            # so keep the figure as-is rather than crash.
            continue
        # Strip the "/revision/latest?cb=XXXXXXXX" suffix fandom appends.
        img_url = re.sub(r'/revision/latest\?cb=[0-9]+', '', img_url)

        img_tag = figure.find('img')
        alt_text = img_tag.get('alt', '') if img_tag else ''
        caption_p = figure.find('p', class_='caption')
        caption_text = caption_p.get_text(strip=True) if caption_p else ''

        new_img = soup.new_tag('img')
        new_img['src'] = img_url
        new_img['alt'] = alt_text
        new_img['title'] = caption_text
        figure.replace_with(new_img)


def clean_fandom_page(soup):
    """Reduce a fandom wiki page to its bare article content.

    Args:
        soup: BeautifulSoup tree of a full fandom HTML page.

    Returns:
        The cleaned main-content div, or None when the page has no
        ``div#content.page-content`` (unexpected layout / error page).
    """
    main_content = soup.find('div', {'id': 'content', 'class': 'page-content'})
    if not main_content:
        return main_content

    # Drop scripting, styling and navigation chrome.
    for element in main_content.find_all(['script', 'style', 'nav']):
        element.decompose()

    # Replace complex image thumbnails with simple <img> tags.
    _simplify_thumbnails(soup, main_content)

    # Remove "[edit]" links next to headings.
    for edit_section in main_content.find_all('span', class_='mw-editsection'):
        edit_section.decompose()

    # Strip the statistics / notes sections wholesale.
    for section_id in _SECTION_IDS:
        remove_header_from_child(main_content.find('span', {'id': section_id}))

    # Remove reference wrappers, data tables, the table of contents and
    # infobox asides anywhere in the document.
    for refs in main_content.find_all('div', class_='mw-references-wrap'):
        refs.decompose()
    for table in main_content.find_all('table', class_='wikitable'):
        table.decompose()
    for toc in main_content.find_all('div', {'id': 'toc'}):
        toc.decompose()
    for aside in main_content.find_all('aside'):
        aside.decompose()

    # Remove comment sections.
    for comments in main_content.find_all('div', class_='comments-wrapper'):
        comments.decompose()

    return main_content
-
# Load the circuit index: {country: {locality: {circuit_name: info}}}.
existing_data = load_json_file('circuits/circuits.json')

# Make sure the destination directory exists before writing any pages.
os.makedirs(output_dir, exist_ok=True)

for country, localities in existing_data.items():
    for locality, circuits in localities.items():
        for circuit_name, circuit_info in circuits.items():
            # .get chain: tolerate entries without a 'urls'/'fandom' key
            # instead of raising KeyError mid-run.
            fandom_url = circuit_info.get('urls', {}).get('fandom')
            if fandom_url is None:
                continue

            try:
                response = requests.get(
                    fandom_url, allow_redirects=True, timeout=5,
                    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'})
                # Treat 4xx/5xx as failures instead of parsing error pages.
                response.raise_for_status()
            except requests.RequestException as e:
                logger.error(f"Failed to fetch {fandom_url}: {e}")
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            cleaned_content = clean_fandom_page(soup)
            if cleaned_content is None:
                # Page had no main-content div; appending None would crash.
                logger.warning(f"No main content found for {circuit_name}, skipping")
                continue

            # Minimal skeleton document. The title is the circuit's own
            # name (was previously hard-coded to "Silverstone Circuit"
            # for every page).
            clean_html = BeautifulSoup(f"""
            <!DOCTYPE html>
            <html lang="en">
            <head>
            <meta charset="UTF-8">
            <title>{circuit_name}</title>
            </head>
            <body>
            </body>
            </html>
            """, 'html.parser')

            # Move the cleaned content into the skeleton's body.
            clean_html.body.append(cleaned_content)

            # Save to file with proper formatting.
            with open(f"{output_dir}/{circuit_name}.html", 'w', encoding='utf-8') as f:
                f.write(clean_html.prettify())
|