|
import json
import logging
import os
import re
from typing import Any, Dict

import requests
from bs4 import BeautifulSoup
-
# Module-level logging: INFO and above to the root handler.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Directory where the cleaned per-circuit HTML pages are written.
output_dir = './fandom-circuit-pages'
-
def load_json_file(file_path: str) -> Dict[str, Any]:
    """Load and parse a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON object, or an empty dict when the file is
        missing or contains invalid JSON (the failure is logged).
    """
    try:
        # Explicit encoding: JSON files are UTF-8 (RFC 8259); relying on
        # the platform default encoding can corrupt non-ASCII content.
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        logger.error(f"Error reading {file_path}: {str(e)}")
        return {}
-
def remove_header_from_child(child):
    """Delete a whole page section, given an element inside its heading.

    Walks up from *child* to the nearest h1-h6 ancestor, removes every
    sibling that follows that heading in document order, then removes the
    heading itself. Does nothing when *child* is falsy (e.g. a failed
    ``find``) or has no heading ancestor.
    """
    if not child:
        return
    heading = child.find_parent(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    if not heading:
        return
    # Drop everything after the heading. Grab the next sibling before
    # decomposing, since decompose() detaches the node from the tree.
    sibling = heading.next_sibling
    while sibling:
        following = sibling.next_sibling
        sibling.decompose()
        sibling = following
    # Finally drop the heading itself.
    heading.decompose()
-
# Ids of the heading <span>s whose whole sections are stripped from pages
# (statistics tables, winner lists, notes/references).
_SECTION_IDS = (
    'Grand_Prix_Winners_at_Silverstone',
    'Per_Grand_Prix',
    'Multiple_winners',
    'By_Drivers',
    'By_Constructors',
    'By_Engine_Suppliers',
    'Wins_by_Country',
    'Drivers',
    'Constructors',
    'Notes',
)


def _simplify_thumbnails(soup, main_content):
    """Replace each <figure class="thumb"> with a plain <img> tag.

    Figures without a usable image link are left untouched.
    """
    for figure in main_content.find_all('figure', class_='thumb'):
        a_tag = figure.find('a', class_='mw-file-description')
        if not a_tag:
            continue
        img_url = a_tag.get('href')
        if not img_url:
            # Anchor without an href: re.sub(None) would raise TypeError,
            # so keep the figure as-is rather than crash.
            continue
        # Strip the "/revision/latest?cb=XXXXXXXX" suffix fandom appends.
        img_url = re.sub(r'/revision/latest\?cb=[0-9]+', '', img_url)

        img_tag = figure.find('img')
        alt_text = img_tag.get('alt', '') if img_tag else ''
        caption_p = figure.find('p', class_='caption')
        caption_text = caption_p.get_text(strip=True) if caption_p else ''

        new_img = soup.new_tag('img')
        new_img['src'] = img_url
        new_img['alt'] = alt_text
        new_img['title'] = caption_text
        figure.replace_with(new_img)


def clean_fandom_page(soup):
    """Reduce a fandom wiki page to its bare article content.

    Args:
        soup: BeautifulSoup tree of a full fandom HTML page.

    Returns:
        The cleaned main-content div, or None when the page has no
        ``div#content.page-content`` (unexpected layout / error page).
    """
    main_content = soup.find('div', {'id': 'content', 'class': 'page-content'})
    if not main_content:
        return main_content

    # Drop scripting, styling and navigation chrome.
    for element in main_content.find_all(['script', 'style', 'nav']):
        element.decompose()

    # Replace complex image thumbnails with simple <img> tags.
    _simplify_thumbnails(soup, main_content)

    # Remove "[edit]" links next to headings.
    for edit_section in main_content.find_all('span', class_='mw-editsection'):
        edit_section.decompose()

    # Strip the statistics / notes sections wholesale.
    for section_id in _SECTION_IDS:
        remove_header_from_child(main_content.find('span', {'id': section_id}))

    # Remove reference wrappers, data tables, the table of contents and
    # infobox asides anywhere in the document.
    for refs in main_content.find_all('div', class_='mw-references-wrap'):
        refs.decompose()
    for table in main_content.find_all('table', class_='wikitable'):
        table.decompose()
    for toc in main_content.find_all('div', {'id': 'toc'}):
        toc.decompose()
    for aside in main_content.find_all('aside'):
        aside.decompose()

    # Remove comment sections.
    for comments in main_content.find_all('div', class_='comments-wrapper'):
        comments.decompose()

    return main_content
-
# Load the circuit index: {country: {locality: {circuit_name: info}}}.
existing_data = load_json_file('circuits/circuits.json')

# Make sure the destination directory exists before writing any pages.
os.makedirs(output_dir, exist_ok=True)

for country, localities in existing_data.items():
    for locality, circuits in localities.items():
        for circuit_name, circuit_info in circuits.items():
            # .get chain: tolerate entries without a 'urls'/'fandom' key
            # instead of raising KeyError mid-run.
            fandom_url = circuit_info.get('urls', {}).get('fandom')
            if fandom_url is None:
                continue

            try:
                response = requests.get(
                    fandom_url, allow_redirects=True, timeout=5,
                    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'})
                # Treat 4xx/5xx as failures instead of parsing error pages.
                response.raise_for_status()
            except requests.RequestException as e:
                logger.error(f"Failed to fetch {fandom_url}: {e}")
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            cleaned_content = clean_fandom_page(soup)
            if cleaned_content is None:
                # Page had no main-content div; appending None would crash.
                logger.warning(f"No main content found for {circuit_name}, skipping")
                continue

            # Minimal skeleton document. The title is the circuit's own
            # name (was previously hard-coded to "Silverstone Circuit"
            # for every page).
            clean_html = BeautifulSoup(f"""
            <!DOCTYPE html>
            <html lang="en">
            <head>
            <meta charset="UTF-8">
            <title>{circuit_name}</title>
            </head>
            <body>
            </body>
            </html>
            """, 'html.parser')

            # Move the cleaned content into the skeleton's body.
            clean_html.body.append(cleaned_content)

            # Save to file with proper formatting.
            with open(f"{output_dir}/{circuit_name}.html", 'w', encoding='utf-8') as f:
                f.write(clean_html.prettify())
|