F1 circuit layouts with year-by-year SVGs — manually traced track variations
Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

135 linhas
5.6KB

import json
import logging
import os
import re
from typing import Any, Dict

import requests
from bs4 import BeautifulSoup
  7. logging.basicConfig(level=logging.INFO)
  8. logger = logging.getLogger(__name__)
  9. output_dir = './fandom-circuit-pages'
  10. def load_json_file(file_path: str) -> Dict[str, Any]:
  11. """Load and parse a JSON file"""
  12. try:
  13. with open(file_path, 'r') as f:
  14. return json.load(f)
  15. except (FileNotFoundError, json.JSONDecodeError) as e:
  16. logger.error(f"Error reading {file_path}: {str(e)}")
  17. return {}
  18. def remove_header_from_child(child):
  19. if child:
  20. # Get the h2 parent of the span
  21. h_parent = child.find_parent(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
  22. if h_parent:
  23. # Find all elements after the h2 heading
  24. current = h_parent.next_sibling
  25. while current:
  26. next_element = current.next_sibling
  27. current.decompose()
  28. current = next_element
  29. # Remove the h2 heading itself
  30. h_parent.decompose()
  31. def clean_fandom_page(soup):
  32. # Find the main content div
  33. main_content = soup.find('div', {'id': 'content', 'class': 'page-content'})
  34. if main_content:
  35. # Remove unwanted elements
  36. for element in main_content.find_all(['script', 'style', 'nav']):
  37. element.decompose()
  38. # Replace complex image thumbnails with simple img tags
  39. for figure in main_content.find_all('figure', class_='thumb'):
  40. # Get the image link and alt text
  41. a_tag = figure.find('a', class_='mw-file-description')
  42. if a_tag:
  43. img_url = a_tag.get('href')
  44. # Remove the /revision/latest?cb=XXXXXXXX part using regex
  45. img_url = re.sub(r'/revision/latest\?cb=[0-9]+', '', img_url)
  46. img_tag = figure.find('img')
  47. alt_text = img_tag.get('alt', '') if img_tag else ''
  48. # Get the caption if available
  49. caption_p = figure.find('p', class_='caption')
  50. caption_text = caption_p.get_text(strip=True) if caption_p else ''
  51. # Create a new img tag
  52. new_img = soup.new_tag('img')
  53. new_img['src'] = img_url
  54. new_img['alt'] = alt_text
  55. new_img['title'] = caption_text
  56. # Replace the figure with the new img tag
  57. figure.replace_with(new_img)
  58. # Remove edit section links
  59. for edit_section in main_content.find_all('span', class_='mw-editsection'):
  60. edit_section.decompose()
  61. # Remove the "Notes" section and all references
  62. # First find the notes heading
  63. remove_header_from_child(main_content.find('span', {'id': 'Grand_Prix_Winners_at_Silverstone'}))
  64. remove_header_from_child(main_content.find('span', {'id': 'Per_Grand_Prix'}))
  65. remove_header_from_child(main_content.find('span', {'id': 'Multiple_winners'}))
  66. remove_header_from_child(main_content.find('span', {'id': 'By_Drivers'}))
  67. remove_header_from_child(main_content.find('span', {'id': 'By_Constructors'}))
  68. remove_header_from_child(main_content.find('span', {'id': 'By_Engine_Suppliers'}))
  69. remove_header_from_child(main_content.find('span', {'id': 'Wins_by_Country'}))
  70. remove_header_from_child(main_content.find('span', {'id': 'Drivers'}))
  71. remove_header_from_child(main_content.find('span', {'id': 'Constructors'}))
  72. remove_header_from_child(main_content.find('span', {'id': 'Notes'}))
  73. # Remove all reference wrappers anywhere in the document
  74. for refs in main_content.find_all('div', class_='mw-references-wrap'):
  75. refs.decompose()
  76. for refs in main_content.find_all('table', class_='wikitable'):
  77. refs.decompose()
  78. for refs in main_content.find_all('div', {'id': 'toc'}):
  79. refs.decompose()
  80. for refs in main_content.find_all('aside'):
  81. refs.decompose()
  82. # Remove comment sections
  83. for comments in main_content.find_all('div', class_='comments-wrapper'):
  84. comments.decompose()
  85. return main_content
  86. # Load existing data
  87. existing_data = load_json_file('circuits/circuits.json')
  88. for country, localities in existing_data.items():
  89. for locality, circuits in localities.items():
  90. for circuit_name, circuit_info in circuits.items():
  91. existing_circuit = existing_data.get(country, {}).get(locality, {}).get(circuit_name, {})
  92. fandom_url = circuit_info['urls']['fandom']
  93. if fandom_url is not None:
  94. response = requests.get(fandom_url, allow_redirects=True, timeout=5,
  95. headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'})
  96. soup = BeautifulSoup(response.content, 'html.parser')
  97. cleaned_content = clean_fandom_page(soup)
  98. # Create a new clean HTML document
  99. clean_html = BeautifulSoup("""
  100. <!DOCTYPE html>
  101. <html lang="en">
  102. <head>
  103. <meta charset="UTF-8">
  104. <title>Silverstone Circuit</title>
  105. </head>
  106. <body>
  107. </body>
  108. </html>
  109. """, 'html.parser')
  110. # Add the cleaned content to the body
  111. clean_html.body.append(cleaned_content)
  112. # Save to file with proper formatting
  113. with open(f"{output_dir}/{circuit_name}.html", 'w', encoding='utf-8') as f:
  114. f.write(clean_html.prettify())