F1 circuit layouts with year-by-year SVGs — manually traced track variations
You can select up to 25 topics. Topics must start with a letter or number, and may contain letters, numbers, and dashes ('-'), up to 35 characters.

135 lines
5.6KB

import json
import logging
import os
import re
from typing import Any, Dict

import requests
from bs4 import BeautifulSoup
# Configure the root logger once at import time; INFO and above are emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Directory where the cleaned per-circuit HTML pages are written.
# NOTE(review): the directory is assumed to exist — confirm it is created elsewhere.
output_dir = './fandom-circuit-pages'
  10. def load_json_file(file_path: str) -> Dict[str, Any]:
  11. """Load and parse a JSON file"""
  12. try:
  13. with open(file_path, 'r') as f:
  14. return json.load(f)
  15. except (FileNotFoundError, json.JSONDecodeError) as e:
  16. logger.error(f"Error reading {file_path}: {str(e)}")
  17. return {}
  18. def remove_header_from_child(child):
  19. if child:
  20. # Get the h2 parent of the span
  21. h_parent = child.find_parent(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
  22. if h_parent:
  23. # Find all elements after the h2 heading
  24. current = h_parent.next_sibling
  25. while current:
  26. next_element = current.next_sibling
  27. current.decompose()
  28. current = next_element
  29. # Remove the h2 heading itself
  30. h_parent.decompose()
  31. def clean_fandom_page(soup):
  32. # Find the main content div
  33. main_content = soup.find('div', {'id': 'content', 'class': 'page-content'})
  34. if main_content:
  35. # Remove unwanted elements
  36. for element in main_content.find_all(['script', 'style', 'nav']):
  37. element.decompose()
  38. # Replace complex image thumbnails with simple img tags
  39. for figure in main_content.find_all('figure', class_='thumb'):
  40. # Get the image link and alt text
  41. a_tag = figure.find('a', class_='mw-file-description')
  42. if a_tag:
  43. img_url = a_tag.get('href')
  44. # Remove the /revision/latest?cb=XXXXXXXX part using regex
  45. img_url = re.sub(r'/revision/latest\?cb=[0-9]+', '', img_url)
  46. img_tag = figure.find('img')
  47. alt_text = img_tag.get('alt', '') if img_tag else ''
  48. # Get the caption if available
  49. caption_p = figure.find('p', class_='caption')
  50. caption_text = caption_p.get_text(strip=True) if caption_p else ''
  51. # Create a new img tag
  52. new_img = soup.new_tag('img')
  53. new_img['src'] = img_url
  54. new_img['alt'] = alt_text
  55. new_img['title'] = caption_text
  56. # Replace the figure with the new img tag
  57. figure.replace_with(new_img)
  58. # Remove edit section links
  59. for edit_section in main_content.find_all('span', class_='mw-editsection'):
  60. edit_section.decompose()
  61. # Remove the "Notes" section and all references
  62. # First find the notes heading
  63. remove_header_from_child(main_content.find('span', {'id': 'Grand_Prix_Winners_at_Silverstone'}))
  64. remove_header_from_child(main_content.find('span', {'id': 'Per_Grand_Prix'}))
  65. remove_header_from_child(main_content.find('span', {'id': 'Multiple_winners'}))
  66. remove_header_from_child(main_content.find('span', {'id': 'By_Drivers'}))
  67. remove_header_from_child(main_content.find('span', {'id': 'By_Constructors'}))
  68. remove_header_from_child(main_content.find('span', {'id': 'By_Engine_Suppliers'}))
  69. remove_header_from_child(main_content.find('span', {'id': 'Wins_by_Country'}))
  70. remove_header_from_child(main_content.find('span', {'id': 'Drivers'}))
  71. remove_header_from_child(main_content.find('span', {'id': 'Constructors'}))
  72. remove_header_from_child(main_content.find('span', {'id': 'Notes'}))
  73. # Remove all reference wrappers anywhere in the document
  74. for refs in main_content.find_all('div', class_='mw-references-wrap'):
  75. refs.decompose()
  76. for refs in main_content.find_all('table', class_='wikitable'):
  77. refs.decompose()
  78. for refs in main_content.find_all('div', {'id': 'toc'}):
  79. refs.decompose()
  80. for refs in main_content.find_all('aside'):
  81. refs.decompose()
  82. # Remove comment sections
  83. for comments in main_content.find_all('div', class_='comments-wrapper'):
  84. comments.decompose()
  85. return main_content
  86. # Load existing data
  87. existing_data = load_json_file('circuits/circuits.json')
  88. for country, localities in existing_data.items():
  89. for locality, circuits in localities.items():
  90. for circuit_name, circuit_info in circuits.items():
  91. existing_circuit = existing_data.get(country, {}).get(locality, {}).get(circuit_name, {})
  92. fandom_url = circuit_info['urls']['fandom']
  93. if fandom_url is not None:
  94. response = requests.get(fandom_url, allow_redirects=True, timeout=5,
  95. headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'})
  96. soup = BeautifulSoup(response.content, 'html.parser')
  97. cleaned_content = clean_fandom_page(soup)
  98. # Create a new clean HTML document
  99. clean_html = BeautifulSoup("""
  100. <!DOCTYPE html>
  101. <html lang="en">
  102. <head>
  103. <meta charset="UTF-8">
  104. <title>Silverstone Circuit</title>
  105. </head>
  106. <body>
  107. </body>
  108. </html>
  109. """, 'html.parser')
  110. # Add the cleaned content to the body
  111. clean_html.body.append(cleaned_content)
  112. # Save to file with proper formatting
  113. with open(f"{output_dir}/{circuit_name}.html", 'w', encoding='utf-8') as f:
  114. f.write(clean_html.prettify())