|
@@ -0,0 +1,61 @@
|
|
|
+from typing import Optional
|
|
|
+import requests
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+
|
|
|
+
|
|
|
def get_description_from_soup(soup) -> Optional[int]:
    """Return the integer rating found in *soup*, or ``None``.

    Despite its name, this function extracts a rating, not a description: it
    looks up the first ``<div class="allmusic-rating">`` and parses its text
    as an integer.

    Args:
        soup: A parsed document exposing BeautifulSoup's ``find`` API.

    Returns:
        The rating as an ``int``, or ``None`` when the element is absent or
        its text is not a valid integer.
    """
    rating_tag = soup.find("div", class_="allmusic-rating")
    if rating_tag is None:
        return None
    try:
        # get_text(strip=True) trims surrounding whitespace; the previous
        # helper used here was not defined anywhere in this module.
        return int(rating_tag.get_text(strip=True))
    except ValueError:
        # Non-numeric rating text is treated the same as a missing rating.
        return None
|
|
|
+
|
|
|
+
|
|
|
def _extract_stats(soup):
    """Pair consecutive stat ``<span>`` elements into a ``{label: value}`` dict."""
    spans = soup.select("div.flex.flex-col.items-start.text-sm span")
    # Spans are read as alternating label, value; an unpaired trailing span
    # is dropped because zip stops at the shorter slice.
    return {
        label.get_text(strip=True): value.get_text(strip=True)
        for label, value in zip(spans[::2], spans[1::2])
    }


def _extract_overview(soup):
    """Return the ``<p>`` text following the 'Overview' ``<h2>``, or ``None``."""
    heading = soup.find(
        lambda tag: tag.name == "h2" and "Overview" in tag.text
    )
    if not heading:
        return None

    paragraphs = []
    for sibling in heading.next_siblings:
        # Stop at the next sibling whose tag name starts with "h". This
        # covers h1-h6 but also matches e.g. <hr> and <header>. The
        # truthiness check skips nodes that have no tag name.
        if sibling.name and sibling.name.startswith("h"):
            break
        if sibling.name == "p":
            paragraphs.append(sibling.get_text(strip=True))
    return "\n\n".join(paragraphs).strip()


def scrape_gaia_trail(url, timeout=10.0):
    """Fetch a trail page and extract its title, stats and overview text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed to
            ``requests.get``. Without a timeout the request could block
            indefinitely.

    Returns:
        A dict with the keys ``"url"``, ``"title"``, ``"stats"`` and
        ``"description"``. ``"title"`` is ``None`` when the page has no
        ``<h1>``; ``"description"`` is ``None`` when no ``<h2>`` containing
        "Overview" is found.

    Raises:
        requests.HTTPError: If the response status indicates an error
            (raised by ``raise_for_status``).
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # A page without an <h1> yields a None title instead of an AttributeError.
    title_tag = soup.select_one("h1")
    title = title_tag.get_text(strip=True) if title_tag is not None else None

    return {
        "url": url,
        "title": title,
        "stats": _extract_stats(soup),
        "description": _extract_overview(soup),
    }
|
|
|
+
|
|
|
+
|
|
|
# Example usage. Guarded so that importing this module does not trigger a
# network request or print anything; it only runs when executed as a script.
if __name__ == "__main__":
    url = "https://www.gaiagps.com/hike/318136/"
    trail_data = scrape_gaia_trail(url)
    print(trail_data)
|