Przeglądaj źródła

Add podcast scraper using Google

Colin Powell 2 lat temu
rodzic
commit
dd66774bda

+ 26 - 0
tests/podcasts_tests/test_scrapers.py

@@ -0,0 +1,26 @@
+from vrobbler.apps.podcasts.scrapers import scrape_data_from_google_podcasts
+
# Expected metadata for NPR's "Up First" as rendered on the Google Podcasts
# search results page.
# NOTE(review): these values are pinned to live site content — they will break
# whenever NPR edits the show description or Google rotates the thumbnail URL.
# Confirm this suite is intended to hit the network rather than a recorded
# fixture (e.g. vcrpy/responses).
expected_desc = (
    "NPR's Up First is the news you need to start your day. "
    "The three biggest stories of the day, with reporting and analysis "
    "from NPR News — in 10 minutes. Available weekdays by 6 a.m. ET, "
    "with hosts Leila Fadel, Steve Inskeep, Rachel Martin and A Martinez. "
    "Also available on Saturdays by 8 a.m. ET, with Ayesha Rascoe and "
    "Scott Simon. On Sundays, hear a longer exploration behind the "
    "headlines with Rachel Martin, available by 8 a.m. ET. Subscribe "
    "and listen, then support your local NPR station at donate.npr.org.  "
    "Support NPR's reporting by subscribing to Up First+ and unlock "
    "sponsor-free listening. Learn more at plus.npr.org/UpFirst"
)

expected_img_url = "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcT-PqtK-bauo8wm8dBE__SVGArlvfBYY8rqxr2kA5UwjKzEx8c"
+
+
# NOTE(review): the name looks copy-pasted from a mopidy test suite — this
# exercises scrape_data_from_google_podcasts, not any mopidy "not allowed"
# path; consider renaming to test_scrape_data_from_google_podcasts.
def test_get_not_allowed_from_mopidy():
    # Issues a real HTTP request to Google Podcasts (no mocking), so this
    # test is network-dependent and brittle against live-site changes.
    query = "Up First"
    result_dict = scrape_data_from_google_podcasts(query)

    # Scraper returns a dict keyed title/description/image_url/publisher.
    assert result_dict["title"] == query
    assert result_dict["description"] == expected_desc
    assert result_dict["image_url"] == expected_img_url
    assert result_dict["publisher"] == "NPR"

+ 73 - 0
vrobbler/apps/podcasts/scrapers.py

@@ -0,0 +1,73 @@
import logging
import urllib
import urllib.parse
from typing import Optional

import requests
from bs4 import BeautifulSoup
+
# Module-level logger, named after this module per project convention.
logger = logging.getLogger(__name__)

# Google Podcasts search endpoint; the search term is interpolated into the
# URL path via str.format. NOTE(review): callers interpolate the raw query —
# it should be URL-encoded (urllib.parse.quote) before formatting.
PODCAST_SEARCH_URL = "https://podcasts.google.com/search/{query}"
+
+
def strip_and_clean(text: str) -> str:
    """Collapse newlines to spaces and trim surrounding whitespace.

    Interior runs of whitespace are preserved (the site's description text
    intentionally contains double spaces), so only newline substitution and
    edge trimming are performed.
    """
    # .strip() is the idiomatic equivalent of .rstrip().lstrip().
    return text.replace("\n", " ").strip()
+
+
def get_title_from_soup(soup) -> Optional[str]:
    """Return the podcast title text from a Google Podcasts result page.

    Returns None when the title element (div.FyxyKd) is not present.
    Fix: the original annotation said ``Optional[int]`` but the function
    returns cleaned text.
    """
    title = None
    try:
        potential_title = soup.find("div", class_="FyxyKd")
        if potential_title:
            title = strip_and_clean(potential_title.get_text())
    except ValueError:
        # Defensive only — BeautifulSoup.find/get_text do not normally
        # raise ValueError; kept to preserve the original contract.
        pass
    return title
+
+
def get_publisher_from_soup(soup) -> str:
    """Return the publisher name from a Google Podcasts result page.

    Falls back to an empty string when the publisher element (div.J3Ov7d)
    is missing or lookup fails.
    """
    try:
        node = soup.find("div", class_="J3Ov7d")
        return strip_and_clean(node.get_text()) if node else ""
    except ValueError:
        return ""
+
+
def get_description_from_soup(soup) -> str:
    """Return the show description from a Google Podcasts result page.

    Falls back to an empty string when the description element (div.yuTZxb)
    is missing or lookup fails.
    """
    try:
        node = soup.find("div", class_="yuTZxb")
        return strip_and_clean(node.get_text()) if node else ""
    except ValueError:
        return ""
+
+
def get_img_url_from_soup(soup) -> str:
    """Return the podcast artwork URL, or "" when no usable image is found.

    Fixes two defects in the original:
    - a missing img tag (``find`` returning None) would raise an uncaught
      TypeError on subscripting; now guarded explicitly.
    - bs4 ``Tag.__getitem__`` raises KeyError for a missing attribute, not
      the IndexError the original caught.
    """
    url = ""
    img_tag = soup.find("img", class_="BhVIWc")
    if img_tag is not None:
        try:
            url = img_tag["src"]
        except KeyError:
            # Tag present but has no src attribute — treat as "no image".
            pass
    return url
+
+
def scrape_data_from_google_podcasts(title) -> dict:
    """Scrape podcast metadata from the Google Podcasts search page.

    Args:
        title: Search term, typically the podcast's name.

    Returns:
        Dict with keys ``title``, ``description``, ``publisher`` and
        ``image_url`` on success; an empty dict when the page cannot be
        fetched or does not return HTTP 200.

    Fixes: the query is now URL-quoted (``urllib`` was imported but never
    used — spaces/reserved characters would corrupt the path), the request
    has a timeout so a hung server cannot block forever, and network errors
    are logged instead of crashing this best-effort scraper.
    """
    data_dict = {}
    headers = {"User-Agent": "Vrobbler 0.11.12"}
    url = PODCAST_SEARCH_URL.format(query=urllib.parse.quote(title))
    try:
        r = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        logger.warning("Error fetching podcast data from %s", url)
        return data_dict
    if r.status_code == 200:
        # "html.parser" names the stdlib parser explicitly; the bare "html"
        # feature string leaves parser selection up to bs4's installed set.
        soup = BeautifulSoup(r.text, "html.parser")
        data_dict["title"] = get_title_from_soup(soup)
        data_dict["description"] = get_description_from_soup(soup)
        data_dict["publisher"] = get_publisher_from_soup(soup)
        data_dict["image_url"] = get_img_url_from_soup(soup)
    return data_dict