Przeglądaj źródła

Add podcast scraper using Google

Colin Powell 2 lat temu
rodzic
commit
dd66774bda

+ 26 - 0
tests/podcasts_tests/test_scrapers.py

@@ -0,0 +1,26 @@
+from vrobbler.apps.podcasts.scrapers import scrape_data_from_google_podcasts
+
# Expected metadata for NPR's "Up First" as rendered on the Google Podcasts
# search results page.
# NOTE(review): these values are pinned to live site content — they will break
# whenever NPR edits the show description or Google rotates the thumbnail URL.
# Confirm this suite is intended to hit the network rather than a recorded
# fixture (e.g. vcrpy/responses).
expected_desc = (
    "NPR's Up First is the news you need to start your day. "
    "The three biggest stories of the day, with reporting and analysis "
    "from NPR News — in 10 minutes. Available weekdays by 6 a.m. ET, "
    "with hosts Leila Fadel, Steve Inskeep, Rachel Martin and A Martinez. "
    "Also available on Saturdays by 8 a.m. ET, with Ayesha Rascoe and "
    "Scott Simon. On Sundays, hear a longer exploration behind the "
    "headlines with Rachel Martin, available by 8 a.m. ET. Subscribe "
    "and listen, then support your local NPR station at donate.npr.org.  "
    "Support NPR's reporting by subscribing to Up First+ and unlock "
    "sponsor-free listening. Learn more at plus.npr.org/UpFirst"
)

expected_img_url = "https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcT-PqtK-bauo8wm8dBE__SVGArlvfBYY8rqxr2kA5UwjKzEx8c"
+
+
# NOTE(review): the name looks copy-pasted from a mopidy test suite — this
# exercises scrape_data_from_google_podcasts, not any mopidy "not allowed"
# path; consider renaming to test_scrape_data_from_google_podcasts.
def test_get_not_allowed_from_mopidy():
    # Issues a real HTTP request to Google Podcasts (no mocking), so this
    # test is network-dependent and brittle against live-site changes.
    query = "Up First"
    result_dict = scrape_data_from_google_podcasts(query)

    # Scraper returns a dict keyed title/description/image_url/publisher.
    assert result_dict["title"] == query
    assert result_dict["description"] == expected_desc
    assert result_dict["image_url"] == expected_img_url
    assert result_dict["publisher"] == "NPR"

+ 73 - 0
vrobbler/apps/podcasts/scrapers.py

@@ -0,0 +1,73 @@
import logging
import urllib
import urllib.parse
from typing import Optional

import requests
from bs4 import BeautifulSoup
+
# Module-level logger, named after this module per project convention.
logger = logging.getLogger(__name__)

# Google Podcasts search endpoint; the search term is interpolated into the
# URL path via str.format. NOTE(review): callers interpolate the raw query —
# it should be URL-encoded (urllib.parse.quote) before formatting.
PODCAST_SEARCH_URL = "https://podcasts.google.com/search/{query}"
+
+
def strip_and_clean(text: str) -> str:
    """Collapse newlines to spaces and trim surrounding whitespace.

    Interior runs of whitespace are preserved (the site's description text
    intentionally contains double spaces), so only newline substitution and
    edge trimming are performed.
    """
    # .strip() is the idiomatic equivalent of .rstrip().lstrip().
    return text.replace("\n", " ").strip()
+
+
def get_title_from_soup(soup) -> Optional[str]:
    """Return the podcast title text from a Google Podcasts result page.

    Returns None when the title element (div.FyxyKd) is not present.
    Fix: the original annotation said ``Optional[int]`` but the function
    returns cleaned text.
    """
    title = None
    try:
        potential_title = soup.find("div", class_="FyxyKd")
        if potential_title:
            title = strip_and_clean(potential_title.get_text())
    except ValueError:
        # Defensive only — BeautifulSoup.find/get_text do not normally
        # raise ValueError; kept to preserve the original contract.
        pass
    return title
+
+
def get_publisher_from_soup(soup) -> str:
    """Return the publisher name from a Google Podcasts result page.

    Falls back to an empty string when the publisher element (div.J3Ov7d)
    is missing or lookup fails.
    """
    try:
        node = soup.find("div", class_="J3Ov7d")
        return strip_and_clean(node.get_text()) if node else ""
    except ValueError:
        return ""
+
+
def get_description_from_soup(soup) -> str:
    """Return the show description from a Google Podcasts result page.

    Falls back to an empty string when the description element (div.yuTZxb)
    is missing or lookup fails.
    """
    try:
        node = soup.find("div", class_="yuTZxb")
        return strip_and_clean(node.get_text()) if node else ""
    except ValueError:
        return ""
+
+
def get_img_url_from_soup(soup) -> str:
    """Return the podcast artwork URL, or "" when no usable image is found.

    Fixes two defects in the original:
    - a missing img tag (``find`` returning None) would raise an uncaught
      TypeError on subscripting; now guarded explicitly.
    - bs4 ``Tag.__getitem__`` raises KeyError for a missing attribute, not
      the IndexError the original caught.
    """
    url = ""
    img_tag = soup.find("img", class_="BhVIWc")
    if img_tag is not None:
        try:
            url = img_tag["src"]
        except KeyError:
            # Tag present but has no src attribute — treat as "no image".
            pass
    return url
+
+
def scrape_data_from_google_podcasts(title) -> dict:
    """Scrape podcast metadata from the Google Podcasts search page.

    Args:
        title: Search term, typically the podcast's name.

    Returns:
        Dict with keys ``title``, ``description``, ``publisher`` and
        ``image_url`` on success; an empty dict when the page cannot be
        fetched or does not return HTTP 200.

    Fixes: the query is now URL-quoted (``urllib`` was imported but never
    used — spaces/reserved characters would corrupt the path), the request
    has a timeout so a hung server cannot block forever, and network errors
    are logged instead of crashing this best-effort scraper.
    """
    data_dict = {}
    headers = {"User-Agent": "Vrobbler 0.11.12"}
    url = PODCAST_SEARCH_URL.format(query=urllib.parse.quote(title))
    try:
        r = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        logger.warning("Error fetching podcast data from %s", url)
        return data_dict
    if r.status_code == 200:
        # "html.parser" names the stdlib parser explicitly; the bare "html"
        # feature string leaves parser selection up to bs4's installed set.
        soup = BeautifulSoup(r.text, "html.parser")
        data_dict["title"] = get_title_from_soup(soup)
        data_dict["description"] = get_description_from_soup(soup)
        data_dict["publisher"] = get_publisher_from_soup(soup)
        data_dict["image_url"] = get_img_url_from_soup(soup)
    return data_dict