|
@@ -9,55 +9,61 @@ logger = logging.getLogger(__name__)
|
|
|
# Format template for a Google Podcasts search page; ``{query}`` is the
# placeholder for the search term (presumably filled in via ``str.format`` by
# the scraper -- the call site is not visible here, confirm before relying on it).
PODCAST_SEARCH_URL = "https://podcasts.google.com/search/{query}"
|
|
|
|
|
|
|
|
|
def _strip_and_clean(text: str) -> str:
    """Flatten *text* onto a single line and trim its ends.

    Every newline is replaced by one space, then leading and trailing
    whitespace is removed. Interior runs of spaces are left untouched.

    Args:
        text: Raw text, e.g. the result of a tag's ``get_text()``.

    Returns:
        The single-line, trimmed string.
    """
    # ``strip()`` trims both ends at once; same result as rstrip().lstrip().
    return text.replace("\n", " ").strip()
|
|
|
|
|
|
|
|
|
-def get_title_from_soup(soup) -> Optional[int]:
|
|
|
def _build_google_url(url: Optional[str]) -> Optional[str]:
    """Turn a page-relative Google Podcasts link into an absolute URL.

    A leading ``./`` is replaced with ``https://podcasts.google.com/``.
    Only the prefix is rewritten, so a ``./`` sequence that happens to occur
    later in the string is left alone.

    Args:
        url: The link to convert, or ``None`` when no link was available.

    Returns:
        The absolute URL when *url* starts with ``./``; *url* unchanged when
        it does not; ``None`` when *url* is ``None``.
    """
    if url is None:
        # A missing attribute lookup yields None; pass that through rather
        # than failing with AttributeError on ``None.startswith``.
        return None

    relative_prefix = "./"
    if url.startswith(relative_prefix):
        return "https://podcasts.google.com/" + url[len(relative_prefix):]
    return url
|
|
|
+
|
|
|
+
|
|
|
def _get_title_from_soup(soup) -> Optional[str]:
    """Extract the title text from a parsed page.

    Looks up the first ``div`` with class ``FyxyKd`` and returns its text
    after passing it through ``_strip_and_clean``.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The cleaned title text, or ``None`` when no matching element exists
        or the lookup raises ``ValueError``.
    """
    try:
        title_tag = soup.find("div", class_="FyxyKd")
        if not title_tag:
            return None
        return _strip_and_clean(title_tag.get_text())
    except ValueError:
        # Best-effort scrape: a ValueError during lookup is treated the same
        # as "title not found".
        return None
|
|
|
|
|
|
|
|
|
def _get_url_from_soup(soup) -> Optional[str]:
    """Extract the Google Podcasts link from a parsed page.

    Looks up the first ``div`` with class ``yXo2Qc``, reads its ``href``
    attribute and converts it with ``_build_google_url``.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The converted URL, or ``None`` when the element is missing, carries
        no ``href`` attribute, or the lookup raises ``ValueError``.
    """
    try:
        link_tag = soup.find("div", class_="yXo2Qc")
        if not link_tag:
            return None

        href = link_tag.get("href")
        if href is None:
            # Without this guard a tag lacking ``href`` would hand None to
            # the URL builder, whose string handling raises AttributeError --
            # an error the ValueError handler below does not cover.
            return None
        return _build_google_url(href)
    except ValueError:
        # Best-effort scrape: a ValueError during lookup is treated the same
        # as "link not found".
        return None
|
|
|
|
|
|
|
|
|
def _get_producer_from_soup(soup) -> str:
    """Extract the producer/publisher text from a parsed page.

    Looks up the first ``div`` with class ``J3Ov7d`` and returns its text
    after passing it through ``_strip_and_clean``.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The cleaned text, or an empty string when no matching element is
        found or the lookup raises ``ValueError``.
    """
    try:
        publisher_tag = soup.find("div", class_="J3Ov7d")
        if not publisher_tag:
            return ""
        return _strip_and_clean(publisher_tag.get_text())
    except ValueError:
        # Best-effort scrape: fall back to the empty default.
        return ""
|
|
|
|
|
|
|
|
|
def _get_description_from_soup(soup) -> str:
    """Extract the description text from a parsed page.

    Looks up the first ``div`` with class ``yuTZxb`` and returns its text
    after passing it through ``_strip_and_clean``.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The cleaned text, or an empty string when no matching element is
        found or the lookup raises ``ValueError``.
    """
    try:
        description_tag = soup.find("div", class_="yuTZxb")
        if not description_tag:
            return ""
        return _strip_and_clean(description_tag.get_text())
    except ValueError:
        # Best-effort scrape: fall back to the empty default.
        return ""
|
|
|
|
|
|
|
|
|
-def get_img_url_from_soup(soup) -> str:
|
|
|
+def _get_img_url_from_soup(soup) -> str:
|
|
|
url = ""
|
|
|
try:
|
|
|
img_tag = soup.find("img", class_="BhVIWc")
|
|
@@ -77,9 +83,9 @@ def scrape_data_from_google_podcasts(title) -> dict:
|
|
|
r = requests.get(url, headers=headers)
|
|
|
if r.status_code == 200:
|
|
|
soup = BeautifulSoup(r.text, "html")
|
|
|
- data_dict["title"] = get_title_from_soup(soup)
|
|
|
- data_dict["description"] = get_description_from_soup(soup)
|
|
|
- data_dict["producer"] = get_producer_from_soup(soup)
|
|
|
- data_dict["url"] = get_url_from_soup(soup)
|
|
|
- data_dict["image_url"] = get_img_url_from_soup(soup)
|
|
|
+ data_dict["title"] = _get_title_from_soup(soup)
|
|
|
+ data_dict["description"] = _get_description_from_soup(soup)
|
|
|
+ data_dict["producer"] = _get_producer_from_soup(soup)
|
|
|
+ data_dict["google_url"] = _get_url_from_soup(soup)
|
|
|
+ data_dict["image_url"] = _get_img_url_from_soup(soup)
|
|
|
return data_dict
|