scrapers.py

import logging
from typing import Optional
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

PODCAST_SEARCH_URL = "https://podcasts.google.com/search/{query}"


def _strip_and_clean(text: str) -> str:
    """Collapse newlines and trim surrounding whitespace."""
    return text.replace("\n", " ").strip()


def _build_google_url(url: str) -> str:
    """Turn a relative podcast link into an absolute Google Podcasts URL."""
    return url.replace("./", "https://podcasts.google.com/")


def _get_title_from_soup(soup) -> Optional[str]:
    title = None
    potential_title = soup.find("div", class_="FyxyKd")
    if potential_title:
        title = _strip_and_clean(potential_title.get_text())
    return title


def _get_url_from_soup(soup) -> Optional[str]:
    url = None
    url_tag = soup.find("a", class_="yXo2Qc")
    if url_tag and url_tag.get("href"):
        url = _build_google_url(url_tag.get("href"))
    return url


def _get_producer_from_soup(soup) -> str:
    pub = ""
    potential_pub = soup.find("div", class_="J3Ov7d")
    if potential_pub:
        pub = _strip_and_clean(potential_pub.get_text())
    return pub


def _get_description_from_soup(soup) -> str:
    desc = ""
    potential_desc = soup.find("div", class_="yuTZxb")
    if potential_desc:
        desc = _strip_and_clean(potential_desc.get_text())
    return desc


def _get_img_url_from_soup(soup) -> str:
    url = ""
    img_tag = soup.find("img", class_="BhVIWc")
    if img_tag:
        url = img_tag.get("src", "")
    return url


def scrape_data_from_google_podcasts(title: str) -> dict:
    """Search Google Podcasts for a title and scrape basic metadata."""
    data_dict = {}
    headers = {"User-Agent": "Vrobbler 0.11.12"}
    # URL-encode the query so titles containing spaces or punctuation
    # produce a valid search URL.
    url = PODCAST_SEARCH_URL.format(query=quote(title))
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        logger.warning("Got %s from Google Podcasts search", r.status_code)
        return data_dict

    soup = BeautifulSoup(r.text, "html.parser")
    data_dict["title"] = _get_title_from_soup(soup)
    data_dict["description"] = _get_description_from_soup(soup)
    data_dict["producer"] = _get_producer_from_soup(soup)
    data_dict["google_url"] = _get_url_from_soup(soup)
    data_dict["image_url"] = _get_img_url_from_soup(soup)
    return data_dict
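

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): running the file
# directly performs one search and prints whatever metadata was scraped.
# The query string below is purely illustrative; results depend on what
# Google Podcasts returns and on its CSS class names staying stable.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from pprint import pprint

    pprint(scrape_data_from_google_podcasts("example podcast title"))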