scrapers.py

import logging
from typing import Optional

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

PODCAST_SEARCH_URL = "https://podcasts.google.com/search/{query}"


def _strip_and_clean(text: str) -> str:
    """Collapse newlines and trim surrounding whitespace."""
    return text.replace("\n", " ").strip()


def _build_google_url(url: str) -> str:
    """Turn a relative search-result href into an absolute Google Podcasts URL."""
    return url.replace("./", "https://podcasts.google.com/")


def _get_title_from_soup(soup) -> Optional[str]:
    title = None
    try:
        potential_title = soup.find("div", class_="FyxyKd")
        if potential_title:
            title = _strip_and_clean(potential_title.get_text())
    except ValueError:
        pass
    return title


def _get_url_from_soup(soup) -> Optional[str]:
    url = None
    try:
        url_tag = soup.find("a", class_="yXo2Qc")
        if url_tag:
            url = _build_google_url(url_tag.get("href"))
    except ValueError:
        pass
    return url


def _get_producer_from_soup(soup) -> str:
    pub = ""
    try:
        potential_pub = soup.find("div", class_="J3Ov7d")
        if potential_pub:
            pub = _strip_and_clean(potential_pub.get_text())
    except ValueError:
        pass
    return pub


def _get_description_from_soup(soup) -> str:
    desc = ""
    try:
        potential_desc = soup.find("div", class_="yuTZxb")
        if potential_desc:
            desc = _strip_and_clean(potential_desc.get_text())
    except ValueError:
        pass
    return desc


def _get_img_url_from_soup(soup) -> str:
    url = ""
    img_tag = soup.find("img", class_="BhVIWc")
    if img_tag:
        # Tag.get() returns the default instead of raising if "src" is missing
        url = img_tag.get("src", "")
    return url


def scrape_data_from_google_podcasts(title) -> dict:
    """Search Google Podcasts for a title and scrape the first result's metadata."""
    data_dict = {}
    headers = {"User-Agent": "Vrobbler 0.11.12"}
    url = PODCAST_SEARCH_URL.format(query=title)
    r = requests.get(url, headers=headers)
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, "html.parser")
        data_dict["title"] = _get_title_from_soup(soup)
        data_dict["description"] = _get_description_from_soup(soup)
        data_dict["producer"] = _get_producer_from_soup(soup)
        data_dict["google_url"] = _get_url_from_soup(soup)
        data_dict["image_url"] = _get_img_url_from_soup(soup)
    else:
        logger.warning(
            "Google Podcasts search returned %s for %s", r.status_code, url
        )
    return data_dict
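

# --- Example usage: a minimal sketch, not part of the original module. ---
# Assumes network access and that the Google Podcasts search markup still uses
# the CSS class names targeted above; the query string is purely illustrative.
if __name__ == "__main__":
    import json

    result = scrape_data_from_google_podcasts("This American Life")
    print(json.dumps(result, indent=2))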