# skatevideosite.py
  1. from enum import Enum
  2. from typing import Optional
  3. from bs4 import BeautifulSoup
  4. import requests
  5. import logging
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)

# User-Agent sent by scrape_data_from_amazon (an older mobile Firefox string,
# presumably to get a simpler page variant — TODO confirm).
USER_AGENT = (
    "Mozilla/5.0 (Android 4.4; Mobile; rv:41.0) Gecko/41.0 Firefox/41.0"
)

# Base URL for skatevideosite.com; the search endpoint interpolates the
# video title via str.format (see lookup_video_from_skatevideosite).
SKATEVIDEOSITE_URL = "https://www.skatevideosite.com"
SKATEVIDEOSITE_SEARCH_URL = SKATEVIDEOSITE_URL + "/search/?q={title}"
  12. class AmazonAttribute(Enum):
  13. SERIES = 0
  14. PAGES = 1
  15. LANGUAGE = 2
  16. PUBLISHER = 3
  17. PUB_DATE = 4
  18. DIMENSIONS = 5
  19. ISBN_10 = 6
  20. ISBN_13 = 7
  21. def strip_and_clean(text):
  22. return text.strip("\n").rstrip().lstrip()
  23. def get_rating_from_soup(soup) -> Optional[int]:
  24. rating = None
  25. try:
  26. potential_rating = soup.find("div", class_="allmusic-rating")
  27. if potential_rating:
  28. rating = int(strip_and_clean(potential_rating.get_text()))
  29. except ValueError:
  30. pass
  31. return rating
  32. def get_review_from_soup(soup) -> str:
  33. review = ""
  34. try:
  35. potential_text = soup.find("div", class_="text")
  36. if potential_text:
  37. review = strip_and_clean(potential_text.get_text())
  38. except ValueError:
  39. pass
  40. return review
  41. def scrape_data_from_amazon(url) -> dict:
  42. data_dict = {}
  43. headers = {"User-Agent": USER_AGENT}
  44. r = requests.get(url, headers=headers)
  45. if r.status_code == 200:
  46. soup = BeautifulSoup(r.text, "html.parser")
  47. import pdb
  48. pdb.set_trace()
  49. data_dict["rating"] = get_rating_from_soup(soup)
  50. data_dict["review"] = get_review_from_soup(soup)
  51. return data_dict
  52. def lookup_video_from_skatevideosite(title: str) -> Optional[dict]:
  53. video_metadata = None
  54. search_url = SKATEVIDEOSITE_SEARCH_URL.format(title=title)
  55. headers = {
  56. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
  57. "accept-language": "en-GB,en;q=0.9",
  58. }
  59. response = requests.get(search_url, headers=headers)
  60. if response.status_code != 200:
  61. logger.info(f"Bad http response from SkateVideoSite {response}")
  62. return video_metadata
  63. soup = BeautifulSoup(response.text, "html.parser")
  64. detail_url = ""
  65. try:
  66. detail_url = SKATEVIDEOSITE_URL + soup.findAll("a")[12]["href"]
  67. except IndexError:
  68. pass
  69. detail_response = requests.get(detail_url, headers=headers)
  70. detail_soup = BeautifulSoup(detail_response.text, "html.parser")
  71. try:
  72. result = soup.find("div", class_="card-body").find("a")
  73. except:
  74. result = None
  75. if not result:
  76. logger.info(
  77. f"No search results found on skatevideosite",
  78. extra={"title": title},
  79. )
  80. return video_metadata
  81. year = (
  82. detail_soup.find("span", class_="whitespace-normal")
  83. .contents[0]
  84. .replace("(", "")
  85. .replace(")", "")
  86. )
  87. run_time_seconds = (
  88. int(
  89. detail_soup.find("div", class_="p-1")
  90. .contents[-1]
  91. .contents[0]
  92. .strip("(")
  93. .strip("min )")
  94. )
  95. * 60
  96. )
  97. return {
  98. "title": str(result.find("img").get("alt").replace(" cover", "")),
  99. "video_type": "S",
  100. "year": year,
  101. "run_time_seconds": run_time_seconds,
  102. "cover_url": str(result.find("img").get("src")),
  103. }