Bladeren bron

[webpages] Clean up how we fetch webpage data

Colin Powell 1 jaar geleden
bovenliggende
commit
c2a53875a0
1 gewijzigde bestanden met toevoegingen van 43 en 41 verwijderingen
  1. 43 41
      vrobbler/apps/webpages/models.py

+ 43 - 41
vrobbler/apps/webpages/models.py

@@ -59,11 +59,9 @@ class WebPage(ScrobblableMixin):
     def _raw_domain(self):
         self.url.split("//")[-1].split("/")[0]
 
-    def _update_extract_from_web(self, force=True):
-        headers = {
-            "headers": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:51.0) Gecko/20100101 Firefox/51.0"
-        }
-        raw_text = requests.get(self.url, headers=headers).text
+    def _update_extract_from_web(self, raw_text: str = "", force=True):
+        if not raw_text:
+            raw_text = requests.get(self.url, headers=headers).text
         if not self.extract or force:
             self.extract = trafilatura.extract(raw_text)
             self.save(update_fields=["extract"])
@@ -95,34 +93,38 @@ class WebPage(ScrobblableMixin):
         )
 
     def clean_title(self, title: str, save=True):
-        if len(title.split('|')) > 1:
-            if "The Quietus" in title:
-                title = title.split('|')[-0]
-            else:
-                title = title.split('|')[0]
-        if len(title.split('–')) > 1:
-            title = title.split('–')[0]
-        if len(title.split(' - ')) > 1:
-            title = title.split(' - ')[0]
+        if len(title.split("|")) > 1:
+            title = title.split("|")[0]
+        if len(title.split("–")) > 1:
+            title = title.split("–")[0]
+        if len(title.split(" - ")) > 1:
+            title = title.split(" - ")[0]
         self.title = title.strip()
+
         if save:
             self.save(update_fields=["title"])
 
-
-
-    def _update_domain_from_url(self):
+    def _update_domain_from_url(self, save=False):
         domain = self.url.split("//")[-1].split("/")[0].split("www.")[-1]
         self.domain, created = Domain.objects.get_or_create(root=domain)
 
-    def _update_data_from_web(self, force=True):
-        headers = {
-            "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:51.0) Gecko/20100101 Firefox/51.0"
-        }
-        raw_text = requests.get(self.url, headers=headers).text
-        if not self.title or force:
-            self.set_title(raw_text[
-                raw_text.find("<title>") + 7 : raw_text.find("</title>")
-            ])
+        if save:
+            self.save(update_fields=["domain"])
+
+    def _update_title_from_web(self, raw_text: str, save=False):
+        self.title = raw_text[
+            raw_text.find("<title>") + 7 : raw_text.find("</title>")
+        ]
+
+        if not self.title and self.extract:
+            first_line = self.extract.split("\n")[0]
+            if len(first_line) < 254:
+                self.title = first_line
+
+        if save:
+            self.save(update_fields=["title"])
+
+    def _update_date_from_web(self, save=False):
         try:
             date_str = find_date(str(self.url))
         except ValueError:
@@ -130,12 +132,19 @@ class WebPage(ScrobblableMixin):
         if date_str:
             self.date = pendulum.parse(date_str).date()
 
+        if save:
+            self.save(update_fields=["date"])
+
+    def fetch_data_from_web(self, save=True, force=True):
+        raw_text = trafilatura.fetch_url(self.url)
         if not self.extract or force:
             self.extract = trafilatura.extract(raw_text)
-            if not self.title:
-                first_line = self.extract.split("\n")[0]
-                if len(first_line) < 254:
-                    self.title = first_line
+
+        if not self.title or force:
+            self._update_title_from_web(raw_text)
+
+        if not self.date or force:
+            self._update_date_from_web()
 
         if not self.domain or force:
             self._update_domain_from_url()
@@ -143,15 +152,8 @@ class WebPage(ScrobblableMixin):
         if not self.run_time_seconds or force:
             self.run_time_seconds = self.estimated_time_to_read_in_seconds
 
-        self.save(
-            update_fields=[
-                "title",
-                "domain",
-                "extract",
-                "run_time_seconds",
-                "date",
-            ]
-        )
+        if save:
+            self.save()
 
     @classmethod
     def find_or_create(cls, data_dict: Dict) -> "GeoLocation":
@@ -167,6 +169,6 @@ class WebPage(ScrobblableMixin):
         webpage = cls.objects.filter(url=data_dict.get("url")).first()
 
         if not webpage:
-            webpage = cls.objects.create(url=data_dict.get("url"))
-            webpage._update_data_from_web()
+            webpage = cls(url=data_dict.get("url"))
+            webpage.fetch_data_from_web(save=True)
         return webpage