|
@@ -1,9 +1,11 @@
|
|
|
import logging
|
|
|
from typing import Dict
|
|
|
from uuid import uuid4
|
|
|
+from django_extensions.db.models import TimeStampedModel
|
|
|
|
|
|
import pendulum
|
|
|
import requests
|
|
|
+from taggit.managers import TaggableManager
|
|
|
import trafilatura
|
|
|
from django.apps import apps
|
|
|
from django.conf import settings
|
|
@@ -18,20 +20,41 @@ BNULL = {"blank": True, "null": True}
|
|
|
User = get_user_model()
|
|
|
|
|
|
|
|
|
class Domain(TimeStampedModel):
    """A web domain that the pages scrobbled from it can be grouped under.

    ``root`` is a required string of up to 255 characters; ``name`` is an
    optional label that is preferred over ``root`` when the domain is
    rendered as text.
    """

    uuid = models.UUIDField(default=uuid4, editable=False, **BNULL)
    root = models.CharField(max_length=255)
    name = models.CharField(max_length=255, **BNULL)

    tags = TaggableManager(blank=True)

    def __str__(self):
        """Return ``name`` when it is set and non-empty, otherwise ``root``."""
        return self.name or self.root

    def scrobbles_for_user(self, user_id):
        """Return the scrobbles of this domain's web pages for one user.

        The queryset is filtered on ``web_page__domain`` and ``user_id`` and
        ordered by ``timestamp`` descending.
        """
        # Imported at call time rather than module level
        # (presumably to avoid a circular import -- TODO confirm).
        from scrobbles.models import Scrobble

        matching = Scrobble.objects.filter(
            web_page__domain=self, user_id=user_id
        )
        return matching.order_by("-timestamp")
|
|
|
+
|
|
|
+
|
|
|
class WebPage(ScrobblableMixin):
|
|
|
COMPLETION_PERCENT = getattr(settings, "WEBSITE_COMPLETION_PERCENT", 100)
|
|
|
|
|
|
uuid = models.UUIDField(default=uuid4, editable=False, **BNULL)
|
|
|
url = models.URLField(max_length=500)
|
|
|
date = models.DateField(**BNULL)
|
|
|
- domain = models.CharField(max_length=200, **BNULL)
|
|
|
+ domain = models.ForeignKey(Domain, on_delete=models.DO_NOTHING, **BNULL)
|
|
|
extract = models.TextField(**BNULL)
|
|
|
|
|
|
def __str__(self):
    """Return a human-readable label for this page.

    Preference order: the page title, then the string form of the linked
    domain, then the page's UUID rendered as text.
    """
    if self.title:
        return self.title
    if self.domain:
        # ``domain`` is a ForeignKey to Domain, so the attribute is a model
        # instance. __str__ must return a str (Python raises TypeError
        # otherwise), hence the explicit conversion.
        return str(self.domain)
    return str(self.uuid)
|
|
|
|
|
|
def _raw_domain(self):
|
|
|
self.url.split("//")[-1].split("/")[0]
|
|
@@ -71,6 +94,10 @@ class WebPage(ScrobblableMixin):
|
|
|
"-timestamp"
|
|
|
)
|
|
|
|
|
|
def _update_domain_from_url(self):
    """Point ``self.domain`` at the Domain whose ``root`` is this URL's host.

    The host is the text between the first "//" of ``self.url`` (or the
    start of the string when there is none) and the next "/", with one
    leading "www." removed. The matching Domain is fetched, or created
    when none exists, and assigned to ``self.domain``. This method does
    not save the page.
    """
    # Split only on the FIRST "//" and the FIRST "/" after it: a later "//"
    # in the path or query (e.g. an embedded redirect URL) must not be
    # mistaken for the scheme separator.
    host = self.url.split("//", 1)[-1].split("/", 1)[0]
    # Strip "www." only as a prefix; cutting at any occurrence would turn
    # hosts like "awww.example.com" into "example.com".
    if host.startswith("www."):
        host = host[len("www."):]
    self.domain, _ = Domain.objects.get_or_create(root=host)
|
|
|
+
|
|
|
def _update_data_from_web(self, force=True):
|
|
|
headers = {
|
|
|
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:51.0) Gecko/20100101 Firefox/51.0"
|
|
@@ -88,7 +115,7 @@ class WebPage(ScrobblableMixin):
|
|
|
self.extract = trafilatura.extract(raw_text)
|
|
|
|
|
|
if not self.domain or force:
|
|
|
- self.domain = self.url.split("//")[-1].split("/")[0]
|
|
|
+ self._update_domain_from_url()
|
|
|
|
|
|
if not self.run_time_seconds or force:
|
|
|
self.run_time_seconds = self.estimated_time_to_read_in_seconds
|