Browse Source

[webpages] Break domains out to model

Colin Powell 1 year ago
parent
commit
e65ab4d8c2

+ 18 - 1
vrobbler/apps/webpages/admin.py

@@ -1,17 +1,34 @@
 from django.contrib import admin
 
-from webpages.models import WebPage
+from webpages.models import Domain, WebPage
 
 from scrobbles.admin import ScrobbleInline
 
 
+class WebPageInline(admin.TabularInline):
+    """Tabular inline that lists ``WebPage`` rows on a parent admin page."""
+
+    model = WebPage
+    # Leave the page's ``extract`` text out of the inline form.
+    exclude = ("extract",)
+    # Do not render any blank extra rows.
+    extra = 0
+
+
+@admin.register(Domain)
+class DomainAdmin(admin.ModelAdmin):
+    """Admin for ``Domain``: ordered by ``root``, with its web pages inline."""
+
+    inlines = [WebPageInline]
+    ordering = ("root",)
+    # Drill-down navigation by the ``created`` timestamp.
+    date_hierarchy = "created"
+
+
 @admin.register(WebPage)
 class WebPageAdmin(admin.ModelAdmin):
     date_hierarchy = "created"
     list_display = (
+        "uuid",
         "title",
         "url",
     )
+    raw_id_fields = ("domain",)
     ordering = ("-created",)
     search_fields = ("title",)
     inlines = [

+ 91 - 0
vrobbler/apps/webpages/migrations/0004_domain_alter_webpage_domain.py

@@ -0,0 +1,91 @@
+# Generated by Django 4.2.9 on 2024-03-27 02:13
+
+from django.db import migrations, models
+import django.db.models.deletion
+import django_extensions.db.fields
+import taggit.managers
+import uuid
+
+
+def clear_domains(apps, schema_editor):
+    """Null out every existing ``WebPage.domain`` value.
+
+    Runs as a ``RunPython`` step before the ``domain`` column is altered into
+    a foreign key. The historical model is looked up through ``apps`` and the
+    rows are cleared with a single bulk ``UPDATE`` instead of one ``save()``
+    per row. Every non-NULL value is cleared, including empty strings, which
+    are not valid foreign-key values either. ``schema_editor`` is unused.
+    """
+    WebPage = apps.get_model("webpages", "WebPage")
+    WebPage.objects.exclude(domain__isnull=True).update(domain=None)
+
+
+class Migration(migrations.Migration):
+    """Create the ``Domain`` model and turn ``WebPage.domain`` into a FK to it.
+
+    Existing ``WebPage.domain`` values are cleared first, then the ``Domain``
+    table is created, and finally the ``domain`` field is altered into a
+    nullable foreign key.
+    """
+
+    dependencies = [
+        ("taggit", "0004_alter_taggeditem_content_type_alter_taggeditem_tag"),
+        ("webpages", "0003_webpage_date"),
+    ]
+
+    operations = [
+        # The cleared values cannot be restored, so the reverse step is a
+        # no-op; it is supplied only so the migration can be unapplied.
+        migrations.RunPython(clear_domains, migrations.RunPython.noop),
+        migrations.CreateModel(
+            name="Domain",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                (
+                    "created",
+                    django_extensions.db.fields.CreationDateTimeField(
+                        auto_now_add=True, verbose_name="created"
+                    ),
+                ),
+                (
+                    "modified",
+                    django_extensions.db.fields.ModificationDateTimeField(
+                        auto_now=True, verbose_name="modified"
+                    ),
+                ),
+                (
+                    "uuid",
+                    models.UUIDField(
+                        blank=True,
+                        default=uuid.uuid4,
+                        editable=False,
+                        null=True,
+                    ),
+                ),
+                ("root", models.CharField(max_length=255)),
+                (
+                    "name",
+                    models.CharField(blank=True, max_length=255, null=True),
+                ),
+                (
+                    "tags",
+                    taggit.managers.TaggableManager(
+                        blank=True,
+                        help_text="A comma-separated list of tags.",
+                        through="taggit.TaggedItem",
+                        to="taggit.Tag",
+                        verbose_name="Tags",
+                    ),
+                ),
+            ],
+            options={
+                "get_latest_by": "modified",
+                "abstract": False,
+            },
+        ),
+        migrations.AlterField(
+            model_name="webpage",
+            name="domain",
+            field=models.ForeignKey(
+                blank=True,
+                null=True,
+                on_delete=django.db.models.deletion.DO_NOTHING,
+                to="webpages.domain",
+            ),
+        ),
+    ]

+ 31 - 4
vrobbler/apps/webpages/models.py

@@ -1,9 +1,11 @@
 import logging
 from typing import Dict
 from uuid import uuid4
+from django_extensions.db.models import TimeStampedModel
 
 import pendulum
 import requests
+from taggit.managers import TaggableManager
 import trafilatura
 from django.apps import apps
 from django.conf import settings
@@ -18,20 +20,41 @@ BNULL = {"blank": True, "null": True}
 User = get_user_model()
 
 
+class Domain(TimeStampedModel):
+    """A web domain that ``WebPage`` rows point to through a foreign key.
+
+    ``root`` holds the host string and ``name`` is an optional display label.
+    """
+
+    uuid = models.UUIDField(default=uuid4, editable=False, **BNULL)
+    root = models.CharField(max_length=255)
+    name = models.CharField(max_length=255, **BNULL)
+
+    tags = TaggableManager(blank=True)
+
+    def __str__(self):
+        """Return ``name`` when it is set, otherwise ``root``."""
+        return self.name or self.root
+
+    def scrobbles_for_user(self, user_id):
+        """Return scrobbles of this domain's web pages for ``user_id``.
+
+        The queryset is ordered by descending ``timestamp``.
+        """
+        # Imported inside the method rather than at module level; presumably
+        # this avoids a circular import with ``scrobbles`` - confirm.
+        from scrobbles.models import Scrobble
+
+        matching = Scrobble.objects.filter(
+            user_id=user_id, web_page__domain=self
+        )
+        return matching.order_by("-timestamp")
+
+
 class WebPage(ScrobblableMixin):
     COMPLETION_PERCENT = getattr(settings, "WEBSITE_COMPLETION_PERCENT", 100)
 
     uuid = models.UUIDField(default=uuid4, editable=False, **BNULL)
     url = models.URLField(max_length=500)
     date = models.DateField(**BNULL)
-    domain = models.CharField(max_length=200, **BNULL)
+    domain = models.ForeignKey(Domain, on_delete=models.DO_NOTHING, **BNULL)
     extract = models.TextField(**BNULL)
 
     def __str__(self):
+        """Return the title, else the domain's string form, else the UUID."""
         if self.title:
             return self.title
-
-        return self.domain
+        if self.domain:
+            # ``domain`` is a ``Domain`` instance (foreign key), and
+            # ``__str__`` must return a ``str``, so convert it explicitly.
+            return str(self.domain)
+        return str(self.uuid)
 
     def _raw_domain(self):
         self.url.split("//")[-1].split("/")[0]
@@ -71,6 +94,10 @@ class WebPage(ScrobblableMixin):
             "-timestamp"
         )
 
+    def _update_domain_from_url(self):
+        """Set ``self.domain`` to the ``Domain`` matching this page's URL host.
+
+        The host is the part of ``self.url`` after ``//`` and before the next
+        ``/``, with a leading ``www.`` removed. The ``Domain`` row whose
+        ``root`` equals that host is fetched, or created if it does not exist.
+        The page itself is not saved here.
+        """
+        host = self.url.split("//")[-1].split("/")[0]
+        # Strip only a leading "www." label. Splitting on "www." anywhere in
+        # the host would turn "awww.example.com" into "example.com".
+        if host.startswith("www."):
+            host = host[len("www."):]
+        self.domain, _ = Domain.objects.get_or_create(root=host)
+
     def _update_data_from_web(self, force=True):
         headers = {
             "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:51.0) Gecko/20100101 Firefox/51.0"
@@ -88,7 +115,7 @@ class WebPage(ScrobblableMixin):
             self.extract = trafilatura.extract(raw_text)
 
         if not self.domain or force:
-            self.domain = self.url.split("//")[-1].split("/")[0]
+            self._update_domain_from_url()
 
         if not self.run_time_seconds or force:
             self.run_time_seconds = self.estimated_time_to_read_in_seconds