Browse Source

[books] Add papers as a data model

Colin Powell cách đây 2 tháng
mục cha
commit
9e3f714c61

Những thay đổi không được hiển thị vì quá lớn
+ 365 - 353
poetry.lock


+ 1 - 0
pyproject.toml

@@ -12,6 +12,7 @@ python-dateutil = "^2.8.2"
 python-dotenv = "^0.20.0"
 python-json-logger = "^2.0.2"
 colorlog = "^6.6.0"
+httpx = "<=0.27.2"
 djangorestframework = "^3.13.1"
 Markdown = "^3.3.6"
 django-filter = "^21.1"

+ 136 - 0
vrobbler/apps/books/migrations/0025_remove_author_amazon_id_and_more.py

@@ -0,0 +1,136 @@
+# Generated by Django 4.2.19 on 2025-02-18 05:11
+
+from django.db import migrations, models
+import django_extensions.db.fields
+import taggit.managers
+import uuid
+
+
+class Migration(migrations.Migration):
+    """Rework Author identifier fields and create the Paper model.
+
+    Removes ``amazon_id``, ``librarything_id`` and ``locg_slug`` from
+    ``author``, adds ``author.semantic_id``, and creates ``Paper`` with a
+    many-to-many relation to ``books.author`` and a taggit ``genre`` manager
+    going through ``scrobbles.ObjectWithGenres``.
+    """
+
+    dependencies = [
+        ("scrobbles", "0067_scrobble_food_alter_scrobble_media_type"),
+        ("books", "0024_book_publisher"),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name="author",
+            name="amazon_id",
+        ),
+        migrations.RemoveField(
+            model_name="author",
+            name="librarything_id",
+        ),
+        migrations.RemoveField(
+            model_name="author",
+            name="locg_slug",
+        ),
+        migrations.AddField(
+            model_name="author",
+            name="semantic_id",
+            field=models.CharField(blank=True, max_length=50, null=True),
+        ),
+        migrations.CreateModel(
+            name="Paper",
+            fields=[
+                (
+                    "id",
+                    models.BigAutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                (
+                    "created",
+                    django_extensions.db.fields.CreationDateTimeField(
+                        auto_now_add=True, verbose_name="created"
+                    ),
+                ),
+                (
+                    "modified",
+                    django_extensions.db.fields.ModificationDateTimeField(
+                        auto_now=True, verbose_name="modified"
+                    ),
+                ),
+                (
+                    "uuid",
+                    models.UUIDField(
+                        blank=True,
+                        default=uuid.uuid4,
+                        editable=False,
+                        null=True,
+                    ),
+                ),
+                ("run_time_seconds", models.IntegerField(default=900)),
+                (
+                    "run_time_ticks",
+                    models.PositiveBigIntegerField(blank=True, null=True),
+                ),
+                ("title", models.CharField(max_length=255)),
+                ("semantic_title", models.CharField(max_length=255)),
+                (
+                    "koreader_data_by_hash",
+                    models.JSONField(blank=True, null=True),
+                ),
+                (
+                    "semantic_id",
+                    models.CharField(blank=True, max_length=50, null=True),
+                ),
+                (
+                    "arxiv_id",
+                    models.CharField(blank=True, max_length=50, null=True),
+                ),
+                (
+                    "corpus_id",
+                    models.CharField(blank=True, max_length=50, null=True),
+                ),
+                (
+                    "doi_id",
+                    models.CharField(blank=True, max_length=50, null=True),
+                ),
+                ("pages", models.IntegerField(blank=True, null=True)),
+                (
+                    "language",
+                    models.CharField(blank=True, max_length=4, null=True),
+                ),
+                (
+                    "first_publish_year",
+                    models.IntegerField(blank=True, null=True),
+                ),
+                ("publish_date", models.DateField(blank=True, null=True)),
+                (
+                    "journal",
+                    models.CharField(blank=True, max_length=255, null=True),
+                ),
+                (
+                    "journal_volume",
+                    models.CharField(blank=True, max_length=50, null=True),
+                ),
+                ("abstract", models.TextField(blank=True, null=True)),
+                ("num_citations", models.IntegerField(blank=True, null=True)),
+                (
+                    "openaccess_pdf_url",
+                    models.CharField(blank=True, max_length=255, null=True),
+                ),
+                (
+                    "authors",
+                    models.ManyToManyField(blank=True, to="books.author"),
+                ),
+                (
+                    "genre",
+                    taggit.managers.TaggableManager(
+                        help_text="A comma-separated list of tags.",
+                        through="scrobbles.ObjectWithGenres",
+                        to="scrobbles.Genre",
+                        verbose_name="Tags",
+                    ),
+                ),
+            ],
+            options={
+                "abstract": False,
+            },
+        ),
+    ]

+ 18 - 0
vrobbler/apps/books/migrations/0026_alter_paper_semantic_title.py

@@ -0,0 +1,18 @@
+# Generated by Django 4.2.19 on 2025-02-18 05:28
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Redeclare ``paper.semantic_title`` as a blank/null-able CharField."""
+
+    dependencies = [
+        ("books", "0025_remove_author_amazon_id_and_more"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="paper",
+            name="semantic_title",
+            field=models.CharField(blank=True, max_length=255, null=True),
+        ),
+    ]

+ 22 - 0
vrobbler/apps/books/migrations/0027_remove_paper_num_citations_paper_tldr.py

@@ -0,0 +1,22 @@
+# Generated by Django 4.2.19 on 2025-02-18 05:33
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Remove ``paper.num_citations`` and add the optional ``paper.tldr`` field."""
+
+    dependencies = [
+        ("books", "0026_alter_paper_semantic_title"),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name="paper",
+            name="num_citations",
+        ),
+        migrations.AddField(
+            model_name="paper",
+            name="tldr",
+            field=models.CharField(blank=True, max_length=255, null=True),
+        ),
+    ]

+ 94 - 17
vrobbler/apps/books/models.py

@@ -36,6 +36,7 @@ from vrobbler.apps.books.locg import (
     lookup_comic_writer_by_locg_slug,
 )
 from vrobbler.apps.books.sources.google import lookup_book_from_google
+from vrobbler.apps.books.sources.semantic import lookup_paper_from_semantic
 from vrobbler.apps.scrobbles.dataclasses import BookLogData
 
 COMICVINE_API_KEY = getattr(settings, "COMICVINE_API_KEY", "")
@@ -64,22 +65,27 @@ class Author(TimeStampedModel):
     )
     bio = models.TextField(**BNULL)
     wikipedia_url = models.CharField(max_length=255, **BNULL)
-    isni = models.CharField(max_length=255, **BNULL)
-    locg_slug = models.CharField(max_length=255, **BNULL)
     wikidata_id = models.CharField(max_length=255, **BNULL)
+    isni = models.CharField(max_length=255, **BNULL)
     goodreads_id = models.CharField(max_length=255, **BNULL)
-    librarything_id = models.CharField(max_length=255, **BNULL)
     comicvine_data = models.JSONField(**BNULL)
-    amazon_id = models.CharField(max_length=255, **BNULL)
+
+    semantic_id = models.CharField(max_length=50, **BNULL)
 
     def __str__(self):
         return f"{self.name}"
 
-    def fix_metadata(self, data_dict: dict = {}):
-        if not data_dict and self.openlibrary_id:
-            data_dict = lookup_author_from_openlibrary(self.openlibrary_id)
+    def enrich_from_semantic(self, overwrite=False):
+        ...
+
+    def enrich_from_google_books(self, overwrite=False):
+        ...
+
+    def enrich_from_openlibrary(self, overwrite=False):
+        data_dict = lookup_author_from_openlibrary(self.openlibrary_id)
 
         if not data_dict or not data_dict.get("name"):
+            logger.warning("Could not find author on openlibrary")
             return
 
         headshot_url = data_dict.pop("author_headshot_url", "")
@@ -160,26 +166,32 @@ class Book(LongPlayScrobblableMixin):
         if not created and not overwrite:
             return book
 
-        bdict, authors, cover, genres = lookup_book_from_google(
-            title
-        ).as_dict_with_authors_cover_and_genres()
+        book_dict = lookup_book_from_google(title)
 
         if created or overwrite:
             author_list = []
+            authors = book_dict.pop("authors")
+            cover_url = book_dict.pop("cover_url")
+            genres = book_dict.pop("generes")
+
             if authors:
                 for author_str in authors:
                     if author_str:
-                        author_list.append(
-                            Author.objects.get_or_create(name=author_str)[0]
+                        author, a_created = Author.objects.get_or_create(
+                            name=author_str
                         )
+                        author_list.append(author)
+                        if a_created:
+                            # TODO enrich author
+                            ...
 
-            for k, v in bdict.items():
+            for k, v in book_dict.items():
                 setattr(book, k, v)
-                book.save()
+            book.save()
 
-                book.save_image_from_url(cover)
-                book.genre.add(*genres)
-                book.authors.add(*author_list)
+            book.save_image_from_url(cover_url)
+            book.genre.add(*genres)
+            book.authors.add(*author_list)
         return book
 
     def save_image_from_url(self, url: str, force_update: bool = False):
@@ -474,3 +486,68 @@ class Page(TimeStampedModel):
             seconds=self.duration_seconds
         )
         self.save(update_fields=["end_time"])
+
+
+class Paper(LongPlayScrobblableMixin):
+    """Keeps track of academic papers.
+
+    Metadata can be filled in from the Semantic lookup via
+    :meth:`get_from_semantic`.
+    """
+
+    COMPLETION_PERCENT = getattr(settings, "PAPER_COMPLETION_PERCENT", 60)
+    AVG_PAGE_READING_SECONDS = getattr(
+        settings, "AVERAGE_PAGE_READING_SECONDS", 60
+    )
+
+    title = models.CharField(max_length=255)
+    semantic_title = models.CharField(max_length=255, **BNULL)
+    authors = models.ManyToManyField(Author, blank=True)
+    koreader_data_by_hash = models.JSONField(**BNULL)
+    # External identifiers (each declared exactly once).
+    semantic_id = models.CharField(max_length=50, **BNULL)
+    arxiv_id = models.CharField(max_length=50, **BNULL)
+    corpus_id = models.CharField(max_length=50, **BNULL)
+    doi_id = models.CharField(max_length=50, **BNULL)
+    pages = models.IntegerField(**BNULL)
+    language = models.CharField(max_length=4, **BNULL)
+    first_publish_year = models.IntegerField(**BNULL)
+    publish_date = models.DateField(**BNULL)
+    journal = models.CharField(max_length=255, **BNULL)
+    journal_volume = models.CharField(max_length=50, **BNULL)
+    abstract = models.TextField(**BNULL)
+    tldr = models.CharField(max_length=255, **BNULL)
+    openaccess_pdf_url = models.CharField(max_length=255, **BNULL)
+
+    genre = TaggableManager(through=ObjectWithGenres)
+
+    @classmethod
+    def get_from_semantic(cls, title: str, overwrite: bool = False) -> "Paper":
+        """Return the paper titled ``title``, enriching it from Semantic.
+
+        Args:
+            title: Exact title used for the local ``get_or_create`` and for
+                the remote lookup.
+            overwrite: When true, an already existing paper is refreshed
+                from the lookup result instead of being returned as-is.
+
+        Returns:
+            The created, refreshed, or untouched existing ``Paper``.
+        """
+        paper, created = cls.objects.get_or_create(title=title)
+        if not created and not overwrite:
+            return paper
+
+        paper_dict = lookup_paper_from_semantic(title)
+
+        # Relational data cannot be assigned through setattr(), so take it
+        # out before copying the remaining keys onto the instance. The
+        # lookup returns only the title when the remote call fails, so both
+        # keys may be missing (or None) and must not raise KeyError.
+        author_dicts = paper_dict.pop("author_dicts", None) or []
+        genres = paper_dict.pop("genres", None) or []
+
+        author_list = []
+        for author_dict in author_dicts:
+            semantic_author_id = author_dict.get("authorId")
+            if not semantic_author_id:
+                continue
+            name = author_dict.get("name")
+            # ``defaults`` is only applied when the author is created, so an
+            # existing author's name is left untouched, as before.
+            author, _ = Author.objects.get_or_create(
+                semantic_id=semantic_author_id,
+                defaults={"name": name} if name else {},
+            )
+            author_list.append(author)
+            # TODO enrich newly created authors?
+
+        for k, v in paper_dict.items():
+            setattr(paper, k, v)
+        paper.save()
+
+        if author_list:
+            paper.authors.add(*author_list)
+        if genres:
+            paper.genre.add(*genres)
+        return paper

+ 23 - 22
vrobbler/apps/books/sources/google.py

@@ -3,7 +3,6 @@ import logging
 
 import pendulum
 import requests
-from books.metadata import BookMetadata
 from django.conf import settings
 
 API_KEY = settings.GOOGLE_API_KEY
@@ -14,8 +13,8 @@ GOOGLE_BOOKS_URL = (
 logger = logging.getLogger(__name__)
 
 
-def lookup_book_from_google(title: str) -> BookMetadata:
-    book_metadata = BookMetadata(title=title)
+def lookup_book_from_google(title: str) -> dict:
+    book_dict = {"title": title}
 
     url = GOOGLE_BOOKS_URL.format(title=title, key=API_KEY)
     headers = {"User-Agent": "Vrobbler 0.11.12"}
@@ -25,7 +24,7 @@ def lookup_book_from_google(title: str) -> BookMetadata:
         logger.warning(
             "Bad response from Google", extra={"response": response}
         )
-        return book_metadata
+        return book_dict
 
     google_result = (
         json.loads(response.content).get("items", [{}])[0].get("volumeInfo")
@@ -39,30 +38,32 @@ def lookup_book_from_google(title: str) -> BookMetadata:
             isbn_13 = ident.get("identifier")
         if ident.get("type") == "ISBN_10":
             isbn_10 = ident.get("identifier")
-    book_metadata.title = google_result.get("title")
-    if google_result.get("subtitle"):
-        book_metadata.title = ": ".join(
-            [google_result.get("title"), google_result.get("subtitle")]
-        )
-    book_metadata.authors = google_result.get("authors")
-    book_metadata.publisher = google_result.get("publisher")
-    book_metadata.first_publish_year = publish_date.year
-    book_metadata.pages = google_result.get("pageCount")
-    book_metadata.isbn_13 = isbn_13
-    book_metadata.isbn_10 = isbn_10
-    book_metadata.publish_date = google_result.get("publishedDate")
-    book_metadata.language = google_result.get("language")
-    book_metadata.summary = google_result.get("description")
-    book_metadata.genres = google_result.get("categories")
-    book_metadata.cover_url = (
+    # TODO this may lead to issues with the first get if Google changes our title
+    # book_metadata.title = google_result.get("title")
+    # if google_result.get("subtitle"):
+    #    book_metadata["title"] = ": ".join(
+    #        [google_result.get("title"), google_result.get("subtitle")]
+    #    )
+    book_dict["subtitle"] = google_result.get("subtitle")
+    book_dict["authors"] = google_result.get("authors")
+    book_dict["publisher"] = google_result.get("publisher")
+    book_dict["first_publish_year"] = publish_date.year
+    book_dict["pages"] = google_result.get("pageCount")
+    book_dict["isbn_13"] = isbn_13
+    book_dict["isbn_10"] = isbn_10
+    book_dict["publish_date"] = google_result.get("publishedDate")
+    book_dict["language"] = google_result.get("language")
+    book_dict["summary"] = google_result.get("description")
+    book_dict["genres"] = google_result.get("categories")
+    book_dict["cover_url"] = (
         google_result.get("imageLinks", {})
         .get("thumbnail")
         .replace("zoom=1", "zoom=15")
         .replace("&edge=curl", "")
     )
 
-    book_metadata.run_time_seconds = book_metadata.pages * getattr(
+    book_dict["run_time_seconds"] = book_dict.get("pages", 10) * getattr(
         settings, "AVERAGE_PAGE_READING_SECONDS", 60
     )
 
-    return book_metadata
+    return book_dict

+ 73 - 0
vrobbler/apps/books/sources/semantic.py

@@ -0,0 +1,73 @@
+import json
+import logging
+from datetime import datetime
+
+import requests
+from django.conf import settings
+
+PAPER_SEARCH_URL = (
+    "https://api.semanticscholar.org/graph/v1/paper/search/match?query={}"
+)
+PAPER_DETAIL_URL = "https://api.semanticscholar.org/graph/v1/paper/{}?fields=title,authors,url,year,abstract,externalIds,citationCount,referenceCount,journal,fieldsOfStudy,publicationDate,openAccessPdf"
+
+logger = logging.getLogger(__name__)
+
+
+def get_api_result(url, timeout=10):
+    """GET ``url`` and return the response, or ``None`` on any failure.
+
+    Args:
+        url: Fully formatted API URL to fetch.
+        timeout: Seconds to wait for the server before giving up. Without a
+            timeout ``requests`` may block indefinitely.
+
+    Returns:
+        The ``requests`` response when the status code is 200, otherwise
+        ``None`` (after logging a warning).
+    """
+    headers = {"User-Agent": "Vrobbler 0.11.12"}
+    try:
+        response = requests.get(url, headers=headers, timeout=timeout)
+    except requests.RequestException:
+        # Network failures are reported the same way as bad status codes so
+        # callers only have to handle the ``None`` case.
+        logger.warning(
+            "Request to Semantic failed", exc_info=True, extra={"url": url}
+        )
+        return None
+
+    if response.status_code != 200:
+        logger.warning(
+            "Bad response from Semantic", extra={"response": response}
+        )
+        return None
+
+    return response
+
+
+def lookup_paper_from_semantic(title: str) -> dict:
+    """Look up a paper by title and return its metadata as a dict.
+
+    The title is first matched through ``PAPER_SEARCH_URL``; the resulting
+    paper id is then fetched through ``PAPER_DETAIL_URL``.
+
+    Args:
+        title: Paper title to search for.
+
+    Returns:
+        A dict that always contains ``"title"``. When the search or detail
+        request fails, or nothing matches, only the title is returned.
+        Otherwise the dict also carries the identifiers, journal data,
+        ``author_dicts`` and ``genres`` read from the detail response.
+    """
+    from urllib.parse import quote_plus
+
+    paper_dict = {"title": title}
+
+    # Titles routinely contain spaces, "&", "#" or "?" and must be encoded
+    # before being placed in the query string.
+    response = get_api_result(PAPER_SEARCH_URL.format(quote_plus(title)))
+    if response is None:
+        return paper_dict
+
+    matches = json.loads(response.content).get("data") or []
+    semantic_id = matches[0].get("paperId") if matches else None
+    if not semantic_id:
+        logger.warning(
+            "No paper match found on Semantic", extra={"paper_title": title}
+        )
+        return paper_dict
+
+    response = get_api_result(PAPER_DETAIL_URL.format(semantic_id))
+    if response is None:
+        return paper_dict
+
+    result = json.loads(response.content)
+
+    # Nested objects can be present but null in the JSON, in which case
+    # ``dict.get(key, {})`` would hand back None; normalise them to dicts.
+    journal = result.get("journal") or {}
+    external_ids = result.get("externalIds") or {}
+    open_access_pdf = result.get("openAccessPdf") or {}
+    bib = result.get("bib") or {}
+
+    page_str = journal.get("pages")
+    if page_str:
+        try:
+            start_page, end_page = (
+                int(part.strip()) for part in str(page_str).split("-", 1)
+            )
+        except ValueError:
+            # Not a plain "start-end" range (single page, "e123", ...);
+            # leave "pages" unset so the default below applies.
+            pass
+        else:
+            if end_page >= start_page:
+                # Inclusive range: pages 1-10 are ten pages, not nine.
+                paper_dict["pages"] = end_page - start_page + 1
+
+    paper_dict["semantic_id"] = result.get("paperId")
+    paper_dict["doi_id"] = external_ids.get("DOI")
+    paper_dict["arxiv_id"] = external_ids.get("ArXiv")
+    paper_dict["pubmed_id"] = external_ids.get("PubMed")
+    paper_dict["corpus_id"] = external_ids.get("CorpusId")
+    paper_dict["semantic_title"] = result.get("title")
+    paper_dict["first_publish_year"] = result.get("year")
+    paper_dict["publish_date"] = datetime.strptime(
+        result.get("publicationDate") or "1950-01-01", "%Y-%m-%d"
+    )
+    paper_dict["abstract"] = result.get("abstract")
+    # NOTE(review): the detail URL does not request a "bib" field, so this
+    # is presumably always None - confirm which API field should feed tldr.
+    paper_dict["tldr"] = bib.get("abstract")
+    paper_dict["journal"] = journal.get("name")
+    paper_dict["journal_volume"] = journal.get("volume")
+    paper_dict["openaccess_pdf_url"] = open_access_pdf.get("url")
+    paper_dict["run_time_seconds"] = paper_dict.get("pages", 10) * getattr(
+        settings, "AVERAGE_PAGE_READING_SECONDS", 60
+    )
+    paper_dict["author_dicts"] = result.get("authors")
+    paper_dict["genres"] = result.get("fieldsOfStudy")
+
+    return paper_dict

+ 2 - 0
vrobbler/apps/scrobbles/admin.py

@@ -20,7 +20,9 @@ class ScrobbleInline(admin.TabularInline):
         "track",
         "video_game",
         "book",
+        "paper",
         "sport_event",
+        "food",
         "board_game",
         "geo_location",
         "task",

+ 52 - 0
vrobbler/apps/scrobbles/migrations/0068_scrobble_paper_alter_scrobble_media_type.py

@@ -0,0 +1,52 @@
+# Generated by Django 4.2.19 on 2025-02-18 05:27
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+    """Add the optional ``scrobble.paper`` foreign key to ``books.paper``.
+
+    Also redeclares the ``scrobble.media_type`` choices, which include a
+    ``"Paper"`` entry.
+    """
+
+    dependencies = [
+        ("books", "0025_remove_author_amazon_id_and_more"),
+        ("scrobbles", "0067_scrobble_food_alter_scrobble_media_type"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="scrobble",
+            name="paper",
+            field=models.ForeignKey(
+                blank=True,
+                null=True,
+                on_delete=django.db.models.deletion.DO_NOTHING,
+                to="books.paper",
+            ),
+        ),
+        migrations.AlterField(
+            model_name="scrobble",
+            name="media_type",
+            field=models.CharField(
+                choices=[
+                    ("Video", "Video"),
+                    ("Track", "Track"),
+                    ("PodcastEpisode", "Podcast episode"),
+                    ("SportEvent", "Sport event"),
+                    ("Book", "Book"),
+                    ("Paper", "Paper"),
+                    ("VideoGame", "Video game"),
+                    ("BoardGame", "Board game"),
+                    ("GeoLocation", "GeoLocation"),
+                    ("Trail", "Trail"),
+                    ("Beer", "Beer"),
+                    ("Food", "Food"),
+                    ("Task", "Task"),
+                    ("WebPage", "Web Page"),
+                    ("LifeEvent", "Life event"),
+                    ("Mood", "Mood"),
+                    ("BrickSet", "Brick set"),
+                ],
+                default="Video",
+                max_length=14,
+            ),
+        ),
+    ]

+ 3 - 1
vrobbler/apps/scrobbles/models.py

@@ -10,7 +10,7 @@ import pytz
 from beers.models import Beer
 from boardgames.models import BoardGame
 from books.koreader import process_koreader_sqlite_file
-from books.models import Book
+from books.models import Book, Paper
 from bricksets.models import BrickSet
 from django.conf import settings
 from django.contrib.auth import get_user_model
@@ -502,6 +502,7 @@ class Scrobble(TimeStampedModel):
         PODCAST_EPISODE = "PodcastEpisode", "Podcast episode"
         SPORT_EVENT = "SportEvent", "Sport event"
         BOOK = "Book", "Book"
+        PAPER = "Paper", "Paper"
         VIDEO_GAME = "VideoGame", "Video game"
         BOARD_GAME = "BoardGame", "Board game"
         GEO_LOCATION = "GeoLocation", "GeoLocation"
@@ -524,6 +525,7 @@ class Scrobble(TimeStampedModel):
         SportEvent, on_delete=models.DO_NOTHING, **BNULL
     )
     book = models.ForeignKey(Book, on_delete=models.DO_NOTHING, **BNULL)
+    paper = models.ForeignKey(Paper, on_delete=models.DO_NOTHING, **BNULL)
     video_game = models.ForeignKey(
         VideoGame, on_delete=models.DO_NOTHING, **BNULL
     )

Một số tệp không được hiển thị vì có quá nhiều tệp thay đổi trong diff này