secstate
/
vrobbler


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552
							import logging
from collections import OrderedDict
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
from uuid import uuid4

import requests
from books.constants import READCOMICSONLINE_URL
from books.openlibrary import (
    lookup_author_from_openlibrary,
    lookup_book_from_openlibrary,
)
from books.sources.google import lookup_book_from_google
from books.sources.semantic import lookup_paper_from_semantic
from books.utils import get_comic_issue_url
from django.conf import settings
from django.contrib.auth import get_user_model
from django.core.files.base import ContentFile
from django.db import models
from django.urls import reverse
from django_extensions.db.models import TimeStampedModel
from imagekit.models import ImageSpecField
from imagekit.processors import ResizeToFit
from scrobbles.dataclasses import BaseLogData, LongPlayLogData
from scrobbles.mixins import (
    LongPlayScrobblableMixin,
    ObjectWithGenres,
    ScrobblableConstants,
)
from scrobbles.utils import get_scrobbles_for_media, next_url_if_exists
from taggit.managers import TaggableManager
from thefuzz import fuzz

from vrobbler.apps.books.locg import (
    lookup_comic_by_locg_slug,
    lookup_comic_from_locg,
    lookup_comic_writer_by_locg_slug,
)
from vrobbler.apps.books.sources.comicvine import (
    ComicVineClient,
    lookup_comic_from_comicvine,
)

COMICVINE_API_KEY = getattr(settings, "COMICVINE_API_KEY", "")

logger = logging.getLogger(__name__)
User = get_user_model()
BNULL = {"blank": True, "null": True}


@dataclass
class BookPageLogData(BaseLogData):
    page_number: Optional[int] = None
    end_ts: Optional[int] = None
    start_ts: Optional[int] = None
    duration: Optional[int] = None


@dataclass
class BookLogData(BaseLogData, LongPlayLogData):
    koreader_hash: Optional[str] = None
    page_data: Optional[dict[int, BookPageLogData]] = None
    pages_read: Optional[int] = None
    page_start: Optional[int] = None
    page_end: Optional[int] = None
    resume_url: Optional[str] = None

    _excluded_fields = {"koreader_hash", "page_data"}

    def avg_seconds_per_page(self):
        if self.page_data:
            total_duration = 0
            for page_num, stats in self.page_data.items():
                total_duration += stats.get("duration", 0)
            if total_duration:
                return int(total_duration / len(self.page_data))


class Author(TimeStampedModel):
    name = models.CharField(max_length=255)
    uuid = models.UUIDField(default=uuid4, editable=False, **BNULL)
    openlibrary_id = models.CharField(max_length=255, **BNULL)
    headshot = models.ImageField(upload_to="books/authors/", **BNULL)
    headshot_small = ImageSpecField(
        source="headshot",
        processors=[ResizeToFit(100, 100)],
        format="JPEG",
        options={"quality": 60},
    )
    headshot_medium = ImageSpecField(
        source="headshot",
        processors=[ResizeToFit(300, 300)],
        format="JPEG",
        options={"quality": 75},
    )
    bio = models.TextField(**BNULL)
    wikipedia_url = models.CharField(max_length=255, **BNULL)
    wikidata_id = models.CharField(max_length=255, **BNULL)
    isni = models.CharField(max_length=255, **BNULL)
    goodreads_id = models.CharField(max_length=255, **BNULL)
    comicvine_data = models.JSONField(**BNULL)

    semantic_id = models.CharField(max_length=50, **BNULL)

    def __str__(self):
        return f"{self.name}"

    def enrich_from_semantic(self, overwrite=False):
        ...

    def enrich_from_google_books(self, overwrite=False):
        ...

    def enrich_from_openlibrary(self, overwrite=False):
        data_dict = lookup_author_from_openlibrary(self.openlibrary_id)

        if not data_dict or not data_dict.get("name"):
            logger.warning("Could not find author on openlibrary")
            return

        headshot_url = data_dict.pop("author_headshot_url", "")

        Author.objects.filter(pk=self.id).update(**data_dict)
        self.refresh_from_db()

        if headshot_url:
            r = requests.get(headshot_url)
            if r.status_code == 200:
                fname = f"{self.name}_{self.uuid}.jpg"
                self.headshot.save(fname, ContentFile(r.content), save=True)


class Book(LongPlayScrobblableMixin):
    COMPLETION_PERCENT = getattr(settings, "BOOK_COMPLETION_PERCENT", 95)
    AVG_PAGE_READING_SECONDS = getattr(
        settings, "AVERAGE_PAGE_READING_SECONDS", 60
    )

    title = models.CharField(max_length=255)
    original_title = models.CharField(max_length=255, **BNULL)
    authors = models.ManyToManyField(Author, blank=True)
    koreader_data_by_hash = models.JSONField(**BNULL)
    isbn_13 = models.CharField(max_length=255, **BNULL)
    isbn_10 = models.CharField(max_length=255, **BNULL)
    pages = models.IntegerField(**BNULL)
    language = models.CharField(max_length=4, **BNULL)
    first_publish_year = models.IntegerField(**BNULL)
    publish_date = models.DateField(**BNULL)
    publisher = models.CharField(max_length=255, **BNULL)
    first_sentence = models.TextField(**BNULL)
    # ComicVine
    comicvine_id = models.CharField(max_length=255, **BNULL)
    readcomics_url = models.CharField(max_length=255, **BNULL)
    next_readcomics_url = models.CharField(max_length=255, **BNULL)
    issue_number = models.IntegerField(**BNULL)
    volume_number = models.IntegerField(**BNULL)
    # OpenLibrary
    openlibrary_id = models.CharField(max_length=255, **BNULL)
    cover = models.ImageField(upload_to="books/covers/", **BNULL)
    cover_small = ImageSpecField(
        source="cover",
        processors=[ResizeToFit(100, 100)],
        format="JPEG",
        options={"quality": 60},
    )
    cover_medium = ImageSpecField(
        source="cover",
        processors=[ResizeToFit(300, 300)],
        format="JPEG",
        options={"quality": 75},
    )
    summary = models.TextField(**BNULL)

    genre = TaggableManager(through=ObjectWithGenres)

    def __str__(self) -> str:
        if self.issue_number and "Issue" not in str(self.title):
            return f"{self.title} - Issue {self.issue_number}"
        if self.volume_number and "Volume" not in str(self.title):
            return f"{self.title} - Volume {self.volume_number}"
        return f"{self.title}"

    @property
    def subtitle(self):
        return f" by {self.author}"

    @property
    def strings(self) -> ScrobblableConstants:
        return ScrobblableConstants(verb="Reading", tags="book")

    @property
    def logdata_cls(self):
        return BookLogData

    @property
    def primary_image_url(self) -> str:
        url = ""
        if self.cover:
            url = self.cover_medium.url
        return url

    def get_absolute_url(self):
        return reverse("books:book_detail", kwargs={"slug": self.uuid})

    @classmethod
    def get_from_comicvine(cls, title: str, overwrite: bool = False, force_new: bool =False) -> "Book":
        book, created = cls.objects.get_or_create(title=title)

        if not created:
            return book

        book_dict = lookup_comic_from_comicvine(title)

        if created or overwrite:
            author_list = []
            author_dicts = book_dict.pop("author_dicts")
            if author_dicts:
                for author_dict in author_dicts:
                    if author_dict.get("authorId"):
                        author, a_created = Author.objects.get_or_create(
                            semantic_id=author_dict.get("authorId")
                        )
                        author_list.append(author)
                        if a_created:
                            author.name = author_dict.get("name")
                            author.save()
                            # TODO enrich author?
                            ...

            for k, v in book_dict.items():
                setattr(book, k, v)
            book.save()

            if author_list:
                book.authors.add(*author_list)
            genres = book_dict.pop("genres", [])
            if genres:
                book.genre.add(*genres)
        return book

    @classmethod
    def find_or_create(
            cls, title: str, url: str = "", enrich: bool = False, commit: bool = True
    ):
        """Given a title, get a Book instance.

        If the book is not already in our database, or overwrite is True,
        this method will enrich the Book with data from Google.

        By default this method will also save the data back to the model. If you'd
        like to batch create, use commit=False and you'll get an unsaved but enriched
        instance back which you can then save at your convenience."""
        # TODO use either a Google Books id identifier or author name like for tracks
        book, created = cls.objects.get_or_create(original_title=title)
        if not created:
            logger.info(
                "Found exact match for book by title", extra={"title": title}
            )

        if not enrich:
            logger.info(
                "Found book by title, but not enriching",
                extra={"title": title},
            )
            return book

        book_dict = None
        if READCOMICSONLINE_URL in url:
            book_dict = lookup_comic_from_comicvine(title)
            book_dict["readcomics_url"] = get_comic_issue_url(url)
            book_dict["next_readcomics_url"] = next_url_if_exists(book_dict["readcomics_url"])

        if not book_dict:
            book_dict = lookup_book_from_google(title)

        if not book_dict:
            logger.warning("No book found in any source, using data as is", extra={"title": title})

        author_list = []
        authors = book_dict.pop("authors", [])
        cover_url = book_dict.pop("cover_url", "")
        genres = book_dict.pop("generes", [])

        if authors:
            for author_str in authors:
                if author_str:
                    author, a_created = Author.objects.get_or_create(
                        name=author_str
                    )
                    author_list.append(author)
                    if a_created:
                        # TODO enrich author
                        ...

        for k, v in book_dict.items():
            setattr(book, k, v)

        if commit:
            book.save()

            book.save_image_from_url(cover_url)
            book.genre.add(*genres)
            book.authors.add(*author_list)

        return book

    def save_image_from_url(self, url: str, force_update: bool = False):
        if url and (not self.cover or force_update):
            r = requests.get(url)
            if r.status_code == 200:
                fname = f"{self.title}_{self.uuid}.jpg"
                self.cover.save(fname, ContentFile(r.content), save=True)

    def fix_metadata(self, data: dict = {}, force_update=False):
        if (not self.openlibrary_id or not self.locg_slug) or force_update:
            author_name = ""
            if self.author:
                author_name = self.author.name

            if not data:
                logger.warn(f"Checking openlibrary for {self.title}")
                if self.openlibrary_id and force_update:
                    data = lookup_book_from_openlibrary(
                        str(self.openlibrary_id)
                    )
                else:
                    data = lookup_book_from_openlibrary(
                        str(self.title), author_name
                    )

            if not data:
                if self.locg_slug:
                    logger.warn(
                        f"Checking LOCG for {self.title} with slug {self.locg_slug}"
                    )
                    data = lookup_comic_by_locg_slug(str(self.locg_slug))
                else:
                    logger.warn(f"Checking LOCG for {self.title}")
                    data = lookup_comic_from_locg(str(self.title))

            if not data and COMICVINE_API_KEY:
                logger.warn(f"Checking ComicVine for {self.title}")
                cv_client = ComicVineClient(api_key=COMICVINE_API_KEY)
                data = lookup_comic_from_comicvine(str(self.title))

            if not data:
                logger.warn(f"Book not found in any sources: {self.title}")
                return

            # We can discard the author name from OL for now, we'll lookup details below
            data.pop("ol_author_name", "")
            if data.get("ol_author_id"):
                self.fix_authors_metadata(data.pop("ol_author_id", ""))
            if data.get("locg_writer_slug"):
                self.get_author_from_locg(data.pop("locg_writer_slug", ""))

            ol_title = data.get("title", "")
            data.pop("ol_author_id", "")

            # Kick out a little warning if we're about to change KoReader's title
            if (
                fuzz.ratio(ol_title.lower(), str(self.title).lower()) < 80
                and not force_update
            ):
                logger.warn(
                    f"OL and KoReader disagree on this book title {self.title} != {ol_title}, check manually"
                )
                self.openlibrary_id = data.get("openlibrary_id")
                self.save(update_fields=["openlibrary_id"])
                return

            # If we don't know pages, don't overwrite existing with None
            if "pages" in data.keys() and data.get("pages") == None:
                data.pop("pages")

            if (
                not isinstance(data.get("pages"), int)
                and "pages" in data.keys()
            ):
                logger.info(
                    f"Pages for {self} from OL expected to be int, but got {data.get('pages')}"
                )
                data.pop("pages")

            # Pop this, so we can look it up later
            cover_url = data.pop("cover_url", "")

            subject_key_list = data.pop("subject_key_list", "")

            # Fun trick for updating all fields at once
            Book.objects.filter(pk=self.id).update(**data)
            self.refresh_from_db()

            if subject_key_list:
                self.genre.add(*subject_key_list)

            if cover_url:
                r = requests.get(cover_url)
                if r.status_code == 200:
                    fname = f"{self.title}_{self.uuid}.jpg"
                    self.cover.save(fname, ContentFile(r.content), save=True)

            if self.pages:
                self.base_run_time_seconds = int(self.pages) * int(
                    self.AVG_PAGE_READING_SECONDS
                )

            self.save()

    def fix_authors_metadata(self, openlibrary_author_id):
        author = Author.objects.filter(
            openlibrary_id=openlibrary_author_id
        ).first()
        if not author:
            data = lookup_author_from_openlibrary(openlibrary_author_id)
            author_image_url = data.pop("author_headshot_url", None)

            author = Author.objects.create(**data)

            if author_image_url:
                r = requests.get(author_image_url)
                if r.status_code == 200:
                    fname = f"{author.name}_{author.uuid}.jpg"
                    author.headshot.save(
                        fname, ContentFile(r.content), save=True
                    )
        self.authors.add(author)

    def get_author_from_locg(self, locg_slug):
        writer = lookup_comic_writer_by_locg_slug(locg_slug)

        author, created = Author.objects.get_or_create(
            name=writer["name"], locg_slug=writer["locg_slug"]
        )
        if (created or not author.headshot) and writer["photo_url"]:
            r = requests.get(writer["photo_url"])
            if r.status_code == 200:
                fname = f"{author.name}_{author.uuid}.jpg"
                author.headshot.save(fname, ContentFile(r.content), save=True)
        self.authors.add(author)

    def page_data_for_user(
        self, user_id: int, convert_timestamps: bool = True
    ) -> dict:
        scrobbles = self.scrobble_set.filter(user=user_id)

        pages = {}
        for scrobble in scrobbles:
            if scrobble.logdata.page_data:
                for page, data in scrobble.logdata.page_data.items():
                    if convert_timestamps:
                        data["start_ts"] = datetime.fromtimestamp(
                            data["start_ts"]
                        )
                        data["end_ts"] = datetime.fromtimestamp(data["end_ts"])
                    pages[page] = data
        sorted_pages = OrderedDict(
            sorted(pages.items(), key=lambda x: x[1]["start_ts"])
        )

        return sorted_pages

    @property
    def author(self):
        return self.authors.first()

    @property
    def pages_for_completion(self) -> int:
        if not self.pages:
            logger.warn(f"{self} has no pages, no completion percentage")
            return 0
        return int(self.pages * (self.COMPLETION_PERCENT / 100))

    def update_long_play_seconds(self):
        """Check page timestamps and duration and update"""
        if self.page_set.all():
            ...

    def progress_for_user(self, user_id: int) -> int:
        """Used to keep track of whether the book is complete or not"""
        user = User.objects.get(id=user_id)
        last_scrobble = get_scrobbles_for_media(self, user).last()
        progress = 0
        if last_scrobble:
            progress = int((last_scrobble.last_page_read / self.pages) * 100)
        return progress


class Paper(LongPlayScrobblableMixin):
    """Keeps track of Academic Papers"""

    COMPLETION_PERCENT = getattr(settings, "PAPER_COMPLETION_PERCENT", 60)
    AVG_PAGE_READING_SECONDS = getattr(
        settings, "AVERAGE_PAGE_READING_SECONDS", 60
    )

    title = models.CharField(max_length=255)
    semantic_title = models.CharField(max_length=255, **BNULL)
    authors = models.ManyToManyField(Author, blank=True)
    koreader_data_by_hash = models.JSONField(**BNULL)
    arxiv_id = models.CharField(max_length=50, **BNULL)
    semantic_id = models.CharField(max_length=50, **BNULL)
    arxiv_id = models.CharField(max_length=50, **BNULL)
    corpus_id = models.CharField(max_length=50, **BNULL)
    doi_id = models.CharField(max_length=50, **BNULL)
    pages = models.IntegerField(**BNULL)
    language = models.CharField(max_length=4, **BNULL)
    first_publish_year = models.IntegerField(**BNULL)
    publish_date = models.DateField(**BNULL)
    journal = models.CharField(max_length=255, **BNULL)
    journal_volume = models.CharField(max_length=50, **BNULL)
    abstract = models.TextField(**BNULL)
    tldr = models.CharField(max_length=255, **BNULL)
    openaccess_pdf_url = models.CharField(max_length=255, **BNULL)

    genre = TaggableManager(through=ObjectWithGenres)

    @classmethod
    def get_from_semantic(cls, title: str, overwrite: bool = False) -> "Paper":
        paper, created = cls.objects.get_or_create(title=title)
        if not created and not overwrite:
            return paper

        paper_dict = lookup_paper_from_semantic(title)

        if created or overwrite:
            author_list = []
            author_dicts = paper_dict.pop("author_dicts")
            if author_dicts:
                for author_dict in author_dicts:
                    if author_dict.get("authorId"):
                        author, a_created = Author.objects.get_or_create(
                            semantic_id=author_dict.get("authorId")
                        )
                        author_list.append(author)
                        if a_created:
                            author.name = author_dict.get("name")
                            author.save()
                            # TODO enrich author?
                            ...

            for k, v in paper_dict.items():
                setattr(paper, k, v)
            paper.save()

            if author_list:
                paper.authors.add(*author_list)
            genres = paper_dict.pop("genres", [])
            if genres:
                paper.genre.add(*genres)
        return paper