| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536 |
- from collections import OrderedDict
- from dataclasses import dataclass
- import logging
- from datetime import datetime
- from typing import Optional
- from uuid import uuid4
- import requests
- from books.openlibrary import (
- lookup_author_from_openlibrary,
- lookup_book_from_openlibrary,
- )
- from django.conf import settings
- from django.contrib.auth import get_user_model
- from django.core.files.base import ContentFile
- from django.db import models
- from django.urls import reverse
- from django_extensions.db.models import TimeStampedModel
- from imagekit.models import ImageSpecField
- from imagekit.processors import ResizeToFit
- from scrobbles.mixins import (
- LongPlayScrobblableMixin,
- ObjectWithGenres,
- ScrobblableConstants,
- )
- from scrobbles.utils import get_scrobbles_for_media
- from taggit.managers import TaggableManager
- from thefuzz import fuzz
- from vrobbler.apps.books.sources.comicvine import (
- ComicVineClient,
- lookup_comic_from_comicvine,
- )
- from vrobbler.apps.books.locg import (
- lookup_comic_by_locg_slug,
- lookup_comic_from_locg,
- lookup_comic_writer_by_locg_slug,
- )
- from books.sources.google import lookup_book_from_google
- from books.sources.semantic import lookup_paper_from_semantic
- from scrobbles.dataclasses import BaseLogData, LongPlayLogData
# ComicVine API key read from Django settings; empty string when not configured.
COMICVINE_API_KEY = getattr(settings, "COMICVINE_API_KEY", "")

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# The user model active for this Django project.
User = get_user_model()

# Keyword arguments shared by every optional model field in this module.
BNULL = dict(blank=True, null=True)
@dataclass
class BookPageLogData(BaseLogData):
    """Log data describing a single page of a book.

    Every field is optional and defaults to ``None``.
    """

    # Number of the page this record describes.
    page_number: Optional[int] = None
    # When reading of the page ended -- presumably Unix epoch seconds; TODO confirm.
    end_ts: Optional[int] = None
    # When reading of the page started -- presumably Unix epoch seconds; TODO confirm.
    start_ts: Optional[int] = None
    # Time spent on the page -- presumably seconds; TODO confirm.
    duration: Optional[int] = None
@dataclass
class BookLogData(BaseLogData, LongPlayLogData):
    """Log data attached to a scrobble of a book.

    ``page_data`` maps a page number to the per-page record for that page.
    """

    koreader_hash: Optional[str] = None
    page_data: Optional[dict[int, BookPageLogData]] = None
    pages_read: Optional[int] = None
    page_start: Optional[int] = None
    page_end: Optional[int] = None

    # Not annotated, so the dataclass machinery does not treat it as a field.
    _excluded_fields = {"koreader_hash", "page_data"}

    def avg_seconds_per_page(self) -> Optional[int]:
        """Return the mean ``duration`` over all pages in ``page_data``.

        Returns ``None`` when there is no page data or the summed duration is
        zero. Pages whose duration is missing or ``None`` count as zero but
        still count toward the number of pages.
        """
        if not self.page_data:
            return None

        total_duration = 0
        for stats in self.page_data.values():
            # Per-page records may be plain dicts or BookPageLogData
            # instances; read the duration from either shape.
            if isinstance(stats, dict):
                duration = stats.get("duration")
            else:
                duration = getattr(stats, "duration", None)
            # A key that is present but null must not break the sum.
            total_duration += duration or 0

        if not total_duration:
            return None
        return int(total_duration / len(self.page_data))
class Author(TimeStampedModel):
    """A person credited as the author of a book or paper."""

    # Seconds to wait on the remote server when downloading a headshot, so a
    # stalled connection cannot block the caller indefinitely.
    IMAGE_REQUEST_TIMEOUT = 10

    name = models.CharField(max_length=255)
    uuid = models.UUIDField(default=uuid4, editable=False, **BNULL)
    openlibrary_id = models.CharField(max_length=255, **BNULL)
    headshot = models.ImageField(upload_to="books/authors/", **BNULL)
    # Resized JPEG renditions derived from ``headshot``.
    headshot_small = ImageSpecField(
        source="headshot",
        processors=[ResizeToFit(100, 100)],
        format="JPEG",
        options={"quality": 60},
    )
    headshot_medium = ImageSpecField(
        source="headshot",
        processors=[ResizeToFit(300, 300)],
        format="JPEG",
        options={"quality": 75},
    )
    bio = models.TextField(**BNULL)
    wikipedia_url = models.CharField(max_length=255, **BNULL)
    wikidata_id = models.CharField(max_length=255, **BNULL)
    isni = models.CharField(max_length=255, **BNULL)
    goodreads_id = models.CharField(max_length=255, **BNULL)
    comicvine_data = models.JSONField(**BNULL)
    semantic_id = models.CharField(max_length=50, **BNULL)

    def __str__(self):
        return f"{self.name}"

    def enrich_from_semantic(self, overwrite=False):
        """Placeholder: not implemented yet."""
        ...

    def enrich_from_google_books(self, overwrite=False):
        """Placeholder: not implemented yet."""
        ...

    def enrich_from_openlibrary(self, overwrite=False):
        """Fill in this author's fields from an OpenLibrary lookup.

        The author is looked up by ``self.openlibrary_id``. When the lookup
        yields nothing, or a result without a name, a warning is logged and
        nothing changes. Otherwise every remaining key of the result is
        written to this row with a queryset ``update`` and, when a headshot
        URL was returned and downloads with HTTP 200, the image is stored in
        ``headshot``.

        ``overwrite`` is accepted but not consulted by this method.
        """
        data_dict = lookup_author_from_openlibrary(self.openlibrary_id)
        if not data_dict or not data_dict.get("name"):
            logger.warning("Could not find author on openlibrary")
            return

        # The headshot URL is not a model field; remove it before the update.
        headshot_url = data_dict.pop("author_headshot_url", "")
        Author.objects.filter(pk=self.id).update(**data_dict)
        self.refresh_from_db()

        if not headshot_url:
            return

        # Without a timeout, requests waits forever on an unresponsive host.
        response = requests.get(
            headshot_url, timeout=self.IMAGE_REQUEST_TIMEOUT
        )
        if response.status_code == 200:
            fname = f"{self.name}_{self.uuid}.jpg"
            self.headshot.save(fname, ContentFile(response.content), save=True)
class Book(LongPlayScrobblableMixin):
    """A book (or comic issue) that can be scrobbled as long-play media.

    Metadata is filled in from the Google Books, OpenLibrary, LOCG and
    ComicVine lookup helpers imported by this module.
    """

    COMPLETION_PERCENT = getattr(settings, "BOOK_COMPLETION_PERCENT", 95)
    AVG_PAGE_READING_SECONDS = getattr(
        settings, "AVERAGE_PAGE_READING_SECONDS", 60
    )
    # Seconds to wait on a remote server when downloading an image, so a
    # stalled connection cannot block the caller indefinitely.
    IMAGE_REQUEST_TIMEOUT = 10

    title = models.CharField(max_length=255)
    original_title = models.CharField(max_length=255, **BNULL)
    authors = models.ManyToManyField(Author, blank=True)
    koreader_data_by_hash = models.JSONField(**BNULL)
    isbn_13 = models.CharField(max_length=255, **BNULL)
    isbn_10 = models.CharField(max_length=255, **BNULL)
    pages = models.IntegerField(**BNULL)
    language = models.CharField(max_length=4, **BNULL)
    first_publish_year = models.IntegerField(**BNULL)
    publish_date = models.DateField(**BNULL)
    publisher = models.CharField(max_length=255, **BNULL)
    first_sentence = models.TextField(**BNULL)

    # ComicVine
    comicvine_id = models.CharField(max_length=255, **BNULL)
    # NOTE(review): Django ignores max_length on IntegerField (system check
    # fields.W122). Left untouched here because removing it would presumably
    # require a migration -- TODO confirm and drop.
    issue_number = models.IntegerField(max_length=5, **BNULL)
    volume_number = models.IntegerField(max_length=5, **BNULL)

    # OpenLibrary
    openlibrary_id = models.CharField(max_length=255, **BNULL)
    cover = models.ImageField(upload_to="books/covers/", **BNULL)
    # Resized JPEG renditions derived from ``cover``.
    cover_small = ImageSpecField(
        source="cover",
        processors=[ResizeToFit(100, 100)],
        format="JPEG",
        options={"quality": 60},
    )
    cover_medium = ImageSpecField(
        source="cover",
        processors=[ResizeToFit(300, 300)],
        format="JPEG",
        options={"quality": 75},
    )

    summary = models.TextField(**BNULL)

    genre = TaggableManager(through=ObjectWithGenres)

    def __str__(self):
        return f"{self.title}"

    @property
    def subtitle(self):
        """Return `` by <author>``, or an empty string when no author is set."""
        author = self.author
        # Avoid rendering the literal text " by None" for author-less books.
        return f" by {author}" if author else ""

    @property
    def strings(self) -> ScrobblableConstants:
        return ScrobblableConstants(verb="Reading", tags="book")

    @property
    def logdata_cls(self):
        return BookLogData

    @property
    def primary_image_url(self) -> str:
        """Return the medium cover URL, or an empty string without a cover."""
        url = ""
        if self.cover:
            url = self.cover_medium.url
        return url

    def get_absolute_url(self):
        return reverse("books:book_detail", kwargs={"slug": self.uuid})

    def _fetch_image(self, url: str) -> Optional[ContentFile]:
        """Download ``url``; return its body as a ContentFile, or None unless HTTP 200."""
        response = requests.get(url, timeout=self.IMAGE_REQUEST_TIMEOUT)
        if response.status_code != 200:
            return None
        return ContentFile(response.content)

    @classmethod
    def get_from_comicvine(
        cls, title: str, overwrite: bool = False, force_new: bool = False
    ) -> "Book":
        """Get or create a Book by title and enrich it from ComicVine.

        An existing book is returned untouched unless ``overwrite`` or
        ``force_new`` is set; in that case a book whose ``original_title``
        equals ``title`` takes precedence over the plain title match. A newly
        created book, or any book when ``overwrite`` is true, has the
        ComicVine result copied onto it and is saved. When ComicVine returns
        nothing the book is returned as-is.
        """
        book, created = cls.objects.get_or_create(title=title)

        if not created and not overwrite and not force_new:
            # Only *look* for an original-title match. A get_or_create here
            # would insert a blank-titled row whenever none exists.
            by_original_title = cls.objects.filter(
                original_title=title
            ).first()
            if by_original_title:
                logger.info(
                    "Found comic by original title, use force_new=True to override"
                )
                return by_original_title
            return book

        book_dict = lookup_comic_from_comicvine(title)
        if not book_dict:
            logger.warning(
                "Could not find comic on ComicVine", extra={"title": title}
            )
            return book

        if created or overwrite:
            # Remove relation data first so the setattr loop below only sees
            # plain attributes.
            author_dicts = book_dict.pop("author_dicts", None) or []
            genres = book_dict.pop("genres", None) or []

            author_list = []
            for author_dict in author_dicts:
                # NOTE(review): authors are keyed on "authorId"/semantic_id,
                # mirroring Paper.get_from_semantic -- confirm this matches
                # the shape of the ComicVine lookup result.
                author_id = author_dict.get("authorId")
                if not author_id:
                    continue
                author, a_created = Author.objects.get_or_create(
                    semantic_id=author_id
                )
                author_list.append(author)
                if a_created and author_dict.get("name"):
                    author.name = author_dict.get("name")
                    author.save()
                    # TODO enrich author?

            for k, v in book_dict.items():
                setattr(book, k, v)
            book.save()

            if author_list:
                book.authors.add(*author_list)
            if genres:
                book.genre.add(*genres)

        return book

    @classmethod
    def find_or_create(
        cls, title: str, enrich: bool = False, commit: bool = True
    ):
        """Given a title, get a Book instance.

        If the book is not already in our database, or ``enrich`` is True,
        the book is enriched with data from Google Books, falling back to
        ComicVine when Google has no result or the result has no ISBN-10.

        By default the enriched data, cover, genres and authors are saved.
        With ``commit=False`` the enriched attributes are only set on the
        returned instance and nothing further is written. The row itself
        always exists, because it is obtained with ``get_or_create``.
        """
        # TODO use either a Google Books id identifier or author name like for tracks
        book, created = cls.objects.get_or_create(title=title)
        if not created:
            logger.info(
                "Found exact match for book by title", extra={"title": title}
            )
            if not enrich:
                logger.info(
                    "Found book by title, but not enriching",
                    extra={"title": title},
                )
                return book

        book_dict = lookup_book_from_google(title)
        # Try ComicVine only when Google has no usable match. The previous
        # condition was inverted and discarded every Google hit that had an
        # ISBN-10.
        if not book_dict or not book_dict.get("isbn_10"):
            comic_dict = lookup_comic_from_comicvine(title)
            if comic_dict:
                book_dict = comic_dict

        if not book_dict:
            logger.warning(
                "Book not found in any sources", extra={"title": title}
            )
            return book

        authors = book_dict.pop("authors", None) or []
        cover_url = book_dict.pop("cover_url", None) or ""
        # The key was historically misspelled "generes"; accept both.
        misspelled_genres = book_dict.pop("generes", None)
        genres = book_dict.pop("genres", None) or misspelled_genres or []

        author_list = []
        for author_str in authors:
            if not author_str:
                continue
            author, _ = Author.objects.get_or_create(name=author_str)
            author_list.append(author)
            # TODO enrich newly created authors

        for k, v in book_dict.items():
            setattr(book, k, v)

        if commit:
            book.save()
            book.save_image_from_url(cover_url)
            if genres:
                book.genre.add(*genres)
            if author_list:
                book.authors.add(*author_list)

        return book

    def save_image_from_url(self, url: str, force_update: bool = False):
        """Download ``url`` and store it as this book's cover.

        Does nothing when ``url`` is empty, or when a cover already exists
        and ``force_update`` is false. The cover is saved only on HTTP 200.
        """
        if not url:
            # An empty URL would make requests raise instead of skipping.
            return
        if self.cover and not force_update:
            return

        image = self._fetch_image(url)
        if image is not None:
            fname = f"{self.title}_{self.uuid}.jpg"
            self.cover.save(fname, image, save=True)

    def fix_metadata(self, data: Optional[dict] = None, force_update=False):
        """Refresh this book's metadata from the external sources.

        Nothing happens when the book already has both an OpenLibrary id and
        a LOCG slug, unless ``force_update`` is true. When ``data`` is empty
        the sources are tried in order -- OpenLibrary, LOCG, then ComicVine
        (only with an API key configured) -- until one returns a result.

        If the looked-up title is too different from the stored title and
        ``force_update`` is false, only ``openlibrary_id`` is stored and a
        warning asks for a manual check. Otherwise the remaining keys are
        written to the row and authors, genres, cover and run time are
        updated.

        ``data`` is copied, so the caller's dict is never modified.
        """
        # NOTE(review): no ``locg_slug`` field is declared on this model in
        # this module; read it defensively in case the mixin provides it.
        locg_slug = getattr(self, "locg_slug", None)
        if self.openlibrary_id and locg_slug and not force_update:
            return

        data = dict(data) if data else {}

        author = self.author
        author_name = author.name if author else ""

        if not data:
            logger.warning(f"Checking openlibrary for {self.title}")
            if self.openlibrary_id and force_update:
                data = lookup_book_from_openlibrary(str(self.openlibrary_id))
            else:
                data = lookup_book_from_openlibrary(
                    str(self.title), author_name
                )

        if not data:
            if locg_slug:
                logger.warning(
                    f"Checking LOCG for {self.title} with slug {locg_slug}"
                )
                data = lookup_comic_by_locg_slug(str(locg_slug))
            else:
                logger.warning(f"Checking LOCG for {self.title}")
                data = lookup_comic_from_locg(str(self.title))

        if not data and COMICVINE_API_KEY:
            logger.warning(f"Checking ComicVine for {self.title}")
            data = lookup_comic_from_comicvine(str(self.title))

        if not data:
            logger.warning(f"Book not found in any sources: {self.title}")
            return

        # We can discard the author name from OL for now, we'll lookup details below
        data.pop("ol_author_name", "")

        # Always remove the author keys so they cannot reach update() below,
        # where an unknown keyword would raise.
        ol_author_id = data.pop("ol_author_id", "")
        if ol_author_id:
            self.fix_authors_metadata(ol_author_id)

        locg_writer_slug = data.pop("locg_writer_slug", "")
        if locg_writer_slug:
            self.get_author_from_locg(locg_writer_slug)

        ol_title = data.get("title") or ""

        # Kick out a little warning if we're about to change KoReader's title
        titles_disagree = (
            fuzz.ratio(str(ol_title).lower(), str(self.title).lower()) < 80
        )
        if titles_disagree and not force_update:
            logger.warning(
                f"OL and KoReader disagree on this book title {self.title} != {ol_title}, check manually"
            )
            self.openlibrary_id = data.get("openlibrary_id")
            self.save(update_fields=["openlibrary_id"])
            return

        # If we don't know pages, don't overwrite existing with None; also
        # drop any page count that is not an int.
        if "pages" in data and not isinstance(data["pages"], int):
            if data["pages"] is not None:
                logger.info(
                    f"Pages for {self} from OL expected to be int, but got {data.get('pages')}"
                )
            data.pop("pages")

        # Pop these, so we can handle them after the bulk update
        cover_url = data.pop("cover_url", "")
        subject_key_list = data.pop("subject_key_list", "")

        # Fun trick for updating all fields at once
        Book.objects.filter(pk=self.id).update(**data)
        self.refresh_from_db()

        if subject_key_list:
            self.genre.add(*subject_key_list)

        if cover_url:
            image = self._fetch_image(cover_url)
            if image is not None:
                fname = f"{self.title}_{self.uuid}.jpg"
                self.cover.save(fname, image, save=True)

        if self.pages:
            self.run_time_seconds = int(self.pages) * int(
                self.AVG_PAGE_READING_SECONDS
            )
            self.save()

    def fix_authors_metadata(self, openlibrary_author_id):
        """Attach the author with this OpenLibrary id, creating it if needed.

        A missing author is created from an OpenLibrary lookup, including a
        headshot when one is offered and downloads with HTTP 200. When the
        lookup returns nothing a warning is logged and no author is attached.
        """
        author = Author.objects.filter(
            openlibrary_id=openlibrary_author_id
        ).first()

        if not author:
            data = lookup_author_from_openlibrary(openlibrary_author_id)
            if not data:
                logger.warning("Could not find author on openlibrary")
                return

            author_image_url = data.pop("author_headshot_url", None)
            author = Author.objects.create(**data)

            if author_image_url:
                image = self._fetch_image(author_image_url)
                if image is not None:
                    fname = f"{author.name}_{author.uuid}.jpg"
                    author.headshot.save(fname, image, save=True)

        self.authors.add(author)

    def get_author_from_locg(self, locg_slug):
        """Attach the comic writer identified by ``locg_slug`` as an author.

        The writer is looked up on LOCG and matched to an Author by name,
        creating one when needed. A headshot is downloaded when the author is
        new or has none and the writer record offers a photo URL.
        """
        writer = lookup_comic_writer_by_locg_slug(locg_slug)
        if not writer or not writer.get("name"):
            logger.warning(f"Could not find writer on LOCG for slug {locg_slug}")
            return

        # Author declares no ``locg_slug`` field in this module, so filtering
        # on it would raise; match on the name alone.
        author, created = Author.objects.get_or_create(name=writer["name"])

        photo_url = writer.get("photo_url")
        if (created or not author.headshot) and photo_url:
            image = self._fetch_image(photo_url)
            if image is not None:
                fname = f"{author.name}_{author.uuid}.jpg"
                author.headshot.save(fname, image, save=True)

        self.authors.add(author)

    def page_data_for_user(
        self, user_id: int, convert_timestamps: bool = True
    ) -> dict:
        """Collect per-page log data from this user's scrobbles of the book.

        Returns an ordered mapping of page to page record, sorted by each
        record's ``start_ts``. When a page appears in several scrobbles the
        record seen last wins. With ``convert_timestamps`` the ``start_ts``
        and ``end_ts`` values are converted with ``datetime.fromtimestamp``.
        """
        pages = {}
        for scrobble in self.scrobble_set.filter(user=user_id):
            page_data = scrobble.logdata.page_data
            if not page_data:
                continue
            for page, data in page_data.items():
                # Work on a copy so the stored log data keeps its raw
                # timestamps and a second call does not re-convert them.
                entry = dict(data)
                if convert_timestamps:
                    entry["start_ts"] = datetime.fromtimestamp(
                        entry["start_ts"]
                    )
                    entry["end_ts"] = datetime.fromtimestamp(entry["end_ts"])
                pages[page] = entry

        return OrderedDict(
            sorted(pages.items(), key=lambda item: item[1]["start_ts"])
        )

    @property
    def author(self):
        """Return the first related author, or None when there is none."""
        return self.authors.first()

    @property
    def pages_for_completion(self) -> int:
        """Return the page count that marks the book as complete.

        This is ``COMPLETION_PERCENT`` percent of ``pages``, or 0 when the
        page count is unknown.
        """
        if not self.pages:
            logger.warning(f"{self} has no pages, no completion percentage")
            return 0
        return int(self.pages * (self.COMPLETION_PERCENT / 100))

    def update_long_play_seconds(self):
        """Check page timestamps and duration and update"""
        # Not implemented yet: the body is still a placeholder.
        if self.page_set.all():
            ...

    def progress_for_user(self, user_id: int) -> int:
        """Used to keep track of whether the book is complete or not

        Returns the last page read in the user's latest scrobble as a whole
        percentage of ``pages``, or 0 when there is no scrobble or the page
        count is unknown.
        """
        user = User.objects.get(id=user_id)
        last_scrobble = get_scrobbles_for_media(self, user).last()

        # Without a page count the ratio is undefined (None or zero divisor).
        if not last_scrobble or not self.pages:
            return 0

        last_page_read = last_scrobble.last_page_read or 0
        return int((last_page_read / self.pages) * 100)
class Paper(LongPlayScrobblableMixin):
    """Keeps track of Academic Papers"""

    COMPLETION_PERCENT = getattr(settings, "PAPER_COMPLETION_PERCENT", 60)
    AVG_PAGE_READING_SECONDS = getattr(
        settings, "AVERAGE_PAGE_READING_SECONDS", 60
    )

    title = models.CharField(max_length=255)
    semantic_title = models.CharField(max_length=255, **BNULL)
    authors = models.ManyToManyField(Author, blank=True)
    koreader_data_by_hash = models.JSONField(**BNULL)
    semantic_id = models.CharField(max_length=50, **BNULL)
    # Previously declared twice with identical arguments; the later
    # declaration was the effective one, so its position is kept.
    arxiv_id = models.CharField(max_length=50, **BNULL)
    corpus_id = models.CharField(max_length=50, **BNULL)
    doi_id = models.CharField(max_length=50, **BNULL)
    pages = models.IntegerField(**BNULL)
    language = models.CharField(max_length=4, **BNULL)
    first_publish_year = models.IntegerField(**BNULL)
    publish_date = models.DateField(**BNULL)
    journal = models.CharField(max_length=255, **BNULL)
    journal_volume = models.CharField(max_length=50, **BNULL)
    abstract = models.TextField(**BNULL)
    tldr = models.CharField(max_length=255, **BNULL)
    openaccess_pdf_url = models.CharField(max_length=255, **BNULL)

    genre = TaggableManager(through=ObjectWithGenres)

    @classmethod
    def get_from_semantic(cls, title: str, overwrite: bool = False) -> "Paper":
        """Get or create a Paper by title and enrich it from the semantic lookup.

        An existing paper is returned untouched unless ``overwrite`` is true.
        Otherwise the lookup result is copied onto the paper, which is saved
        and linked to its authors and genres. When the lookup returns nothing
        the paper is returned as-is.
        """
        paper, created = cls.objects.get_or_create(title=title)
        if not created and not overwrite:
            return paper

        paper_dict = lookup_paper_from_semantic(title)
        if not paper_dict:
            logger.warning(
                "Could not find paper via semantic lookup",
                extra={"title": title},
            )
            return paper

        # Remove relation data first so the setattr loop below only sees
        # plain attributes.
        author_dicts = paper_dict.pop("author_dicts", None) or []
        genres = paper_dict.pop("genres", None) or []

        author_list = []
        for author_dict in author_dicts:
            author_id = author_dict.get("authorId")
            if not author_id:
                continue
            author, a_created = Author.objects.get_or_create(
                semantic_id=author_id
            )
            author_list.append(author)
            if a_created and author_dict.get("name"):
                author.name = author_dict.get("name")
                author.save()
                # TODO enrich author?

        for k, v in paper_dict.items():
            setattr(paper, k, v)
        paper.save()

        if author_list:
            paper.authors.add(*author_list)
        if genres:
            paper.genre.add(*genres)

        return paper
|