models.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536
  1. from collections import OrderedDict
  2. from dataclasses import dataclass
  3. import logging
  4. from datetime import datetime
  5. from typing import Optional
  6. from uuid import uuid4
  7. import requests
  8. from books.openlibrary import (
  9. lookup_author_from_openlibrary,
  10. lookup_book_from_openlibrary,
  11. )
  12. from django.conf import settings
  13. from django.contrib.auth import get_user_model
  14. from django.core.files.base import ContentFile
  15. from django.db import models
  16. from django.urls import reverse
  17. from django_extensions.db.models import TimeStampedModel
  18. from imagekit.models import ImageSpecField
  19. from imagekit.processors import ResizeToFit
  20. from scrobbles.mixins import (
  21. LongPlayScrobblableMixin,
  22. ObjectWithGenres,
  23. ScrobblableConstants,
  24. )
  25. from scrobbles.utils import get_scrobbles_for_media
  26. from taggit.managers import TaggableManager
  27. from thefuzz import fuzz
  28. from vrobbler.apps.books.sources.comicvine import (
  29. ComicVineClient,
  30. lookup_comic_from_comicvine,
  31. )
  32. from vrobbler.apps.books.locg import (
  33. lookup_comic_by_locg_slug,
  34. lookup_comic_from_locg,
  35. lookup_comic_writer_by_locg_slug,
  36. )
  37. from books.sources.google import lookup_book_from_google
  38. from books.sources.semantic import lookup_paper_from_semantic
  39. from scrobbles.dataclasses import BaseLogData, LongPlayLogData
  40. COMICVINE_API_KEY = getattr(settings, "COMICVINE_API_KEY", "")
  41. logger = logging.getLogger(__name__)
  42. User = get_user_model()
  43. BNULL = {"blank": True, "null": True}
@dataclass
class BookPageLogData(BaseLogData):
    """Reading statistics recorded for a single page of a book.

    Every field is optional and defaults to ``None``.
    """

    # Number of the page these statistics belong to.
    page_number: Optional[int] = None
    # NOTE(review): presumably Unix timestamps for when reading of the page
    # ended / started -- confirm against whatever produces this data.
    end_ts: Optional[int] = None
    start_ts: Optional[int] = None
    # NOTE(review): presumably time spent on the page, in seconds -- confirm.
    duration: Optional[int] = None
  50. @dataclass
  51. class BookLogData(BaseLogData, LongPlayLogData):
  52. koreader_hash: Optional[str] = None
  53. page_data: Optional[dict[int, BookPageLogData]] = None
  54. pages_read: Optional[int] = None
  55. page_start: Optional[int] = None
  56. page_end: Optional[int] = None
  57. _excluded_fields = {"koreader_hash", "page_data"}
  58. def avg_seconds_per_page(self):
  59. if self.page_data:
  60. total_duration = 0
  61. for page_num, stats in self.page_data.items():
  62. total_duration += stats.get("duration", 0)
  63. if total_duration:
  64. return int(total_duration / len(self.page_data))
  65. class Author(TimeStampedModel):
  66. name = models.CharField(max_length=255)
  67. uuid = models.UUIDField(default=uuid4, editable=False, **BNULL)
  68. openlibrary_id = models.CharField(max_length=255, **BNULL)
  69. headshot = models.ImageField(upload_to="books/authors/", **BNULL)
  70. headshot_small = ImageSpecField(
  71. source="headshot",
  72. processors=[ResizeToFit(100, 100)],
  73. format="JPEG",
  74. options={"quality": 60},
  75. )
  76. headshot_medium = ImageSpecField(
  77. source="headshot",
  78. processors=[ResizeToFit(300, 300)],
  79. format="JPEG",
  80. options={"quality": 75},
  81. )
  82. bio = models.TextField(**BNULL)
  83. wikipedia_url = models.CharField(max_length=255, **BNULL)
  84. wikidata_id = models.CharField(max_length=255, **BNULL)
  85. isni = models.CharField(max_length=255, **BNULL)
  86. goodreads_id = models.CharField(max_length=255, **BNULL)
  87. comicvine_data = models.JSONField(**BNULL)
  88. semantic_id = models.CharField(max_length=50, **BNULL)
  89. def __str__(self):
  90. return f"{self.name}"
  91. def enrich_from_semantic(self, overwrite=False):
  92. ...
  93. def enrich_from_google_books(self, overwrite=False):
  94. ...
  95. def enrich_from_openlibrary(self, overwrite=False):
  96. data_dict = lookup_author_from_openlibrary(self.openlibrary_id)
  97. if not data_dict or not data_dict.get("name"):
  98. logger.warning("Could not find author on openlibrary")
  99. return
  100. headshot_url = data_dict.pop("author_headshot_url", "")
  101. Author.objects.filter(pk=self.id).update(**data_dict)
  102. self.refresh_from_db()
  103. if headshot_url:
  104. r = requests.get(headshot_url)
  105. if r.status_code == 200:
  106. fname = f"{self.name}_{self.uuid}.jpg"
  107. self.headshot.save(fname, ContentFile(r.content), save=True)
  108. class Book(LongPlayScrobblableMixin):
  109. COMPLETION_PERCENT = getattr(settings, "BOOK_COMPLETION_PERCENT", 95)
  110. AVG_PAGE_READING_SECONDS = getattr(
  111. settings, "AVERAGE_PAGE_READING_SECONDS", 60
  112. )
  113. title = models.CharField(max_length=255)
  114. original_title = models.CharField(max_length=255, **BNULL)
  115. authors = models.ManyToManyField(Author, blank=True)
  116. koreader_data_by_hash = models.JSONField(**BNULL)
  117. isbn_13 = models.CharField(max_length=255, **BNULL)
  118. isbn_10 = models.CharField(max_length=255, **BNULL)
  119. pages = models.IntegerField(**BNULL)
  120. language = models.CharField(max_length=4, **BNULL)
  121. first_publish_year = models.IntegerField(**BNULL)
  122. publish_date = models.DateField(**BNULL)
  123. publisher = models.CharField(max_length=255, **BNULL)
  124. first_sentence = models.TextField(**BNULL)
  125. # ComicVine
  126. comicvine_id = models.CharField(max_length=255, **BNULL)
  127. issue_number = models.IntegerField(max_length=5, **BNULL)
  128. volume_number = models.IntegerField(max_length=5, **BNULL)
  129. # OpenLibrary
  130. openlibrary_id = models.CharField(max_length=255, **BNULL)
  131. cover = models.ImageField(upload_to="books/covers/", **BNULL)
  132. cover_small = ImageSpecField(
  133. source="cover",
  134. processors=[ResizeToFit(100, 100)],
  135. format="JPEG",
  136. options={"quality": 60},
  137. )
  138. cover_medium = ImageSpecField(
  139. source="cover",
  140. processors=[ResizeToFit(300, 300)],
  141. format="JPEG",
  142. options={"quality": 75},
  143. )
  144. summary = models.TextField(**BNULL)
  145. genre = TaggableManager(through=ObjectWithGenres)
  146. def __str__(self):
  147. return f"{self.title}"
  148. @property
  149. def subtitle(self):
  150. return f" by {self.author}"
  151. @property
  152. def strings(self) -> ScrobblableConstants:
  153. return ScrobblableConstants(verb="Reading", tags="book")
  154. @property
  155. def logdata_cls(self):
  156. return BookLogData
  157. @property
  158. def primary_image_url(self) -> str:
  159. url = ""
  160. if self.cover:
  161. url = self.cover_medium.url
  162. return url
  163. def get_absolute_url(self):
  164. return reverse("books:book_detail", kwargs={"slug": self.uuid})
  165. @classmethod
  166. def get_from_comicvine(cls, title: str, overwrite: bool = False, force_new: bool =False) -> "Book":
  167. book, created = cls.objects.get_or_create(title=title)
  168. if not created and not overwrite and not force_new:
  169. book, created = cls.objects.get_or_create(original_title=title)
  170. logger.info("Found comic by original title, use force_new=True to override")
  171. return book
  172. book_dict = lookup_comic_from_comicvine(title)
  173. if created or overwrite:
  174. author_list = []
  175. author_dicts = book_dict.pop("author_dicts")
  176. if author_dicts:
  177. for author_dict in author_dicts:
  178. if author_dict.get("authorId"):
  179. author, a_created = Author.objects.get_or_create(
  180. semantic_id=author_dict.get("authorId")
  181. )
  182. author_list.append(author)
  183. if a_created:
  184. author.name = author_dict.get("name")
  185. author.save()
  186. # TODO enrich author?
  187. ...
  188. for k, v in book_dict.items():
  189. setattr(book, k, v)
  190. book.save()
  191. if author_list:
  192. book.authors.add(*author_list)
  193. genres = book_dict.pop("genres", [])
  194. if genres:
  195. book.genre.add(*genres)
  196. return book
  197. @classmethod
  198. def find_or_create(
  199. cls, title: str, enrich: bool = False, commit: bool = True
  200. ):
  201. """Given a title, get a Book instance.
  202. If the book is not already in our database, or overwrite is True,
  203. this method will enrich the Book with data from Google.
  204. By default this method will also save the data back to the model. If you'd
  205. like to batch create, use commit=False and you'll get an unsaved but enriched
  206. instance back which you can then save at your convenience."""
  207. # TODO use either a Google Books id identifier or author name like for tracks
  208. book, created = cls.objects.get_or_create(title=title)
  209. if not created:
  210. logger.info(
  211. "Found exact match for book by title", extra={"title": title}
  212. )
  213. if not enrich:
  214. logger.info(
  215. "Found book by title, but not enriching",
  216. extra={"title": title},
  217. )
  218. return book
  219. book_dict = lookup_book_from_google(title)
  220. if not book_dict or book_dict.get("isbn_10"):
  221. book_dict = lookup_comic_from_comicvine(title)
  222. author_list = []
  223. authors = book_dict.pop("authors", [])
  224. cover_url = book_dict.pop("cover_url", "")
  225. genres = book_dict.pop("generes", [])
  226. if authors:
  227. for author_str in authors:
  228. if author_str:
  229. author, a_created = Author.objects.get_or_create(
  230. name=author_str
  231. )
  232. author_list.append(author)
  233. if a_created:
  234. # TODO enrich author
  235. ...
  236. for k, v in book_dict.items():
  237. setattr(book, k, v)
  238. if commit:
  239. book.save()
  240. book.save_image_from_url(cover_url)
  241. book.genre.add(*genres)
  242. book.authors.add(*author_list)
  243. return book
  244. def save_image_from_url(self, url: str, force_update: bool = False):
  245. if not self.cover or (force_update and url):
  246. r = requests.get(url)
  247. if r.status_code == 200:
  248. fname = f"{self.title}_{self.uuid}.jpg"
  249. self.cover.save(fname, ContentFile(r.content), save=True)
  250. def fix_metadata(self, data: dict = {}, force_update=False):
  251. if (not self.openlibrary_id or not self.locg_slug) or force_update:
  252. author_name = ""
  253. if self.author:
  254. author_name = self.author.name
  255. if not data:
  256. logger.warn(f"Checking openlibrary for {self.title}")
  257. if self.openlibrary_id and force_update:
  258. data = lookup_book_from_openlibrary(
  259. str(self.openlibrary_id)
  260. )
  261. else:
  262. data = lookup_book_from_openlibrary(
  263. str(self.title), author_name
  264. )
  265. if not data:
  266. if self.locg_slug:
  267. logger.warn(
  268. f"Checking LOCG for {self.title} with slug {self.locg_slug}"
  269. )
  270. data = lookup_comic_by_locg_slug(str(self.locg_slug))
  271. else:
  272. logger.warn(f"Checking LOCG for {self.title}")
  273. data = lookup_comic_from_locg(str(self.title))
  274. if not data and COMICVINE_API_KEY:
  275. logger.warn(f"Checking ComicVine for {self.title}")
  276. cv_client = ComicVineClient(api_key=COMICVINE_API_KEY)
  277. data = lookup_comic_from_comicvine(str(self.title))
  278. if not data:
  279. logger.warn(f"Book not found in any sources: {self.title}")
  280. return
  281. # We can discard the author name from OL for now, we'll lookup details below
  282. data.pop("ol_author_name", "")
  283. if data.get("ol_author_id"):
  284. self.fix_authors_metadata(data.pop("ol_author_id", ""))
  285. if data.get("locg_writer_slug"):
  286. self.get_author_from_locg(data.pop("locg_writer_slug", ""))
  287. ol_title = data.get("title", "")
  288. data.pop("ol_author_id", "")
  289. # Kick out a little warning if we're about to change KoReader's title
  290. if (
  291. fuzz.ratio(ol_title.lower(), str(self.title).lower()) < 80
  292. and not force_update
  293. ):
  294. logger.warn(
  295. f"OL and KoReader disagree on this book title {self.title} != {ol_title}, check manually"
  296. )
  297. self.openlibrary_id = data.get("openlibrary_id")
  298. self.save(update_fields=["openlibrary_id"])
  299. return
  300. # If we don't know pages, don't overwrite existing with None
  301. if "pages" in data.keys() and data.get("pages") == None:
  302. data.pop("pages")
  303. if (
  304. not isinstance(data.get("pages"), int)
  305. and "pages" in data.keys()
  306. ):
  307. logger.info(
  308. f"Pages for {self} from OL expected to be int, but got {data.get('pages')}"
  309. )
  310. data.pop("pages")
  311. # Pop this, so we can look it up later
  312. cover_url = data.pop("cover_url", "")
  313. subject_key_list = data.pop("subject_key_list", "")
  314. # Fun trick for updating all fields at once
  315. Book.objects.filter(pk=self.id).update(**data)
  316. self.refresh_from_db()
  317. if subject_key_list:
  318. self.genre.add(*subject_key_list)
  319. if cover_url:
  320. r = requests.get(cover_url)
  321. if r.status_code == 200:
  322. fname = f"{self.title}_{self.uuid}.jpg"
  323. self.cover.save(fname, ContentFile(r.content), save=True)
  324. if self.pages:
  325. self.run_time_seconds = int(self.pages) * int(
  326. self.AVG_PAGE_READING_SECONDS
  327. )
  328. self.save()
  329. def fix_authors_metadata(self, openlibrary_author_id):
  330. author = Author.objects.filter(
  331. openlibrary_id=openlibrary_author_id
  332. ).first()
  333. if not author:
  334. data = lookup_author_from_openlibrary(openlibrary_author_id)
  335. author_image_url = data.pop("author_headshot_url", None)
  336. author = Author.objects.create(**data)
  337. if author_image_url:
  338. r = requests.get(author_image_url)
  339. if r.status_code == 200:
  340. fname = f"{author.name}_{author.uuid}.jpg"
  341. author.headshot.save(
  342. fname, ContentFile(r.content), save=True
  343. )
  344. self.authors.add(author)
  345. def get_author_from_locg(self, locg_slug):
  346. writer = lookup_comic_writer_by_locg_slug(locg_slug)
  347. author, created = Author.objects.get_or_create(
  348. name=writer["name"], locg_slug=writer["locg_slug"]
  349. )
  350. if (created or not author.headshot) and writer["photo_url"]:
  351. r = requests.get(writer["photo_url"])
  352. if r.status_code == 200:
  353. fname = f"{author.name}_{author.uuid}.jpg"
  354. author.headshot.save(fname, ContentFile(r.content), save=True)
  355. self.authors.add(author)
  356. def page_data_for_user(
  357. self, user_id: int, convert_timestamps: bool = True
  358. ) -> dict:
  359. scrobbles = self.scrobble_set.filter(user=user_id)
  360. pages = {}
  361. for scrobble in scrobbles:
  362. if scrobble.logdata.page_data:
  363. for page, data in scrobble.logdata.page_data.items():
  364. if convert_timestamps:
  365. data["start_ts"] = datetime.fromtimestamp(
  366. data["start_ts"]
  367. )
  368. data["end_ts"] = datetime.fromtimestamp(data["end_ts"])
  369. pages[page] = data
  370. sorted_pages = OrderedDict(
  371. sorted(pages.items(), key=lambda x: x[1]["start_ts"])
  372. )
  373. return sorted_pages
  374. @property
  375. def author(self):
  376. return self.authors.first()
  377. @property
  378. def pages_for_completion(self) -> int:
  379. if not self.pages:
  380. logger.warn(f"{self} has no pages, no completion percentage")
  381. return 0
  382. return int(self.pages * (self.COMPLETION_PERCENT / 100))
  383. def update_long_play_seconds(self):
  384. """Check page timestamps and duration and update"""
  385. if self.page_set.all():
  386. ...
  387. def progress_for_user(self, user_id: int) -> int:
  388. """Used to keep track of whether the book is complete or not"""
  389. user = User.objects.get(id=user_id)
  390. last_scrobble = get_scrobbles_for_media(self, user).last()
  391. progress = 0
  392. if last_scrobble:
  393. progress = int((last_scrobble.last_page_read / self.pages) * 100)
  394. return progress
  395. class Paper(LongPlayScrobblableMixin):
  396. """Keeps track of Academic Papers"""
  397. COMPLETION_PERCENT = getattr(settings, "PAPER_COMPLETION_PERCENT", 60)
  398. AVG_PAGE_READING_SECONDS = getattr(
  399. settings, "AVERAGE_PAGE_READING_SECONDS", 60
  400. )
  401. title = models.CharField(max_length=255)
  402. semantic_title = models.CharField(max_length=255, **BNULL)
  403. authors = models.ManyToManyField(Author, blank=True)
  404. koreader_data_by_hash = models.JSONField(**BNULL)
  405. arxiv_id = models.CharField(max_length=50, **BNULL)
  406. semantic_id = models.CharField(max_length=50, **BNULL)
  407. arxiv_id = models.CharField(max_length=50, **BNULL)
  408. corpus_id = models.CharField(max_length=50, **BNULL)
  409. doi_id = models.CharField(max_length=50, **BNULL)
  410. pages = models.IntegerField(**BNULL)
  411. language = models.CharField(max_length=4, **BNULL)
  412. first_publish_year = models.IntegerField(**BNULL)
  413. publish_date = models.DateField(**BNULL)
  414. journal = models.CharField(max_length=255, **BNULL)
  415. journal_volume = models.CharField(max_length=50, **BNULL)
  416. abstract = models.TextField(**BNULL)
  417. tldr = models.CharField(max_length=255, **BNULL)
  418. openaccess_pdf_url = models.CharField(max_length=255, **BNULL)
  419. genre = TaggableManager(through=ObjectWithGenres)
  420. @classmethod
  421. def get_from_semantic(cls, title: str, overwrite: bool = False) -> "Paper":
  422. paper, created = cls.objects.get_or_create(title=title)
  423. if not created and not overwrite:
  424. return paper
  425. paper_dict = lookup_paper_from_semantic(title)
  426. if created or overwrite:
  427. author_list = []
  428. author_dicts = paper_dict.pop("author_dicts")
  429. if author_dicts:
  430. for author_dict in author_dicts:
  431. if author_dict.get("authorId"):
  432. author, a_created = Author.objects.get_or_create(
  433. semantic_id=author_dict.get("authorId")
  434. )
  435. author_list.append(author)
  436. if a_created:
  437. author.name = author_dict.get("name")
  438. author.save()
  439. # TODO enrich author?
  440. ...
  441. for k, v in paper_dict.items():
  442. setattr(paper, k, v)
  443. paper.save()
  444. if author_list:
  445. paper.authors.add(*author_list)
  446. genres = paper_dict.pop("genres", [])
  447. if genres:
  448. paper.genre.add(*genres)
  449. return paper