models.py

import logging
from collections import OrderedDict
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
from uuid import uuid4

import requests
from books.constants import READCOMICSONLINE_URL
from books.openlibrary import (
    lookup_author_from_openlibrary,
    lookup_book_from_openlibrary,
)
from books.sources.google import lookup_book_from_google
from books.sources.semantic import lookup_paper_from_semantic
from books.utils import get_comic_issue_url
from django.conf import settings
from django.contrib.auth import get_user_model
from django.core.files.base import ContentFile
from django.db import models
from django.urls import reverse
from django_extensions.db.models import TimeStampedModel
from imagekit.models import ImageSpecField
from imagekit.processors import ResizeToFit
from scrobbles.dataclasses import BaseLogData, LongPlayLogData
from scrobbles.mixins import (
    LongPlayScrobblableMixin,
    ObjectWithGenres,
    ScrobblableConstants,
)
from scrobbles.utils import get_scrobbles_for_media, next_url_if_exists
from taggit.managers import TaggableManager
from thefuzz import fuzz
from vrobbler.apps.books.locg import (
    lookup_comic_by_locg_slug,
    lookup_comic_from_locg,
    lookup_comic_writer_by_locg_slug,
)
from vrobbler.apps.books.sources.comicvine import (
    ComicVineClient,
    lookup_comic_from_comicvine,
)

COMICVINE_API_KEY = getattr(settings, "COMICVINE_API_KEY", "")

logger = logging.getLogger(__name__)

User = get_user_model()
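
# Convenience kwargs for optional model fields; unpacked as **BNULL below so
# "blank=True, null=True" doesn't have to be repeated on every field.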
BNULL = {"blank": True, "null": True}


@dataclass
class BookPageLogData(BaseLogData):
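    """Reading stats for a single page: the page number plus start/end
    timestamps and the duration spent on it."""
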
    page_number: Optional[int] = None
    end_ts: Optional[int] = None
    start_ts: Optional[int] = None
    duration: Optional[int] = None


@dataclass
class BookLogData(BaseLogData, LongPlayLogData):
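    """Scrobble log data for a reading session: overall page range and count,
    a resume URL, the KOReader document hash, and a per-page breakdown keyed
    by page number."""
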
    koreader_hash: Optional[str] = None
    page_data: Optional[dict[int, BookPageLogData]] = None
    pages_read: Optional[int] = None
    page_start: Optional[int] = None
    page_end: Optional[int] = None
    resume_url: Optional[str] = None

    _excluded_fields = {"koreader_hash", "page_data"}

    def avg_seconds_per_page(self):
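        """Average seconds spent per page, or None if there is no page data.

        Page entries are treated as plain dicts with a "duration" key given
        in seconds.
        """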
        if self.page_data:
            total_duration = 0
            for page_num, stats in self.page_data.items():
                total_duration += stats.get("duration", 0)
            if total_duration:
                return int(total_duration / len(self.page_data))


class Author(TimeStampedModel):
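    """A book, comic or paper author, with identifiers for the external
    sources (OpenLibrary, Wikidata, Goodreads, ComicVine, Semantic Scholar)
    used to enrich it."""
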
    name = models.CharField(max_length=255)
    uuid = models.UUIDField(default=uuid4, editable=False, **BNULL)
    openlibrary_id = models.CharField(max_length=255, **BNULL)
    headshot = models.ImageField(upload_to="books/authors/", **BNULL)
    headshot_small = ImageSpecField(
        source="headshot",
        processors=[ResizeToFit(100, 100)],
        format="JPEG",
        options={"quality": 60},
    )
    headshot_medium = ImageSpecField(
        source="headshot",
        processors=[ResizeToFit(300, 300)],
        format="JPEG",
        options={"quality": 75},
    )
    bio = models.TextField(**BNULL)
    wikipedia_url = models.CharField(max_length=255, **BNULL)
    wikidata_id = models.CharField(max_length=255, **BNULL)
    isni = models.CharField(max_length=255, **BNULL)
    goodreads_id = models.CharField(max_length=255, **BNULL)
    comicvine_data = models.JSONField(**BNULL)
    semantic_id = models.CharField(max_length=50, **BNULL)

    def __str__(self):
        return f"{self.name}"

    def enrich_from_semantic(self, overwrite=False):
        ...

    def enrich_from_google_books(self, overwrite=False):
        ...

    def enrich_from_openlibrary(self, overwrite=False):
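        """Pull author details from OpenLibrary by openlibrary_id, update the
        record in place and download the headshot image if one is returned."""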
        data_dict = lookup_author_from_openlibrary(self.openlibrary_id)
        if not data_dict or not data_dict.get("name"):
            logger.warning("Could not find author on openlibrary")
            return

        headshot_url = data_dict.pop("author_headshot_url", "")
        Author.objects.filter(pk=self.id).update(**data_dict)
        self.refresh_from_db()

        if headshot_url:
            r = requests.get(headshot_url)
            if r.status_code == 200:
                fname = f"{self.name}_{self.uuid}.jpg"
                self.headshot.save(fname, ContentFile(r.content), save=True)


class Book(LongPlayScrobblableMixin):
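    """A scrobblable book or comic issue, enriched from Google Books,
    OpenLibrary, LOCG or ComicVine depending on where it was found."""
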
    COMPLETION_PERCENT = getattr(settings, "BOOK_COMPLETION_PERCENT", 95)
    AVG_PAGE_READING_SECONDS = getattr(
        settings, "AVERAGE_PAGE_READING_SECONDS", 60
    )

    title = models.CharField(max_length=255)
    original_title = models.CharField(max_length=255, **BNULL)
    authors = models.ManyToManyField(Author, blank=True)
    koreader_data_by_hash = models.JSONField(**BNULL)
    isbn_13 = models.CharField(max_length=255, **BNULL)
    isbn_10 = models.CharField(max_length=255, **BNULL)
    pages = models.IntegerField(**BNULL)
    language = models.CharField(max_length=4, **BNULL)
    first_publish_year = models.IntegerField(**BNULL)
    publish_date = models.DateField(**BNULL)
    publisher = models.CharField(max_length=255, **BNULL)
    first_sentence = models.TextField(**BNULL)

    # ComicVine
    comicvine_id = models.CharField(max_length=255, **BNULL)
    readcomics_url = models.CharField(max_length=255, **BNULL)
    next_readcomics_url = models.CharField(max_length=255, **BNULL)
    issue_number = models.IntegerField(**BNULL)
    volume_number = models.IntegerField(**BNULL)

    # OpenLibrary
    openlibrary_id = models.CharField(max_length=255, **BNULL)

    cover = models.ImageField(upload_to="books/covers/", **BNULL)
    cover_small = ImageSpecField(
        source="cover",
        processors=[ResizeToFit(100, 100)],
        format="JPEG",
        options={"quality": 60},
    )
    cover_medium = ImageSpecField(
        source="cover",
        processors=[ResizeToFit(300, 300)],
        format="JPEG",
        options={"quality": 75},
    )
    summary = models.TextField(**BNULL)

    genre = TaggableManager(through=ObjectWithGenres)

    def __str__(self) -> str:
        if self.issue_number and "Issue" not in str(self.title):
            return f"{self.title} - Issue {self.issue_number}"
        if self.volume_number and "Volume" not in str(self.title):
            return f"{self.title} - Volume {self.volume_number}"
        return f"{self.title}"

    @property
    def subtitle(self):
        return f" by {self.author}"

    @property
    def strings(self) -> ScrobblableConstants:
        return ScrobblableConstants(verb="Reading", tags="book")

    @property
    def logdata_cls(self):
        return BookLogData

    @property
    def primary_image_url(self) -> str:
        url = ""
        if self.cover:
            url = self.cover_medium.url
        return url

    def get_absolute_url(self):
        return reverse("books:book_detail", kwargs={"slug": self.uuid})

    @classmethod
    def get_from_comicvine(
        cls, title: str, overwrite: bool = False, force_new: bool = False
    ) -> "Book":
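        """Get or create a Book by title and populate it from ComicVine,
        attaching any authors and genres returned by the lookup."""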
        book, created = cls.objects.get_or_create(title=title)
        if not created and not overwrite:
            return book

        book_dict = lookup_comic_from_comicvine(title)

        if created or overwrite:
            author_list = []
            author_dicts = book_dict.pop("author_dicts")
            if author_dicts:
                for author_dict in author_dicts:
                    if author_dict.get("authorId"):
                        author, a_created = Author.objects.get_or_create(
                            semantic_id=author_dict.get("authorId")
                        )
                        author_list.append(author)
                        if a_created:
                            author.name = author_dict.get("name")
                            author.save()
                            # TODO enrich author?
                            ...

            for k, v in book_dict.items():
                setattr(book, k, v)
            book.save()

            if author_list:
                book.authors.add(*author_list)

            genres = book_dict.pop("genres", [])
            if genres:
                book.genre.add(*genres)

        return book

    @classmethod
    def find_or_create(
        cls, title: str, url: str = "", enrich: bool = False, commit: bool = True
    ):
  208. """Given a title, get a Book instance.
  209. If the book is not already in our database, or overwrite is True,
  210. this method will enrich the Book with data from Google.
  211. By default this method will also save the data back to the model. If you'd
  212. like to batch create, use commit=False and you'll get an unsaved but enriched
  213. instance back which you can then save at your convenience."""
        # TODO use either a Google Books id identifier or author name like for tracks
        book, created = cls.objects.get_or_create(original_title=title)

        if not created:
            logger.info(
                "Found exact match for book by title", extra={"title": title}
            )
            if not enrich:
                logger.info(
                    "Found book by title, but not enriching",
                    extra={"title": title},
                )
                return book

        book_dict = None
        if READCOMICSONLINE_URL in url:
            book_dict = lookup_comic_from_comicvine(title)
            book_dict["readcomics_url"] = get_comic_issue_url(url)
            book_dict["next_readcomics_url"] = next_url_if_exists(
                book_dict["readcomics_url"]
            )

        if not book_dict:
            book_dict = lookup_book_from_google(title)

        if not book_dict:
            logger.warning(
                "No book found in any source, using data as is",
                extra={"title": title},
            )
            # Fall back to an empty dict so the pops below don't fail on None
            book_dict = {}

        author_list = []
        authors = book_dict.pop("authors", [])
        cover_url = book_dict.pop("cover_url", "")
        genres = book_dict.pop("generes", [])

        if authors:
            for author_str in authors:
                if author_str:
                    author, a_created = Author.objects.get_or_create(
                        name=author_str
                    )
                    author_list.append(author)
                    if a_created:
                        # TODO enrich author
                        ...

        for k, v in book_dict.items():
            setattr(book, k, v)

        if commit:
            book.save()
            book.save_image_from_url(cover_url)
            book.genre.add(*genres)
            book.authors.add(*author_list)
        return book

    def save_image_from_url(self, url: str, force_update: bool = False):
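        """Download a cover image from url and attach it to this book, unless
        a cover already exists and force_update is False."""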
        if url and (not self.cover or force_update):
            r = requests.get(url)
            if r.status_code == 200:
                fname = f"{self.title}_{self.uuid}.jpg"
                self.cover.save(fname, ContentFile(r.content), save=True)

    def fix_metadata(self, data: Optional[dict] = None, force_update=False):
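        """Fill in missing book metadata, trying OpenLibrary first and then
        falling back to LOCG and ComicVine, and apply the result to this
        record (fields, genres, cover image and estimated reading time).

        Pass data to skip the lookups, or force_update=True to refresh a book
        that already has identifiers."""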
        if (not self.openlibrary_id or not self.locg_slug) or force_update:
            author_name = ""
            if self.author:
                author_name = self.author.name

            if not data:
                logger.warning(f"Checking openlibrary for {self.title}")
                if self.openlibrary_id and force_update:
                    data = lookup_book_from_openlibrary(
                        str(self.openlibrary_id)
                    )
                else:
                    data = lookup_book_from_openlibrary(
                        str(self.title), author_name
                    )

            if not data:
                if self.locg_slug:
                    logger.warning(
                        f"Checking LOCG for {self.title} with slug {self.locg_slug}"
                    )
                    data = lookup_comic_by_locg_slug(str(self.locg_slug))
                else:
                    logger.warning(f"Checking LOCG for {self.title}")
                    data = lookup_comic_from_locg(str(self.title))

            if not data and COMICVINE_API_KEY:
                logger.warning(f"Checking ComicVine for {self.title}")
                cv_client = ComicVineClient(api_key=COMICVINE_API_KEY)
                data = lookup_comic_from_comicvine(str(self.title))

            if not data:
                logger.warning(f"Book not found in any sources: {self.title}")
                return

            # We can discard the author name from OL for now, we'll look up details below
            data.pop("ol_author_name", "")
            if data.get("ol_author_id"):
                self.fix_authors_metadata(data.pop("ol_author_id", ""))
            if data.get("locg_writer_slug"):
                self.get_author_from_locg(data.pop("locg_writer_slug", ""))

            ol_title = data.get("title", "")
            data.pop("ol_author_id", "")

            # Kick out a little warning if we're about to change KoReader's title
            if (
                fuzz.ratio(ol_title.lower(), str(self.title).lower()) < 80
                and not force_update
            ):
                logger.warning(
                    f"OL and KoReader disagree on this book title "
                    f"{self.title} != {ol_title}, check manually"
                )
                self.openlibrary_id = data.get("openlibrary_id")
                self.save(update_fields=["openlibrary_id"])
                return

            # If we don't know pages, don't overwrite existing data with None
            if "pages" in data.keys() and data.get("pages") is None:
                data.pop("pages")
            if "pages" in data.keys() and not isinstance(data.get("pages"), int):
                logger.info(
                    f"Pages for {self} from OL expected to be int, "
                    f"but got {data.get('pages')}"
                )
                data.pop("pages")

            # Pop these, so we can look them up later
            cover_url = data.pop("cover_url", "")
            subject_key_list = data.pop("subject_key_list", "")

            # Fun trick for updating all fields at once
            Book.objects.filter(pk=self.id).update(**data)
            self.refresh_from_db()

            if subject_key_list:
                self.genre.add(*subject_key_list)

            if cover_url:
                r = requests.get(cover_url)
                if r.status_code == 200:
                    fname = f"{self.title}_{self.uuid}.jpg"
                    self.cover.save(fname, ContentFile(r.content), save=True)

            if self.pages:
                self.base_run_time_seconds = int(self.pages) * int(
                    self.AVG_PAGE_READING_SECONDS
                )
                self.save()

    def fix_authors_metadata(self, openlibrary_author_id):
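        """Attach the author with the given OpenLibrary id to this book,
        creating it (and fetching its headshot) from OpenLibrary if we don't
        already have it."""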
        author = Author.objects.filter(
            openlibrary_id=openlibrary_author_id
        ).first()
        if not author:
            data = lookup_author_from_openlibrary(openlibrary_author_id)
            author_image_url = data.pop("author_headshot_url", None)
            author = Author.objects.create(**data)

            if author_image_url:
                r = requests.get(author_image_url)
                if r.status_code == 200:
                    fname = f"{author.name}_{author.uuid}.jpg"
                    author.headshot.save(
                        fname, ContentFile(r.content), save=True
                    )

        self.authors.add(author)

    def get_author_from_locg(self, locg_slug):
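        """Attach the writer with the given LOCG slug to this book, creating
        the Author and grabbing their photo if we don't already have one."""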
        writer = lookup_comic_writer_by_locg_slug(locg_slug)

        author, created = Author.objects.get_or_create(
            name=writer["name"], locg_slug=writer["locg_slug"]
        )
        if (created or not author.headshot) and writer["photo_url"]:
            r = requests.get(writer["photo_url"])
            if r.status_code == 200:
                fname = f"{author.name}_{author.uuid}.jpg"
                author.headshot.save(fname, ContentFile(r.content), save=True)

        self.authors.add(author)

    def page_data_for_user(
        self, user_id: int, convert_timestamps: bool = True
    ) -> dict:
        scrobbles = self.scrobble_set.filter(user=user_id)
        pages = {}
        for scrobble in scrobbles:
            if scrobble.logdata.page_data:
                for page, data in scrobble.logdata.page_data.items():
                    if convert_timestamps:
                        data["start_ts"] = datetime.fromtimestamp(
                            data["start_ts"]
                        )
                        data["end_ts"] = datetime.fromtimestamp(data["end_ts"])
                    pages[page] = data

        sorted_pages = OrderedDict(
            sorted(pages.items(), key=lambda x: x[1]["start_ts"])
        )
        return sorted_pages

    @property
    def author(self):
        return self.authors.first()

    @property
    def pages_for_completion(self) -> int:
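        """Number of pages that counts as "finished", based on
        COMPLETION_PERCENT; 0 if the page count is unknown."""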
        if not self.pages:
            logger.warning(f"{self} has no pages, no completion percentage")
            return 0
        return int(self.pages * (self.COMPLETION_PERCENT / 100))

    def update_long_play_seconds(self):
        """Check page timestamps and duration and update"""
        if self.page_set.all():
            ...

    def progress_for_user(self, user_id: int) -> int:
        """Used to keep track of whether the book is complete or not"""
        user = User.objects.get(id=user_id)
        last_scrobble = get_scrobbles_for_media(self, user).last()
        progress = 0
        # pages is nullable, so guard against dividing by None
        if last_scrobble and self.pages:
            progress = int((last_scrobble.last_page_read / self.pages) * 100)
        return progress


class Paper(LongPlayScrobblableMixin):
    """Keeps track of Academic Papers"""

    COMPLETION_PERCENT = getattr(settings, "PAPER_COMPLETION_PERCENT", 60)
    AVG_PAGE_READING_SECONDS = getattr(
        settings, "AVERAGE_PAGE_READING_SECONDS", 60
    )

    title = models.CharField(max_length=255)
    semantic_title = models.CharField(max_length=255, **BNULL)
    authors = models.ManyToManyField(Author, blank=True)
    koreader_data_by_hash = models.JSONField(**BNULL)
    arxiv_id = models.CharField(max_length=50, **BNULL)
    semantic_id = models.CharField(max_length=50, **BNULL)
    corpus_id = models.CharField(max_length=50, **BNULL)
    doi_id = models.CharField(max_length=50, **BNULL)
    pages = models.IntegerField(**BNULL)
    language = models.CharField(max_length=4, **BNULL)
    first_publish_year = models.IntegerField(**BNULL)
    publish_date = models.DateField(**BNULL)
    journal = models.CharField(max_length=255, **BNULL)
    journal_volume = models.CharField(max_length=50, **BNULL)
    abstract = models.TextField(**BNULL)
    tldr = models.CharField(max_length=255, **BNULL)
    openaccess_pdf_url = models.CharField(max_length=255, **BNULL)

    genre = TaggableManager(through=ObjectWithGenres)

    @classmethod
    def get_from_semantic(cls, title: str, overwrite: bool = False) -> "Paper":
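        """Get or create a Paper by title and populate it from Semantic
        Scholar, attaching any authors and genres returned by the lookup."""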
        paper, created = cls.objects.get_or_create(title=title)
        if not created and not overwrite:
            return paper

        paper_dict = lookup_paper_from_semantic(title)

        if created or overwrite:
            author_list = []
            author_dicts = paper_dict.pop("author_dicts")
            if author_dicts:
                for author_dict in author_dicts:
                    if author_dict.get("authorId"):
                        author, a_created = Author.objects.get_or_create(
                            semantic_id=author_dict.get("authorId")
                        )
                        author_list.append(author)
                        if a_created:
                            author.name = author_dict.get("name")
                            author.save()
                            # TODO enrich author?
                            ...

            for k, v in paper_dict.items():
                setattr(paper, k, v)
            paper.save()

            if author_list:
                paper.authors.add(*author_list)

            genres = paper_dict.pop("genres", [])
            if genres:
                paper.genre.add(*genres)

        return paper