koreader.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. import re
  2. import codecs
  3. import logging
  4. import os
  5. import sqlite3
  6. from datetime import datetime
  7. from enum import Enum
  8. from typing import Iterable, List
  9. import pytz
  10. import requests
  11. from books.models import Author, Book, Page
  12. from pylast import httpx, tempfile
  13. from scrobbles.models import Scrobble
  14. from stream_sqlite import stream_sqlite
  15. from vrobbler.apps.books.openlibrary import get_author_openlibrary_id
  16. logger = logging.getLogger(__name__)
  class KoReaderBookColumn(Enum):
      """Positional index of each column in a row of the KOReader ``book`` table.

      Rows are handled as plain sequences, so a field is read with
      ``row[KoReaderBookColumn.<NAME>.value]``.
      """

      ID = 0  # KOReader's own id for the book
      TITLE = 1
      AUTHORS = 2  # may hold several names separated by newlines
      NOTES = 3
      LAST_OPEN = 4  # NOTE(review): looks like epoch seconds -- confirm
      HIGHLIGHTS = 5
      PAGES = 6  # total page count of the book
      SERIES = 7
      LANGUAGE = 8
      MD5 = 9  # used to match rows against Book.koreader_md5
      TOTAL_READ_TIME = 10
      TOTAL_READ_PAGES = 11
  class KoReaderPageStatColumn(Enum):
      """Positional index of each column in a KOReader page-statistics row.

      Rows are handled as plain sequences, so a field is read with
      ``row[KoReaderPageStatColumn.<NAME>.value]``.
      """

      ID_BOOK = 0  # KOReader book id; key into the koreader-id -> Book pk map
      PAGE = 1  # page number within the book
      START_TIME = 2  # NOTE(review): looks like epoch seconds -- confirm
      DURATION = 3  # stored on Page.duration_seconds
      TOTAL_PAGES = 4
  def _sqlite_bytes(sqlite_url):
      """Yield the body of ``sqlite_url`` as a stream of byte chunks.

      The response is streamed in 64 KiB chunks so the whole database never has
      to be held in memory at once.

      Args:
          sqlite_url: URL of the SQLite file to download.

      Yields:
          ``bytes`` chunks of the response body, in order.

      Raises:
          httpx.HTTPStatusError: if the server answers with a 4xx/5xx status.
      """
      with httpx.stream("GET", sqlite_url) as response:
          # Fail fast on error responses: without this an HTML/JSON error page
          # would be handed to the SQLite parser as if it were database bytes.
          response.raise_for_status()
          yield from response.iter_bytes(chunk_size=65_536)
  # Matches a one-letter middle initial such as " K. ".  The dot is escaped on
  # purpose: an unescaped "." also matches two-letter name parts like " Le ",
  # which turned "Ursula Le Guin" into "Ursula Guin".
  _MIDDLE_INITIAL_RE = re.compile(r" [A-Z]\. ")


  def _normalize_koreader_authors(raw_authors: str) -> str:
      """Return the KOReader authors field as one ", "-separated string."""
      joined = raw_authors.replace("\n", ", ")
      # Strip middle initials, OpenLibrary often fails with these
      return _MIDDLE_INITIAL_RE.sub(" ", joined)


  def _attach_authors(book, ko_authors: str) -> None:
      """Link ``book`` to an Author for every name in ``ko_authors``."""
      for author_name in ko_authors.split(", "):
          logger.debug(f"Looking up author {author_name}")
          # The literal "N/A" entry is skipped (presumably KOReader's
          # placeholder for an unknown author -- confirm).
          if author_name == "N/A":
              continue

          author, author_created = Author.objects.get_or_create(name=author_name)
          if author_created:
              # Only brand-new authors need an OpenLibrary lookup.
              author.openlibrary_id = get_author_openlibrary_id(author_name)
              author.save(update_fields=["openlibrary_id"])
              author.fix_metadata()
              logger.debug(f"Created author {author}")
          book.authors.add(author)


  def _populate_new_book(book, book_row) -> None:
      """Fill a freshly created ``book`` with the KOReader fields of ``book_row``."""
      total_pages = book_row[KoReaderBookColumn.PAGES.value]
      ko_authors = _normalize_koreader_authors(
          book_row[KoReaderBookColumn.AUTHORS.value]
      )
      Book.objects.filter(pk=book.id).update(
          title=book_row[KoReaderBookColumn.TITLE.value],
          pages=total_pages,
          koreader_md5=book_row[KoReaderBookColumn.MD5.value],
          koreader_id=int(book_row[KoReaderBookColumn.ID.value]),
          koreader_authors=ko_authors,
          run_time_seconds=total_pages * book.AVG_PAGE_READING_SECONDS,
      )
      _attach_authors(book, ko_authors)
      # This will try to fix metadata by looking it up on OL
      book.fix_metadata()


  def get_book_map_from_sqlite(rows: Iterable) -> dict:
      """Map KOReader book ids to ``Book`` primary keys, creating missing books.

      For every row of the KOReader ``book`` table the book is looked up by its
      KOReader MD5, then by title. A book that has to be created is filled in
      from the row, linked to its authors and passed through ``fix_metadata``.

      Args:
          rows: iterable of ``book`` rows, indexable by ``KoReaderBookColumn``.

      Returns:
          Dict of KOReader book id -> ``Book`` primary key, for page creation.
      """
      book_id_map = {}
      for book_row in rows:
          book = Book.objects.filter(
              koreader_md5=book_row[KoReaderBookColumn.MD5.value]
          ).first()
          if not book:
              book, book_created = Book.objects.get_or_create(
                  title=book_row[KoReaderBookColumn.TITLE.value]
              )
              if book_created:
                  _populate_new_book(book, book_row)

          book.refresh_from_db()
          book_id_map[book_row[KoReaderBookColumn.ID.value]] = book.id
      return book_id_map
  def build_scrobbles_from_pages(
      rows: Iterable, book_id_map: dict, user_id: int
  ) -> List[Scrobble]:
      """Create ``Page`` records from page-stat rows and queue their scrobbles.

      Works in two passes: first every row is turned into a ``Page`` (existing
      pages are left untouched), then an unsaved ``Scrobble`` is built for each
      newly created page that reports ``is_scrobblable``.

      Args:
          rows: iterable of page-stat rows, indexable by
              ``KoReaderPageStatColumn``.
          book_id_map: KOReader book id -> ``Book`` primary key.
          user_id: primary key of the user the pages and scrobbles belong to.

      Returns:
          Unsaved ``Scrobble`` instances, ready for ``bulk_create``.
      """
      pages_created = []
      for page_row in rows:
          koreader_id = page_row[KoReaderPageStatColumn.ID_BOOK.value]
          book_id = book_id_map.get(koreader_id)
          if book_id is None:
              # A stat row for a book we never imported must not abort the
              # whole import with a KeyError; skip it and leave a trace.
              logger.warning(
                  "Skipping page stat for unknown KOReader book id %s",
                  koreader_id,
              )
              continue

          page, page_created = Page.objects.get_or_create(
              book_id=book_id,
              number=page_row[KoReaderPageStatColumn.PAGE.value],
              user_id=user_id,
          )
          if not page_created:
              continue

          # Timezone-aware UTC timestamp, built without the deprecated
          # datetime.utcfromtimestamp().
          page.start_time = datetime.fromtimestamp(
              page_row[KoReaderPageStatColumn.START_TIME.value], tz=pytz.utc
          )
          page.duration_seconds = page_row[KoReaderPageStatColumn.DURATION.value]
          page.save(update_fields=["start_time", "duration_seconds"])
          pages_created.append(page)

      # Second pass runs only after every page exists.
      # NOTE(review): presumably is_scrobblable looks at neighbouring pages,
      # which is why the two loops are not merged -- confirm.
      new_scrobbles = []
      for page in pages_created:
          if not page.is_scrobblable:
              continue
          # Page number is a placeholder, we'll re-process this after creation
          logger.debug(f"Queueing scrobble for {page.book}, page {page.number}")
          new_scrobbles.append(
              Scrobble(
                  book_id=page.book_id,
                  user_id=user_id,
                  source="KOReader",
                  timestamp=page.start_time,
                  played_to_completion=True,
                  in_progress=False,
                  book_pages_read=page.number,
                  long_play_complete=False,
              )
          )
      return new_scrobbles
  def enrich_koreader_scrobbles(scrobbles: list) -> None:
      """Replace each scrobble's placeholder ``book_pages_read`` and save it.

      A scrobble that has a ``next`` scrobble ends one page before that one
      starts. A scrobble without one is given the number of the last page
      recorded for its book.
      """
      for scrobble in scrobbles:
          if not scrobble.next:
              # Set pages read to the last page we have
              final_page = scrobble.book.page_set.last()
              scrobble.book_pages_read = final_page.number
              scrobble.save(
                  update_fields=["book_pages_read", "long_play_complete"]
              )
              continue

          # Set pages read to the starting page of the next scrobble minus one
          scrobble.book_pages_read = scrobble.next.book_pages_read - 1
          scrobble.save(update_fields=["book_pages_read"])
  def process_koreader_sqlite_url(file_url, user_id) -> list:
      """Import a KOReader statistics database streamed from ``file_url``.

      The file is parsed table by table while it downloads. Books are resolved
      as soon as the ``book`` table is seen. Page-stat rows are buffered and
      turned into scrobbles only after the stream ends, so the result does not
      depend on the order in which the tables are stored in the file.

      Args:
          file_url: URL of the SQLite file.
          user_id: primary key of the user the scrobbles belong to.

      Returns:
          The scrobbles returned by ``bulk_create``; empty if none were built.
      """
      book_id_map = {}
      page_rows = []
      for table_name, _pragma_table_info, rows in stream_sqlite(
          _sqlite_bytes(file_url), max_buffer_size=1_048_576
      ):
          # Accumulate instead of overwriting, in case the stream hands out a
          # table in more than one batch.
          if table_name == "book":
              book_id_map.update(get_book_map_from_sqlite(rows))
          elif table_name == "page_stat_data":
              # Same table the file-based importer reads; "page" never matched.
              page_rows.extend(rows)

      new_scrobbles = build_scrobbles_from_pages(page_rows, book_id_map, user_id)

      created = []
      if new_scrobbles:
          created = Scrobble.objects.bulk_create(new_scrobbles)
          enrich_koreader_scrobbles(created)
          logger.info(
              f"Created {len(created)} scrobbles",
              extra={"created_scrobbles": created},
          )
      return created
  def process_koreader_sqlite_file(file_path, user_id) -> list:
      """Import a KOReader statistics database from a local SQLite file.

      Reads the ``book`` table to resolve or create books, then the
      ``page_stat_data`` table to create pages and the scrobbles built from
      them.

      Args:
          file_path: path of the SQLite file on disk.
          user_id: primary key of the user the scrobbles belong to.

      Returns:
          The scrobbles returned by ``bulk_create``; empty if none were built.
      """
      con = sqlite3.connect(file_path)
      try:
          cur = con.cursor()
          book_id_map = get_book_map_from_sqlite(cur.execute("SELECT * FROM book"))
          new_scrobbles = build_scrobbles_from_pages(
              cur.execute("SELECT * from page_stat_data"), book_id_map, user_id
          )
      finally:
          # Release the file handle even if a row fails to import; the
          # connection was previously never closed.
          con.close()

      created = []
      if new_scrobbles:
          created = Scrobble.objects.bulk_create(new_scrobbles)
          enrich_koreader_scrobbles(created)
          logger.info(
              f"Created {len(created)} scrobbles",
              extra={"created_scrobbles": created},
          )
      return created
  def process_koreader_sqlite(file_path: str, user_id: int) -> list:
      """Import a KOReader database from a local path or an ``https://`` URL.

      Any ``file_path`` containing ``https://`` goes to the streaming URL
      importer; everything else is opened as a file on disk.
      """
      if "https://" in file_path:
          return process_koreader_sqlite_url(file_path, user_id)
      return process_koreader_sqlite_file(file_path, user_id)