koreader.py

import logging
import re
import sqlite3
from datetime import datetime, timedelta
from enum import Enum
from zoneinfo import ZoneInfo

import requests
from books.constants import BOOKS_TITLES_TO_IGNORE
from django.apps import apps
from django.contrib.auth import get_user_model
from scrobbles.notifications import ScrobbleNtfyNotification
from stream_sqlite import stream_sqlite
from webdav.client import get_webdav_client

logger = logging.getLogger(__name__)

User = get_user_model()


class KoReaderBookColumn(Enum):
    ID = 0
    TITLE = 1
    AUTHORS = 2
    NOTES = 3
    LAST_OPEN = 4
    HIGHLIGHTS = 5
    PAGES = 6
    SERIES = 7
    LANGUAGE = 8
    MD5 = 9
    TOTAL_READ_TIME = 10
    TOTAL_READ_PAGES = 11


class KoReaderPageStatColumn(Enum):
    ID_BOOK = 0
    PAGE = 1
    START_TIME = 2
    DURATION = 3
    TOTAL_PAGES = 4
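
# These enums mirror the positional column layout of KoReader's
# statistics.sqlite3 tables, so cells are read by index, e.g.:
#   title = row[KoReaderBookColumn.TITLE.value]
#   duration = page_row[KoReaderPageStatColumn.DURATION.value]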


def _sqlite_bytes(sqlite_url):
    with requests.get(sqlite_url, stream=True) as r:
        yield from r.iter_content(chunk_size=65_536)


# Grace period between page reads before we consider the next read a new scrobble
SESSION_GAP_SECONDS = 1800  # a half hour


def get_author_str_from_row(row):
    """Given the raw author string from KoReader, convert it to a single line and
    strip the middle initials, as OpenLibrary lookup usually fails with those.
    """
    ko_authors = row[KoReaderBookColumn.AUTHORS.value].replace("\n", ", ")
    # Strip middle initials (dot escaped so only literal "X." tokens match);
    # OpenLibrary lookups often fail with these
    return re.sub(r" [A-Z]\. ", " ", ko_authors)
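
# Transformation sketch (hypothetical input): a multi-line author cell like
# "Terry Pratchett\nUrsula K. Le Guin" becomes
# "Terry Pratchett, Ursula Le Guin": newlines are joined with commas and the
# middle initial is dropped.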


def lookup_or_create_authors_from_author_str(ko_author_str: str) -> list:
    """Takes a string of authors from KoReader and returns a list
    of Authors from our database
    """
    from books.models import Author

    author_str_list = ko_author_str.split(", ")
    author_list = []
    for author_str in author_str_list:
        logger.debug(f"Looking up author {author_str}")
        # KoReader gave us nothing, bail
        if author_str == "N/A":
            logger.warning("KoReader author string is N/A, no authors to find")
            continue

        author = Author.objects.filter(name=author_str).first()
        if not author:
            author = Author.objects.create(name=author_str)
            # TODO Move these to async processes after importing
            # author.fix_metadata()
            logger.debug(f"Created author {author}")
        author_list.append(author)
    return author_list
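
# Usage sketch (hypothetical data): given "Terry Pratchett, Neil Gaiman",
# returns the matching Author rows, creating any that don't exist yet.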


def create_book_from_row(row: list):
    from books.models import Book

    # No KoReader book yet, create it
    author_str = get_author_str_from_row(row).replace("\x00", "")
    total_pages = row[KoReaderBookColumn.PAGES.value]
    run_time = total_pages * Book.AVG_PAGE_READING_SECONDS

    book_title = row[KoReaderBookColumn.TITLE.value].replace("\x00", "")
    if " - " in book_title:
        split_title = book_title.split(" - ")
        book_title = split_title[0]
        if (not author_str or author_str == "N/A") and len(split_title) > 1:
            author_str = split_title[1].split("_")[0]

    # Strip NUL bytes from all string fields before persisting the raw row
    clean_row = []
    for value in row:
        if isinstance(value, str):
            value = value.replace("\x00", "")
        clean_row.append(value)

    book = Book.objects.create(
        title=book_title.replace("_", ":"),
        pages=total_pages,
        koreader_data_by_hash={
            str(row[KoReaderBookColumn.MD5.value]): {
                "title": book_title,
                "author_str": author_str,
                "book_id": row[KoReaderBookColumn.ID.value],
                "raw_row_data": clean_row,
            }
        },
        run_time_seconds=run_time,
    )
    # TODO Move these to async processes after importing
    # book.fix_metadata()

    # Add authors
    author_list = lookup_or_create_authors_from_author_str(author_str)
    if author_list:
        book.authors.add(*author_list)

    return book
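
# Title parsing sketch (hypothetical row data): a filename-style KoReader
# title such as "Small Gods - Terry Pratchett_epub" yields the title
# "Small Gods" and, when the author column is empty or "N/A", the author
# "Terry Pratchett" (the segment after " - ", up to the first underscore).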


def build_book_map(rows) -> dict:
    """Given an iterable of sqlite rows from the books table, lookup existing
    books, create ones that don't exist, and return a mapping of koreader IDs to
    primary key IDs for page creation.
    """
    from books.models import Book

    book_id_map = {}
    for book_row in rows:
        if book_row[KoReaderBookColumn.TITLE.value] in BOOKS_TITLES_TO_IGNORE:
            logger.info(
                "[build_book_map] Ignoring book title that is likely garbage",
                extra={"book_row": book_row, "media_type": "Book"},
            )
            continue
        book = Book.objects.filter(
            koreader_data_by_hash__icontains=book_row[
                KoReaderBookColumn.MD5.value
            ]
        ).first()

        if not book:
            title = (
                book_row[KoReaderBookColumn.TITLE.value]
                .split(" - ")[0]
                .lower()
                .replace("\x00", "")
            )
            book = Book.objects.filter(title=title).first()

        if not book:
            book = create_book_from_row(book_row)

        book.refresh_from_db()

        total_seconds = 0
        if book_row[KoReaderBookColumn.TOTAL_READ_TIME.value]:
            total_seconds = book_row[KoReaderBookColumn.TOTAL_READ_TIME.value]

        book_id_map[book_row[KoReaderBookColumn.ID.value]] = {
            "book_id": book.id,
            "hash": book_row[KoReaderBookColumn.MD5.value],
            "total_seconds": total_seconds,
        }
    return book_id_map
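
# Shape of the returned map (illustrative values): KoReader's own book IDs
# key to our Book primary key plus import metadata, e.g.
#   {42: {"book_id": 7, "hash": "d41d8cd98f00...", "total_seconds": 5400}}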


def build_page_data(page_rows: list, book_map: dict, user_tz=None) -> dict:
    """Given rows of page data from KoReader, parse each row and build
    scrobbles for our user, loading the page data into the page_data
    field on the scrobble instance.

    Note: user_tz is accepted but not currently used; timestamps are
    localized later via the user's profile.
    """
    book_ids_not_found = []
    for page_row in page_rows:
        koreader_book_id = page_row[KoReaderPageStatColumn.ID_BOOK.value]
        if koreader_book_id not in book_map.keys():
            book_ids_not_found.append(koreader_book_id)
            continue
        if "pages" not in book_map[koreader_book_id].keys():
            book_map[koreader_book_id]["pages"] = {}

        page_number = page_row[KoReaderPageStatColumn.PAGE.value]
        duration = page_row[KoReaderPageStatColumn.DURATION.value]
        start_ts = page_row[KoReaderPageStatColumn.START_TIME.value]

        book_map[koreader_book_id]["pages"][page_number] = {
            "duration": duration,
            "start_ts": start_ts,
            "end_ts": start_ts + duration,
        }
    if book_ids_not_found:
        logger.info(
            f"Found pages for books not in file: {set(book_ids_not_found)}"
        )
    return book_map
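
# After this pass each mapped book gains a "pages" dict keyed by page number
# (illustrative values):
#   book_map[42]["pages"][118] = {
#       "duration": 55, "start_ts": 1700000000, "end_ts": 1700000055,
#   }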


def build_scrobbles_from_book_map(
    book_map: dict, user: "User"
) -> list["Scrobble"]:
    Scrobble = apps.get_model("scrobbles", "Scrobble")

    scrobbles_to_create = []
    pages_not_found = []
    for koreader_book_id, book_dict in book_map.items():
        book_id = book_dict["book_id"]
        if "pages" not in book_dict.keys():
            pages_not_found.append(book_id)
            continue

        should_create_scrobble = False
        scrobble_page_data = {}
        playback_position_seconds = 0
        prev_page_stats = {}
        last_page_number = 0
        pages_processed = 0
        total_pages_read = len(book_map[koreader_book_id]["pages"])

        ordered_pages = sorted(
            book_map[koreader_book_id]["pages"].items(),
            key=lambda x: x[1]["start_ts"],
        )
        for cur_page_number, stats in ordered_pages:
            pages_processed += 1

            seconds_from_last_page = 0
            if prev_page_stats:
                seconds_from_last_page = stats.get(
                    "end_ts"
                ) - prev_page_stats.get("start_ts")
            playback_position_seconds = playback_position_seconds + stats.get(
                "duration"
            )

            end_of_reading = pages_processed == total_pages_read
            big_jump_to_this_page = (cur_page_number - last_page_number) > 10
            is_session_gap = seconds_from_last_page > SESSION_GAP_SECONDS
            if (
                is_session_gap and not big_jump_to_this_page
            ) or end_of_reading:
                should_create_scrobble = True

            if should_create_scrobble:
                scrobble_page_data = dict(
                    sorted(
                        scrobble_page_data.items(),
                        key=lambda x: x[1]["start_ts"],
                    )
                )
                try:
                    first_page = scrobble_page_data.get(
                        list(scrobble_page_data.keys())[0]
                    )
                    last_page = scrobble_page_data.get(
                        list(scrobble_page_data.keys())[-1]
                    )
                except IndexError:
                    logger.error(
                        "Could not process book, no page data found",
                        extra={"scrobble_page_data": scrobble_page_data},
                    )
                    continue

                timestamp = user.profile.get_timestamp_with_tz(
                    datetime.fromtimestamp(int(first_page.get("start_ts")))
                )
                stop_timestamp = user.profile.get_timestamp_with_tz(
                    datetime.fromtimestamp(int(last_page.get("end_ts")))
                )
                # Adjust for Daylight Saving Time
                if timestamp.dst() == timedelta(
                    0
                ) or stop_timestamp.dst() == timedelta(0):
                    timestamp = timestamp - timedelta(hours=1)
                    stop_timestamp = stop_timestamp - timedelta(hours=1)

                scrobble = Scrobble.objects.filter(
                    timestamp=timestamp,
                    book_id=book_id,
                    user_id=user.id,
                ).first()
                if not scrobble:
                    logger.info(
                        f"Queueing scrobble for {book_id}, page {cur_page_number}"
                    )
                    log_data = {
                        "koreader_hash": book_dict.get("hash"),
                        "page_data": scrobble_page_data,
                        "pages_read": len(scrobble_page_data.keys()),
                    }
                    scrobbles_to_create.append(
                        Scrobble(
                            book_id=book_id,
                            user_id=user.id,
                            source="KOReader",
                            media_type=Scrobble.MediaType.BOOK,
                            timestamp=timestamp,
                            log=log_data,
                            stop_timestamp=stop_timestamp,
                            playback_position_seconds=playback_position_seconds,
                            in_progress=False,
                            played_to_completion=True,
                            long_play_complete=False,
                            timezone=timestamp.tzinfo.name,
                        )
                    )

                # Then start over
                should_create_scrobble = False
                playback_position_seconds = 0
                scrobble_page_data = {}

            # We accumulate pages for the scrobble until we should create a new one
            scrobble_page_data[cur_page_number] = stats
            last_page_number = cur_page_number
            prev_page_stats = stats

    if pages_not_found:
        logger.info(f"Pages not found for books: {set(pages_not_found)}")
    return scrobbles_to_create
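
# Sessionization sketch (hypothetical timestamps): pages finishing at t=0s,
# 60s, and 130s, then one at t=7200s. The last gap exceeds
# SESSION_GAP_SECONDS (1800), so the first three pages are flushed as one
# Scrobble and the fourth page begins the next session's accumulated pages.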


def fix_long_play_stats_for_scrobbles(scrobbles: list) -> None:
    """Given a list of scrobbles, update pages read, long play seconds and check
    for media completion"""
    for scrobble in scrobbles:
        # If the previous scrobble hasn't finished the book, carry its
        # accumulated long play seconds forward into this one
        if scrobble.previous and not scrobble.previous.long_play_complete:
            scrobble.long_play_seconds = scrobble.playback_position_seconds + (
                scrobble.previous.long_play_seconds or 0
            )
        else:
            scrobble.long_play_seconds = scrobble.playback_position_seconds
        scrobble.log["book_pages_read"] = scrobble.calc_pages_read()
        scrobble.save(update_fields=["log", "long_play_seconds"])
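
# Accumulation sketch (hypothetical values): if the previous scrobble carried
# long_play_seconds=600 and this session played 300s, this scrobble is saved
# with long_play_seconds=900; once a scrobble is long_play_complete, the
# counter restarts from the current session.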


def process_koreader_sqlite_file(file_path, user_id) -> list:
    """Given a sqlite file from KoReader, open the book table, iterate
    over rows creating scrobbles from each book found"""
    Scrobble = apps.get_model("scrobbles", "Scrobble")
    new_scrobbles = []

    user = User.objects.filter(id=user_id).first()
    tz = ZoneInfo("UTC")
    if user:
        tz = user.profile.tzinfo

    is_os_file = "https://" not in file_path
    if is_os_file:
        # Loading sqlite file from local filesystem
        con = sqlite3.connect(file_path)
        cur = con.cursor()
        try:
            book_map = build_book_map(cur.execute("SELECT * FROM book"))
        except sqlite3.OperationalError:
            logger.warning("KOReader sqlite file has no table: book")
            return new_scrobbles

        book_map = build_page_data(
            cur.execute(
                "SELECT * from page_stat_data ORDER BY id_book, start_time"
            ),
            book_map,
            tz,
        )
        new_scrobbles = build_scrobbles_from_book_map(book_map, user)
    else:
        # Streaming the sqlite file off S3
        book_map = {}
        for table_name, pragma_table_info, rows in stream_sqlite(
            _sqlite_bytes(file_path), max_buffer_size=1_048_576
        ):
            logger.debug(f"Found table {table_name} - processing")
            if table_name == "book":
                book_map = build_book_map(rows)
        for table_name, pragma_table_info, rows in stream_sqlite(
            _sqlite_bytes(file_path), max_buffer_size=1_048_576
        ):
            if table_name == "page_stat_data":
                book_map = build_page_data(rows, book_map, tz)
        new_scrobbles = build_scrobbles_from_book_map(book_map, user)

    logger.info(f"Creating {len(new_scrobbles)} new scrobbles")
    created = []
    if new_scrobbles:
        created = Scrobble.objects.bulk_create(new_scrobbles)
        if created:
            ScrobbleNtfyNotification(created[-1]).send()
            fix_long_play_stats_for_scrobbles(created)
    logger.info(
        f"Created {len(created)} scrobbles",
        extra={"created_scrobbles": created},
    )
    return created
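
# Usage sketch (hypothetical paths): the importer accepts a local path or an
# https:// URL, which is streamed table-by-table rather than opened directly:
#   process_koreader_sqlite_file("/tmp/statistics.sqlite3", user_id=1)
#   process_koreader_sqlite_file("https://example.com/statistics.sqlite3", user_id=1)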


def fetch_file_from_webdav(user_id: int) -> str:
    file_path = f"/tmp/{user_id}-koreader-import.sqlite3"
    client = get_webdav_client(user_id)
    if not client:
        logger.warning("could not get webdav client for user")
        # TODO maybe we raise an exception here?
        return ""

    client.download_sync(
        remote_path="var/koreader/statistics.sqlite3",
        local_path=file_path,
    )
    return file_path
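
# End-to-end sketch (hypothetical user id; assumes webdav credentials are
# configured for that user):
#   path = fetch_file_from_webdav(user_id=1)
#   if path:
#       created = process_koreader_sqlite_file(path, user_id=1)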