import logging
import re
import sqlite3
from datetime import datetime, timedelta
from enum import Enum
from zoneinfo import ZoneInfo

import requests
from books.constants import BOOKS_TITLES_TO_IGNORE
from django.apps import apps
from django.contrib.auth import get_user_model
from scrobbles.constants import MediaType
from scrobbles.notifications import ScrobbleNtfyNotification
from stream_sqlite import stream_sqlite
from webdav.client import get_webdav_client

logger = logging.getLogger(__name__)
User = get_user_model()


class KoReaderBookColumn(Enum):
    ID = 0
    TITLE = 1
    AUTHORS = 2
    NOTES = 3
    LAST_OPEN = 4
    HIGHLIGHTS = 5
    PAGES = 6
    SERIES = 7
    LANGUAGE = 8
    MD5 = 9
    TOTAL_READ_TIME = 10
    TOTAL_READ_PAGES = 11


class KoReaderPageStatColumn(Enum):
    ID_BOOK = 0
    PAGE = 1
    START_TIME = 2
    DURATION = 3
    TOTAL_PAGES = 4


def _sqlite_bytes(sqlite_url):
    """Stream the remote sqlite file in chunks instead of loading it all into memory."""
    with requests.get(sqlite_url, stream=True) as r:
        yield from r.iter_content(chunk_size=65_536)


# Grace period between page reads before we consider it a new scrobble
SESSION_GAP_SECONDS = 1800  # a half hour


def get_author_str_from_row(row):
    """Given the raw author string from KoReader, convert it to a single line and
    strip the middle initials, as OpenLibrary lookup usually fails with those.
    """
    ko_authors = row[KoReaderBookColumn.AUTHORS.value].replace("\n", ", ")
    # Strip middle initials, OpenLibrary often fails with these
    return re.sub(r" [A-Z]\. ", " ", ko_authors)
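

# Illustrative transformation (example names only; KoReader separates multiple
# authors with newlines):
#   "Ursula K. Le Guin\nDavid Mitchell" -> "Ursula Le Guin, David Mitchell"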


def lookup_or_create_authors_from_author_str(ko_author_str: str) -> list:
    """Takes a string of authors from KoReader and returns a list
    of Authors from our database
    """
    from books.models import Author

    author_str_list = ko_author_str.split(", ")
    author_list = []
    for author_str in author_str_list:
        logger.debug(f"Looking up author {author_str}")

        # KoReader gave us nothing, bail
        if author_str == "N/A":
            logger.warning("KoReader author string is N/A, no authors to find")
            continue

        author = Author.objects.filter(name=author_str).first()
        if not author:
            author = Author.objects.create(name=author_str)
            # TODO Move these to async processes after importing
            # author.fix_metadata()
            logger.debug(f"Created author {author}")
        author_list.append(author)
    return author_list


def create_book_from_row(row: list):
    from books.models import Book

    # No KoReader book yet, create it
    author_str = get_author_str_from_row(row).replace("\x00", "")
    total_pages = row[KoReaderBookColumn.PAGES.value]
    run_time = total_pages * Book.AVG_PAGE_READING_SECONDS

    book_title = row[KoReaderBookColumn.TITLE.value].replace("\x00", "")
    if " - " in book_title:
        split_title = book_title.split(" - ")
        book_title = split_title[0]
        if (not author_str or author_str == "N/A") and len(split_title) > 1:
            author_str = split_title[1].split("_")[0]

    clean_row = []
    for value in row:
        if isinstance(value, str):
            value = value.replace("\x00", "")
        clean_row.append(value)

    book = Book.objects.create(
        title=book_title.replace("_", ":"),
        pages=total_pages,
        koreader_data_by_hash={
            str(row[KoReaderBookColumn.MD5.value]): {
                "title": book_title,
                "author_str": author_str,
                "book_id": row[KoReaderBookColumn.ID.value],
                "raw_row_data": clean_row,
            }
        },
        base_run_time_seconds=run_time,
    )
    # TODO Move these to async processes after importing
    # book.fix_metadata()

    # Add authors
    author_list = lookup_or_create_authors_from_author_str(author_str)
    if author_list:
        book.authors.add(*author_list)
    # self._lookup_authors
    return book


def build_book_map(rows) -> dict:
    """Given an iterable of sqlite rows from the books table, look up existing
    books, create ones that don't exist, and return a mapping of KoReader IDs to
    primary key IDs for page creation.
    """
    from books.models import Book

    book_id_map = {}
    for book_row in rows:
        if book_row[KoReaderBookColumn.TITLE.value] in BOOKS_TITLES_TO_IGNORE:
            logger.info(
                "[build_book_map] Ignoring book title that is likely garbage",
                extra={"book_row": book_row, "media_type": "Book"},
            )
            continue

        book = Book.objects.filter(
            koreader_data_by_hash__icontains=book_row[
                KoReaderBookColumn.MD5.value
            ]
        ).first()
        if not book:
            title = (
                book_row[KoReaderBookColumn.TITLE.value]
                .split(" - ")[0]
                .lower()
                .replace("\x00", "")
            )
            book = Book.objects.filter(title=title).first()
        if not book:
            book = create_book_from_row(book_row)

        book.refresh_from_db()

        total_seconds = 0
        if book_row[KoReaderBookColumn.TOTAL_READ_TIME.value]:
            total_seconds = book_row[KoReaderBookColumn.TOTAL_READ_TIME.value]

        book_id_map[book_row[KoReaderBookColumn.ID.value]] = {
            "book_id": book.id,
            "hash": book_row[KoReaderBookColumn.MD5.value],
            "total_seconds": total_seconds,
        }
    return book_id_map
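

# Shape of the mapping returned by build_book_map (values are illustrative):
#   {<koreader_book_id>: {"book_id": <Book pk>, "hash": "<md5>", "total_seconds": 1234}}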


def build_page_data(page_rows: list, book_map: dict, user_tz=None) -> dict:
    """Given rows of page data from KoReader, parse each row and build
    scrobbles for our user, loading the page data into the page_data
    field on the scrobble instance.
    """
    book_ids_not_found = []
    for page_row in page_rows:
        koreader_book_id = page_row[KoReaderPageStatColumn.ID_BOOK.value]
        if koreader_book_id not in book_map.keys():
            book_ids_not_found.append(koreader_book_id)
            continue

        if "pages" not in book_map[koreader_book_id].keys():
            book_map[koreader_book_id]["pages"] = {}

        page_number = page_row[KoReaderPageStatColumn.PAGE.value]
        duration = page_row[KoReaderPageStatColumn.DURATION.value]
        start_ts = page_row[KoReaderPageStatColumn.START_TIME.value]
        book_map[koreader_book_id]["pages"][page_number] = {
            "duration": duration,
            "start_ts": start_ts,
            "end_ts": start_ts + duration,
        }
    if book_ids_not_found:
        logger.info(
            f"Found pages for books not in file: {set(book_ids_not_found)}"
        )
    return book_map
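

# After build_page_data, each matched book entry also carries a "pages" dict keyed
# by page number (values are illustrative):
#   book_map[<koreader_book_id>]["pages"][17] = {
#       "duration": 55, "start_ts": 1700000000, "end_ts": 1700000055,
#   }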


def build_scrobbles_from_book_map(
    book_map: dict, user: "User"
) -> list["Scrobble"]:
    Scrobble = apps.get_model("scrobbles", "Scrobble")

    scrobbles_to_create = []
    pages_not_found = []
    for koreader_book_id, book_dict in book_map.items():
        book_id = book_dict["book_id"]
        if "pages" not in book_dict.keys():
            pages_not_found.append(book_id)
            continue

        should_create_scrobble = False
        scrobble_page_data = {}
        playback_position_seconds = 0
        prev_page_stats = {}
        last_page_number = 0
        pages_processed = 0
        total_pages_read = len(book_map[koreader_book_id]["pages"])

        ordered_pages = sorted(
            book_map[koreader_book_id]["pages"].items(),
            key=lambda x: x[1]["start_ts"],
        )
        for cur_page_number, stats in ordered_pages:
            pages_processed += 1

            seconds_from_last_page = 0
            if prev_page_stats:
                seconds_from_last_page = stats.get(
                    "end_ts"
                ) - prev_page_stats.get("start_ts")
            playback_position_seconds = playback_position_seconds + stats.get(
                "duration"
            )

            end_of_reading = pages_processed == total_pages_read
            big_jump_to_this_page = (cur_page_number - last_page_number) > 10
            is_session_gap = seconds_from_last_page > SESSION_GAP_SECONDS
            if (
                is_session_gap and not big_jump_to_this_page
            ) or end_of_reading:
                should_create_scrobble = True

            if should_create_scrobble:
                scrobble_page_data = dict(
                    sorted(
                        scrobble_page_data.items(),
                        key=lambda x: x[1]["start_ts"],
                    )
                )
                try:
                    first_page = scrobble_page_data.get(
                        list(scrobble_page_data.keys())[0]
                    )
                    last_page = scrobble_page_data.get(
                        list(scrobble_page_data.keys())[-1]
                    )
                except IndexError:
                    logger.error(
                        "Could not process book, no page data found",
                        extra={"scrobble_page_data": scrobble_page_data},
                    )
                    continue

                timestamp = user.profile.get_timestamp_with_tz(
                    datetime.fromtimestamp(int(first_page.get("start_ts")))
                )
                stop_timestamp = user.profile.get_timestamp_with_tz(
                    datetime.fromtimestamp(int(last_page.get("end_ts")))
                )
                # Adjust for Daylight Saving Time
                # if timestamp.dst() == timedelta(
                #     0
                # ) or stop_timestamp.dst() == timedelta(0):
                #     timestamp = timestamp - timedelta(hours=1)
                #     stop_timestamp = stop_timestamp - timedelta(hours=1)
                timestamp -= timestamp.dst() or timedelta(0)
                stop_timestamp -= stop_timestamp.dst() or timedelta(0)

                scrobble = Scrobble.objects.filter(
                    timestamp=timestamp,
                    book_id=book_id,
                    user_id=user.id,
                ).first()
                if not scrobble:
                    logger.info(
                        f"Queueing scrobble for {book_id}, page {cur_page_number}"
                    )
                    log_data = {
                        "koreader_hash": book_dict.get("hash"),
                        "page_data": scrobble_page_data,
                        "pages_read": len(scrobble_page_data.keys()),
                    }
                    scrobbles_to_create.append(
                        Scrobble(
                            book_id=book_id,
                            user_id=user.id,
                            source="KOReader",
                            media_type=MediaType.BOOK,
                            timestamp=timestamp,
                            log=log_data,
                            stop_timestamp=stop_timestamp,
                            playback_position_seconds=playback_position_seconds,
                            in_progress=False,
                            played_to_completion=True,
                            long_play_complete=False,
                            timezone=timestamp.tzinfo.name,
                        )
                    )

                # Then start over
                should_create_scrobble = False
                playback_position_seconds = 0
                scrobble_page_data = {}

            # We accumulate pages for the scrobble until we should create a new one
            scrobble_page_data[cur_page_number] = stats
            last_page_number = cur_page_number
            prev_page_stats = stats

    if pages_not_found:
        logger.info(f"Pages not found for books: {set(pages_not_found)}")
    return scrobbles_to_create
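

# Session-splitting behaviour of build_scrobbles_from_book_map (illustrative): if two
# consecutive page reads are more than SESSION_GAP_SECONDS (30 minutes) apart and the
# reader did not jump more than 10 pages, the pages accumulated so far become one
# Scrobble and accumulation restarts with the page after the gap; the final page of a
# book always closes out a Scrobble.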


def fix_long_play_stats_for_scrobbles(scrobbles: list) -> None:
    """Given a list of scrobbles, update pages read, long play seconds and check
    for media completion"""
    for scrobble in scrobbles:
        # If the previous scrobble didn't finish the book, accumulate its long play time
        if scrobble.previous and not scrobble.previous.long_play_complete:
            scrobble.long_play_seconds = scrobble.playback_position_seconds + (
                scrobble.previous.long_play_seconds or 0
            )
        else:
            scrobble.long_play_seconds = scrobble.playback_position_seconds

        scrobble.log["book_pages_read"] = scrobble.calc_pages_read()
        scrobble.save(update_fields=["log", "long_play_seconds"])


def process_koreader_sqlite_file(file_path, user_id) -> list:
    """Given a sqlite file from KoReader, open the book table, iterate
    over rows creating scrobbles from each book found"""
    Scrobble = apps.get_model("scrobbles", "Scrobble")
    new_scrobbles = []
    user = User.objects.filter(id=user_id).first()
    tz = ZoneInfo("UTC")
    if user:
        tz = user.profile.tzinfo

    is_os_file = "https://" not in file_path
    if is_os_file:
        # Loading sqlite file from local filesystem
        con = sqlite3.connect(file_path)
        cur = con.cursor()
        try:
            book_map = build_book_map(cur.execute("SELECT * FROM book"))
        except sqlite3.OperationalError:
            logger.warning("KOReader sqlite file had no table: book")
            return new_scrobbles

        book_map = build_page_data(
            cur.execute(
                "SELECT * from page_stat_data ORDER BY id_book, start_time"
            ),
            book_map,
            tz,
        )
        new_scrobbles = build_scrobbles_from_book_map(book_map, user)
    else:
        # Streaming the sqlite file off S3
        book_map = {}
        for table_name, pragma_table_info, rows in stream_sqlite(
            _sqlite_bytes(file_path), max_buffer_size=1_048_576
        ):
            logger.debug(f"Found table {table_name} - processing")
            if table_name == "book":
                book_map = build_book_map(rows)

        for table_name, pragma_table_info, rows in stream_sqlite(
            _sqlite_bytes(file_path), max_buffer_size=1_048_576
        ):
            if table_name == "page_stat_data":
                book_map = build_page_data(rows, book_map, tz)

        new_scrobbles = build_scrobbles_from_book_map(book_map, user)

    logger.info(f"Creating {len(new_scrobbles)} new scrobbles")
    created = []
    if new_scrobbles:
        created = Scrobble.objects.bulk_create(new_scrobbles)
        if created:
            ScrobbleNtfyNotification(created[-1]).send()
        fix_long_play_stats_for_scrobbles(created)
        logger.info(
            f"Created {len(created)} scrobbles",
            extra={"created_scrobbles": created},
        )
    return created


def fetch_file_from_webdav(user_id: int) -> str:
    file_path = f"/tmp/{user_id}-koreader-import.sqlite3"
    client = get_webdav_client(user_id)
    if not client:
        logger.warning("could not get webdav client for user")
        # TODO maybe we raise an exception here?
        return ""

    client.download_sync(
        remote_path="var/koreader/statistics.sqlite3",
        local_path=file_path,
    )
    return file_path
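

# Minimal usage sketch tying the two entry points together (assumes a user with
# webdav credentials configured; the user_id below is illustrative):
#
#   file_path = fetch_file_from_webdav(user_id=1)
#   if file_path:
#       process_koreader_sqlite_file(file_path, user_id=1)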