
Fix KOReader imports to use pages for scrobbles

Colin Powell, 2 years ago
commit f082bea571
2 changed files with 168 additions and 84 deletions
  1. vrobbler/apps/books/koreader.py (+137, -75)
  2. vrobbler/apps/scrobbles/models.py (+31, -9)

+ 137 - 75
vrobbler/apps/books/koreader.py

@@ -1,12 +1,17 @@
+import codecs
 import logging
-from datetime import datetime
+import os
 import sqlite3
+from datetime import datetime
 from enum import Enum
+from typing import Iterable, List
 
 import pytz
-
+import requests
 from books.models import Author, Book, Page
+import httpx
 from scrobbles.models import Scrobble
+from stream_sqlite import stream_sqlite
 
 logger = logging.getLogger(__name__)
 
@@ -34,54 +39,24 @@ class KoReaderPageStatColumn(Enum):
     TOTAL_PAGES = 4
 
 
-def process_pages_for_book(book_id, sqlite_file_path):
-    con = sqlite3.connect(sqlite_file_path)
-    cur = con.cursor()
+def _sqlite_bytes(sqlite_url):
+    with httpx.stream("GET", sqlite_url) as r:
+        yield from r.iter_bytes(chunk_size=65_536)
 
-    book = Book.objects.filter(koreader_id=book_id).first()
-    if not book:
-        logger.error(f"No book found with KoReader ID of {book_id}")
-        return
 
-    page_table = cur.execute(
-        f"SELECT * FROM page_stat_data where id_book={book_id}"
-    )
-    new_pages = []
-    for page_row in page_table:
-        page_number = page_row[KoReaderPageStatColumn.PAGE.value]
-        page, page_created = Page.objects.get_or_create(
-            book=book, number=page_number
-        )
-        if page_created:
-            ts = page_row[KoReaderPageStatColumn.START_TIME.value]
-            page.start_time = datetime.utcfromtimestamp(ts).replace(
-                tzinfo=pytz.utc
-            )
-            page.duration_seconds = page_row[
-                KoReaderPageStatColumn.DURATION.value
-            ]
-            page.save()
-            new_pages.append(page)
-    logger.info(f"Added {len(new_pages)} for book {book}")
-    return new_pages
+def get_book_map_from_sqlite(rows: Iterable) -> dict:
+    """Given an interable of sqlite rows from the books table, lookup existing
+    books, create ones that don't exist, and return a mapping of koreader IDs to
+    primary key IDs for page creation.
 
+    """
+    book_id_map = {}
 
-def process_koreader_sqlite_file(sqlite_file_path, user_id):
-    """Given a sqlite file from KoReader, open the book table, iterate
-    over rows creating scrobbles from each book found"""
-    # Create a SQL connection to our SQLite database
-    con = sqlite3.connect(sqlite_file_path)
-    cur = con.cursor()
-
-    # Return all results of query
-    book_table = cur.execute("SELECT * FROM book")
-    new_scrobbles = []
-    for book_row in book_table:
+    for book_row in rows:
         authors = book_row[KoReaderBookColumn.AUTHORS.value].split("\n")
         author_list = []
         for author_str in authors:
             logger.debug(f"Looking up author {author_str}")
-
             if author_str == "N/A":
                 continue
 
@@ -96,11 +71,11 @@ def process_koreader_sqlite_file(sqlite_file_path, user_id):
         )
 
         if created:
-            pages = book_row[KoReaderBookColumn.PAGES.value]
-            run_time = pages * book.AVG_PAGE_READING_SECONDS
+            total_pages = book_row[KoReaderBookColumn.PAGES.value]
+            run_time = total_pages * book.AVG_PAGE_READING_SECONDS
             book_dict = {
                 "title": book_row[KoReaderBookColumn.TITLE.value],
-                "pages": book_row[KoReaderBookColumn.PAGES.value],
+                "pages": total_pages,
                 "koreader_md5": book_row[KoReaderBookColumn.MD5.value],
                 "koreader_id": int(book_row[KoReaderBookColumn.ID.value]),
                 "koreader_authors": book_row[KoReaderBookColumn.AUTHORS.value],
@@ -117,6 +92,7 @@ def process_koreader_sqlite_file(sqlite_file_path, user_id):
             playback_position_seconds = book_row[
                 KoReaderBookColumn.TOTAL_READ_TIME.value
             ]
+
         pages_read = 0
         if book_row[KoReaderBookColumn.TOTAL_READ_PAGES.value]:
             pages_read = int(
@@ -125,37 +101,123 @@ def process_koreader_sqlite_file(sqlite_file_path, user_id):
         timestamp = datetime.utcfromtimestamp(
             book_row[KoReaderBookColumn.LAST_OPEN.value]
         ).replace(tzinfo=pytz.utc)
-        process_pages_for_book(book.koreader_id, sqlite_file_path)
-
-        new_scrobble = Scrobble(
-            book_id=book.id,
-            user_id=user_id,
-            source="KOReader",
-            timestamp=timestamp,
-            playback_position_seconds=playback_position_seconds,
-            played_to_completion=True,
-            in_progress=False,
-            book_pages_read=pages_read,
+        book_id_map[book.koreader_id] = book.id
+
+    return book_id_map
+
+
+def build_scrobbles_from_pages(
+    rows: Iterable, book_id_map: dict, user_id: int
+) -> List[Scrobble]:
+    new_scrobbles = []
+    for page_row in rows:
+        koreader_id = page_row[KoReaderPageStatColumn.ID_BOOK.value]
+        page_number = page_row[KoReaderPageStatColumn.PAGE.value]
+        ts = page_row[KoReaderPageStatColumn.START_TIME.value]
+        book_id = book_id_map[koreader_id]
+
+        page, page_created = Page.objects.get_or_create(
+            book_id=book_id, number=page_number, user_id=user_id
+        )
+        if page_created:
+            page.start_time = datetime.utcfromtimestamp(ts).replace(
+                tzinfo=pytz.utc
+            )
+            page.duration_seconds = page_row[
+                KoReaderPageStatColumn.DURATION.value
+            ]
+            page.save(update_fields=["start_time", "duration_seconds"])
+            page.refresh_from_db()
+        if page.is_scrobblable:
+            # Page number is a placeholder; we'll re-process this after creation
+            logger.debug(
+                f"Queueing scrobble for {page.book}, page {page.number}"
+            )
+            new_scrobble = Scrobble(
+                book_id=page.book_id,
+                user_id=user_id,
+                source="KOReader",
+                timestamp=page.start_time,
+                played_to_completion=True,
+                in_progress=False,
+                book_pages_read=page_number,
+                long_play_complete=False,
+            )
+            new_scrobbles.append(new_scrobble)
+    return new_scrobbles
+
+
+def enrich_koreader_scrobbles(scrobbles: list) -> None:
+    """Given a list of scrobbles, update pages read, long play seconds and check
+    for media completion"""
+
+    for scrobble in scrobbles:
+        if scrobble.next:
+            # A later scrobble exists, so this one ran up to the page before it
+            scrobble.book_pages_read = scrobble.next.book_pages_read - 1
+        else:
+            # Most recent scrobble: assume reading ended on the last recorded page
+            scrobble.book_pages_read = scrobble.book.page_set.last().number
+            scrobble.long_play_complete = True
+
+        scrobble.save(update_fields=["book_pages_read", "long_play_complete"])
+
+
+def process_koreader_sqlite_url(file_url, user_id) -> list:
+    book_id_map = {}
+    new_scrobbles = []
+
+    for table_name, pragma_table_info, rows in stream_sqlite(
+        _sqlite_bytes(file_url), max_buffer_size=1_048_576
+    ):
+        if table_name == "book":
+            book_id_map = get_book_map_from_sqlite(rows)
+
+        if table_name == "page_stat_data":
+            new_scrobbles = build_scrobbles_from_pages(
+                rows, book_id_map, user_id
+            )
+
+    created = []
+    if new_scrobbles:
+        created = Scrobble.objects.bulk_create(new_scrobbles)
+        enrich_koreader_scrobbles(created)
+        logger.info(
+            f"Created {len(created)} scrobbles",
+            extra={"created_scrobbles": created},
         )
+    return created
+
+
+def process_koreader_sqlite_file(file_path, user_id) -> list:
+    """Given a sqlite file from KoReader, open the book table, iterate
+    over rows creating scrobbles from each book found"""
+    # Create a SQL connection to our SQLite database
+    con = sqlite3.connect(file_path)
+    cur = con.cursor()
 
-        existing = Scrobble.objects.filter(
-            timestamp=timestamp, book=book
-        ).first()
-        if existing:
-            logger.debug(f"Skipping existing scrobble {new_scrobble}")
-            continue
-        if book.progress_for_user(user_id) >= Book.COMPLETION_PERCENT:
-            new_scrobble.long_play_complete = True
-
-        logger.debug(f"Queued scrobble {new_scrobble} for creation")
-        new_scrobbles.append(new_scrobble)
-
-    # Be sure to close the connection
-    con.close()
-
-    created = Scrobble.objects.bulk_create(new_scrobbles)
-    logger.info(
-        f"Created {len(created)} scrobbles",
-        extra={"created_scrobbles": created},
+    book_id_map = get_book_map_from_sqlite(cur.execute("SELECT * FROM book"))
+    new_scrobbles = build_scrobbles_from_pages(
+        cur.execute("SELECT * from page_stat_data"), book_id_map, user_id
     )
+
+    con.close()
+
+    created = []
+    if new_scrobbles:
+        created = Scrobble.objects.bulk_create(new_scrobbles)
+        enrich_koreader_scrobbles(created)
+        logger.info(
+            f"Created {len(created)} scrobbles",
+            extra={"created_scrobbles": created},
+        )
+    return created
+
+
+def process_koreader_sqlite(file_path: str, user_id: int) -> list:
+    is_os_file = not file_path.startswith("http")
+
+    if is_os_file:
+        created = process_koreader_sqlite_file(file_path, user_id)
+    else:
+        created = process_koreader_sqlite_url(file_path, user_id)
     return created
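
A minimal sketch of the streaming read pattern the new URL path relies on, runnable outside Django. It assumes the stream_sqlite and httpx packages are installed; the URL, the helper name sqlite_bytes, and the printed summary are placeholders for illustration, not part of this commit:

import httpx
from stream_sqlite import stream_sqlite


def sqlite_bytes(url, chunk_size=65_536):
    # Stream the remote sqlite file in chunks instead of downloading it whole
    with httpx.stream("GET", url) as response:
        yield from response.iter_bytes(chunk_size=chunk_size)


if __name__ == "__main__":
    # Hypothetical URL; point this at a real KOReader statistics.sqlite3 export
    url = "https://example.com/koreader/statistics.sqlite3"
    for table_name, _table_info, rows in stream_sqlite(
        sqlite_bytes(url), max_buffer_size=1_048_576
    ):
        # Each table streams through as an iterable of rows
        print(table_name, sum(1 for _ in rows))
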

+ 31 - 9
vrobbler/apps/scrobbles/models.py

@@ -126,6 +126,9 @@ class BaseFileImportMixin(TimeStampedModel):
         self.process_count = len(scrobbles)
         self.save(update_fields=["process_log", "process_count"])
 
+    @property
+    def upload_file_path(self):
+        raise NotImplementedError
+
 
 class KoReaderImport(BaseFileImportMixin):
     class Meta:
@@ -144,10 +147,18 @@ class KoReaderImport(BaseFileImportMixin):
         uuid = instance.uuid
         return f"koreader-uploads/{uuid}.{extension}"
 
+    @property
+    def upload_file_path(self) -> str:
+        if getattr(settings, "USE_S3_STORAGE", False):
+            path = self.sqlite_file.url
+        else:
+            path = self.sqlite_file.path
+        return path
+
     sqlite_file = models.FileField(upload_to=get_path, **BNULL)
 
     def process(self, force=False):
-        from books.koreader import process_koreader_sqlite_file
+        from books.koreader import process_koreader_sqlite
 
         if self.processed_finished and not force:
             logger.info(
@@ -156,8 +167,8 @@ class KoReaderImport(BaseFileImportMixin):
             return
 
         self.mark_started()
-        scrobbles = process_koreader_sqlite_file(
-            self.sqlite_file.path, self.user.id
+        scrobbles = process_koreader_sqlite(
+            self.upload_file_path, self.user.id
         )
         self.record_log(scrobbles)
         self.mark_finished()
@@ -180,6 +191,13 @@ class AudioScrobblerTSVImport(BaseFileImportMixin):
         uuid = instance.uuid
         return f"audioscrobbler-uploads/{uuid}.{extension}"
 
+    @property
+    def upload_file_path(self) -> str:
+        if getattr(settings, "USE_S3_STORAGE", False):
+            path = self.tsv_file.url
+        else:
+            path = self.tsv_file.path
+        return path
+
     tsv_file = models.FileField(upload_to=get_path, **BNULL)
 
     def process(self, force=False):
@@ -198,12 +216,8 @@ class AudioScrobblerTSVImport(BaseFileImportMixin):
         if self.user:
             user_id = self.user.id
             tz = self.user.profile.tzinfo
-        if getattr(settings, "USE_S3_STORAGE"):
-            tsv_str = self.tsv_file.url
-        else:
-            tsv_str = self.tsv_file.path
         scrobbles = process_audioscrobbler_tsv_file(
-            tsv_str, user_id, user_tz=tz
+            self.upload_file_path, user_id, user_tz=tz
         )
         self.record_log(scrobbles)
         self.mark_finished()
@@ -457,13 +471,21 @@ class Scrobble(TimeStampedModel):
         return is_stale
 
     @property
-    def previous(self):
+    def previous(self) -> Optional["Scrobble"]:
         return (
             self.media_obj.scrobble_set.order_by("-timestamp")
             .filter(timestamp__lt=self.timestamp)
             .first()
         )
 
+    @property
+    def next(self) -> Optional["Scrobble"]:
+        return (
+            self.media_obj.scrobble_set.order_by("timestamp")
+            .filter(timestamp__gt=self.timestamp)
+            .first()
+        )
+
     @property
     def session_pages_read(self) -> Optional[int]:
         """Look one scrobble back, if it isn't complete,"""