瀏覽代碼

Add proper author lookups and fix bad OL fixes

Colin Powell 2 年之前
父節點
當前提交
d19838a26f
共有 3 個文件被更改,包括 96 次插入 和 38 次删除
  1. + 52 - 32
      vrobbler/apps/books/koreader.py
  2. + 4 - 2
      vrobbler/apps/books/models.py
  3. + 40 - 4
      vrobbler/apps/books/openlibrary.py

+ 52 - 32
vrobbler/apps/books/koreader.py

@@ -12,6 +12,7 @@ from books.models import Author, Book, Page
 from pylast import httpx, tempfile
 from scrobbles.models import Scrobble
 from stream_sqlite import stream_sqlite
+from vrobbler.apps.books.openlibrary import get_author_openlibrary_id
 
 logger = logging.getLogger(__name__)
 
@@ -53,39 +54,55 @@ def get_book_map_from_sqlite(rows: Iterable) -> dict:
     book_id_map = {}
 
     for book_row in rows:
-        authors = book_row[KoReaderBookColumn.AUTHORS.value].split("\n")
-        author_list = []
-        for author_str in authors:
-            logger.debug(f"Looking up author {author_str}")
-            if author_str == "N/A":
-                continue
-
-            author, created = Author.objects.get_or_create(name=author_str)
-            if created:
-                author.fix_metadata()
-            author_list.append(author)
-            logger.debug(f"Found author {author}, created: {created}")
+        book = Book.objects.filter(
+            koreader_md5=book_row[KoReaderBookColumn.MD5.value]
+        ).first()
 
-        book, created = Book.objects.get_or_create(
-            title=book_row[KoReaderBookColumn.TITLE.value]
-        )
+        if not book:
+            book, created = Book.objects.get_or_create(
+                title=book_row[KoReaderBookColumn.TITLE.value]
+            )
 
-        if created:
-            total_pages = book_row[KoReaderBookColumn.PAGES.value]
-            run_time = total_pages * book.AVG_PAGE_READING_SECONDS
-            book_dict = {
-                "title": book_row[KoReaderBookColumn.TITLE.value],
-                "pages": total_pages,
-                "koreader_md5": book_row[KoReaderBookColumn.MD5.value],
-                "koreader_id": int(book_row[KoReaderBookColumn.ID.value]),
-                "koreader_authors": book_row[KoReaderBookColumn.AUTHORS.value],
-                "run_time_seconds": run_time,
-            }
-            Book.objects.filter(pk=book.id).update(**book_dict)
-            book.fix_metadata()
-
-            if author_list:
-                book.authors.add(*[a.id for a in author_list])
+            if created:
+                total_pages = book_row[KoReaderBookColumn.PAGES.value]
+                run_time = total_pages * book.AVG_PAGE_READING_SECONDS
+                ko_authors = book_row[
+                    KoReaderBookColumn.AUTHORS.value
+                ].replace("\n", ", ")
+                book_dict = {
+                    "title": book_row[KoReaderBookColumn.TITLE.value],
+                    "pages": total_pages,
+                    "koreader_md5": book_row[KoReaderBookColumn.MD5.value],
+                    "koreader_id": int(book_row[KoReaderBookColumn.ID.value]),
+                    "koreader_authors": ko_authors,
+                    "run_time_seconds": run_time,
+                }
+                Book.objects.filter(pk=book.id).update(**book_dict)
+
+                # Add authors
+                authors = book_row[KoReaderBookColumn.AUTHORS.value].split(
+                    "\n"
+                )
+                author_list = []
+                for author_str in authors:
+                    logger.debug(f"Looking up author {author_str}")
+                    if author_str == "N/A":
+                        continue
+
+                    author, created = Author.objects.get_or_create(
+                        name=author_str
+                    )
+                    if created:
+                        author.openlibrary_id = get_author_openlibrary_id(
+                            author_str
+                        )
+                        author.save(update_fields=["openlibrary_id"])
+                        author.fix_metadata()
+                        logger.debug(f"Created author {author}")
+                    book.authors.add(author)
+
+                # This will try to fix metadata by looking it up on OL
+                book.fix_metadata()
 
         playback_position_seconds = 0
         if book_row[KoReaderBookColumn.TOTAL_READ_TIME.value]:
@@ -101,6 +118,7 @@ def get_book_map_from_sqlite(rows: Iterable) -> dict:
         timestamp = datetime.utcfromtimestamp(
             book_row[KoReaderBookColumn.LAST_OPEN.value]
         ).replace(tzinfo=pytz.utc)
+        book.refresh_from_db()
         book_id_map[book.koreader_id] = book.id
 
     return book_id_map
@@ -130,6 +148,7 @@ def build_scrobbles_from_pages(
             ]
             page.save(update_fields=["start_time", "duration_seconds"])
             page.refresh_from_db()
+
         if page.is_scrobblable:
             # Page number is a placeholder, we'll re-preocess this after creation
             logger.debug(
@@ -155,11 +174,12 @@ def enrich_koreader_scrobbles(scrobbles: list) -> None:
 
     for scrobble in scrobbles:
         if scrobble.next:
+            # Set pages read to the starting page of the next scrobble minus one, if it exists
             scrobble.book_pages_read = scrobble.next.book_pages_read - 1
             scrobble.save(update_fields=["book_pages_read"])
         else:
+            # Set pages read to the last page we have
             scrobble.book_pages_read = scrobble.book.page_set.last().number
-            scrobble.long_play_complete =
 
         scrobble.save(update_fields=["book_pages_read", "long_play_complete"])
 

+ 4 - 2
vrobbler/apps/books/models.py

@@ -98,7 +98,10 @@ class Book(LongPlayScrobblableMixin):
 
     def fix_metadata(self, force_update=False):
         if not self.openlibrary_id or force_update:
-            book_dict = lookup_book_from_openlibrary(self.title, self.author)
+            author_name = ""
+            if self.author:
+                author_name = self.author.name
+            book_dict = lookup_book_from_openlibrary(self.title, author_name)
             if not book_dict:
                 logger.warn(f"Book not found in OL {self.title}")
                 return
@@ -115,7 +118,6 @@ class Book(LongPlayScrobblableMixin):
                 logger.warn(
                     f"OL and KoReader disagree on this book title {self.title} != {ol_title}"
                 )
-                return
 
             Book.objects.filter(pk=self.id).update(**book_dict)
             self.refresh_from_db()

+ 40 - 4
vrobbler/apps/books/openlibrary.py

@@ -1,14 +1,15 @@
 import json
 import logging
+import re
 import urllib
 
 import requests
 
 logger = logging.getLogger(__name__)
 
-SEARCH_URL = "https://openlibrary.org/search.json?title={title}"
 ISBN_URL = "https://openlibrary.org/isbn/{isbn}.json"
-SEARCH_URL = "https://openlibrary.org/search.json?title={title}"
+SEARCH_URL = "https://openlibrary.org/search.json?q={query}&sort=editions&mode=everything"
+AUTHOR_SEARCH_URL = "https://openlibrary.org/search/authors.json?q={query}"
 COVER_URL = "https://covers.openlibrary.org/b/olid/{id}-L.jpg"
 AUTHOR_URL = "https://openlibrary.org/authors/{id}.json"
 AUTHOR_IMAGE_URL = "https://covers.openlibrary.org/a/olid/{id}-L.jpg"
@@ -21,6 +22,24 @@ def get_first(key: str, result: dict) -> str:
     return obj
 
 
+def get_author_openlibrary_id(name: str) -> str:
+    search_url = AUTHOR_SEARCH_URL.format(query=name)
+    response = requests.get(search_url)
+
+    if response.status_code != 200:
+        logger.warn(f"Bad response from OL: {response.status_code}")
+        return ""
+
+    results = json.loads(response.content)
+
+    if not results:
+        logger.warn(f"No author results found from search for {name}")
+        return ""
+
+    result = results.get("docs", [])
+    return result[0].get("key")
+
+
 def lookup_author_from_openlibrary(olid: str) -> dict:
     author_url = AUTHOR_URL.format(id=olid)
     response = requests.get(author_url)
@@ -58,7 +77,14 @@ def lookup_author_from_openlibrary(olid: str) -> dict:
 
 def lookup_book_from_openlibrary(title: str, author: str = None) -> dict:
     title_quoted = urllib.parse.quote(title)
-    search_url = SEARCH_URL.format(title=title_quoted)
+    author_quoted = ""
+    if author:
+        # Strip middle initials, OpenLibrary often fails with these
+        author = re.sub(" [A-Z]. ", " ", author)
+        author_quoted = urllib.parse.quote(author)
+    query = f"{title_quoted} {author_quoted}"
+
+    search_url = SEARCH_URL.format(query=query)
     response = requests.get(search_url)
 
     if response.status_code != 200:
@@ -71,7 +97,17 @@ def lookup_book_from_openlibrary(title: str, author: str = None) -> dict:
         logger.warn(f"No results found from OL for {title}")
         return {}
 
-    top = results.get("docs")[0]
+    top = None
+    for result in results.get("docs"):
+        # These Summary things suck and ruin our one-shot search
+        if "Summary of" not in result.get("title"):
+            top = result
+            break
+
+    if not top:
+        logger.warn(f"No book found for query {query}")
+        return {}
+
     ol_id = top.get("cover_edition_key")
     ol_author_id = get_first("author_key", top)
     first_sentence = ""