浏览代码

Add fuzzy matching for book titles

Colin Powell 1 年之前
父节点
当前提交
919fa1b0b4
共有 5 个文件被更改,包括 247 次插入和 569 次删除
  1. 226 563
      poetry.lock
  2. 1 0
      pyproject.toml
  3. 4 1
      vrobbler/apps/books/models.py
  4. 4 1
      vrobbler/apps/books/openlibrary.py
  5. 12 4
      vrobbler/apps/books/tests/test_openlibrary.py

文件差异内容过多而无法显示
+ 226 - 563
poetry.lock


+ 1 - 0
pyproject.toml

@@ -44,6 +44,7 @@ ipython = "^8.14.0"
 pendulum = "^2.1.2"
 trafilatura = "^1.6.3"
 django-imagekit = "^5.0.0"
+thefuzz = "^0.22.1"
 
 [tool.poetry.group.dev]
 optional = true

+ 4 - 1
vrobbler/apps/books/models.py

@@ -185,7 +185,10 @@ class Book(LongPlayScrobblableMixin):
             if "pages" in data.keys() and data.get("pages") == None:
                 data.pop("pages")
 
-            if not isinstance(data.get("pages"), int):
+            if (
+                not isinstance(data.get("pages"), int)
+                and "pages" in data.keys()
+            ):
                 logger.info(
                     f"Pages for {self} from OL expected to be int, but got {data.get('pages')}"
                 )

+ 4 - 1
vrobbler/apps/books/openlibrary.py

@@ -5,6 +5,8 @@ import urllib
 
 import requests
 
+from thefuzz import fuzz
+
 logger = logging.getLogger(__name__)
 
 ISBN_URL = "https://openlibrary.org/isbn/{isbn}.json"
@@ -102,8 +104,9 @@ def lookup_book_from_openlibrary(
 
     top = None
     for result in results.get("docs"):
-        if title.lower() == result.get("title", "").lower():
+        if fuzz.ratio(title.lower(), result.get("title", "").lower()) > 90:
             top = result
+            break
 
     if not top:
         for result in results.get("docs"):

+ 12 - 4
vrobbler/apps/books/tests/test_openlibrary.py

@@ -1,14 +1,14 @@
+from unittest import skip
+
 import pytest
 
-from vrobbler.apps.books.openlibrary import (
-    lookup_book_from_openlibrary,
-)
+from vrobbler.apps.books.openlibrary import lookup_book_from_openlibrary
 
 
 def test_lookup_modern_book():
     book = lookup_book_from_openlibrary("Matrix", "Lauren Groff")
     assert book.get("title") == "Matrix"
-    assert book.get("openlibrary_id") == "OL47572299M"
+    assert book.get("openlibrary_id") == "OL32170218M"
     assert book.get("ol_author_id") == "OL3675729A"
 
 
@@ -26,3 +26,11 @@ def test_lookup_foreign_book():
     assert book.get("title") == "Ravage"
     assert book.get("openlibrary_id") == "OL8837839M"
     assert book.get("ol_author_id") == "OL152472A"
+
+
+@skip("This is rotten in OL, updated but waiting for it to update")
+def test_lookup_book():
+    book = lookup_book_from_openlibrary("Hark! A Vagrant")
+    assert book.get("title") == "Hark! A Vagrant"
+    assert book.get("openlibrary_id") == "OL8837839M"
+    assert book.get("ol_author_id") == "OL152472A"

部分文件因为文件数量过多而无法显示