Просмотр исходного кода

[podcasts] Add parsing of RSS feed urls

Colin Powell 16 часов назад
Родитель
Commit
e7203cdb9b
4 измененных файлов с 119 добавлено и 18 удалено
  1. 21 1
      PROJECT.org
  2. 27 1
      poetry.lock
  3. 1 0
      pyproject.toml
  4. 70 16
      vrobbler/apps/podcasts/utils.py

+ 21 - 1
PROJECT.org

@@ -92,7 +92,7 @@ fetching and simple saving.
 :LOGBOOK:
 CLOCK: [2025-07-09 Wed 09:55]--[2025-07-09 Wed 10:15] =>  0:20
 :END:
-* Backlog [0/24]
+* Backlog [1/28]
 ** TODO [#C] Create small utility to clean up tracks scrobbled with wonky playback times :vrobbler:personal:bug:music:scrobbles:
 ** TODO [#C] Move to using more robust mopidy-webhooks pacakge form pypi :utility:improvement:
 :PROPERTIES:
@@ -491,6 +491,26 @@ https://life.lab.unbl.ink/scrobble/e39779c8-62a5-46a6-bdef-fb7662810dc6/start/
 - Note taken on [2025-09-30 Tue 09:33]
 
   This may have already been resolved ... need to just confirm it.
+** TODO
+
+** DONE [#A] Add RSS feed lookups to podcasts :vrobbler:personal:feature:podcasts:
+:PROPERTIES:
+:ID:       d60645b0-7578-97c1-0278-05bd9de4269c
+:END:
+
+- Note taken on [2025-10-14 Tue 10:08]
+
+  Turns out the Podcast plugin for mopidy does a pretty good job of showing the
+  latest file without having to scroll to the bottom using only Muse to not parse
+  the podcast title name. BUT, now we're getting urls like this:
+
+  https://nsf.libsyn.com/rss#77e01251-cb20-4609-b577-d48e985d2e7b
+
+  This is great, because there's more context there, but it has to read out of
+  the RSS feed. We should add a check in the podcast util to sniff out the file
+  referenced in the # in that url and populate the info from there. This should
+  actually be much more reliable than the current state of the podcast lookup
+  which depends on the file being named properly.
 
 * Version 26.0 [3/3]
 ** DONE Clean up templates for scrobble details :vrobbler:personal:bug:templates:

+ 27 - 1
poetry.lock

@@ -1602,6 +1602,21 @@ files = [
 [package.extras]
 devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"]
 
+[[package]]
+name = "feedparser"
+version = "6.0.12"
+description = "Universal feed parser, handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds"
+optional = false
+python-versions = ">=3.6"
+groups = ["main"]
+files = [
+    {file = "feedparser-6.0.12-py3-none-any.whl", hash = "sha256:6bbff10f5a52662c00a2e3f86a38928c37c48f77b3c511aedcd51de933549324"},
+    {file = "feedparser-6.0.12.tar.gz", hash = "sha256:64f76ce90ae3e8ef5d1ede0f8d3b50ce26bcce71dd8ae5e82b1cd2d4a5f94228"},
+]
+
+[package.dependencies]
+sgmllib3k = "*"
+
 [[package]]
 name = "filelock"
 version = "3.18.0"
@@ -4403,6 +4418,17 @@ enabler = ["pytest-enabler (>=2.2)"]
 test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"]
 type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"]
 
+[[package]]
+name = "sgmllib3k"
+version = "1.0.0"
+description = "Py3k port of sgmllib."
+optional = false
+python-versions = "*"
+groups = ["main"]
+files = [
+    {file = "sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9"},
+]
+
 [[package]]
 name = "shellingham"
 version = "1.5.4"
@@ -5499,4 +5525,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.9,<3.12"
-content-hash = "3a483aefea0a3afebf187b17b7df72a158788024ca8121b512b39567fb5ec8ca"
+content-hash = "cd3b566597e09aa444f9af30f95f94f922bf3dca71fbd05c887fb10cbc11d7bf"

+ 1 - 0
pyproject.toml

@@ -56,6 +56,7 @@ poetry-bumpversion = "^0.3.3"
 orgparse = "^0.4.20250520"
 tmdbv3api = "^1.9.0"
 themoviedb = "^1.0.2"
+feedparser = "^6.0.12"
 
 [tool.poetry.group.test]
 optional = true

+ 70 - 16
vrobbler/apps/podcasts/utils.py

@@ -1,7 +1,9 @@
 import logging
 import os
+from typing import Any
 from urllib.parse import unquote
 
+import feedparser
 from dateutil.parser import ParserError, parse
 from podcasts.models import PodcastEpisode
 
@@ -10,26 +12,80 @@ logger = logging.getLogger(__name__)
 # TODO This should be configurable in settings or per deploy
 PODCAST_DATE_FORMAT = "YYYY-MM-DD"
 
def parse_duration(d):
    """Convert an RSS/iTunes duration string to a number of seconds.

    Accepts either a plain number of seconds ("1830") or a colon-separated
    clock value ("30:30", "1:30:30"). Returns None for empty/missing or
    unparseable input instead of raising, since feed data is untrusted
    (e.g. some feeds publish values like "30 min").
    """
    if not d:
        return None
    d = d.strip()
    if d.isdigit():
        return int(d)
    try:
        parts = [int(p) for p in d.split(":")]
    except ValueError:
        # Non-numeric segment (malformed feed value) — treat as unknown.
        return None
    if len(parts) > 3:
        # More than H:M:S segments — malformed, treat as unknown.
        return None
    while len(parts) < 3:
        parts.insert(0, 0)
    h, m, s = parts
    return h * 3600 + m * 60 + s
+
def fetch_metadata_from_rss(uri: str) -> dict[str, Any]:
    """Populate podcast/episode metadata from an RSS feed URL.

    Expects a Mopidy-Podcast style URI of the form
    ``<feed_url>#<episode_guid>`` (e.g. https://nsf.libsyn.com/rss#<guid>).
    Returns a dict with podcast-level fields and, when an entry whose guid
    matches the fragment is found, episode-level fields as well. Returns an
    empty dict when the URI carries no ``#<guid>`` fragment.
    """
    log_context = {"mopidy_uri": uri, "media_type": "Podcast"}

    # Validate the fragment BEFORE fetching the feed — the original fetched
    # first and only then hit an IndexError on uri.split("#")[1].
    feed_url, _, target_guid = uri.partition("#")
    if not target_guid:
        logger.warning(
            "Tried to parse uri as RSS feed, but no target found",
            extra=log_context,
        )
        return {}

    feed = feedparser.parse(feed_url)

    podcast_data: dict[str, Any] = {
        "podcast_name": feed.feed.get("title", "Unknown Podcast"),
        "podcast_description": feed.feed.get("description", ""),
        "podcast_link": feed.feed.get("link", ""),
    }

    for entry in feed.entries:
        # BUG FIX: the original tested `target_guid in target_guid`, which is
        # always true and so returned the first entry regardless of the
        # fragment; match against the entry's own guid instead.
        if target_guid in entry.get("guid", ""):
            logger.info("🎧 Episode found in RSS feed", extra=log_context)
            podcast_data["episode_name"] = entry.title
            # NOTE(review): the guid is stored as episode_num here — confirm
            # downstream consumers expect a guid rather than an integer.
            podcast_data["episode_num"] = entry.guid
            podcast_data["episode_pub_date"] = entry.get("published", None)
            podcast_data["episode_description"] = entry.get("description", None)
            podcast_data["episode_url"] = (
                entry.enclosures[0].href if entry.get("enclosures") else None
            )
            podcast_data["episode_runtime_seconds"] = parse_duration(
                entry.get("itunes_duration", None)
            )
            return podcast_data

    logger.info("Episode not found in RSS feed.", extra=log_context)
    # BUG FIX: the original fell off the end of the for/else and implicitly
    # returned None, breaking callers that expect a dict; return the
    # podcast-level data we did manage to gather.
    return podcast_data
+
+
+def parse_mopidy_uri(uri: str) -> dict[str, Any]:
+    """Extract podcast/episode metadata from a Mopidy track URI.
+
+    HTTPS URIs are treated as RSS feed references and delegated to
+    fetch_metadata_from_rss(); anything else is parsed as a local file
+    path of the form .../<podcast name>/<episode file>.
+    """
+    podcast_data: dict[str, Any] = {}
 
-def parse_mopidy_uri(uri: str) -> dict:
     logger.debug(f"Parsing URI: {uri}")
+    if "https://" in uri:
+        # NOTE(review): fetch_metadata_from_rss can implicitly return None
+        # when the episode is not found in the feed — confirm callers of
+        # parse_mopidy_uri tolerate a None result here.
+        return fetch_metadata_from_rss(uri)
+
+
     parsed_uri = os.path.splitext(unquote(uri))[0].split("/")
 
+    # Seed the result with what the path alone tells us; pub_date,
+    # episode_num and episode_name are refined below.
+    podcast_data = {
+        "episode_filename": parsed_uri[-1],
+        "episode_num": None,
+        "podcast_name": parsed_uri[-2].strip(),
+        "pub_date": None,
+    }
+
+
     episode_str = parsed_uri[-1]
-    podcast_name = parsed_uri[-2].strip()
     episode_num = None
+    # NOTE(review): the local episode_num above is never read after the
+    # refactor to podcast_data["episode_num"] — dead variable.
     episode_num_pad = 0
 
     try:
         # Without episode numbers the date will lead
-        pub_date = parse(episode_str[0:10])
+        podcast_data["pub_date"] = parse(episode_str[0:10])
     except ParserError:
-        episode_num = int(episode_str.split("-")[0])
-        episode_num_pad = len(str(episode_num)) + 1
+        podcast_data["episode_num"] = int(episode_str.split("-")[0])
+        episode_num_pad = len(str(podcast_data["episode_num"])) + 1
 
         try:
             # Because episode numbers lead the filename, skip past them
-            pub_date = parse(
+            podcast_data["pub_date"] = parse(
                 episode_str[
                     episode_num_pad : len(PODCAST_DATE_FORMAT)
                     + episode_num_pad
@@ -39,22 +95,19 @@ def parse_mopidy_uri(uri: str) -> dict:
+            # NOTE(review): dead assignment — after the refactor the date
+            # lives in podcast_data["pub_date"], so this local is never
+            # read; the returned pub_date is now None here instead of the
+            # "" the pre-refactor code returned. Should probably be
+            # podcast_data["pub_date"] = "".
             pub_date = ""
 
     gap_to_strip = 0
-    if pub_date:
+    if podcast_data["pub_date"]:
         gap_to_strip += len(PODCAST_DATE_FORMAT)
-    if episode_num:
+    if podcast_data["episode_num"]:
         gap_to_strip += episode_num_pad
 
-    episode_name = episode_str[gap_to_strip:].replace("-", " ").strip()
+    podcast_data["episode_name"] = episode_str[gap_to_strip:].replace("-", " ").strip()
 
-    return {
-        "episode_filename": episode_name,
-        "episode_num": episode_num,
-        "podcast_name": podcast_name,
-        "pub_date": pub_date,
-    }
+    return podcast_data
 
 
 def get_or_create_podcast(post_data: dict) -> PodcastEpisode:
+    logger.info("Looking up podcast", extra={"post_data": post_data, "media_type": "Podcast"})
+
     mopidy_uri = post_data.get("mopidy_uri", "")
     parsed_data = parse_mopidy_uri(mopidy_uri)
 
@@ -66,9 +119,10 @@ def get_or_create_podcast(post_data: dict) -> PodcastEpisode:
     podcast_dict = {"name": podcast_name}
 
     episode_name = parsed_data.get("episode_filename")
+    run_time_seconds = parsed_data.get("episode_runtime_seconds", post_data.get("run_time", 2700))
     episode_dict = {
         "title": episode_name,
-        "run_time_seconds": post_data.get("run_time"),
+        "run_time_seconds": run_time_seconds,
         "number": parsed_data.get("episode_num"),
         "pub_date": parsed_data.get("pub_date"),
         "mopidy_uri": mopidy_uri,