瀏覽代碼

Scrape all the things

Colin Powell 2 年之前
父節點
當前提交
31f490a32b

+ 32 - 1
poetry.lock

@@ -140,6 +140,21 @@ test = ["beautifulsoup4 (>=4.8.0)", "coverage (>=4.5.4)", "fixtures (>=3.0.0)",
 toml = ["toml"]
 yaml = ["PyYAML"]
 
+[[package]]
+name = "beautifulsoup4"
+version = "4.11.2"
+description = "Screen-scraping library"
+category = "main"
+optional = false
+python-versions = ">=3.6.0"
+
+[package.dependencies]
+soupsieve = ">1.2"
+
+[package.extras]
+html5lib = ["html5lib"]
+lxml = ["lxml"]
+
 [[package]]
 name = "billiard"
 version = "3.6.4.0"
@@ -1441,6 +1456,14 @@ category = "dev"
 optional = false
 python-versions = "*"
 
+[[package]]
+name = "soupsieve"
+version = "2.4"
+description = "A modern CSS selector implementation for Beautiful Soup."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+
 [[package]]
 name = "sqlalchemy"
 version = "1.4.46"
@@ -1730,7 +1753,7 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools"
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.8"
-content-hash = "80565e921ad29750e4b26536e0d4d45ace1b06dfca679b8b9c80776c44463234"
+content-hash = "3f63bf670e22f6aa464abcfc835ba39e7faa27809cc666b35b6ba1cb35313af6"
 
 [metadata.files]
 aiohttp = [
@@ -1876,6 +1899,10 @@ bandit = [
     {file = "bandit-1.7.4-py3-none-any.whl", hash = "sha256:412d3f259dab4077d0e7f0c11f50f650cc7d10db905d98f6520a95a18049658a"},
     {file = "bandit-1.7.4.tar.gz", hash = "sha256:2d63a8c573417bae338962d4b9b06fbc6080f74ecd955a092849e1e65c717bd2"},
 ]
+beautifulsoup4 = [
+    {file = "beautifulsoup4-4.11.2-py3-none-any.whl", hash = "sha256:0e79446b10b3ecb499c1556f7e228a53e64a2bfcebd455f370d8927cb5b59e39"},
+    {file = "beautifulsoup4-4.11.2.tar.gz", hash = "sha256:bc4bdda6717de5a2987436fb8d72f45dc90dd856bdfd512a1314ce90349a0106"},
+]
 billiard = [
     {file = "billiard-3.6.4.0-py3-none-any.whl", hash = "sha256:87103ea78fa6ab4d5c751c4909bcff74617d985de7fa8b672cf8618afd5a875b"},
     {file = "billiard-3.6.4.0.tar.gz", hash = "sha256:299de5a8da28a783d51b197d496bef4f1595dd023a93a4f59dde1886ae905547"},
@@ -2963,6 +2990,10 @@ sortedcontainers = [
     {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"},
     {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"},
 ]
+soupsieve = [
+    {file = "soupsieve-2.4-py3-none-any.whl", hash = "sha256:49e5368c2cda80ee7e84da9dbe3e110b70a4575f196efb74e51b94549d921955"},
+    {file = "soupsieve-2.4.tar.gz", hash = "sha256:e28dba9ca6c7c00173e34e4ba57448f0688bb681b7c5e8bf4971daafc093d69a"},
+]
 sqlalchemy = [
     {file = "SQLAlchemy-1.4.46-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:7001f16a9a8e06488c3c7154827c48455d1c1507d7228d43e781afbc8ceccf6d"},
     {file = "SQLAlchemy-1.4.46-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:c7a46639ba058d320c9f53a81db38119a74b8a7a1884df44d09fbe807d028aaf"},

+ 1 - 0
pyproject.toml

@@ -37,6 +37,7 @@ django-encrypted-field = "^1.0.5"
 celery = "^5.2.7"
 honcho = "^1.1.0"
 howlongtobeatpy = "^1.0.5"
+beautifulsoup4 = "^4.11.2"
 
 [tool.poetry.dev-dependencies]
 Werkzeug = "2.0.3"

+ 85 - 0
vrobbler/apps/music/allmusic.py

@@ -0,0 +1,85 @@
+import urllib
+from typing import Optional
+from bs4 import BeautifulSoup
+import requests
+import logging
+
+logger = logging.getLogger(__name__)
+
+ALLMUSIC_SEARCH_URL = "https://www.allmusic.com/search/{subpath}/{query}"
+
+
+def strip_and_clean(text):
+    """Return ``text`` with all leading and trailing whitespace removed."""
+    # A bare strip() already removes newlines together with every other kind
+    # of surrounding whitespace, so one call covers both ends.
+    return text.strip()
+
+
+def get_rating_from_soup(soup) -> Optional[int]:
+    """Read the rating out of a parsed page.
+
+    Looks for the first ``div`` carrying the ``allmusic-rating`` class and
+    converts its text to an integer.
+
+    Returns:
+        The rating as an int, or ``None`` when the element is absent or its
+        text is not a valid integer.
+    """
+    try:
+        rating_node = soup.find("div", class_="allmusic-rating")
+        if not rating_node:
+            return None
+        return int(strip_and_clean(rating_node.get_text()))
+    except ValueError:
+        # Non-numeric text: treat it the same as a missing rating.
+        return None
+
+
+def get_review_from_soup(soup) -> str:
+    """Read the review text out of a parsed page.
+
+    Looks for the first ``div`` carrying the ``text`` class and returns its
+    text content with surrounding whitespace removed.
+
+    Returns:
+        The review text, or an empty string when the element is absent.
+    """
+    try:
+        review_node = soup.find("div", class_="text")
+        if review_node:
+            return strip_and_clean(review_node.get_text())
+    except ValueError:
+        # Kept in step with the rating helper: a ValueError means no review.
+        pass
+    return ""
+
+
+def scrape_data_from_allmusic(url) -> dict:
+    """Fetch an AllMusic page and extract its rating and review.
+
+    Args:
+        url: Address of the page to scrape.
+
+    Returns:
+        A dict with a ``"rating"`` key (int or None) and a ``"review"`` key
+        (str). An empty dict is returned when ``url`` is empty, the request
+        fails, or the response status is not 200.
+    """
+    data_dict = {}
+    if not url:
+        # An empty URL would only make requests raise; there is nothing to
+        # scrape, so report "no data" instead.
+        return data_dict
+
+    headers = {"User-Agent": "Vrobbler 0.11.12"}
+    try:
+        # requests never times out by default, so a stalled server would
+        # block the caller indefinitely without an explicit limit.
+        r = requests.get(url, headers=headers, timeout=10)
+    except requests.RequestException as exc:
+        logger.info(f"Could not fetch Allmusic page {url}: {exc}")
+        return data_dict
+
+    if r.status_code == 200:
+        # Name the parser explicitly so the parse result does not depend on
+        # which optional parser libraries happen to be installed.
+        soup = BeautifulSoup(r.text, "html.parser")
+        data_dict["rating"] = get_rating_from_soup(soup)
+        data_dict["review"] = get_review_from_soup(soup)
+    return data_dict
+
+
+def get_allmusic_slug(artist_name=None, album_name=None) -> str:
+    """Search AllMusic and return the slug of the first matching result.
+
+    Args:
+        artist_name: Artist to search for. Required; without it no request
+            is made.
+        album_name: Optional album title. When given, the album search is
+            used instead of the artist search.
+
+    Returns:
+        The last path segment of the first result's link, or an empty
+        string when the lookup fails (no artist name, request error,
+        non-200 response, or no usable result in the page).
+    """
+    # Imported locally because a bare ``import urllib`` does not guarantee
+    # that the ``urllib.parse`` submodule has been loaded.
+    from urllib.parse import quote
+
+    slug = ""
+    if not artist_name:
+        return slug
+
+    subpath = "artists"
+    class_ = "name"
+    query = quote(artist_name)
+    if album_name:
+        subpath = "albums"
+        class_ = "title"
+        query = "+".join([query, quote(album_name)])
+
+    url = ALLMUSIC_SEARCH_URL.format(subpath=subpath, query=query)
+    headers = {"User-Agent": "Vrobbler 0.11.12"}
+    try:
+        # requests never times out by default; bound the wait explicitly.
+        r = requests.get(url, headers=headers, timeout=10)
+    except requests.RequestException as exc:
+        logger.info(f"Could not reach Allmusic for {query}: {exc}")
+        return slug
+
+    if r.status_code != 200:
+        logger.info(f"Bad http response from Allmusic {r}")
+        return slug
+
+    # Explicit parser name keeps parsing independent of installed extras.
+    soup = BeautifulSoup(r.text, "html.parser")
+    results = soup.find("ul", class_="search-results")
+
+    if not results:
+        logger.info(f"No search results for {query}")
+        return slug
+
+    # Only the first matching entry and its first link are of interest.
+    prime_result = results.find("div", class_=class_)
+    link = prime_result.find("a") if prime_result is not None else None
+    result_url = link.get("href") if link is not None else None
+
+    if not result_url:
+        # Bail out here; indexing into an empty result would raise.
+        logger.info(f"Could not find specific result for search {query}")
+        return slug
+
+    # The slug is the final path segment of the result link.
+    return result_url.rstrip("/").split("/")[-1]

+ 49 - 0
vrobbler/apps/music/bandcamp.py

@@ -0,0 +1,49 @@
+import logging
+import urllib
+
+import requests
+from bs4 import BeautifulSoup
+
+logger = logging.getLogger(__name__)
+BANDCAMP_SEARCH_URL = "https://bandcamp.com/search?q={query}&item_type={itype}"
+
+
+def get_bandcamp_slug(artist_name=None, album_name=None) -> str:
+    """Search Bandcamp and return the slug of the first matching result.
+
+    Args:
+        artist_name: Artist to search for. Required; without it no request
+            is made.
+        album_name: Optional album title. When given, an album search
+            (item type ``"a"``) is done instead of an artist search
+            (item type ``"b"``).
+
+    Returns:
+        For an artist search, the first label of the result link's host
+        name. For an album search, the last path segment of the result
+        link. An empty string is returned when the lookup fails (no artist
+        name, request error, non-200 response, or no usable result).
+    """
+    # Imported locally because a bare ``import urllib`` does not guarantee
+    # that the ``urllib.parse`` submodule has been loaded.
+    from urllib.parse import quote
+
+    slug = ""
+    if not artist_name:
+        return slug
+
+    query = quote(artist_name)
+    item_type = "b"
+    class_ = "heading"
+    if album_name:
+        item_type = "a"
+        query = "+".join([query, quote(album_name)])
+
+    url = BANDCAMP_SEARCH_URL.format(query=query, itype=item_type)
+    headers = {"User-Agent": "Vrobbler 0.11.12"}
+    try:
+        # requests never times out by default; bound the wait explicitly.
+        r = requests.get(url, headers=headers, timeout=10)
+    except requests.RequestException as exc:
+        logger.info(f"Could not reach Bandcamp for {query}: {exc}")
+        return slug
+
+    if r.status_code != 200:
+        logger.info(f"Bad http response from Bandcamp {r}")
+        return slug
+
+    # Explicit parser name keeps parsing independent of installed extras.
+    soup = BeautifulSoup(r.text, "html.parser")
+
+    results = soup.find("ul", class_="result-items")
+
+    if not results:
+        logger.info(f"No search results for {query}")
+        return slug
+
+    # Only the first matching entry and its first link are of interest.
+    prime_result = results.find("div", class_=class_)
+    link = prime_result.find("a") if prime_result is not None else None
+    result_url = link.get("href") if link is not None else None
+
+    if not result_url:
+        # Bail out here; indexing into an empty result would raise.
+        logger.info(f"Could not find specific result for search {query}")
+        return slug
+
+    # Ignore any query string attached to the result link.
+    base_url = result_url.split("?")[0]
+    if item_type == "b":
+        # Artist link: the slug is the first label of the host name, which
+        # is how the model's bandcamp_link property rebuilds the URL.
+        parts = base_url.split("/")
+        host = parts[2] if len(parts) > 2 else ""
+        return host.split(".")[0]
+
+    # Album link: keep only the final path segment. The model's album link
+    # property appends this value after ".../album/", so returning the whole
+    # URL would produce a broken link.
+    return base_url.rstrip("/").split("/")[-1]

+ 23 - 0
vrobbler/apps/music/migrations/0018_album_allmusic_rating_album_allmusic_review.py

@@ -0,0 +1,23 @@
+# Generated by Django 4.1.5 on 2023-03-15 01:17
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Add nullable ``allmusic_rating`` and ``allmusic_review`` to ``album``."""
+
+    # Must run after the migration that added the track genre.
+    dependencies = [
+        ("music", "0017_track_genre"),
+    ]
+
+    operations = [
+        # Integer rating; blank/null allowed so existing rows stay valid.
+        migrations.AddField(
+            model_name="album",
+            name="allmusic_rating",
+            field=models.IntegerField(blank=True, null=True),
+        ),
+        # Free-form review text; blank/null allowed.
+        migrations.AddField(
+            model_name="album",
+            name="allmusic_review",
+            field=models.TextField(blank=True, null=True),
+        ),
+    ]

+ 18 - 0
vrobbler/apps/music/migrations/0019_artist_allmusic_id.py

@@ -0,0 +1,18 @@
+# Generated by Django 4.1.5 on 2023-03-15 03:05
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Add the nullable ``allmusic_id`` column to ``artist``."""
+
+    # Must run after the album rating/review migration.
+    dependencies = [
+        ("music", "0018_album_allmusic_rating_album_allmusic_review"),
+    ]
+
+    operations = [
+        # Identifier of at most 100 characters; blank/null allowed.
+        migrations.AddField(
+            model_name="artist",
+            name="allmusic_id",
+            field=models.CharField(blank=True, max_length=100, null=True),
+        ),
+    ]

+ 23 - 0
vrobbler/apps/music/migrations/0020_album_bandcamp_id_artist_bandcamp_id.py

@@ -0,0 +1,23 @@
+# Generated by Django 4.1.5 on 2023-03-15 03:32
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Add the nullable ``bandcamp_id`` column to ``album`` and ``artist``."""
+
+    # Must run after the artist allmusic_id migration.
+    dependencies = [
+        ("music", "0019_artist_allmusic_id"),
+    ]
+
+    operations = [
+        # Identifier of at most 100 characters; blank/null allowed.
+        migrations.AddField(
+            model_name="album",
+            name="bandcamp_id",
+            field=models.CharField(blank=True, max_length=100, null=True),
+        ),
+        # Same column definition on the artist model.
+        migrations.AddField(
+            model_name="artist",
+            name="bandcamp_id",
+            field=models.CharField(blank=True, max_length=100, null=True),
+        ),
+    ]

+ 80 - 1
vrobbler/apps/music/models.py

@@ -14,6 +14,11 @@ from django.utils.translation import gettext_lazy as _
 from django_extensions.db.models import TimeStampedModel
 from scrobbles.mixins import ScrobblableMixin
 from music.theaudiodb import lookup_artist_from_tadb, lookup_album_from_tadb
+from vrobbler.apps.music.allmusic import (
+    get_allmusic_slug,
+    scrape_data_from_allmusic,
+)
+from vrobbler.apps.music.bandcamp import get_bandcamp_slug
 
 logger = logging.getLogger(__name__)
 BNULL = {"blank": True, "null": True}
@@ -26,6 +31,8 @@ class Artist(TimeStampedModel):
     theaudiodb_genre = models.CharField(max_length=255, **BNULL)
     theaudiodb_mood = models.CharField(max_length=255, **BNULL)
     musicbrainz_id = models.CharField(max_length=255, **BNULL)
+    allmusic_id = models.CharField(max_length=100, **BNULL)
+    bandcamp_id = models.CharField(max_length=100, **BNULL)
     thumbnail = models.ImageField(upload_to="artist/", **BNULL)
 
     class Meta:
@@ -38,6 +45,18 @@ class Artist(TimeStampedModel):
     def mb_link(self):
         return f"https://musicbrainz.org/artist/{self.musicbrainz_id}"
 
+    @property
+    def allmusic_link(self):
+        """Return this artist's AllMusic URL, or an empty string without an id."""
+        slug = self.allmusic_id
+        return f"https://www.allmusic.com/artist/{slug}" if slug else ""
+
+    @property
+    def bandcamp_link(self):
+        """Return this artist's Bandcamp URL, or an empty string without an id."""
+        slug = self.bandcamp_id
+        return f"https://{slug}.bandcamp.com/" if slug else ""
+
     def get_absolute_url(self):
         return reverse("music:artist_detail", kwargs={"slug": self.uuid})
 
@@ -61,6 +80,24 @@ class Artist(TimeStampedModel):
 
         return ChartRecord.objects.filter(track__artist=self).order_by("-year")
 
+    def scrape_allmusic(self, force=False) -> None:
+        """Look up this artist's AllMusic slug and store it.
+
+        Args:
+            force: When true, search again even if ``allmusic_id`` is
+                already set.
+        """
+        if self.allmusic_id and not force:
+            return
+
+        slug = get_allmusic_slug(self.name)
+        if not slug:
+            logger.info(f"No allmusic link for {self}")
+            return
+        self.allmusic_id = slug
+        # Limit the write to the one column that changed.
+        self.save(update_fields=["allmusic_id"])
+
+    def scrape_bandcamp(self, force=False) -> None:
+        """Look up this artist's Bandcamp slug and store it.
+
+        Args:
+            force: When true, search again even if ``bandcamp_id`` is
+                already set.
+        """
+        if self.bandcamp_id and not force:
+            return
+
+        found = get_bandcamp_slug(self.name)
+        if found:
+            self.bandcamp_id = found
+            # Limit the write to the one column that changed.
+            self.save(update_fields=["bandcamp_id"])
+        else:
+            logger.info(f"No bandcamp link for {self}")
+
     def fix_metadata(self):
         tadb_info = lookup_artist_from_tadb(self.name)
         if not tadb_info:
@@ -109,6 +146,9 @@ class Album(TimeStampedModel):
     theaudiodb_speed = models.CharField(max_length=255, **BNULL)
     theaudiodb_theme = models.CharField(max_length=255, **BNULL)
     allmusic_id = models.CharField(max_length=255, **BNULL)
+    allmusic_rating = models.IntegerField(**BNULL)
+    allmusic_review = models.TextField(**BNULL)
+    bandcamp_id = models.CharField(max_length=100, **BNULL)
     rateyourmusic_id = models.CharField(max_length=255, **BNULL)
     wikipedia_slug = models.CharField(max_length=255, **BNULL)
     discogs_id = models.CharField(max_length=255, **BNULL)
@@ -139,6 +179,29 @@ class Album(TimeStampedModel):
     def primary_artist(self):
         return self.artists.first()
 
+    def scrape_allmusic(self, force=False) -> None:
+        """Find this album on AllMusic and store its slug, rating and review.
+
+        The slug is looked up only when ``allmusic_id`` is empty or
+        ``force`` is true; the rating and review are then scraped from the
+        album's AllMusic page.
+
+        Args:
+            force: When true, search for the slug again even if
+                ``allmusic_id`` is already set.
+        """
+        # Resolve once; the property is read several times below.
+        artist = self.primary_artist
+
+        if not self.allmusic_id or force:
+            if not artist:
+                # The search needs an artist name; without one there is
+                # nothing to look up.
+                logger.info(f"No primary artist for {self}, skipping allmusic")
+                return
+            # get_allmusic_slug takes the artist name first, then the album.
+            slug = get_allmusic_slug(artist.name, self.name)
+            if not slug:
+                logger.info(f"No allmusic link for {self} by {artist}")
+                return
+            self.allmusic_id = slug
+            self.save(update_fields=["allmusic_id"])
+
+        allmusic_data = scrape_data_from_allmusic(self.allmusic_link)
+
+        if not allmusic_data:
+            logger.info(f"No allmusic data for {self} by {artist}")
+            return
+
+        self.allmusic_review = allmusic_data["review"]
+        self.allmusic_rating = allmusic_data["rating"]
+        self.save(update_fields=["allmusic_review", "allmusic_rating"])
+
     def scrape_theaudiodb(self) -> None:
         artist = "Various Artists"
         if self.primary_artist:
@@ -150,6 +213,15 @@ class Album(TimeStampedModel):
 
         Album.objects.filter(pk=self.pk).update(**album_data)
 
+    def scrape_bandcamp(self, force=False) -> None:
+        """Look up this album's Bandcamp slug and store it.
+
+        Args:
+            force: When true, search again even if ``bandcamp_id`` is
+                already set.
+        """
+        if self.bandcamp_id and not force:
+            return
+
+        artist = self.primary_artist
+        if not artist:
+            # The search needs an artist name; an album without a primary
+            # artist cannot be looked up.
+            logger.info(f"No primary artist for {self}, skipping bandcamp")
+            return
+
+        slug = get_bandcamp_slug(artist.name, self.name)
+        if not slug:
+            logger.info(f"No bandcamp link for {self}")
+            return
+        self.bandcamp_id = slug
+        self.save(update_fields=["bandcamp_id"])
+
     def fix_metadata(self):
         if (
             not self.musicbrainz_albumartist_id
@@ -198,6 +270,7 @@ class Album(TimeStampedModel):
             ):
                 self.fetch_artwork()
         self.scrape_theaudiodb()
+        self.scrape_allmusic()
 
     def fetch_artwork(self, force=False):
         if not self.cover_image and not force:
@@ -242,7 +315,7 @@ class Album(TimeStampedModel):
     @property
     def allmusic_link(self) -> str:
+        """Return this album's AllMusic URL, or an empty string without an id."""
         if self.allmusic_id:
-            return f"https://www.allmusic.com/artist/{self.allmusic_id}"
+            return f"https://www.allmusic.com/album/{self.allmusic_id}"
         return ""
 
     @property
@@ -263,6 +336,12 @@ class Album(TimeStampedModel):
         album_slug = self.name.lower().replace(" ", "-")
         return f"https://rateyourmusic.com/release/album/{artist_slug}/{album_slug}/"
 
+    @property
+    def bandcamp_link(self):
+        """Return this album's Bandcamp URL, or an empty string if unknown.
+
+        The URL needs both the album's own ``bandcamp_id`` and the primary
+        artist's ``bandcamp_id``; if either is missing, or the album has no
+        primary artist, an empty string is returned.
+        """
+        if not self.bandcamp_id:
+            return ""
+        # Resolve once and tolerate albums that have no primary artist.
+        artist = self.primary_artist
+        if not artist or not artist.bandcamp_id:
+            return ""
+        return f"https://{artist.bandcamp_id}.bandcamp.com/album/{self.bandcamp_id}"
+
     @property
     def bandcamp_search_link(self):
         artist = self.primary_artist.name.lower()

+ 5 - 2
vrobbler/templates/music/album_detail.html

@@ -16,12 +16,15 @@
     <div style="float:left; width:600px; margin-left:10px; ">
         {% if object.theaudiodb_description %}
         <p>{{object.theaudiodb_description|safe|linebreaks|truncatewords:160}}</p>
-        <hr/>
         {% endif %}
+        {% if object.allmusic_review%}
+        {# Review text is scraped from a third-party page: keep autoescaping on, and truncate HTML-aware so tags from linebreaks stay closed. #}
+        <p>{{object.allmusic_review|linebreaks|truncatewords_html:160}}</p>
+        {% endif %}
+        <hr/>
         <p style="float:right;">
             <a href="{{album.mb_link}}"><img src="{% static "images/musicbrainz-logo.png" %}" width=35></a>
             <a href="{{album.rym_link}}"><img src="{% static "images/rateyourmusic-logo.jpg" %}" width=35></a>
-            <a href="{{album.bandcamp_search_link}}"><img src="{% static "images/bandcamp-logo.png" %}" width=35></a>
+            {% if album.bandcamp_link %}<a href="{{album.bandcamp_link}}"><img src="{% static "images/bandcamp-logo.png" %}" width=35></a>{% endif %}
             {% if album.tadb_link %}<a href="{{album.tadb_link}}"><img src="{% static "images/theaudiodb-logo.png" %}" width=35></a>{% endif %}
             {% if album.allmusic_link %}<a href="{{album.allmusic_link}}"><img src="{% static "images/allmusic-logo.png" %}" width=35></a>{% endif %}
         </p>

+ 2 - 1
vrobbler/templates/music/artist_detail.html

@@ -26,7 +26,8 @@
         <p style="float:right;">
             <a href="{{artist.mb_link}}"><img src="{% static "images/musicbrainz-logo.png" %}" width=35></a>
             <a href="{{artist.rym_link}}"><img src="{% static "images/rateyourmusic-logo.jpg" %}" width=35></a>
-            <a href="{{artist.bandcamp_search_link}}"><img src="{% static "images/bandcamp-logo.png" %}" width=35></a>
+            {% if artist.bandcamp_link %}<a href="{{artist.bandcamp_link}}"><img src="{% static "images/bandcamp-logo.png" %}" width=35></a>{% endif %}
+            {% if artist.allmusic_link %}<a href="{{artist.allmusic_link}}"><img src="{% static "images/allmusic-logo.png" %}" width=35></a>{% endif %}
         </p>
     </div>
 </div>