bandcamp-dl/bandcamp_dl/bandcamp.py


from datetime import datetime as dt
import json
import logging

import requests
from bs4 import BeautifulSoup
from bs4 import FeatureNotFound

from bandcamp_dl.bandcampjson import BandcampJSON
from bandcamp_dl.__init__ import __version__


class Bandcamp:
    def __init__(self):
        self.headers = {'User-Agent': f'bandcamp-dl/{__version__} (https://github.com/iheanyi/bandcamp-dl)'}
        self.soup = None
        self.tracks = None

    def parse(self, url: str, art: bool = True, lyrics: bool = False,
              debugging: bool = False) -> dict or None:
        """Requests the page, cherry-picks album info

        :param url: album/track url
        :param art: if True download album art
        :param lyrics: if True fetch track lyrics
        :param debugging: if True then verbose output
        :return: album metadata
        """
        if debugging:
            logging.basicConfig(level=logging.DEBUG)

        try:
            response = requests.get(url, headers=self.headers)
        except requests.exceptions.MissingSchema:
            return None

        try:
            self.soup = BeautifulSoup(response.text, "lxml")
        except FeatureNotFound:
            self.soup = BeautifulSoup(response.text, "html.parser")
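
        # BandcampJSON (bandcampjson.py) collects the JSON blobs embedded in the
        # page source; each blob is parsed and merged into one page_json dict.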
logging.debug(" Generating BandcampJSON..")
bandcamp_json = BandcampJSON(self.soup, debugging).generate()
page_json = {}
for entry in bandcamp_json:
page_json = {**page_json, **json.loads(entry)}
logging.debug(" BandcampJSON generated..")
logging.debug(" Generating Album..")
self.tracks = page_json['trackinfo']
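
        # The release date is not always present at the top level of the page
        # JSON, so fall back through the other embedded structures until set.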
        album_release = page_json['album_release_date']
        if album_release is None:
            album_release = page_json['current']['release_date']
        if album_release is None:
            album_release = page_json['embed_info']['item_public']

        try:
            album_title = page_json['current']['title']
        except KeyError:
            album_title = page_json['trackinfo'][0]['title']

        try:
            label = page_json['item_sellers'][f'{page_json["current"]["selling_band_id"]}']['name']
        except KeyError:
            label = None

        album = {
            "tracks": [],
            "title": album_title,
            "artist": page_json['artist'],
            "label": label,
            "full": False,
"art": "",
"date": str(dt.strptime(album_release, "%d %b %Y %H:%M:%S GMT").year),
"url": url
}
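
        # Derive the artist's base URL by stripping the /track/ or /album/
        # suffix; it is reused below to build per-track pages for lyric fetching.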
if "track" in page_json['url']:
artist_url = page_json['url'].rpartition('/track/')[0]
else:
artist_url = page_json['url'].rpartition('/album/')[0]
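
        # Attach lyrics if requested, and swap in the trimmed metadata dict
        # whenever a streamable file exists; otherwise keep the raw track entry.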
        for track in self.tracks:
            if lyrics:
                track['lyrics'] = self.get_track_lyrics(f"{artist_url}{track['title_link']}#lyrics")
            if track['file'] is not None:
                track = self.get_track_metadata(track)
            album['tracks'].append(track)

        album['full'] = self.all_tracks_available()
        if art:
            album['art'] = self.get_album_art()

        logging.debug(" Album generated..")
        logging.debug(f" Album URL: {album['url']}")

        return album

    def get_track_lyrics(self, track_url):
        logging.debug(" Fetching track lyrics..")
        track_page = requests.get(track_url, headers=self.headers)
        try:
            track_soup = BeautifulSoup(track_page.text, "lxml")
        except FeatureNotFound:
            track_soup = BeautifulSoup(track_page.text, "html.parser")
        track_lyrics = track_soup.find("div", {"class": "lyricsText"})
        if track_lyrics:
            logging.debug(" Lyrics retrieved..")
            return track_lyrics.text
        else:
            logging.debug(" Lyrics not found..")
            return "lyrics unavailable"

    def all_tracks_available(self) -> bool:
        """Verify that all tracks have a url

        :return: True if all urls accounted for
        """
        for track in self.tracks:
            if track['file'] is None:
                return False
        return True

    @staticmethod
    def get_track_metadata(track: dict or None) -> dict:
        """Extract individual track metadata

        :param track: track dict
        :return: track metadata dict
        """
        logging.debug(" Generating track metadata..")
        track_metadata = {
            "duration": track['duration'],
            "track": str(track['track_num']),
            "title": track['title'],
            "url": None
        }
        if 'mp3-128' in track['file']:
            if 'https' in track['file']['mp3-128']:
                track_metadata['url'] = track['file']['mp3-128']
            else:
                track_metadata['url'] = "http:" + track['file']['mp3-128']
        else:
            track_metadata['url'] = None

        if track['has_lyrics'] is not False:
            if track['lyrics'] is None:
                track['lyrics'] = "lyrics unavailable"
            track_metadata['lyrics'] = track['lyrics'].replace('\\r\\n', '\n')

        logging.debug(" Track metadata generated..")
        return track_metadata

    @staticmethod
    def generate_album_url(artist: str, slug: str, page_type: str) -> str:
        """Generate an album url based on the artist and album name

        :param artist: artist name
        :param slug: Slug of album/track
        :param page_type: Type of page album/track
        :return: url as str
        """
        return f"http://{artist}.bandcamp.com/{page_type}/{slug}"

    def get_album_art(self) -> str:
        """Find and retrieve album art url from page

        :return: url as str
        """
        try:
            url = self.soup.find(id='tralbumArt').find_all('a')[0]['href']
            return url
        except (AttributeError, IndexError, KeyError):
            # No art element on the page (or an unexpected layout); return an
            # empty string so the annotation and the album['art'] default hold.
            return ""
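

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the original module):
# shows how a caller typically drives Bandcamp.parse(). The album URL below is
# a placeholder, and running this requires network access to Bandcamp.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    bandcamp = Bandcamp()
    album = bandcamp.parse("https://someartist.bandcamp.com/album/some-album",
                           art=True, lyrics=False, debugging=True)
    if album is None:
        print("Could not parse the given URL.")
    else:
        print(f"{album['artist']} - {album['title']} ({album['date']})")
        for track in album['tracks']:
            # Tracks with a stream URL went through get_track_metadata() and use
            # the 'track' key; tracks without one keep the raw 'track_num' key.
            number = track.get('track', track.get('track_num'))
            print(f"  {number}. {track['title']}")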