Jsdatafix (#167)

* bandcampjson simplify

* one merged json

* change script identifier

* version bump

* rm extract_data

* go back to json data as list

* tralbum fix

* fix title path

* logging fix
master
Piotr Patrzyk 2020-09-27 17:05:04 +02:00 committed by GitHub
parent 4f8c28e3b1
commit 3d3a524af2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 32 additions and 39 deletions

View File

@ -38,39 +38,39 @@ class Bandcamp:
logging.debug(" Generating BandcampJSON..")
bandcamp_json = BandcampJSON(self.soup, debugging).generate()
album_json = json.loads(bandcamp_json[0])
embed_json = json.loads(bandcamp_json[1])
page_json = json.loads(bandcamp_json[2])
page_json = {}
for entry in bandcamp_json:
page_json = {**page_json, **json.loads(entry)}
logging.debug(" BandcampJSON generated..")
logging.debug(" Generating Album..")
self.tracks = album_json['trackinfo']
self.tracks = page_json['trackinfo']
album_release = album_json['album_release_date']
album_release = page_json['album_release_date']
if album_release is None:
album_release = album_json['current']['release_date']
album_release = page_json['current']['release_date']
try:
album_title = embed_json['album_title']
album_title = page_json['current']['title']
except KeyError:
album_title = album_json['trackinfo'][0]['title']
album_title = page_json['trackinfo'][0]['title']
try:
label = page_json['item_sellers']['{}'.format(album_json['current']['selling_band_id'])]['name']
label = page_json['item_sellers']['{}'.format(page_json['current']['selling_band_id'])]['name']
except KeyError:
label = None
album = {
"tracks": [],
"title": album_title,
"artist": embed_json['artist'],
"artist": page_json['artist'],
"label": label,
"full": False,
"art": "",
"date": str(dt.strptime(album_release, "%d %b %Y %H:%M:%S GMT").year)
}
artist_url = album_json['url'].rpartition('/album/')[0]
artist_url = page_json['url'].rpartition('/album/')[0]
for track in self.tracks:
if lyrics:
track['lyrics'] = self.get_track_lyrics("{}{}#lyrics".format(artist_url, track['title_link']))

View File

@ -7,48 +7,41 @@ import demjson
class BandcampJSON:
    """Collect the JSON payloads embedded in a Bandcamp page.

    ``generate()`` returns a list of JSON strings: the ``data-blob`` of the
    ``pagedata`` div first, followed by the embedded script payloads after
    they have been normalised by ``js_to_json``.
    """

    def __init__(self, body, debugging: bool = False):
        """Store the parsed page and optionally enable debug logging.

        :param body: Parsed page object exposing ``find`` and ``find_all``
            (the caller passes its soup object).
        :param debugging: When true, configure logging at DEBUG level.
        """
        self.body = body
        self.json_data = []

        if debugging:
            logging.basicConfig(level=logging.DEBUG)

    def generate(self) -> list:
        """Grabbing needed data from the page

        :return: The accumulated list of JSON strings.
        """
        self.get_pagedata()
        self.get_js()
        return self.json_data

    def get_pagedata(self):
        """Grab bandcamp pagedata JSON"""
        logging.debug(" Grab pagedata JSON..")
        pagedata = self.body.find('div', {'id': 'pagedata'})['data-blob']
        # Add pagedata to the list of JSON strings
        self.json_data.append(pagedata)

    def get_js(self):
        """Collect the embedded script payloads and append them as JSON.

        Sources, in order: the text of the ``application/json+ld`` script
        tag (when present and non-empty), then the ``data-tralbum``
        attribute of every ``<script>`` element that carries one.
        """
        logging.debug(" Grabbing embedded scripts..")
        embedded_scripts_raw = []

        # NOTE(review): the registered JSON-LD media type is
        # "application/ld+json"; confirm the page really uses this spelling.
        linked_data = self.body.find("script", {"type": "application/json+ld"})
        if linked_data is not None and linked_data.string:
            embedded_scripts_raw.append(linked_data.string)
        else:
            # A missing tag used to surface as an AttributeError on None.
            logging.debug(" Linked-data script not found, skipping..")

        for script in self.body.find_all('script'):
            try:
                album_info = script['data-tralbum']
            except KeyError:
                # Most <script> tags simply do not carry the attribute.
                continue
            embedded_scripts_raw.append(album_info)

        for script in embedded_scripts_raw:
            js_data = self.js_to_json(script)
            self.json_data.append(js_data)

    def js_to_json(self, js_data):
        """Convert JavaScript dictionary to JSON

        :param js_data: Raw JS/JSON text to normalise.
        :return: The re-encoded JSON string.
        """
        logging.debug(" Converting JS to JSON..")
        # Decode with demjson first to reformat keys and lists
        decoded_js = demjson.decode(js_data)
        # Encode to make valid JSON
        return demjson.encode(decoded_js)

View File

@ -3,7 +3,7 @@ from codecs import open
from os import path
import sys
appversion = "0.0.8-12"
appversion = "0.0.9-01"
here = path.abspath(path.dirname(__file__))