Jsdatafix (#167)
* bandcampjson simplify * one merged json * change script identifier * version bump * rm extract_data * go back to json data as list * tralbum fix * fix title path * logging fix
master
parent
4f8c28e3b1
commit
3d3a524af2
|
@ -38,39 +38,39 @@ class Bandcamp:
|
|||
|
||||
logging.debug(" Generating BandcampJSON..")
|
||||
bandcamp_json = BandcampJSON(self.soup, debugging).generate()
|
||||
album_json = json.loads(bandcamp_json[0])
|
||||
embed_json = json.loads(bandcamp_json[1])
|
||||
page_json = json.loads(bandcamp_json[2])
|
||||
page_json = {}
|
||||
for entry in bandcamp_json:
|
||||
page_json = {**page_json, **json.loads(entry)}
|
||||
logging.debug(" BandcampJSON generated..")
|
||||
|
||||
logging.debug(" Generating Album..")
|
||||
self.tracks = album_json['trackinfo']
|
||||
self.tracks = page_json['trackinfo']
|
||||
|
||||
album_release = album_json['album_release_date']
|
||||
album_release = page_json['album_release_date']
|
||||
if album_release is None:
|
||||
album_release = album_json['current']['release_date']
|
||||
album_release = page_json['current']['release_date']
|
||||
|
||||
try:
|
||||
album_title = embed_json['album_title']
|
||||
album_title = page_json['current']['title']
|
||||
except KeyError:
|
||||
album_title = album_json['trackinfo'][0]['title']
|
||||
album_title = page_json['trackinfo'][0]['title']
|
||||
|
||||
try:
|
||||
label = page_json['item_sellers']['{}'.format(album_json['current']['selling_band_id'])]['name']
|
||||
label = page_json['item_sellers']['{}'.format(page_json['current']['selling_band_id'])]['name']
|
||||
except KeyError:
|
||||
label = None
|
||||
|
||||
album = {
|
||||
"tracks": [],
|
||||
"title": album_title,
|
||||
"artist": embed_json['artist'],
|
||||
"artist": page_json['artist'],
|
||||
"label": label,
|
||||
"full": False,
|
||||
"art": "",
|
||||
"date": str(dt.strptime(album_release, "%d %b %Y %H:%M:%S GMT").year)
|
||||
}
|
||||
|
||||
artist_url = album_json['url'].rpartition('/album/')[0]
|
||||
artist_url = page_json['url'].rpartition('/album/')[0]
|
||||
for track in self.tracks:
|
||||
if lyrics:
|
||||
track['lyrics'] = self.get_track_lyrics("{}{}#lyrics".format(artist_url, track['title_link']))
|
||||
|
|
|
@ -7,48 +7,41 @@ import demjson
|
|||
class BandcampJSON:
|
||||
def __init__(self, body, debugging: bool=False):
|
||||
self.body = body
|
||||
self.targets = ['TralbumData', 'EmbedData', 'pagedata']
|
||||
self.json_data = []
|
||||
|
||||
if debugging:
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
def generate(self) -> list:
|
||||
"""Iterate through targets grabbing needed data"""
|
||||
for target in self.targets:
|
||||
if target[:4] == 'page':
|
||||
self.get_pagedata()
|
||||
else:
|
||||
logging.debug(" Grabbing target data..")
|
||||
self.regex = re.compile(r"(?<=var\s" + target + "\s=\s).*?(?=};)", re.DOTALL)
|
||||
self.target = target
|
||||
self.js_to_json()
|
||||
def generate(self):
|
||||
"""Grabbing needed data from the page"""
|
||||
self.get_pagedata()
|
||||
self.get_js()
|
||||
return self.json_data
|
||||
|
||||
def get_pagedata(self):
|
||||
"""Grab bandcamp pagedata JSON"""
|
||||
logging.debug(" Grab pagedata JSON..")
|
||||
pagedata = self.body.find('div', {'id': 'pagedata'})['data-blob']
|
||||
# Add pagedata to the list of JSON strings
|
||||
self.json_data.append(pagedata)
|
||||
|
||||
def get_js(self):
|
||||
"""Get <script> element containing the data we need and return the raw JS"""
|
||||
logging.debug(" Grabbing embedded script..")
|
||||
self.js_data = self.body.find("script", {"src": False}, text=re.compile(self.target)).string
|
||||
self.extract_data(self.js_data)
|
||||
logging.debug(" Grabbing embedded scripts..")
|
||||
embedded_scripts_raw = [self.body.find("script", {"type": "application/json+ld"}).string]
|
||||
for script in self.body.find_all('script'):
|
||||
try:
|
||||
album_info = script['data-tralbum']
|
||||
embedded_scripts_raw.append(album_info)
|
||||
except:
|
||||
continue
|
||||
for script in embedded_scripts_raw:
|
||||
js_data = self.js_to_json(script)
|
||||
self.json_data.append(js_data)
|
||||
|
||||
def extract_data(self, js: str):
|
||||
"""Extract values from JS dictionary
|
||||
|
||||
:param js: Raw JS
|
||||
"""
|
||||
self.js_data = self.regex.search(js).group().replace('" + "', '') + "}"
|
||||
|
||||
def js_to_json(self):
|
||||
def js_to_json(self, js_data):
|
||||
"""Convert JavaScript dictionary to JSON"""
|
||||
logging.debug(" Converting JS to JSON..")
|
||||
self.get_js()
|
||||
# Decode with demjson first to reformat keys and lists
|
||||
decoded_js = demjson.decode(self.js_data)
|
||||
decoded_js = demjson.decode(js_data)
|
||||
# Encode to make valid JSON, add to list of JSON strings
|
||||
self.json_data.append(demjson.encode(decoded_js))
|
||||
return demjson.encode(decoded_js)
|
||||
|
||||
|
|
Loading…
Reference in New Issue