From 88107f7538ed2f66a4f392305ea6d84182579446 Mon Sep 17 00:00:00 2001 From: Anthony Forsberg Date: Sat, 28 Jan 2017 06:12:21 -0500 Subject: [PATCH] Fixes #100 and possibly #99 Patches Requests if Python version is below 3.6.0 to fix a quirk in httplib relating to utf-8 headers. Also tracks are now sanitized before tagging. --- .gitignore | 1 + bandcamp_dl/bandcamp.py | 5 +++- bandcamp_dl/bandcamp_dl.py | 2 +- bandcamp_dl/bandcampdownloader.py | 26 ++++++++++++++---- bandcamp_dl/bandcampjson.py | 4 +++ bandcamp_dl/deps.txt | 1 + bandcamp_dl/utils/requests_patch.py | 42 +++++++++++++++++++++++++++++ setup.py | 3 ++- 8 files changed, 76 insertions(+), 8 deletions(-) create mode 100644 bandcamp_dl/utils/requests_patch.py diff --git a/.gitignore b/.gitignore index c5e9269..523a573 100644 --- a/.gitignore +++ b/.gitignore @@ -39,3 +39,4 @@ nosetests.xml *.iml *.xml bandcamp_dl/asyncdownloader.py +*.log diff --git a/bandcamp_dl/bandcamp.py b/bandcamp_dl/bandcamp.py index 848b53e..4b7e4ed 100644 --- a/bandcamp_dl/bandcamp.py +++ b/bandcamp_dl/bandcamp.py @@ -1,6 +1,7 @@ from .bandcampjson import BandcampJSON from bs4 import BeautifulSoup from bs4 import FeatureNotFound +from datetime import datetime import requests import json @@ -26,13 +27,15 @@ class Bandcamp: self.generate_album_json() self.tracks = self.tralbum_data_json['trackinfo'] + album_release = self.tralbum_data_json['album_release_date'] + album = { "tracks": [], "title": self.embed_data_json['album_title'], "artist": self.embed_data_json['artist'], "full": False, "art": "", - "date": self.tralbum_data_json['album_release_date'] + "date": datetime.strptime(album_release, "%d %b %Y %X %Z").strftime("%m%d%Y") } for track in self.tracks: diff --git a/bandcamp_dl/bandcamp_dl.py b/bandcamp_dl/bandcamp_dl.py index a29052a..b1f61f4 100755 --- a/bandcamp_dl/bandcamp_dl.py +++ b/bandcamp_dl/bandcamp_dl.py @@ -51,7 +51,7 @@ from .bandcampdownloader import BandcampDownloader def main(): - arguments = docopt(__doc__, 
version='bandcamp-dl 0.0.7-03') + arguments = docopt(__doc__, version='bandcamp-dl 0.0.7-05') bandcamp = Bandcamp() basedir = arguments['--base-dir'] or os.getcwd() diff --git a/bandcamp_dl/bandcampdownloader.py b/bandcamp_dl/bandcampdownloader.py index 39f12fa..d7b7f20 100644 --- a/bandcamp_dl/bandcampdownloader.py +++ b/bandcamp_dl/bandcampdownloader.py @@ -6,6 +6,14 @@ from mutagen.id3._frames import TIT2 from mutagen.easyid3 import EasyID3 from slugify import slugify +if not sys.version_info[:2] == (3, 6): + import mock + from .utils import requests_patch + +# DEBUG +# import logging +# logging.basicConfig(filename='bandcamp-dl.log', level=logging.INFO) + class BandcampDownloader: def __init__(self, urls=None, template=None, directory=None, overwrite=False): @@ -16,7 +24,7 @@ class BandcampDownloader: :param directory: download location :param overwrite: if True overwrite existing files """ - self.headers = {'user_agent': 'bandcamp-dl/0.0.7-02 (https://github.com/iheanyi/bandcamp-dl)'} + self.headers = {'user_agent': 'bandcamp-dl/0.0.7-05 (https://github.com/iheanyi/bandcamp-dl)'} self.session = requests.Session() if type(urls) is str: @@ -98,9 +106,13 @@ class BandcampDownloader: while True: try: - r = self.session.get(track['url'], headers=self.headers, stream=True) - file_length = int(r.headers['content-length']) - total = int(file_length/100) + if not sys.version_info[:2] == (3, 6): + with mock.patch('http.client.parse_headers', requests_patch.parse_headers): + r = self.session.get(track['url'], headers=self.headers, stream=True) + else: + r = self.session.get(track['url'], headers=self.headers, stream=True) + file_length = int(r.headers.get('content-length', 0)) + total = int(file_length / 100) # If file exists and is still a tmp file skip downloading and encode if os.path.exists(filepath): self.write_id3_tags(filepath, track_meta) @@ -121,7 +133,10 @@ class BandcampDownloader: dl += len(data) f.write(data) done = int(50 * dl / file_length) - 
sys.stdout.write("\r({}/{}) [{}{}] :: Downloading: {}".format(self.track_num, self.num_tracks, "=" * done, " " * (50 - done), filename[:-8])) + sys.stdout.write( + "\r({}/{}) [{}{}] :: Downloading: {}".format(self.track_num, self.num_tracks, + "=" * done, " " * (50 - done), + filename[:-8])) sys.stdout.flush() local_size = os.path.getsize(filepath) # if the local filesize before encoding doesn't match the remote filesize redownload @@ -168,6 +183,7 @@ class BandcampDownloader: sys.stdout.write("\r({}/{}) [{}] :: Encoding: {}".format(self.track_num, self.num_tracks, "=" * 50, filename)) audio = MP3(filepath) + audio.delete() audio["TIT2"] = TIT2(encoding=3, text=["title"]) audio.save(filename=None, v1=2) diff --git a/bandcamp_dl/bandcampjson.py b/bandcamp_dl/bandcampjson.py index 095f6bd..65f211f 100644 --- a/bandcamp_dl/bandcampjson.py +++ b/bandcamp_dl/bandcampjson.py @@ -1,6 +1,10 @@ import demjson import re +"""TODO + + More in-depth error messages +""" class BandcampJSON: def __init__(self, body, var_name: str, js_data=None): diff --git a/bandcamp_dl/deps.txt b/bandcamp_dl/deps.txt index f3432fc..0350cbf 100644 --- a/bandcamp_dl/deps.txt +++ b/bandcamp_dl/deps.txt @@ -4,3 +4,4 @@ docopt==0.6.2 mutagen==1.35.1 requests==2.12.4 unicode-slugify==0.1.3 +mock==2.0.0 diff --git a/bandcamp_dl/utils/requests_patch.py b/bandcamp_dl/utils/requests_patch.py new file mode 100644 index 0000000..1fdf90a --- /dev/null +++ b/bandcamp_dl/utils/requests_patch.py @@ -0,0 +1,42 @@ +try: + import cchardet as chardet +except ImportError: + import chardet as chardet + +import http.client +import email.parser + + +def parse_headers(fp, _class=http.client.HTTPMessage): + """Parses only RFC2822 headers from a file pointer. + + email Parser wants to see strings rather than bytes. + But a TextIOWrapper around self.rfile would buffer too many bytes + from the stream, bytes which we later need to read as bytes. + So we read the correct bytes here, as bytes, for email Parser + to parse. 
+ + Note: Monkey-patched version to try to more intelligently determine + header encoding + + """ + headers = [] + while True: + line = fp.readline(http.client._MAXLINE + 1) + if len(line) > http.client._MAXLINE: + raise http.client.LineTooLong("header line") + headers.append(line) + if len(headers) > http.client._MAXHEADERS: + raise http.client.HTTPException("got more than {} headers".format(http.client._MAXHEADERS)) + if line in (b'\r\n', b'\n', b''): + break + + hstring = b''.join(headers) + inferred = chardet.detect(hstring) + if inferred and inferred['confidence'] > 0.8: + # print("Parsing headers!", hstring) + hstring = hstring.decode(inferred['encoding']) + else: + hstring = hstring.decode('iso-8859-1') + + return email.parser.Parser(_class=_class).parsestr(hstring) diff --git a/setup.py b/setup.py index 386db3f..2a50e2a 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ here = path.abspath(path.dirname(__file__)) setup( name='bandcamp-downloader', - version='0.0.7-03', + version='0.0.7-05', description='bandcamp-dl downloads albums and tracks from Bandcamp for you', long_description=open('README.rst').read(), url='https://github.com/iheanyi/bandcamp-dl', @@ -29,6 +29,7 @@ setup( 'mutagen', 'requests', 'unicode-slugify', + 'mock', ], entry_points={ 'console_scripts': [