From f3e91a7d4dbc73b9b6d687f49b37379f8d93068d Mon Sep 17 00:00:00 2001 From: AnthonyF Date: Sun, 10 Jan 2021 15:03:53 -0500 Subject: [PATCH] Exposed slugify args MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enhanced flexibility when naming output files: Example: `--template="%{artist}/%{album}/%{track}-%{title}" --space-char="_"` Would produce "scene" style filenames. Also added is the ability to: - Retain upper case letters - Leave spaces intact - Convert characters to ASCII (北京 -> beijing) - Allow additional characters like `()[];` etc --- .gitignore | 2 + README.rst | 11 ++- bandcamp_dl/__main__.py | 20 ++++-- bandcamp_dl/bandcamp.py | 5 +- bandcamp_dl/bandcampdownloader.py | 30 ++++++-- bandcamp_dl/bandcampjson.py | 5 +- bandcamp_dl/utils/LICENSE-Unicode-Slugify | 27 +++++++ bandcamp_dl/utils/unicode_slugify.py | 88 +++++++++++++++++++++++ requirements.txt | 10 +-- setup.py | 2 +- 10 files changed, 175 insertions(+), 25 deletions(-) create mode 100644 bandcamp_dl/utils/LICENSE-Unicode-Slugify create mode 100644 bandcamp_dl/utils/unicode_slugify.py diff --git a/.gitignore b/.gitignore index b83a140..9641cd4 100644 --- a/.gitignore +++ b/.gitignore @@ -45,3 +45,5 @@ bandcamp_dl/asyncdownloader.py *.log bandcamp_dl/__init__\.py + +venv/ diff --git a/README.rst b/README.rst index df436bb..e8fdcbc 100644 --- a/README.rst +++ b/README.rst @@ -79,6 +79,13 @@ Options -g --group Use album/track Label as iTunes grouping. -r --embed-art Embed album art (If available) -y --no-slugify Disable slugification of track, album, and artist names. + -c --ok-chars=<chars> Specify allowed chars in slugify. + [default: -_~] + -s --space-char=<char> Specify the char to use in place of spaces. 
+ [default: -] + -a --ascii-only Only allow ASCII chars (北京 (capital of china) -> bei-jing-capital-of-china) + -k --keep-spaces Retain whitespace in filenames + -u --keep-upper Retain uppercase letters in filenames Filename Template ----------------- @@ -170,9 +177,9 @@ Dependencies - `BeautifulSoup4 `_ - HTML Parsing - `Demjson `_- JavaScript dict to JSON conversion - `Mutagen `_ - ID3 Encoding -- `Requests `_ - for retriving the HTML +- `Requests `_ - for retrieving the HTML - `Unicode-Slugify `_ - A slug generator that turns strings into unicode slugs. -- `Chardet `_ - Charecter encoding detection +- `Chardet `_ - Character encoding detection - `Docopt `_ - CLI help - `Six `_ - Python 2-3 compatibility - `Unidecode `_ - ASCII representation of Unicode text diff --git a/bandcamp_dl/__main__.py b/bandcamp_dl/__main__.py index f99e3c0..e0051b9 100644 --- a/bandcamp_dl/__main__.py +++ b/bandcamp_dl/__main__.py @@ -23,6 +23,14 @@ Options: -g --group Use album/track Label as iTunes grouping. -r --embed-art Embed album art (If available) -y --no-slugify Disable slugification of track, album, and artist names. + -c --ok-chars=<chars> Specify allowed chars in slugify. + [default: -_~] + -s --space-char=<char> Specify the char to use in place of spaces. + [default: -] + -a --ascii-only Only allow ASCII chars (北京 (capital of china) -> bei-jing-capital-of-china) + -k --keep-spaces Retain whitespace in filenames + -u --keep-upper Retain uppercase letters in filenames + """ """ Coded by: @@ -87,7 +95,7 @@ def main(): for url in urls: logging.debug("\n\tURL: {}".format(url)) # url is now a list of URLs. So lets make an albumList and append each parsed album to it. - albumList = []; + albumList = [] for url in urls: albumList.append(bandcamp.parse(url, not arguments['--no-art'], arguments['--embed-lyrics'], arguments['--debug'])) @@ -97,15 +105,17 @@ def main(): for album in albumList: if arguments['--full-album'] and not album['full']: print("Full album not available. 
Skipping ", album['title'], " ...") - albumList.remove(album) #Remove not-full albums BUT continue with the rest of the albums. + albumList.remove(album) # Remove not-full albums BUT continue with the rest of the albums. if arguments['URL'] or arguments['--artist']: logging.debug("Preparing download process..") for album in albumList: bandcamp_downloader = BandcampDownloader(arguments['--template'], basedir, arguments['--overwrite'], - arguments['--embed-lyrics'], arguments['--group'], - arguments['--embed-art'], arguments['--no-slugify'], - arguments['--debug'], album['url']) + arguments['--embed-lyrics'], arguments['--group'], + arguments['--embed-art'], arguments['--no-slugify'], + arguments['--ok-chars'], arguments['--space-char'], + arguments['--ascii-only'], arguments['--keep-spaces'], + arguments['--keep-upper'], arguments['--debug'], album['url']) logging.debug("Initiating download process..") bandcamp_downloader.start(album) # Add a newline to stop prompt mangling diff --git a/bandcamp_dl/bandcamp.py b/bandcamp_dl/bandcamp.py index 65a8133..b1713a3 100644 --- a/bandcamp_dl/bandcamp.py +++ b/bandcamp_dl/bandcamp.py @@ -68,7 +68,7 @@ class Bandcamp: "full": False, "art": "", "date": str(dt.strptime(album_release, "%d %b %Y %H:%M:%S GMT").year), - "url":url + "url": url } artist_url = page_json['url'].rpartition('/album/')[0] @@ -84,7 +84,8 @@ class Bandcamp: album['art'] = self.get_album_art() logging.debug(" Album generated..") - print("ALBUM URL:", album["url"]) + logging.debug(" Album URL: {}".format(album['url'])) + return album def get_track_lyrics(self, track_url): diff --git a/bandcamp_dl/bandcampdownloader.py b/bandcamp_dl/bandcampdownloader.py index ebc0c4d..b618c8c 100644 --- a/bandcamp_dl/bandcampdownloader.py +++ b/bandcamp_dl/bandcampdownloader.py @@ -8,7 +8,7 @@ from mutagen.id3._frames import TIT1 from mutagen.id3._frames import TIT2 from mutagen.id3._frames import USLT from mutagen.id3._frames import APIC -from slugify import slugify +from 
bandcamp_dl.utils.unicode_slugify import slugify if not sys.version_info[:2] == (3, 6): import mock @@ -20,7 +20,8 @@ from bandcamp_dl.utils.clean_print import print_clean class BandcampDownloader: - def __init__(self, template, directory, overwrite, embed_lyrics, grouping, embed_art, no_slugify, debugging, urls=None): + def __init__(self, template, directory, overwrite, embed_lyrics, grouping, embed_art, no_slugify, ok_chars, + space_char, ascii_only, keep_space, keep_upper, debugging, urls=None): """Initialize variables we will need throughout the Class :param urls: list of urls @@ -42,6 +43,11 @@ class BandcampDownloader: self.embed_art = embed_art self.embed_lyrics = embed_lyrics self.no_slugify = no_slugify + self.ok_chars = ok_chars + self.space_char = space_char + self.ascii_only = ascii_only + self.keep_space = keep_space + self.keep_upper = keep_upper self.debugging = debugging def start(self, album: dict): @@ -63,23 +69,33 @@ class BandcampDownloader: else: self.download_album(album) - def template_to_path(self, track: dict) -> str: + def template_to_path(self, track: dict, ascii_only, ok_chars, space_char, keep_space, keep_upper) -> str: """Create valid filepath based on template :param track: track metadata + :param ok_chars: optional chars to allow + :param ascii_only: allow only ascii chars in filename + :param keep_space: retain whitespace in filename + :param keep_upper: retain uppercase chars in filename + :param space_char: char to use in place of spaces :return: filepath """ logging.debug(" Generating filepath/trackname..") path = self.template + def slugify_preset(content): + slugged = slugify(content, ok=ok_chars, only_ascii=ascii_only, spaces=keep_space, lower=not keep_upper, + space_replacement=space_char) + return slugged + if self.no_slugify: path = path.replace("%{artist}", track['artist']) path = path.replace("%{album}", track['album']) path = path.replace("%{title}", track['title']) else: - path = path.replace("%{artist}", 
slugify(track['artist'])) - path = path.replace("%{album}", slugify(track['album'])) - path = path.replace("%{title}", slugify(track['title'])) + path = path.replace("%{artist}", slugify_preset(track['artist'])) + path = path.replace("%{album}", slugify_preset(track['album'])) + path = path.replace("%{title}", slugify_preset(track['title'])) if track['track'] == "None": path = path.replace("%{track}", "Single") @@ -128,7 +144,7 @@ class BandcampDownloader: self.num_tracks = len(album['tracks']) self.track_num = track_index + 1 - filepath = self.template_to_path(track_meta) + ".tmp" + filepath = self.template_to_path(track_meta, self.ascii_only, self.ok_chars, self.space_char, self.keep_space, self.keep_upper) + ".tmp" filename = filepath.rsplit('/', 1)[1] dirname = self.create_directory(filepath) diff --git a/bandcamp_dl/bandcampjson.py b/bandcamp_dl/bandcampjson.py index 65b1887..425d83d 100644 --- a/bandcamp_dl/bandcampjson.py +++ b/bandcamp_dl/bandcampjson.py @@ -1,4 +1,3 @@ -import re import logging import demjson @@ -37,11 +36,11 @@ class BandcampJSON: js_data = self.js_to_json(script) self.json_data.append(js_data) - def js_to_json(self, js_data): + @staticmethod + def js_to_json(js_data): """Convert JavaScript dictionary to JSON""" logging.debug(" Converting JS to JSON..") # Decode with demjson first to reformat keys and lists decoded_js = demjson.decode(js_data) # Encode to make valid JSON, add to list of JSON strings return demjson.encode(decoded_js) - diff --git a/bandcamp_dl/utils/LICENSE-Unicode-Slugify b/bandcamp_dl/utils/LICENSE-Unicode-Slugify new file mode 100644 index 0000000..4a9af50 --- /dev/null +++ b/bandcamp_dl/utils/LICENSE-Unicode-Slugify @@ -0,0 +1,27 @@ +Copyright (c) 2011, Mozilla Foundation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. 
Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. Neither the name of unicode-slugify nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/bandcamp_dl/utils/unicode_slugify.py b/bandcamp_dl/utils/unicode_slugify.py new file mode 100644 index 0000000..a0307f3 --- /dev/null +++ b/bandcamp_dl/utils/unicode_slugify.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 +from __future__ import unicode_literals + +import re +import six +import unicodedata +from unidecode import unidecode + + +def smart_text(s, encoding='utf-8', errors='strict'): + if isinstance(s, six.text_type): + return s + + if not isinstance(s, six.string_types): + if six.PY3: + if isinstance(s, bytes): + s = six.text_type(s, encoding, errors) + else: + s = six.text_type(s) + elif hasattr(s, '__unicode__'): + s = six.text_type(s) + else: + s = six.text_type(bytes(s), encoding, errors) + else: + s = six.text_type(s) + return s + + +def _sanitize(text, ok): + rv = [] + for c in text: + cat = unicodedata.category(c)[0] + if cat in 'LN' or c in ok: + rv.append(c) + elif cat == 'Z': # space + rv.append(' ') + return ''.join(rv).strip() + + +# Extra characters outside of alphanumerics that we'll allow. +SLUG_OK = '-_~' + + +def slugify(s, ok=SLUG_OK, lower=True, spaces=False, only_ascii=False, space_replacement='-'): + """ + Creates a unicode slug for given string with several options. + + L and N signify letter/number. + http://www.unicode.org/reports/tr44/tr44-4.html#GC_Values_Table + + :param s: Your unicode string. + :param ok: Extra characters outside of alphanumerics to be allowed. + Default is '-_~' + :param lower: Lower the output string. + Default is True + :param spaces: True allows spaces, False replaces a space with the "space_replacement" param + :param only_ascii: True to replace non-ASCII unicode characters with + their ASCII representations. + :param space_replacement: Char used to replace spaces if "spaces" is False. 
+ Default is dash ("-") or first char in ok if dash not allowed + :type s: String + :type ok: String + :type lower: Bool + :type spaces: Bool + :type only_ascii: Bool + :type space_replacement: String + :return: Slugified unicode string + + """ + + if only_ascii and ok != SLUG_OK and hasattr(ok, 'decode'): + try: + ok.decode('ascii') + except UnicodeEncodeError: + raise ValueError(('You can not use "only_ascii=True" with ' + 'a non ascii available chars in "ok" ("%s" given)') % ok) + + new = _sanitize(unicodedata.normalize('NFKC', smart_text(s)), ok) + if only_ascii: + new = _sanitize(smart_text(unidecode(new)), ok) + if not spaces: + if space_replacement and space_replacement not in ok: + space_replacement = ok[0] if ok else '' + new = re.sub('[%s\s]+' % space_replacement, space_replacement, new) + if lower: + new = new.lower() + + return new diff --git a/requirements.txt b/requirements.txt index 4cf6eb5..7b1e169 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ --index-url https://pypi.python.org/simple/ -beautifulsoup4==4.6.0 +beautifulsoup4==4.9.3 demjson==2.2.4 docopt==0.6.2 -mutagen==1.38 -requests==2.18.4 +mutagen==1.45.1 +requests==2.25.1 unicode-slugify==0.1.3 -mock==2.0.0 -chardet==3.0.4 +mock==4.0.3 +chardet==4.0.0 diff --git a/setup.py b/setup.py index 87dbc81..c7f311e 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from codecs import open from os import path import sys -appversion = "0.0.9-01" +appversion = "0.0.10" here = path.abspath(path.dirname(__file__))