Exposed slugify args

Enhanced flexibility when naming output files.

Example: `--template="%{artist}/%{album}/%{track}-%{title}" --space-char="_"` would produce "scene"-style filenames.

Also added is the ability to:
- Retain uppercase letters
- Leave spaces intact
- Convert characters to ASCII (北京 -> bei-jing)
- Allow additional characters such as `()[];`
2021-01-10 15:03:53 -05:00 · 2021-01-10 15:03:53 -05:00 · f3e91a7d4d
parent 0b9ce91621
commit f3e91a7d4d
10 changed files with 175 additions and 25 deletions
--- a/.gitignore
+++ b/.gitignore
@ -45,3 +45,5 @@ bandcamp_dl/asyncdownloader.py
 *.log

 bandcamp_dl/__init__\.py
+
+venv/
--- a/README.rst
+++ b/README.rst
@ -79,6 +79,13 @@ Options
        -g --group              Use album/track Label as iTunes grouping.
        -r --embed-art          Embed album art (If available)
        -y --no-slugify         Disable slugification of track, album, and artist names.
+        -c --ok-chars=<chars>   Specify allowed chars in slugify.
+                                [default: -_~]
+        -s --space-char=<char>  Specify the char to use in place of spaces.
+                                [default: -]
+        -a --ascii-only         Only allow ASCII chars (北京 (capital of china) -> bei-jing-capital-of-china)
+        -k --keep-spaces        Retain whitespace in filenames
+        -u --keep-upper         Retain uppercase letters in filenames

 Filename Template
 -----------------
@ -170,9 +177,9 @@ Dependencies
 -  `BeautifulSoup4 <https://pypi.python.org/pypi/beautifulsoup4>`_ - HTML Parsing
 -  `Demjson <https://pypi.python.org/pypi/demjson>`_- JavaScript dict to JSON conversion
 -  `Mutagen <https://pypi.python.org/pypi/mutagen>`_ - ID3 Encoding
-  `Requests <https://pypi.python.org/pypi/requests>`_ - for retriving the HTML
+-  `Requests <https://pypi.python.org/pypi/requests>`_ - for retrieving the HTML
 -  `Unicode-Slugify <https://pypi.python.org/pypi/unicode-slugify>`_ - A slug generator that turns strings into unicode slugs.
-  `Chardet <https://pypi.python.org/pypi/chardet>`_ - Charecter encoding detection
+-  `Chardet <https://pypi.python.org/pypi/chardet>`_ - Character encoding detection
 -  `Docopt <https://pypi.python.org/pypi/docopt>`_ - CLI help
 -  `Six <https://pypi.python.org/pypi/six>`_ - Python 2-3 compatibility
 -  `Unidecode <https://pypi.python.org/pypi/unidecode>`_ - ASCII representation of Unicode text
--- a/bandcamp_dl/main.py
+++ b/bandcamp_dl/main.py
@ -23,6 +23,14 @@ Options:
    -g --group              Use album/track Label as iTunes grouping.
    -r --embed-art          Embed album art (If available)
    -y --no-slugify         Disable slugification of track, album, and artist names.
+    -c --ok-chars=<chars>   Specify allowed chars in slugify.
+                            [default: -_~]
+    -s --space-char=<char>  Specify the char to use in place of spaces.
+                            [default: -]
+    -a --ascii-only         Only allow ASCII chars (北京 (capital of china) -> bei-jing-capital-of-china)
+    -k --keep-spaces        Retain whitespace in filenames
+    -u --keep-upper         Retain uppercase letters in filenames
+
 """
 """
 Coded by:
@ -87,7 +95,7 @@ def main():
    for url in urls:
        logging.debug("\n\tURL: {}".format(url))
    # url is now a list of URLs. So lets make an albumList and append each parsed album to it.
-    albumList = [];
+    albumList = []
    for url in urls:
        albumList.append(bandcamp.parse(url, not arguments['--no-art'], arguments['--embed-lyrics'], arguments['--debug']))
    
@ -97,7 +105,7 @@ def main():
    for album in albumList:
        if arguments['--full-album'] and not album['full']:
            print("Full album not available. Skipping ", album['title'], " ...")
-            albumList.remove(album) #Remove not-full albums BUT continue with the rest of the albums.
+            albumList.remove(album)  # Remove not-full albums BUT continue with the rest of the albums.

    if arguments['URL'] or arguments['--artist']:
        logging.debug("Preparing download process..")
@ -105,7 +113,9 @@ def main():
            bandcamp_downloader = BandcampDownloader(arguments['--template'], basedir, arguments['--overwrite'],
                                                     arguments['--embed-lyrics'], arguments['--group'],
                                                     arguments['--embed-art'], arguments['--no-slugify'],
-                                                 arguments['--debug'], album['url'])
+                                                     arguments['--ok-chars'], arguments['--space-char'],
+                                                     arguments['--ascii-only'], arguments['--keep-spaces'],
+                                                     arguments['--keep-upper'], arguments['--debug'], album['url'])
            logging.debug("Initiating download process..")
            bandcamp_downloader.start(album)
            # Add a newline to stop prompt mangling
--- a/bandcamp_dl/bandcamp.py
+++ b/bandcamp_dl/bandcamp.py
@ -68,7 +68,7 @@ class Bandcamp:
            "full": False,
            "art": "",
            "date": str(dt.strptime(album_release, "%d %b %Y %H:%M:%S GMT").year),
-            "url":url
+            "url": url
        }

        artist_url = page_json['url'].rpartition('/album/')[0]
@ -84,7 +84,8 @@ class Bandcamp:
            album['art'] = self.get_album_art()

        logging.debug(" Album generated..")
-        print("ALBUM URL:", album["url"])
+        logging.debug(" Album URL: {}".format(album['url']))
+
        return album

    def get_track_lyrics(self, track_url):
--- a/bandcamp_dl/bandcampdownloader.py
+++ b/bandcamp_dl/bandcampdownloader.py
@ -8,7 +8,7 @@ from mutagen.id3._frames import TIT1
 from mutagen.id3._frames import TIT2
 from mutagen.id3._frames import USLT
 from mutagen.id3._frames import APIC
-from slugify import slugify
+from bandcamp_dl.utils.unicode_slugify import slugify

 if not sys.version_info[:2] == (3, 6):
    import mock
@ -20,7 +20,8 @@ from bandcamp_dl.utils.clean_print import print_clean


 class BandcampDownloader:
-    def __init__(self, template, directory, overwrite, embed_lyrics, grouping, embed_art, no_slugify, debugging, urls=None):
+    def __init__(self, template, directory, overwrite, embed_lyrics, grouping, embed_art, no_slugify, ok_chars,
+                 space_char, ascii_only, keep_space, keep_upper, debugging, urls=None):
        """Initialize variables we will need throughout the Class

        :param urls: list of urls
@ -42,6 +43,11 @@ class BandcampDownloader:
        self.embed_art = embed_art
        self.embed_lyrics = embed_lyrics
        self.no_slugify = no_slugify
+        self.ok_chars = ok_chars
+        self.space_char = space_char
+        self.ascii_only = ascii_only
+        self.keep_space = keep_space
+        self.keep_upper = keep_upper
        self.debugging = debugging

    def start(self, album: dict):
@ -63,23 +69,33 @@ class BandcampDownloader:
        else:
            self.download_album(album)

-    def template_to_path(self, track: dict) -> str:
+    def template_to_path(self, track: dict, ascii_only, ok_chars, space_char, keep_space, keep_upper) -> str:
        """Create valid filepath based on template

        :param track: track metadata
+        :param ok_chars: optional chars to allow
+        :param ascii_only: allow only ascii chars in filename
+        :param keep_space: retain whitespace in filename
+        :param keep_upper: retain uppercase chars in filename
+        :param space_char: char to use in place of spaces
        :return: filepath
        """
        logging.debug(" Generating filepath/trackname..")
        path = self.template

+        def slugify_preset(content):
+            slugged = slugify(content, ok=ok_chars, only_ascii=ascii_only, spaces=keep_space, lower=not keep_upper,
+                              space_replacement=space_char)
+            return slugged
+
        if self.no_slugify:
            path = path.replace("%{artist}", track['artist'])
            path = path.replace("%{album}", track['album'])
            path = path.replace("%{title}", track['title'])
        else:
-            path = path.replace("%{artist}", slugify(track['artist']))
-            path = path.replace("%{album}", slugify(track['album']))
-            path = path.replace("%{title}", slugify(track['title']))
+            path = path.replace("%{artist}", slugify_preset(track['artist']))
+            path = path.replace("%{album}", slugify_preset(track['album']))
+            path = path.replace("%{title}", slugify_preset(track['title']))

        if track['track'] == "None":
            path = path.replace("%{track}", "Single")
@ -128,7 +144,7 @@ class BandcampDownloader:
            self.num_tracks = len(album['tracks'])
            self.track_num = track_index + 1

-            filepath = self.template_to_path(track_meta) + ".tmp"
+            filepath = self.template_to_path(track_meta, self.ascii_only, self.ok_chars, self.space_char, self.keep_space, self.keep_upper) + ".tmp"
            filename = filepath.rsplit('/', 1)[1]
            dirname = self.create_directory(filepath)

--- a/bandcamp_dl/bandcampjson.py
+++ b/bandcamp_dl/bandcampjson.py
@ -1,4 +1,3 @@
-import re
 import logging

 import demjson
@ -37,11 +36,11 @@ class BandcampJSON:
            js_data = self.js_to_json(script)
            self.json_data.append(js_data)

-    def js_to_json(self, js_data):
+    @staticmethod
+    def js_to_json(js_data):
        """Convert JavaScript dictionary to JSON"""
        logging.debug(" Converting JS to JSON..")
        # Decode with demjson first to reformat keys and lists
        decoded_js = demjson.decode(js_data)
        # Encode to make valid JSON, add to list of JSON strings
        return demjson.encode(decoded_js)
-        
--- a/bandcamp_dl/utils/LICENSE-Unicode-Slugify
+++ b/bandcamp_dl/utils/LICENSE-Unicode-Slugify
@ -0,0 +1,27 @@
+Copyright (c) 2011, Mozilla Foundation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    1. Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimer.
+
+    2. Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+    3. Neither the name of unicode-slugify nor the names of its contributors
+       may be used to endorse or promote products derived from this software
+       without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/bandcamp_dl/utils/unicode_slugify.py
+++ b/bandcamp_dl/utils/unicode_slugify.py
@ -0,0 +1,88 @@
+# -*- coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import six
+import unicodedata
+from unidecode import unidecode
+
+
+def smart_text(s, encoding='utf-8', errors='strict'):
+	"""Coerce ``s`` to ``six.text_type``; text input is returned unchanged.
+
+	``encoding``/``errors`` are passed only on the two branches that decode bytes.
+	"""
+	if isinstance(s, six.text_type):
+		return s
+	if isinstance(s, six.string_types):
+		return six.text_type(s)
+	if six.PY3:
+		if isinstance(s, bytes):
+			return six.text_type(s, encoding, errors)
+		return six.text_type(s)
+	if hasattr(s, '__unicode__'):
+		return six.text_type(s)
+	# No __unicode__ hook: round-trip through bytes() and decode explicitly.
+	return six.text_type(bytes(s), encoding, errors)
+
+
+def _sanitize(text, ok):
+	"""Keep letters, numbers and ``ok`` chars; turn separators into ' '; drop the rest."""
+	def _keep(ch):
+		major = unicodedata.category(ch)[0]  # first letter is the major category
+		if major in 'LN' or ch in ok:
+			return ch
+		return ' ' if major == 'Z' else ''  # 'Z' = separator; anything else is dropped
+
+	return ''.join(_keep(ch) for ch in text).strip()
+
+
+# Extra characters outside of alphanumerics that we'll allow.
+SLUG_OK = '-_~'
+
+
+def slugify(s, ok=SLUG_OK, lower=True, spaces=False, only_ascii=False, space_replacement='-'):
+	"""
+	Creates a unicode slug for given string with several options.
+
+	L and N signify letter/number.
+	http://www.unicode.org/reports/tr44/tr44-4.html#GC_Values_Table
+
+	:param s: Your unicode string.
+	:param ok: Extra characters outside of alphanumerics to be allowed.
+				Default is '-_~'
+	:param lower: Lower the output string.
+					Default is True
+	:param spaces: True allows spaces, False replaces a space with the "space_replacement" param
+	:param only_ascii: True to replace non-ASCII unicode characters with
+						their ASCII representations.
+	:param space_replacement: Char used to replace spaces if "spaces" is False. Default is
+								dash ("-"); if it is not in "ok", the first char of "ok" (or '') is used
+	:type s: String
+	:type ok: String
+	:type lower: Bool
+	:type spaces: Bool
+	:type only_ascii: Bool
+	:type space_replacement: String
+	:return: Slugified unicode string
+	:raises ValueError: if "only_ascii" is True and an "ok" with a decode() method is not ASCII
+	"""
+
+	if only_ascii and ok != SLUG_OK and hasattr(ok, 'decode'):
+		try:
+			ok.decode('ascii')
+		except UnicodeError:  # parent of both UnicodeEncodeError and UnicodeDecodeError
+			raise ValueError(('You can not use "only_ascii=True" with '
+									'a non ascii available chars in "ok" ("%s" given)') % ok)
+
+	new = _sanitize(unicodedata.normalize('NFKC', smart_text(s)), ok)
+	if only_ascii:
+		new = _sanitize(smart_text(unidecode(new)), ok)
+	if not spaces:
+		if space_replacement and space_replacement not in ok:
+			space_replacement = ok[0] if ok else ''
+		new = re.sub(r'[%s\s]+' % re.escape(space_replacement), lambda _m: space_replacement, new)
+	if lower:
+		new = new.lower()
+
+	return new
--- a/requirements.txt
+++ b/requirements.txt
@ -1,10 +1,10 @@
 --index-url https://pypi.python.org/simple/

-beautifulsoup4==4.6.0
+beautifulsoup4==4.9.3
 demjson==2.2.4
 docopt==0.6.2
-mutagen==1.38
-requests==2.18.4
+mutagen==1.45.1
+requests==2.25.1
 unicode-slugify==0.1.3
-mock==2.0.0
-chardet==3.0.4
+mock==4.0.3
+chardet==4.0.0
--- a/setup.py
+++ b/setup.py
@ -3,7 +3,7 @@ from codecs import open
 from os import path
 import sys

-appversion = "0.0.9-01"
+appversion = "0.0.10"

 here = path.abspath(path.dirname(__file__))