bandcamp-dl/bandcamp_dl/utils/unicode_slugify.py

# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import re
import six
import unicodedata
from unidecode import unidecode


def smart_text(s, encoding='utf-8', errors='strict'):
	"""
	Coerce ``s`` to the text type exposed by ``six`` (``six.text_type``).

	:param s: Value to convert; returned unchanged when it already is text.
	:param encoding: Codec used when a bytes value has to be decoded.
	:param errors: Error handling scheme passed along with ``encoding``.
	:return: ``s`` as a ``six.text_type`` instance
	"""
	to_text = six.text_type

	# Already text: nothing to do.
	if isinstance(s, to_text):
		return s

	# A string type that is not the text type (presumably a Python 2 byte
	# ``str`` -- confirm against six). It is converted without ``encoding``
	# or ``errors``.
	if isinstance(s, six.string_types):
		return to_text(s)

	if six.PY3:
		# Only bytes are decoded; any other object goes through the plain
		# text constructor.
		if isinstance(s, bytes):
			return to_text(s, encoding, errors)
		return to_text(s)

	# Not Python 3: prefer the object's own ``__unicode__`` hook when present,
	# otherwise decode its bytes representation.
	if hasattr(s, '__unicode__'):
		return to_text(s)
	return to_text(bytes(s), encoding, errors)


def _sanitize(text, ok):
	rv = []
	for c in text:
		cat = unicodedata.category(c)[0]
		if cat in 'LN' or c in ok:
			rv.append(c)
		elif cat == 'Z':  # space
			rv.append(' ')
	return ''.join(rv).strip()


# Extra characters, beyond Unicode letters and numbers, that are kept in a
# slug. Used as the default value of slugify()'s "ok" parameter.
SLUG_OK = '-_~'


def slugify(s, ok=SLUG_OK, lower=True, spaces=False, only_ascii=False, space_replacement='-'):
	"""
	Creates a unicode slug for given string with several options.

	The input is NFKC-normalized, then reduced to letters, numbers and the
	characters listed in ``ok`` (L and N signify letter/number, see
	http://www.unicode.org/reports/tr44/tr44-4.html#GC_Values_Table).

	:param s: Your unicode string.
	:param ok: Extra characters outside of alphanumerics to be allowed.
				Default is '-_~'
	:param lower: Lower the output string.
					Default is True
	:param spaces: True allows spaces, False replaces a space with the "space_replacement" param
	:param only_ascii: True to replace non-ASCII unicode characters with
						their ASCII representations.
	:param space_replacement: Char used to replace spaces if "spaces" is False.
								Default is dash ("-") or first char in ok if dash not allowed
	:type s: String
	:type ok: String
	:type lower: Bool
	:type spaces: Bool
	:type only_ascii: Bool
	:type space_replacement: String
	:return: Slugified unicode string
	:raises ValueError: If "only_ascii" is True and "ok" contains a non-ASCII character.

	"""

	# Validate by code point instead of calling ``ok.decode('ascii')``: text
	# strings have no ``decode`` method on Python 3, so the previous check was
	# silently skipped there and non-ASCII characters leaked into the slug.
	if only_ascii and any(ord(char) > 127 for char in ok):
		raise ValueError(('You can not use "only_ascii=True" with '
								'a non ascii available chars in "ok" ("%s" given)') % ok)

	new = _sanitize(unicodedata.normalize('NFKC', smart_text(s)), ok)
	if only_ascii:
		# Transliteration may introduce characters that are not allowed,
		# so the result is filtered a second time.
		new = _sanitize(smart_text(unidecode(new)), ok)
	if not spaces:
		if space_replacement and space_replacement not in ok:
			# The requested replacement would not be a legal slug character:
			# fall back to the first allowed extra char (or nothing at all).
			space_replacement = ok[0] if ok else ''
		# Escape the replacement so characters such as "]", "^" or "\" cannot
		# alter the character class, and substitute through a function so the
		# replacement is inserted literally (no backslash/group expansion).
		separator_run = r'[%s\s]+' % re.escape(space_replacement)
		new = re.sub(separator_run, lambda match: space_replacement, new)
	if lower:
		new = new.lower()

	return new