implement a way to shorten filenames with east-asian characters

()

Setting 'output.shorten' to "eaw" (East-Asian Width) uses a slower
algorithm that also considers characters with a width > 1.
This commit is contained in:
Mike Fährmann 2021-09-13 21:29:38 +02:00
parent 2ff2974353
commit bd845303ad
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
4 changed files with 219 additions and 15 deletions

@ -2531,6 +2531,9 @@ Description
Controls whether the output strings should be shortened to fit
on one console line.
Set this option to ``"eaw"`` to also correctly handle East Asian characters
with a display width greater than 1 (wide and fullwidth characters).
output.skip
-----------

@ -10,6 +10,7 @@ import os
import sys
import shutil
import logging
import unicodedata
from . import config, util
@ -270,9 +271,14 @@ class PipeOutput(NullOutput):
class TerminalOutput(NullOutput):
def __init__(self):
self.short = config.get(("output",), "shorten", True)
if self.short:
self.width = shutil.get_terminal_size().columns - OFFSET
shorten = config.get(("output",), "shorten", True)
if shorten:
func = shorten_string_eaw if shorten == "eaw" else shorten_string
limit = shutil.get_terminal_size().columns - OFFSET
sep = CHAR_ELLIPSIES
self.shorten = lambda txt: func(txt, limit, sep)
else:
self.shorten = util.identity
def start(self, path):
print(self.shorten(" " + path), end="", flush=True)
@ -283,17 +289,6 @@ class TerminalOutput(NullOutput):
def success(self, path, tries):
print("\r", self.shorten(CHAR_SUCCESS + path), sep="")
def shorten(self, txt):
"""Reduce the length of 'txt' to the width of the terminal"""
if self.short and len(txt) > self.width:
hwidth = self.width // 2 - OFFSET
return "".join((
txt[:hwidth-1],
CHAR_ELLIPSIES,
txt[-hwidth-(self.width % 2):]
))
return txt
class ColorOutput(TerminalOutput):
@ -307,6 +302,56 @@ class ColorOutput(TerminalOutput):
print("\r\033[1;32m", self.shorten(path), "\033[0m", sep="")
class EAWCache(dict):
    """Memoizing mapping from a single character to its display width.

    Looking up a character that is not stored yet computes its width,
    stores it in the mapping, and returns it, so each distinct character
    is passed to 'unicodedata.east_asian_width()' only once.
    """

    def __missing__(self, key):
        """Compute, remember, and return the display width of 'key'"""
        category = unicodedata.east_asian_width(key)
        # "W" (wide) and "F" (fullwidth) count as two columns,
        # every other category as one
        if category == "W" or category == "F":
            width = 2
        else:
            width = 1
        self[key] = width
        return width
def shorten_string(txt, limit, sep="…"):
    """Limit width of 'txt'; assume all characters have a width of 1

    Arguments:
        txt:   the string to shorten
        limit: maximum number of characters of the result
        sep:   string inserted in place of the removed middle part

    Returns 'txt' unchanged if it already fits into 'limit' characters.
    Otherwise the middle of 'txt' is replaced by 'sep' so that the result
    is exactly 'limit' characters long; when the number of kept characters
    is odd, the extra one goes to the tail.

    If 'limit' leaves no room for any character of 'txt' next to 'sep',
    only 'sep' (cut down to 'limit' characters if necessary) is returned,
    so the result never exceeds 'limit' for non-negative limits.
    """
    if len(txt) <= limit:
        return txt

    room = limit - len(sep)
    if room <= 0:
        # Without this guard the tail slice below would be 'txt[-0:]',
        # which is the whole string and not an empty one.
        return sep[:max(limit, 0)]

    head = room // 2
    tail = room - head  # same as (room+1) // 2; at least 1 at this point
    return txt[:head] + sep + txt[-tail:]
def shorten_string_eaw(txt, limit, sep="…", cache=EAWCache()):
    """Limit width of 'txt'; check for east-asian characters with width > 1

    Arguments:
        txt:   the string to shorten
        limit: maximum display width (in columns) of the result
        sep:   string inserted in place of the removed middle part;
               it is counted as len(sep) columns
        cache: mapping from a character to its display width; the default
               instance is deliberately shared between calls so that
               widths are only computed once per character

    Returns 'txt' unchanged if its display width already fits into
    'limit'. Otherwise the middle of 'txt' is replaced by 'sep'. Columns
    that cannot be used on the left side (because the next character
    would be too wide) are handed over to the right side.

    If 'limit' leaves no room for any character of 'txt' next to 'sep',
    only 'sep' (cut down to 'limit' characters if necessary) is returned.
    """
    char_widths = [cache[c] for c in txt]
    text_width = sum(char_widths)

    if text_width <= limit:
        # no shortening required
        return txt

    room = limit - len(sep)
    if room <= 0:
        # Without this guard the tail slices below would cover the
        # whole string instead of nothing.
        return sep[:max(limit, 0)]

    if text_width == len(txt):
        # all characters have a width of 1
        head = room // 2
        return txt[:head] + sep + txt[-(room - head):]

    # wide characters: keep leading characters while they fit
    # into the left half
    budget = room // 2
    left = 0
    for width in char_widths:
        if width > budget:
            break
        budget -= width
        left += 1

    # right half, plus whatever the left half could not use
    budget += (room + 1) // 2
    right = len(txt)
    for width in reversed(char_widths):
        if width > budget:
            break
        budget -= width
        right -= 1

    # 'right' is an absolute index, so an empty tail is 'txt[len(txt):]'
    # and not 'txt[0:]' as it would be with a negative offset of zero
    return txt[:left] + sep + txt[right:]
if util.WINDOWS:
ANSI = os.environ.get("TERM") == "ANSI"
OFFSET = 1

@ -2,7 +2,7 @@
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
TESTS_CORE=(cache config cookies downloader extractor job oauth postprocessor text util)
TESTS_CORE=(cache config cookies downloader extractor job oauth output postprocessor text util)
TESTS_RESULTS=(results)

156
test/test_output.py Normal file

@ -0,0 +1,156 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from gallery_dl import output # noqa E402
class TestShorten(unittest.TestCase):
    """Tests for output.shorten_string() (all characters one column wide)"""

    def _verify(self, f, text, cases):
        """Assert that f(text, *args) equals 'expected' for each case"""
        for args, expected in cases:
            self.assertEqual(f(text, *args), expected)

    def test_shorten_noop(self, f=output.shorten_string):
        # strings that already fit must be returned as they are
        for text in ("", "foobar"):
            self.assertEqual(f(text, 10), text)

    def test_shorten(self, f=output.shorten_string):
        text = "01234567890123456789"  # string of length 20

        self._verify(f, text, (
            ((30,), text),
            ((25,), text),
            ((20,), text),
            ((19,), "012345678…123456789"),
            ((18,), "01234567…123456789"),
            ((17,), "01234567…23456789"),
            ((16,), "0123456…23456789"),
            ((15,), "0123456…3456789"),
            ((14,), "012345…3456789"),
            ((13,), "012345…456789"),
            ((12,), "01234…456789"),
            ((11,), "01234…56789"),
            ((10,), "0123…56789"),
            ((9,), "0123…6789"),
            ((3,), "0…9"),
            ((2,), "…9"),
        ))

    def test_shorten_separator(self, f=output.shorten_string):
        text = "01234567890123456789"  # string of length 20

        self._verify(f, text, (
            ((20, "|---|"), text),
            ((19, "|---|"), "0123456|---|3456789"),
            ((15, "|---|"), "01234|---|56789"),
            ((10, "|---|"), "01|---|789"),
            ((19, "..."), "01234567...23456789"),
            ((19, ".."), "01234567..123456789"),
            ((19, "."), "012345678.123456789"),
            ((19, ""), "0123456780123456789"),
        ))
class TestShortenEAW(unittest.TestCase):
    """Tests for output.shorten_string_eaw() (width-aware shortening)"""

    def _verify(self, f, text, cases):
        """Assert that f(text, *args) equals 'expected' for each case"""
        for args, expected in cases:
            self.assertEqual(f(text, *args), expected)

    def test_shorten_eaw_noop(self, f=output.shorten_string_eaw):
        # strings that already fit must be returned as they are
        for text in ("", "foobar"):
            self.assertEqual(f(text, 10), text)

    def test_shorten_eaw(self, f=output.shorten_string_eaw):
        text = "01234567890123456789"  # 20 ascii characters

        self._verify(f, text, (
            ((30,), text),
            ((25,), text),
            ((20,), text),
            ((19,), "012345678…123456789"),
            ((18,), "01234567…123456789"),
            ((17,), "01234567…23456789"),
            ((16,), "0123456…23456789"),
            ((15,), "0123456…3456789"),
            ((14,), "012345…3456789"),
            ((13,), "012345…456789"),
            ((12,), "01234…456789"),
            ((11,), "01234…56789"),
            ((10,), "0123…56789"),
            ((9,), "0123…6789"),
            ((3,), "0…9"),
            ((2,), "…9"),
        ))

    def test_shorten_eaw_wide(self, f=output.shorten_string_eaw):
        text = "幻想郷幻想郷幻想郷幻想郷"  # 12 wide characters

        self._verify(f, text, (
            ((30,), text),
            ((25,), text),
            ((20,), "幻想郷幻…想郷幻想郷"),
            ((19,), "幻想郷幻…想郷幻想郷"),
            ((18,), "幻想郷幻…郷幻想郷"),
            ((17,), "幻想郷幻…郷幻想郷"),
            ((16,), "幻想郷…郷幻想郷"),
            ((15,), "幻想郷…郷幻想郷"),
            ((14,), "幻想郷…幻想郷"),
            ((13,), "幻想郷…幻想郷"),
            ((12,), "幻想…幻想郷"),
            ((11,), "幻想…幻想郷"),
            ((10,), "幻想…想郷"),
            ((9,), "幻想…想郷"),
            ((3,), "…郷"),
        ))

    def test_shorten_eaw_mix(self, f=output.shorten_string_eaw):
        text = "幻-想-郷##幻-想-郷##幻-想-郷"  # mixed characters

        self._verify(f, text, (
            ((28,), text),
            ((25,), "幻-想-郷##幻…郷##幻-想-郷"),
            ((20,), "幻-想-郷#…##幻-想-郷"),
            ((19,), "幻-想-郷#…#幻-想-郷"),
            ((18,), "幻-想-郷…#幻-想-郷"),
            ((17,), "幻-想-郷…幻-想-郷"),
            ((16,), "幻-想-…#幻-想-郷"),
            ((15,), "幻-想-…幻-想-郷"),
            ((14,), "幻-想-…-想-郷"),
            ((13,), "幻-想-…-想-郷"),
            ((12,), "幻-想…-想-郷"),
            ((11,), "幻-想…想-郷"),
            ((10,), "幻-…-想-郷"),
            ((9,), "幻-…想-郷"),
            ((3,), "…郷"),
        ))

    def test_shorten_eaw_separator(self, f=output.shorten_string_eaw):
        text = "01234567890123456789"  # 20 ascii characters

        self._verify(f, text, (
            ((20, "|---|"), text),
            ((19, "|---|"), "0123456|---|3456789"),
            ((15, "|---|"), "01234|---|56789"),
            ((10, "|---|"), "01|---|789"),
            ((19, "..."), "01234567...23456789"),
            ((19, ".."), "01234567..123456789"),
            ((19, "."), "012345678.123456789"),
            ((19, ""), "0123456780123456789"),
        ))

    def test_shorten_eaw_separator_wide(self, f=output.shorten_string_eaw):
        text = "幻想郷幻想郷幻想郷幻想郷"  # 12 wide characters

        self._verify(f, text, (
            ((24, "|---|"), text),
            ((19, "|---|"), "幻想郷|---|郷幻想郷"),
            ((15, "|---|"), "幻想|---|幻想郷"),
            ((10, "|---|"), "幻|---|郷"),
            ((19, "..."), "幻想郷幻...郷幻想郷"),
            ((19, ".."), "幻想郷幻..郷幻想郷"),
            ((19, "."), "幻想郷幻.想郷幻想郷"),
            ((19, ""), "幻想郷幻想郷幻想郷"),
        ))

    def test_shorten_eaw_separator_mix_(self, f=output.shorten_string_eaw):
        text = "幻-想-郷##幻-想-郷##幻-想-郷"  # mixed characters

        self._verify(f, text, (
            ((30, "|---|"), text),
            ((19, "|---|"), "幻-想-|---|幻-想-郷"),
            ((15, "|---|"), "幻-想|---|想-郷"),
            ((10, "|---|"), "幻|---|-郷"),
            ((19, "..."), "幻-想-郷...幻-想-郷"),
            ((19, ".."), "幻-想-郷..#幻-想-郷"),
            ((19, "."), "幻-想-郷#.#幻-想-郷"),
            ((19, ""), "幻-想-郷###幻-想-郷"),
        ))
# Run all tests of this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()