implement a way to shorten filenames with east-asian characters

(#1377) Setting 'output.shorten' to "eaw" (East-Asian Width) uses a slower algorithm that also considers characters with a width > 1.
2021-09-13 21:29:38 +02:00 · 2021-09-13 21:29:38 +02:00 · bd845303ad
commit bd845303ad
parent 2ff2974353
4 changed files with 219 additions and 15 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@ -2531,6 +2531,9 @@ Description
    Controls whether the output strings should be shortened to fit
    on one console line.

+    Set this option to ``"eaw"`` to also work with east-asian characters
+    with a display width greater than 1.
+

 output.skip
 -----------
--- a/gallery_dl/output.py
+++ b/gallery_dl/output.py
@ -10,6 +10,7 @@ import os
 import sys
 import shutil
 import logging
+import unicodedata
 from . import config, util


@ -270,9 +271,14 @@ class PipeOutput(NullOutput):
 class TerminalOutput(NullOutput):

    def __init__(self):
-        self.short = config.get(("output",), "shorten", True)
-        if self.short:
-            self.width = shutil.get_terminal_size().columns - OFFSET
+        shorten = config.get(("output",), "shorten", True)
+        if shorten:
+            func = shorten_string_eaw if shorten == "eaw" else shorten_string
+            limit = shutil.get_terminal_size().columns - OFFSET
+            sep = CHAR_ELLIPSIES
+            self.shorten = lambda txt: func(txt, limit, sep)
+        else:
+            self.shorten = util.identity

    def start(self, path):
        print(self.shorten("  " + path), end="", flush=True)
@ -283,17 +289,6 @@ class TerminalOutput(NullOutput):
    def success(self, path, tries):
        print("\r", self.shorten(CHAR_SUCCESS + path), sep="")

-    def shorten(self, txt):
-        """Reduce the length of 'txt' to the width of the terminal"""
-        if self.short and len(txt) > self.width:
-            hwidth = self.width // 2 - OFFSET
-            return "".join((
-                txt[:hwidth-1],
-                CHAR_ELLIPSIES,
-                txt[-hwidth-(self.width % 2):]
-            ))
-        return txt
-

 class ColorOutput(TerminalOutput):

@ -307,6 +302,56 @@ class ColorOutput(TerminalOutput):
        print("\r\033[1;32m", self.shorten(path), "\033[0m", sep="")


+class EAWCache(dict):
+    """Cache that maps a single character to its display width (1 or 2)"""
+
+    def __missing__(self, key):
+        """Compute, store, and return the width of a not yet cached 'key'"""
+        # Unicode East Asian Width classes "W" and "F" take up two columns
+        category = unicodedata.east_asian_width(key)
+        if category == "W" or category == "F":
+            columns = 2
+        else:
+            columns = 1
+        self[key] = columns
+        return columns
+
+
+def shorten_string(txt, limit, sep="…"):
+    """Limit width of 'txt'; assume all characters have a width of 1
+
+    If 'txt' is longer than 'limit', characters are removed from its middle
+    and replaced by 'sep', so that the result is 'limit' characters long.
+    Otherwise 'txt' is returned unchanged.
+
+    When 'limit' is smaller than len(sep), 'sep' is returned on its own;
+    this is the only case in which the result is longer than 'limit'.
+    """
+    if len(txt) <= limit:
+        return txt
+    # number of characters of 'txt' that fit next to 'sep' (never negative)
+    limit = max(limit - len(sep), 0)
+    # use an explicit start index for the tail:
+    # 'txt[-0:]' would return all of 'txt' instead of an empty string
+    return txt[:limit // 2] + sep + txt[len(txt) - (limit+1) // 2:]
+
+
+def shorten_string_eaw(txt, limit, sep="…", cache=EAWCache()):
+    """Limit width of 'txt'; check for east-asian characters with width > 1
+
+    If the display width of 'txt' exceeds 'limit', characters are removed
+    from its middle and replaced by 'sep', so that the result is at most
+    'limit' columns wide. Otherwise 'txt' is returned unchanged.
+
+    'cache' maps a character to its display width. Its default value is
+    created once and deliberately shared between calls, so the width of
+    each character only has to be determined a single time.
+
+    When 'sep' on its own is wider than 'limit', only 'sep' is returned.
+    """
+    char_widths = [cache[c] for c in txt]
+    text_width = sum(char_widths)
+
+    if text_width <= limit:
+        # no shortening required
+        return txt
+
+    # columns left for characters of 'txt' (never negative);
+    # 'sep' is measured by display width as well
+    limit = max(limit - sum(cache[c] for c in sep), 0)
+    if text_width == len(txt):
+        # all characters have a width of 1
+        # (explicit start index: 'txt[-0:]' would return all of 'txt')
+        return txt[:limit // 2] + sep + txt[len(txt) - (limit+1) // 2:]
+
+    # wide characters
+    # take characters from the left until the next one no longer fits
+    left = 0
+    lwidth = limit // 2
+    while True:
+        lwidth -= char_widths[left]
+        if lwidth < 0:
+            break
+        left += 1
+
+    # the right half additionally gets the columns
+    # that the left half was not able to fill
+    right = -1
+    rwidth = (limit+1) // 2 + (lwidth + char_widths[left])
+    while True:
+        rwidth -= char_widths[right]
+        if rwidth < 0:
+            break
+        right -= 1
+
+    # 'right' is the negative index of the last character that did not fit.
+    # Convert it to a non-negative start index, because 'txt[right+1:]'
+    # would return all of 'txt' when nothing fits (right == -1).
+    return txt[:left] + sep + txt[len(txt) + right + 1:]
+
+
 if util.WINDOWS:
    ANSI = os.environ.get("TERM") == "ANSI"
    OFFSET = 1
--- a/scripts/run_tests.sh
+++ b/scripts/run_tests.sh
@ -2,7 +2,7 @@

 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

-TESTS_CORE=(cache config cookies downloader extractor job oauth postprocessor text util)
+TESTS_CORE=(cache config cookies downloader extractor job oauth output postprocessor text util)
 TESTS_RESULTS=(results)


--- a/test/test_output.py
+++ b/test/test_output.py
@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import os
+import sys
+import unittest
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from gallery_dl import output  # noqa E402
+
+
+class TestShorten(unittest.TestCase):
+    """Tests for output.shorten_string()"""
+
+    def test_shorten_noop(self, f=output.shorten_string):
+        # strings that fit into the limit are returned unchanged
+        for text in ("", "foobar"):
+            self.assertEqual(f(text, 10), text)
+
+    def test_shorten(self, f=output.shorten_string):
+        digits = "01234567890123456789"  # string of length 20
+
+        # (limit, expected result) with the default separator
+        cases = (
+            (30, digits),
+            (25, digits),
+            (20, digits),
+            (19, "012345678…123456789"),
+            (18, "01234567…123456789"),
+            (17, "01234567…23456789"),
+            (16, "0123456…23456789"),
+            (15, "0123456…3456789"),
+            (14, "012345…3456789"),
+            (13, "012345…456789"),
+            (12, "01234…456789"),
+            (11, "01234…56789"),
+            (10, "0123…56789"),
+            (9, "0123…6789"),
+            (3, "0…9"),
+            (2, "…9"),
+        )
+        for limit, expected in cases:
+            self.assertEqual(f(digits, limit), expected)
+
+    def test_shorten_separator(self, f=output.shorten_string):
+        digits = "01234567890123456789"  # string of length 20
+
+        # (limit, separator, expected result)
+        cases = (
+            (20, "|---|", digits),
+            (19, "|---|", "0123456|---|3456789"),
+            (15, "|---|", "01234|---|56789"),
+            (10, "|---|", "01|---|789"),
+            (19, "...", "01234567...23456789"),
+            (19, "..", "01234567..123456789"),
+            (19, ".", "012345678.123456789"),
+            (19, "", "0123456780123456789"),
+        )
+        for limit, sep, expected in cases:
+            self.assertEqual(f(digits, limit, sep), expected)
+
+
+class TestShortenEAW(unittest.TestCase):
+    """Tests for output.shorten_string_eaw()"""
+
+    def test_shorten_eaw_noop(self, f=output.shorten_string_eaw):
+        # strings that fit into the limit are returned unchanged
+        for text in ("", "foobar"):
+            self.assertEqual(f(text, 10), text)
+
+    def test_shorten_eaw(self, f=output.shorten_string_eaw):
+        digits = "01234567890123456789"  # 20 ascii characters
+
+        # (limit, expected result) with the default separator
+        cases = (
+            (30, digits),
+            (25, digits),
+            (20, digits),
+            (19, "012345678…123456789"),
+            (18, "01234567…123456789"),
+            (17, "01234567…23456789"),
+            (16, "0123456…23456789"),
+            (15, "0123456…3456789"),
+            (14, "012345…3456789"),
+            (13, "012345…456789"),
+            (12, "01234…456789"),
+            (11, "01234…56789"),
+            (10, "0123…56789"),
+            (9, "0123…6789"),
+            (3, "0…9"),
+            (2, "…9"),
+        )
+        for limit, expected in cases:
+            self.assertEqual(f(digits, limit), expected)
+
+    def test_shorten_eaw_wide(self, f=output.shorten_string_eaw):
+        wide = "幻想郷幻想郷幻想郷幻想郷"  # 12 wide characters
+
+        # (limit, expected result) with the default separator
+        cases = (
+            (30, wide),
+            (25, wide),
+            (20, "幻想郷幻…想郷幻想郷"),
+            (19, "幻想郷幻…想郷幻想郷"),
+            (18, "幻想郷幻…郷幻想郷"),
+            (17, "幻想郷幻…郷幻想郷"),
+            (16, "幻想郷…郷幻想郷"),
+            (15, "幻想郷…郷幻想郷"),
+            (14, "幻想郷…幻想郷"),
+            (13, "幻想郷…幻想郷"),
+            (12, "幻想…幻想郷"),
+            (11, "幻想…幻想郷"),
+            (10, "幻想…想郷"),
+            (9, "幻想…想郷"),
+            (3, "…郷"),
+        )
+        for limit, expected in cases:
+            self.assertEqual(f(wide, limit), expected)
+
+    def test_shorten_eaw_mix(self, f=output.shorten_string_eaw):
+        mixed = "幻-想-郷##幻-想-郷##幻-想-郷"  # mixed characters
+
+        # (limit, expected result) with the default separator
+        cases = (
+            (28, mixed),
+            (25, "幻-想-郷##幻…郷##幻-想-郷"),
+            (20, "幻-想-郷#…##幻-想-郷"),
+            (19, "幻-想-郷#…#幻-想-郷"),
+            (18, "幻-想-郷…#幻-想-郷"),
+            (17, "幻-想-郷…幻-想-郷"),
+            (16, "幻-想-…#幻-想-郷"),
+            (15, "幻-想-…幻-想-郷"),
+            (14, "幻-想-…-想-郷"),
+            (13, "幻-想-…-想-郷"),
+            (12, "幻-想…-想-郷"),
+            (11, "幻-想…想-郷"),
+            (10, "幻-…-想-郷"),
+            (9, "幻-…想-郷"),
+            (3, "…郷"),
+        )
+        for limit, expected in cases:
+            self.assertEqual(f(mixed, limit), expected)
+
+    def test_shorten_eaw_separator(self, f=output.shorten_string_eaw):
+        digits = "01234567890123456789"  # 20 ascii characters
+
+        # (limit, separator, expected result)
+        cases = (
+            (20, "|---|", digits),
+            (19, "|---|", "0123456|---|3456789"),
+            (15, "|---|", "01234|---|56789"),
+            (10, "|---|", "01|---|789"),
+            (19, "...", "01234567...23456789"),
+            (19, "..", "01234567..123456789"),
+            (19, ".", "012345678.123456789"),
+            (19, "", "0123456780123456789"),
+        )
+        for limit, sep, expected in cases:
+            self.assertEqual(f(digits, limit, sep), expected)
+
+    def test_shorten_eaw_separator_wide(self, f=output.shorten_string_eaw):
+        wide = "幻想郷幻想郷幻想郷幻想郷"  # 12 wide characters
+
+        # (limit, separator, expected result)
+        cases = (
+            (24, "|---|", wide),
+            (19, "|---|", "幻想郷|---|郷幻想郷"),
+            (15, "|---|", "幻想|---|幻想郷"),
+            (10, "|---|", "幻|---|郷"),
+            (19, "...", "幻想郷幻...郷幻想郷"),
+            (19, "..", "幻想郷幻..郷幻想郷"),
+            (19, ".", "幻想郷幻.想郷幻想郷"),
+            (19, "", "幻想郷幻想郷幻想郷"),
+        )
+        for limit, sep, expected in cases:
+            self.assertEqual(f(wide, limit, sep), expected)
+
+    def test_shorten_eaw_separator_mix_(self, f=output.shorten_string_eaw):
+        mixed = "幻-想-郷##幻-想-郷##幻-想-郷"  # mixed characters
+
+        # (limit, separator, expected result)
+        cases = (
+            (30, "|---|", mixed),
+            (19, "|---|", "幻-想-|---|幻-想-郷"),
+            (15, "|---|", "幻-想|---|想-郷"),
+            (10, "|---|", "幻|---|-郷"),
+            (19, "...", "幻-想-郷...幻-想-郷"),
+            (19, "..", "幻-想-郷..#幻-想-郷"),
+            (19, ".", "幻-想-郷#.#幻-想-郷"),
+            (19, "", "幻-想-郷###幻-想-郷"),
+        )
+        for limit, sep, expected in cases:
+            self.assertEqual(f(mixed, limit, sep), expected)
+
+
+if __name__ == '__main__':
+    unittest.main()