add a python script to clean up language data files

This strips out translations for tokens that aren't declared in LangStrings.inc.h, and cleans up whitespace. Entries are left in the order that they appear in the translation file, and blank lines and comments are left as they are apart from stripping excess whitespace.
2013-10-14 01:51:35 +01:00 · 2013-10-14 01:51:35 +01:00 · 2c44fa5d27
parent 8e64c8ab21
commit 2c44fa5d27
1 changed files with 99 additions and 0 deletions
--- a/scripts/clean_lang_data.py
+++ b/scripts/clean_lang_data.py
@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# vim: set ts=8 sts=4 sw=4 expandtab autoindent fileencoding=utf-8:
+
+import sys
+import os
+from optparse import OptionParser
+import re
+import fnmatch
+import glob
+
+RX_IDENT = re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*')
+RX_DECLARATION = re.compile(r'DECLARE_STRING\(([a-zA-Z_][a-zA-Z0-9_]*)\)')
+
+def parse_header(fl):
+    tokens = []
+    for line in fl:
+        line = line.strip()
+        m = RX_DECLARATION.match(line)
+        if m is not None:
+            tokens.append(m.group(1))
+    return tokens
+
+def strip_unknown_entries(fl, tokens, lines):
+    head = True
+    key = ''
+    value = ''
+    for line in lines:
+        line = line.strip()
+        if line == '' or line.startswith('#'):
+            fl.write(line)
+            fl.write('\n')
+            continue
+        if head:
+            if RX_IDENT.match(line):
+                head = False
+                key = line
+            else:
+                raise Exception('bad syntax')
+        else:
+            if line.startswith('"') and line.endswith('"'):
+                line = line[1:-1]
+            value = line.replace('\\n', '\n')
+            if key in tokens:
+                fl.write(key)
+                fl.write('\n    ')
+                add_quotes = (value != value.strip())
+                if add_quotes:
+                    fl.write('"')
+                fl.write(value.replace('\n', '\\n'))
+                if add_quotes:
+                    fl.write('"')
+                fl.write('\n')
+            else:
+                sys.stderr.write("# skipping unknown token '" + key + "'\n")
+            key = ''
+            head = True
+
+def main():
+    oparse = OptionParser(usage='%prog [options] header data-files')
+    (options, args) = oparse.parse_args()
+
+    # if no input files are specified, default to reading from stdin
+    if len(args) == 0:
+        oparse.error('You must specify the language header')
+
+    headerpath = args[0]
+    inpaths = args[1:]
+
+    with open(headerpath, 'rU') as fl:
+        tokens = parse_header(fl)
+        tokens = set(tokens)
+
+    for path in inpaths:
+        try:
+            with open(path, 'rU') as fl:
+                # skip an optional UTF-8 Byte Order Mark
+                if sys.version_info[0] >= 3:
+                    hasbom = (fl.read(1) == '\uFEFF')
+                else:
+                    hasbom = (fl.read(3) == '\xef\xbb\xbf')
+                if hasbom:
+                    sys.stderr.write("Warning: file '" + path + "' uses a UTF-8 Byte Order Mark\n")
+                else:
+                    fl.seek(0)
+
+                lines = fl.readlines()
+
+            with open(path, 'w', encoding='utf-8') as fl:
+                strip_unknown_entries(fl, tokens, lines)
+        except UnicodeDecodeError as e:
+            if path == '-':
+                prettypath = 'input'
+            else:
+                prettypath = "'" + path + "'"
+            sys.stderr.write("Warning: UTF-8 decode error in " + prettypath + ":\n")
+            sys.stderr.write('    ' + str(e) + '\n')
+
+if __name__ == '__main__' and not sys.flags.interactive:
+    main()