Mypal/parser/html/java/htmlparser/generate-encoding-data.py

#!/usr/bin/python

# Copyright (c) 2013-2015 Mozilla Foundation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

import json

class Label:
  """An encoding label paired with the name of the encoding it maps to.

  Instances compare and order by ``label`` alone; ``preferred`` does not
  take part in comparisons. This lets a list of Label objects be sorted
  into label order while keeping each label attached to its encoding name.
  """

  def __init__(self, label, preferred):
    self.label = label
    self.preferred = preferred

  # Rich comparisons: Python 3 ignores __cmp__, so list.sort() needs __lt__.
  def __eq__(self, other):
    return self.label == other.label

  def __ne__(self, other):
    return self.label != other.label

  def __lt__(self, other):
    return self.label < other.label

  def __hash__(self):
    # Keep hashing consistent with __eq__ (and keep instances hashable on
    # Python 3, which otherwise disables hashing once __eq__ is defined).
    return hash(self.label)

  def __cmp__(self, other):
    # Retained for Python 2 callers; written without the cmp() builtin,
    # which does not exist on Python 3.
    return (self.label > other.label) - (self.label < other.label)

# Multi-byte encodings that have a hand-written (non-generated) decoder
# class. For any multi-byte encoding that is not named here, the generated
# class falls back to the JDK default decoder as a placeholder.
MULTI_BYTE_DECODER_IMPLEMENTED = [u"x-user-defined", u"replacement", u"big5"]

# Multi-byte encodings for which the generated class instantiates a
# dedicated encoder class instead of asking the JDK for one.
MULTI_BYTE_ENCODER_IMPLEMENTED = [u"big5"]

# Names of all encodings; populated from the encodings data further down.
preferred = []

# Label objects, one per (label, encoding name) pair; populated further down.
labels = []

# Both input files are opened relative to the current working directory.
# The with-blocks close each handle as soon as its JSON has been parsed
# instead of leaving it open for the rest of the run.
with open("../encoding/encodings.json", "r") as encodings_file:
  data = json.load(encodings_file)

with open("../encoding/indexes.json", "r") as indexes_file:
  indexes = json.load(indexes_file)

# Encodings of the "Legacy single-byte encodings" group.
single_byte = []

# Encodings of every other group.
multi_byte = []

def to_camel_name(name):
  """Derive the Java class name used for the encoding called *name*.

  "iso-8859-8-i" maps to "Iso8I" and any other "iso-8859-N" name to "IsoN".
  Every other name is title-cased and then stripped of "X-", "-" and "_".
  """
  iso_prefix = u"iso-8859-"
  if name == u"iso-8859-8-i":
    return u"Iso8I"
  if name.startswith(iso_prefix):
    return name.replace(iso_prefix, u"Iso")
  camel = name.title()
  # Order matters: "X-" has to go before the bare "-" is removed.
  for unwanted in (u"X-", u"-", u"_"):
    camel = camel.replace(unwanted, u"")
  return camel

def to_constant_name(name):
  """Derive the Java constant name for *name*: upper case, "-" as "_"."""
  shouted = name.upper()
  return shouted.replace(u"-", u"_")

# Encoding.java

# Walk every group of the encodings data: remember which encodings are
# single-byte and which are not, and collect every encoding name together
# with each of its labels.
for group in data:
  members = group["encodings"]
  if group["heading"] != "Legacy single-byte encodings":
    multi_byte.extend(members)
  else:
    single_byte = members
  for encoding in members:
    canonical = encoding["name"]
    preferred.append(canonical)
    labels.extend(Label(alias, canonical) for alias in encoding["labels"])

# The generated Java looks labels up with Arrays.binarySearch(), so both
# lists have to be emitted in sorted order.
preferred.sort()
labels.sort()

# Generate Encoding.java. The with-block guarantees that the output file is
# flushed and closed even if one of the writes below raises.
with open("src/nu/validator/encoding/Encoding.java", "w") as label_file:
  # License header, imports, class Javadoc and the start of the LABELS array.
  label_file.write("""/*
 * Copyright (c) 2015 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package nu.validator.encoding;

import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.nio.charset.spi.CharsetProvider;
import java.util.Arrays;
import java.util.Collections;
import java.util.SortedMap;
import java.util.TreeMap;

/**
 * Represents an <a href="https://encoding.spec.whatwg.org/#encoding">encoding</a>
 * as defined in the <a href="https://encoding.spec.whatwg.org/">Encoding
 * Standard</a>, provides access to each encoding defined in the Encoding
 * Standard via a static constant and provides the
 * "<a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an
 * encoding</a>" algorithm defined in the Encoding Standard.
 *
 * <p>This class inherits from {@link Charset} to allow the Encoding
 * Standard-compliant encodings to be used in contexts that support
 * <code>Charset</code> instances. However, by design, the Encoding
 * Standard-compliant encodings are not supplied via a {@link CharsetProvider}
 * and, therefore, are not available via and do not interfere with the static
 * methods provided by <code>Charset</code>. (This class provides methods of
 * the same name to hide each static method of <code>Charset</code> to help
 * avoid accidental calls to the static methods of the superclass when working
 * with Encoding Standard-compliant encodings.)
 *
 * <p>When an application needs to use a particular encoding, such as utf-8
 * or windows-1252, the corresponding constant, i.e.
 * {@link #UTF_8 Encoding.UTF_8} and {@link #WINDOWS_1252 Encoding.WINDOWS_1252}
 * respectively, should be used. However, when the application receives an
 * encoding label from external input, the method {@link #forName(String)
 * forName()} should be used to obtain the object representing the encoding
 * identified by the label. In contexts where labels that map to the
 * <a href="https://encoding.spec.whatwg.org/#replacement">replacement
 * encoding</a> should be treated as unknown, the method {@link
 * #forNameNoReplacement(String) forNameNoReplacement()} should be used instead.
 *
 *
 * @author hsivonen
 */
public abstract class Encoding extends Charset {

    private static final String[] LABELS = {
""")

  # LABELS and ENCODINGS_FOR_LABELS are parallel arrays: entry i of the
  # second one is the encoding that label i of the first one maps to.
  for label in labels:
    label_file.write("        \"%s\",\n" % label.label)

  label_file.write("""    };

    private static final Encoding[] ENCODINGS_FOR_LABELS = {
""")

  for label in labels:
    label_file.write("        %s.INSTANCE,\n" % to_camel_name(label.preferred))

  label_file.write("""    };

    private static final Encoding[] ENCODINGS = {
""")

  # One entry per encoding name, in sorted name order.
  for encoding_name in preferred:
    label_file.write("        %s.INSTANCE,\n" % to_camel_name(encoding_name))

  label_file.write("""    };

""")

  # One public constant per encoding.
  for encoding_name in preferred:
    label_file.write("""    /**
     * The %s encoding.
     */
    public static final Encoding %s = %s.INSTANCE;

""" % (encoding_name, to_constant_name(encoding_name), to_camel_name(encoding_name)))

  # The fixed remainder of the class. Backslashes are doubled so that the
  # Java escapes reach the output file verbatim.
  label_file.write("""
private static SortedMap<String, Charset> encodings = null;

    protected Encoding(String canonicalName, String[] aliases) {
        super(canonicalName, aliases);
    }

    private enum State {
        HEAD, LABEL, TAIL
    };

    public static Encoding forName(String label) {
        if (label == null) {
            throw new IllegalArgumentException("Label must not be null.");
        }
        if (label.length() == 0) {
            throw new IllegalCharsetNameException(label);
        }
        // First try the fast path
        int index = Arrays.binarySearch(LABELS, label);
        if (index >= 0) {
            return ENCODINGS_FOR_LABELS[index];
        }
        // Else, slow path
        StringBuilder sb = new StringBuilder();
        State state = State.HEAD;
        for (int i = 0; i < label.length(); i++) {
            char c = label.charAt(i);
            if ((c == ' ') || (c == '\\n') || (c == '\\r') || (c == '\\t')
                    || (c == '\\u000C')) {
                if (state == State.LABEL) {
                    state = State.TAIL;
                }
                continue;
            }
            if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
                switch (state) {
                    case HEAD:
                        state = State.LABEL;
                        // Fall through
                    case LABEL:
                        sb.append(c);
                        continue;
                    case TAIL:
                        throw new IllegalCharsetNameException(label);
                }
            }
            if (c >= 'A' && c <= 'Z') {
                c += 0x20;
                switch (state) {
                    case HEAD:
                        state = State.LABEL;
                        // Fall through
                    case LABEL:
                        sb.append(c);
                        continue;
                    case TAIL:
                        throw new IllegalCharsetNameException(label);
                }
            }
            if ((c == '-') || (c == '+') || (c == '.') || (c == ':')
                    || (c == '_')) {
                switch (state) {
                    case LABEL:
                        sb.append(c);
                        continue;
                    case HEAD:
                    case TAIL:
                        throw new IllegalCharsetNameException(label);
                }
            }
            throw new IllegalCharsetNameException(label);
        }
        index = Arrays.binarySearch(LABELS, sb.toString());
        if (index >= 0) {
            return ENCODINGS_FOR_LABELS[index];
        }
        throw new UnsupportedCharsetException(label);
    }

    public static Encoding forNameNoReplacement(String label) {
        Encoding encoding = Encoding.forName(label);
        if (encoding == Encoding.REPLACEMENT) {
            throw new UnsupportedCharsetException(label);
        }
        return encoding;
    }

    public static boolean isSupported(String label) {
        try {
            Encoding.forName(label);
        } catch (UnsupportedCharsetException e) {
            return false;
        }
        return true;
    }

    public static boolean isSupportedNoReplacement(String label) {
        try {
            Encoding.forNameNoReplacement(label);
        } catch (UnsupportedCharsetException e) {
            return false;
        }
        return true;
    }

    public static SortedMap<String, Charset> availableCharsets() {
        if (encodings == null) {
            TreeMap<String, Charset> map = new TreeMap<String, Charset>();
            for (Encoding encoding : ENCODINGS) {
                map.put(encoding.name(), encoding);
            }
            encodings = Collections.unmodifiableSortedMap(map);
        }
        return encodings;
    }

    public static Encoding defaultCharset() {
        return WINDOWS_1252;
    }

    @Override public boolean canEncode() {
        return false;
    }

    @Override public boolean contains(Charset cs) {
        return false;
    }

    @Override public CharsetEncoder newEncoder() {
        throw new UnsupportedOperationException("Encoder not implemented.");
    }
}
""")

# Single-byte encodings

# Emit one Java class per single-byte encoding. Each class carries the
# encoding's decode table, its sorted labels and its name.
for encoding in single_byte:
  name = encoding["name"]
  # NOTE: this rebinds the module-level name `labels` and sorts the list
  # that belongs to the loaded encodings data in place.
  labels = encoding["labels"]
  labels.sort()
  class_name = to_camel_name(name)
  # iso-8859-8-i is decoded with the iso-8859-8 index.
  mapping_name = name
  if mapping_name == u"iso-8859-8-i":
    mapping_name = u"iso-8859-8"
  mapping = indexes[mapping_name]
  # The with-block closes the output file even if a write raises.
  with open("src/nu/validator/encoding/%s.java" % class_name, "w") as class_file:
    class_file.write('''/*
 * Copyright (c) 2013-2015 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*
 * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
 * Instead, please regenerate using generate-encoding-data.py
 */

package nu.validator.encoding;

import java.nio.charset.CharsetDecoder;

class ''')
    class_file.write(class_name)
    class_file.write(''' extends Encoding {

    private static final char[] TABLE = {''')
    # An encoding is "fallible" as soon as one index entry is missing; the
    # missing entries are filled with U+FFFD.
    fallible = False
    table_entries = []
    for code_point in mapping:
      # XXX should we have error reporting?
      if not code_point:
        code_point = 0xFFFD
        fallible = True
      # The backslash is doubled so that a literal backslash-u reaches the
      # Java source regardless of how Python treats "\u" in string literals.
      table_entries.append("\n        '\\u%04x'" % code_point)
    class_file.write(",".join(table_entries))
    class_file.write('''
    };

    private static final String[] LABELS = {''')

    class_file.write(",".join("\n        \"%s\"" % label for label in labels))
    class_file.write('''
    };

    private static final String NAME = "''')
    class_file.write(name)
    class_file.write('''";

    static final Encoding INSTANCE = new ''')
    class_file.write(class_name)
    class_file.write('''();

    private ''')
    class_file.write(class_name)
    class_file.write('''() {
        super(NAME, LABELS);
    }

    @Override public CharsetDecoder newDecoder() {
        return new ''')
    class_file.write("Fallible" if fallible else "Infallible")
    class_file.write('''SingleByteDecoder(this, TABLE);
    }

}
''')

# Multi-byte encodings

# Emit one Java class per encoding that is not in the single-byte group.
# Decoders and encoders come from hand-written classes where those are
# listed as implemented and from the JDK otherwise.
for encoding in multi_byte:
  name = encoding["name"]
  # NOTE: this rebinds the module-level name `labels` and sorts the list
  # that belongs to the loaded encodings data in place.
  labels = encoding["labels"]
  labels.sort()
  class_name = to_camel_name(name)
  # The with-block closes the output file even if a write raises.
  with open("src/nu/validator/encoding/%s.java" % class_name, "w") as class_file:
    class_file.write('''/*
 * Copyright (c) 2013-2015 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*
 * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
 * Instead, please regenerate using generate-encoding-data.py
 */

package nu.validator.encoding;

import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;

class ''')
    class_file.write(class_name)
    class_file.write(''' extends Encoding {

    private static final String[] LABELS = {''')

    class_file.write(",".join("\n        \"%s\"" % label for label in labels))
    class_file.write('''
    };

    private static final String NAME = "''')
    class_file.write(name)
    class_file.write('''";

    static final ''')
    class_file.write(class_name)
    class_file.write(''' INSTANCE = new ''')
    class_file.write(class_name)
    class_file.write('''();

    private ''')
    class_file.write(class_name)
    class_file.write('''() {
        super(NAME, LABELS);
    }

    @Override public CharsetDecoder newDecoder() {
        ''')
    # gbk is special-cased to use the JDK's gb18030 decoder; otherwise a
    # hand-written decoder is used when listed, else the JDK decoder for
    # the encoding's own name.
    if name == "gbk":
      class_file.write('''return Charset.forName("gb18030").newDecoder();''')
    elif name in MULTI_BYTE_DECODER_IMPLEMENTED:
      class_file.write("return new %sDecoder(this);" % class_name)
    else:
      class_file.write('''return Charset.forName(NAME).newDecoder();''')
    class_file.write('''
    }

    @Override public CharsetEncoder newEncoder() {
        ''')
    if name in MULTI_BYTE_ENCODER_IMPLEMENTED:
      class_file.write("return new %sEncoder(this);" % class_name)
    else:
      class_file.write('''return Charset.forName(NAME).newEncoder();''')
    class_file.write('''
    }
}
''')

# Big5

def null_to_zero(code_point):
  """Return *code_point* unchanged, or 0 when it is None (or otherwise falsy)."""
  return code_point if code_point else 0

# The Big5 index with every missing entry replaced by 0.
index = [null_to_zero(code_point) for code_point in indexes["big5"]]

# There are four major gaps consisting of more than 4 consecutive invalid pointers
# Each gap is recorded as a (start, end) pair with an exclusive end. A gap is
# only recorded when a valid pointer follows it, so a run of invalid
# pointers at the very end of the index is not recorded.
gaps = []
run_length = 0
run_start = 0
for pointer, code_point in enumerate(index):
  if code_point != 0:
    if run_length > 4:
      gaps.append((run_start, run_start + run_length))
    run_length = 0
    continue
  if run_length == 0:
    run_start = pointer
  run_length += 1

def invert_ranges(ranges, cap):
  """Return the (start, end) ranges lying between the given *ranges*.

  *ranges* is a sequence of (start, end) pairs with exclusive ends, in
  ascending order. The result covers everything from 0 up to *cap* that
  falls outside those pairs. A pair starting at 0 produces no range in
  front of it; the final range up to *cap* is always included, even when
  it is empty.
  """
  kept = []
  cursor = 0
  for gap_start, gap_end in ranges:
    if gap_start:
      kept.append((cursor, gap_start))
    cursor = gap_end
  kept.append((cursor, cap))
  return kept

# The pointer ranges that remain once the big gaps are cut out; each of
# them is emitted as its own table string.
cap = len(index)
ranges = invert_ranges(gaps, cap)

# Now compute a compressed lookup table for astralness

# Same run detection as for the invalid pointers, except that a "gap" is
# now a run of more than 40 consecutive entries at or below U+FFFF.
gaps = []
run_length = 0
run_start = 0
for pointer, code_point in enumerate(index):
  if code_point > 0xFFFF:
    if run_length > 40:
      gaps.append((run_start, run_start + run_length))
    run_length = 0
    continue
  if run_length == 0:
    run_start = pointer
  run_length += 1

# The pointer ranges for which astralness bits are stored.
astral_ranges = invert_ranges(gaps, cap)

# Java translates Unicode escapes before it tokenizes the source, so inside
# a string literal an escaped line feed, carriage return, double quote or
# backslash would terminate or corrupt the literal. Those four code units
# are written with their ordinary Java escape sequences instead.
_JAVA_STRING_ESCAPES = {
  0x0A: '\\n',
  0x0D: '\\r',
  0x22: '\\"',
  0x5C: '\\\\',
}

def _java_string_unit(unit):
  """Return Java string-literal source text for one 16-bit code unit."""
  escaped = _JAVA_STRING_ESCAPES.get(unit)
  if escaped is None:
    escaped = '\\u%04X' % unit
  return escaped

# First pointer whose lead byte is 0xA1 (the Encoding Standard computes a
# Big5 pointer as (lead - 0x81) * 157 + trail offset). Pointers below it
# are the HKSCS part that the generated findPointer() avoids searching.
hkscs_bound = (0xA1 - 0x81) * 157

# Code points for which findPointer() returns the last pointer in the index
# that maps to them instead of the first match.
prefer_last = [
  0x2550,
  0x255E,
  0x2561,
  0x256A,
  0x5341,
  0x5345,
]

# The with-block closes the output file even if a write raises.
with open("src/nu/validator/encoding/Big5Data.java", "w") as class_file:
  class_file.write('''/*
 * Copyright (c) 2015 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/*
 * THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
 * Instead, please regenerate using generate-encoding-data.py
 */

package nu.validator.encoding;

final class Big5Data {

    private static final String ASTRALNESS = "''')

  # One bit per pointer inside the astral ranges: 1 when the code point is
  # above U+FFFF. The ranges are laid out back to back in the bit string.
  bits = []
  for (low, high) in astral_ranges:
    for code_point in index[low:high]:
      bits.append(1 if code_point > 0xFFFF else 0)
  # Pad with zero bits up to the next multiple of 16. Nothing is added when
  # the length already is a multiple of 16.
  bits.extend([0] * ((-len(bits)) % 16))

  # Pack 16 bits into each Java char, least significant bit first, which is
  # the order the generated readBit() expects.
  for base_bit in range(0, len(bits), 16):
    accu = 0
    for j in range(16):
      accu |= bits[base_bit + j] << j
    class_file.write(_java_string_unit(accu))

  class_file.write('''";

''')

  # One string constant per pointer range, holding the low 16 bits of each
  # code point in that range.
  for j, (low, high) in enumerate(ranges):
    class_file.write('''    private static final String TABLE%d = "''' % j)
    for code_point in index[low:high]:
      class_file.write(_java_string_unit(code_point & 0xFFFF))
    class_file.write('''";

''')

  class_file.write('''    private static boolean readBit(int i) {
        return (ASTRALNESS.charAt(i >> 4) & (1 << (i & 0xF))) != 0;
    }

    static char lowBits(int pointer) {
''')

  # The ranges are ascending, so the generated checks form a cascade: a
  # pointer below the start of a range sits in the gap before it.
  for j, (low, high) in enumerate(ranges):
    class_file.write('''        if (pointer < %d) {
            return '\\u0000';
        }
        if (pointer < %d) {
            return TABLE%d.charAt(pointer - %d);
        }
''' % (low, high, j, low))

  class_file.write('''        return '\\u0000';
    }

    static boolean isAstral(int pointer) {
''')

  # `base` is the position in the ASTRALNESS bit string at which the bits
  # of the current range start.
  base = 0
  for (low, high) in astral_ranges:
    if high - low == 1:
      # A range of a single pointer is answered without consulting the
      # bit string.
      class_file.write('''        if (pointer < %d) {
            return false;
        }
        if (pointer == %d) {
            return true;
        }
''' % (low, low))
    else:
      class_file.write('''        if (pointer < %d) {
            return false;
        }
        if (pointer < %d) {
            return readBit(%d + (pointer - %d));
        }
''' % (low, high, base, low))
    base += (high - low)

  class_file.write('''        return false;
    }

    public static int findPointer(char lowBits, boolean isAstral) {
        if (!isAstral) {
            switch (lowBits) {
''')

  for code_point in prefer_last:
    # Python lists don't have .rindex(), so scan backwards by hand. A code
    # point that does not occur in the index produces no case at all.
    for i in range(len(index) - 1, -1, -1):
      if index[i] == code_point:
        class_file.write('''                case 0x%04X:
                    return %d;
''' % (code_point, i))
        break

  class_file.write('''                default:
                    break;
            }
        }''')

  for j, (low, high) in enumerate(ranges):
    # Ranges that end at or before the bound are not searched at all.
    if high <= hkscs_bound:
      continue
    start = 0
    if low <= hkscs_bound < high:
      # This is the first range we don't ignore and the
      # range that contains the first non-HKSCS pointer.
      # Avoid searching HKSCS.
      start = hkscs_bound - low
    class_file.write('''
        for (int i = %d; i < TABLE%d.length(); i++) {
            if (TABLE%d.charAt(i) == lowBits) {
                int pointer = i + %d;
                if (isAstral == isAstral(pointer)) {
                    return pointer;
                }
            }
        }''' % (start, j, j, low))

  class_file.write('''
        return 0;
    }
}
''')