746 lines
22 KiB
Python
746 lines
22 KiB
Python
#!/usr/bin/python
|
|
|
|
# Copyright (c) 2013-2015 Mozilla Foundation
|
|
#
|
|
# Permission is hereby granted, free of charge, to any person obtaining a
|
|
# copy of this software and associated documentation files (the "Software"),
|
|
# to deal in the Software without restriction, including without limitation
|
|
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
# and/or sell copies of the Software, and to permit persons to whom the
|
|
# Software is furnished to do so, subject to the following conditions:
|
|
#
|
|
# The above copyright notice and this permission notice shall be included in
|
|
# all copies or substantial portions of the Software.
|
|
#
|
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
# DEALINGS IN THE SOFTWARE.
|
|
|
|
import json
|
|
|
|
class Label:
    """An encoding label paired with the name of the encoding it maps to.

    Instances compare and sort by ``label`` only; ``preferred`` does not
    take part in comparisons.
    """

    def __init__(self, label, preferred):
        # label: one label string of an encoding.
        # preferred: the name of the encoding that the label belongs to.
        self.label = label
        self.preferred = preferred

    def __cmp__(self, other):
        # Python 2 ordering hook. Written without the cmp() builtin so the
        # method itself also works where cmp() does not exist.
        return (self.label > other.label) - (self.label < other.label)

    # Python 3 ignores __cmp__, so list.sort() needs rich comparisons.
    def __eq__(self, other):
        return self.label == other.label

    def __ne__(self, other):
        return self.label != other.label

    def __lt__(self, other):
        return self.label < other.label
|
|
|
|
# Multi-byte encodings listed here are assumed to ship a hand-written
# (non-generated) decoder class. Any multi-byte encoding that is missing
# from the list falls back to the JDK default decoder as a placeholder.
MULTI_BYTE_DECODER_IMPLEMENTED = [u"x-user-defined", u"replacement", u"big5"]

# Multi-byte encodings whose generated class returns its own encoder class
# from newEncoder() instead of delegating to Charset.forName(NAME).
MULTI_BYTE_ENCODER_IMPLEMENTED = [u"big5"]
|
|
|
|
# Names of all encodings, filled in while walking the encoding groups.
preferred = []

# Label objects for every label of every encoding.
labels = []

# The input files are opened with "with" so the handles are closed as soon
# as parsing finishes, including when json.load() raises.
with open("../encoding/encodings.json", "r") as encodings_file:
    data = json.load(encodings_file)

with open("../encoding/indexes.json", "r") as indexes_file:
    indexes = json.load(indexes_file)

# Encoding entries, split by whether their group is the single-byte one.
single_byte = []

multi_byte = []
|
|
|
|
def to_camel_name(name):
    """Turn an encoding name into the identifier used for its Java class.

    "iso-8859-8-i" becomes "Iso8I" and any other "iso-8859-N" name becomes
    "IsoN". Every other name is title-cased, then has "X-", "-" and "_"
    removed.
    """
    iso_prefix = u"iso-8859-"
    if name == u"iso-8859-8-i":
        return u"Iso8I"
    if name.startswith(iso_prefix):
        return name.replace(iso_prefix, u"Iso")
    camel = name.title()
    # Order matters: "X-" has to go before the bare "-" is stripped.
    for fragment in (u"X-", u"-", u"_"):
        camel = camel.replace(fragment, u"")
    return camel
|
|
|
|
def to_constant_name(name):
    """Return *name* as a Java constant: upper-cased, "-" replaced by "_"."""
    shouted = name.upper()
    return shouted.replace(u"-", u"_")
|
|
|
|
# Encoding.java
|
|
|
|
# Walk every group of the encodings data: sort the encodings into the
# single-byte / multi-byte buckets and gather all names and labels.
for group in data:
    encodings = group["encodings"]
    if group["heading"] == "Legacy single-byte encodings":
        single_byte = encodings
    else:
        multi_byte.extend(encodings)
    for encoding in encodings:
        canonical = encoding["name"]
        preferred.append(canonical)
        labels.extend(Label(label, canonical) for label in encoding["labels"])

preferred.sort()
# Label objects order by their label text, so sorting on it is equivalent.
labels.sort(key=lambda item: item.label)
|
|
|
|
# Write Encoding.java: the sorted label table, the parallel table of
# encoding instances, the table of all encodings, one public constant per
# encoding, and the fixed lookup methods.
# NOTE(review): whitespace inside the Java templates is reproduced exactly
# as found in this copy of the script.
# The "with" block closes the output file even if a write raises.
with open("src/nu/validator/encoding/Encoding.java", "w") as label_file:
    label_file.write("""/*
* Copyright (c) 2015 Mozilla Foundation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/

package nu.validator.encoding;

import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.nio.charset.spi.CharsetProvider;
import java.util.Arrays;
import java.util.Collections;
import java.util.SortedMap;
import java.util.TreeMap;

/**
* Represents an <a href="https://encoding.spec.whatwg.org/#encoding">encoding</a>
* as defined in the <a href="https://encoding.spec.whatwg.org/">Encoding
* Standard</a>, provides access to each encoding defined in the Encoding
* Standard via a static constant and provides the
* "<a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an
* encoding</a>" algorithm defined in the Encoding Standard.
*
* <p>This class inherits from {@link Charset} to allow the Encoding
* Standard-compliant encodings to be used in contexts that support
* <code>Charset</code> instances. However, by design, the Encoding
* Standard-compliant encodings are not supplied via a {@link CharsetProvider}
* and, therefore, are not available via and do not interfere with the static
* methods provided by <code>Charset</code>. (This class provides methods of
* the same name to hide each static method of <code>Charset</code> to help
* avoid accidental calls to the static methods of the superclass when working
* with Encoding Standard-compliant encodings.)
*
* <p>When an application needs to use a particular encoding, such as utf-8
* or windows-1252, the corresponding constant, i.e.
* {@link #UTF_8 Encoding.UTF_8} and {@link #WINDOWS_1252 Encoding.WINDOWS_1252}
* respectively, should be used. However, when the application receives an
* encoding label from external input, the method {@link #forName(String)
* forName()} should be used to obtain the object representing the encoding
* identified by the label. In contexts where labels that map to the
* <a href="https://encoding.spec.whatwg.org/#replacement">replacement
* encoding</a> should be treated as unknown, the method {@link
* #forNameNoReplacement(String) forNameNoReplacement()} should be used instead.
*
*
* @author hsivonen
*/
public abstract class Encoding extends Charset {

private static final String[] LABELS = {
""")

    # LABELS and ENCODINGS_FOR_LABELS are parallel arrays: both are written
    # in the same (sorted) label order, which the Java binary search needs.
    for label in labels:
        label_file.write(" \"%s\",\n" % label.label)

    label_file.write(""" };

private static final Encoding[] ENCODINGS_FOR_LABELS = {
""")

    for label in labels:
        label_file.write(" %s.INSTANCE,\n" % to_camel_name(label.preferred))

    label_file.write(""" };

private static final Encoding[] ENCODINGS = {
""")

    for label in preferred:
        label_file.write(" %s.INSTANCE,\n" % to_camel_name(label))

    label_file.write(""" };

""")

    # One documented public constant per encoding name.
    for label in preferred:
        label_file.write(""" /**
* The %s encoding.
*/
public static final Encoding %s = %s.INSTANCE;

""" % (label, to_constant_name(label), to_camel_name(label)))

    label_file.write("""
private static SortedMap<String, Charset> encodings = null;

protected Encoding(String canonicalName, String[] aliases) {
super(canonicalName, aliases);
}

private enum State {
HEAD, LABEL, TAIL
};

public static Encoding forName(String label) {
if (label == null) {
throw new IllegalArgumentException("Label must not be null.");
}
if (label.length() == 0) {
throw new IllegalCharsetNameException(label);
}
// First try the fast path
int index = Arrays.binarySearch(LABELS, label);
if (index >= 0) {
return ENCODINGS_FOR_LABELS[index];
}
// Else, slow path
StringBuilder sb = new StringBuilder();
State state = State.HEAD;
for (int i = 0; i < label.length(); i++) {
char c = label.charAt(i);
if ((c == ' ') || (c == '\\n') || (c == '\\r') || (c == '\\t')
|| (c == '\\u000C')) {
if (state == State.LABEL) {
state = State.TAIL;
}
continue;
}
if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
switch (state) {
case HEAD:
state = State.LABEL;
// Fall through
case LABEL:
sb.append(c);
continue;
case TAIL:
throw new IllegalCharsetNameException(label);
}
}
if (c >= 'A' && c <= 'Z') {
c += 0x20;
switch (state) {
case HEAD:
state = State.LABEL;
// Fall through
case LABEL:
sb.append(c);
continue;
case TAIL:
throw new IllegalCharsetNameException(label);
}
}
if ((c == '-') || (c == '+') || (c == '.') || (c == ':')
|| (c == '_')) {
switch (state) {
case LABEL:
sb.append(c);
continue;
case HEAD:
case TAIL:
throw new IllegalCharsetNameException(label);
}
}
throw new IllegalCharsetNameException(label);
}
index = Arrays.binarySearch(LABELS, sb.toString());
if (index >= 0) {
return ENCODINGS_FOR_LABELS[index];
}
throw new UnsupportedCharsetException(label);
}

public static Encoding forNameNoReplacement(String label) {
Encoding encoding = Encoding.forName(label);
if (encoding == Encoding.REPLACEMENT) {
throw new UnsupportedCharsetException(label);
}
return encoding;
}

public static boolean isSupported(String label) {
try {
Encoding.forName(label);
} catch (UnsupportedCharsetException e) {
return false;
}
return true;
}

public static boolean isSupportedNoReplacement(String label) {
try {
Encoding.forNameNoReplacement(label);
} catch (UnsupportedCharsetException e) {
return false;
}
return true;
}

public static SortedMap<String, Charset> availableCharsets() {
if (encodings == null) {
TreeMap<String, Charset> map = new TreeMap<String, Charset>();
for (Encoding encoding : ENCODINGS) {
map.put(encoding.name(), encoding);
}
encodings = Collections.unmodifiableSortedMap(map);
}
return encodings;
}

public static Encoding defaultCharset() {
return WINDOWS_1252;
}

@Override public boolean canEncode() {
return false;
}

@Override public boolean contains(Charset cs) {
return false;
}

@Override public CharsetEncoder newEncoder() {
throw new UnsupportedOperationException("Encoder not implemented.");
}
}
""")
|
|
|
|
# Single-byte encodings
|
|
|
|
# Emit one Java class per single-byte encoding. Each class carries the
# decode table taken from the indexes data, the sorted labels, the name and
# a newDecoder() that returns a Fallible or Infallible single-byte decoder.
# NOTE(review): whitespace inside the Java templates is reproduced exactly
# as found in this copy of the script.
for encoding in single_byte:
    name = encoding["name"]
    labels = encoding["labels"]
    labels.sort()
    class_name = to_camel_name(name)
    # iso-8859-8-i reuses the iso-8859-8 index.
    mapping_name = name
    if mapping_name == u"iso-8859-8-i":
        mapping_name = u"iso-8859-8"
    mapping = indexes[mapping_name]

    # Build the table entries first; a missing entry becomes U+FFFD and
    # marks the encoding as fallible.
    # XXX should we have error reporting?
    fallible = False
    table_entries = []
    for code_point in mapping:
        if not code_point:
            code_point = 0xFFFD
            fallible = True
        # "\\u" keeps the literal backslash-u on Python 2 and, unlike a bare
        # "\u" in a plain string, is also valid Python 3 source.
        table_entries.append("\n '\\u%04x'" % code_point)

    # The "with" block closes the output file even if a write raises.
    with open("src/nu/validator/encoding/%s.java" % class_name, "w") as class_file:
        class_file.write('''/*
* Copyright (c) 2013-2015 Mozilla Foundation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/

/*
* THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
* Instead, please regenerate using generate-encoding-data.py
*/

package nu.validator.encoding;

import java.nio.charset.CharsetDecoder;

class ''')
        class_file.write(class_name)
        class_file.write(''' extends Encoding {

private static final char[] TABLE = {''')
        # Entries are comma-separated with no trailing comma.
        class_file.write(",".join(table_entries))
        class_file.write('''
};

private static final String[] LABELS = {''')
        class_file.write(",".join("\n \"%s\"" % label for label in labels))
        class_file.write('''
};

private static final String NAME = "''')
        class_file.write(name)
        class_file.write('''";

static final Encoding INSTANCE = new ''')
        class_file.write(class_name)
        class_file.write('''();

private ''')
        class_file.write(class_name)
        class_file.write('''() {
super(NAME, LABELS);
}

@Override public CharsetDecoder newDecoder() {
return new ''')
        class_file.write("Fallible" if fallible else "Infallible")
        class_file.write('''SingleByteDecoder(this, TABLE);
}

}
''')
|
|
|
|
# Multi-byte encodings
|
|
|
|
# Emit one Java class per multi-byte encoding. The class carries the sorted
# labels and the name; newDecoder()/newEncoder() return the hand-written
# implementation when the encoding is on the matching *_IMPLEMENTED list and
# otherwise delegate to Charset.forName().
# NOTE(review): whitespace inside the Java templates is reproduced exactly
# as found in this copy of the script.
for encoding in multi_byte:
    name = encoding["name"]
    labels = encoding["labels"]
    labels.sort()
    class_name = to_camel_name(name)

    # The "with" block closes the output file even if a write raises.
    with open("src/nu/validator/encoding/%s.java" % class_name, "w") as class_file:
        class_file.write('''/*
* Copyright (c) 2013-2015 Mozilla Foundation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/

/*
* THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
* Instead, please regenerate using generate-encoding-data.py
*/

package nu.validator.encoding;

import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;

class ''')
        class_file.write(class_name)
        class_file.write(''' extends Encoding {

private static final String[] LABELS = {''')
        # Labels are comma-separated with no trailing comma.
        class_file.write(",".join("\n \"%s\"" % label for label in labels))
        class_file.write('''
};

private static final String NAME = "''')
        class_file.write(name)
        class_file.write('''";

static final ''')
        class_file.write(class_name)
        class_file.write(''' INSTANCE = new ''')
        class_file.write(class_name)
        class_file.write('''();

private ''')
        class_file.write(class_name)
        class_file.write('''() {
super(NAME, LABELS);
}

@Override public CharsetDecoder newDecoder() {
''')
        # gbk is special-cased to use the JDK gb18030 decoder.
        if name == "gbk":
            class_file.write('''return Charset.forName("gb18030").newDecoder();''')
        elif name in MULTI_BYTE_DECODER_IMPLEMENTED:
            class_file.write("return new %sDecoder(this);" % class_name)
        else:
            class_file.write('''return Charset.forName(NAME).newDecoder();''')
        class_file.write('''
}

@Override public CharsetEncoder newEncoder() {
''')
        if name in MULTI_BYTE_ENCODER_IMPLEMENTED:
            class_file.write("return new %sEncoder(this);" % class_name)
        else:
            class_file.write('''return Charset.forName(NAME).newEncoder();''')
        class_file.write('''
}
}
''')
|
|
|
|
# Big5
|
|
|
|
def null_to_zero(code_point):
    """Return *code_point* unchanged, or 0 when it is None or otherwise falsy."""
    return code_point or 0
|
|
|
|
# The Big5 index with every missing entry replaced by 0.
index = [null_to_zero(code_point) for code_point in indexes["big5"]]

# There are four major gaps consisting of more than 4 consecutive invalid pointers
# Each gap is recorded as a half-open (start, end) pointer range. A run is
# only recorded once a valid pointer follows it.
gaps = []
consecutive = 0
consecutive_start = 0
for offset, code_point in enumerate(index):
    if code_point != 0:
        # A valid pointer ends the current run; keep it if long enough.
        if consecutive > 4:
            gaps.append((consecutive_start, consecutive_start + consecutive))
        consecutive = 0
        continue
    if consecutive == 0:
        consecutive_start = offset
    consecutive += 1
|
|
|
|
def invert_ranges(ranges, cap):
    """Return the half-open ranges of [0, cap) that lie between *ranges*.

    *ranges* is a sequence of (start, end) pairs. The span before a range
    that starts at 0 is skipped; the span from the last range's end up to
    *cap* is always appended.
    """
    complement = []
    cursor = 0
    for gap_start, gap_end in ranges:
        if gap_start != 0:
            complement.append((cursor, gap_start))
        cursor = gap_end
    complement.append((cursor, cap))
    return complement
|
|
|
|
cap = len(index)
# Pointer ranges that remain once the long invalid gaps are cut out.
ranges = invert_ranges(gaps, cap)

# Now compute a compressed lookup table for astralness
# Runs of more than 40 consecutive pointers at or below U+FFFF are treated
# as gaps; only what lies between them ends up in the astralness table.
gaps = []
consecutive = 0
consecutive_start = 0
for offset, code_point in enumerate(index):
    if code_point > 0xFFFF:
        # An astral code point ends the current run; keep it if long enough.
        if consecutive > 40:
            gaps.append((consecutive_start, consecutive_start + consecutive))
        consecutive = 0
        continue
    if consecutive == 0:
        consecutive_start = offset
    consecutive += 1

astral_ranges = invert_ranges(gaps, cap)
|
|
|
|
# Code units that must not be written as a Unicode escape inside a Java
# string literal: Java translates Unicode escapes before tokenizing, so the
# escape for a quote, backslash, LF or CR would break the literal.
_JAVA_STRING_ESCAPES = {0x22: '\\"', 0x5C: '\\\\', 0x0A: '\\n', 0x0D: '\\r'}


def _java_string_char(code_unit):
    """Return Java source text for one 16-bit code unit in a string literal."""
    special = _JAVA_STRING_ESCAPES.get(code_unit)
    if special is not None:
        return special
    return '\\u%04X' % code_unit


# Write Big5Data.java: the packed astralness bits, one TABLEn string per
# pointer range, and the lowBits() / isAstral() / findPointer() lookups.
# NOTE(review): whitespace inside the Java templates is reproduced exactly
# as found in this copy of the script.
# The "with" block closes the output file even if a write raises.
with open("src/nu/validator/encoding/Big5Data.java", "w") as class_file:
    class_file.write('''/*
* Copyright (c) 2015 Mozilla Foundation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/

/*
* THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
* Instead, please regenerate using generate-encoding-data.py
*/

package nu.validator.encoding;

final class Big5Data {

private static final String ASTRALNESS = "''')

    # One bit per pointer inside the astral ranges: 1 when the code point
    # is above U+FFFF.
    bits = []
    for (low, high) in astral_ranges:
        for i in range(low, high):
            bits.append(1 if index[i] > 0xFFFF else 0)
    # pad length to multiple of 16 (adds nothing when it already is one)
    bits.extend([0] * (-len(bits) % 16))

    # Pack 16 bits, least significant first, into each Java char.
    for i in range(0, len(bits), 16):
        accu = 0
        for j in range(16):
            accu |= bits[i + j] << j
        class_file.write(_java_string_char(accu))

    class_file.write('''";

''')

    # One string constant per pointer range, holding the low 16 bits of
    # every code point in that range.
    for j, (low, high) in enumerate(ranges):
        class_file.write(''' private static final String TABLE%d = "''' % j)
        class_file.write(
            "".join(_java_string_char(index[i] & 0xFFFF) for i in range(low, high)))
        class_file.write('''";

''')

    class_file.write(''' private static boolean readBit(int i) {
return (ASTRALNESS.charAt(i >> 4) & (1 << (i & 0xF))) != 0;
}

static char lowBits(int pointer) {
''')

    # Ranges are visited in ascending order, so "pointer < low" means the
    # pointer fell into the gap before this range.
    for j, (low, high) in enumerate(ranges):
        class_file.write(''' if (pointer < %d) {
return '\\u0000';
}
if (pointer < %d) {
return TABLE%d.charAt(pointer - %d);
}
''' % (low, high, j, low))

    class_file.write(''' return '\\u0000';
}

static boolean isAstral(int pointer) {
''')

    # base is the bit offset of the current range inside ASTRALNESS. It
    # advances for every range, because every range contributed bits above.
    base = 0
    for (low, high) in astral_ranges:
        if high - low == 1:
            class_file.write(''' if (pointer < %d) {
return false;
}
if (pointer == %d) {
return true;
}
''' % (low, low))
        else:
            class_file.write(''' if (pointer < %d) {
return false;
}
if (pointer < %d) {
return readBit(%d + (pointer - %d));
}
''' % (low, high, base, low))
        base += (high - low)

    class_file.write(''' return false;
}

public static int findPointer(char lowBits, boolean isAstral) {
if (!isAstral) {
switch (lowBits) {
''')

    # First pointer after the (0xA1 - 0x81) * 157 pointers that the table
    # search below deliberately skips.
    hkscs_bound = (0xA1 - 0x81) * 157

    # Code points for which the last matching pointer is emitted as a
    # dedicated switch case.
    prefer_last = [
        0x2550,
        0x255E,
        0x2561,
        0x256A,
        0x5341,
        0x5345,
    ]

    for code_point in prefer_last:
        # Python lists don't have .rindex() :-(
        for i in range(len(index) - 1, -1, -1):
            candidate = index[i]
            if candidate == code_point:
                class_file.write(''' case 0x%04X:
return %d;
''' % (code_point, i))
                break

    class_file.write(''' default:
break;
}
}''')

    # j is the TABLE number, so it counts every range, including skipped ones.
    for j, (low, high) in enumerate(ranges):
        if high <= hkscs_bound:
            continue
        start = 0
        if low <= hkscs_bound < high:
            # This is the first range we don't ignore and the
            # range that contains the first non-HKSCS pointer.
            # Avoid searching HKSCS.
            start = hkscs_bound - low
        class_file.write('''
for (int i = %d; i < TABLE%d.length(); i++) {
if (TABLE%d.charAt(i) == lowBits) {
int pointer = i + %d;
if (isAstral == isAstral(pointer)) {
return pointer;
}
}
}''' % (start, j, j, low))

    class_file.write('''
return 0;
}
}
''')
|