Mypal/parser/html/java/htmlparser/src/nu/validator/encoding/Encoding.java

887 lines
23 KiB
Java

/*
* Copyright (c) 2015 Mozilla Foundation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
package nu.validator.encoding;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.nio.charset.spi.CharsetProvider;
import java.util.Arrays;
import java.util.Collections;
import java.util.SortedMap;
import java.util.TreeMap;
/**
* Represents an <a href="https://encoding.spec.whatwg.org/#encoding">encoding</a>
* as defined in the <a href="https://encoding.spec.whatwg.org/">Encoding
* Standard</a>, provides access to each encoding defined in the Encoding
* Standard via a static constant and provides the
* "<a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an
* encoding</a>" algorithm defined in the Encoding Standard.
*
* <p>This class inherits from {@link Charset} to allow the Encoding
* Standard-compliant encodings to be used in contexts that support
* <code>Charset</code> instances. However, by design, the Encoding
* Standard-compliant encodings are not supplied via a {@link CharsetProvider}
* and, therefore, are not available via and do not interfere with the static
* methods provided by <code>Charset</code>. (This class provides methods of
* the same name to hide each static method of <code>Charset</code> to help
* avoid accidental calls to the static methods of the superclass when working
* with Encoding Standard-compliant encodings.)
*
* <p>When an application needs to use a particular encoding, such as utf-8
* or windows-1252, the corresponding constant, i.e.
* {@link #UTF_8 Encoding.UTF_8} and {@link #WINDOWS_1252 Encoding.WINDOWS_1252}
* respectively, should be used. However, when the application receives an
* encoding label from external input, the method {@link #forName(String)
* forName()} should be used to obtain the object representing the encoding
* identified by the label. In contexts where labels that map to the
* <a href="https://encoding.spec.whatwg.org/#replacement">replacement
* encoding</a> should be treated as unknown, the method {@link
* #forNameNoReplacement(String) forNameNoReplacement()} should be used instead.
*
*
* @author hsivonen
*/
public abstract class Encoding extends Charset {

    /**
     * Every label recognized by {@link #forName(String)}, in ascending
     * {@link String#compareTo(String)} order.
     *
     * <p>The order is load-bearing: labels are located with
     * {@link Arrays#binarySearch(Object[], Object)}, and the index of a label
     * in this array is the index of its encoding in
     * {@link #ENCODINGS_FOR_LABELS}. Any edit must keep this array sorted and
     * keep both arrays the same length.
     */
    private static final String[] LABELS = {
        "866",
        "ansi_x3.4-1968",
        "arabic",
        "ascii",
        "asmo-708",
        "big5",
        "big5-hkscs",
        "chinese",
        "cn-big5",
        "cp1250",
        "cp1251",
        "cp1252",
        "cp1253",
        "cp1254",
        "cp1255",
        "cp1256",
        "cp1257",
        "cp1258",
        "cp819",
        "cp866",
        "csbig5",
        "cseuckr",
        "cseucpkdfmtjapanese",
        "csgb2312",
        "csibm866",
        "csiso2022jp",
        "csiso2022kr",
        "csiso58gb231280",
        "csiso88596e",
        "csiso88596i",
        "csiso88598e",
        "csiso88598i",
        "csisolatin1",
        "csisolatin2",
        "csisolatin3",
        "csisolatin4",
        "csisolatin5",
        "csisolatin6",
        "csisolatin9",
        "csisolatinarabic",
        "csisolatincyrillic",
        "csisolatingreek",
        "csisolatinhebrew",
        "cskoi8r",
        "csksc56011987",
        "csmacintosh",
        "csshiftjis",
        "cyrillic",
        "dos-874",
        "ecma-114",
        "ecma-118",
        "elot_928",
        "euc-jp",
        "euc-kr",
        "gb18030",
        "gb2312",
        "gb_2312",
        "gb_2312-80",
        "gbk",
        "greek",
        "greek8",
        "hebrew",
        "hz-gb-2312",
        "ibm819",
        "ibm866",
        "iso-2022-cn",
        "iso-2022-cn-ext",
        "iso-2022-jp",
        "iso-2022-kr",
        "iso-8859-1",
        "iso-8859-10",
        "iso-8859-11",
        "iso-8859-13",
        "iso-8859-14",
        "iso-8859-15",
        "iso-8859-16",
        "iso-8859-2",
        "iso-8859-3",
        "iso-8859-4",
        "iso-8859-5",
        "iso-8859-6",
        "iso-8859-6-e",
        "iso-8859-6-i",
        "iso-8859-7",
        "iso-8859-8",
        "iso-8859-8-e",
        "iso-8859-8-i",
        "iso-8859-9",
        "iso-ir-100",
        "iso-ir-101",
        "iso-ir-109",
        "iso-ir-110",
        "iso-ir-126",
        "iso-ir-127",
        "iso-ir-138",
        "iso-ir-144",
        "iso-ir-148",
        "iso-ir-149",
        "iso-ir-157",
        "iso-ir-58",
        "iso8859-1",
        "iso8859-10",
        "iso8859-11",
        "iso8859-13",
        "iso8859-14",
        "iso8859-15",
        "iso8859-2",
        "iso8859-3",
        "iso8859-4",
        "iso8859-5",
        "iso8859-6",
        "iso8859-7",
        "iso8859-8",
        "iso8859-9",
        "iso88591",
        "iso885910",
        "iso885911",
        "iso885913",
        "iso885914",
        "iso885915",
        "iso88592",
        "iso88593",
        "iso88594",
        "iso88595",
        "iso88596",
        "iso88597",
        "iso88598",
        "iso88599",
        "iso_8859-1",
        "iso_8859-15",
        "iso_8859-1:1987",
        "iso_8859-2",
        "iso_8859-2:1987",
        "iso_8859-3",
        "iso_8859-3:1988",
        "iso_8859-4",
        "iso_8859-4:1988",
        "iso_8859-5",
        "iso_8859-5:1988",
        "iso_8859-6",
        "iso_8859-6:1987",
        "iso_8859-7",
        "iso_8859-7:1987",
        "iso_8859-8",
        "iso_8859-8:1988",
        "iso_8859-9",
        "iso_8859-9:1989",
        "koi",
        "koi8",
        "koi8-r",
        "koi8-ru",
        "koi8-u",
        "koi8_r",
        "korean",
        "ks_c_5601-1987",
        "ks_c_5601-1989",
        "ksc5601",
        "ksc_5601",
        "l1",
        "l2",
        "l3",
        "l4",
        "l5",
        "l6",
        "l9",
        "latin1",
        "latin2",
        "latin3",
        "latin4",
        "latin5",
        "latin6",
        "logical",
        "mac",
        "macintosh",
        "ms932",
        "ms_kanji",
        "shift-jis",
        "shift_jis",
        "sjis",
        "sun_eu_greek",
        "tis-620",
        "unicode-1-1-utf-8",
        "us-ascii",
        "utf-16",
        "utf-16be",
        "utf-16le",
        "utf-8",
        "utf8",
        "visual",
        "windows-1250",
        "windows-1251",
        "windows-1252",
        "windows-1253",
        "windows-1254",
        "windows-1255",
        "windows-1256",
        "windows-1257",
        "windows-1258",
        "windows-31j",
        "windows-874",
        "windows-949",
        "x-cp1250",
        "x-cp1251",
        "x-cp1252",
        "x-cp1253",
        "x-cp1254",
        "x-cp1255",
        "x-cp1256",
        "x-cp1257",
        "x-cp1258",
        "x-euc-jp",
        "x-gbk",
        "x-mac-cyrillic",
        "x-mac-roman",
        "x-mac-ukrainian",
        "x-sjis",
        "x-user-defined",
        "x-x-big5",
    };

    /**
     * The encoding for each entry of {@link #LABELS}; element <i>i</i> is the
     * encoding that {@code LABELS[i]} resolves to. Must stay index-aligned
     * with {@link #LABELS}.
     */
    private static final Encoding[] ENCODINGS_FOR_LABELS = {
        Ibm866.INSTANCE,
        Windows1252.INSTANCE,
        Iso6.INSTANCE,
        Windows1252.INSTANCE,
        Iso6.INSTANCE,
        Big5.INSTANCE,
        Big5.INSTANCE,
        Gbk.INSTANCE,
        Big5.INSTANCE,
        Windows1250.INSTANCE,
        Windows1251.INSTANCE,
        Windows1252.INSTANCE,
        Windows1253.INSTANCE,
        Windows1254.INSTANCE,
        Windows1255.INSTANCE,
        Windows1256.INSTANCE,
        Windows1257.INSTANCE,
        Windows1258.INSTANCE,
        Windows1252.INSTANCE,
        Ibm866.INSTANCE,
        Big5.INSTANCE,
        EucKr.INSTANCE,
        EucJp.INSTANCE,
        Gbk.INSTANCE,
        Ibm866.INSTANCE,
        Iso2022Jp.INSTANCE,
        Replacement.INSTANCE,
        Gbk.INSTANCE,
        Iso6.INSTANCE,
        Iso6.INSTANCE,
        Iso8.INSTANCE,
        Iso8I.INSTANCE,
        Windows1252.INSTANCE,
        Iso2.INSTANCE,
        Iso3.INSTANCE,
        Iso4.INSTANCE,
        Windows1254.INSTANCE,
        Iso10.INSTANCE,
        Iso15.INSTANCE,
        Iso6.INSTANCE,
        Iso5.INSTANCE,
        Iso7.INSTANCE,
        Iso8.INSTANCE,
        Koi8R.INSTANCE,
        EucKr.INSTANCE,
        Macintosh.INSTANCE,
        ShiftJis.INSTANCE,
        Iso5.INSTANCE,
        Windows874.INSTANCE,
        Iso6.INSTANCE,
        Iso7.INSTANCE,
        Iso7.INSTANCE,
        EucJp.INSTANCE,
        EucKr.INSTANCE,
        Gb18030.INSTANCE,
        Gbk.INSTANCE,
        Gbk.INSTANCE,
        Gbk.INSTANCE,
        Gbk.INSTANCE,
        Iso7.INSTANCE,
        Iso7.INSTANCE,
        Iso8.INSTANCE,
        Replacement.INSTANCE,
        Windows1252.INSTANCE,
        Ibm866.INSTANCE,
        Replacement.INSTANCE,
        Replacement.INSTANCE,
        Iso2022Jp.INSTANCE,
        Replacement.INSTANCE,
        Windows1252.INSTANCE,
        Iso10.INSTANCE,
        Windows874.INSTANCE,
        Iso13.INSTANCE,
        Iso14.INSTANCE,
        Iso15.INSTANCE,
        Iso16.INSTANCE,
        Iso2.INSTANCE,
        Iso3.INSTANCE,
        Iso4.INSTANCE,
        Iso5.INSTANCE,
        Iso6.INSTANCE,
        Iso6.INSTANCE,
        Iso6.INSTANCE,
        Iso7.INSTANCE,
        Iso8.INSTANCE,
        Iso8.INSTANCE,
        Iso8I.INSTANCE,
        Windows1254.INSTANCE,
        Windows1252.INSTANCE,
        Iso2.INSTANCE,
        Iso3.INSTANCE,
        Iso4.INSTANCE,
        Iso7.INSTANCE,
        Iso6.INSTANCE,
        Iso8.INSTANCE,
        Iso5.INSTANCE,
        Windows1254.INSTANCE,
        EucKr.INSTANCE,
        Iso10.INSTANCE,
        Gbk.INSTANCE,
        Windows1252.INSTANCE,
        Iso10.INSTANCE,
        Windows874.INSTANCE,
        Iso13.INSTANCE,
        Iso14.INSTANCE,
        Iso15.INSTANCE,
        Iso2.INSTANCE,
        Iso3.INSTANCE,
        Iso4.INSTANCE,
        Iso5.INSTANCE,
        Iso6.INSTANCE,
        Iso7.INSTANCE,
        Iso8.INSTANCE,
        Windows1254.INSTANCE,
        Windows1252.INSTANCE,
        Iso10.INSTANCE,
        Windows874.INSTANCE,
        Iso13.INSTANCE,
        Iso14.INSTANCE,
        Iso15.INSTANCE,
        Iso2.INSTANCE,
        Iso3.INSTANCE,
        Iso4.INSTANCE,
        Iso5.INSTANCE,
        Iso6.INSTANCE,
        Iso7.INSTANCE,
        Iso8.INSTANCE,
        Windows1254.INSTANCE,
        Windows1252.INSTANCE,
        Iso15.INSTANCE,
        Windows1252.INSTANCE,
        Iso2.INSTANCE,
        Iso2.INSTANCE,
        Iso3.INSTANCE,
        Iso3.INSTANCE,
        Iso4.INSTANCE,
        Iso4.INSTANCE,
        Iso5.INSTANCE,
        Iso5.INSTANCE,
        Iso6.INSTANCE,
        Iso6.INSTANCE,
        Iso7.INSTANCE,
        Iso7.INSTANCE,
        Iso8.INSTANCE,
        Iso8.INSTANCE,
        Windows1254.INSTANCE,
        Windows1254.INSTANCE,
        Koi8R.INSTANCE,
        Koi8R.INSTANCE,
        Koi8R.INSTANCE,
        Koi8U.INSTANCE,
        Koi8U.INSTANCE,
        Koi8R.INSTANCE,
        EucKr.INSTANCE,
        EucKr.INSTANCE,
        EucKr.INSTANCE,
        EucKr.INSTANCE,
        EucKr.INSTANCE,
        Windows1252.INSTANCE,
        Iso2.INSTANCE,
        Iso3.INSTANCE,
        Iso4.INSTANCE,
        Windows1254.INSTANCE,
        Iso10.INSTANCE,
        Iso15.INSTANCE,
        Windows1252.INSTANCE,
        Iso2.INSTANCE,
        Iso3.INSTANCE,
        Iso4.INSTANCE,
        Windows1254.INSTANCE,
        Iso10.INSTANCE,
        Iso8I.INSTANCE,
        Macintosh.INSTANCE,
        Macintosh.INSTANCE,
        ShiftJis.INSTANCE,
        ShiftJis.INSTANCE,
        ShiftJis.INSTANCE,
        ShiftJis.INSTANCE,
        ShiftJis.INSTANCE,
        Iso7.INSTANCE,
        Windows874.INSTANCE,
        Utf8.INSTANCE,
        Windows1252.INSTANCE,
        Utf16Le.INSTANCE,
        Utf16Be.INSTANCE,
        Utf16Le.INSTANCE,
        Utf8.INSTANCE,
        Utf8.INSTANCE,
        Iso8.INSTANCE,
        Windows1250.INSTANCE,
        Windows1251.INSTANCE,
        Windows1252.INSTANCE,
        Windows1253.INSTANCE,
        Windows1254.INSTANCE,
        Windows1255.INSTANCE,
        Windows1256.INSTANCE,
        Windows1257.INSTANCE,
        Windows1258.INSTANCE,
        ShiftJis.INSTANCE,
        Windows874.INSTANCE,
        EucKr.INSTANCE,
        Windows1250.INSTANCE,
        Windows1251.INSTANCE,
        Windows1252.INSTANCE,
        Windows1253.INSTANCE,
        Windows1254.INSTANCE,
        Windows1255.INSTANCE,
        Windows1256.INSTANCE,
        Windows1257.INSTANCE,
        Windows1258.INSTANCE,
        EucJp.INSTANCE,
        Gbk.INSTANCE,
        MacCyrillic.INSTANCE,
        Macintosh.INSTANCE,
        MacCyrillic.INSTANCE,
        ShiftJis.INSTANCE,
        UserDefined.INSTANCE,
        Big5.INSTANCE,
    };

    /**
     * One entry per distinct encoding. This is the source from which
     * {@link #availableCharsets()} builds its name-to-encoding map.
     */
    private static final Encoding[] ENCODINGS = {
        Big5.INSTANCE,
        EucJp.INSTANCE,
        EucKr.INSTANCE,
        Gb18030.INSTANCE,
        Gbk.INSTANCE,
        Ibm866.INSTANCE,
        Iso2022Jp.INSTANCE,
        Iso10.INSTANCE,
        Iso13.INSTANCE,
        Iso14.INSTANCE,
        Iso15.INSTANCE,
        Iso16.INSTANCE,
        Iso2.INSTANCE,
        Iso3.INSTANCE,
        Iso4.INSTANCE,
        Iso5.INSTANCE,
        Iso6.INSTANCE,
        Iso7.INSTANCE,
        Iso8.INSTANCE,
        Iso8I.INSTANCE,
        Koi8R.INSTANCE,
        Koi8U.INSTANCE,
        Macintosh.INSTANCE,
        Replacement.INSTANCE,
        ShiftJis.INSTANCE,
        Utf16Be.INSTANCE,
        Utf16Le.INSTANCE,
        Utf8.INSTANCE,
        Windows1250.INSTANCE,
        Windows1251.INSTANCE,
        Windows1252.INSTANCE,
        Windows1253.INSTANCE,
        Windows1254.INSTANCE,
        Windows1255.INSTANCE,
        Windows1256.INSTANCE,
        Windows1257.INSTANCE,
        Windows1258.INSTANCE,
        Windows874.INSTANCE,
        MacCyrillic.INSTANCE,
        UserDefined.INSTANCE,
    };

    /**
     * The big5 encoding.
     */
    public static final Encoding BIG5 = Big5.INSTANCE;

    /**
     * The euc-jp encoding.
     */
    public static final Encoding EUC_JP = EucJp.INSTANCE;

    /**
     * The euc-kr encoding.
     */
    public static final Encoding EUC_KR = EucKr.INSTANCE;

    /**
     * The gb18030 encoding.
     */
    public static final Encoding GB18030 = Gb18030.INSTANCE;

    /**
     * The gbk encoding.
     */
    public static final Encoding GBK = Gbk.INSTANCE;

    /**
     * The ibm866 encoding.
     */
    public static final Encoding IBM866 = Ibm866.INSTANCE;

    /**
     * The iso-2022-jp encoding.
     */
    public static final Encoding ISO_2022_JP = Iso2022Jp.INSTANCE;

    /**
     * The iso-8859-10 encoding.
     */
    public static final Encoding ISO_8859_10 = Iso10.INSTANCE;

    /**
     * The iso-8859-13 encoding.
     */
    public static final Encoding ISO_8859_13 = Iso13.INSTANCE;

    /**
     * The iso-8859-14 encoding.
     */
    public static final Encoding ISO_8859_14 = Iso14.INSTANCE;

    /**
     * The iso-8859-15 encoding.
     */
    public static final Encoding ISO_8859_15 = Iso15.INSTANCE;

    /**
     * The iso-8859-16 encoding.
     */
    public static final Encoding ISO_8859_16 = Iso16.INSTANCE;

    /**
     * The iso-8859-2 encoding.
     */
    public static final Encoding ISO_8859_2 = Iso2.INSTANCE;

    /**
     * The iso-8859-3 encoding.
     */
    public static final Encoding ISO_8859_3 = Iso3.INSTANCE;

    /**
     * The iso-8859-4 encoding.
     */
    public static final Encoding ISO_8859_4 = Iso4.INSTANCE;

    /**
     * The iso-8859-5 encoding.
     */
    public static final Encoding ISO_8859_5 = Iso5.INSTANCE;

    /**
     * The iso-8859-6 encoding.
     */
    public static final Encoding ISO_8859_6 = Iso6.INSTANCE;

    /**
     * The iso-8859-7 encoding.
     */
    public static final Encoding ISO_8859_7 = Iso7.INSTANCE;

    /**
     * The iso-8859-8 encoding.
     */
    public static final Encoding ISO_8859_8 = Iso8.INSTANCE;

    /**
     * The iso-8859-8-i encoding.
     */
    public static final Encoding ISO_8859_8_I = Iso8I.INSTANCE;

    /**
     * The koi8-r encoding.
     */
    public static final Encoding KOI8_R = Koi8R.INSTANCE;

    /**
     * The koi8-u encoding.
     */
    public static final Encoding KOI8_U = Koi8U.INSTANCE;

    /**
     * The macintosh encoding.
     */
    public static final Encoding MACINTOSH = Macintosh.INSTANCE;

    /**
     * The replacement encoding.
     */
    public static final Encoding REPLACEMENT = Replacement.INSTANCE;

    /**
     * The shift_jis encoding.
     */
    public static final Encoding SHIFT_JIS = ShiftJis.INSTANCE;

    /**
     * The utf-16be encoding.
     */
    public static final Encoding UTF_16BE = Utf16Be.INSTANCE;

    /**
     * The utf-16le encoding.
     */
    public static final Encoding UTF_16LE = Utf16Le.INSTANCE;

    /**
     * The utf-8 encoding.
     */
    public static final Encoding UTF_8 = Utf8.INSTANCE;

    /**
     * The windows-1250 encoding.
     */
    public static final Encoding WINDOWS_1250 = Windows1250.INSTANCE;

    /**
     * The windows-1251 encoding.
     */
    public static final Encoding WINDOWS_1251 = Windows1251.INSTANCE;

    /**
     * The windows-1252 encoding.
     */
    public static final Encoding WINDOWS_1252 = Windows1252.INSTANCE;

    /**
     * The windows-1253 encoding.
     */
    public static final Encoding WINDOWS_1253 = Windows1253.INSTANCE;

    /**
     * The windows-1254 encoding.
     */
    public static final Encoding WINDOWS_1254 = Windows1254.INSTANCE;

    /**
     * The windows-1255 encoding.
     */
    public static final Encoding WINDOWS_1255 = Windows1255.INSTANCE;

    /**
     * The windows-1256 encoding.
     */
    public static final Encoding WINDOWS_1256 = Windows1256.INSTANCE;

    /**
     * The windows-1257 encoding.
     */
    public static final Encoding WINDOWS_1257 = Windows1257.INSTANCE;

    /**
     * The windows-1258 encoding.
     */
    public static final Encoding WINDOWS_1258 = Windows1258.INSTANCE;

    /**
     * The windows-874 encoding.
     */
    public static final Encoding WINDOWS_874 = Windows874.INSTANCE;

    /**
     * The x-mac-cyrillic encoding.
     */
    public static final Encoding X_MAC_CYRILLIC = MacCyrillic.INSTANCE;

    /**
     * The x-user-defined encoding.
     */
    public static final Encoding X_USER_DEFINED = UserDefined.INSTANCE;

    /**
     * Lazily built, unmodifiable result of {@link #availableCharsets()};
     * {@code null} until the first call. Declared {@code volatile} so that a
     * thread which observes a non-null reference also observes the fully
     * populated map behind it.
     */
    private static volatile SortedMap<String, Charset> encodings = null;

    /**
     * Passes the name and aliases straight through to
     * {@link Charset#Charset(String, String[])}.
     *
     * @param canonicalName the canonical name of the encoding
     * @param aliases the aliases of the encoding
     */
    protected Encoding(String canonicalName, String[] aliases) {
        super(canonicalName, aliases);
    }

    /**
     * Position of the scanner in {@link #forName(String)} relative to the
     * label proper: before it (leading whitespace), inside it, or after it
     * (trailing whitespace).
     */
    private enum State {
        HEAD, LABEL, TAIL
    }

    /**
     * Returns the encoding identified by the given label.
     *
     * <p>The label is first looked up verbatim. If that fails, it is
     * normalized by dropping leading and trailing ASCII whitespace (space,
     * tab, LF, CR, FF) and lower-casing ASCII letters, and looked up again.
     *
     * @param label the label to resolve
     * @return the encoding the label maps to
     * @throws IllegalArgumentException if {@code label} is {@code null}
     * @throws IllegalCharsetNameException if {@code label} is empty, contains
     *         a character other than ASCII letters, ASCII digits, ASCII
     *         whitespace and {@code - + . : _}, has whitespace in the middle
     *         of the label, or starts with one of the punctuation characters
     * @throws UnsupportedCharsetException if the label is well-formed but
     *         does not identify any encoding
     */
    public static Encoding forName(String label) {
        if (label == null) {
            throw new IllegalArgumentException("Label must not be null.");
        }
        if (label.length() == 0) {
            throw new IllegalCharsetNameException(label);
        }

        // Fast path: the label is already in canonical lower-case form.
        int index = Arrays.binarySearch(LABELS, label);
        if (index >= 0) {
            return ENCODINGS_FOR_LABELS[index];
        }

        // Slow path: trim ASCII whitespace and lower-case while validating.
        StringBuilder normalized = new StringBuilder(label.length());
        State state = State.HEAD;
        for (int i = 0; i < label.length(); i++) {
            char c = label.charAt(i);
            if (isAsciiWhitespace(c)) {
                // Whitespace ends the label; before the label it is skipped.
                if (state == State.LABEL) {
                    state = State.TAIL;
                }
                continue;
            }
            if (state == State.TAIL) {
                // Anything after trailing whitespace means the whitespace
                // was actually embedded in the label.
                throw new IllegalCharsetNameException(label);
            }
            if (c >= 'A' && c <= 'Z') {
                c += 0x20; // ASCII lower-case
            }
            if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
                state = State.LABEL;
                normalized.append(c);
            } else if (state == State.LABEL && isLabelPunctuation(c)) {
                // Punctuation is only legal once the label has started.
                normalized.append(c);
            } else {
                throw new IllegalCharsetNameException(label);
            }
        }

        index = Arrays.binarySearch(LABELS, normalized.toString());
        if (index >= 0) {
            return ENCODINGS_FOR_LABELS[index];
        }
        throw new UnsupportedCharsetException(label);
    }

    /**
     * Tells whether {@code c} is one of the whitespace characters that
     * {@link #forName(String)} strips: space, LF, CR, tab or form feed.
     */
    private static boolean isAsciiWhitespace(char c) {
        return c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\f';
    }

    /**
     * Tells whether {@code c} is a punctuation character that
     * {@link #forName(String)} accepts inside a label.
     */
    private static boolean isLabelPunctuation(char c) {
        return c == '-' || c == '+' || c == '.' || c == ':' || c == '_';
    }

    /**
     * Like {@link #forName(String)}, except that a label which resolves to
     * {@link #REPLACEMENT} is reported as unsupported.
     *
     * @param label the label to resolve
     * @return the encoding the label maps to; never {@link #REPLACEMENT}
     * @throws IllegalArgumentException if {@code label} is {@code null}
     * @throws IllegalCharsetNameException if {@code label} is empty or
     *         malformed (see {@link #forName(String)})
     * @throws UnsupportedCharsetException if the label identifies no
     *         encoding or identifies the replacement encoding
     */
    public static Encoding forNameNoReplacement(String label) {
        Encoding encoding = Encoding.forName(label);
        if (encoding == Encoding.REPLACEMENT) {
            throw new UnsupportedCharsetException(label);
        }
        return encoding;
    }

    /**
     * Tells whether {@link #forName(String)} would resolve the given label.
     *
     * <p>Only {@link UnsupportedCharsetException} is translated to
     * {@code false}; the other exceptions thrown by {@code forName}
     * propagate to the caller.
     *
     * @param label the label to test
     * @return {@code true} if the label identifies an encoding
     * @throws IllegalArgumentException if {@code label} is {@code null}
     * @throws IllegalCharsetNameException if {@code label} is empty or
     *         malformed
     */
    public static boolean isSupported(String label) {
        try {
            Encoding.forName(label);
        } catch (UnsupportedCharsetException e) {
            return false;
        }
        return true;
    }

    /**
     * Tells whether {@link #forNameNoReplacement(String)} would resolve the
     * given label.
     *
     * <p>Only {@link UnsupportedCharsetException} is translated to
     * {@code false}; the other exceptions thrown by
     * {@code forNameNoReplacement} propagate to the caller.
     *
     * @param label the label to test
     * @return {@code true} if the label identifies an encoding other than
     *         the replacement encoding
     * @throws IllegalArgumentException if {@code label} is {@code null}
     * @throws IllegalCharsetNameException if {@code label} is empty or
     *         malformed
     */
    public static boolean isSupportedNoReplacement(String label) {
        try {
            Encoding.forNameNoReplacement(label);
        } catch (UnsupportedCharsetException e) {
            return false;
        }
        return true;
    }

    /**
     * Returns an unmodifiable sorted map from {@link Charset#name() name} to
     * encoding, covering every entry of the internal encoding list. The map
     * is built on first use and cached.
     *
     * @return the cached, unmodifiable map; never {@code null}
     */
    public static SortedMap<String, Charset> availableCharsets() {
        // Read the shared field exactly once. Returning the field itself
        // after the null check would be a second unsynchronized read, which
        // the memory model allows to observe an older (null) value.
        SortedMap<String, Charset> result = encodings;
        if (result == null) {
            TreeMap<String, Charset> map = new TreeMap<String, Charset>();
            for (Encoding encoding : ENCODINGS) {
                map.put(encoding.name(), encoding);
            }
            result = Collections.unmodifiableSortedMap(map);
            // Racing threads may each build a map; the maps are equivalent,
            // so whichever write lands last is fine.
            encodings = result;
        }
        return result;
    }

    /**
     * Returns the default encoding of this class, which is always
     * {@link #WINDOWS_1252}.
     *
     * @return {@link #WINDOWS_1252}
     */
    public static Encoding defaultCharset() {
        return WINDOWS_1252;
    }

    /**
     * Always returns {@code false}, matching {@link #newEncoder()}, which
     * does not supply an encoder.
     *
     * @return {@code false}
     */
    @Override public boolean canEncode() {
        return false;
    }

    /**
     * Always returns {@code false}; no containment relationships are
     * reported by this base class.
     *
     * @param cs the charset to test
     * @return {@code false}
     */
    @Override public boolean contains(Charset cs) {
        return false;
    }

    /**
     * Not supported by this base class.
     *
     * @return never returns normally
     * @throws UnsupportedOperationException always
     */
    @Override public CharsetEncoder newEncoder() {
        throw new UnsupportedOperationException("Encoder not implemented.");
    }
}