1143 lines
39 KiB
Python
1143 lines
39 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
# Based upon makeunicodedata.py
|
|
# (http://hg.python.org/cpython/file/c8192197d23d/Tools/unicode/makeunicodedata.py)
|
|
# written by Fredrik Lundh (fredrik@pythonware.com)
|
|
#
|
|
# Copyright (C) 2011 Tom Schuster <evilpies@gmail.com>
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
from __future__ import print_function
|
|
import csv
|
|
import io
|
|
import re
|
|
import os
|
|
import sys
|
|
from contextlib import closing
|
|
|
|
# ECMAScript 2016
# §11.2 White Space
# Individual code points treated as white space; process_unicode_data also
# treats every code point of general category "Zs" as a space.
whitespace = [
    # python doesn't support using control character names :(
    0x9, # CHARACTER TABULATION
    0xb, # LINE TABULATION
    0xc, # FORM FEED
    ord(u'\N{SPACE}'),
    ord(u'\N{NO-BREAK SPACE}'),
    ord(u'\N{ZERO WIDTH NO-BREAK SPACE}'), # also BOM
]

# §11.3 Line Terminators
# process_unicode_data flags these with FLAG_SPACE as well.
line_terminator = [
    0xa, # LINE FEED
    0xd, # CARRIAGE RETURN
    ord(u'\N{LINE SEPARATOR}'),
    ord(u'\N{PARAGRAPH SEPARATOR}'),
]

# These are also part of IdentifierPart §11.6 Names and Keywords
compatibility_identifier_part = [
    ord(u'\N{ZERO WIDTH NON-JOINER}'),
    ord(u'\N{ZERO WIDTH JOINER}'),
]

# Bit flags stored as the third field of each character-info table entry.
FLAG_SPACE = 1 << 0
FLAG_UNICODE_ID_START = 1 << 1
FLAG_UNICODE_ID_CONTINUE_ONLY = 1 << 2

# Largest code point covered by the 16-bit index tables.
MAX_BMP = 0xffff
|
|
|
|
# Public-domain notice written into the generated test files and Unicode.cpp.
public_domain = """
/*
* Any copyright is dedicated to the Public Domain.
* http://creativecommons.org/licenses/publicdomain/
*/
"""

# MPL 2.0 header written at the top of the generated UnicodeNonBMP.h.
mpl_license = """\
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
* vim: set ts=8 sts=4 et sw=4 tw=99:
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
"""

# Banner marking a file as generated output.
warning_message = """\
/* Generated by make_unicode.py DO NOT MODIFY */
"""

# Template for the Unicode version banner; {0} is the version string.
unicode_version_message = """\
/* Unicode version: {0} */
"""
|
|
|
|
def read_unicode_data(unicode_data):
    """
    Yield one row (a list of fields) per code point of UnicodeData.txt.

    If you want to understand how this wonderful file format works checkout
    Unicode Standard Annex #44 - Unicode Character Database
    http://www.unicode.org/reports/tr44/

    `unicode_data` is any iterable of ';'-separated lines.  In every yielded
    row, field 0 is the code point as an int; all other fields are the raw
    strings from the file.  A "<..., First>" / "<..., Last>" pair is expanded
    into one row per code point in the inclusive range, with the name field
    reduced to the text between '<' and ', First>'.

    Raises ValueError if a "First" row is not followed by another row.
    """

    reader = csv.reader(unicode_data, delimiter=';')

    # Iterate the reader directly: a StopIteration escaping from a manual
    # next() call must not be what terminates this generator.
    for row in reader:
        name = row[1]

        # We need to expand the UAX #44 4.2.3 Code Point Range
        if name.startswith('<') and name.endswith('First>'):
            next_row = next(reader, None)
            if next_row is None:
                raise ValueError('Range start %r has no matching end row' % name)

            # Strip the leading '<' and the trailing ', First>'.
            range_name = name[1:-8]

            for i in range(int(row[0], 16), int(next_row[0], 16) + 1):
                # Yield a fresh list each time so that callers which keep the
                # rows do not end up with aliases of one mutated object.
                expanded = list(row)
                expanded[0] = i
                expanded[1] = range_name
                yield expanded
        else:
            row[0] = int(row[0], 16)
            yield row
|
|
|
|
def read_case_folding(case_folding):
    """
    Yield the simple case-folding rows of CaseFolding.txt.

    `case_folding` is an iterable of text lines.  Comment lines (starting
    with '#') and blank lines are skipped, as are rows whose status field is
    'F' or 'T'.  Each yielded row is the line split on '; ', with field 0
    (code point) and field 2 (mapping) converted from hex to int.
    """
    for line in case_folding:
        # Treat any whitespace-only line (including '\r\n') as blank instead
        # of letting it fall through to the field access below.
        if not line.strip() or line.startswith('#'):
            continue
        row = line.split('; ')
        if row[1] in ['F', 'T']:
            continue
        row[0] = int(row[0], 16)
        row[2] = int(row[2], 16)
        yield row
|
|
|
|
def read_derived_core_properties(derived_core_properties):
    """
    Yield (code_point, property_name) pairs from DerivedCoreProperties.txt.

    `derived_core_properties` is an iterable of text lines.  Comment lines
    and blank lines are skipped and trailing '#' comments are ignored.  A
    "XXXX..YYYY" range produces one pair per code point in the inclusive
    range.
    """
    for line in derived_core_properties:
        # Treat any whitespace-only line (including '\r\n') as blank instead
        # of letting it fall through to the field access below.
        if not line.strip() or line.startswith('#'):
            continue
        row = line.split('#')[0].split(';')
        char_range = row[0].strip()
        char_property = row[1].strip()
        if '..' not in char_range:
            yield (int(char_range, 16), char_property)
        else:
            [start, end] = char_range.split('..')
            for char in range(int(start, 16), int(end, 16) + 1):
                yield (char, char_property)
|
|
|
|
def int_ranges(ints):
    """ Yields consecutive ranges (inclusive) from integer values.

    `ints` is any iterable of integers; it is sorted first.  Each yielded
    item is a (start, end) tuple.  An empty input yields nothing.
    """
    values = sorted(ints)
    if not values:
        return

    start = prev = values[0]
    for value in values[1:]:
        # A gap (or a repeated value) closes the current range.
        if prev + 1 != value:
            yield (start, prev)
            start = value
        prev = value

    # The final range is still open when the input is exhausted.
    yield (start, prev)
|
|
|
|
def utf16_encode(code):
    """
    Return the (lead, trail) UTF-16 surrogate pair for a code point.

    The arithmetic assumes `code` is at or above 0x10000; nothing is checked
    here.
    """
    NonBMPMin = 0x10000
    LeadSurrogateMin = 0xD800
    TrailSurrogateMin = 0xDC00

    # Floor division keeps the lead surrogate an integer regardless of
    # whether true division is in effect.
    lead = (code - NonBMPMin) // 1024 + LeadSurrogateMin
    trail = ((code - NonBMPMin) % 1024) + TrailSurrogateMin

    return lead, trail
|
|
|
|
def make_non_bmp_convert_macro(out_file, name, convert_map):
    """
    Write a FOR_EACH_NON_BMP_<name>(macro) #define to `out_file`.

    `convert_map` maps a source code point to its converted code point.
    Consecutive source code points are merged into one macro invocation as
    long as they share the same difference to their converted value and the
    same UTF-16 lead surrogate.
    """
    # Collapse the map into runs of adjacent code points.
    runs = []
    current = None
    for code in sorted(convert_map.keys()):
        lead, trail = utf16_encode(code)
        diff = convert_map[code] - code

        continues_run = (
            current is not None and
            current['code'] + current['length'] == code and
            current['diff'] == diff and
            current['lead'] == lead
        )
        if continues_run:
            current['length'] += 1
        else:
            current = {
                'code': code,
                'diff': diff,
                'length': 1,
                'lead': lead,
                'trail': trail,
            }
            runs.append(current)

    # One macro invocation per run.
    invocations = []
    for run in runs:
        last_offset = run['length'] - 1
        invocations.append(' macro(0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}, {:d})'.format(
            run['code'], run['code'] + last_offset,
            run['lead'],
            run['trail'], run['trail'] + last_offset,
            run['diff']))

    out_file.write('#define FOR_EACH_NON_BMP_{}(macro) \\\n'.format(name))
    out_file.write(' \\\n'.join(invocations))
    out_file.write('\n')
|
|
|
|
def for_each_non_bmp_group(group_set):
    """
    Yield (first, last) inclusive ranges of consecutive keys of `group_set`.

    `group_set` must provide .keys(); only the keys are used.
    """
    # Each span is a mutable [first, last] pair; extend the most recent one
    # while the keys stay adjacent.
    spans = []
    for code in sorted(group_set.keys()):
        if spans and spans[-1][1] + 1 == code:
            spans[-1][1] = code
        else:
            spans.append([code, code])

    for first, last in spans:
        yield (first, last)
|
|
|
|
def process_derived_core_properties(derived_core_properties):
    """
    Collect the ID_Start and ID_Continue code points.

    Returns a pair of sets (id_start, id_continue) built from the
    (code_point, property) pairs produced by read_derived_core_properties;
    all other properties are ignored.
    """
    id_start = set()
    id_continue = set()
    wanted = {'ID_Start': id_start, 'ID_Continue': id_continue}

    for (char, prop) in read_derived_core_properties(derived_core_properties):
        target = wanted.get(prop)
        if target is not None:
            target.add(char)

    return (id_start, id_continue)
|
|
|
|
def process_unicode_data(unicode_data, derived_core_properties):
    """
    Build the case-mapping and character-flag tables from UnicodeData.txt.

    Returns an 11-tuple:
      table, index                       -- unique (upper delta, lower delta,
                                            flags) records and, per BMP code
                                            point, the position of its record
      same_upper_table, same_upper_index -- unique records of up to three
                                            deltas to code points sharing the
                                            same uppercase, and the per-code
                                            point position
      non_bmp_lower_map, non_bmp_upper_map
                                         -- lower/upper mappings of code
                                            points above MAX_BMP
      non_bmp_space_set                  -- non-BMP "Zs" code points (dict
                                            used as a set)
      non_bmp_id_start_set, non_bmp_id_cont_set
                                         -- non-BMP ID_Start / ID_Continue
                                            code points (dicts used as sets)
      test_table                         -- code point -> (upper, lower,
                                            name, alias) for the BMP
      test_space_table                   -- list of every space code point
    All deltas are stored modulo 2**16.
    """
    # Position 0 of each table holds the all-zero record, which is what every
    # code point not otherwise assigned refers to.
    dummy = (0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * (MAX_BMP + 1)
    same_upper_map = {}
    same_upper_dummy = (0, 0, 0)
    same_upper_table = [same_upper_dummy]
    same_upper_cache = {same_upper_dummy: 0}
    same_upper_index = [0] * (MAX_BMP + 1)

    test_table = {}
    test_space_table = []

    non_bmp_lower_map = {}
    non_bmp_upper_map = {}
    non_bmp_id_start_set = {}
    non_bmp_id_cont_set = {}
    non_bmp_space_set = {}

    (id_start, id_continue) = process_derived_core_properties(derived_core_properties)

    for row in read_unicode_data(unicode_data):
        code, name, category = row[0], row[1], row[2]
        alias = row[-5]
        uppercase = row[-3]
        lowercase = row[-2]
        flags = 0

        if uppercase:
            upper = int(uppercase, 16)
            # Remember every code point that uppercases to `upper`.
            same_upper_map.setdefault(upper, []).append(code)
        else:
            upper = code

        lower = int(lowercase, 16) if lowercase else code

        if code > MAX_BMP:
            # Non-BMP code points only go into the side maps, never into the
            # 16-bit tables.
            if lower != code:
                non_bmp_lower_map[code] = lower
            if upper != code:
                non_bmp_upper_map[code] = upper
            if category == 'Zs':
                non_bmp_space_set[code] = 1
                test_space_table.append(code)
            if code in id_start:
                non_bmp_id_start_set[code] = 1
            if code in id_continue:
                non_bmp_id_cont_set[code] = 1
            continue

        # White space and line terminators are combined because in practice
        # they do not need to be told apart.
        if category == 'Zs' or code in whitespace or code in line_terminator:
            flags |= FLAG_SPACE
            test_space_table.append(code)

        # §11.6 (IdentifierStart)
        if code in id_start:
            flags |= FLAG_UNICODE_ID_START
        # §11.6 (IdentifierPart)
        elif code in id_continue or code in compatibility_identifier_part:
            flags |= FLAG_UNICODE_ID_CONTINUE_ONLY

        test_table[code] = (upper, lower, name, alias)

        up_d = upper - code
        low_d = lower - code

        assert -65535 < up_d < 65535
        assert -65535 < low_d < 65535

        item = (up_d & 0xffff, low_d & 0xffff, flags)

        # Reuse an identical record if one has already been emitted.
        i = cache.get(item)
        if i is None:
            assert item not in table
            i = len(table)
            cache[item] = i
            table.append(item)
        index[code] = i

    for code in range(0, MAX_BMP + 1):
        entry = test_table.get(code)
        if not entry:
            continue

        (upper, lower, name, alias) = entry
        if upper not in same_upper_map:
            continue

        same_upper_ds = [v - code for v in same_upper_map[upper]]

        assert len(same_upper_ds) <= 3
        assert all([-65535 < v < 65535 for v in same_upper_ds])

        # Pad with zeros to exactly three slots.
        same_upper = [v & 0xffff for v in same_upper_ds]
        item = tuple((same_upper + [0, 0, 0])[:3])

        i = same_upper_cache.get(item)
        if i is None:
            assert item not in same_upper_table
            i = len(same_upper_table)
            same_upper_cache[item] = i
            same_upper_table.append(item)
        same_upper_index[code] = i

    return (
        table, index,
        same_upper_table, same_upper_index,
        non_bmp_lower_map, non_bmp_upper_map,
        non_bmp_space_set,
        non_bmp_id_start_set, non_bmp_id_cont_set,
        test_table, test_space_table,
    )
|
|
|
|
def process_case_folding(case_folding):
    """
    Build the case-folding tables from CaseFolding.txt.

    Returns a 5-tuple:
      folding_table, folding_index -- unique (folding delta, up to three
                                      reverse-folding deltas) records and,
                                      per BMP code point, the position of
                                      its record; deltas are modulo 2**16
      non_bmp_folding_map          -- code -> folding for codes > MAX_BMP
      non_bmp_rev_folding_map      -- folding -> code for codes > MAX_BMP
      folding_tests                -- lists [code, folding?, equivalents...]
                                      used to generate the ignoreCase test
    """
    folding_map = {}
    rev_folding_map = {}
    # Position 0 holds the all-zero record for code points without folding.
    folding_dummy = (0, 0, 0, 0)
    folding_table = [folding_dummy]
    folding_cache = {folding_dummy: 0}
    folding_index = [0] * (MAX_BMP + 1)

    folding_tests = []
    folding_codes = set()

    non_bmp_folding_map = {}
    non_bmp_rev_folding_map = {}

    for row in read_case_folding(case_folding):
        code = row[0]
        mapping = row[2]
        folding_map[code] = mapping

        if code > MAX_BMP:
            non_bmp_folding_map[code] = mapping
            non_bmp_rev_folding_map[mapping] = code

        if mapping not in rev_folding_map:
            rev_folding_map[mapping] = [code]
        else:
            rev_folding_map[mapping].append(code)

        folding_codes.add(code)
        folding_codes.add(mapping)

    for code in sorted(folding_codes):
        if code in folding_map:
            folding = folding_map[code]
        else:
            folding = code

        # Other code points that are equivalent to `code` under folding.
        if code in rev_folding_map:
            rev_folding = rev_folding_map[code]
        elif folding in rev_folding_map:
            rev_folding = [c for c in rev_folding_map[folding] if c != code]
        else:
            rev_folding = []

        assert len(rev_folding) <= 3

        if folding != code or len(rev_folding):
            item = [code]
            if folding != code:
                item.append(folding)
            folding_tests.append(item + rev_folding)

        # Only BMP code points go into the 16-bit tables.
        if code > MAX_BMP:
            continue

        folding_d = folding - code
        rev_folding_ds = [v - code for v in rev_folding]

        assert folding_d > -65535 and folding_d < 65535
        # Range-check the deltas that are about to be truncated to 16 bits,
        # not the absolute code points.
        assert all([v > -65535 and v < 65535 for v in rev_folding_ds])

        folding = folding_d & 0xffff
        rev_folding = [v & 0xffff for v in rev_folding_ds]
        rev_folding_0 = rev_folding[0] if len(rev_folding) >= 1 else 0
        rev_folding_1 = rev_folding[1] if len(rev_folding) >= 2 else 0
        rev_folding_2 = rev_folding[2] if len(rev_folding) >= 3 else 0

        item = (folding, rev_folding_0, rev_folding_1, rev_folding_2)

        # Reuse an identical record if one has already been emitted.
        i = folding_cache.get(item)
        if i is None:
            assert item not in folding_table
            folding_cache[item] = i = len(folding_table)
            folding_table.append(item)
        folding_index[code] = i

    return (
        folding_table, folding_index,
        non_bmp_folding_map, non_bmp_rev_folding_map,
        folding_tests
    )
|
|
|
|
def make_non_bmp_file(version,
                      non_bmp_lower_map, non_bmp_upper_map,
                      non_bmp_folding_map, non_bmp_rev_folding_map):
    """
    Generate UnicodeNonBMP.h in the current directory.

    The header holds one FOR_EACH_NON_BMP_* macro for each of the four
    non-BMP maps (lowercase, uppercase, case folding, reverse case folding).
    `version` is the Unicode version string written into the banner.
    """
    file_name = 'UnicodeNonBMP.h'
    with io.open(file_name, mode='wb') as non_bmp_file:
        non_bmp_file.write(mpl_license)
        non_bmp_file.write('\n')
        non_bmp_file.write(warning_message)
        non_bmp_file.write(unicode_version_message.format(version))
        # The argument list below documents TRAIL_TO; it previously repeated
        # TRAIL_FROM for the trail surrogate of TO.
        non_bmp_file.write("""
#ifndef vm_UnicodeNonBMP_h
#define vm_UnicodeNonBMP_h

// |macro| receives the following arguments
// macro(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF)
// FROM: code point where the range starts
// TO: code point where the range ends
// LEAD: common lead surrogate of FROM and TO
// TRAIL_FROM: trail surrogate of FROM
// TRAIL_TO: trail surrogate of TO
// DIFF: the difference between the code point in the range and
// converted code point

""")

        make_non_bmp_convert_macro(non_bmp_file, 'LOWERCASE', non_bmp_lower_map)
        non_bmp_file.write('\n')
        make_non_bmp_convert_macro(non_bmp_file, 'UPPERCASE', non_bmp_upper_map)
        non_bmp_file.write('\n')
        make_non_bmp_convert_macro(non_bmp_file, 'CASE_FOLDING', non_bmp_folding_map)
        non_bmp_file.write('\n')
        make_non_bmp_convert_macro(non_bmp_file, 'REV_CASE_FOLDING', non_bmp_rev_folding_map)

        non_bmp_file.write("""
#endif /* vm_UnicodeNonBMP_h */
""")
|
|
|
|
def make_bmp_mapping_test(version, test_table):
    """
    Generate the JS test for BMP upper/lower case mapping.

    `test_table` maps a BMP code point to (upper, lower, name, alias).  Code
    points missing from it are written as mapping to themselves.
    """
    file_name = '../tests/ecma_5/String/string-upper-lower-mapping.js'
    with io.open(file_name, mode='wb') as out:
        out.write(warning_message)
        out.write(unicode_version_message.format(version))
        out.write(public_domain)
        out.write('var mapping = [\n')
        for code in range(0, MAX_BMP + 1):
            entry = test_table.get(code)

            if not entry:
                # No data for this code point: identity mapping.
                out.write(' [' + hex(code) + ', ' + hex(code) + '],\n')
                continue

            (upper, lower, name, alias) = entry
            alias_note = ' (' + alias + ')' if alias else ''
            out.write(' [' + hex(upper) + ', ' + hex(lower) + '], /* ' +
                      name + alias_note + ' */\n')
        out.write('];')
        out.write("""
assertEq(mapping.length, 0x10000);
for (var i = 0; i <= 0xffff; i++) {
var char = String.fromCharCode(i);
var info = mapping[i];

assertEq(char.toUpperCase().charCodeAt(0), info[0]);
assertEq(char.toLowerCase().charCodeAt(0), info[1]);
}

if (typeof reportCompare === "function")
reportCompare(true, true);
""")
|
|
|
|
def make_non_bmp_mapping_test(version, non_bmp_upper_map, non_bmp_lower_map):
    """
    Generate the JS test for non-BMP upper/lower case mapping.

    One assertEq line is written per entry of each map, uppercase entries
    first, each group in ascending code point order.
    """
    file_name = '../tests/ecma_6/String/string-code-point-upper-lower-mapping.js'
    upper_line = 'assertEq(String.fromCodePoint(0x{:x}).toUpperCase().codePointAt(0), 0x{:x});\n'
    lower_line = 'assertEq(String.fromCodePoint(0x{:x}).toLowerCase().codePointAt(0), 0x{:x});\n'

    with io.open(file_name, mode='wb') as out:
        out.write(warning_message)
        out.write(unicode_version_message.format(version))
        out.write(public_domain)

        for code in sorted(non_bmp_upper_map.keys()):
            out.write(upper_line.format(code, non_bmp_upper_map[code]))
        for code in sorted(non_bmp_lower_map.keys()):
            out.write(lower_line.format(code, non_bmp_lower_map[code]))

        out.write("""
if (typeof reportCompare === "function")
reportCompare(true, true);
""")
|
|
|
|
def make_space_test(version, test_space_table):
    """
    Generate the JS test checking that String.prototype.trim strips every
    code point listed in `test_space_table`.
    """
    file_name = '../tests/ecma_5/String/string-space-trim.js'
    with io.open(file_name, mode='wb') as out:
        out.write(warning_message)
        out.write(unicode_version_message.format(version))
        out.write(public_domain)

        char_codes = ', '.join([hex(c) for c in test_space_table])
        out.write('var onlySpace = String.fromCharCode(' + char_codes + ');\n')

        out.write("""
assertEq(onlySpace.trim(), "");
assertEq((onlySpace + 'aaaa').trim(), 'aaaa');
assertEq(('aaaa' + onlySpace).trim(), 'aaaa');
assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa');

if (typeof reportCompare === "function")
reportCompare(true, true);
""")
|
|
|
|
def make_icase_test(version, folding_tests):
    """
    Generate the JS test for RegExp /iu matching.

    Every item of `folding_tests` is a list of code points; it becomes one
    `test(...)` call whose arguments are the hex values in that list.
    """
    file_name = '../tests/ecma_6/RegExp/unicode-ignoreCase.js'
    with io.open(file_name, mode='wb') as out:
        out.write(warning_message)
        out.write(unicode_version_message.format(version))
        out.write(public_domain)
        out.write("""
var BUGNUMBER = 1135377;
var summary = "Implement RegExp unicode flag -- ignoreCase flag.";

print(BUGNUMBER + ": " + summary);

function test(code, ...equivs) {
var codeRe = new RegExp(String.fromCodePoint(code) + "+", "iu");
var ans = String.fromCodePoint(code) + equivs.map(c => String.fromCodePoint(c)).join("");
assertEqArray(codeRe.exec("<" + ans + ">"), [ans]);
codeRe = new RegExp("[" + String.fromCodePoint(code) + "]+", "iu");
assertEqArray(codeRe.exec("<" + ans + ">"), [ans]);
}
""")
        for args in folding_tests:
            hex_args = ','.join(hex(c) for c in args)
            out.write('test(' + hex_args + ');\n')
        out.write("""
if (typeof reportCompare === "function")
reportCompare(true, true);
""")
|
|
|
|
def make_unicode_file(version,
                      table, index,
                      same_upper_table, same_upper_index,
                      folding_table, folding_index,
                      non_bmp_space_set,
                      non_bmp_id_start_set, non_bmp_id_cont_set):
    """
    Generate Unicode.cpp in the current directory.

    Each (table, index) pair is written as a record array plus the two-level
    index produced by splitbins().  The non-BMP identifier sets are written
    as range checks in IsIdentifierStartNonBMP / IsIdentifierPartNonBMP.

    Asserts that every split uses a shift of 6 and that `non_bmp_space_set`
    is empty.
    """
    index1, index2, shift = splitbins(index)

    # Don't forget to update CharInfo in Unicode.h if you need to change this
    assert shift == 6

    same_upper_index1, same_upper_index2, same_upper_shift = splitbins(same_upper_index)

    # Don't forget to update CodepointsWithSameUpperCaseInfo in Unicode.h if you need to change this
    assert same_upper_shift == 6

    folding_index1, folding_index2, folding_shift = splitbins(folding_index)

    # Don't forget to update CaseFoldInfo in Unicode.h if you need to change this
    assert folding_shift == 6

    def verify_split(records, flat_index, first, second, bits):
        # Check, for every code point position, that the two-level lookup
        # reaches the same record as the flat index does.
        mask = (1 << bits) - 1
        for char in range(len(flat_index)):
            expected = records[flat_index[char]]

            idx = first[char >> bits]
            idx = second[(idx << bits) + (char & mask)]

            assert expected == records[idx]

    # verify correctness
    verify_split(table, index, index1, index2, shift)
    verify_split(same_upper_table, same_upper_index,
                 same_upper_index1, same_upper_index2, same_upper_shift)
    verify_split(folding_table, folding_index,
                 folding_index1, folding_index2, folding_shift)

    # NOTE(review): the text below talks about a shift of 5 while the
    # assertions above require 6 -- confirm which one the comment should say.
    comment = """
/*
* So how does indexing work?
* First let's have a look at a char16_t, 16-bits:
* [................]
* Step 1:
* Extracting the upper 11 bits from the char16_t.
* upper = char >> 5 ([***********.....])
* Step 2:
* Using these bits to get an reduced index from index1.
* index = index1[upper]
* Step 3:
* Combining the index and the bottom 5 bits of the original char16_t.
* real_index = index2[(index << 5) + (char & ((1 << 5) - 1))] ([...********+++++])
*
* The advantage here is that the biggest number in index1 doesn't need 10 bits,
* but 7 and we save some memory.
*
* Step 4:
* Get the character informations by looking up real_index in js_charinfo.
*
* Pseudocode of generation:
*
* let table be the mapping of char16_t => js_charinfo_index
* let index1 be an empty array
* let index2 be an empty array
* let cache be a hash map
*
* while shift is less then maximal amount you can shift 0xffff before it's 0
* let chunks be table split in chunks of size 2**shift
*
* for every chunk in chunks
* if chunk is in cache
* let index be cache[chunk]
* else
* let index be the max key of index2 + 1
* for element in chunk
* push element to index2
* put index as chunk in cache
*
* push index >> shift to index1
*
* increase shift
* stop if you found the best shift
*/
"""
    def dump(data, name, file):
        # Write `data` as a uint8_t array, wrapping lines before column 99.
        file.write('const uint8_t unicode::' + name + '[] = {\n')

        line = pad = ' ' * 4
        lines = []
        for entry in data:
            assert entry < 256
            s = str(entry)
            s = s.rjust(3)

            if len(line + s) + 5 > 99:
                lines.append(line.rstrip())
                line = pad + s + ', '
            else:
                line = line + s + ', '
        lines.append(line.rstrip())

        file.write('\n'.join(lines))
        file.write('\n};\n')

    def write_records(declaration, records, file):
        # Write one brace-enclosed row per record, then close the array.
        file.write(declaration)
        for d in records:
            file.write(' {')
            file.write(', '.join((str(e) for e in d)))
            file.write('},\n')
        file.write('};\n')
        file.write('\n')

    def write_range_checks(group_set, file):
        # One range test per run of consecutive code points.
        for (from_code, to_code) in for_each_non_bmp_group(group_set):
            file.write("""\
if (codePoint >= 0x{:x} && codePoint <= 0x{:x})
return true;
""".format(from_code, to_code))

    file_name = 'Unicode.cpp'
    with io.open(file_name, 'wb') as data_file:
        data_file.write(warning_message)
        data_file.write(unicode_version_message.format(version))
        data_file.write(public_domain)
        data_file.write('#include "vm/Unicode.h"\n\n')
        data_file.write('using namespace js;\n')
        data_file.write('using namespace js::unicode;\n')
        data_file.write(comment)

        write_records('const CharacterInfo unicode::js_charinfo[] = {\n',
                      table, data_file)

        dump(index1, 'index1', data_file)
        data_file.write('\n')
        dump(index2, 'index2', data_file)
        data_file.write('\n')

        write_records('const CodepointsWithSameUpperCaseInfo unicode::js_codepoints_with_same_upper_info[] = {\n',
                      same_upper_table, data_file)

        dump(same_upper_index1, 'codepoints_with_same_upper_index1', data_file)
        data_file.write('\n')
        dump(same_upper_index2, 'codepoints_with_same_upper_index2', data_file)
        data_file.write('\n')

        write_records('const FoldingInfo unicode::js_foldinfo[] = {\n',
                      folding_table, data_file)

        dump(folding_index1, 'folding_index1', data_file)
        data_file.write('\n')
        dump(folding_index2, 'folding_index2', data_file)
        data_file.write('\n')

        # If the following assert fails, it means space character is added to
        # non-BMP area. In that case the following code should be uncommented
        # and the corresponding code should be added to frontend.
        assert len(non_bmp_space_set.keys()) == 0

        data_file.write("""\
bool
js::unicode::IsIdentifierStartNonBMP(uint32_t codePoint)
{
""")

        write_range_checks(non_bmp_id_start_set, data_file)

        data_file.write("""\
return false;
}

bool
js::unicode::IsIdentifierPartNonBMP(uint32_t codePoint)
{
""")

        write_range_checks(non_bmp_id_cont_set, data_file)

        data_file.write("""\
return false;
}
""")
|
|
|
|
def getsize(data):
    """ return smallest possible integer size (1, 2 or 4 bytes) for the given array """
    largest = max(data)
    assert largest < 2**32

    # Try the narrow widths first; anything else needs four bytes.
    for width, limit in ((1, 256), (2, 65536)):
        if largest < limit:
            return width
    return 4
|
|
|
|
def splitbins(t):
    """t -> (t1, t2, shift). Split a table to save space.

    t is a sequence of ints. The result is a pair of int lists t1 and t2
    plus an int shift, picked so that the combined size of t1 and t2 (as
    estimated by getsize) is as small as possible, and such that for every
    i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    Size statistics for the chosen split are printed to stderr.
    """

    def dump(t1, t2, shift, bytes):
        print("%d+%d bins at shift %d; %d bytes" % (
            len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t),
              "bytes", file=sys.stderr)

    # The most we can shift the last valid index and still have something left.
    last_index = len(t) - 1
    maxshift = 0
    while last_index > 1:
        last_index >>= 1
        maxshift += 1

    smallest = sys.maxsize  # smallest total size so far
    t = tuple(t)  # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        seen = {}

        for offset in range(0, len(t), size):
            chunk = t[offset:offset + size]

            # Identical chunks share one copy in t2.
            position = seen.get(chunk)
            if position is None:
                position = len(t2)
                seen[chunk] = position
                t2.extend(chunk)
            t1.append(position >> shift)

        # determine memory size
        total = len(t1) * getsize(t1) + len(t2) * getsize(t2)
        if total < smallest:
            best = t1, t2, shift
            smallest = total
    t1, t2, shift = best

    print("Best:", end=' ', file=sys.stderr)
    dump(t1, t2, shift, smallest)

    # exhaustively verify that the decomposition is correct
    mask = 2**shift - 1
    for i in range(len(t)):
        assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best
|
|
|
|
def make_irregexp_tables(version,
                         table, index,
                         folding_table, folding_index,
                         test_table):
    """
    Generate ../irregexp/RegExpCharacters-inl.h and
    ../irregexp/RegExpCharacters.cpp.

    The header receives RangeContainsLatin1Equivalents(); the .cpp file
    receives ConvertNonLatin1ToLatin1() and the k*Ranges arrays for the
    character classes.  All lookups use the BMP tables passed in;
    `test_table` supplies the character names used in generated comments.
    """
    import string
    from functools import partial
    from itertools import ifilter, imap

    MAX_ASCII = 0x7F
    MAX_LATIN1 = 0xFF
    LEAD_SURROGATE_MIN = 0xD800
    TRAIL_SURROGATE_MAX = 0xDFFF

    def hex2(n):
        assert 0 <= n and n < 16**2
        return '0x{:02X}'.format(n)

    def hex4(n):
        assert 0 <= n and n < 16**4
        return '0x{:04X}'.format(n)

    def uhex4(n):
        assert 0 <= n and n < 16**4
        return 'U+{:04X}'.format(n)

    def case_info(code):
        # Turn the stored 16-bit deltas back into absolute code points.
        assert 0 <= code and code <= MAX_BMP
        (upper_d, lower_d, flags) = table[index[code]]
        return ((code + upper_d) & 0xffff, (code + lower_d) & 0xffff, flags)

    def is_space(code):
        return bool(case_info(code)[2] & FLAG_SPACE)

    def to_upper(code):
        return case_info(code)[0]

    def casefold(code):
        assert 0 <= code and code <= MAX_BMP
        folding_d = folding_table[folding_index[code]][0]
        return (code + folding_d) & 0xffff

    def casefolds_to_ascii(code):
        return casefold(code) <= MAX_ASCII

    def casefolds_to_latin1(code):
        return casefold(code) <= MAX_LATIN1

    def casemaps_to_nonlatin1(code):
        return to_upper(code) > MAX_LATIN1

    def char_name(code):
        assert 0 <= code and code <= MAX_BMP
        if code not in test_table:
            return '<Unused>'
        if code == LEAD_SURROGATE_MIN:
            return '<Lead Surrogate Min>'
        if code == TRAIL_SURROGATE_MAX:
            return '<Trail Surrogate Max>'
        (_, _, name, alias) = test_table[code]
        if name.startswith('<'):
            return alias
        return name

    def write_character_range(println, name, characters):
        # Emit the ranges as (start, end + 1) pairs followed by a sentinel.
        char_ranges = list(int_ranges(characters))
        println('')
        println('const int js::irregexp::k{}Ranges[] = {{'.format(name))
        for (start, end) in char_ranges:
            s_name = char_name(start)
            if start != end:
                label = '{}..{}'.format(s_name, char_name(end))
            else:
                label = s_name
            println(' {}, {} + 1, // {}'.format(hex4(start), hex4(end), label))
        println(' {} + 1'.format(hex4(MAX_BMP)))
        println('};')
        println('const int js::irregexp::k{}RangeCount = {};'.format(
            name, len(char_ranges) * 2 + 1))

    def write_character_test(println, test, consequent, default):
        # Latin1 characters which, when case-mapped through
        # String.prototype.toUpperCase(), canonicalize to a non-Latin1 character.
        # ES2017, §21.2.2.8.2 Runtime Semantics: Canonicalize
        casemapped_to_nonlatin1 = ifilter(casemaps_to_nonlatin1, xrange(0, MAX_LATIN1 + 1))

        def casemap_closure(ch):
            upper = to_upper(ch)
            equivalents = [c for c in xrange(MAX_LATIN1 + 1, MAX_BMP + 1)
                           if upper == to_upper(c)]
            return (ch, equivalents)

        # Mapping from Latin1 characters to the list of case map equivalent
        # non-Latin1 characters.
        casemap_for_latin1 = dict(imap(casemap_closure, casemapped_to_nonlatin1))

        # Non-latin1 characters which, when Unicode case-folded, canonicalize to
        # a Latin1 character.
        # ES2017, §21.2.2.8.2 Runtime Semantics: Canonicalize
        casefolded_to_latin1 = ifilter(casefolds_to_latin1, xrange(MAX_LATIN1 + 1, MAX_BMP + 1))

        println(' if (unicode) {')
        for ch in casefolded_to_latin1:
            casefolded = casefold(ch)
            # Skip if also handled below for case mapping.
            if casefolded in casemap_for_latin1 and ch in casemap_for_latin1[casefolded]:
                continue
            println(' // "{}" case folds to "{}".'.format(char_name(ch),
                                                         char_name(casefolded)))
            println(' if ({})'.format(test(ch)))
            println(' return {};'.format(consequent(casefolded)))
        println(' }')
        println('')
        for (ch, casemapped_chars) in casemap_for_latin1.iteritems():
            for casemapped in casemapped_chars:
                println(' // "{}" case maps to "{}".'.format(char_name(casemapped),
                                                            char_name(ch)))
            println(' if ({})'.format(' || '.join(imap(test, casemapped_chars))))
            println(' return {};'.format(consequent(ch)))
        println(' return {};'.format(default))

    with io.open('../irregexp/RegExpCharacters-inl.h', 'wb') as chars_file:
        write = partial(print, file=chars_file, sep='', end='')
        println = partial(write, end='\n')

        write(warning_message)
        write(unicode_version_message.format(version))

        println('#ifndef V8_JSREGEXPCHARACTERS_INL_H_')
        println('#define V8_JSREGEXPCHARACTERS_INL_H_')
        println('')
        println('namespace js {')
        println('')
        println('namespace irregexp {')
        println('')

        println('static inline bool')
        println('RangeContainsLatin1Equivalents(CharacterRange range, bool unicode)')
        println('{')
        write_character_test(println, lambda ch: 'range.Contains({})'.format(hex4(ch)),
                             lambda _: 'true', 'false')
        println('}')

        println('')
        println('} } // namespace js::irregexp')
        println('')
        println('#endif // V8_JSREGEXPCHARACTERS_INL_H_')

    with io.open('../irregexp/RegExpCharacters.cpp', 'wb') as chars_file:
        write = partial(print, file=chars_file, sep='', end='')
        println = partial(write, end='\n')
        character_range = partial(write_character_range, println)

        # Characters in \s, 21.2.2.12 CharacterClassEscape.
        space_chars = filter(is_space, xrange(0, MAX_BMP + 1))

        # Characters in \d, 21.2.2.12 CharacterClassEscape.
        digit_chars = map(ord, string.digits)
        assert all(ch <= MAX_ASCII for ch in digit_chars)

        # Characters in \w, 21.2.2.12 CharacterClassEscape.
        word_chars = map(ord, string.digits + string.ascii_letters + '_')
        assert all(ch <= MAX_ASCII for ch in word_chars)

        # Characters which case-fold to characters in \w.
        folded_word_chars = filter(casefolds_to_ascii, xrange(MAX_ASCII + 1, MAX_BMP + 1))
        ignorecase_word_chars = word_chars + folded_word_chars

        # Surrogate characters.
        surrogate_chars = range(LEAD_SURROGATE_MIN, TRAIL_SURROGATE_MAX + 1)

        write(warning_message)
        write(unicode_version_message.format(version))
        println('#include "irregexp/RegExpCharacters.h"')
        println('')
        println('#include "mozilla/Assertions.h"')
        println('')

        println('char16_t')
        println('js::irregexp::ConvertNonLatin1ToLatin1(char16_t c, bool unicode)')
        println('{')
        println(' MOZ_ASSERT(c > {}, "Character mustn\'t be Latin1");'.format(hex2(MAX_LATIN1)))
        write_character_test(println, lambda ch: 'c == {}'.format(hex4(ch)), hex2, '0')
        println('}')

        character_range('Space', space_chars)
        character_range('SpaceAndSurrogate', space_chars + surrogate_chars)

        character_range('Word', word_chars)
        character_range('IgnoreCaseWord', ignorecase_word_chars)
        character_range('WordAndSurrogate', word_chars + surrogate_chars)
        all_bmp = set(xrange(0, MAX_BMP + 1))
        character_range('NegatedIgnoreCaseWordAndSurrogate',
                        all_bmp - set(ignorecase_word_chars + surrogate_chars))

        character_range('Digit', digit_chars)
        character_range('DigitAndSurrogate', digit_chars + surrogate_chars)

        character_range('Surrogate', surrogate_chars)

        character_range('LineTerminator', line_terminator)
|
|
|
|
def update_unicode(args):
    """Regenerate the Unicode-derived sources and tests.

    Reads UnicodeData.txt, CaseFolding.txt and DerivedCoreProperties.txt from
    the current working directory, optionally downloading them first, then
    passes the processed tables to the generator functions of this script.

    Args:
        args: Parsed command-line namespace. ``args.version`` is ``None`` to
            use the files already present in the current directory,
            ``'UNIDATA'`` to download from the ``UNIDATA`` directory of the
            Unicode site, or a version string such as ``'8.0.0'`` to download
            that version's ``ucd`` directory.

    Raises:
        RuntimeError: If local files are requested and one is missing, or if
            the Unicode version cannot be read from the first line of
            DerivedCoreProperties.txt.
    """
    import urllib2

    version = args.version
    if version is not None:
        # NOTE(review): the data is fetched over plain HTTP and then used to
        # generate source files; consider switching to HTTPS.
        baseurl = 'http://unicode.org/Public'
        if version == 'UNIDATA':
            url = '%s/%s' % (baseurl, version)
        else:
            url = '%s/%s/ucd' % (baseurl, version)

    print('Arguments:')
    if version is not None:
        print('\tVersion: %s' % version)
        print('\tDownload url: %s' % url)
    else:
        print('\tUsing local files.')
        print('\tAlways make sure you have the newest Unicode files!')
    print('')

    def download_or_open(fname):
        """Return `fname` from the current directory, opened for binary reading.

        When a version was requested, the file is downloaded first and
        overwrites any existing local copy.
        """
        tfile_path = os.path.join(os.getcwd(), fname)
        if version is not None:
            print('Downloading %s...' % fname)
            unicode_data_url = '%s/%s' % (url, fname)
            with closing(urllib2.urlopen(unicode_data_url)) as reader:
                data = reader.read()
            # Write under a context manager so the handle is released even if
            # the write fails; the file is reopened read-only below.
            with io.open(tfile_path, 'wb') as tfile:
                tfile.write(data)
        elif not os.path.isfile(tfile_path):
            raise RuntimeError('File not found: %s' % tfile_path)
        return io.open(tfile_path, 'rb')

    def version_from_file(f, fname):
        """Extract the Unicode version from the first line of `f`.

        The first line is expected to look like ``# <fname>-X.Y.Z.txt``. This
        consumes that line, so later readers of `f` start at the second line.
        """
        pat_version = re.compile(
            r"# %s-(?P<version>\d+\.\d+\.\d+)\.txt" % re.escape(fname))
        first_line = f.readline()
        match = pat_version.match(first_line)
        if match is None:
            raise RuntimeError(
                'Cannot determine Unicode version from first line of %s.txt: %r'
                % (fname, first_line))
        return match.group("version")

    with download_or_open('UnicodeData.txt') as unicode_data, \
            download_or_open('CaseFolding.txt') as case_folding, \
            download_or_open('DerivedCoreProperties.txt') as derived_core_properties:
        # Must run before process_unicode_data: it reads the header line of
        # the same file object.
        unicode_version = version_from_file(derived_core_properties, 'DerivedCoreProperties')

        print('Processing...')
        (
            table, index,
            same_upper_table, same_upper_index,
            non_bmp_lower_map, non_bmp_upper_map,
            non_bmp_space_set,
            non_bmp_id_start_set, non_bmp_id_cont_set,
            test_table, test_space_table
        ) = process_unicode_data(unicode_data, derived_core_properties)
        (
            folding_table, folding_index,
            non_bmp_folding_map, non_bmp_rev_folding_map,
            folding_tests
        ) = process_case_folding(case_folding)

    # The generators only receive the processed tables, so the input files
    # can already be closed at this point.
    print('Generating...')
    make_unicode_file(unicode_version,
                      table, index,
                      same_upper_table, same_upper_index,
                      folding_table, folding_index,
                      non_bmp_space_set,
                      non_bmp_id_start_set, non_bmp_id_cont_set)
    make_non_bmp_file(unicode_version,
                      non_bmp_lower_map, non_bmp_upper_map,
                      non_bmp_folding_map, non_bmp_rev_folding_map)
    make_irregexp_tables(unicode_version,
                         table, index,
                         folding_table, folding_index,
                         test_table)

    make_bmp_mapping_test(unicode_version, test_table)
    make_non_bmp_mapping_test(unicode_version, non_bmp_upper_map, non_bmp_lower_map)
    make_space_test(unicode_version, test_space_table)
    make_icase_test(unicode_version, folding_tests)
|
|
|
|
if __name__ == '__main__':
    import argparse

    # This script must be run from js/src/vm to work correctly.
    if '/'.join(os.path.normpath(os.getcwd()).split(os.sep)[-3:]) != 'js/src/vm':
        raise RuntimeError('%s must be run from js/src/vm' % sys.argv[0])

    parser = argparse.ArgumentParser(description='Update Unicode data.')

    # The help text is built from adjacent string literals with explicit
    # separating spaces, so it does not depend on how the source is indented.
    parser.add_argument('--version',
                        help='Optional Unicode version number. If specified, downloads the '
                             'selected version from <http://unicode.org/Public>. If not specified '
                             'uses the existing local files to generate the Unicode data. The '
                             'number must match a published Unicode version, e.g. use '
                             '"--version=8.0.0" to download Unicode 8 files. Alternatively use '
                             '"--version=UNIDATA" to download the latest published version.')

    # Dispatch through `func` so the parsed namespace carries its own handler.
    parser.set_defaults(func=update_unicode)

    args = parser.parse_args()
    args.func(args)
|