118 lines
4.6 KiB
C++
118 lines
4.6 KiB
C++
// Copyright 2019 the V8 project authors. All rights reserved.
|
||
// Use of this source code is governed by a BSD-style license that can be
|
||
// found in the LICENSE file.
|
||
|
||
#ifndef V8_REGEXP_SPECIAL_CASE_H_
|
||
#define V8_REGEXP_SPECIAL_CASE_H_
|
||
|
||
#ifdef V8_INTL_SUPPORT
|
||
#include "regexp/regexp-shim.h"

#include "unicode/locid.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
|
||
|
||
namespace v8 {
|
||
namespace internal {
|
||
|
||
// Sets of Unicode characters that need special handling under "i" mode
|
||
|
||
// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262
|
||
// defines slightly different case-folding rules than Unicode. An
|
||
// input character should match a pattern character if the result of
|
||
// the Canonicalize algorithm is the same for both characters.
|
||
//
|
||
// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as
|
||
// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character
|
||
// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See
|
||
// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for
|
||
// the precise definition.
|
||
//
|
||
// While compiling such regular expressions, we need to compute the
|
||
// set of characters that should match a given input character. (See
|
||
// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.)
|
||
// For almost all characters, this can be efficiently computed using
|
||
// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent
|
||
// the remaining special cases.
|
||
//
|
||
// For a character c, the rules are as follows:
|
||
//
|
||
// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling
|
||
// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet
|
||
// containing c will produce the set of characters that should
|
||
// match /c/i (or /[c]/i), and only those characters.
|
||
//
|
||
// 2. If c is in IgnoreSet, then the only character it should match is
|
||
// itself. However, closeOver will add additional incorrect
|
||
// matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ'
|
||
// (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is
|
||
// "SS". Step 3.e therefore requires that 'ß' canonicalizes to
|
||
// itself, and should not match 'ẞ'. In these cases, we can skip
|
||
// the closeOver entirely, because it will never add an equivalent
|
||
// character.
|
||
//
|
||
// 3. If c is in SpecialAddSet, then it should match at least one
|
||
// character other than itself. However, closeOver will add at
|
||
// least one additional incorrect match. For example, consider the
|
||
// letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase
|
||
// K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN
|
||
// SIGN should not match either of the other two characters. As a
|
||
// result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in
|
||
// IgnoreSet). To find the correct matches for characters in
|
||
// SpecialAddSet, we closeOver the original character, but filter
|
||
// out the results that do not have the same canonical value.
|
||
//
|
||
// The contents of these sets are calculated at build time by
|
||
// src/regexp/gen-regexp-special-case.cc, which generates
|
||
// gen/src/regexp/special-case.cc. This is done by iterating over the
|
||
// result of closeOver for each BMP character, and finding sets for
|
||
// which at least one character has a different canonical value than
|
||
// another character. Characters that match no other characters in
|
||
// their equivalence class are added to IgnoreSet. Characters that
|
||
// match at least one other character are added to SpecialAddSet.
|
||
|
||
class RegExpCaseFolding final : public AllStatic {
|
||
public:
|
||
static const icu::UnicodeSet& IgnoreSet();
|
||
static const icu::UnicodeSet& SpecialAddSet();
|
||
|
||
// This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
|
||
// Canonicalize) step 3, which is used to determine whether
|
||
// characters match when ignoreCase is true and unicode is false.
|
||
static UChar32 Canonicalize(UChar32 ch) {
|
||
// a. Assert: ch is a UTF-16 code unit.
|
||
CHECK_LE(ch, 0xffff);
|
||
|
||
// b. Let s be the String value consisting of the single code unit ch.
|
||
icu::UnicodeString s(ch);
|
||
|
||
// c. Let u be the same result produced as if by performing the algorithm
|
||
// for String.prototype.toUpperCase using s as the this value.
|
||
// d. Assert: Type(u) is String.
|
||
icu::UnicodeString& u = s.toUpper();
|
||
|
||
// e. If u does not consist of a single code unit, return ch.
|
||
if (u.length() != 1) {
|
||
return ch;
|
||
}
|
||
|
||
// f. Let cu be u's single code unit element.
|
||
UChar32 cu = u.char32At(0);
|
||
|
||
// g. If the value of ch >= 128 and the value of cu < 128, return ch.
|
||
if (ch >= 128 && cu < 128) {
|
||
return ch;
|
||
}
|
||
|
||
// h. Return cu.
|
||
return cu;
|
||
}
|
||
};
|
||
|
||
} // namespace internal
|
||
} // namespace v8
|
||
|
||
#endif // V8_INTL_SUPPORT
|
||
|
||
#endif // V8_REGEXP_SPECIAL_CASE_H_
|