Import V8's regexp parser code and add build config.
parent
8e4756b26b
commit
03415291a3
|
@ -86,6 +86,7 @@ included_inclnames_to_ignore = set([
|
|||
'unicode/udat.h', # ICU
|
||||
'unicode/udatpg.h', # ICU
|
||||
'unicode/uenum.h', # ICU
|
||||
'unicode/uniset.h', # ICU
|
||||
'unicode/unorm.h', # ICU
|
||||
'unicode/unum.h', # ICU
|
||||
'unicode/unumsys.h', # ICU
|
||||
|
|
|
@ -246,3 +246,16 @@ with only_when('--enable-compile-environment'):
|
|||
|
||||
set_config('LIBFUZZER', enable_libfuzzer)
|
||||
set_define('LIBFUZZER', enable_libfuzzer)
|
||||
|
||||
# Initial support for new regexp engine
# ==================================================

js_option('--enable-new-regexp', default=False,
          help='Enable new regexp engine')


@depends('--enable-new-regexp')
def enable_new_regexp(value):
    # Resolve to True only when the option was enabled; any falsy option
    # value resolves to None.
    return True if value else None


# Expose the result both as a build config value and as a define.
set_config('JS_NEW_REGEXP', enable_new_regexp)
set_define('JS_NEW_REGEXP', enable_new_regexp)
|
||||
|
|
|
@ -122,6 +122,9 @@ if CONFIG['JS_HAS_CTYPES']:
|
|||
if CONFIG['JS_BUNDLED_EDITLINE']:
    DIRS += ['editline']

# The regexp directory is only traversed when the new regexp engine has
# been enabled at configure time.
if CONFIG['JS_NEW_REGEXP']:
    DIRS += ['regexp']

if not CONFIG['JS_DISABLE_SHELL']:
    DIRS += ['shell']
|
||||
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
|
||||
* vim: set ts=8 sts=2 et sw=2 tw=80:
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
// This file forward-defines Irregexp classes that need to be visible
|
||||
// to the rest of Spidermonkey and re-exports them into js::irregexp.
|
||||
|
||||
#ifndef regexp_RegExpTypes_h
|
||||
#define regexp_RegExpTypes_h
|
||||
|
||||
namespace js {
class MatchPairs;
}

namespace v8 {
namespace internal {

// Bundle of the input character range, the starting offset and the output
// location that a single regexp execution works with.
struct InputOutputData {
  // Type-erased pointers to the first character and one past the last
  // character of the input; the constructor accepts any character type.
  const void* inputStart;
  const void* inputEnd;

  // Index into inputStart (in chars) at which to begin matching.
  size_t startIndex;

  // Match pairs object supplied by the caller (not owned here).
  js::MatchPairs* matches;

  template <typename CharT>
  InputOutputData(const CharT* start, const CharT* end, size_t index,
                  js::MatchPairs* pairs)
      : inputStart(start),
        inputEnd(end),
        startIndex(index),
        matches(pairs) {}
};

}  // namespace internal
}  // namespace v8

namespace js {
namespace irregexp {

// Re-export under the SpiderMonkey-facing namespace.
using InputOutputData = v8::internal::InputOutputData;

}  // namespace irregexp
}  // namespace js
|
||||
|
||||
#endif // regexp_RegExpTypes_h
|
|
@ -0,0 +1,2 @@
|
|||
Imported using import-irregexp.py from:
|
||||
https://github.com/v8/v8/tree/560f2d8bb3f3a72d78e1a7d7654235d53fdcc83c/src/regexp
|
|
@ -0,0 +1,165 @@
|
|||
// Copyright 2020 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include <fstream>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
#include "regexp/special-case.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// Bounds of the UTF-16 surrogate range (inclusive) and the first code
// point outside the Basic Multilingual Plane.
static const uc32 kSurrogateStart = 0xd800;
static const uc32 kSurrogateEnd = 0xdfff;
static const uc32 kNonBmpStart = 0x10000;
|
||||
|
||||
// The following code generates "src/regexp/special-case.cc".
|
||||
void PrintSet(std::ofstream& out, const char* name,
|
||||
const icu::UnicodeSet& set) {
|
||||
out << "icu::UnicodeSet Build" << name << "() {\n"
|
||||
<< " icu::UnicodeSet set;\n";
|
||||
for (int32_t i = 0; i < set.getRangeCount(); i++) {
|
||||
if (set.getRangeStart(i) == set.getRangeEnd(i)) {
|
||||
out << " set.add(0x" << set.getRangeStart(i) << ");\n";
|
||||
} else {
|
||||
out << " set.add(0x" << set.getRangeStart(i) << ", 0x"
|
||||
<< set.getRangeEnd(i) << ");\n";
|
||||
}
|
||||
}
|
||||
out << " set.freeze();\n"
|
||||
<< " return set;\n"
|
||||
<< "}\n\n";
|
||||
|
||||
out << "struct " << name << "Data {\n"
|
||||
<< " " << name << "Data() : set(Build" << name << "()) {}\n"
|
||||
<< " const icu::UnicodeSet set;\n"
|
||||
<< "};\n\n";
|
||||
|
||||
out << "//static\n"
|
||||
<< "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
|
||||
<< " static base::LazyInstance<" << name << "Data>::type set =\n"
|
||||
<< " LAZY_INSTANCE_INITIALIZER;\n"
|
||||
<< " return set.Pointer()->set;\n"
|
||||
<< "}\n\n";
|
||||
}
|
||||
|
||||
void PrintSpecial(std::ofstream& out) {
|
||||
icu::UnicodeSet current;
|
||||
icu::UnicodeSet special_add;
|
||||
icu::UnicodeSet ignore;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
icu::UnicodeSet upper("[\\p{Lu}]", status);
|
||||
CHECK(U_SUCCESS(status));
|
||||
|
||||
// Iterate through all chars in BMP except surrogates.
|
||||
for (UChar32 i = 0; i < kNonBmpStart; i++) {
|
||||
if (i >= kSurrogateStart && i <= kSurrogateEnd) {
|
||||
continue; // Ignore surrogate range
|
||||
}
|
||||
current.set(i, i);
|
||||
current.closeOver(USET_CASE_INSENSITIVE);
|
||||
|
||||
// Check to see if all characters in the case-folding equivalence
|
||||
// class as defined by UnicodeSet::closeOver all map to the same
|
||||
// canonical value.
|
||||
UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
|
||||
bool class_has_matching_canonical_char = false;
|
||||
bool class_has_non_matching_canonical_char = false;
|
||||
for (int32_t j = 0; j < current.getRangeCount(); j++) {
|
||||
for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
|
||||
c++) {
|
||||
if (c == i) {
|
||||
continue;
|
||||
}
|
||||
UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
|
||||
if (canonical == other_canonical) {
|
||||
class_has_matching_canonical_char = true;
|
||||
} else {
|
||||
class_has_non_matching_canonical_char = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
// If any other character in i's equivalence class has a
|
||||
// different canonical value, then i needs special handling. If
|
||||
// no other character shares a canonical value with i, we can
|
||||
// ignore i when adding alternatives for case-independent
|
||||
// comparison. If at least one other character shares a
|
||||
// canonical value, then i needs special handling.
|
||||
if (class_has_non_matching_canonical_char) {
|
||||
if (class_has_matching_canonical_char) {
|
||||
special_add.add(i);
|
||||
} else {
|
||||
ignore.add(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Verify that no Unicode equivalence class contains two non-trivial
|
||||
// JS equivalence classes. Every character in SpecialAddSet has the
|
||||
// same canonical value as every other non-IgnoreSet character in
|
||||
// its Unicode equivalence class. Therefore, if we call closeOver on
|
||||
// a set containing no IgnoreSet characters, the only characters
|
||||
// that must be removed from the result are in IgnoreSet. This fact
|
||||
// is used in CharacterRange::AddCaseEquivalents.
|
||||
for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
|
||||
for (UChar32 c = special_add.getRangeStart(i);
|
||||
c <= special_add.getRangeEnd(i); c++) {
|
||||
UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
|
||||
current.set(c, c);
|
||||
current.closeOver(USET_CASE_INSENSITIVE);
|
||||
current.removeAll(ignore);
|
||||
for (int32_t j = 0; j < current.getRangeCount(); j++) {
|
||||
for (UChar32 c2 = current.getRangeStart(j);
|
||||
c2 <= current.getRangeEnd(j); c2++) {
|
||||
CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
PrintSet(out, "IgnoreSet", ignore);
|
||||
PrintSet(out, "SpecialAddSet", special_add);
|
||||
}
|
||||
|
||||
// Writes the complete generated source file to |header_filename|: a fixed
// preamble, the sets produced by PrintSpecial, and the closing namespace /
// #endif lines. Exits the process with status 1 if the file cannot be
// opened or if writing fails, instead of silently producing no output.
void WriteHeader(const char* header_filename) {
  std::ofstream out(header_filename);
  if (!out.is_open()) {
    std::cerr << "Unable to open " << header_filename << " for writing\n";
    std::exit(1);
  }

  // std::hex (and the fill character) stay in effect for all later numeric
  // output, including the values printed by PrintSet. std::setw only
  // applies to the next insertion.
  out << std::hex << std::setfill('0') << std::setw(4);
  out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
      << "// Use of this source code is governed by a BSD-style license that\n"
      << "// can be found in the LICENSE file.\n\n"
      << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
      << "// The following functions are used to build UnicodeSets\n"
      << "// for special cases where the case-folding algorithm used by\n"
      << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
      << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
      << "// Semantics: Canonicalize) step 3.\n\n"
      << "#ifdef V8_INTL_SUPPORT\n"
      << "#include \"src/base/lazy-instance.h\"\n\n"
      << "#include \"src/regexp/special-case.h\"\n\n"
      << "#include \"unicode/uniset.h\"\n"
      << "namespace v8 {\n"
      << "namespace internal {\n\n";

  PrintSpecial(out);

  out << "\n"
      << "} // namespace internal\n"
      << "} // namespace v8\n"
      << "#endif // V8_INTL_SUPPORT\n";

  // Flush explicitly so that a write error is observable before returning.
  out.flush();
  if (!out) {
    std::cerr << "Error while writing " << header_filename << "\n";
    std::exit(1);
  }
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
// Entry point: expects exactly one argument, the path of the file to
// generate.
int main(int argc, const char** argv) {
  const bool has_output_arg = (argc == 2);
  if (!has_output_arg) {
    std::cerr << "Usage: " << argv[0] << " <output filename>\n";
    std::exit(1);
  }

  const char* output_filename = argv[1];
  v8::internal::WriteHeader(output_filename);
  return 0;
}
|
|
@ -0,0 +1,143 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
# You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
# This script handles all the mechanical steps of importing irregexp from v8:
|
||||
#
|
||||
# 1. Acquire the source: either from github, or optionally from a local copy of v8.
|
||||
# 2. Copy the contents of v8/src/regexp into js/src/regexp
|
||||
# - Exclude files that we have chosen not to import.
|
||||
# 3. While doing so, update #includes:
|
||||
# - Change "src/regexp/*" to "regexp/*".
|
||||
# - Remove other v8-specific headers completely.
|
||||
# 4. Add '#include "regexp/regexp-shim.h"' in the necessary places.
|
||||
# 5. Update the VERSION file to include the correct git hash.
|
||||
#
|
||||
# Usage:
|
||||
# cd path/to/js/src/regexp
|
||||
# ./import-irregexp.py --path path/to/v8/src/regexp
|
||||
#
|
||||
# Alternatively, without the --path argument, import-irregexp.py will
|
||||
# clone v8 from github into a temporary directory.
|
||||
#
|
||||
# After running this script, changes to the shim code may be necessary
|
||||
# to account for changes in upstream irregexp.
|
||||
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_hash(path):
    """Return the hash of the current git revision for the checkout at `path`.

    Runs `git rev-parse HEAD` with `path` as the child's working directory,
    so this process's own working directory is never changed, even if git
    fails.

    Raises subprocess.CalledProcessError if git exits with a non-zero status.
    """
    command = ['git', 'rev-parse', 'HEAD']
    result = subprocess.check_output(command, cwd=str(path), encoding='utf-8')
    return result.rstrip()
|
||||
|
||||
|
||||
def copy_and_update_includes(src_path, dst_path):
    """Copy `src_path` to `dst_path`, rewriting V8 #include lines on the way.

    - '#include "src/regexp...' becomes '#include "regexp...'.
    - Any other '#include "src/...' line is dropped.
    - If the source file's name is listed in `need_shim`, a
      '#include "regexp/regexp-shim.h"' line is inserted after the first
      dropped include, positioned alphabetically.

    Both paths are expected to be pathlib.Path-like (`src_path.name` is read).
    Both files are closed before returning.
    """
    # List of header files that need to include the shim header
    need_shim = ['property-sequences.h',
                 'regexp-ast.h',
                 'regexp-bytecode-peephole.h',
                 'regexp-bytecodes.h',
                 'regexp-dotprinter.h',
                 'regexp.h',
                 'regexp-macro-assembler.h',
                 'regexp-stack.h',
                 'special-case.h']

    # 1. Rewrite includes of V8 regexp headers:
    regexp_include = re.compile('#include "src/regexp')
    regexp_include_new = '#include "regexp'

    # 2. Remove includes of other V8 headers
    other_include = re.compile('#include "src/')

    # 3. If needed, add '#include "regexp/regexp-shim.h"'.
    #    Note: We get a little fancy to ensure that header files are
    #    in alphabetic order. `need_to_add_shim` is true if we still
    #    have to add the shim header in this file. `adding_shim_now`
    #    is true if we have found a '#include "src/*' and we are just
    #    waiting to find something alphabetically smaller (or an empty
    #    line) so that we can insert the shim header in the right place.
    shim_include = '#include "regexp/regexp-shim.h"\n'
    need_to_add_shim = src_path.name in need_shim
    adding_shim_now = False

    with open(str(src_path), 'r') as src, open(str(dst_path), 'w') as dst:
        for line in src:
            if adding_shim_now:
                if (line == '\n' or
                        line > '#include "src/regexp/regexp-shim.h"'):
                    dst.write(shim_include)
                    need_to_add_shim = False
                    adding_shim_now = False

            if regexp_include.search(line):
                dst.write(re.sub(regexp_include, regexp_include_new, line))
            elif other_include.search(line):
                # The line itself is dropped; remember that the shim include
                # still has to be emitted once its slot is found.
                if need_to_add_shim:
                    adding_shim_now = True
            else:
                dst.write(line)

        # The input ended while the shim was still pending: emit it rather
        # than silently producing a header without it.
        if adding_shim_now:
            dst.write(shim_include)
|
||||
|
||||
|
||||
def import_from(srcdir, dstdir):
    """Import every non-excluded regular file from `srcdir` into `dstdir`.

    Each file is copied through copy_and_update_includes. Afterwards the
    VERSION file in `dstdir` is rewritten to record the git revision of the
    checkout containing `srcdir`. Both arguments are pathlib.Path objects.
    Subdirectories of `srcdir` are not descended into.
    """
    excluded = ['OWNERS',
                'regexp.cc',
                'regexp-utils.cc',
                'regexp-utils.h',
                'regexp-macro-assembler-arch.h']

    for file in srcdir.iterdir():
        if file.is_dir():
            continue
        if str(file.name) in excluded:
            continue
        copy_and_update_includes(file, dstdir / file.name)

    # Update VERSION file
    commit_hash = get_hash(srcdir)
    with open(str(dstdir / 'VERSION'), 'w') as version_file:
        version_file.write('Imported using import-irregexp.py from:\n')
        version_file.write('https://github.com/v8/v8/tree/%s/src/regexp\n'
                           % commit_hash)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    import argparse
    import tempfile

    # This script should be run from js/src/regexp to work correctly.
    current_path = Path(os.getcwd())
    expected_path = 'js/src/regexp'
    if not current_path.match(expected_path):
        raise RuntimeError('%s must be run from %s' % (sys.argv[0],
                                                       expected_path))

    parser = argparse.ArgumentParser(description='Import irregexp from v8')
    parser.add_argument('-p', '--path', help='path to v8/src/regexp')
    args = parser.parse_args()

    if args.path:
        # Import from a local checkout; 'regexp.h' is used as a cheap sanity
        # check that the path really points at v8/src/regexp.
        src_path = Path(args.path)

        if not (src_path / 'regexp.h').exists():
            print('Usage:\n import-irregexp.py --path <path/to/v8/src/regexp>')
            sys.exit(1)
        import_from(src_path, current_path)
        sys.exit(0)

    # No local checkout given: shallow-clone v8 into a temporary directory.
    with tempfile.TemporaryDirectory() as tempdir:
        v8_git = 'https://github.com/v8/v8.git'
        # check_call raises if the clone fails, so we never try to import
        # from an empty or partial checkout. Passing an argument list also
        # avoids going through the shell.
        subprocess.check_call(['git', 'clone', '--depth', '1',
                               v8_git, tempdir])
        src_path = Path(tempdir) / 'src/regexp'
        import_from(src_path, current_path)
|
|
@ -0,0 +1,37 @@
|
|||
# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
include('../js-config.mozbuild')
include('../js-cxxflags.mozbuild')

FINAL_LIBRARY = 'js'

# Includes should be relative to parent path
LOCAL_INCLUDES += ['!..', '..']

# Sources that are always built when this directory is traversed.
SOURCES += [
    'regexp-ast.cc',
    'regexp-bytecode-generator.cc',
    'regexp-bytecode-peephole.cc',
    'regexp-bytecodes.cc',
    'regexp-compiler-tonode.cc',
    'regexp-compiler.cc',
    'regexp-dotprinter.cc',
    'regexp-interpreter.cc',
    'regexp-macro-assembler-tracer.cc',
    'regexp-macro-assembler.cc',
    'regexp-native-macro-assembler.cc',
    'regexp-parser.cc',
    'regexp-shim.cc',
    'regexp-stack.cc',
    'util/unicode.cc',
]

# Sources that are only built together with the V8_INTL_SUPPORT define.
if CONFIG['ENABLE_INTL_API']:
    CXXFLAGS += ['-DV8_INTL_SUPPORT']
    SOURCES += [
        'property-sequences.cc',
        'special-case.cc',
    ]
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,27 @@
|
|||
// Copyright 2018 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_PROPERTY_SEQUENCES_H_
|
||||
#define V8_REGEXP_PROPERTY_SEQUENCES_H_
|
||||
|
||||
#ifdef V8_INTL_SUPPORT
|
||||
|
||||
#include "regexp/regexp-shim.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
class UnicodePropertySequences : public AllStatic {
|
||||
public:
|
||||
static const uc32 kEmojiFlagSequences[];
|
||||
static const uc32 kEmojiTagSequences[];
|
||||
static const uc32 kEmojiZWJSequences[];
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_INTL_SUPPORT
|
||||
|
||||
#endif // V8_REGEXP_PROPERTY_SEQUENCES_H_
|
|
@ -0,0 +1,342 @@
|
|||
// Copyright 2016 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "regexp/regexp-ast.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
#define MAKE_ACCEPT(Name) \
|
||||
void* RegExp##Name::Accept(RegExpVisitor* visitor, void* data) { \
|
||||
return visitor->Visit##Name(this, data); \
|
||||
}
|
||||
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ACCEPT)
|
||||
#undef MAKE_ACCEPT
|
||||
|
||||
#define MAKE_TYPE_CASE(Name) \
|
||||
RegExp##Name* RegExpTree::As##Name() { return nullptr; } \
|
||||
bool RegExpTree::Is##Name() { return false; }
|
||||
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
|
||||
#undef MAKE_TYPE_CASE
|
||||
|
||||
#define MAKE_TYPE_CASE(Name) \
|
||||
RegExp##Name* RegExp##Name::As##Name() { return this; } \
|
||||
bool RegExp##Name::Is##Name() { return true; }
|
||||
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
|
||||
#undef MAKE_TYPE_CASE
|
||||
|
||||
|
||||
// Returns the union of the capture-register intervals of all |children|.
static Interval ListCaptureRegisters(ZoneList<RegExpTree*>* children) {
  Interval covered = Interval::Empty();
  for (int idx = 0; idx < children->length(); ++idx) {
    RegExpTree* child = children->at(idx);
    covered = covered.Union(child->CaptureRegisters());
  }
  return covered;
}
|
||||
|
||||
|
||||
// Capture registers covered by any node of this alternative.
Interval RegExpAlternative::CaptureRegisters() {
  ZoneList<RegExpTree*>* children = nodes();
  return ListCaptureRegisters(children);
}
|
||||
|
||||
|
||||
// Capture registers covered by any alternative of this disjunction.
Interval RegExpDisjunction::CaptureRegisters() {
  ZoneList<RegExpTree*>* children = alternatives();
  return ListCaptureRegisters(children);
}
|
||||
|
||||
|
||||
// A lookaround covers exactly the capture registers of its body.
Interval RegExpLookaround::CaptureRegisters() {
  RegExpTree* inner = body();
  return inner->CaptureRegisters();
}
|
||||
|
||||
|
||||
// The capture's own start/end registers, widened by whatever its body
// covers.
Interval RegExpCapture::CaptureRegisters() {
  Interval own_registers(StartRegister(index()), EndRegister(index()));
  Interval body_registers = body()->CaptureRegisters();
  return own_registers.Union(body_registers);
}
|
||||
|
||||
|
||||
// A quantifier covers exactly the capture registers of its body.
Interval RegExpQuantifier::CaptureRegisters() {
  RegExpTree* inner = body();
  return inner->CaptureRegisters();
}
|
||||
|
||||
|
||||
bool RegExpAssertion::IsAnchoredAtStart() {
|
||||
return assertion_type() == RegExpAssertion::START_OF_INPUT;
|
||||
}
|
||||
|
||||
|
||||
bool RegExpAssertion::IsAnchoredAtEnd() {
|
||||
return assertion_type() == RegExpAssertion::END_OF_INPUT;
|
||||
}
|
||||
|
||||
|
||||
bool RegExpAlternative::IsAnchoredAtStart() {
|
||||
ZoneList<RegExpTree*>* nodes = this->nodes();
|
||||
for (int i = 0; i < nodes->length(); i++) {
|
||||
RegExpTree* node = nodes->at(i);
|
||||
if (node->IsAnchoredAtStart()) {
|
||||
return true;
|
||||
}
|
||||
if (node->max_match() > 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool RegExpAlternative::IsAnchoredAtEnd() {
|
||||
ZoneList<RegExpTree*>* nodes = this->nodes();
|
||||
for (int i = nodes->length() - 1; i >= 0; i--) {
|
||||
RegExpTree* node = nodes->at(i);
|
||||
if (node->IsAnchoredAtEnd()) {
|
||||
return true;
|
||||
}
|
||||
if (node->max_match() > 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
bool RegExpDisjunction::IsAnchoredAtStart() {
|
||||
ZoneList<RegExpTree*>* alternatives = this->alternatives();
|
||||
for (int i = 0; i < alternatives->length(); i++) {
|
||||
if (!alternatives->at(i)->IsAnchoredAtStart()) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool RegExpDisjunction::IsAnchoredAtEnd() {
|
||||
ZoneList<RegExpTree*>* alternatives = this->alternatives();
|
||||
for (int i = 0; i < alternatives->length(); i++) {
|
||||
if (!alternatives->at(i)->IsAnchoredAtEnd()) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool RegExpLookaround::IsAnchoredAtStart() {
|
||||
return is_positive() && type() == LOOKAHEAD && body()->IsAnchoredAtStart();
|
||||
}
|
||||
|
||||
|
||||
bool RegExpCapture::IsAnchoredAtStart() { return body()->IsAnchoredAtStart(); }
|
||||
|
||||
|
||||
bool RegExpCapture::IsAnchoredAtEnd() { return body()->IsAnchoredAtEnd(); }
|
||||
|
||||
|
||||
// Convert regular expression trees to a simple sexp representation.
|
||||
// This representation should be different from the input grammar
|
||||
// in as many cases as possible, to make it more difficult for incorrect
|
||||
// parses to look as correct ones which is likely if the input and
|
||||
// output formats are alike.
|
||||
class RegExpUnparser final : public RegExpVisitor {
|
||||
public:
|
||||
RegExpUnparser(std::ostream& os, Zone* zone) : os_(os), zone_(zone) {}
|
||||
void VisitCharacterRange(CharacterRange that);
|
||||
#define MAKE_CASE(Name) void* Visit##Name(RegExp##Name*, void* data) override;
|
||||
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
|
||||
#undef MAKE_CASE
|
||||
private:
|
||||
std::ostream& os_;
|
||||
Zone* zone_;
|
||||
};
|
||||
|
||||
|
||||
// Prints "(|" followed by each alternative preceded by a space, then ")".
void* RegExpUnparser::VisitDisjunction(RegExpDisjunction* that, void* data) {
  os_ << "(|";
  int idx = 0;
  while (idx < that->alternatives()->length()) {
    os_ << " ";
    RegExpTree* branch = that->alternatives()->at(idx);
    branch->Accept(this, data);
    ++idx;
  }
  os_ << ")";
  return nullptr;
}
|
||||
|
||||
|
||||
// Prints "(:" followed by each node preceded by a space, then ")".
void* RegExpUnparser::VisitAlternative(RegExpAlternative* that, void* data) {
  os_ << "(:";
  int idx = 0;
  while (idx < that->nodes()->length()) {
    os_ << " ";
    RegExpTree* element = that->nodes()->at(idx);
    element->Accept(this, data);
    ++idx;
  }
  os_ << ")";
  return nullptr;
}
|
||||
|
||||
|
||||
// Prints the lower bound, and "-<upper bound>" unless the range holds a
// single value.
void RegExpUnparser::VisitCharacterRange(CharacterRange that) {
  os_ << AsUC32(that.from());
  if (that.IsSingleton()) return;
  os_ << "-" << AsUC32(that.to());
}
|
||||
|
||||
|
||||
// Prints an optional "^" for negated classes, then the space-separated
// ranges inside square brackets.
void* RegExpUnparser::VisitCharacterClass(RegExpCharacterClass* that,
                                          void* data) {
  if (that->is_negated()) {
    os_ << "^";
  }
  os_ << "[";
  for (int idx = 0; idx < that->ranges(zone_)->length(); ++idx) {
    const bool needs_separator = idx > 0;
    if (needs_separator) {
      os_ << " ";
    }
    VisitCharacterRange(that->ranges(zone_)->at(idx));
  }
  os_ << "]";
  return nullptr;
}
|
||||
|
||||
|
||||
// Prints the short tag for the assertion kind; nothing is printed for a
// value not listed in the switch.
void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) {
  const char* tag = nullptr;
  switch (that->assertion_type()) {
    case RegExpAssertion::START_OF_INPUT:
      tag = "@^i";
      break;
    case RegExpAssertion::END_OF_INPUT:
      tag = "@$i";
      break;
    case RegExpAssertion::START_OF_LINE:
      tag = "@^l";
      break;
    case RegExpAssertion::END_OF_LINE:
      tag = "@$l";
      break;
    case RegExpAssertion::BOUNDARY:
      tag = "@b";
      break;
    case RegExpAssertion::NON_BOUNDARY:
      tag = "@B";
      break;
  }
  if (tag != nullptr) {
    os_ << tag;
  }
  return nullptr;
}
|
||||
|
||||
|
||||
// Prints the atom's code units between single quotes.
void* RegExpUnparser::VisitAtom(RegExpAtom* that, void* data) {
  Vector<const uc16> units = that->data();
  os_ << "'";
  for (int idx = 0; idx < units.length(); ++idx) {
    os_ << AsUC16(units[idx]);
  }
  os_ << "'";
  return nullptr;
}
|
||||
|
||||
|
||||
// A text with exactly one element prints as that element; otherwise the
// elements are wrapped in "(!" ... ")", each preceded by a space.
void* RegExpUnparser::VisitText(RegExpText* that, void* data) {
  if (that->elements()->length() == 1) {
    that->elements()->at(0).tree()->Accept(this, data);
    return nullptr;
  }

  os_ << "(!";
  for (int idx = 0; idx < that->elements()->length(); ++idx) {
    os_ << " ";
    that->elements()->at(idx).tree()->Accept(this, data);
  }
  os_ << ")";
  return nullptr;
}
|
||||
|
||||
|
||||
// Prints "(# <min> <max|-> <g|p|n> <body>)": "-" stands for an unbounded
// maximum; g/p/n mark greedy, possessive and neither-of-those quantifiers.
void* RegExpUnparser::VisitQuantifier(RegExpQuantifier* that, void* data) {
  os_ << "(# " << that->min() << " ";

  if (that->max() != RegExpTree::kInfinity) {
    os_ << that->max() << " ";
  } else {
    os_ << "- ";
  }

  const char* mode;
  if (that->is_greedy()) {
    mode = "g ";
  } else if (that->is_possessive()) {
    mode = "p ";
  } else {
    mode = "n ";
  }
  os_ << mode;

  that->body()->Accept(this, data);
  os_ << ")";
  return nullptr;
}
|
||||
|
||||
|
||||
// Prints the capture's body wrapped in "(^ " ... ")".
void* RegExpUnparser::VisitCapture(RegExpCapture* that, void* data) {
  os_ << "(^ ";
  RegExpTree* inner = that->body();
  inner->Accept(this, data);
  os_ << ")";
  return nullptr;
}
|
||||
|
||||
// Prints the group's body wrapped in "(?: " ... ")".
void* RegExpUnparser::VisitGroup(RegExpGroup* that, void* data) {
  os_ << "(?: ";
  RegExpTree* inner = that->body();
  inner->Accept(this, data);
  os_ << ")";
  return nullptr;
}
|
||||
|
||||
// Prints "(" + direction ("->" for lookahead, "<-" otherwise) + polarity
// (" + " for positive, " - " otherwise) + body + ")".
void* RegExpUnparser::VisitLookaround(RegExpLookaround* that, void* data) {
  os_ << "(";

  if (that->type() == RegExpLookaround::LOOKAHEAD) {
    os_ << "->";
  } else {
    os_ << "<-";
  }

  if (that->is_positive()) {
    os_ << " + ";
  } else {
    os_ << " - ";
  }

  that->body()->Accept(this, data);
  os_ << ")";
  return nullptr;
}
|
||||
|
||||
|
||||
// Prints "(<- <index>)" for a back reference.
void* RegExpUnparser::VisitBackReference(RegExpBackReference* that,
                                         void* data) {
  os_ << "(<- ";
  os_ << that->index();
  os_ << ")";
  return nullptr;
}
|
||||
|
||||
|
||||
// The empty node prints as a single '%' character.
void* RegExpUnparser::VisitEmpty(RegExpEmpty* that, void* data) {
  const char kEmptyMarker = '%';
  os_ << kEmptyMarker;
  return nullptr;
}
|
||||
|
||||
|
||||
std::ostream& RegExpTree::Print(std::ostream& os, Zone* zone) { // NOLINT
|
||||
RegExpUnparser unparser(os, zone);
|
||||
Accept(&unparser, nullptr);
|
||||
return os;
|
||||
}
|
||||
|
||||
|
||||
// min_match_ / max_match_ become the smallest minimum and the largest
// maximum over all alternatives. At least two alternatives are expected.
RegExpDisjunction::RegExpDisjunction(ZoneList<RegExpTree*>* alternatives)
    : alternatives_(alternatives) {
  DCHECK_LT(1, alternatives->length());

  // Seed the bounds from the first alternative, then fold in the rest.
  RegExpTree* head = alternatives->at(0);
  min_match_ = head->min_match();
  max_match_ = head->max_match();

  for (int idx = 1; idx < alternatives->length(); ++idx) {
    RegExpTree* branch = alternatives->at(idx);
    min_match_ = Min(min_match_, branch->min_match());
    max_match_ = Max(max_match_, branch->max_match());
  }
}
|
||||
|
||||
|
||||
static int IncreaseBy(int previous, int increase) {
|
||||
if (RegExpTree::kInfinity - previous < increase) {
|
||||
return RegExpTree::kInfinity;
|
||||
} else {
|
||||
return previous + increase;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// min_match_ / max_match_ are the sums over all nodes, accumulated with
// IncreaseBy. At least two nodes are expected.
RegExpAlternative::RegExpAlternative(ZoneList<RegExpTree*>* nodes)
    : nodes_(nodes) {
  DCHECK_LT(1, nodes->length());
  min_match_ = 0;
  max_match_ = 0;
  for (int idx = 0; idx < nodes->length(); ++idx) {
    RegExpTree* element = nodes->at(idx);
    min_match_ = IncreaseBy(min_match_, element->min_match());
    max_match_ = IncreaseBy(max_match_, element->max_match());
  }
}
|
||||
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
|
@ -0,0 +1,615 @@
|
|||
// Copyright 2016 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_REGEXP_AST_H_
|
||||
#define V8_REGEXP_REGEXP_AST_H_
|
||||
|
||||
#include "regexp/regexp-shim.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// X-macro: expands VISIT once for each tree node kind, passing the kind's
// name without the "RegExp" prefix.
#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
  VISIT(Disjunction)                      \
  VISIT(Alternative)                      \
  VISIT(Assertion)                        \
  VISIT(CharacterClass)                   \
  VISIT(Atom)                             \
  VISIT(Quantifier)                       \
  VISIT(Capture)                          \
  VISIT(Group)                            \
  VISIT(Lookaround)                       \
  VISIT(BackReference)                    \
  VISIT(Empty)                            \
  VISIT(Text)

// Forward-declare class RegExp<Name> for every node kind.
#define DECLARE_TREE_CLASS(Name) class RegExp##Name;
FOR_EACH_REG_EXP_TREE_TYPE(DECLARE_TREE_CLASS)
#undef DECLARE_TREE_CLASS

class RegExpCompiler;
class RegExpNode;
class RegExpTree;
|
||||
|
||||
class RegExpVisitor {
|
||||
public:
|
||||
virtual ~RegExpVisitor() = default;
|
||||
#define MAKE_CASE(Name) \
|
||||
virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
|
||||
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
|
||||
#undef MAKE_CASE
|
||||
};
|
||||
|
||||
|
||||
// A simple closed interval.
|
||||
class Interval {
|
||||
public:
|
||||
Interval() : from_(kNone), to_(kNone - 1) {} // '- 1' for branchless size().
|
||||
Interval(int from, int to) : from_(from), to_(to) {}
|
||||
Interval Union(Interval that) {
|
||||
if (that.from_ == kNone)
|
||||
return *this;
|
||||
else if (from_ == kNone)
|
||||
return that;
|
||||
else
|
||||
return Interval(Min(from_, that.from_), Max(to_, that.to_));
|
||||
}
|
||||
|
||||
bool Contains(int value) { return (from_ <= value) && (value <= to_); }
|
||||
bool is_empty() { return from_ == kNone; }
|
||||
int from() const { return from_; }
|
||||
int to() const { return to_; }
|
||||
int size() const { return to_ - from_ + 1; }
|
||||
|
||||
static Interval Empty() { return Interval(); }
|
||||
|
||||
static constexpr int kNone = -1;
|
||||
|
||||
private:
|
||||
int from_;
|
||||
int to_;
|
||||
};
|
||||
|
||||
|
||||
// Represents code units in the range from from_ to to_, both ends are
|
||||
// inclusive.
|
||||
class CharacterRange {
|
||||
public:
|
||||
CharacterRange() : from_(0), to_(0) {}
|
||||
// For compatibility with the CHECK_OK macro
|
||||
CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT
|
||||
V8_EXPORT_PRIVATE static void AddClassEscape(char type,
|
||||
ZoneList<CharacterRange>* ranges,
|
||||
Zone* zone);
|
||||
// Add class escapes. Add case equivalent closure for \w and \W if necessary.
|
||||
V8_EXPORT_PRIVATE static void AddClassEscape(
|
||||
char type, ZoneList<CharacterRange>* ranges,
|
||||
bool add_unicode_case_equivalents, Zone* zone);
|
||||
static Vector<const int> GetWordBounds();
|
||||
static inline CharacterRange Singleton(uc32 value) {
|
||||
return CharacterRange(value, value);
|
||||
}
|
||||
static inline CharacterRange Range(uc32 from, uc32 to) {
|
||||
DCHECK(0 <= from && to <= String::kMaxCodePoint);
|
||||
DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to));
|
||||
return CharacterRange(from, to);
|
||||
}
|
||||
static inline CharacterRange Everything() {
|
||||
return CharacterRange(0, String::kMaxCodePoint);
|
||||
}
|
||||
static inline ZoneList<CharacterRange>* List(Zone* zone,
|
||||
CharacterRange range) {
|
||||
ZoneList<CharacterRange>* list =
|
||||
new (zone) ZoneList<CharacterRange>(1, zone);
|
||||
list->Add(range, zone);
|
||||
return list;
|
||||
}
|
||||
bool Contains(uc32 i) { return from_ <= i && i <= to_; }
|
||||
uc32 from() const { return from_; }
|
||||
void set_from(uc32 value) { from_ = value; }
|
||||
uc32 to() const { return to_; }
|
||||
void set_to(uc32 value) { to_ = value; }
|
||||
bool is_valid() { return from_ <= to_; }
|
||||
bool IsEverything(uc32 max) { return from_ == 0 && to_ >= max; }
|
||||
bool IsSingleton() { return (from_ == to_); }
|
||||
V8_EXPORT_PRIVATE static void AddCaseEquivalents(
|
||||
Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges,
|
||||
bool is_one_byte);
|
||||
// Whether a range list is in canonical form: Ranges ordered by from value,
|
||||
// and ranges non-overlapping and non-adjacent.
|
||||
V8_EXPORT_PRIVATE static bool IsCanonical(ZoneList<CharacterRange>* ranges);
|
||||
// Convert range list to canonical form. The characters covered by the ranges
|
||||
// will still be the same, but no character is in more than one range, and
|
||||
// adjacent ranges are merged. The resulting list may be shorter than the
|
||||
// original, but cannot be longer.
|
||||
static void Canonicalize(ZoneList<CharacterRange>* ranges);
|
||||
// Negate the contents of a character range in canonical form.
|
||||
static void Negate(ZoneList<CharacterRange>* src,
|
||||
ZoneList<CharacterRange>* dst, Zone* zone);
|
||||
static const int kStartMarker = (1 << 24);
|
||||
static const int kPayloadMask = (1 << 24) - 1;
|
||||
|
||||
private:
|
||||
CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {}
|
||||
|
||||
uc32 from_;
|
||||
uc32 to_;
|
||||
};
|
||||
|
||||
class CharacterSet final {
|
||||
public:
|
||||
explicit CharacterSet(uc16 standard_set_type)
|
||||
: ranges_(nullptr), standard_set_type_(standard_set_type) {}
|
||||
explicit CharacterSet(ZoneList<CharacterRange>* ranges)
|
||||
: ranges_(ranges), standard_set_type_(0) {}
|
||||
ZoneList<CharacterRange>* ranges(Zone* zone);
|
||||
uc16 standard_set_type() const { return standard_set_type_; }
|
||||
void set_standard_set_type(uc16 special_set_type) {
|
||||
standard_set_type_ = special_set_type;
|
||||
}
|
||||
bool is_standard() { return standard_set_type_ != 0; }
|
||||
V8_EXPORT_PRIVATE void Canonicalize();
|
||||
|
||||
private:
|
||||
ZoneList<CharacterRange>* ranges_;
|
||||
// If non-zero, the value represents a standard set (e.g., all whitespace
|
||||
// characters) without having to expand the ranges.
|
||||
uc16 standard_set_type_;
|
||||
};
|
||||
|
||||
class TextElement final {
|
||||
public:
|
||||
enum TextType { ATOM, CHAR_CLASS };
|
||||
|
||||
static TextElement Atom(RegExpAtom* atom);
|
||||
static TextElement CharClass(RegExpCharacterClass* char_class);
|
||||
|
||||
int cp_offset() const { return cp_offset_; }
|
||||
void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
|
||||
int length() const;
|
||||
|
||||
TextType text_type() const { return text_type_; }
|
||||
|
||||
RegExpTree* tree() const { return tree_; }
|
||||
|
||||
RegExpAtom* atom() const {
|
||||
DCHECK(text_type() == ATOM);
|
||||
return reinterpret_cast<RegExpAtom*>(tree());
|
||||
}
|
||||
|
||||
RegExpCharacterClass* char_class() const {
|
||||
DCHECK(text_type() == CHAR_CLASS);
|
||||
return reinterpret_cast<RegExpCharacterClass*>(tree());
|
||||
}
|
||||
|
||||
private:
|
||||
TextElement(TextType text_type, RegExpTree* tree)
|
||||
: cp_offset_(-1), text_type_(text_type), tree_(tree) {}
|
||||
|
||||
int cp_offset_;
|
||||
TextType text_type_;
|
||||
RegExpTree* tree_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpTree : public ZoneObject {
|
||||
public:
|
||||
static const int kInfinity = kMaxInt;
|
||||
virtual ~RegExpTree() = default;
|
||||
virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
|
||||
virtual RegExpNode* ToNode(RegExpCompiler* compiler,
|
||||
RegExpNode* on_success) = 0;
|
||||
virtual bool IsTextElement() { return false; }
|
||||
virtual bool IsAnchoredAtStart() { return false; }
|
||||
virtual bool IsAnchoredAtEnd() { return false; }
|
||||
virtual int min_match() = 0;
|
||||
virtual int max_match() = 0;
|
||||
// Returns the interval of registers used for captures within this
|
||||
// expression.
|
||||
virtual Interval CaptureRegisters() { return Interval::Empty(); }
|
||||
virtual void AppendToText(RegExpText* text, Zone* zone);
|
||||
V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os,
|
||||
Zone* zone); // NOLINT
|
||||
#define MAKE_ASTYPE(Name) \
|
||||
virtual RegExp##Name* As##Name(); \
|
||||
virtual bool Is##Name();
|
||||
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
|
||||
#undef MAKE_ASTYPE
|
||||
};
|
||||
|
||||
|
||||
class RegExpDisjunction final : public RegExpTree {
|
||||
public:
|
||||
explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpDisjunction* AsDisjunction() override;
|
||||
Interval CaptureRegisters() override;
|
||||
bool IsDisjunction() override;
|
||||
bool IsAnchoredAtStart() override;
|
||||
bool IsAnchoredAtEnd() override;
|
||||
int min_match() override { return min_match_; }
|
||||
int max_match() override { return max_match_; }
|
||||
ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
|
||||
|
||||
private:
|
||||
bool SortConsecutiveAtoms(RegExpCompiler* compiler);
|
||||
void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
|
||||
void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
|
||||
ZoneList<RegExpTree*>* alternatives_;
|
||||
int min_match_;
|
||||
int max_match_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpAlternative final : public RegExpTree {
|
||||
public:
|
||||
explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpAlternative* AsAlternative() override;
|
||||
Interval CaptureRegisters() override;
|
||||
bool IsAlternative() override;
|
||||
bool IsAnchoredAtStart() override;
|
||||
bool IsAnchoredAtEnd() override;
|
||||
int min_match() override { return min_match_; }
|
||||
int max_match() override { return max_match_; }
|
||||
ZoneList<RegExpTree*>* nodes() { return nodes_; }
|
||||
|
||||
private:
|
||||
ZoneList<RegExpTree*>* nodes_;
|
||||
int min_match_;
|
||||
int max_match_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpAssertion final : public RegExpTree {
|
||||
public:
|
||||
enum AssertionType {
|
||||
START_OF_LINE = 0,
|
||||
START_OF_INPUT = 1,
|
||||
END_OF_LINE = 2,
|
||||
END_OF_INPUT = 3,
|
||||
BOUNDARY = 4,
|
||||
NON_BOUNDARY = 5,
|
||||
LAST_TYPE = NON_BOUNDARY,
|
||||
};
|
||||
RegExpAssertion(AssertionType type, JSRegExp::Flags flags)
|
||||
: assertion_type_(type), flags_(flags) {}
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpAssertion* AsAssertion() override;
|
||||
bool IsAssertion() override;
|
||||
bool IsAnchoredAtStart() override;
|
||||
bool IsAnchoredAtEnd() override;
|
||||
int min_match() override { return 0; }
|
||||
int max_match() override { return 0; }
|
||||
AssertionType assertion_type() const { return assertion_type_; }
|
||||
JSRegExp::Flags flags() const { return flags_; }
|
||||
|
||||
private:
|
||||
const AssertionType assertion_type_;
|
||||
const JSRegExp::Flags flags_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpCharacterClass final : public RegExpTree {
|
||||
public:
|
||||
// NEGATED: The character class is negated and should match everything but
|
||||
// the specified ranges.
|
||||
// CONTAINS_SPLIT_SURROGATE: The character class contains part of a split
|
||||
// surrogate and should not be unicode-desugared (crbug.com/641091).
|
||||
enum Flag {
|
||||
NEGATED = 1 << 0,
|
||||
CONTAINS_SPLIT_SURROGATE = 1 << 1,
|
||||
};
|
||||
using CharacterClassFlags = base::Flags<Flag>;
|
||||
|
||||
RegExpCharacterClass(
|
||||
Zone* zone, ZoneList<CharacterRange>* ranges, JSRegExp::Flags flags,
|
||||
CharacterClassFlags character_class_flags = CharacterClassFlags())
|
||||
: set_(ranges),
|
||||
flags_(flags),
|
||||
character_class_flags_(character_class_flags) {
|
||||
// Convert the empty set of ranges to the negated Everything() range.
|
||||
if (ranges->is_empty()) {
|
||||
ranges->Add(CharacterRange::Everything(), zone);
|
||||
character_class_flags_ ^= NEGATED;
|
||||
}
|
||||
}
|
||||
RegExpCharacterClass(uc16 type, JSRegExp::Flags flags)
|
||||
: set_(type),
|
||||
flags_(flags),
|
||||
character_class_flags_(CharacterClassFlags()) {}
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpCharacterClass* AsCharacterClass() override;
|
||||
bool IsCharacterClass() override;
|
||||
bool IsTextElement() override { return true; }
|
||||
int min_match() override { return 1; }
|
||||
// The character class may match two code units for unicode regexps.
|
||||
// TODO(yangguo): we should split this class for usage in TextElement, and
|
||||
// make max_match() dependent on the character class content.
|
||||
int max_match() override { return 2; }
|
||||
void AppendToText(RegExpText* text, Zone* zone) override;
|
||||
CharacterSet character_set() { return set_; }
|
||||
// TODO(lrn): Remove need for complex version if is_standard that
|
||||
// recognizes a mangled standard set and just do { return set_.is_special(); }
|
||||
bool is_standard(Zone* zone);
|
||||
// Returns a value representing the standard character set if is_standard()
|
||||
// returns true.
|
||||
// Currently used values are:
|
||||
// s : unicode whitespace
|
||||
// S : unicode non-whitespace
|
||||
// w : ASCII word character (digit, letter, underscore)
|
||||
// W : non-ASCII word character
|
||||
// d : ASCII digit
|
||||
// D : non-ASCII digit
|
||||
// . : non-newline
|
||||
// * : All characters, for advancing unanchored regexp
|
||||
uc16 standard_type() const { return set_.standard_set_type(); }
|
||||
ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
|
||||
bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; }
|
||||
JSRegExp::Flags flags() const { return flags_; }
|
||||
bool contains_split_surrogate() const {
|
||||
return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
|
||||
}
|
||||
|
||||
private:
|
||||
CharacterSet set_;
|
||||
const JSRegExp::Flags flags_;
|
||||
CharacterClassFlags character_class_flags_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpAtom final : public RegExpTree {
|
||||
public:
|
||||
explicit RegExpAtom(Vector<const uc16> data, JSRegExp::Flags flags)
|
||||
: data_(data), flags_(flags) {}
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpAtom* AsAtom() override;
|
||||
bool IsAtom() override;
|
||||
bool IsTextElement() override { return true; }
|
||||
int min_match() override { return data_.length(); }
|
||||
int max_match() override { return data_.length(); }
|
||||
void AppendToText(RegExpText* text, Zone* zone) override;
|
||||
Vector<const uc16> data() { return data_; }
|
||||
int length() { return data_.length(); }
|
||||
JSRegExp::Flags flags() const { return flags_; }
|
||||
bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
|
||||
|
||||
private:
|
||||
Vector<const uc16> data_;
|
||||
const JSRegExp::Flags flags_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpText final : public RegExpTree {
|
||||
public:
|
||||
explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {}
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpText* AsText() override;
|
||||
bool IsText() override;
|
||||
bool IsTextElement() override { return true; }
|
||||
int min_match() override { return length_; }
|
||||
int max_match() override { return length_; }
|
||||
void AppendToText(RegExpText* text, Zone* zone) override;
|
||||
void AddElement(TextElement elm, Zone* zone) {
|
||||
elements_.Add(elm, zone);
|
||||
length_ += elm.length();
|
||||
}
|
||||
ZoneList<TextElement>* elements() { return &elements_; }
|
||||
|
||||
private:
|
||||
ZoneList<TextElement> elements_;
|
||||
int length_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpQuantifier final : public RegExpTree {
|
||||
public:
|
||||
enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };
|
||||
RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body)
|
||||
: body_(body),
|
||||
min_(min),
|
||||
max_(max),
|
||||
quantifier_type_(type) {
|
||||
if (min > 0 && body->min_match() > kInfinity / min) {
|
||||
min_match_ = kInfinity;
|
||||
} else {
|
||||
min_match_ = min * body->min_match();
|
||||
}
|
||||
if (max > 0 && body->max_match() > kInfinity / max) {
|
||||
max_match_ = kInfinity;
|
||||
} else {
|
||||
max_match_ = max * body->max_match();
|
||||
}
|
||||
}
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body,
|
||||
RegExpCompiler* compiler, RegExpNode* on_success,
|
||||
bool not_at_start = false);
|
||||
RegExpQuantifier* AsQuantifier() override;
|
||||
Interval CaptureRegisters() override;
|
||||
bool IsQuantifier() override;
|
||||
int min_match() override { return min_match_; }
|
||||
int max_match() override { return max_match_; }
|
||||
int min() { return min_; }
|
||||
int max() { return max_; }
|
||||
bool is_possessive() { return quantifier_type_ == POSSESSIVE; }
|
||||
bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
|
||||
bool is_greedy() { return quantifier_type_ == GREEDY; }
|
||||
RegExpTree* body() { return body_; }
|
||||
|
||||
private:
|
||||
RegExpTree* body_;
|
||||
int min_;
|
||||
int max_;
|
||||
int min_match_;
|
||||
int max_match_;
|
||||
QuantifierType quantifier_type_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpCapture final : public RegExpTree {
|
||||
public:
|
||||
explicit RegExpCapture(int index)
|
||||
: body_(nullptr),
|
||||
index_(index),
|
||||
min_match_(0),
|
||||
max_match_(0),
|
||||
name_(nullptr) {}
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
static RegExpNode* ToNode(RegExpTree* body, int index,
|
||||
RegExpCompiler* compiler, RegExpNode* on_success);
|
||||
RegExpCapture* AsCapture() override;
|
||||
bool IsAnchoredAtStart() override;
|
||||
bool IsAnchoredAtEnd() override;
|
||||
Interval CaptureRegisters() override;
|
||||
bool IsCapture() override;
|
||||
int min_match() override { return min_match_; }
|
||||
int max_match() override { return max_match_; }
|
||||
RegExpTree* body() { return body_; }
|
||||
void set_body(RegExpTree* body) {
|
||||
body_ = body;
|
||||
min_match_ = body->min_match();
|
||||
max_match_ = body->max_match();
|
||||
}
|
||||
int index() const { return index_; }
|
||||
const ZoneVector<uc16>* name() const { return name_; }
|
||||
void set_name(const ZoneVector<uc16>* name) { name_ = name; }
|
||||
static int StartRegister(int index) { return index * 2; }
|
||||
static int EndRegister(int index) { return index * 2 + 1; }
|
||||
|
||||
private:
|
||||
RegExpTree* body_;
|
||||
int index_;
|
||||
int min_match_;
|
||||
int max_match_;
|
||||
const ZoneVector<uc16>* name_;
|
||||
};
|
||||
|
||||
class RegExpGroup final : public RegExpTree {
|
||||
public:
|
||||
explicit RegExpGroup(RegExpTree* body)
|
||||
: body_(body),
|
||||
min_match_(body->min_match()),
|
||||
max_match_(body->max_match()) {}
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler,
|
||||
RegExpNode* on_success) override {
|
||||
return body_->ToNode(compiler, on_success);
|
||||
}
|
||||
RegExpGroup* AsGroup() override;
|
||||
bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); }
|
||||
bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); }
|
||||
bool IsGroup() override;
|
||||
int min_match() override { return min_match_; }
|
||||
int max_match() override { return max_match_; }
|
||||
Interval CaptureRegisters() override { return body_->CaptureRegisters(); }
|
||||
RegExpTree* body() { return body_; }
|
||||
|
||||
private:
|
||||
RegExpTree* body_;
|
||||
int min_match_;
|
||||
int max_match_;
|
||||
};
|
||||
|
||||
class RegExpLookaround final : public RegExpTree {
|
||||
public:
|
||||
enum Type { LOOKAHEAD, LOOKBEHIND };
|
||||
|
||||
RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count,
|
||||
int capture_from, Type type)
|
||||
: body_(body),
|
||||
is_positive_(is_positive),
|
||||
capture_count_(capture_count),
|
||||
capture_from_(capture_from),
|
||||
type_(type) {}
|
||||
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpLookaround* AsLookaround() override;
|
||||
Interval CaptureRegisters() override;
|
||||
bool IsLookaround() override;
|
||||
bool IsAnchoredAtStart() override;
|
||||
int min_match() override { return 0; }
|
||||
int max_match() override { return 0; }
|
||||
RegExpTree* body() { return body_; }
|
||||
bool is_positive() { return is_positive_; }
|
||||
int capture_count() { return capture_count_; }
|
||||
int capture_from() { return capture_from_; }
|
||||
Type type() { return type_; }
|
||||
|
||||
class Builder {
|
||||
public:
|
||||
Builder(bool is_positive, RegExpNode* on_success,
|
||||
int stack_pointer_register, int position_register,
|
||||
int capture_register_count = 0, int capture_register_start = 0);
|
||||
RegExpNode* on_match_success() { return on_match_success_; }
|
||||
RegExpNode* ForMatch(RegExpNode* match);
|
||||
|
||||
private:
|
||||
bool is_positive_;
|
||||
RegExpNode* on_match_success_;
|
||||
RegExpNode* on_success_;
|
||||
int stack_pointer_register_;
|
||||
int position_register_;
|
||||
};
|
||||
|
||||
private:
|
||||
RegExpTree* body_;
|
||||
bool is_positive_;
|
||||
int capture_count_;
|
||||
int capture_from_;
|
||||
Type type_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpBackReference final : public RegExpTree {
|
||||
public:
|
||||
explicit RegExpBackReference(JSRegExp::Flags flags)
|
||||
: capture_(nullptr), name_(nullptr), flags_(flags) {}
|
||||
RegExpBackReference(RegExpCapture* capture, JSRegExp::Flags flags)
|
||||
: capture_(capture), name_(nullptr), flags_(flags) {}
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpBackReference* AsBackReference() override;
|
||||
bool IsBackReference() override;
|
||||
int min_match() override { return 0; }
|
||||
// The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
|
||||
// recursion, we give up. Ignorance is bliss.
|
||||
int max_match() override { return kInfinity; }
|
||||
int index() { return capture_->index(); }
|
||||
RegExpCapture* capture() { return capture_; }
|
||||
void set_capture(RegExpCapture* capture) { capture_ = capture; }
|
||||
const ZoneVector<uc16>* name() const { return name_; }
|
||||
void set_name(const ZoneVector<uc16>* name) { name_ = name; }
|
||||
|
||||
private:
|
||||
RegExpCapture* capture_;
|
||||
const ZoneVector<uc16>* name_;
|
||||
const JSRegExp::Flags flags_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpEmpty final : public RegExpTree {
|
||||
public:
|
||||
RegExpEmpty() = default;
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpEmpty* AsEmpty() override;
|
||||
bool IsEmpty() override;
|
||||
int min_match() override { return 0; }
|
||||
int max_match() override { return 0; }
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_REGEXP_AST_H_
|
|
@ -0,0 +1,55 @@
|
|||
// Copyright 2008-2009 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_REGEXP_BYTECODE_GENERATOR_INL_H_
|
||||
#define V8_REGEXP_REGEXP_BYTECODE_GENERATOR_INL_H_
|
||||
|
||||
#include "regexp/regexp-bytecode-generator.h"
|
||||
|
||||
#include "regexp/regexp-bytecodes.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
void RegExpBytecodeGenerator::Emit(uint32_t byte, uint32_t twenty_four_bits) {
|
||||
uint32_t word = ((twenty_four_bits << BYTECODE_SHIFT) | byte);
|
||||
DCHECK(pc_ <= buffer_.length());
|
||||
if (pc_ + 3 >= buffer_.length()) {
|
||||
Expand();
|
||||
}
|
||||
*reinterpret_cast<uint32_t*>(buffer_.begin() + pc_) = word;
|
||||
pc_ += 4;
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::Emit16(uint32_t word) {
|
||||
DCHECK(pc_ <= buffer_.length());
|
||||
if (pc_ + 1 >= buffer_.length()) {
|
||||
Expand();
|
||||
}
|
||||
*reinterpret_cast<uint16_t*>(buffer_.begin() + pc_) = word;
|
||||
pc_ += 2;
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::Emit8(uint32_t word) {
|
||||
DCHECK(pc_ <= buffer_.length());
|
||||
if (pc_ == buffer_.length()) {
|
||||
Expand();
|
||||
}
|
||||
*reinterpret_cast<unsigned char*>(buffer_.begin() + pc_) = word;
|
||||
pc_ += 1;
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::Emit32(uint32_t word) {
|
||||
DCHECK(pc_ <= buffer_.length());
|
||||
if (pc_ + 3 >= buffer_.length()) {
|
||||
Expand();
|
||||
}
|
||||
*reinterpret_cast<uint32_t*>(buffer_.begin() + pc_) = word;
|
||||
pc_ += 4;
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_REGEXP_BYTECODE_GENERATOR_INL_H_
|
|
@ -0,0 +1,395 @@
|
|||
// Copyright 2008-2009 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "regexp/regexp-bytecode-generator.h"
|
||||
|
||||
#include "regexp/regexp-bytecode-generator-inl.h"
|
||||
#include "regexp/regexp-bytecode-peephole.h"
|
||||
#include "regexp/regexp-bytecodes.h"
|
||||
#include "regexp/regexp-macro-assembler.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// Starts with a freshly allocated 1024-byte buffer, pc_ at 0, no pending
// "advance current position" instruction to merge, and an empty jump edge
// table allocated in |zone|.
RegExpBytecodeGenerator::RegExpBytecodeGenerator(Isolate* isolate, Zone* zone)
    : RegExpMacroAssembler(isolate, zone),
      buffer_(Vector<byte>::New(1024)),
      pc_(0),
      advance_current_end_(kInvalidPC),
      jump_edges_(zone),
      isolate_(isolate) {}
|
||||
|
||||
// Marks a still-linked backtrack label as unused, then disposes of the
// bytecode buffer.
RegExpBytecodeGenerator::~RegExpBytecodeGenerator() {
  if (backtrack_.is_linked()) {
    backtrack_.Unuse();
  }
  buffer_.Dispose();
}
|
||||
|
||||
// Identifies this assembler as the bytecode implementation.
RegExpBytecodeGenerator::IrregexpImplementation
RegExpBytecodeGenerator::Implementation() {
  return kBytecodeImplementation;
}
|
||||
|
||||
// Binds |l| to the current pc_. If the label is linked, each slot on its
// chain stores the position of the next linked slot, with 0 ending the chain.
// Every such slot is overwritten with pc_ and recorded as a jump edge.
void RegExpBytecodeGenerator::Bind(Label* l) {
  // A bound label ends any chance to merge a preceding advance with a goto.
  advance_current_end_ = kInvalidPC;
  DCHECK(!l->is_bound());
  if (l->is_linked()) {
    for (int pos = l->pos(); pos != 0;) {
      auto* const slot = buffer_.begin() + pos;
      // Read the next link before the slot is patched.
      const int next = *reinterpret_cast<int32_t*>(slot);
      *reinterpret_cast<uint32_t*>(slot) = pc_;
      jump_edges_.emplace(pos, pc_);
      pos = next;
    }
  }
  l->bind_to(pc_);
}
|
||||
|
||||
// Emits a 32-bit jump operand for |l|; a null |l| means the backtrack label.
// For a bound label the operand is its position, and the jump edge is
// recorded. Otherwise the operand is the label's previous link position (0 if
// it had none), and the label is re-linked to this operand's location.
void RegExpBytecodeGenerator::EmitOrLink(Label* l) {
  Label* const target = (l != nullptr) ? l : &backtrack_;
  if (target->is_bound()) {
    const int bound_pos = target->pos();
    jump_edges_.emplace(pc_, bound_pos);
    Emit32(bound_pos);
    return;
  }
  const int previous_link = target->is_linked() ? target->pos() : 0;
  target->link_to(pc_);
  Emit32(previous_link);
}
|
||||
|
||||
// Emits BC_POP_REGISTER with |register_index| as its operand.
void RegExpBytecodeGenerator::PopRegister(int register_index) {
  // The index must lie in [0, kMaxRegister].
  DCHECK_LE(0, register_index);
  DCHECK_GE(kMaxRegister, register_index);

  Emit(BC_POP_REGISTER, register_index);
}
|
||||
|
||||
void RegExpBytecodeGenerator::PushRegister(int register_index,
|
||||
StackCheckFlag check_stack_limit) {
|
||||
DCHECK_LE(0, register_index);
|
||||
DCHECK_GE(kMaxRegister, register_index);
|
||||
Emit(BC_PUSH_REGISTER, register_index);
|
||||
}
|
||||
|
||||
// Emits BC_SET_REGISTER_TO_CP for |register_index|, followed by a 32-bit
// word holding |cp_offset|.
void RegExpBytecodeGenerator::WriteCurrentPositionToRegister(int register_index,
                                                             int cp_offset) {
  // The index must lie in [0, kMaxRegister].
  DCHECK_LE(0, register_index);
  DCHECK_GE(kMaxRegister, register_index);

  Emit(BC_SET_REGISTER_TO_CP, register_index);
  Emit32(cp_offset);  // Current position offset.
}
|
||||
|
||||
void RegExpBytecodeGenerator::ClearRegisters(int reg_from, int reg_to) {
|
||||
DCHECK(reg_from <= reg_to);
|
||||
for (int reg = reg_from; reg <= reg_to; reg++) {
|
||||
SetRegister(reg, -1);
|
||||
}
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::ReadCurrentPositionFromRegister(
|
||||
int register_index) {
|
||||
DCHECK_LE(0, register_index);
|
||||
DCHECK_GE(kMaxRegister, register_index);
|
||||
Emit(BC_SET_CP_TO_REGISTER, register_index);
|
||||
}
|
||||
|
||||
// Emits BC_SET_REGISTER_TO_SP with |register_index| as its operand.
void RegExpBytecodeGenerator::WriteStackPointerToRegister(int register_index) {
  // The index must lie in [0, kMaxRegister].
  DCHECK_LE(0, register_index);
  DCHECK_GE(kMaxRegister, register_index);

  Emit(BC_SET_REGISTER_TO_SP, register_index);
}
|
||||
|
||||
// Emits BC_SET_SP_TO_REGISTER with |register_index| as its operand.
void RegExpBytecodeGenerator::ReadStackPointerFromRegister(int register_index) {
  // The index must lie in [0, kMaxRegister].
  DCHECK_LE(0, register_index);
  DCHECK_GE(kMaxRegister, register_index);

  Emit(BC_SET_SP_TO_REGISTER, register_index);
}
|
||||
|
||||
// Emits BC_SET_CURRENT_POSITION_FROM_END with |by| as its operand; |by| must
// fit in 24 unsigned bits.
void RegExpBytecodeGenerator::SetCurrentPositionFromEnd(int by) {
  DCHECK(is_uint24(by));

  Emit(BC_SET_CURRENT_POSITION_FROM_END, by);
}
|
||||
|
||||
// Emits BC_SET_REGISTER for |register_index|, followed by a 32-bit word
// holding the value |to|.
void RegExpBytecodeGenerator::SetRegister(int register_index, int to) {
  // The index must lie in [0, kMaxRegister].
  DCHECK_LE(0, register_index);
  DCHECK_GE(kMaxRegister, register_index);

  Emit(BC_SET_REGISTER, register_index);
  Emit32(to);
}
|
||||
|
||||
// Emits BC_ADVANCE_REGISTER for |register_index|, followed by a 32-bit word
// holding the increment |by|.
void RegExpBytecodeGenerator::AdvanceRegister(int register_index, int by) {
  // The index must lie in [0, kMaxRegister].
  DCHECK_LE(0, register_index);
  DCHECK_GE(kMaxRegister, register_index);

  Emit(BC_ADVANCE_REGISTER, register_index);
  Emit32(by);
}
|
||||
|
||||
// Emits BC_POP_CP with a zero operand.
void RegExpBytecodeGenerator::PopCurrentPosition() {
  Emit(BC_POP_CP, 0);
}
|
||||
|
||||
// Emits BC_PUSH_CP with a zero operand.
void RegExpBytecodeGenerator::PushCurrentPosition() {
  Emit(BC_PUSH_CP, 0);
}
|
||||
|
||||
// Emits BC_POP_BT with a zero operand.
void RegExpBytecodeGenerator::Backtrack() {
  Emit(BC_POP_BT, 0);
}
|
||||
|
||||
// Emits a jump to |l|. When the instruction emitted immediately before this
// point was the one recorded by AdvanceCurrentPosition(), pc_ is rewound to
// its start and a single BC_ADVANCE_CP_AND_GOTO is emitted over it instead.
void RegExpBytecodeGenerator::GoTo(Label* l) {
  if (advance_current_end_ != pc_) {
    // Regular goto.
    Emit(BC_GOTO, 0);
    EmitOrLink(l);
    return;
  }

  // Combine advance current and goto.
  pc_ = advance_current_start_;
  Emit(BC_ADVANCE_CP_AND_GOTO, advance_current_offset_);
  EmitOrLink(l);
  advance_current_end_ = kInvalidPC;
}
|
||||
|
||||
// Emits BC_PUSH_BT followed by the jump operand for |l|.
void RegExpBytecodeGenerator::PushBacktrack(Label* l) {
  Emit(BC_PUSH_BT, 0);
  EmitOrLink(l);
}
|
||||
|
||||
// Emits BC_SUCCEED. Always returns false.
bool RegExpBytecodeGenerator::Succeed() {
  Emit(BC_SUCCEED, 0);
  // Restart matching for global regexp not supported.
  return false;
}
|
||||
|
||||
// Emits BC_FAIL with a zero operand.
void RegExpBytecodeGenerator::Fail() {
  Emit(BC_FAIL, 0);
}
|
||||
|
||||
// Emits BC_ADVANCE_CP with |by| as its operand. The start pc, the offset and
// the end pc of the emitted instruction are recorded so that a GoTo() issued
// right afterwards can merge with it.
void RegExpBytecodeGenerator::AdvanceCurrentPosition(int by) {
  // |by| must lie in [kMinCPOffset, kMaxCPOffset].
  DCHECK_LE(kMinCPOffset, by);
  DCHECK_GE(kMaxCPOffset, by);

  advance_current_offset_ = by;
  advance_current_start_ = pc_;
  Emit(BC_ADVANCE_CP, by);
  advance_current_end_ = pc_;
}
|
||||
|
||||
// Emits BC_CHECK_GREEDY followed by the jump operand for
// |on_tos_equals_current_position|.
void RegExpBytecodeGenerator::CheckGreedyLoop(
    Label* on_tos_equals_current_position) {
  Emit(BC_CHECK_GREEDY, 0);
  EmitOrLink(on_tos_equals_current_position);
}
|
||||
|
||||
void RegExpBytecodeGenerator::LoadCurrentCharacterImpl(int cp_offset,
|
||||
Label* on_failure,
|
||||
bool check_bounds,
|
||||
int characters,
|
||||
int eats_at_least) {
|
||||
DCHECK_GE(eats_at_least, characters);
|
||||
if (eats_at_least > characters && check_bounds) {
|
||||
DCHECK(is_uint24(cp_offset + eats_at_least));
|
||||
Emit(BC_CHECK_CURRENT_POSITION, cp_offset + eats_at_least);
|
||||
EmitOrLink(on_failure);
|
||||
check_bounds = false; // Load below doesn't need to check.
|
||||
}
|
||||
|
||||
DCHECK_LE(kMinCPOffset, cp_offset);
|
||||
DCHECK_GE(kMaxCPOffset, cp_offset);
|
||||
int bytecode;
|
||||
if (check_bounds) {
|
||||
if (characters == 4) {
|
||||
bytecode = BC_LOAD_4_CURRENT_CHARS;
|
||||
} else if (characters == 2) {
|
||||
bytecode = BC_LOAD_2_CURRENT_CHARS;
|
||||
} else {
|
||||
DCHECK_EQ(1, characters);
|
||||
bytecode = BC_LOAD_CURRENT_CHAR;
|
||||
}
|
||||
} else {
|
||||
if (characters == 4) {
|
||||
bytecode = BC_LOAD_4_CURRENT_CHARS_UNCHECKED;
|
||||
} else if (characters == 2) {
|
||||
bytecode = BC_LOAD_2_CURRENT_CHARS_UNCHECKED;
|
||||
} else {
|
||||
DCHECK_EQ(1, characters);
|
||||
bytecode = BC_LOAD_CURRENT_CHAR_UNCHECKED;
|
||||
}
|
||||
}
|
||||
Emit(bytecode, cp_offset);
|
||||
if (check_bounds) EmitOrLink(on_failure);
|
||||
}
|
||||
|
||||
// Emits BC_CHECK_LT with |limit| as its operand, followed by the jump
// operand for |on_less|.
void RegExpBytecodeGenerator::CheckCharacterLT(uc16 limit, Label* on_less) {
  Emit(BC_CHECK_LT, limit);
  EmitOrLink(on_less);
}
|
||||
|
||||
// Emits BC_CHECK_GT with |limit| as its operand, followed by the jump
// operand for |on_greater|.
void RegExpBytecodeGenerator::CheckCharacterGT(uc16 limit, Label* on_greater) {
  Emit(BC_CHECK_GT, limit);
  EmitOrLink(on_greater);
}
|
||||
|
||||
void RegExpBytecodeGenerator::CheckCharacter(uint32_t c, Label* on_equal) {
|
||||
if (c > MAX_FIRST_ARG) {
|
||||
Emit(BC_CHECK_4_CHARS, 0);
|
||||
Emit32(c);
|
||||
} else {
|
||||
Emit(BC_CHECK_CHAR, c);
|
||||
}
|
||||
EmitOrLink(on_equal);
|
||||
}
|
||||
|
||||
// Emits BC_CHECK_AT_START with |cp_offset| as its operand, followed by the
// jump operand for |on_at_start|.
void RegExpBytecodeGenerator::CheckAtStart(int cp_offset, Label* on_at_start) {
  Emit(BC_CHECK_AT_START, cp_offset);
  EmitOrLink(on_at_start);
}
|
||||
|
||||
void RegExpBytecodeGenerator::CheckNotAtStart(int cp_offset,
|
||||
Label* on_not_at_start) {
|
||||
Emit(BC_CHECK_NOT_AT_START, cp_offset);
|
||||
EmitOrLink(on_not_at_start);
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::CheckNotCharacter(uint32_t c,
|
||||
Label* on_not_equal) {
|
||||
if (c > MAX_FIRST_ARG) {
|
||||
Emit(BC_CHECK_NOT_4_CHARS, 0);
|
||||
Emit32(c);
|
||||
} else {
|
||||
Emit(BC_CHECK_NOT_CHAR, c);
|
||||
}
|
||||
EmitOrLink(on_not_equal);
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::CheckCharacterAfterAnd(uint32_t c, uint32_t mask,
|
||||
Label* on_equal) {
|
||||
if (c > MAX_FIRST_ARG) {
|
||||
Emit(BC_AND_CHECK_4_CHARS, 0);
|
||||
Emit32(c);
|
||||
} else {
|
||||
Emit(BC_AND_CHECK_CHAR, c);
|
||||
}
|
||||
Emit32(mask);
|
||||
EmitOrLink(on_equal);
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::CheckNotCharacterAfterAnd(uint32_t c,
|
||||
uint32_t mask,
|
||||
Label* on_not_equal) {
|
||||
if (c > MAX_FIRST_ARG) {
|
||||
Emit(BC_AND_CHECK_NOT_4_CHARS, 0);
|
||||
Emit32(c);
|
||||
} else {
|
||||
Emit(BC_AND_CHECK_NOT_CHAR, c);
|
||||
}
|
||||
Emit32(mask);
|
||||
EmitOrLink(on_not_equal);
|
||||
}
|
||||
|
||||
// Emits BC_MINUS_AND_CHECK_NOT_CHAR with |c| as its operand, then |minus| and
// |mask| as two 16-bit values, then the jump operand for |on_not_equal|.
void RegExpBytecodeGenerator::CheckNotCharacterAfterMinusAnd(
    uc16 c, uc16 minus, uc16 mask, Label* on_not_equal) {
  Emit(BC_MINUS_AND_CHECK_NOT_CHAR, c);
  // Two 16-bit operands occupy one 32-bit slot.
  Emit16(minus);
  Emit16(mask);
  EmitOrLink(on_not_equal);
}
|
||||
|
||||
// Emits BC_CHECK_CHAR_IN_RANGE, then |from| and |to| as two 16-bit values,
// then the jump operand for |on_in_range|.
void RegExpBytecodeGenerator::CheckCharacterInRange(uc16 from, uc16 to,
                                                    Label* on_in_range) {
  Emit(BC_CHECK_CHAR_IN_RANGE, 0);
  // Range bounds: two 16-bit operands occupy one 32-bit slot.
  Emit16(from);
  Emit16(to);
  EmitOrLink(on_in_range);
}
|
||||
|
||||
// Emits BC_CHECK_CHAR_NOT_IN_RANGE, then |from| and |to| as two 16-bit
// values, then the jump operand for |on_not_in_range|.
void RegExpBytecodeGenerator::CheckCharacterNotInRange(uc16 from, uc16 to,
                                                       Label* on_not_in_range) {
  Emit(BC_CHECK_CHAR_NOT_IN_RANGE, 0);
  // Range bounds: two 16-bit operands occupy one 32-bit slot.
  Emit16(from);
  Emit16(to);
  EmitOrLink(on_not_in_range);
}
|
||||
|
||||
// Emits BC_CHECK_BIT_IN_TABLE, the jump operand for |on_bit_set|, and then
// the table itself packed as a bitmap: each group of kBitsPerByte consecutive
// table entries becomes one byte, where bit j is set if entry (i + j) is
// non-zero.
void RegExpBytecodeGenerator::CheckBitInTable(Handle<ByteArray> table,
                                              Label* on_bit_set) {
  Emit(BC_CHECK_BIT_IN_TABLE, 0);
  EmitOrLink(on_bit_set);
  for (int base = 0; base < kTableSize; base += kBitsPerByte) {
    int packed = 0;
    for (int bit = 0; bit < kBitsPerByte; bit++) {
      if (table->get(base + bit) != 0) {
        packed |= 1 << bit;
      }
    }
    Emit8(packed);
  }
}
|
||||
|
||||
void RegExpBytecodeGenerator::CheckNotBackReference(int start_reg,
|
||||
bool read_backward,
|
||||
Label* on_not_equal) {
|
||||
DCHECK_LE(0, start_reg);
|
||||
DCHECK_GE(kMaxRegister, start_reg);
|
||||
Emit(read_backward ? BC_CHECK_NOT_BACK_REF_BACKWARD : BC_CHECK_NOT_BACK_REF,
|
||||
start_reg);
|
||||
EmitOrLink(on_not_equal);
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::CheckNotBackReferenceIgnoreCase(
|
||||
int start_reg, bool read_backward, Label* on_not_equal) {
|
||||
DCHECK_LE(0, start_reg);
|
||||
DCHECK_GE(kMaxRegister, start_reg);
|
||||
Emit(read_backward ? BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD
|
||||
: BC_CHECK_NOT_BACK_REF_NO_CASE,
|
||||
start_reg);
|
||||
EmitOrLink(on_not_equal);
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::IfRegisterLT(int register_index, int comparand,
|
||||
Label* on_less_than) {
|
||||
DCHECK_LE(0, register_index);
|
||||
DCHECK_GE(kMaxRegister, register_index);
|
||||
Emit(BC_CHECK_REGISTER_LT, register_index);
|
||||
Emit32(comparand);
|
||||
EmitOrLink(on_less_than);
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::IfRegisterGE(int register_index, int comparand,
|
||||
Label* on_greater_or_equal) {
|
||||
DCHECK_LE(0, register_index);
|
||||
DCHECK_GE(kMaxRegister, register_index);
|
||||
Emit(BC_CHECK_REGISTER_GE, register_index);
|
||||
Emit32(comparand);
|
||||
EmitOrLink(on_greater_or_equal);
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::IfRegisterEqPos(int register_index,
|
||||
Label* on_eq) {
|
||||
DCHECK_LE(0, register_index);
|
||||
DCHECK_GE(kMaxRegister, register_index);
|
||||
Emit(BC_CHECK_REGISTER_EQ_POS, register_index);
|
||||
EmitOrLink(on_eq);
|
||||
}
|
||||
|
||||
// Finishes code generation and returns the bytecode as a ByteArray.
//
// The |backtrack_| label is bound here and a BC_POP_BT instruction is emitted
// at that position. The buffer is then either handed to the peephole
// optimizer (when FLAG_regexp_peephole_optimization is set) or copied
// verbatim into a freshly allocated ByteArray.
Handle<HeapObject> RegExpBytecodeGenerator::GetCode(Handle<String> source) {
  Bind(&backtrack_);
  Emit(BC_POP_BT, 0);

  if (!FLAG_regexp_peephole_optimization) {
    // Unoptimized path: a plain copy of the emitted bytes.
    Handle<ByteArray> copy = isolate_->factory()->NewByteArray(length());
    Copy(copy->GetDataStartAddress());
    return copy;
  }

  Handle<ByteArray> optimized =
      RegExpBytecodePeepholeOptimization::OptimizeBytecode(
          isolate_, zone(), source, buffer_.begin(), length(), jump_edges_);
  return optimized;
}
|
||||
|
||||
// Number of bytecode bytes emitted so far, i.e. the current program counter.
int RegExpBytecodeGenerator::length() {
  return pc_;
}
|
||||
|
||||
// Copies the length() bytes emitted so far from the internal buffer to |a|.
// No size is passed in, so the caller must supply a destination of at least
// length() bytes.
void RegExpBytecodeGenerator::Copy(byte* a) {
  MemCopy(a, buffer_.begin(), length());
}
|
||||
|
||||
void RegExpBytecodeGenerator::Expand() {
|
||||
Vector<byte> old_buffer = buffer_;
|
||||
buffer_ = Vector<byte>::New(old_buffer.length() * 2);
|
||||
MemCopy(buffer_.begin(), old_buffer.begin(), old_buffer.length());
|
||||
old_buffer.Dispose();
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
|
@ -0,0 +1,119 @@
|
|||
// Copyright 2012 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_REGEXP_BYTECODE_GENERATOR_H_
|
||||
#define V8_REGEXP_REGEXP_BYTECODE_GENERATOR_H_
|
||||
|
||||
#include "regexp/regexp-macro-assembler.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// An assembler/generator for the Irregexp byte code.
|
||||
// RegExpMacroAssembler implementation that produces Irregexp bytecode.
//
// Instructions are emitted into |buffer_|, which is owned by this object and
// replaced by a larger buffer through Expand(). |pc_| is the number of bytes
// emitted so far (see length()). GetCode() returns the finished bytecode.
//
// Every method inherited from RegExpMacroAssembler is marked `override` so the
// compiler rejects any declaration that stops matching the base class.
class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler {
 public:
  // Creates a generator. Unlike what earlier documentation here suggested,
  // there is no buffer-size parameter: the constructor takes only the isolate
  // and the zone.
  RegExpBytecodeGenerator(Isolate* isolate, Zone* zone);
  ~RegExpBytecodeGenerator() override;

  // The byte-code interpreter checks on each push anyway.
  int stack_limit_slack() override { return 1; }
  bool CanReadUnaligned() override { return false; }

  void Bind(Label* label) override;
  void AdvanceCurrentPosition(int by) override;  // Signed cp change.
  void PopCurrentPosition() override;
  void PushCurrentPosition() override;
  void Backtrack() override;
  void GoTo(Label* label) override;
  void PushBacktrack(Label* label) override;
  bool Succeed() override;
  void Fail() override;
  void PopRegister(int register_index) override;
  void PushRegister(int register_index,
                    StackCheckFlag check_stack_limit) override;
  void AdvanceRegister(int reg, int by) override;  // r[reg] += by.
  void SetCurrentPositionFromEnd(int by) override;
  void SetRegister(int register_index, int to) override;
  void WriteCurrentPositionToRegister(int reg, int cp_offset) override;
  void ClearRegisters(int reg_from, int reg_to) override;
  void ReadCurrentPositionFromRegister(int reg) override;
  void WriteStackPointerToRegister(int reg) override;
  void ReadStackPointerFromRegister(int reg) override;
  void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
                                bool check_bounds, int characters,
                                int eats_at_least) override;
  void CheckCharacter(unsigned c, Label* on_equal) override;
  void CheckCharacterAfterAnd(unsigned c, unsigned mask,
                              Label* on_equal) override;
  void CheckCharacterGT(uc16 limit, Label* on_greater) override;
  void CheckCharacterLT(uc16 limit, Label* on_less) override;
  void CheckGreedyLoop(Label* on_tos_equals_current_position) override;
  void CheckAtStart(int cp_offset, Label* on_at_start) override;
  void CheckNotAtStart(int cp_offset, Label* on_not_at_start) override;
  void CheckNotCharacter(unsigned c, Label* on_not_equal) override;
  void CheckNotCharacterAfterAnd(unsigned c, unsigned mask,
                                 Label* on_not_equal) override;
  void CheckNotCharacterAfterMinusAnd(uc16 c, uc16 minus, uc16 mask,
                                      Label* on_not_equal) override;
  void CheckCharacterInRange(uc16 from, uc16 to, Label* on_in_range) override;
  void CheckCharacterNotInRange(uc16 from, uc16 to,
                                Label* on_not_in_range) override;
  void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) override;
  void CheckNotBackReference(int start_reg, bool read_backward,
                             Label* on_no_match) override;
  void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward,
                                       Label* on_no_match) override;
  void IfRegisterLT(int register_index, int comparand, Label* if_lt) override;
  void IfRegisterGE(int register_index, int comparand, Label* if_ge) override;
  void IfRegisterEqPos(int register_index, Label* if_eq) override;

  IrregexpImplementation Implementation() override;
  Handle<HeapObject> GetCode(Handle<String> source) override;

 private:
  // Replaces |buffer_| with a larger buffer holding the same contents.
  void Expand();
  // Code and bitmap emission.
  inline void EmitOrLink(Label* label);
  inline void Emit32(uint32_t x);
  inline void Emit16(uint32_t x);
  inline void Emit8(uint32_t x);
  inline void Emit(uint32_t bc, uint32_t arg);
  // Bytecode buffer.
  int length();
  void Copy(byte* a);

  // The buffer into which code is generated.
  Vector<byte> buffer_;
  // The program counter.
  int pc_;
  Label backtrack_;

  int advance_current_start_;
  int advance_current_offset_;
  int advance_current_end_;

  // Stores jump edges emitted for the bytecode (used by
  // RegExpBytecodePeepholeOptimization).
  // Key: jump source (offset in buffer_ where jump destination is stored).
  // Value: jump destination (offset in buffer_ to jump to).
  ZoneUnorderedMap<int, int> jump_edges_;

  Isolate* isolate_;

  static const int kInvalidPC = -1;

  DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpBytecodeGenerator);
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_REGEXP_BYTECODE_GENERATOR_H_
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,30 @@
|
|||
// Copyright 2019 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_REGEXP_BYTECODE_PEEPHOLE_H_
|
||||
#define V8_REGEXP_REGEXP_BYTECODE_PEEPHOLE_H_
|
||||
|
||||
#include "regexp/regexp-shim.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
class ByteArray;
|
||||
|
||||
// Peephole optimization for regexp interpreter bytecode.
|
||||
// Pre-defined bytecode sequences occurring in the bytecode generated by the
|
||||
// RegExpBytecodeGenerator can be optimized into a single bytecode.
|
||||
class RegExpBytecodePeepholeOptimization : public AllStatic {
|
||||
public:
|
||||
// Performs peephole optimization on the given bytecode and returns the
|
||||
// optimized bytecode.
|
||||
static Handle<ByteArray> OptimizeBytecode(
|
||||
Isolate* isolate, Zone* zone, Handle<String> source, const byte* bytecode,
|
||||
int length, const ZoneUnorderedMap<int, int>& jump_edges);
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_REGEXP_BYTECODE_PEEPHOLE_H_
|
|
@ -0,0 +1,45 @@
|
|||
// Copyright 2019 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "regexp/regexp-bytecodes.h"
|
||||
|
||||
#include <cctype>
|
||||
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// Prints a single instruction located at |pc|: its mnemonic, then every byte
// of the instruction in hex (opcode byte included), then the bytes after the
// opcode once more as characters, with '.' standing in for non-printable
// ones. |code_base| is not read by this function.
//
// The opcode byte is used directly as an index into the name and length
// tables, so |pc| must point at a defined bytecode.
void RegExpBytecodeDisassembleSingle(const byte* code_base, const byte* pc) {
  PrintF("%s", RegExpBytecodeName(*pc));

  const int instruction_length = RegExpBytecodeLength(*pc);

  // Args and the bytecode as hex.
  for (int i = 0; i < instruction_length; i++) {
    PrintF(", %02x", pc[i]);
  }
  PrintF(" ");

  // Args as ascii.
  for (int i = 1; i < instruction_length; i++) {
    const unsigned char arg_byte = pc[i];
    PrintF("%c", std::isprint(arg_byte) ? arg_byte : '.');
  }
  PrintF("\n");
}
|
||||
|
||||
void RegExpBytecodeDisassemble(const byte* code_base, int length,
|
||||
const char* pattern) {
|
||||
PrintF("[generated bytecode for regexp pattern: '%s']\n", pattern);
|
||||
|
||||
ptrdiff_t offset = 0;
|
||||
|
||||
while (offset < length) {
|
||||
const byte* const pc = code_base + offset;
|
||||
PrintF("%p %4" V8PRIxPTRDIFF " ", pc, offset);
|
||||
RegExpBytecodeDisassembleSingle(code_base, pc);
|
||||
offset += RegExpBytecodeLength(*pc);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
|
@ -0,0 +1,251 @@
|
|||
// Copyright 2011 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_REGEXP_BYTECODES_H_
|
||||
#define V8_REGEXP_REGEXP_BYTECODES_H_
|
||||
|
||||
#include "regexp/regexp-shim.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// Maximum number of bytecodes that will be used (next power of 2 of actually
|
||||
// defined bytecodes).
|
||||
// All slots between the last actually defined bytecode and maximum id will be
|
||||
// filled with BREAKs, indicating an invalid operation. This way using
|
||||
// BYTECODE_MASK guarantees no OOB access to the dispatch table.
|
||||
constexpr int kRegExpPaddedBytecodeCount = 1 << 6;
|
||||
constexpr int BYTECODE_MASK = kRegExpPaddedBytecodeCount - 1;
|
||||
// The first argument is packed in with the byte code in one word, but so it
|
||||
// has 24 bits, but it can be positive and negative so only use 23 bits for
|
||||
// positive values.
|
||||
const unsigned int MAX_FIRST_ARG = 0x7fffffu;
|
||||
const int BYTECODE_SHIFT = 8;
|
||||
STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK);
|
||||
|
||||
// TODO(pthier): Argument offsets of bytecodes should be easily accessible by
|
||||
// name or at least by position.
|
||||
#define BYTECODE_ITERATOR(V) \
|
||||
V(BREAK, 0, 4) /* bc8 */ \
|
||||
V(PUSH_CP, 1, 4) /* bc8 pad24 */ \
|
||||
V(PUSH_BT, 2, 8) /* bc8 pad24 offset32 */ \
|
||||
V(PUSH_REGISTER, 3, 4) /* bc8 reg_idx24 */ \
|
||||
V(SET_REGISTER_TO_CP, 4, 8) /* bc8 reg_idx24 offset32 */ \
|
||||
V(SET_CP_TO_REGISTER, 5, 4) /* bc8 reg_idx24 */ \
|
||||
V(SET_REGISTER_TO_SP, 6, 4) /* bc8 reg_idx24 */ \
|
||||
V(SET_SP_TO_REGISTER, 7, 4) /* bc8 reg_idx24 */ \
|
||||
V(SET_REGISTER, 8, 8) /* bc8 reg_idx24 value32 */ \
|
||||
V(ADVANCE_REGISTER, 9, 8) /* bc8 reg_idx24 value32 */ \
|
||||
V(POP_CP, 10, 4) /* bc8 pad24 */ \
|
||||
V(POP_BT, 11, 4) /* bc8 pad24 */ \
|
||||
V(POP_REGISTER, 12, 4) /* bc8 reg_idx24 */ \
|
||||
V(FAIL, 13, 4) /* bc8 pad24 */ \
|
||||
V(SUCCEED, 14, 4) /* bc8 pad24 */ \
|
||||
V(ADVANCE_CP, 15, 4) /* bc8 offset24 */ \
|
||||
/* Jump to another bytecode given its offset. */ \
|
||||
/* Bit Layout: */ \
|
||||
/* 0x00 - 0x07: 0x10 (fixed) Bytecode */ \
|
||||
/* 0x08 - 0x1F: 0x00 (unused) Padding */ \
|
||||
/* 0x20 - 0x3F: Address of bytecode to jump to */ \
|
||||
V(GOTO, 16, 8) /* bc8 pad24 addr32 */ \
|
||||
/* Check if offset is in range and load character at given offset. */ \
|
||||
/* Bit Layout: */ \
|
||||
/* 0x00 - 0x07: 0x11 (fixed) Bytecode */ \
|
||||
/* 0x08 - 0x1F: Offset from current position */ \
|
||||
/* 0x20 - 0x3F: Address of bytecode when load is out of range */ \
|
||||
V(LOAD_CURRENT_CHAR, 17, 8) /* bc8 offset24 addr32 */ \
|
||||
/* Load character at given offset without range checks. */ \
|
||||
/* Bit Layout: */ \
|
||||
/* 0x00 - 0x07: 0x12 (fixed) Bytecode */ \
|
||||
/* 0x08 - 0x1F: Offset from current position */ \
|
||||
V(LOAD_CURRENT_CHAR_UNCHECKED, 18, 4) /* bc8 offset24 */ \
|
||||
V(LOAD_2_CURRENT_CHARS, 19, 8) /* bc8 offset24 addr32 */ \
|
||||
V(LOAD_2_CURRENT_CHARS_UNCHECKED, 20, 4) /* bc8 offset24 */ \
|
||||
V(LOAD_4_CURRENT_CHARS, 21, 8) /* bc8 offset24 addr32 */ \
|
||||
V(LOAD_4_CURRENT_CHARS_UNCHECKED, 22, 4) /* bc8 offset24 */ \
|
||||
V(CHECK_4_CHARS, 23, 12) /* bc8 pad24 uint32 addr32 */ \
|
||||
/* Check if current character is equal to a given character */ \
|
||||
/* Bit Layout: */ \
|
||||
/* 0x00 - 0x07: 0x18 (fixed) Bytecode */ \
|
||||
/* 0x08 - 0x0F: 0x00 (unused) Padding */ \
|
||||
/* 0x10 - 0x1F: Character to check */ \
|
||||
/* 0x20 - 0x3F: Address of bytecode when matched */ \
|
||||
V(CHECK_CHAR, 24, 8) /* bc8 pad8 uint16 addr32 */ \
|
||||
V(CHECK_NOT_4_CHARS, 25, 12) /* bc8 pad24 uint32 addr32 */ \
|
||||
V(CHECK_NOT_CHAR, 26, 8) /* bc8 pad8 uint16 addr32 */ \
|
||||
V(AND_CHECK_4_CHARS, 27, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
|
||||
/* Checks if the current character combined with mask (bitwise and) */ \
|
||||
/* matches a character (e.g. used when two characters in a disjunction */ \
|
||||
/* differ by only a single bit */ \
|
||||
/* Bit Layout: */ \
|
||||
/* 0x00 - 0x07: 0x1c (fixed) Bytecode */ \
|
||||
/* 0x08 - 0x0F: 0x00 (unused) Padding */ \
|
||||
/* 0x10 - 0x1F: Character to match against (after mask applied) */ \
|
||||
/* 0x20 - 0x3F: Bitmask bitwise and combined with current character */ \
|
||||
/* 0x40 - 0x5F: Address of bytecode when matched */ \
|
||||
V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
|
||||
V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
|
||||
V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
|
||||
V(MINUS_AND_CHECK_NOT_CHAR, 31, 12) /* bc8 pad8 uc16 uc16 uc16 addr32 */ \
|
||||
V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
|
||||
V(CHECK_CHAR_NOT_IN_RANGE, 33, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
|
||||
/* Checks if the current character matches any of the characters encoded */ \
|
||||
/* in a bit table. Similar to/inspired by boyer moore string search */ \
|
||||
/* Bit Layout: */ \
|
||||
/* 0x00 - 0x07: 0x22 (fixed) Bytecode */ \
|
||||
/* 0x08 - 0x1F: 0x00 (unused) Padding */ \
|
||||
/* 0x20 - 0x3F: Address of bytecode when bit is set */ \
|
||||
/* 0x40 - 0xBF: Bit table */ \
|
||||
V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \
|
||||
V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \
|
||||
V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \
|
||||
V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \
|
||||
V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \
|
||||
V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) /* UNUSED */ \
|
||||
V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \
|
||||
V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */ \
|
||||
V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) /* UNUSED */ \
|
||||
V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */ \
|
||||
V(CHECK_REGISTER_LT, 44, 12) /* bc8 reg_idx24 value32 addr32 */ \
|
||||
V(CHECK_REGISTER_GE, 45, 12) /* bc8 reg_idx24 value32 addr32 */ \
|
||||
V(CHECK_REGISTER_EQ_POS, 46, 8) /* bc8 reg_idx24 addr32 */ \
|
||||
V(CHECK_AT_START, 47, 8) /* bc8 pad24 addr32 */ \
|
||||
V(CHECK_NOT_AT_START, 48, 8) /* bc8 offset24 addr32 */ \
|
||||
/* Checks if the current position matches top of backtrack stack */ \
|
||||
/* Bit Layout: */ \
|
||||
/* 0x00 - 0x07: 0x31 (fixed) Bytecode */ \
|
||||
/* 0x08 - 0x1F: 0x00 (unused) Padding */ \
|
||||
/* 0x20 - 0x3F: Address of bytecode when current matches tos */ \
|
||||
V(CHECK_GREEDY, 49, 8) /* bc8 pad24 addr32 */ \
|
||||
/* Advance character pointer by given offset and jump to another bytecode.*/ \
|
||||
/* Bit Layout: */ \
|
||||
/* 0x00 - 0x07: 0x32 (fixed) Bytecode */ \
|
||||
/* 0x08 - 0x1F: Number of characters to advance */ \
|
||||
/* 0x20 - 0x3F: Address of bytecode to jump to */ \
|
||||
V(ADVANCE_CP_AND_GOTO, 50, 8) /* bc8 offset24 addr32 */ \
|
||||
V(SET_CURRENT_POSITION_FROM_END, 51, 4) /* bc8 idx24 */ \
|
||||
/* Checks if current position + given offset is in range. */ \
|
||||
/* Bit Layout: */ \
|
||||
/* 0x00 - 0x07: 0x34 (fixed) Bytecode */ \
|
||||
/* 0x08 - 0x1F: Offset from current position */ \
|
||||
/* 0x20 - 0x3F: Address of bytecode when position is out of range */ \
|
||||
V(CHECK_CURRENT_POSITION, 52, 8) /* bc8 idx24 addr32 */ \
|
||||
/* Combination of: */ \
|
||||
/* LOAD_CURRENT_CHAR, CHECK_BIT_IN_TABLE and ADVANCE_CP_AND_GOTO */ \
|
||||
/* Emitted by RegExpBytecodePeepholeOptimization. */ \
|
||||
/* Bit Layout: */ \
|
||||
/* 0x00 - 0x07 0x35 (fixed) Bytecode */ \
|
||||
/* 0x08 - 0x1F Load character offset from current position */ \
|
||||
/* 0x20 - 0x3F Number of characters to advance */ \
|
||||
/* 0x40 - 0xBF Bit Table */ \
|
||||
/* 0xC0 - 0xDF Address of bytecode when character is matched */ \
|
||||
/* 0xE0 - 0xFF Address of bytecode when no match */ \
|
||||
V(SKIP_UNTIL_BIT_IN_TABLE, 53, 32) \
|
||||
/* Combination of: */ \
|
||||
/* CHECK_CURRENT_POSITION, LOAD_CURRENT_CHAR_UNCHECKED, AND_CHECK_CHAR */ \
|
||||
/* and ADVANCE_CP_AND_GOTO */ \
|
||||
/* Emitted by RegExpBytecodePeepholeOptimization. */ \
|
||||
/* Bit Layout: */ \
|
||||
/* 0x00 - 0x07 0x36 (fixed) Bytecode */ \
|
||||
/* 0x08 - 0x1F Load character offset from current position */ \
|
||||
/* 0x20 - 0x2F Number of characters to advance */ \
|
||||
/* 0x30 - 0x3F Character to match against (after mask applied) */ \
|
||||
/* 0x40 - 0x5F: Bitmask bitwise and combined with current character */ \
|
||||
/* 0x60 - 0x7F Minimum number of characters this pattern consumes */ \
|
||||
/* 0x80 - 0x9F Address of bytecode when character is matched */ \
|
||||
/* 0xA0 - 0xBF Address of bytecode when no match */ \
|
||||
V(SKIP_UNTIL_CHAR_AND, 54, 24) \
|
||||
/* Combination of: */ \
|
||||
/* LOAD_CURRENT_CHAR, CHECK_CHAR and ADVANCE_CP_AND_GOTO */ \
|
||||
/* Emitted by RegExpBytecodePeepholeOptimization. */ \
|
||||
/* Bit Layout: */ \
|
||||
/* 0x00 - 0x07 0x37 (fixed) Bytecode */ \
|
||||
/* 0x08 - 0x1F Load character offset from current position */ \
|
||||
/* 0x20 - 0x2F Number of characters to advance */ \
|
||||
/* 0x30 - 0x3F Character to match */ \
|
||||
/* 0x40 - 0x5F Address of bytecode when character is matched */ \
|
||||
/* 0x60 - 0x7F Address of bytecode when no match */ \
|
||||
V(SKIP_UNTIL_CHAR, 55, 16) \
|
||||
/* Combination of: */ \
|
||||
/* CHECK_CURRENT_POSITION, LOAD_CURRENT_CHAR_UNCHECKED, CHECK_CHAR */ \
|
||||
/* and ADVANCE_CP_AND_GOTO */ \
|
||||
/* Emitted by RegExpBytecodePeepholeOptimization. */ \
|
||||
/* Bit Layout: */ \
|
||||
/* 0x00 - 0x07 0x38 (fixed) Bytecode */ \
|
||||
/* 0x08 - 0x1F Load character offset from current position */ \
|
||||
/* 0x20 - 0x2F Number of characters to advance */ \
|
||||
/* 0x30 - 0x3F Character to match */ \
|
||||
/* 0x40 - 0x5F Minimum number of characters this pattern consumes */ \
|
||||
/* 0x60 - 0x7F Address of bytecode when character is matched */ \
|
||||
/* 0x80 - 0x9F Address of bytecode when no match */ \
|
||||
V(SKIP_UNTIL_CHAR_POS_CHECKED, 56, 20) \
|
||||
/* Combination of: */ \
|
||||
/* LOAD_CURRENT_CHAR, CHECK_CHAR, CHECK_CHAR and ADVANCE_CP_AND_GOTO */ \
|
||||
/* Emitted by RegExpBytecodePeepholeOptimization. */ \
|
||||
/* Bit Layout: */ \
|
||||
/* 0x00 - 0x07 0x39 (fixed) Bytecode */ \
|
||||
/* 0x08 - 0x1F Load character offset from current position */ \
|
||||
/* 0x20 - 0x3F Number of characters to advance */ \
|
||||
/* 0x40 - 0x4F Character to match */ \
|
||||
/* 0x50 - 0x5F Other Character to match */ \
|
||||
/* 0x60 - 0x7F Address of bytecode when either character is matched */ \
|
||||
/* 0x80 - 0x9F Address of bytecode when no match */ \
|
||||
V(SKIP_UNTIL_CHAR_OR_CHAR, 57, 20) \
|
||||
/* Combination of: */ \
|
||||
/* LOAD_CURRENT_CHAR, CHECK_GT, CHECK_BIT_IN_TABLE, GOTO and */ \
|
||||
/* ADVANCE_CP_AND_GOTO */ \
|
||||
/* Emitted by RegExpBytecodePeepholeOptimization. */ \
|
||||
/* Bit Layout: */ \
|
||||
/* 0x00 - 0x07 0x3A (fixed) Bytecode */ \
|
||||
/* 0x08 - 0x1F Load character offset from current position */ \
|
||||
/* 0x20 - 0x2F Number of characters to advance */ \
|
||||
/* 0x30 - 0x3F Character to check if it is less than current char */ \
|
||||
/* 0x40 - 0xBF Bit Table */ \
|
||||
/* 0xC0 - 0xDF Address of bytecode when character is matched */ \
|
||||
/* 0xE0 - 0xFF Address of bytecode when no match */ \
|
||||
V(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE, 58, 32)
|
||||
|
||||
#define COUNT(...) +1
|
||||
static constexpr int kRegExpBytecodeCount = BYTECODE_ITERATOR(COUNT);
|
||||
#undef COUNT
|
||||
|
||||
// Just making sure we assigned values above properly. They should be
|
||||
// contiguous, strictly increasing, and start at 0.
|
||||
// TODO(jgruber): Do not explicitly assign values, instead generate them
|
||||
// implicitly from the list order.
|
||||
STATIC_ASSERT(kRegExpBytecodeCount == 59);
|
||||
|
||||
#define DECLARE_BYTECODES(name, code, length) \
|
||||
static constexpr int BC_##name = code;
|
||||
BYTECODE_ITERATOR(DECLARE_BYTECODES)
|
||||
#undef DECLARE_BYTECODES
|
||||
|
||||
static constexpr int kRegExpBytecodeLengths[] = {
|
||||
#define DECLARE_BYTECODE_LENGTH(name, code, length) length,
|
||||
BYTECODE_ITERATOR(DECLARE_BYTECODE_LENGTH)
|
||||
#undef DECLARE_BYTECODE_LENGTH
|
||||
};
|
||||
|
||||
inline constexpr int RegExpBytecodeLength(int bytecode) {
|
||||
return kRegExpBytecodeLengths[bytecode];
|
||||
}
|
||||
|
||||
static const char* const kRegExpBytecodeNames[] = {
|
||||
#define DECLARE_BYTECODE_NAME(name, ...) #name,
|
||||
BYTECODE_ITERATOR(DECLARE_BYTECODE_NAME)
|
||||
#undef DECLARE_BYTECODE_NAME
|
||||
};
|
||||
|
||||
inline const char* RegExpBytecodeName(int bytecode) {
|
||||
return kRegExpBytecodeNames[bytecode];
|
||||
}
|
||||
|
||||
void RegExpBytecodeDisassembleSingle(const byte* code_base, const byte* pc);
|
||||
void RegExpBytecodeDisassemble(const byte* code_base, int length,
|
||||
const char* pattern);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_REGEXP_BYTECODES_H_
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,621 @@
|
|||
// Copyright 2019 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_REGEXP_COMPILER_H_
|
||||
#define V8_REGEXP_REGEXP_COMPILER_H_
|
||||
|
||||
#include <bitset>
|
||||
|
||||
#include "regexp/regexp-nodes.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
class DynamicBitSet;
|
||||
class Isolate;
|
||||
|
||||
namespace regexp_compiler_constants {
|
||||
|
||||
// The '2' variant is has inclusive from and exclusive to.
|
||||
// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
|
||||
// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
|
||||
constexpr uc32 kRangeEndMarker = 0x110000;
|
||||
constexpr int kSpaceRanges[] = {
|
||||
'\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
|
||||
0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
|
||||
0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
|
||||
constexpr int kSpaceRangeCount = arraysize(kSpaceRanges);
|
||||
|
||||
constexpr int kWordRanges[] = {'0', '9' + 1, 'A', 'Z' + 1, '_',
|
||||
'_' + 1, 'a', 'z' + 1, kRangeEndMarker};
|
||||
constexpr int kWordRangeCount = arraysize(kWordRanges);
|
||||
constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
|
||||
constexpr int kDigitRangeCount = arraysize(kDigitRanges);
|
||||
constexpr int kSurrogateRanges[] = {kLeadSurrogateStart,
|
||||
kLeadSurrogateStart + 1, kRangeEndMarker};
|
||||
constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges);
|
||||
constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D, 0x000E,
|
||||
0x2028, 0x202A, kRangeEndMarker};
|
||||
constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
|
||||
|
||||
// More makes code generation slower, less makes V8 benchmark score lower.
|
||||
constexpr int kMaxLookaheadForBoyerMoore = 8;
|
||||
// In a 3-character pattern you can maximally step forwards 3 characters
|
||||
// at a time, which is not always enough to pay for the extra logic.
|
||||
constexpr int kPatternTooShortForBoyerMoore = 2;
|
||||
|
||||
} // namespace regexp_compiler_constants
|
||||
|
||||
// True when the kIgnoreCase bit is set in |flags|.
inline bool IgnoreCase(JSRegExp::Flags flags) {
  const bool ignore_case = (flags & JSRegExp::kIgnoreCase) != 0;
  return ignore_case;
}
|
||||
|
||||
// True when the kUnicode bit is set in |flags|.
inline bool IsUnicode(JSRegExp::Flags flags) {
  const bool unicode = (flags & JSRegExp::kUnicode) != 0;
  return unicode;
}
|
||||
|
||||
// True when the kSticky bit is set in |flags|.
inline bool IsSticky(JSRegExp::Flags flags) {
  const bool sticky = (flags & JSRegExp::kSticky) != 0;
  return sticky;
}
|
||||
|
||||
// True when the kGlobal bit is set in |flags|.
inline bool IsGlobal(JSRegExp::Flags flags) {
  const bool global = (flags & JSRegExp::kGlobal) != 0;
  return global;
}
|
||||
|
||||
// True when the kDotAll bit is set in |flags|.
inline bool DotAll(JSRegExp::Flags flags) {
  const bool dot_all = (flags & JSRegExp::kDotAll) != 0;
  return dot_all;
}
|
||||
|
||||
// True when the kMultiline bit is set in |flags|.
inline bool Multiline(JSRegExp::Flags flags) {
  const bool multiline = (flags & JSRegExp::kMultiline) != 0;
  return multiline;
}
|
||||
|
||||
// True only when both the unicode and the ignore-case flags are set. In that
// combination ICU has to be used to find the closure over case equivalents.
inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
  if (!IsUnicode(flags)) return false;
  return IgnoreCase(flags);
}
|
||||
|
||||
// Details of a quick mask-compare check that can look ahead in the
|
||||
// input stream.
|
||||
class QuickCheckDetails {
|
||||
public:
|
||||
QuickCheckDetails()
|
||||
: characters_(0), mask_(0), value_(0), cannot_match_(false) {}
|
||||
explicit QuickCheckDetails(int characters)
|
||||
: characters_(characters), mask_(0), value_(0), cannot_match_(false) {}
|
||||
bool Rationalize(bool one_byte);
|
||||
// Merge in the information from another branch of an alternation.
|
||||
void Merge(QuickCheckDetails* other, int from_index);
|
||||
// Advance the current position by some amount.
|
||||
void Advance(int by, bool one_byte);
|
||||
void Clear();
|
||||
bool cannot_match() { return cannot_match_; }
|
||||
void set_cannot_match() { cannot_match_ = true; }
|
||||
struct Position {
|
||||
Position() : mask(0), value(0), determines_perfectly(false) {}
|
||||
uc16 mask;
|
||||
uc16 value;
|
||||
bool determines_perfectly;
|
||||
};
|
||||
int characters() { return characters_; }
|
||||
void set_characters(int characters) { characters_ = characters; }
|
||||
Position* positions(int index) {
|
||||
DCHECK_LE(0, index);
|
||||
DCHECK_GT(characters_, index);
|
||||
return positions_ + index;
|
||||
}
|
||||
uint32_t mask() { return mask_; }
|
||||
uint32_t value() { return value_; }
|
||||
|
||||
private:
|
||||
// How many characters do we have quick check information from. This is
|
||||
// the same for all branches of a choice node.
|
||||
int characters_;
|
||||
Position positions_[4];
|
||||
// These values are the condensate of the above array after Rationalize().
|
||||
uint32_t mask_;
|
||||
uint32_t value_;
|
||||
// If set to true, there is no way this quick check can match at all.
|
||||
// E.g., if it requires to be at the start of the input, and isn't.
|
||||
bool cannot_match_;
|
||||
};
|
||||
|
||||
// Improve the speed that we scan for an initial point where a non-anchored
|
||||
// regexp can match by using a Boyer-Moore-like table. This is done by
|
||||
// identifying non-greedy non-capturing loops in the nodes that eat any
|
||||
// character one at a time. For example in the middle of the regexp
|
||||
// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
|
||||
// inserted at the start of any non-anchored regexp.
|
||||
//
|
||||
// When we have found such a loop we look ahead in the nodes to find the set of
|
||||
// characters that can come at given distances. For example for the regexp
|
||||
// /.?foo/ we know that there are at least 3 characters ahead of us, and the
|
||||
// sets of characters that can occur are [any, [f, o], [o]]. We find a range in
|
||||
// the lookahead info where the set of characters is reasonably constrained. In
|
||||
// our example this is from index 1 to 2 (0 is not constrained). We can now
|
||||
// look 3 characters ahead and if we don't find one of [f, o] (the union of
|
||||
// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
|
||||
//
|
||||
// For Unicode input strings we do the same, but modulo 128.
|
||||
//
|
||||
// We also look at the first string fed to the regexp and use that to get a hint
|
||||
// of the character frequencies in the inputs. This affects the assessment of
|
||||
// whether the set of characters is 'reasonably constrained'.
|
||||
//
|
||||
// We also have another lookahead mechanism (called quick check in the code),
|
||||
// which uses a wide load of multiple characters followed by a mask and compare
|
||||
// to determine whether a match is possible at this point.
|
||||
// Two-bit lattice: one bit records "in", the other "out". Having both bits
// set (or neither being decidable) is kLatticeUnknown.
enum ContainedInLattice {
  kNotYet = 0,
  kLatticeIn = 1,
  kLatticeOut = 2,
  kLatticeUnknown = 3  // Can also mean both in and out.
};

// Joins two lattice values by taking the bitwise union of their bits.
inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
  const int joined = static_cast<int>(a) | static_cast<int>(b);
  return static_cast<ContainedInLattice>(joined);
}
|
||||
|
||||
class BoyerMoorePositionInfo : public ZoneObject {
|
||||
public:
|
||||
bool at(int i) const { return map_[i]; }
|
||||
|
||||
static constexpr int kMapSize = 128;
|
||||
static constexpr int kMask = kMapSize - 1;
|
||||
|
||||
int map_count() const { return map_count_; }
|
||||
|
||||
void Set(int character);
|
||||
void SetInterval(const Interval& interval);
|
||||
void SetAll();
|
||||
|
||||
bool is_non_word() { return w_ == kLatticeOut; }
|
||||
bool is_word() { return w_ == kLatticeIn; }
|
||||
|
||||
using Bitset = std::bitset<kMapSize>;
|
||||
Bitset raw_bitset() const { return map_; }
|
||||
|
||||
private:
|
||||
Bitset map_;
|
||||
int map_count_ = 0; // Number of set bits in the map.
|
||||
ContainedInLattice w_ = kNotYet; // The \w character class.
|
||||
};
|
||||
|
||||
class BoyerMooreLookahead : public ZoneObject {
|
||||
public:
|
||||
BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
|
||||
|
||||
int length() { return length_; }
|
||||
int max_char() { return max_char_; }
|
||||
RegExpCompiler* compiler() { return compiler_; }
|
||||
|
||||
int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); }
|
||||
|
||||
BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
|
||||
|
||||
void Set(int map_number, int character) {
|
||||
if (character > max_char_) return;
|
||||
BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
|
||||
info->Set(character);
|
||||
}
|
||||
|
||||
void SetInterval(int map_number, const Interval& interval) {
|
||||
if (interval.from() > max_char_) return;
|
||||
BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
|
||||
if (interval.to() > max_char_) {
|
||||
info->SetInterval(Interval(interval.from(), max_char_));
|
||||
} else {
|
||||
info->SetInterval(interval);
|
||||
}
|
||||
}
|
||||
|
||||
void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); }
|
||||
|
||||
void SetRest(int from_map) {
|
||||
for (int i = from_map; i < length_; i++) SetAll(i);
|
||||
}
|
||||
void EmitSkipInstructions(RegExpMacroAssembler* masm);
|
||||
|
||||
private:
|
||||
// This is the value obtained by EatsAtLeast. If we do not have at least this
|
||||
// many characters left in the sample string then the match is bound to fail.
|
||||
// Therefore it is OK to read a character this far ahead of the current match
|
||||
// point.
|
||||
int length_;
|
||||
RegExpCompiler* compiler_;
|
||||
// 0xff for Latin1, 0xffff for UTF-16.
|
||||
int max_char_;
|
||||
ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
|
||||
|
||||
int GetSkipTable(int min_lookahead, int max_lookahead,
|
||||
Handle<ByteArray> boolean_skip_table);
|
||||
bool FindWorthwhileInterval(int* from, int* to);
|
||||
int FindBestInterval(int max_number_of_chars, int old_biggest_points,
|
||||
int* from, int* to);
|
||||
};
|
||||
|
||||
// There are many ways to generate code for a node. This class encapsulates
|
||||
// the current way we should be generating. In other words it encapsulates
|
||||
// the current state of the code generator. The effect of this is that we
|
||||
// generate code for paths that the matcher can take through the regular
|
||||
// expression. A given node in the regexp can be code-generated several times
|
||||
// as it can be part of several traces. For example for the regexp:
|
||||
// /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
|
||||
// of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
|
||||
// to match foo is generated only once (the traces have a common prefix). The
|
||||
// code to store the capture is deferred and generated (twice) after the places
|
||||
// where baz has been matched.
|
||||
class Trace {
|
||||
public:
|
||||
// A value for a property that is either known to be true, know to be false,
|
||||
// or not known.
|
||||
enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 };
|
||||
|
||||
class DeferredAction {
|
||||
public:
|
||||
DeferredAction(ActionNode::ActionType action_type, int reg)
|
||||
: action_type_(action_type), reg_(reg), next_(nullptr) {}
|
||||
DeferredAction* next() { return next_; }
|
||||
bool Mentions(int reg);
|
||||
int reg() { return reg_; }
|
||||
ActionNode::ActionType action_type() { return action_type_; }
|
||||
|
||||
private:
|
||||
ActionNode::ActionType action_type_;
|
||||
int reg_;
|
||||
DeferredAction* next_;
|
||||
friend class Trace;
|
||||
};
|
||||
|
||||
class DeferredCapture : public DeferredAction {
|
||||
public:
|
||||
DeferredCapture(int reg, bool is_capture, Trace* trace)
|
||||
: DeferredAction(ActionNode::STORE_POSITION, reg),
|
||||
cp_offset_(trace->cp_offset()),
|
||||
is_capture_(is_capture) {}
|
||||
int cp_offset() { return cp_offset_; }
|
||||
bool is_capture() { return is_capture_; }
|
||||
|
||||
private:
|
||||
int cp_offset_;
|
||||
bool is_capture_;
|
||||
void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
|
||||
};
|
||||
|
||||
class DeferredSetRegisterForLoop : public DeferredAction {
|
||||
public:
|
||||
DeferredSetRegisterForLoop(int reg, int value)
|
||||
: DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg),
|
||||
value_(value) {}
|
||||
int value() { return value_; }
|
||||
|
||||
private:
|
||||
int value_;
|
||||
};
|
||||
|
||||
class DeferredClearCaptures : public DeferredAction {
|
||||
public:
|
||||
explicit DeferredClearCaptures(Interval range)
|
||||
: DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {}
|
||||
Interval range() { return range_; }
|
||||
|
||||
private:
|
||||
Interval range_;
|
||||
};
|
||||
|
||||
class DeferredIncrementRegister : public DeferredAction {
|
||||
public:
|
||||
explicit DeferredIncrementRegister(int reg)
|
||||
: DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {}
|
||||
};
|
||||
|
||||
Trace()
|
||||
: cp_offset_(0),
|
||||
actions_(nullptr),
|
||||
backtrack_(nullptr),
|
||||
stop_node_(nullptr),
|
||||
loop_label_(nullptr),
|
||||
characters_preloaded_(0),
|
||||
bound_checked_up_to_(0),
|
||||
flush_budget_(100),
|
||||
at_start_(UNKNOWN) {}
|
||||
|
||||
// End the trace. This involves flushing the deferred actions in the trace
|
||||
// and pushing a backtrack location onto the backtrack stack. Once this is
|
||||
// done we can start a new trace or go to one that has already been
|
||||
// generated.
|
||||
void Flush(RegExpCompiler* compiler, RegExpNode* successor);
|
||||
int cp_offset() { return cp_offset_; }
|
||||
DeferredAction* actions() { return actions_; }
|
||||
// A trivial trace is one that has no deferred actions or other state that
|
||||
// affects the assumptions used when generating code. There is no recorded
|
||||
// backtrack location in a trivial trace, so with a trivial trace we will
|
||||
// generate code that, on a failure to match, gets the backtrack location
|
||||
// from the backtrack stack rather than using a direct jump instruction. We
|
||||
// always start code generation with a trivial trace and non-trivial traces
|
||||
// are created as we emit code for nodes or add to the list of deferred
|
||||
// actions in the trace. The location of the code generated for a node using
|
||||
// a trivial trace is recorded in a label in the node so that gotos can be
|
||||
// generated to that code.
|
||||
bool is_trivial() {
|
||||
return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
|
||||
characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
|
||||
quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
|
||||
}
|
||||
TriBool at_start() { return at_start_; }
|
||||
void set_at_start(TriBool at_start) { at_start_ = at_start; }
|
||||
Label* backtrack() { return backtrack_; }
|
||||
Label* loop_label() { return loop_label_; }
|
||||
RegExpNode* stop_node() { return stop_node_; }
|
||||
int characters_preloaded() { return characters_preloaded_; }
|
||||
int bound_checked_up_to() { return bound_checked_up_to_; }
|
||||
int flush_budget() { return flush_budget_; }
|
||||
QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
|
||||
bool mentions_reg(int reg);
|
||||
// Returns true if a deferred position store exists to the specified
|
||||
// register and stores the offset in the out-parameter. Otherwise
|
||||
// returns false.
|
||||
bool GetStoredPosition(int reg, int* cp_offset);
|
||||
// These set methods and AdvanceCurrentPositionInTrace should be used only on
|
||||
// new traces - the intention is that traces are immutable after creation.
|
||||
void add_action(DeferredAction* new_action) {
|
||||
DCHECK(new_action->next_ == nullptr);
|
||||
new_action->next_ = actions_;
|
||||
actions_ = new_action;
|
||||
}
|
||||
void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
|
||||
void set_stop_node(RegExpNode* node) { stop_node_ = node; }
|
||||
void set_loop_label(Label* label) { loop_label_ = label; }
|
||||
void set_characters_preloaded(int count) { characters_preloaded_ = count; }
|
||||
void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
|
||||
void set_flush_budget(int to) { flush_budget_ = to; }
|
||||
void set_quick_check_performed(QuickCheckDetails* d) {
|
||||
quick_check_performed_ = *d;
|
||||
}
|
||||
void InvalidateCurrentCharacter();
|
||||
void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
|
||||
|
||||
private:
|
||||
int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone);
|
||||
void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register,
|
||||
const DynamicBitSet& affected_registers,
|
||||
DynamicBitSet* registers_to_pop,
|
||||
DynamicBitSet* registers_to_clear, Zone* zone);
|
||||
void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register,
|
||||
const DynamicBitSet& registers_to_pop,
|
||||
const DynamicBitSet& registers_to_clear);
|
||||
int cp_offset_;
|
||||
DeferredAction* actions_;
|
||||
Label* backtrack_;
|
||||
RegExpNode* stop_node_;
|
||||
Label* loop_label_;
|
||||
int characters_preloaded_;
|
||||
int bound_checked_up_to_;
|
||||
QuickCheckDetails quick_check_performed_;
|
||||
int flush_budget_;
|
||||
TriBool at_start_;
|
||||
};
|
||||
|
||||
class GreedyLoopState {
|
||||
public:
|
||||
explicit GreedyLoopState(bool not_at_start);
|
||||
|
||||
Label* label() { return &label_; }
|
||||
Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
|
||||
|
||||
private:
|
||||
Label label_;
|
||||
Trace counter_backtrack_trace_;
|
||||
};
|
||||
|
||||
// Plain record of character-preload bookkeeping.
//
// Every field now has a default member initializer, so a default-constructed
// PreloadState is fully defined. Previously all fields were indeterminate
// until assigned, and init() only ever set eats_at_least_.
struct PreloadState {
  // Sentinel stored in eats_at_least_ until a real value has been computed.
  static const int kEatsAtLeastNotYetInitialized = -1;

  bool preload_is_current_ = false;
  bool preload_has_checked_bounds_ = false;
  int preload_characters_ = 0;
  int eats_at_least_ = kEatsAtLeastNotYetInitialized;

  // Resets eats_at_least_ to the "not yet computed" sentinel. The other
  // fields are left as they are, exactly as before.
  void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; }
};
|
||||
|
||||
// Analysis performs assertion propagation and computes eats_at_least_ values.
|
||||
// See the comments on AssertionPropagator and EatsAtLeastPropagator for more
|
||||
// details.
|
||||
RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node);
|
||||
|
||||
class FrequencyCollator {
|
||||
public:
|
||||
FrequencyCollator() : total_samples_(0) {
|
||||
for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
|
||||
frequencies_[i] = CharacterFrequency(i);
|
||||
}
|
||||
}
|
||||
|
||||
void CountCharacter(int character) {
|
||||
int index = (character & RegExpMacroAssembler::kTableMask);
|
||||
frequencies_[index].Increment();
|
||||
total_samples_++;
|
||||
}
|
||||
|
||||
// Does not measure in percent, but rather per-128 (the table size from the
|
||||
// regexp macro assembler).
|
||||
int Frequency(int in_character) {
|
||||
DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
|
||||
if (total_samples_ < 1) return 1; // Division by zero.
|
||||
int freq_in_per128 =
|
||||
(frequencies_[in_character].counter() * 128) / total_samples_;
|
||||
return freq_in_per128;
|
||||
}
|
||||
|
||||
private:
|
||||
class CharacterFrequency {
|
||||
public:
|
||||
CharacterFrequency() : counter_(0), character_(-1) {}
|
||||
explicit CharacterFrequency(int character)
|
||||
: counter_(0), character_(character) {}
|
||||
|
||||
void Increment() { counter_++; }
|
||||
int counter() { return counter_; }
|
||||
int character() { return character_; }
|
||||
|
||||
private:
|
||||
int counter_;
|
||||
int character_;
|
||||
};
|
||||
|
||||
private:
|
||||
CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
|
||||
int total_samples_;
|
||||
};
|
||||
|
||||
class RegExpCompiler {
|
||||
public:
|
||||
RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
|
||||
bool is_one_byte);
|
||||
|
||||
int AllocateRegister() {
|
||||
if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
|
||||
reg_exp_too_big_ = true;
|
||||
return next_register_;
|
||||
}
|
||||
return next_register_++;
|
||||
}
|
||||
|
||||
// Lookarounds to match lone surrogates for unicode character class matches
|
||||
// are never nested. We can therefore reuse registers.
|
||||
int UnicodeLookaroundStackRegister() {
|
||||
if (unicode_lookaround_stack_register_ == kNoRegister) {
|
||||
unicode_lookaround_stack_register_ = AllocateRegister();
|
||||
}
|
||||
return unicode_lookaround_stack_register_;
|
||||
}
|
||||
|
||||
int UnicodeLookaroundPositionRegister() {
|
||||
if (unicode_lookaround_position_register_ == kNoRegister) {
|
||||
unicode_lookaround_position_register_ = AllocateRegister();
|
||||
}
|
||||
return unicode_lookaround_position_register_;
|
||||
}
|
||||
|
||||
struct CompilationResult final {
|
||||
explicit CompilationResult(RegExpError err) : error(err) {}
|
||||
CompilationResult(Object code, int registers)
|
||||
: code(code), num_registers(registers) {}
|
||||
|
||||
static CompilationResult RegExpTooBig() {
|
||||
return CompilationResult(RegExpError::kTooLarge);
|
||||
}
|
||||
|
||||
bool Succeeded() const { return error == RegExpError::kNone; }
|
||||
|
||||
const RegExpError error = RegExpError::kNone;
|
||||
Object code;
|
||||
int num_registers = 0;
|
||||
};
|
||||
|
||||
CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler,
|
||||
RegExpNode* start, int capture_count,
|
||||
Handle<String> pattern);
|
||||
|
||||
// If the regexp matching starts within a surrogate pair, step back to the
|
||||
// lead surrogate and start matching from there.
|
||||
static RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
|
||||
RegExpNode* on_success,
|
||||
JSRegExp::Flags flags);
|
||||
|
||||
inline void AddWork(RegExpNode* node) {
|
||||
if (!node->on_work_list() && !node->label()->is_bound()) {
|
||||
node->set_on_work_list(true);
|
||||
work_list_->push_back(node);
|
||||
}
|
||||
}
|
||||
|
||||
static const int kImplementationOffset = 0;
|
||||
static const int kNumberOfRegistersOffset = 0;
|
||||
static const int kCodeOffset = 1;
|
||||
|
||||
RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
|
||||
EndNode* accept() { return accept_; }
|
||||
|
||||
static const int kMaxRecursion = 100;
|
||||
inline int recursion_depth() { return recursion_depth_; }
|
||||
inline void IncrementRecursionDepth() { recursion_depth_++; }
|
||||
inline void DecrementRecursionDepth() { recursion_depth_--; }
|
||||
|
||||
void SetRegExpTooBig() { reg_exp_too_big_ = true; }
|
||||
|
||||
inline bool one_byte() { return one_byte_; }
|
||||
inline bool optimize() { return optimize_; }
|
||||
inline void set_optimize(bool value) { optimize_ = value; }
|
||||
inline bool limiting_recursion() { return limiting_recursion_; }
|
||||
inline void set_limiting_recursion(bool value) {
|
||||
limiting_recursion_ = value;
|
||||
}
|
||||
bool read_backward() { return read_backward_; }
|
||||
void set_read_backward(bool value) { read_backward_ = value; }
|
||||
FrequencyCollator* frequency_collator() { return &frequency_collator_; }
|
||||
|
||||
int current_expansion_factor() { return current_expansion_factor_; }
|
||||
void set_current_expansion_factor(int value) {
|
||||
current_expansion_factor_ = value;
|
||||
}
|
||||
|
||||
Isolate* isolate() const { return isolate_; }
|
||||
Zone* zone() const { return zone_; }
|
||||
|
||||
static const int kNoRegister = -1;
|
||||
|
||||
private:
|
||||
EndNode* accept_;
|
||||
int next_register_;
|
||||
int unicode_lookaround_stack_register_;
|
||||
int unicode_lookaround_position_register_;
|
||||
ZoneVector<RegExpNode*>* work_list_;
|
||||
int recursion_depth_;
|
||||
RegExpMacroAssembler* macro_assembler_;
|
||||
bool one_byte_;
|
||||
bool reg_exp_too_big_;
|
||||
bool limiting_recursion_;
|
||||
bool optimize_;
|
||||
bool read_backward_;
|
||||
int current_expansion_factor_;
|
||||
FrequencyCollator frequency_collator_;
|
||||
Isolate* isolate_;
|
||||
Zone* zone_;
|
||||
};
|
||||
|
||||
// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
|
||||
class UnicodeRangeSplitter {
|
||||
public:
|
||||
V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList<CharacterRange>* base);
|
||||
|
||||
static constexpr int kInitialSize = 8;
|
||||
using CharacterRangeVector = base::SmallVector<CharacterRange, kInitialSize>;
|
||||
|
||||
const CharacterRangeVector* bmp() const { return &bmp_; }
|
||||
const CharacterRangeVector* lead_surrogates() const {
|
||||
return &lead_surrogates_;
|
||||
}
|
||||
const CharacterRangeVector* trail_surrogates() const {
|
||||
return &trail_surrogates_;
|
||||
}
|
||||
const CharacterRangeVector* non_bmp() const { return &non_bmp_; }
|
||||
|
||||
private:
|
||||
void AddRange(CharacterRange range);
|
||||
|
||||
CharacterRangeVector bmp_;
|
||||
CharacterRangeVector lead_surrogates_;
|
||||
CharacterRangeVector trail_surrogates_;
|
||||
CharacterRangeVector non_bmp_;
|
||||
};
|
||||
|
||||
// We need to check for the following characters: 0x39C 0x3BC 0x178.
|
||||
// TODO(jgruber): Move to CharacterRange.
|
||||
bool RangeContainsLatin1Equivalents(CharacterRange range);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_REGEXP_COMPILER_H_
|
|
@ -0,0 +1,252 @@
|
|||
// Copyright 2019 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "regexp/regexp-dotprinter.h"
|
||||
|
||||
#include "regexp/regexp-compiler.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// -------------------------------------------------------------------
|
||||
// Dot/dotty output
|
||||
|
||||
#ifdef DEBUG
|
||||
|
||||
class DotPrinterImpl : public NodeVisitor {
|
||||
public:
|
||||
explicit DotPrinterImpl(std::ostream& os) : os_(os) {}
|
||||
void PrintNode(const char* label, RegExpNode* node);
|
||||
void Visit(RegExpNode* node);
|
||||
void PrintAttributes(RegExpNode* from);
|
||||
void PrintOnFailure(RegExpNode* from, RegExpNode* to);
|
||||
#define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that);
|
||||
FOR_EACH_NODE_TYPE(DECLARE_VISIT)
|
||||
#undef DECLARE_VISIT
|
||||
private:
|
||||
std::ostream& os_;
|
||||
};
|
||||
|
||||
// Prints the graph rooted at |node| as a complete "digraph", using |label| as
// the graph's title.
//
// The title is written inside a double-quoted dot string, so backslashes and
// double quotes in |label| are escaped. A double quote used to be written
// through unescaped, which ended the quoted string early and produced
// malformed output for any pattern containing '"'.
void DotPrinterImpl::PrintNode(const char* label, RegExpNode* node) {
  os_ << "digraph G {\n graph [label=\"";
  for (int i = 0; label[i]; i++) {
    switch (label[i]) {
      case '\\':
        os_ << "\\\\";
        break;
      case '"':
        // Emit backslash + quote so the quote stays inside the dot string.
        os_ << "\\\"";
        break;
      default:
        os_ << label[i];
        break;
    }
  }
  os_ << "\"];\n";
  Visit(node);
  os_ << "}" << std::endl;
}
|
||||
|
||||
// Dispatches to the node's Accept the first time the node is seen; later
// calls for the same node do nothing. The "seen" flag lives in the node's
// info()->visited.
void DotPrinterImpl::Visit(RegExpNode* node) {
  if (!node->info()->visited) {
    node->info()->visited = true;
    node->Accept(this);
  }
}
|
||||
|
||||
// Draws a dotted edge from |from| to |on_failure| and then visits the
// failure target.
void DotPrinterImpl::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
  os_ << " n" << from;
  os_ << " -> n" << on_failure;
  os_ << " [style=dotted];\n";
  Visit(on_failure);
}
|
||||
|
||||
// Writes a '|'-separated list of "{...}" fields to a stream. The separator is
// emitted before every field except the first one printed.
class AttributePrinter {
 public:
  explicit AttributePrinter(std::ostream& os)  // NOLINT
      : os_(os), first_(true) {}

  // Emits "|" unless this is the first call; every call clears first_.
  void PrintSeparator() {
    if (!first_) os_ << "|";
    first_ = false;
  }

  // Prints "{name}" when |value| is true; prints nothing otherwise.
  void PrintBit(const char* name, bool value) {
    if (value) {
      PrintSeparator();
      os_ << "{" << name << "}";
    }
  }

  // Prints "{name|value}" when |value| is non-negative; prints nothing
  // otherwise.
  void PrintPositive(const char* name, int value) {
    if (value >= 0) {
      PrintSeparator();
      os_ << "{" << name << "|" << value << "}";
    }
  }

 private:
  std::ostream& os_;
  bool first_;
};
|
||||
|
||||
// Emits a grey companion record node "a<ptr>" for |that| listing its NI/WI/SI
// interest bits and, when its label is bound, the label position; then links
// the record to the node itself with a dashed edge.
void DotPrinterImpl::PrintAttributes(RegExpNode* that) {
  os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
      << "margin=0.1, fontsize=10, label=\"{";

  AttributePrinter printer(os_);
  NodeInfo* info = that->info();
  printer.PrintBit("NI", info->follows_newline_interest);
  printer.PrintBit("WI", info->follows_word_interest);
  printer.PrintBit("SI", info->follows_start_interest);

  Label* label = that->label();
  if (label->is_bound()) {
    printer.PrintPositive("@", label->pos());
  }

  os_ << "}\"];\n";
  os_ << " a" << that << " -> n" << that
      << " [style=dashed, color=grey, arrowhead=none];\n";
}
|
||||
|
||||
// Prints a choice node as a "?" record, one edge per alternative, and then
// the alternatives themselves.
//
// Two fixes relative to the previous version:
//  * every edge statement is now terminated with ";\n", like all other edges
//    this printer emits (previously the edges were run together on one line
//    with no terminator);
//  * alternatives are walked through Visit() instead of calling Accept()
//    directly, so the visited flag is honoured and a sub-graph that can be
//    reached along several paths is printed only once.
void DotPrinterImpl::VisitChoice(ChoiceNode* that) {
  os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
  // First all outgoing edges...
  for (int i = 0; i < that->alternatives()->length(); i++) {
    GuardedAlternative alt = that->alternatives()->at(i);
    os_ << " n" << that << " -> n" << alt.node() << ";\n";
  }
  // ...then the targets.
  for (int i = 0; i < that->alternatives()->length(); i++) {
    GuardedAlternative alt = that->alternatives()->at(i);
    Visit(alt.node());
  }
}
|
||||
|
||||
void DotPrinterImpl::VisitLoopChoice(LoopChoiceNode* that) {
|
||||
VisitChoice(that);
|
||||
}
|
||||
|
||||
void DotPrinterImpl::VisitNegativeLookaroundChoice(
|
||||
NegativeLookaroundChoiceNode* that) {
|
||||
VisitChoice(that);
|
||||
}
|
||||
|
||||
// Prints a text node as a double-bordered box whose label lists its elements
// separated by spaces: atoms as their characters (each narrowed to char),
// character classes as "[from-to...]" with a leading "^" when negated. Then
// prints the node's attributes and the edge to its successor.
void DotPrinterImpl::VisitText(TextNode* that) {
  Zone* zone = that->zone();
  os_ << " n" << that << " [label=\"";
  for (int index = 0; index < that->elements()->length(); index++) {
    if (index != 0) os_ << " ";
    TextElement element = that->elements()->at(index);
    const auto type = element.text_type();
    if (type == TextElement::ATOM) {
      Vector<const uc16> chars = element.atom()->data();
      for (int k = 0; k < chars.length(); k++) {
        os_ << static_cast<char>(chars[k]);
      }
    } else if (type == TextElement::CHAR_CLASS) {
      RegExpCharacterClass* char_class = element.char_class();
      os_ << "[";
      if (char_class->is_negated()) os_ << "^";
      for (int j = 0; j < char_class->ranges(zone)->length(); j++) {
        CharacterRange range = char_class->ranges(zone)->at(j);
        os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
      }
      os_ << "]";
    } else {
      UNREACHABLE();
    }
  }
  os_ << "\", shape=box, peripheries=2];\n";
  PrintAttributes(that);
  RegExpNode* successor = that->on_success();
  os_ << " n" << that << " -> n" << successor << ";\n";
  Visit(successor);
}
|
||||
|
||||
// Prints a back reference as a double octagon labelled "$start..$end" (the
// two capture registers), followed by its attributes and successor edge.
void DotPrinterImpl::VisitBackReference(BackReferenceNode* that) {
  os_ << " n" << that << " [label=\"$" << that->start_register();
  os_ << "..$" << that->end_register() << "\", shape=doubleoctagon];\n";
  PrintAttributes(that);
  RegExpNode* successor = that->on_success();
  os_ << " n" << that << " -> n" << successor << ";\n";
  Visit(successor);
}
|
||||
|
||||
// Prints an end node as a bold point plus its attribute record. No outgoing
// edge is printed for it.
void DotPrinterImpl::VisitEnd(EndNode* that) {
  os_ << " n" << that;
  os_ << " [style=bold, shape=point];\n";
  PrintAttributes(that);
}
|
||||
|
||||
// Prints an assertion node as a septagon labelled with the regexp syntax for
// the assertion, followed by its attributes and successor edge.
void DotPrinterImpl::VisitAssertion(AssertionNode* that) {
  // Attribute text for each assertion type; stays null (and nothing is
  // printed between the brackets) for a value not listed in the switch.
  const char* attributes = nullptr;
  switch (that->assertion_type()) {
    case AssertionNode::AT_END:
      attributes = "label=\"$\", shape=septagon";
      break;
    case AssertionNode::AT_START:
      attributes = "label=\"^\", shape=septagon";
      break;
    case AssertionNode::AT_BOUNDARY:
      attributes = "label=\"\\b\", shape=septagon";
      break;
    case AssertionNode::AT_NON_BOUNDARY:
      attributes = "label=\"\\B\", shape=septagon";
      break;
    case AssertionNode::AFTER_NEWLINE:
      attributes = "label=\"(?<=\\n)\", shape=septagon";
      break;
  }

  os_ << " n" << that << " [";
  if (attributes != nullptr) os_ << attributes;
  os_ << "];\n";

  PrintAttributes(that);
  RegExpNode* next = that->on_success();
  os_ << " n" << that << " -> n" << next << ";\n";
  Visit(next);
}
|
||||
|
||||
// Prints an action node with a label describing the action and the registers
// it touches (read from the node's data_ union), followed by its attributes
// and successor edge.
void DotPrinterImpl::VisitAction(ActionNode* that) {
  const auto& data = that->data_;
  os_ << " n" << that << " [";
  switch (that->action_type_) {
    case ActionNode::SET_REGISTER_FOR_LOOP:
      os_ << "label=\"$" << data.u_store_register.reg;
      os_ << ":=" << data.u_store_register.value << "\", shape=octagon";
      break;
    case ActionNode::INCREMENT_REGISTER:
      os_ << "label=\"$" << data.u_increment_register.reg;
      os_ << "++\", shape=octagon";
      break;
    case ActionNode::STORE_POSITION:
      os_ << "label=\"$" << data.u_position_register.reg;
      os_ << ":=$pos\", shape=octagon";
      break;
    case ActionNode::BEGIN_SUBMATCH:
      os_ << "label=\"$" << data.u_submatch.current_position_register;
      os_ << ":=$pos,begin\", shape=septagon";
      break;
    case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
      os_ << "label=\"escape\", shape=septagon";
      break;
    case ActionNode::EMPTY_MATCH_CHECK:
      os_ << "label=\"$" << data.u_empty_match_check.start_register;
      os_ << "=$pos?,$" << data.u_empty_match_check.repetition_register;
      os_ << "<" << data.u_empty_match_check.repetition_limit;
      os_ << "?\", shape=septagon";
      break;
    case ActionNode::CLEAR_CAPTURES:
      os_ << "label=\"clear $" << data.u_clear_captures.range_from;
      os_ << " to $" << data.u_clear_captures.range_to;
      os_ << "\", shape=septagon";
      break;
  }
  os_ << "];\n";

  PrintAttributes(that);
  RegExpNode* next = that->on_success();
  os_ << " n" << that << " -> n" << next << ";\n";
  Visit(next);
}
|
||||
|
||||
#endif // DEBUG
|
||||
|
||||
// In DEBUG builds, prints the graph rooted at |node| to a StdoutStream, titled
// |label|. Compiles to an empty function otherwise.
void DotPrinter::DotPrint(const char* label, RegExpNode* node) {
#ifdef DEBUG
  StdoutStream stdout_stream;
  DotPrinterImpl(stdout_stream).PrintNode(label, node);
#endif  // DEBUG
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
|
@ -0,0 +1,23 @@
|
|||
// Copyright 2019 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_REGEXP_DOTPRINTER_H_
|
||||
#define V8_REGEXP_REGEXP_DOTPRINTER_H_
|
||||
|
||||
#include "regexp/regexp-shim.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
class RegExpNode;
|
||||
|
||||
class DotPrinter final : public AllStatic {
|
||||
public:
|
||||
static void DotPrint(const char* label, RegExpNode* node);
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_REGEXP_DOTPRINTER_H_
|
|
@ -0,0 +1,22 @@
|
|||
// Copyright 2020 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "regexp/regexp-error.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
const char* kRegExpErrorStrings[] = {
|
||||
#define TEMPLATE(NAME, STRING) STRING,
|
||||
REGEXP_ERROR_MESSAGES(TEMPLATE)
|
||||
#undef TEMPLATE
|
||||
};
|
||||
|
||||
const char* RegExpErrorString(RegExpError error) {
|
||||
DCHECK_LT(error, RegExpError::NumErrors);
|
||||
return kRegExpErrorStrings[static_cast<int>(error)];
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
|
@ -0,0 +1,56 @@
|
|||
// Copyright 2020 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_REGEXP_ERROR_H_
|
||||
#define V8_REGEXP_REGEXP_ERROR_H_
|
||||
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// Master list of regexp errors. Each entry is T(Name, "message"); the list is
// expanded once to build the RegExpError enum (below) and once to build the
// table of message strings, so the two always stay in the same order.
#define REGEXP_ERROR_MESSAGES(T)                                            \
  T(None, "")                                                               \
  T(StackOverflow, "Maximum call stack size exceeded")                      \
  T(AnalysisStackOverflow, "Stack overflow")                                \
  T(TooLarge, "Regular expression too large")                               \
  T(UnterminatedGroup, "Unterminated group")                                \
  T(UnmatchedParen, "Unmatched ')'")                                        \
  T(EscapeAtEndOfPattern, "\\ at end of pattern")                           \
  T(InvalidPropertyName, "Invalid property name")                           \
  T(InvalidEscape, "Invalid escape")                                        \
  T(InvalidDecimalEscape, "Invalid decimal escape")                         \
  T(InvalidUnicodeEscape, "Invalid Unicode escape")                         \
  T(NothingToRepeat, "Nothing to repeat")                                   \
  T(LoneQuantifierBrackets, "Lone quantifier brackets")                     \
  T(RangeOutOfOrder, "numbers out of order in {} quantifier")               \
  T(IncompleteQuantifier, "Incomplete quantifier")                          \
  T(InvalidQuantifier, "Invalid quantifier")                                \
  T(InvalidGroup, "Invalid group")                                          \
  T(MultipleFlagDashes, "Multiple dashes in flag group")                    \
  T(RepeatedFlag, "Repeated flag in flag group")                            \
  T(InvalidFlagGroup, "Invalid flag group")                                 \
  T(TooManyCaptures, "Too many captures")                                   \
  T(InvalidCaptureGroupName, "Invalid capture group name")                  \
  T(DuplicateCaptureGroupName, "Duplicate capture group name")              \
  T(InvalidNamedReference, "Invalid named reference")                       \
  T(InvalidNamedCaptureReference, "Invalid named capture referenced")       \
  T(InvalidClassEscape, "Invalid class escape")                             \
  T(InvalidClassPropertyName, "Invalid property name in character class")   \
  T(InvalidCharacterClass, "Invalid character class")                       \
  T(UnterminatedCharacterClass, "Unterminated character class")             \
  T(OutOfOrderCharacterClass, "Range out of order in character class")

// One enumerator k<Name> per list entry, numbered from 0 in list order.
// NumErrors comes last and therefore equals the number of entries.
enum class RegExpError : uint32_t {
#define REGEXP_ERROR_ENUMERATOR(NAME, STRING) k##NAME,
  REGEXP_ERROR_MESSAGES(REGEXP_ERROR_ENUMERATOR)
#undef REGEXP_ERROR_ENUMERATOR
  NumErrors
};
|
||||
|
||||
V8_EXPORT_PRIVATE const char* RegExpErrorString(RegExpError error);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_REGEXP_ERROR_H_
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,61 @@
|
|||
// Copyright 2011 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// A simple interpreter for the Irregexp byte code.
|
||||
|
||||
#ifndef V8_REGEXP_REGEXP_INTERPRETER_H_
|
||||
#define V8_REGEXP_REGEXP_INTERPRETER_H_
|
||||
|
||||
#include "regexp/regexp.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// Static-only entry points of the Irregexp bytecode interpreter.
class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic {
 public:
  // Result codes; each mirrors the RegExp::kInternalRegExp* constant of the
  // same meaning.
  enum Result {
    FAILURE = RegExp::kInternalRegExpFailure,
    SUCCESS = RegExp::kInternalRegExpSuccess,
    EXCEPTION = RegExp::kInternalRegExpException,
    RETRY = RegExp::kInternalRegExpRetry,
  };

  // Entry point for calls from the runtime. If a StackOverflow occurs, a
  // StackOverflowException is created and EXCEPTION is returned.
  static Result MatchForCallFromRuntime(Isolate* isolate,
                                        Handle<JSRegExp> regexp,
                                        Handle<String> subject_string,
                                        int* registers, int registers_length,
                                        int start_position);

  // Entry point for calls from JS. If a StackOverflow occurs, EXCEPTION is
  // returned and the caller is responsible for creating the exception. RETRY
  // is returned if a retry through the runtime is needed (e.g. when
  // interrupts have been scheduled or the regexp is marked for tier-up).
  // The arguments input_start, input_end and backtrack_stack are unused; they
  // are only passed to match the signature of the native irregexp code.
  static Result MatchForCallFromJs(Address subject, int32_t start_position,
                                   Address input_start, Address input_end,
                                   int* registers, int32_t registers_length,
                                   Address backtrack_stack,
                                   RegExp::CallOrigin call_origin,
                                   Isolate* isolate, Address regexp);

  // Runs the bytecode in |code_array| against |subject_string|.
  static Result MatchInternal(Isolate* isolate, ByteArray code_array,
                              String subject_string, int* registers,
                              int registers_length, int start_position,
                              RegExp::CallOrigin call_origin,
                              uint32_t backtrack_limit);

 private:
  static Result Match(Isolate* isolate, JSRegExp regexp, String subject_string,
                      int* registers, int registers_length, int start_position,
                      RegExp::CallOrigin call_origin);
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_REGEXP_INTERPRETER_H_
|
|
@ -0,0 +1,291 @@
|
|||
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
|
||||
* vim: set ts=8 sts=2 et sw=2 tw=80:
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
// Copyright 2020 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// This file implements the NativeRegExpMacroAssembler interface for
|
||||
// SpiderMonkey. It provides the same interface as each of V8's
|
||||
// architecture-specific implementations.
|
||||
|
||||
#ifndef RegexpMacroAssemblerArch_h
|
||||
#define RegexpMacroAssemblerArch_h
|
||||
|
||||
#include "jit/MacroAssembler.h"
|
||||
#include "regexp/regexp-macro-assembler.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// Local variables kept in the native stack frame of generated regexp code.
// Field order is part of the layout: the fields are addressed through
// offsetof() relative to the stack pointer.
struct FrameData {
  // Character position at the start of the input, expressed as a negative
  // offset from the end of the string (input_end_pointer_).
  size_t inputStart;

  // Bottom of the backtrack stack. (The top of that stack is tracked by the
  // backtrack_stack_pointer_ register.)
  void* backtrackStackBase;

  // Copy of the input MatchPairs.
  int32_t* matches;    // pointer to capture array
  int32_t numMatches;  // size of capture array
};
|
||||
|
||||
class SMRegExpMacroAssembler final : public NativeRegExpMacroAssembler {
|
||||
public:
|
||||
SMRegExpMacroAssembler(JSContext* cx, Isolate* isolate,
|
||||
js::jit::StackMacroAssembler& masm, Zone* zone,
|
||||
Mode mode, uint32_t num_capture_registers);
|
||||
virtual ~SMRegExpMacroAssembler() {} // Nothing to do here
|
||||
|
||||
virtual int stack_limit_slack();
|
||||
virtual IrregexpImplementation Implementation();
|
||||
|
||||
virtual bool Succeed();
|
||||
virtual void Fail();
|
||||
|
||||
virtual void AdvanceCurrentPosition(int by);
|
||||
virtual void PopCurrentPosition();
|
||||
virtual void PushCurrentPosition();
|
||||
virtual void SetCurrentPositionFromEnd(int by);
|
||||
|
||||
virtual void Backtrack();
|
||||
virtual void Bind(Label* label);
|
||||
virtual void GoTo(Label* label);
|
||||
virtual void PushBacktrack(Label* label);
|
||||
|
||||
virtual void CheckCharacter(uint32_t c, Label* on_equal);
|
||||
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
|
||||
virtual void CheckCharacterGT(uc16 limit, Label* on_greater);
|
||||
virtual void CheckCharacterLT(uc16 limit, Label* on_less);
|
||||
virtual void CheckCharacterAfterAnd(uint32_t c, uint32_t mask,
|
||||
Label* on_equal);
|
||||
virtual void CheckNotCharacterAfterAnd(uint32_t c, uint32_t mask,
|
||||
Label* on_not_equal);
|
||||
virtual void CheckNotCharacterAfterMinusAnd(uc16 c, uc16 minus, uc16 mask,
|
||||
Label* on_not_equal);
|
||||
virtual void CheckGreedyLoop(Label* on_tos_equals_current_position);
|
||||
virtual void CheckCharacterInRange(uc16 from, uc16 to, Label* on_in_range);
|
||||
virtual void CheckCharacterNotInRange(uc16 from, uc16 to,
|
||||
Label* on_not_in_range);
|
||||
virtual void CheckAtStart(int cp_offset, Label* on_at_start);
|
||||
virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start);
|
||||
virtual void CheckPosition(int cp_offset, Label* on_outside_input);
|
||||
virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set);
|
||||
virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match);
|
||||
virtual void CheckNotBackReference(int start_reg, bool read_backward,
|
||||
Label* on_no_match);
|
||||
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
|
||||
bool read_backward,
|
||||
Label* on_no_match);
|
||||
|
||||
virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
|
||||
bool check_bounds, int characters,
|
||||
int eats_at_least);
|
||||
|
||||
virtual void AdvanceRegister(int reg, int by);
|
||||
virtual void IfRegisterGE(int reg, int comparand, Label* if_ge);
|
||||
virtual void IfRegisterLT(int reg, int comparand, Label* if_lt);
|
||||
virtual void IfRegisterEqPos(int reg, Label* if_eq);
|
||||
virtual void PopRegister(int register_index);
|
||||
virtual void PushRegister(int register_index,
|
||||
StackCheckFlag check_stack_limit);
|
||||
virtual void ReadCurrentPositionFromRegister(int reg);
|
||||
virtual void WriteCurrentPositionToRegister(int reg, int cp_offset);
|
||||
virtual void ReadStackPointerFromRegister(int reg);
|
||||
virtual void WriteStackPointerToRegister(int reg);
|
||||
virtual void SetRegister(int register_index, int to);
|
||||
virtual void ClearRegisters(int reg_from, int reg_to);
|
||||
|
||||
virtual Handle<HeapObject> GetCode(Handle<String> source);
|
||||
|
||||
private:
|
||||
size_t frameSize_ = 0;
|
||||
|
||||
void createStackFrame();
|
||||
void initFrameAndRegs();
|
||||
void successHandler();
|
||||
void exitHandler();
|
||||
void backtrackHandler();
|
||||
void stackOverflowHandler();
|
||||
|
||||
// Push a register on the backtrack stack.
|
||||
void Push(js::jit::Register value);
|
||||
|
||||
// Pop a value from the backtrack stack.
|
||||
void Pop(js::jit::Register target);
|
||||
|
||||
void CheckAtStartImpl(int cp_offset, Label* on_cond,
|
||||
js::jit::Assembler::Condition cond);
|
||||
void CheckCharacterImpl(js::jit::Imm32 c, Label* on_cond,
|
||||
js::jit::Assembler::Condition cond);
|
||||
void CheckCharacterAfterAndImpl(uint32_t c, uint32_t and_with, Label* on_cond,
|
||||
bool negate);
|
||||
void CheckCharacterInRangeImpl(uc16 from, uc16 to, Label* on_cond,
|
||||
js::jit::Assembler::Condition cond);
|
||||
void CheckNotBackReferenceImpl(int start_reg, bool read_backward,
|
||||
Label* on_no_match, bool ignore_case);
|
||||
|
||||
void LoadCurrentCharacterUnchecked(int cp_offset, int characters);
|
||||
|
||||
void JumpOrBacktrack(Label* to);
|
||||
|
||||
// MacroAssembler methods that take a Label can be called with a
|
||||
// null label, which means that we should backtrack if we would jump
|
||||
// to that label. This is a helper to avoid writing out the same
|
||||
// logic a dozen times.
|
||||
inline js::jit::Label* LabelOrBacktrack(Label* to) {
|
||||
return to ? to->inner() : &backtrack_label_;
|
||||
}
|
||||
|
||||
void CheckBacktrackStackLimit();
|
||||
|
||||
static bool GrowBacktrackStack(RegExpStack* regexp_stack);
|
||||
|
||||
static uint32_t CaseInsensitiveCompareStrings(const char16_t* substring1,
|
||||
const char16_t* substring2,
|
||||
size_t byteLength);
|
||||
static uint32_t CaseInsensitiveCompareUCStrings(const char16_t* substring1,
|
||||
const char16_t* substring2,
|
||||
size_t byteLength);
|
||||
|
||||
inline int char_size() { return static_cast<int>(mode_); }
|
||||
inline js::jit::Scale factor() {
|
||||
return mode_ == UC16 ? js::jit::TimesTwo : js::jit::TimesOne;
|
||||
}
|
||||
|
||||
js::jit::Address inputStart() {
|
||||
return js::jit::Address(masm_.getStackPointer(),
|
||||
offsetof(FrameData, inputStart));
|
||||
}
|
||||
js::jit::Address backtrackStackBase() {
|
||||
return js::jit::Address(masm_.getStackPointer(),
|
||||
offsetof(FrameData, backtrackStackBase));
|
||||
}
|
||||
js::jit::Address matches() {
|
||||
return js::jit::Address(masm_.getStackPointer(),
|
||||
offsetof(FrameData, matches));
|
||||
}
|
||||
js::jit::Address numMatches() {
|
||||
return js::jit::Address(masm_.getStackPointer(),
|
||||
offsetof(FrameData, numMatches));
|
||||
}
|
||||
|
||||
// The stack-pointer-relative location of a regexp register.
|
||||
js::jit::Address register_location(int register_index) {
|
||||
return js::jit::Address(masm_.getStackPointer(),
|
||||
register_offset(register_index));
|
||||
}
|
||||
|
||||
int32_t register_offset(int register_index) {
|
||||
MOZ_ASSERT(register_index >= 0 && register_index <= kMaxRegister);
|
||||
if (num_registers_ <= register_index) {
|
||||
num_registers_ = register_index + 1;
|
||||
}
|
||||
static_assert(alignof(uintptr_t) <= alignof(FrameData));
|
||||
return sizeof(FrameData) + register_index * sizeof(uintptr_t*);
|
||||
}
|
||||
|
||||
JSContext* cx_;
|
||||
js::jit::StackMacroAssembler& masm_;
|
||||
|
||||
/*
|
||||
* This assembler uses the following registers:
|
||||
*
|
||||
* - current_character_:
|
||||
* Contains the character (or characters) currently being examined.
|
||||
* Must be loaded using LoadCurrentCharacter before using any of the
|
||||
* dispatch methods. After a matching pass for a global regexp,
|
||||
* temporarily stores the index of capture start.
|
||||
* - current_position_:
|
||||
* Current position in input *as negative byte offset from end of string*.
|
||||
* - input_end_pointer_:
|
||||
* Points to byte after last character in the input. current_position_ is
|
||||
* relative to this.
|
||||
* - backtrack_stack_pointer_:
|
||||
* Points to tip of the (heap-allocated) backtrack stack. The stack grows
|
||||
* downward (like the native stack).
|
||||
* - temp0_, temp1_, temp2_:
|
||||
* Scratch registers.
|
||||
*
|
||||
* The native stack pointer is used to access arguments (InputOutputData),
|
||||
* local variables (FrameData), and irregexp's internal virtual registers
|
||||
* (see register_location).
|
||||
*/
|
||||
|
||||
js::jit::Register current_character_;
|
||||
js::jit::Register current_position_;
|
||||
js::jit::Register input_end_pointer_;
|
||||
js::jit::Register backtrack_stack_pointer_;
|
||||
js::jit::Register temp0_, temp1_, temp2_;
|
||||
|
||||
js::jit::Label entry_label_;
|
||||
js::jit::Label start_label_;
|
||||
js::jit::Label backtrack_label_;
|
||||
js::jit::Label success_label_;
|
||||
js::jit::Label exit_label_;
|
||||
js::jit::Label stack_overflow_label_;
|
||||
js::jit::Label exit_with_exception_label_;
|
||||
|
||||
// When we generate the code to push a backtrack label's address
|
||||
// onto the backtrack stack, we don't know its final address. We
|
||||
// have to patch it after linking. This is slightly delicate, as the
|
||||
// Label itself (which is allocated on the stack) may not exist by
|
||||
// the time we link. The approach is as follows:
|
||||
//
|
||||
// 1. When we push a label on the backtrack stack (PushBacktrack),
|
||||
// we bind the label's patchOffset_ field to the offset within
|
||||
// the code that should be overwritten. This works because each
|
||||
// label is only pushed by a single instruction.
|
||||
//
|
||||
// 2. When we bind a label (Bind), we check to see if it has a
|
||||
// bound patchOffset_. If it does, we create a LabelPatch mapping
|
||||
// its patch offset to the offset of the label itself.
|
||||
//
|
||||
// 3. While linking the code, we walk the list of label patches
|
||||
// and patch the code accordingly.
|
||||
class LabelPatch {
|
||||
public:
|
||||
LabelPatch(js::jit::CodeOffset patchOffset, size_t labelOffset)
|
||||
: patchOffset_(patchOffset), labelOffset_(labelOffset) {}
|
||||
|
||||
js::jit::CodeOffset patchOffset_;
|
||||
size_t labelOffset_ = 0;
|
||||
};
|
||||
|
||||
js::Vector<LabelPatch, 4, js::SystemAllocPolicy> labelPatches_;
|
||||
void AddLabelPatch(js::jit::CodeOffset patchOffset, size_t labelOffset) {
|
||||
js::AutoEnterOOMUnsafeRegion oomUnsafe;
|
||||
if (!labelPatches_.emplaceBack(patchOffset, labelOffset)) {
|
||||
oomUnsafe.crash("Irregexp label patch");
|
||||
}
|
||||
}
|
||||
|
||||
Mode mode_;
|
||||
int num_registers_;
|
||||
int num_capture_registers_;
|
||||
js::jit::LiveGeneralRegisterSet savedRegisters_;
|
||||
|
||||
public:
|
||||
using TableVector =
|
||||
js::Vector<PseudoHandle<ByteArrayData>, 4, js::SystemAllocPolicy>;
|
||||
TableVector& tables() { return tables_; }
|
||||
|
||||
private:
|
||||
TableVector tables_;
|
||||
void AddTable(PseudoHandle<ByteArrayData> table) {
|
||||
js::AutoEnterOOMUnsafeRegion oomUnsafe;
|
||||
if (!tables_.append(std::move(table))) {
|
||||
oomUnsafe.crash("Irregexp table append");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // RegexpMacroAssemblerArch_h
|
|
@ -0,0 +1,418 @@
|
|||
// Copyright 2012 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "regexp/regexp-macro-assembler-tracer.h"
|
||||
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// Wraps |assembler| (sharing its zone) and prints a header line naming the
// wrapped implementation.
RegExpMacroAssemblerTracer::RegExpMacroAssemblerTracer(
    Isolate* isolate, RegExpMacroAssembler* assembler)
    : RegExpMacroAssembler(isolate, assembler->zone()), assembler_(assembler) {
  // Indexed by IrregexpImplementation; the DCHECK bound below must equal the
  // number of entries.
  const char* const impl_names[] = {"IA32", "ARM", "ARM64", "MIPS", "S390",
                                    "PPC", "X64", "X87", "Bytecode"};
  const IrregexpImplementation impl = assembler->Implementation();
  DCHECK_LT(impl, 9);
  PrintF("RegExpMacroAssembler%s();\n", impl_names[impl]);
}
|
||||
|
||||
// Defaulted: the destructor performs no explicit cleanup of assembler_.
RegExpMacroAssemblerTracer::~RegExpMacroAssemblerTracer() = default;
|
||||
|
||||
// Traces the abort notification, then passes it on to the wrapped assembler.
void RegExpMacroAssemblerTracer::AbortedCodeGeneration() {
  PrintF(" AbortedCodeGeneration\n");
  assembler_->AbortedCodeGeneration();
}
|
||||
|
||||
|
||||
// This is used for printing out debugging information. It makes an integer
|
||||
// that is closely related to the address of an object.
|
||||
static int LabelToInt(Label* label) {
|
||||
return static_cast<int>(reinterpret_cast<intptr_t>(label));
|
||||
}
|
||||
|
||||
|
||||
// Traces the binding of |label|, then binds it in the wrapped assembler.
void RegExpMacroAssemblerTracer::Bind(Label* label) {
  const int label_id = LabelToInt(label);
  PrintF("label[%08x]: (Bind)\n", label_id);
  assembler_->Bind(label);
}
|
||||
|
||||
|
||||
// Traces the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::AdvanceCurrentPosition(int by) {
  PrintF(" AdvanceCurrentPosition(by=%d);\n", by);
  assembler_->AdvanceCurrentPosition(by);
}
|
||||
|
||||
|
||||
// Traces the call (followed by a blank line in the output) and forwards it
// to the wrapped assembler.
void RegExpMacroAssemblerTracer::CheckGreedyLoop(Label* label) {
  const int label_id = LabelToInt(label);
  PrintF(" CheckGreedyLoop(label[%08x]);\n\n", label_id);
  assembler_->CheckGreedyLoop(label);
}
|
||||
|
||||
|
||||
// Traces the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::PopCurrentPosition() {
  PrintF(" PopCurrentPosition();\n");
  assembler_->PopCurrentPosition();
}
|
||||
|
||||
|
||||
// Traces the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::PushCurrentPosition() {
  PrintF(" PushCurrentPosition();\n");
  assembler_->PushCurrentPosition();
}
|
||||
|
||||
|
||||
// Traces the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::Backtrack() {
  PrintF(" Backtrack();\n");
  assembler_->Backtrack();
}
|
||||
|
||||
|
||||
// Traces the jump (followed by a blank line in the output) and forwards it
// to the wrapped assembler.
void RegExpMacroAssemblerTracer::GoTo(Label* label) {
  const int label_id = LabelToInt(label);
  PrintF(" GoTo(label[%08x]);\n\n", label_id);
  assembler_->GoTo(label);
}
|
||||
|
||||
|
||||
// Traces the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::PushBacktrack(Label* label) {
  const int label_id = LabelToInt(label);
  PrintF(" PushBacktrack(label[%08x]);\n", label_id);
  assembler_->PushBacktrack(label);
}
|
||||
|
||||
|
||||
bool RegExpMacroAssemblerTracer::Succeed() {
|
||||
bool restart = assembler_->Succeed();
|
||||
PrintF(" Succeed();%s\n", restart ? " [restart for global match]" : "");
|
||||
return restart;
|
||||
}
|
||||
|
||||
|
||||
// Traces the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::Fail() {
  // Terminate the line like every other trace statement does; without the
  // newline the next trace entry is appended to this one.
  PrintF(" Fail();\n");
  assembler_->Fail();
}
|
||||
|
||||
|
||||
// Traces the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::PopRegister(int register_index) {
  PrintF(" PopRegister(register=%d);\n", register_index);
  assembler_->PopRegister(register_index);
}
|
||||
|
||||
|
||||
void RegExpMacroAssemblerTracer::PushRegister(
|
||||
int register_index,
|
||||
StackCheckFlag check_stack_limit) {
|
||||
PrintF(" PushRegister(register=%d, %s);\n",
|
||||
register_index,
|
||||
check_stack_limit ? "check stack limit" : "");
|
||||
assembler_->PushRegister(register_index, check_stack_limit);
|
||||
}
|
||||
|
||||
|
||||
// Traces the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::AdvanceRegister(int reg, int by) {
  PrintF(" AdvanceRegister(register=%d, by=%d);\n", reg, by);
  assembler_->AdvanceRegister(reg, by);
}
|
||||
|
||||
|
||||
// Traces the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::SetCurrentPositionFromEnd(int by) {
  PrintF(" SetCurrentPositionFromEnd(by=%d);\n", by);
  assembler_->SetCurrentPositionFromEnd(by);
}
|
||||
|
||||
|
||||
// Traces the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::SetRegister(int register_index, int to) {
  PrintF(" SetRegister(register=%d, to=%d);\n", register_index, to);
  assembler_->SetRegister(register_index, to);
}
|
||||
|
||||
|
||||
// Traces the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::WriteCurrentPositionToRegister(
    int reg, int cp_offset) {
  PrintF(" WriteCurrentPositionToRegister(register=%d,cp_offset=%d);\n", reg,
         cp_offset);
  assembler_->WriteCurrentPositionToRegister(reg, cp_offset);
}
|
||||
|
||||
|
||||
// Traces the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::ClearRegisters(int reg_from, int reg_to) {
  // The traced name matches the method name (plural), as it does for every
  // other traced operation.
  PrintF(" ClearRegisters(from=%d, to=%d);\n", reg_from, reg_to);
  assembler_->ClearRegisters(reg_from, reg_to);
}
|
||||
|
||||
|
||||
// Traces the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::ReadCurrentPositionFromRegister(int reg) {
  PrintF(" ReadCurrentPositionFromRegister(register=%d);\n", reg);
  assembler_->ReadCurrentPositionFromRegister(reg);
}
|
||||
|
||||
|
||||
// Traces the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::WriteStackPointerToRegister(int reg) {
  PrintF(" WriteStackPointerToRegister(register=%d);\n", reg);
  assembler_->WriteStackPointerToRegister(reg);
}
|
||||
|
||||
|
||||
// Traces the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::ReadStackPointerFromRegister(int reg) {
  PrintF(" ReadStackPointerFromRegister(register=%d);\n", reg);
  assembler_->ReadStackPointerFromRegister(reg);
}
|
||||
|
||||
void RegExpMacroAssemblerTracer::LoadCurrentCharacterImpl(
|
||||
int cp_offset, Label* on_end_of_input, bool check_bounds, int characters,
|
||||
int eats_at_least) {
|
||||
const char* check_msg = check_bounds ? "" : " (unchecked)";
|
||||
PrintF(
|
||||
" LoadCurrentCharacter(cp_offset=%d, label[%08x]%s (%d chars) (eats at "
|
||||
"least %d));\n",
|
||||
cp_offset, LabelToInt(on_end_of_input), check_msg, characters,
|
||||
eats_at_least);
|
||||
assembler_->LoadCurrentCharacter(cp_offset, on_end_of_input, check_bounds,
|
||||
characters, eats_at_least);
|
||||
}
|
||||
|
||||
class PrintablePrinter {
|
||||
public:
|
||||
explicit PrintablePrinter(uc16 character) : character_(character) { }
|
||||
|
||||
const char* operator*() {
|
||||
if (character_ >= ' ' && character_ <= '~') {
|
||||
buffer_[0] = '(';
|
||||
buffer_[1] = static_cast<char>(character_);
|
||||
buffer_[2] = ')';
|
||||
buffer_[3] = '\0';
|
||||
} else {
|
||||
buffer_[0] = '\0';
|
||||
}
|
||||
return &buffer_[0];
|
||||
}
|
||||
|
||||
private:
|
||||
uc16 character_;
|
||||
char buffer_[4];
|
||||
};
|
||||
|
||||
|
||||
// Traces the comparison (with the limit shown in hex and, if printable, as a
// character) and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::CheckCharacterLT(uc16 limit, Label* on_less) {
  PrintablePrinter printable(limit);
  const char* shown = *printable;
  PrintF(" CheckCharacterLT(c=0x%04x%s, label[%08x]);\n", limit, shown,
         LabelToInt(on_less));
  assembler_->CheckCharacterLT(limit, on_less);
}
|
||||
|
||||
|
||||
// Traces the comparison (with the limit shown in hex and, if printable, as a
// character) and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::CheckCharacterGT(uc16 limit,
                                                  Label* on_greater) {
  PrintablePrinter printable(limit);
  const char* shown = *printable;
  PrintF(" CheckCharacterGT(c=0x%04x%s, label[%08x]);\n", limit, shown,
         LabelToInt(on_greater));
  assembler_->CheckCharacterGT(limit, on_greater);
}
|
||||
|
||||
|
||||
// Traces the equality check and forwards it to the wrapped assembler.
// Note that PrintablePrinter takes a uc16, so |c| is narrowed for the
// printable-character annotation only; the hex value shows |c| in full.
void RegExpMacroAssemblerTracer::CheckCharacter(unsigned c, Label* on_equal) {
  PrintablePrinter printable(c);
  const char* shown = *printable;
  PrintF(" CheckCharacter(c=0x%04x%s, label[%08x]);\n", c, shown,
         LabelToInt(on_equal));
  assembler_->CheckCharacter(c, on_equal);
}
|
||||
|
||||
void RegExpMacroAssemblerTracer::CheckAtStart(int cp_offset,
|
||||
Label* on_at_start) {
|
||||
PrintF(" CheckAtStart(cp_offset=%d, label[%08x]);\n", cp_offset,
|
||||
LabelToInt(on_at_start));
|
||||
assembler_->CheckAtStart(cp_offset, on_at_start);
|
||||
}
|
||||
|
||||
void RegExpMacroAssemblerTracer::CheckNotAtStart(int cp_offset,
|
||||
Label* on_not_at_start) {
|
||||
PrintF(" CheckNotAtStart(cp_offset=%d, label[%08x]);\n", cp_offset,
|
||||
LabelToInt(on_not_at_start));
|
||||
assembler_->CheckNotAtStart(cp_offset, on_not_at_start);
|
||||
}
|
||||
|
||||
|
||||
void RegExpMacroAssemblerTracer::CheckNotCharacter(unsigned c,
|
||||
Label* on_not_equal) {
|
||||
PrintablePrinter printable(c);
|
||||
PrintF(" CheckNotCharacter(c=0x%04x%s, label[%08x]);\n",
|
||||
c,
|
||||
*printable,
|
||||
LabelToInt(on_not_equal));
|
||||
assembler_->CheckNotCharacter(c, on_not_equal);
|
||||
}
|
||||
|
||||
|
||||
void RegExpMacroAssemblerTracer::CheckCharacterAfterAnd(
|
||||
unsigned c,
|
||||
unsigned mask,
|
||||
Label* on_equal) {
|
||||
PrintablePrinter printable(c);
|
||||
PrintF(" CheckCharacterAfterAnd(c=0x%04x%s, mask=0x%04x, label[%08x]);\n",
|
||||
c,
|
||||
*printable,
|
||||
mask,
|
||||
LabelToInt(on_equal));
|
||||
assembler_->CheckCharacterAfterAnd(c, mask, on_equal);
|
||||
}
|
||||
|
||||
|
||||
void RegExpMacroAssemblerTracer::CheckNotCharacterAfterAnd(
|
||||
unsigned c,
|
||||
unsigned mask,
|
||||
Label* on_not_equal) {
|
||||
PrintablePrinter printable(c);
|
||||
PrintF(" CheckNotCharacterAfterAnd(c=0x%04x%s, mask=0x%04x, label[%08x]);\n",
|
||||
c,
|
||||
*printable,
|
||||
mask,
|
||||
LabelToInt(on_not_equal));
|
||||
assembler_->CheckNotCharacterAfterAnd(c, mask, on_not_equal);
|
||||
}
|
||||
|
||||
|
||||
// Traces the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::CheckNotCharacterAfterMinusAnd(
    uc16 c, uc16 minus, uc16 mask, Label* on_not_equal) {
  // All three values are printed in hex, so each one carries the 0x prefix;
  // an unprefixed "minus" could be misread as a decimal number.
  PrintF(" CheckNotCharacterAfterMinusAnd(c=0x%04x, minus=0x%04x, mask=0x%04x, "
         "label[%08x]);\n",
         c, minus, mask, LabelToInt(on_not_equal));
  assembler_->CheckNotCharacterAfterMinusAnd(c, minus, mask, on_not_equal);
}
|
||||
|
||||
|
||||
void RegExpMacroAssemblerTracer::CheckCharacterInRange(
|
||||
uc16 from,
|
||||
uc16 to,
|
||||
Label* on_not_in_range) {
|
||||
PrintablePrinter printable_from(from);
|
||||
PrintablePrinter printable_to(to);
|
||||
PrintF(" CheckCharacterInRange(from=0x%04x%s, to=0x%04x%s, label[%08x]);\n",
|
||||
from,
|
||||
*printable_from,
|
||||
to,
|
||||
*printable_to,
|
||||
LabelToInt(on_not_in_range));
|
||||
assembler_->CheckCharacterInRange(from, to, on_not_in_range);
|
||||
}
|
||||
|
||||
|
||||
void RegExpMacroAssemblerTracer::CheckCharacterNotInRange(
|
||||
uc16 from,
|
||||
uc16 to,
|
||||
Label* on_in_range) {
|
||||
PrintablePrinter printable_from(from);
|
||||
PrintablePrinter printable_to(to);
|
||||
PrintF(
|
||||
" CheckCharacterNotInRange(from=0x%04x%s," " to=%04x%s, label[%08x]);\n",
|
||||
from,
|
||||
*printable_from,
|
||||
to,
|
||||
*printable_to,
|
||||
LabelToInt(on_in_range));
|
||||
assembler_->CheckCharacterNotInRange(from, to, on_in_range);
|
||||
}
|
||||
|
||||
|
||||
// Traces the call, dumping the table as rows of 32 marks ('X' for a non-zero
// entry, '.' for a zero entry), then forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::CheckBitInTable(Handle<ByteArray> table,
                                                 Label* on_bit_set) {
  PrintF(" CheckBitInTable(label[%08x] ", LabelToInt(on_bit_set));
  for (int i = 0; i < kTableSize; i++) {
    const char mark = table->get(i) != 0 ? 'X' : '.';
    PrintF("%c", mark);
    // Break the line after every 32nd entry, except after the last one.
    const bool row_complete = (i % 32 == 31);
    if (row_complete && i != kTableMask) {
      PrintF("\n ");
    }
  }
  PrintF(");\n");
  assembler_->CheckBitInTable(table, on_bit_set);
}
|
||||
|
||||
|
||||
void RegExpMacroAssemblerTracer::CheckNotBackReference(int start_reg,
|
||||
bool read_backward,
|
||||
Label* on_no_match) {
|
||||
PrintF(" CheckNotBackReference(register=%d, %s, label[%08x]);\n", start_reg,
|
||||
read_backward ? "backward" : "forward", LabelToInt(on_no_match));
|
||||
assembler_->CheckNotBackReference(start_reg, read_backward, on_no_match);
|
||||
}
|
||||
|
||||
void RegExpMacroAssemblerTracer::CheckNotBackReferenceIgnoreCase(
|
||||
int start_reg, bool read_backward, Label* on_no_match) {
|
||||
PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s, label[%08x]);\n",
|
||||
start_reg, read_backward ? "backward" : "forward",
|
||||
LabelToInt(on_no_match));
|
||||
assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward,
|
||||
on_no_match);
|
||||
}
|
||||
|
||||
void RegExpMacroAssemblerTracer::CheckPosition(int cp_offset,
|
||||
Label* on_outside_input) {
|
||||
PrintF(" CheckPosition(cp_offset=%d, label[%08x]);\n", cp_offset,
|
||||
LabelToInt(on_outside_input));
|
||||
assembler_->CheckPosition(cp_offset, on_outside_input);
|
||||
}
|
||||
|
||||
|
||||
// Forwards to the wrapped assembler first, because the trace line includes
// the returned value. Returns that value unchanged.
bool RegExpMacroAssemblerTracer::CheckSpecialCharacterClass(
    uc16 type, Label* on_no_match) {
  const bool supported =
      assembler_->CheckSpecialCharacterClass(type, on_no_match);
  const char* verdict = supported ? "true" : "false";
  PrintF(" CheckSpecialCharacterClass(type='%c', label[%08x]): %s;\n", type,
         LabelToInt(on_no_match), verdict);
  return supported;
}
|
||||
|
||||
|
||||
void RegExpMacroAssemblerTracer::IfRegisterLT(int register_index,
|
||||
int comparand, Label* if_lt) {
|
||||
PrintF(" IfRegisterLT(register=%d, number=%d, label[%08x]);\n",
|
||||
register_index, comparand, LabelToInt(if_lt));
|
||||
assembler_->IfRegisterLT(register_index, comparand, if_lt);
|
||||
}
|
||||
|
||||
|
||||
void RegExpMacroAssemblerTracer::IfRegisterEqPos(int register_index,
|
||||
Label* if_eq) {
|
||||
PrintF(" IfRegisterEqPos(register=%d, label[%08x]);\n",
|
||||
register_index, LabelToInt(if_eq));
|
||||
assembler_->IfRegisterEqPos(register_index, if_eq);
|
||||
}
|
||||
|
||||
|
||||
void RegExpMacroAssemblerTracer::IfRegisterGE(int register_index,
|
||||
int comparand, Label* if_ge) {
|
||||
PrintF(" IfRegisterGE(register=%d, number=%d, label[%08x]);\n",
|
||||
register_index, comparand, LabelToInt(if_ge));
|
||||
assembler_->IfRegisterGE(register_index, comparand, if_ge);
|
||||
}
|
||||
|
||||
|
||||
// Reports the implementation of the wrapped assembler; nothing is traced.
RegExpMacroAssembler::IrregexpImplementation
RegExpMacroAssemblerTracer::Implementation() {
  const IrregexpImplementation wrapped = assembler_->Implementation();
  return wrapped;
}
|
||||
|
||||
|
||||
// Traces the request together with the source string, then returns whatever
// the wrapped assembler produces.
Handle<HeapObject> RegExpMacroAssemblerTracer::GetCode(Handle<String> source) {
  {
    // Keep the C string alive for the duration of the print only.
    auto source_chars = source->ToCString();
    PrintF(" GetCode(%s);\n", source_chars.get());
  }
  return assembler_->GetCode(source);
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
|
@ -0,0 +1,80 @@
|
|||
// Copyright 2008 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_
|
||||
#define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_
|
||||
|
||||
#include "regexp/regexp-macro-assembler.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// Decorator on a RegExpMacroAssembler that write all calls.
|
||||
class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
|
||||
public:
|
||||
RegExpMacroAssemblerTracer(Isolate* isolate, RegExpMacroAssembler* assembler);
|
||||
~RegExpMacroAssemblerTracer() override;
|
||||
void AbortedCodeGeneration() override;
|
||||
int stack_limit_slack() override { return assembler_->stack_limit_slack(); }
|
||||
bool CanReadUnaligned() override { return assembler_->CanReadUnaligned(); }
|
||||
void AdvanceCurrentPosition(int by) override; // Signed cp change.
|
||||
void AdvanceRegister(int reg, int by) override; // r[reg] += by.
|
||||
void Backtrack() override;
|
||||
void Bind(Label* label) override;
|
||||
void CheckCharacter(unsigned c, Label* on_equal) override;
|
||||
void CheckCharacterAfterAnd(unsigned c, unsigned and_with,
|
||||
Label* on_equal) override;
|
||||
void CheckCharacterGT(uc16 limit, Label* on_greater) override;
|
||||
void CheckCharacterLT(uc16 limit, Label* on_less) override;
|
||||
void CheckGreedyLoop(Label* on_tos_equals_current_position) override;
|
||||
void CheckAtStart(int cp_offset, Label* on_at_start) override;
|
||||
void CheckNotAtStart(int cp_offset, Label* on_not_at_start) override;
|
||||
void CheckNotBackReference(int start_reg, bool read_backward,
|
||||
Label* on_no_match) override;
|
||||
void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward,
|
||||
Label* on_no_match) override;
|
||||
void CheckNotCharacter(unsigned c, Label* on_not_equal) override;
|
||||
void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with,
|
||||
Label* on_not_equal) override;
|
||||
void CheckNotCharacterAfterMinusAnd(uc16 c, uc16 minus, uc16 and_with,
|
||||
Label* on_not_equal) override;
|
||||
void CheckCharacterInRange(uc16 from, uc16 to, Label* on_in_range) override;
|
||||
void CheckCharacterNotInRange(uc16 from, uc16 to,
|
||||
Label* on_not_in_range) override;
|
||||
void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) override;
|
||||
void CheckPosition(int cp_offset, Label* on_outside_input) override;
|
||||
bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match) override;
|
||||
void Fail() override;
|
||||
Handle<HeapObject> GetCode(Handle<String> source) override;
|
||||
void GoTo(Label* label) override;
|
||||
void IfRegisterGE(int reg, int comparand, Label* if_ge) override;
|
||||
void IfRegisterLT(int reg, int comparand, Label* if_lt) override;
|
||||
void IfRegisterEqPos(int reg, Label* if_eq) override;
|
||||
IrregexpImplementation Implementation() override;
|
||||
void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
|
||||
bool check_bounds, int characters,
|
||||
int eats_at_least) override;
|
||||
void PopCurrentPosition() override;
|
||||
void PopRegister(int register_index) override;
|
||||
void PushBacktrack(Label* label) override;
|
||||
void PushCurrentPosition() override;
|
||||
void PushRegister(int register_index,
|
||||
StackCheckFlag check_stack_limit) override;
|
||||
void ReadCurrentPositionFromRegister(int reg) override;
|
||||
void ReadStackPointerFromRegister(int reg) override;
|
||||
void SetCurrentPositionFromEnd(int by) override;
|
||||
void SetRegister(int register_index, int to) override;
|
||||
bool Succeed() override;
|
||||
void WriteCurrentPositionToRegister(int reg, int cp_offset) override;
|
||||
void ClearRegisters(int reg_from, int reg_to) override;
|
||||
void WriteStackPointerToRegister(int reg) override;
|
||||
|
||||
private:
|
||||
RegExpMacroAssembler* assembler_;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_
|
|
@ -0,0 +1,344 @@
|
|||
// Copyright 2012 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "regexp/regexp-macro-assembler.h"
|
||||
|
||||
#include "regexp/regexp-stack.h"
|
||||
|
||||
#ifdef V8_INTL_SUPPORT
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/unistr.h"
|
||||
#endif // V8_INTL_SUPPORT
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// Starts out with the slow-safe compiler flag cleared and the global mode set
// to NOT_GLOBAL; stores the isolate and zone it is given.
RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
    : slow_safe_compiler_(false),
      global_mode_(NOT_GLOBAL),
      isolate_(isolate),
      zone_(zone) {}
|
||||
|
||||
// Defaulted: no explicit cleanup is performed here.
RegExpMacroAssembler::~RegExpMacroAssembler() = default;
|
||||
|
||||
// Compares two UC16 substrings of |byte_length| bytes each, ignoring case.
// Returns 1 if they are equal under case folding/canonicalization and 0
// otherwise. With V8_INTL_SUPPORT the comparison is delegated to ICU;
// otherwise characters are canonicalized one by one through the isolate's
// Ecma262Canonicalize mapping.
int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
                                                     Address byte_offset2,
                                                     size_t byte_length,
                                                     Isolate* isolate) {
  // This function is not allowed to cause a garbage collection.
  // A GC might move the calling generated code and invalidate the
  // return address on the stack.
  DCHECK_EQ(0, byte_length % 2);

#ifdef V8_INTL_SUPPORT
  const int32_t char_count = static_cast<int32_t>(byte_length >> 1);
  icu::UnicodeString lhs(reinterpret_cast<const char16_t*>(byte_offset1),
                         char_count);
  const auto order =
      lhs.caseCompare(reinterpret_cast<const char16_t*>(byte_offset2),
                      char_count, U_FOLD_CASE_DEFAULT);
  return order == 0;
#else
  uc16* lhs = reinterpret_cast<uc16*>(byte_offset1);
  uc16* rhs = reinterpret_cast<uc16*>(byte_offset2);
  const size_t char_count = byte_length >> 1;
  DCHECK_NOT_NULL(isolate);
  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
      isolate->regexp_macro_assembler_canonicalize();
  for (size_t i = 0; i < char_count; i++) {
    unibrow::uchar c1 = lhs[i];
    unibrow::uchar c2 = rhs[i];
    if (c1 == c2) continue;

    // Canonicalize the left character; it may already equal the raw right
    // character.
    unibrow::uchar s1[1] = {c1};
    canonicalize->get(c1, '\0', s1);
    if (s1[0] == c2) continue;

    // Otherwise both canonical forms have to agree.
    unibrow::uchar s2[1] = {c2};
    canonicalize->get(c2, '\0', s2);
    if (s1[0] != s2[0]) {
      return 0;
    }
  }
  return 1;
#endif  // V8_INTL_SUPPORT
}
|
||||
|
||||
|
||||
void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
|
||||
Label* on_failure) {
|
||||
Label ok;
|
||||
// Check that current character is not a trail surrogate.
|
||||
LoadCurrentCharacter(cp_offset, &ok);
|
||||
CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
|
||||
// Check that previous character is not a lead surrogate.
|
||||
LoadCurrentCharacter(cp_offset - 1, &ok);
|
||||
CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
|
||||
Bind(&ok);
|
||||
}
|
||||
|
||||
void RegExpMacroAssembler::CheckPosition(int cp_offset,
|
||||
Label* on_outside_input) {
|
||||
LoadCurrentCharacter(cp_offset, on_outside_input, true);
|
||||
}
|
||||
|
||||
void RegExpMacroAssembler::LoadCurrentCharacter(int cp_offset,
|
||||
Label* on_end_of_input,
|
||||
bool check_bounds,
|
||||
int characters,
|
||||
int eats_at_least) {
|
||||
// By default, eats_at_least = characters.
|
||||
if (eats_at_least == kUseCharactersValue) {
|
||||
eats_at_least = characters;
|
||||
}
|
||||
|
||||
LoadCurrentCharacterImpl(cp_offset, on_end_of_input, check_bounds, characters,
|
||||
eats_at_least);
|
||||
}
|
||||
|
||||
// Base implementation: offers no custom support for any character class, so
// it emits nothing and always reports false, whatever |type| is.
bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
                                                      Label* on_no_match) {
  constexpr bool kHasCustomSupport = false;
  return kHasCustomSupport;
}
|
||||
|
||||
NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
|
||||
Zone* zone)
|
||||
: RegExpMacroAssembler(isolate, zone) {}
|
||||
|
||||
NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default;
|
||||
|
||||
bool NativeRegExpMacroAssembler::CanReadUnaligned() {
|
||||
return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
|
||||
}
|
||||
|
||||
#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
|
||||
|
||||
// This method may only be called after an interrupt.
|
||||
int NativeRegExpMacroAssembler::CheckStackGuardState(
|
||||
Isolate* isolate, int start_index, RegExp::CallOrigin call_origin,
|
||||
Address* return_address, Code re_code, Address* subject,
|
||||
const byte** input_start, const byte** input_end) {
|
||||
DisallowHeapAllocation no_gc;
|
||||
Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0);
|
||||
DCHECK_LE(re_code.raw_instruction_start(), old_pc);
|
||||
DCHECK_LE(old_pc, re_code.raw_instruction_end());
|
||||
|
||||
StackLimitCheck check(isolate);
|
||||
bool js_has_overflowed = check.JsHasOverflowed();
|
||||
|
||||
if (call_origin == RegExp::CallOrigin::kFromJs) {
|
||||
// Direct calls from JavaScript can be interrupted in two ways:
|
||||
// 1. A real stack overflow, in which case we let the caller throw the
|
||||
// exception.
|
||||
// 2. The stack guard was used to interrupt execution for another purpose,
|
||||
// forcing the call through the runtime system.
|
||||
|
||||
// Bug(v8:9540) Investigate why this method is called from JS although no
|
||||
// stackoverflow or interrupt is pending on ARM64. We return 0 in this case
|
||||
// to continue execution normally.
|
||||
if (js_has_overflowed) {
|
||||
return EXCEPTION;
|
||||
} else if (check.InterruptRequested()) {
|
||||
return RETRY;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
DCHECK(call_origin == RegExp::CallOrigin::kFromRuntime);
|
||||
|
||||
// Prepare for possible GC.
|
||||
HandleScope handles(isolate);
|
||||
Handle<Code> code_handle(re_code, isolate);
|
||||
Handle<String> subject_handle(String::cast(Object(*subject)), isolate);
|
||||
bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle);
|
||||
int return_value = 0;
|
||||
|
||||
if (js_has_overflowed) {
|
||||
AllowHeapAllocation yes_gc;
|
||||
isolate->StackOverflow();
|
||||
return_value = EXCEPTION;
|
||||
} else if (check.InterruptRequested()) {
|
||||
AllowHeapAllocation yes_gc;
|
||||
Object result = isolate->stack_guard()->HandleInterrupts();
|
||||
if (result.IsException(isolate)) return_value = EXCEPTION;
|
||||
}
|
||||
|
||||
if (*code_handle != re_code) { // Return address no longer valid
|
||||
// Overwrite the return address on the stack.
|
||||
intptr_t delta = code_handle->address() - re_code.address();
|
||||
Address new_pc = old_pc + delta;
|
||||
// TODO(v8:10026): avoid replacing a signed pointer.
|
||||
PointerAuthentication::ReplacePC(return_address, new_pc, 0);
|
||||
}
|
||||
|
||||
// If we continue, we need to update the subject string addresses.
|
||||
if (return_value == 0) {
|
||||
// String encoding might have changed.
|
||||
if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
|
||||
is_one_byte) {
|
||||
// If we changed between an LATIN1 and an UC16 string, the specialized
|
||||
// code cannot be used, and we need to restart regexp matching from
|
||||
// scratch (including, potentially, compiling a new version of the code).
|
||||
return_value = RETRY;
|
||||
} else {
|
||||
*subject = subject_handle->ptr();
|
||||
intptr_t byte_length = *input_end - *input_start;
|
||||
*input_start = subject_handle->AddressOfCharacterAt(start_index, no_gc);
|
||||
*input_end = *input_start + byte_length;
|
||||
}
|
||||
}
|
||||
return return_value;
|
||||
}
|
||||
|
||||
// Returns a {Result} sentinel, or the number of successful matches.
|
||||
int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
|
||||
Handle<String> subject,
|
||||
int* offsets_vector,
|
||||
int offsets_vector_length,
|
||||
int previous_index, Isolate* isolate) {
|
||||
DCHECK(subject->IsFlat());
|
||||
DCHECK_LE(0, previous_index);
|
||||
DCHECK_LE(previous_index, subject->length());
|
||||
|
||||
// No allocations before calling the regexp, but we can't use
|
||||
// DisallowHeapAllocation, since regexps might be preempted, and another
|
||||
// thread might do allocation anyway.
|
||||
|
||||
String subject_ptr = *subject;
|
||||
// Character offsets into string.
|
||||
int start_offset = previous_index;
|
||||
int char_length = subject_ptr.length() - start_offset;
|
||||
int slice_offset = 0;
|
||||
|
||||
// The string has been flattened, so if it is a cons string it contains the
|
||||
// full string in the first part.
|
||||
if (StringShape(subject_ptr).IsCons()) {
|
||||
DCHECK_EQ(0, ConsString::cast(subject_ptr).second().length());
|
||||
subject_ptr = ConsString::cast(subject_ptr).first();
|
||||
} else if (StringShape(subject_ptr).IsSliced()) {
|
||||
SlicedString slice = SlicedString::cast(subject_ptr);
|
||||
subject_ptr = slice.parent();
|
||||
slice_offset = slice.offset();
|
||||
}
|
||||
if (StringShape(subject_ptr).IsThin()) {
|
||||
subject_ptr = ThinString::cast(subject_ptr).actual();
|
||||
}
|
||||
// Ensure that an underlying string has the same representation.
|
||||
bool is_one_byte = subject_ptr.IsOneByteRepresentation();
|
||||
DCHECK(subject_ptr.IsExternalString() || subject_ptr.IsSeqString());
|
||||
// String is now either Sequential or External
|
||||
int char_size_shift = is_one_byte ? 0 : 1;
|
||||
|
||||
DisallowHeapAllocation no_gc;
|
||||
const byte* input_start =
|
||||
subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc);
|
||||
int byte_length = char_length << char_size_shift;
|
||||
const byte* input_end = input_start + byte_length;
|
||||
return Execute(*subject, start_offset, input_start, input_end, offsets_vector,
|
||||
offsets_vector_length, isolate, *regexp);
|
||||
}
|
||||
|
||||
// Returns a {Result} sentinel, or the number of successful matches.
|
||||
// TODO(pthier): The JSRegExp object is passed to native irregexp code to match
|
||||
// the signature of the interpreter. We should get rid of JS objects passed to
|
||||
// internal methods.
|
||||
int NativeRegExpMacroAssembler::Execute(
|
||||
String input, // This needs to be the unpacked (sliced, cons) string.
|
||||
int start_offset, const byte* input_start, const byte* input_end,
|
||||
int* output, int output_size, Isolate* isolate, JSRegExp regexp) {
|
||||
// Ensure that the minimum stack has been allocated.
|
||||
RegExpStackScope stack_scope(isolate);
|
||||
Address stack_base = stack_scope.stack()->stack_base();
|
||||
|
||||
bool is_one_byte = String::IsOneByteRepresentationUnderneath(input);
|
||||
Code code = Code::cast(regexp.Code(is_one_byte));
|
||||
RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime;
|
||||
|
||||
using RegexpMatcherSig = int(
|
||||
Address input_string, int start_offset, // NOLINT(readability/casting)
|
||||
const byte* input_start, const byte* input_end, int* output,
|
||||
int output_size, Address stack_base, int call_origin, Isolate* isolate,
|
||||
Address regexp);
|
||||
|
||||
auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code);
|
||||
int result =
|
||||
fn.Call(input.ptr(), start_offset, input_start, input_end, output,
|
||||
output_size, stack_base, call_origin, isolate, regexp.ptr());
|
||||
DCHECK(result >= RETRY);
|
||||
|
||||
if (result == EXCEPTION && !isolate->has_pending_exception()) {
|
||||
// We detected a stack overflow (on the backtrack stack) in RegExp code,
|
||||
// but haven't created the exception yet. Additionally, we allow heap
|
||||
// allocation because even though it invalidates {input_start} and
|
||||
// {input_end}, we are about to return anyway.
|
||||
AllowHeapAllocation allow_allocation;
|
||||
isolate->StackOverflow();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
#endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
|
||||
|
||||
// clang-format off
|
||||
const byte NativeRegExpMacroAssembler::word_character_map[] = {
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // '0' - '7'
|
||||
0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9'
|
||||
|
||||
0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'A' - 'G'
|
||||
0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'H' - 'O'
|
||||
0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'P' - 'W'
|
||||
0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu, // 'X' - 'Z', '_'
|
||||
|
||||
0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'a' - 'g'
|
||||
0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'h' - 'o'
|
||||
0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'p' - 'w'
|
||||
0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
|
||||
// Latin-1 range
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
};
|
||||
// clang-format on
|
||||
|
||||
// Asks the isolate's regexp stack for twice its current capacity.
//
// stack_pointer: current backtrack stack pointer; must not be above the
//     current stack base.
// stack_base: in/out; must hold the current stack base on entry and receives
//     the new base on success.
//
// Returns the stack pointer relative to the new base (the same distance below
// it as before), or kNullAddress if the capacity could not be ensured, in
// which case |*stack_base| is left untouched.
Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
                                              Address* stack_base,
                                              Isolate* isolate) {
  RegExpStack* regexp_stack = isolate->regexp_stack();
  size_t current_capacity = regexp_stack->stack_capacity();
  Address previous_base = regexp_stack->stack_base();
  DCHECK(previous_base == *stack_base);
  DCHECK(stack_pointer <= previous_base);
  DCHECK(static_cast<size_t>(previous_base - stack_pointer) <=
         current_capacity);

  // Distance between the stack pointer and the base; it is re-applied to the
  // new base below.
  intptr_t used_bytes = previous_base - stack_pointer;

  Address grown_base = regexp_stack->EnsureCapacity(current_capacity * 2);
  if (grown_base == kNullAddress) return kNullAddress;

  *stack_base = grown_base;
  return grown_base - used_bytes;
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
|
@ -0,0 +1,280 @@
|
|||
// Copyright 2012 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_
|
||||
#define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_
|
||||
|
||||
#include "regexp/regexp-ast.h"
|
||||
#include "regexp/regexp-shim.h"
|
||||
#include "regexp/regexp.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// Inclusive bounds of the UTF-16 lead (high) surrogate range.
static constexpr uc32 kLeadSurrogateStart = 0xd800;
static constexpr uc32 kLeadSurrogateEnd = 0xdbff;
// Inclusive bounds of the UTF-16 trail (low) surrogate range.
static constexpr uc32 kTrailSurrogateStart = 0xdc00;
static constexpr uc32 kTrailSurrogateEnd = 0xdfff;
// Inclusive bounds of the code points outside the Basic Multilingual Plane.
static constexpr uc32 kNonBmpStart = 0x10000;
static constexpr uc32 kNonBmpEnd = 0x10ffff;
|
||||
|
||||
struct DisjunctDecisionRow {
|
||||
RegExpCharacterClass cc;
|
||||
Label* on_match;
|
||||
};
|
||||
|
||||
|
||||
class RegExpMacroAssembler {
|
||||
public:
|
||||
// The implementation must be able to handle at least:
|
||||
static const int kMaxRegister = (1 << 16) - 1;
|
||||
static const int kMaxCPOffset = (1 << 15) - 1;
|
||||
static const int kMinCPOffset = -(1 << 15);
|
||||
|
||||
static const int kTableSizeBits = 7;
|
||||
static const int kTableSize = 1 << kTableSizeBits;
|
||||
static const int kTableMask = kTableSize - 1;
|
||||
|
||||
static constexpr int kUseCharactersValue = -1;
|
||||
|
||||
enum IrregexpImplementation {
|
||||
kIA32Implementation,
|
||||
kARMImplementation,
|
||||
kARM64Implementation,
|
||||
kMIPSImplementation,
|
||||
kS390Implementation,
|
||||
kPPCImplementation,
|
||||
kX64Implementation,
|
||||
kX87Implementation,
|
||||
kBytecodeImplementation
|
||||
};
|
||||
|
||||
enum StackCheckFlag {
|
||||
kNoStackLimitCheck = false,
|
||||
kCheckStackLimit = true
|
||||
};
|
||||
|
||||
RegExpMacroAssembler(Isolate* isolate, Zone* zone);
|
||||
virtual ~RegExpMacroAssembler();
|
||||
// This function is called when code generation is aborted, so that
|
||||
// the assembler could clean up internal data structures.
|
||||
virtual void AbortedCodeGeneration() {}
|
||||
// The maximal number of pushes between stack checks. Users must supply
|
||||
// kCheckStackLimit flag to push operations (instead of kNoStackLimitCheck)
|
||||
// at least once for every stack_limit() pushes that are executed.
|
||||
virtual int stack_limit_slack() = 0;
|
||||
virtual bool CanReadUnaligned() = 0;
|
||||
virtual void AdvanceCurrentPosition(int by) = 0; // Signed cp change.
|
||||
virtual void AdvanceRegister(int reg, int by) = 0; // r[reg] += by.
|
||||
// Continues execution from the position pushed on the top of the backtrack
|
||||
// stack by an earlier PushBacktrack(Label*).
|
||||
virtual void Backtrack() = 0;
|
||||
virtual void Bind(Label* label) = 0;
|
||||
// Dispatch after looking the current character up in a 2-bits-per-entry
|
||||
// map. The destinations vector has up to 4 labels.
|
||||
virtual void CheckCharacter(unsigned c, Label* on_equal) = 0;
|
||||
// Bitwise and the current character with the given constant and then
|
||||
// check for a match with c.
|
||||
virtual void CheckCharacterAfterAnd(unsigned c,
|
||||
unsigned and_with,
|
||||
Label* on_equal) = 0;
|
||||
virtual void CheckCharacterGT(uc16 limit, Label* on_greater) = 0;
|
||||
virtual void CheckCharacterLT(uc16 limit, Label* on_less) = 0;
|
||||
virtual void CheckGreedyLoop(Label* on_tos_equals_current_position) = 0;
|
||||
virtual void CheckAtStart(int cp_offset, Label* on_at_start) = 0;
|
||||
virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start) = 0;
|
||||
virtual void CheckNotBackReference(int start_reg, bool read_backward,
|
||||
Label* on_no_match) = 0;
|
||||
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
|
||||
bool read_backward,
|
||||
Label* on_no_match) = 0;
|
||||
// Check the current character for a match with a literal character. If we
|
||||
// fail to match then goto the on_failure label. End of input always
|
||||
// matches. If the label is nullptr then we should pop a backtrack address
|
||||
// off the stack and go to that.
|
||||
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal) = 0;
|
||||
virtual void CheckNotCharacterAfterAnd(unsigned c,
|
||||
unsigned and_with,
|
||||
Label* on_not_equal) = 0;
|
||||
// Subtract a constant from the current character, then and with the given
|
||||
// constant and then check for a match with c.
|
||||
virtual void CheckNotCharacterAfterMinusAnd(uc16 c,
|
||||
uc16 minus,
|
||||
uc16 and_with,
|
||||
Label* on_not_equal) = 0;
|
||||
virtual void CheckCharacterInRange(uc16 from,
|
||||
uc16 to, // Both inclusive.
|
||||
Label* on_in_range) = 0;
|
||||
virtual void CheckCharacterNotInRange(uc16 from,
|
||||
uc16 to, // Both inclusive.
|
||||
Label* on_not_in_range) = 0;
|
||||
|
||||
// The current character (modulus the kTableSize) is looked up in the byte
|
||||
// array, and if the found byte is non-zero, we jump to the on_bit_set label.
|
||||
virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) = 0;
|
||||
|
||||
// Checks whether the given offset from the current position is before
|
||||
// the end of the string. May overwrite the current character.
|
||||
virtual void CheckPosition(int cp_offset, Label* on_outside_input);
|
||||
// Check whether a standard/default character class matches the current
|
||||
// character. Returns false if the type of special character class does
|
||||
// not have custom support.
|
||||
// May clobber the current loaded character.
|
||||
virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match);
|
||||
|
||||
// Control-flow integrity:
|
||||
// Define a jump target and bind a label.
|
||||
virtual void BindJumpTarget(Label* label) { Bind(label); }
|
||||
|
||||
virtual void Fail() = 0;
|
||||
virtual Handle<HeapObject> GetCode(Handle<String> source) = 0;
|
||||
virtual void GoTo(Label* label) = 0;
|
||||
// Check whether a register is >= a given constant and go to a label if it
|
||||
// is. Backtracks instead if the label is nullptr.
|
||||
virtual void IfRegisterGE(int reg, int comparand, Label* if_ge) = 0;
|
||||
// Check whether a register is < a given constant and go to a label if it is.
|
||||
// Backtracks instead if the label is nullptr.
|
||||
virtual void IfRegisterLT(int reg, int comparand, Label* if_lt) = 0;
|
||||
// Check whether a register is == to the current position and go to a
|
||||
// label if it is.
|
||||
virtual void IfRegisterEqPos(int reg, Label* if_eq) = 0;
|
||||
virtual IrregexpImplementation Implementation() = 0;
|
||||
V8_EXPORT_PRIVATE void LoadCurrentCharacter(
|
||||
int cp_offset, Label* on_end_of_input, bool check_bounds = true,
|
||||
int characters = 1, int eats_at_least = kUseCharactersValue);
|
||||
virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
|
||||
bool check_bounds, int characters,
|
||||
int eats_at_least) = 0;
|
||||
virtual void PopCurrentPosition() = 0;
|
||||
virtual void PopRegister(int register_index) = 0;
|
||||
// Pushes the label on the backtrack stack, so that a following Backtrack
|
||||
// will go to this label. Always checks the backtrack stack limit.
|
||||
virtual void PushBacktrack(Label* label) = 0;
|
||||
virtual void PushCurrentPosition() = 0;
|
||||
virtual void PushRegister(int register_index,
|
||||
StackCheckFlag check_stack_limit) = 0;
|
||||
virtual void ReadCurrentPositionFromRegister(int reg) = 0;
|
||||
virtual void ReadStackPointerFromRegister(int reg) = 0;
|
||||
virtual void SetCurrentPositionFromEnd(int by) = 0;
|
||||
virtual void SetRegister(int register_index, int to) = 0;
|
||||
// Return whether the matching (with a global regexp) will be restarted.
|
||||
virtual bool Succeed() = 0;
|
||||
virtual void WriteCurrentPositionToRegister(int reg, int cp_offset) = 0;
|
||||
virtual void ClearRegisters(int reg_from, int reg_to) = 0;
|
||||
virtual void WriteStackPointerToRegister(int reg) = 0;
|
||||
|
||||
// Compares two-byte strings case insensitively.
|
||||
// Called from generated RegExp code.
|
||||
static int CaseInsensitiveCompareUC16(Address byte_offset1,
|
||||
Address byte_offset2,
|
||||
size_t byte_length, Isolate* isolate);
|
||||
|
||||
// Check that we are not in the middle of a surrogate pair.
|
||||
void CheckNotInSurrogatePair(int cp_offset, Label* on_failure);
|
||||
|
||||
// Controls the generation of large inlined constants in the code.
|
||||
void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; }
|
||||
bool slow_safe() { return slow_safe_compiler_; }
|
||||
|
||||
void set_backtrack_limit(uint32_t backtrack_limit) {
|
||||
backtrack_limit_ = backtrack_limit;
|
||||
}
|
||||
|
||||
enum GlobalMode {
|
||||
NOT_GLOBAL,
|
||||
GLOBAL_NO_ZERO_LENGTH_CHECK,
|
||||
GLOBAL,
|
||||
GLOBAL_UNICODE
|
||||
};
|
||||
// Set whether the regular expression has the global flag. Exiting due to
|
||||
// a failure in a global regexp may still mean success overall.
|
||||
inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; }
|
||||
inline bool global() { return global_mode_ != NOT_GLOBAL; }
|
||||
inline bool global_with_zero_length_check() {
|
||||
return global_mode_ == GLOBAL || global_mode_ == GLOBAL_UNICODE;
|
||||
}
|
||||
inline bool global_unicode() { return global_mode_ == GLOBAL_UNICODE; }
|
||||
|
||||
Isolate* isolate() const { return isolate_; }
|
||||
Zone* zone() const { return zone_; }
|
||||
|
||||
protected:
|
||||
bool has_backtrack_limit() const {
|
||||
return backtrack_limit_ != JSRegExp::kNoBacktrackLimit;
|
||||
}
|
||||
uint32_t backtrack_limit() const { return backtrack_limit_; }
|
||||
|
||||
private:
|
||||
bool slow_safe_compiler_;
|
||||
uint32_t backtrack_limit_ = JSRegExp::kNoBacktrackLimit;
|
||||
GlobalMode global_mode_;
|
||||
Isolate* isolate_;
|
||||
Zone* zone_;
|
||||
};
|
||||
|
||||
class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
|
||||
public:
|
||||
// Type of input string to generate code for.
|
||||
enum Mode { LATIN1 = 1, UC16 = 2 };
|
||||
|
||||
// Result of calling generated native RegExp code.
|
||||
// RETRY: Something significant changed during execution, and the matching
|
||||
// should be retried from scratch.
|
||||
// EXCEPTION: Something failed during execution. If no exception has been
|
||||
// thrown, it's an internal out-of-memory, and the caller should
|
||||
// throw the exception.
|
||||
// FAILURE: Matching failed.
|
||||
// SUCCESS: Matching succeeded, and the output array has been filled with
|
||||
// capture positions.
|
||||
enum Result {
|
||||
FAILURE = RegExp::kInternalRegExpFailure,
|
||||
SUCCESS = RegExp::kInternalRegExpSuccess,
|
||||
EXCEPTION = RegExp::kInternalRegExpException,
|
||||
RETRY = RegExp::kInternalRegExpRetry,
|
||||
};
|
||||
|
||||
NativeRegExpMacroAssembler(Isolate* isolate, Zone* zone);
|
||||
~NativeRegExpMacroAssembler() override;
|
||||
bool CanReadUnaligned() override;
|
||||
|
||||
// Returns a {Result} sentinel, or the number of successful matches.
|
||||
static int Match(Handle<JSRegExp> regexp, Handle<String> subject,
|
||||
int* offsets_vector, int offsets_vector_length,
|
||||
int previous_index, Isolate* isolate);
|
||||
|
||||
// Called from RegExp if the backtrack stack limit is hit.
|
||||
// Tries to expand the stack. Returns the new stack-pointer if
|
||||
// successful, and updates the stack_top address, or returns 0 if unable
|
||||
// to grow the stack.
|
||||
// This function must not trigger a garbage collection.
|
||||
static Address GrowStack(Address stack_pointer, Address* stack_top,
|
||||
Isolate* isolate);
|
||||
|
||||
static int CheckStackGuardState(Isolate* isolate, int start_index,
|
||||
RegExp::CallOrigin call_origin,
|
||||
Address* return_address, Code re_code,
|
||||
Address* subject, const byte** input_start,
|
||||
const byte** input_end);
|
||||
|
||||
// Byte map of one byte characters with a 0xff if the character is a word
|
||||
// character (digit, letter or underscore) and 0x00 otherwise.
|
||||
// Used by generated RegExp code.
|
||||
static const byte word_character_map[256];
|
||||
|
||||
static Address word_character_map_address() {
|
||||
return reinterpret_cast<Address>(&word_character_map[0]);
|
||||
}
|
||||
|
||||
// Returns a {Result} sentinel, or the number of successful matches.
|
||||
V8_EXPORT_PRIVATE static int Execute(String input, int start_offset,
|
||||
const byte* input_start,
|
||||
const byte* input_end, int* output,
|
||||
int output_size, Isolate* isolate,
|
||||
JSRegExp regexp);
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,750 @@
|
|||
// Copyright 2019 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_REGEXP_NODES_H_
|
||||
#define V8_REGEXP_REGEXP_NODES_H_
|
||||
|
||||
#include "regexp/regexp-macro-assembler.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
class AlternativeGenerationList;
|
||||
class BoyerMooreLookahead;
|
||||
class GreedyLoopState;
|
||||
class Label;
|
||||
class NodeVisitor;
|
||||
class QuickCheckDetails;
|
||||
class RegExpCompiler;
|
||||
class Trace;
|
||||
struct PreloadState;
|
||||
class ChoiceNode;
|
||||
|
||||
// X-macro over the node kinds: expands to one VISIT(Name) invocation per
// entry, in the order listed.
#define FOR_EACH_NODE_TYPE(VISIT) \
  VISIT(End)                      \
  VISIT(Action)                   \
  VISIT(Choice)                   \
  VISIT(LoopChoice)               \
  VISIT(NegativeLookaroundChoice) \
  VISIT(BackReference)            \
  VISIT(Assertion)                \
  VISIT(Text)
|
||||
|
||||
// Packed set of per-node flags: analysis progress, the node's "interests" in
// the preceding character, and a few bookkeeping bits. All flags start false.
struct NodeInfo final {
  NodeInfo()
      : being_analyzed(false),
        been_analyzed(false),
        follows_word_interest(false),
        follows_newline_interest(false),
        follows_start_interest(false),
        at_end(false),
        visited(false),
        replacement_calculated(false) {}

  // Returns true if the interests and assumptions of this node match those of
  // |that|. Only at_end and the three follows_* bits are compared.
  bool Matches(NodeInfo* that) {
    if (at_end != that->at_end) return false;
    if (follows_word_interest != that->follows_word_interest) return false;
    if (follows_newline_interest != that->follows_newline_interest) {
      return false;
    }
    return follows_start_interest == that->follows_start_interest;
  }

  // Updates the interests of this node given the interests of the node
  // preceding it: at_end and the three follows_* bits are or-ed in.
  void AddFromPreceding(NodeInfo* that) {
    if (that->at_end) at_end = true;
    AddFromFollowing(that);
  }

  // True if any of the three follows_* interest bits is set.
  bool HasLookbehind() {
    if (follows_word_interest) return true;
    if (follows_newline_interest) return true;
    return follows_start_interest;
  }

  // Sets the interests of this node to include the interests of the
  // following node. Unlike AddFromPreceding, at_end is left alone.
  void AddFromFollowing(NodeInfo* that) {
    if (that->follows_word_interest) follows_word_interest = true;
    if (that->follows_newline_interest) follows_newline_interest = true;
    if (that->follows_start_interest) follows_start_interest = true;
  }

  // Clears the two analysis-progress bits; every other flag is preserved.
  void ResetCompilationState() {
    been_analyzed = false;
    being_analyzed = false;
  }

  bool being_analyzed : 1;
  bool been_analyzed : 1;

  // These bits are set if this node has to know what the preceding
  // character was.
  bool follows_word_interest : 1;
  bool follows_newline_interest : 1;
  bool follows_start_interest : 1;

  bool at_end : 1;
  bool visited : 1;
  bool replacement_calculated : 1;
};
|
||||
|
||||
// Pair of lower bounds on the number of characters a successful match must
// consume, one for each assumption about being at the start of the input.
struct EatsAtLeastInfo final {
  // Both bounds default to zero.
  EatsAtLeastInfo() : EatsAtLeastInfo(0) {}

  // Initializes both bounds to |eats|.
  explicit EatsAtLeastInfo(uint8_t eats)
      : eats_at_least_from_possibly_start(eats),
        eats_at_least_from_not_start(eats) {}

  // Lowers each bound to the corresponding bound of |other| when that one is
  // smaller; the two fields are handled independently.
  void SetMin(const EatsAtLeastInfo& other) {
    const uint8_t their_possibly_start =
        other.eats_at_least_from_possibly_start;
    const uint8_t their_not_start = other.eats_at_least_from_not_start;
    eats_at_least_from_possibly_start =
        their_possibly_start < eats_at_least_from_possibly_start
            ? their_possibly_start
            : eats_at_least_from_possibly_start;
    eats_at_least_from_not_start =
        their_not_start < eats_at_least_from_not_start
            ? their_not_start
            : eats_at_least_from_not_start;
  }

  // Any successful match starting from the current node will consume at least
  // this many characters. This does not necessarily mean that there is a
  // possible match with exactly this many characters, but we generally try to
  // get this number as high as possible to allow for early exit on failure.
  uint8_t eats_at_least_from_possibly_start;

  // Like eats_at_least_from_possibly_start, but with the additional assumption
  // that start-of-string assertions (^) can't match. This value is greater than
  // or equal to eats_at_least_from_possibly_start.
  uint8_t eats_at_least_from_not_start;
};
|
||||
|
||||
class RegExpNode : public ZoneObject {
|
||||
public:
|
||||
explicit RegExpNode(Zone* zone)
|
||||
: replacement_(nullptr),
|
||||
on_work_list_(false),
|
||||
trace_count_(0),
|
||||
zone_(zone) {
|
||||
bm_info_[0] = bm_info_[1] = nullptr;
|
||||
}
|
||||
virtual ~RegExpNode();
|
||||
virtual void Accept(NodeVisitor* visitor) = 0;
|
||||
// Generates a goto to this node or actually generates the code at this point.
|
||||
virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
|
||||
// How many characters must this node consume at a minimum in order to
|
||||
// succeed. The not_at_start argument is used to indicate that we know we are
|
||||
// not at the start of the input. In this case anchored branches will always
|
||||
// fail and can be ignored when determining how many characters are consumed
|
||||
// on success. If this node has not been analyzed yet, EatsAtLeast returns 0.
|
||||
int EatsAtLeast(bool not_at_start);
|
||||
// Returns how many characters this node must consume in order to succeed,
|
||||
// given that this is a LoopChoiceNode whose counter register is in a
|
||||
// newly-initialized state at the current position in the generated code. For
|
||||
// example, consider /a{6,8}/. Absent any extra information, the
|
||||
// LoopChoiceNode for the repetition must report that it consumes at least
|
||||
// zero characters, because it may have already looped several times. However,
|
||||
// with a newly-initialized counter, it can report that it consumes at least
|
||||
// six characters.
|
||||
virtual EatsAtLeastInfo EatsAtLeastFromLoopEntry();
|
||||
// Emits some quick code that checks whether the preloaded characters match.
|
||||
// Falls through on certain failure, jumps to the label on possible success.
|
||||
// If the node cannot make a quick check it does nothing and returns false.
|
||||
bool EmitQuickCheck(RegExpCompiler* compiler, Trace* bounds_check_trace,
|
||||
Trace* trace, bool preload_has_checked_bounds,
|
||||
Label* on_possible_success,
|
||||
QuickCheckDetails* details_return,
|
||||
bool fall_through_on_failure, ChoiceNode* predecessor);
|
||||
// For a given number of characters this returns a mask and a value. The
|
||||
// next n characters are anded with the mask and compared with the value.
|
||||
// A comparison failure indicates the node cannot match the next n characters.
|
||||
// A comparison success indicates the node may match.
|
||||
virtual void GetQuickCheckDetails(QuickCheckDetails* details,
|
||||
RegExpCompiler* compiler,
|
||||
int characters_filled_in,
|
||||
bool not_at_start) = 0;
|
||||
// Fills in quick check details for this node, given that this is a
|
||||
// LoopChoiceNode whose counter register is in a newly-initialized state at
|
||||
// the current position in the generated code. For example, consider /a{6,8}/.
|
||||
// Absent any extra information, the LoopChoiceNode for the repetition cannot
|
||||
// generate any useful quick check because a match might be the (empty)
|
||||
// continuation node. However, with a newly-initialized counter, it can
|
||||
// generate a quick check for several 'a' characters at once.
|
||||
virtual void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
|
||||
RegExpCompiler* compiler,
|
||||
int characters_filled_in,
|
||||
bool not_at_start);
|
||||
static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
|
||||
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
|
||||
// Only returns the successor for a text node of length 1 that matches any
|
||||
// character and that has no guards on it.
|
||||
virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
|
||||
RegExpCompiler* compiler) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Collects information on the possible code units (mod 128) that can match if
|
||||
// we look forward. This is used for a Boyer-Moore-like string searching
|
||||
// implementation. TODO(erikcorry): This should share more code with
|
||||
// EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit
|
||||
// the number of nodes we are willing to look at in order to create this data.
|
||||
static const int kRecursionBudget = 200;
|
||||
bool KeepRecursing(RegExpCompiler* compiler);
|
||||
virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
|
||||
BoyerMooreLookahead* bm, bool not_at_start) {
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
// If we know that the input is one-byte then there are some nodes that can
|
||||
// never match. This method returns a node that can be substituted for
|
||||
// itself, or nullptr if the node can never match.
|
||||
virtual RegExpNode* FilterOneByte(int depth) { return this; }
|
||||
// Helper for FilterOneByte.
|
||||
RegExpNode* replacement() {
|
||||
DCHECK(info()->replacement_calculated);
|
||||
return replacement_;
|
||||
}
|
||||
RegExpNode* set_replacement(RegExpNode* replacement) {
|
||||
info()->replacement_calculated = true;
|
||||
replacement_ = replacement;
|
||||
return replacement; // For convenience.
|
||||
}
|
||||
|
||||
// We want to avoid recalculating the lookahead info, so we store it on the
|
||||
// node. Only info that is for this node is stored. We can tell that the
|
||||
// info is for this node when offset == 0, so the information is calculated
|
||||
// relative to this node.
|
||||
void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
|
||||
if (offset == 0) set_bm_info(not_at_start, bm);
|
||||
}
|
||||
|
||||
Label* label() { return &label_; }
|
||||
// If non-generic code is generated for a node (i.e. the node is not at the
|
||||
// start of the trace) then it cannot be reused. This variable sets a limit
|
||||
// on how often we allow that to happen before we insist on starting a new
|
||||
// trace and generating generic code for a node that can be reused by flushing
|
||||
// the deferred actions in the current trace and generating a goto.
|
||||
static const int kMaxCopiesCodeGenerated = 10;
|
||||
|
||||
bool on_work_list() { return on_work_list_; }
|
||||
void set_on_work_list(bool value) { on_work_list_ = value; }
|
||||
|
||||
NodeInfo* info() { return &info_; }
|
||||
const EatsAtLeastInfo* eats_at_least_info() const { return &eats_at_least_; }
|
||||
void set_eats_at_least_info(const EatsAtLeastInfo& eats_at_least) {
|
||||
eats_at_least_ = eats_at_least;
|
||||
}
|
||||
|
||||
BoyerMooreLookahead* bm_info(bool not_at_start) {
|
||||
return bm_info_[not_at_start ? 1 : 0];
|
||||
}
|
||||
|
||||
Zone* zone() const { return zone_; }
|
||||
|
||||
protected:
|
||||
enum LimitResult { DONE, CONTINUE };
|
||||
RegExpNode* replacement_;
|
||||
|
||||
LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
|
||||
|
||||
void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
|
||||
bm_info_[not_at_start ? 1 : 0] = bm;
|
||||
}
|
||||
|
||||
private:
|
||||
static const int kFirstCharBudget = 10;
|
||||
Label label_;
|
||||
bool on_work_list_;
|
||||
NodeInfo info_;
|
||||
|
||||
// Saved values for EatsAtLeast results, to avoid recomputation. Filled in
|
||||
// during analysis (valid if info_.been_analyzed is true).
|
||||
EatsAtLeastInfo eats_at_least_;
|
||||
|
||||
// This variable keeps track of how many times code has been generated for
|
||||
// this node (in different traces). We don't keep track of where the
|
||||
// generated code is located unless the code is generated at the start of
|
||||
// a trace, in which case it is generic and can be reused by flushing the
|
||||
// deferred operations in the current trace and generating a goto.
|
||||
int trace_count_;
|
||||
BoyerMooreLookahead* bm_info_[2];
|
||||
|
||||
Zone* zone_;
|
||||
};
|
||||
|
||||
class SeqRegExpNode : public RegExpNode {
|
||||
public:
|
||||
explicit SeqRegExpNode(RegExpNode* on_success)
|
||||
: RegExpNode(on_success->zone()), on_success_(on_success) {}
|
||||
RegExpNode* on_success() { return on_success_; }
|
||||
void set_on_success(RegExpNode* node) { on_success_ = node; }
|
||||
RegExpNode* FilterOneByte(int depth) override;
|
||||
void FillInBMInfo(Isolate* isolate, int offset, int budget,
|
||||
BoyerMooreLookahead* bm, bool not_at_start) override {
|
||||
on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
|
||||
if (offset == 0) set_bm_info(not_at_start, bm);
|
||||
}
|
||||
|
||||
protected:
|
||||
RegExpNode* FilterSuccessor(int depth);
|
||||
|
||||
private:
|
||||
RegExpNode* on_success_;
|
||||
};
|
||||
|
||||
class ActionNode : public SeqRegExpNode {
|
||||
public:
|
||||
enum ActionType {
|
||||
SET_REGISTER_FOR_LOOP,
|
||||
INCREMENT_REGISTER,
|
||||
STORE_POSITION,
|
||||
BEGIN_SUBMATCH,
|
||||
POSITIVE_SUBMATCH_SUCCESS,
|
||||
EMPTY_MATCH_CHECK,
|
||||
CLEAR_CAPTURES
|
||||
};
|
||||
static ActionNode* SetRegisterForLoop(int reg, int val,
|
||||
RegExpNode* on_success);
|
||||
static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
|
||||
static ActionNode* StorePosition(int reg, bool is_capture,
|
||||
RegExpNode* on_success);
|
||||
static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
|
||||
static ActionNode* BeginSubmatch(int stack_pointer_reg, int position_reg,
|
||||
RegExpNode* on_success);
|
||||
static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
|
||||
int restore_reg,
|
||||
int clear_capture_count,
|
||||
int clear_capture_from,
|
||||
RegExpNode* on_success);
|
||||
static ActionNode* EmptyMatchCheck(int start_register,
|
||||
int repetition_register,
|
||||
int repetition_limit,
|
||||
RegExpNode* on_success);
|
||||
void Accept(NodeVisitor* visitor) override;
|
||||
void Emit(RegExpCompiler* compiler, Trace* trace) override;
|
||||
void GetQuickCheckDetails(QuickCheckDetails* details,
|
||||
RegExpCompiler* compiler, int filled_in,
|
||||
bool not_at_start) override;
|
||||
void FillInBMInfo(Isolate* isolate, int offset, int budget,
|
||||
BoyerMooreLookahead* bm, bool not_at_start) override;
|
||||
ActionType action_type() { return action_type_; }
|
||||
// TODO(erikcorry): We should allow some action nodes in greedy loops.
|
||||
int GreedyLoopTextLength() override {
|
||||
return kNodeIsTooComplexForGreedyLoops;
|
||||
}
|
||||
|
||||
private:
|
||||
union {
|
||||
struct {
|
||||
int reg;
|
||||
int value;
|
||||
} u_store_register;
|
||||
struct {
|
||||
int reg;
|
||||
} u_increment_register;
|
||||
struct {
|
||||
int reg;
|
||||
bool is_capture;
|
||||
} u_position_register;
|
||||
struct {
|
||||
int stack_pointer_register;
|
||||
int current_position_register;
|
||||
int clear_register_count;
|
||||
int clear_register_from;
|
||||
} u_submatch;
|
||||
struct {
|
||||
int start_register;
|
||||
int repetition_register;
|
||||
int repetition_limit;
|
||||
} u_empty_match_check;
|
||||
struct {
|
||||
int range_from;
|
||||
int range_to;
|
||||
} u_clear_captures;
|
||||
} data_;
|
||||
ActionNode(ActionType action_type, RegExpNode* on_success)
|
||||
: SeqRegExpNode(on_success), action_type_(action_type) {}
|
||||
ActionType action_type_;
|
||||
friend class DotPrinterImpl;
|
||||
};
|
||||
|
||||
class TextNode : public SeqRegExpNode {
|
||||
public:
|
||||
TextNode(ZoneList<TextElement>* elms, bool read_backward,
|
||||
RegExpNode* on_success)
|
||||
: SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
|
||||
TextNode(RegExpCharacterClass* that, bool read_backward,
|
||||
RegExpNode* on_success)
|
||||
: SeqRegExpNode(on_success),
|
||||
elms_(new (zone()) ZoneList<TextElement>(1, zone())),
|
||||
read_backward_(read_backward) {
|
||||
elms_->Add(TextElement::CharClass(that), zone());
|
||||
}
|
||||
// Create TextNode for a single character class for the given ranges.
|
||||
static TextNode* CreateForCharacterRanges(Zone* zone,
|
||||
ZoneList<CharacterRange>* ranges,
|
||||
bool read_backward,
|
||||
RegExpNode* on_success,
|
||||
JSRegExp::Flags flags);
|
||||
// Create TextNode for a surrogate pair with a range given for the
|
||||
// lead and the trail surrogate each.
|
||||
static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
|
||||
CharacterRange trail,
|
||||
bool read_backward,
|
||||
RegExpNode* on_success,
|
||||
JSRegExp::Flags flags);
|
||||
void Accept(NodeVisitor* visitor) override;
|
||||
void Emit(RegExpCompiler* compiler, Trace* trace) override;
|
||||
void GetQuickCheckDetails(QuickCheckDetails* details,
|
||||
RegExpCompiler* compiler, int characters_filled_in,
|
||||
bool not_at_start) override;
|
||||
ZoneList<TextElement>* elements() { return elms_; }
|
||||
bool read_backward() { return read_backward_; }
|
||||
void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
|
||||
int GreedyLoopTextLength() override;
|
||||
RegExpNode* GetSuccessorOfOmnivorousTextNode(
|
||||
RegExpCompiler* compiler) override;
|
||||
void FillInBMInfo(Isolate* isolate, int offset, int budget,
|
||||
BoyerMooreLookahead* bm, bool not_at_start) override;
|
||||
void CalculateOffsets();
|
||||
RegExpNode* FilterOneByte(int depth) override;
|
||||
int Length();
|
||||
|
||||
private:
|
||||
enum TextEmitPassType {
|
||||
NON_LATIN1_MATCH, // Check for characters that can't match.
|
||||
SIMPLE_CHARACTER_MATCH, // Case-dependent single character check.
|
||||
NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs.
|
||||
CASE_CHARACTER_MATCH, // Case-independent single character check.
|
||||
CHARACTER_CLASS_MATCH // Character class.
|
||||
};
|
||||
static bool SkipPass(TextEmitPassType pass, bool ignore_case);
|
||||
static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
|
||||
static const int kLastPass = CHARACTER_CLASS_MATCH;
|
||||
void TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
|
||||
bool preloaded, Trace* trace, bool first_element_checked,
|
||||
int* checked_up_to);
|
||||
ZoneList<TextElement>* elms_;
|
||||
bool read_backward_;
|
||||
};
|
||||
|
||||
class AssertionNode : public SeqRegExpNode {
|
||||
public:
|
||||
enum AssertionType {
|
||||
AT_END,
|
||||
AT_START,
|
||||
AT_BOUNDARY,
|
||||
AT_NON_BOUNDARY,
|
||||
AFTER_NEWLINE
|
||||
};
|
||||
static AssertionNode* AtEnd(RegExpNode* on_success) {
|
||||
return new (on_success->zone()) AssertionNode(AT_END, on_success);
|
||||
}
|
||||
static AssertionNode* AtStart(RegExpNode* on_success) {
|
||||
return new (on_success->zone()) AssertionNode(AT_START, on_success);
|
||||
}
|
||||
static AssertionNode* AtBoundary(RegExpNode* on_success) {
|
||||
return new (on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
|
||||
}
|
||||
static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
|
||||
return new (on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
|
||||
}
|
||||
static AssertionNode* AfterNewline(RegExpNode* on_success) {
|
||||
return new (on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
|
||||
}
|
||||
void Accept(NodeVisitor* visitor) override;
|
||||
void Emit(RegExpCompiler* compiler, Trace* trace) override;
|
||||
void GetQuickCheckDetails(QuickCheckDetails* details,
|
||||
RegExpCompiler* compiler, int filled_in,
|
||||
bool not_at_start) override;
|
||||
void FillInBMInfo(Isolate* isolate, int offset, int budget,
|
||||
BoyerMooreLookahead* bm, bool not_at_start) override;
|
||||
AssertionType assertion_type() { return assertion_type_; }
|
||||
|
||||
private:
|
||||
void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
|
||||
enum IfPrevious { kIsNonWord, kIsWord };
|
||||
void BacktrackIfPrevious(RegExpCompiler* compiler, Trace* trace,
|
||||
IfPrevious backtrack_if_previous);
|
||||
AssertionNode(AssertionType t, RegExpNode* on_success)
|
||||
: SeqRegExpNode(on_success), assertion_type_(t) {}
|
||||
AssertionType assertion_type_;
|
||||
};
|
||||
|
||||
class BackReferenceNode : public SeqRegExpNode {
|
||||
public:
|
||||
BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags,
|
||||
bool read_backward, RegExpNode* on_success)
|
||||
: SeqRegExpNode(on_success),
|
||||
start_reg_(start_reg),
|
||||
end_reg_(end_reg),
|
||||
flags_(flags),
|
||||
read_backward_(read_backward) {}
|
||||
void Accept(NodeVisitor* visitor) override;
|
||||
int start_register() { return start_reg_; }
|
||||
int end_register() { return end_reg_; }
|
||||
bool read_backward() { return read_backward_; }
|
||||
void Emit(RegExpCompiler* compiler, Trace* trace) override;
|
||||
void GetQuickCheckDetails(QuickCheckDetails* details,
|
||||
RegExpCompiler* compiler, int characters_filled_in,
|
||||
bool not_at_start) override {
|
||||
return;
|
||||
}
|
||||
void FillInBMInfo(Isolate* isolate, int offset, int budget,
|
||||
BoyerMooreLookahead* bm, bool not_at_start) override;
|
||||
|
||||
private:
|
||||
int start_reg_;
|
||||
int end_reg_;
|
||||
JSRegExp::Flags flags_;
|
||||
bool read_backward_;
|
||||
};
|
||||
|
||||
class EndNode : public RegExpNode {
|
||||
public:
|
||||
enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
|
||||
EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
|
||||
void Accept(NodeVisitor* visitor) override;
|
||||
void Emit(RegExpCompiler* compiler, Trace* trace) override;
|
||||
void GetQuickCheckDetails(QuickCheckDetails* details,
|
||||
RegExpCompiler* compiler, int characters_filled_in,
|
||||
bool not_at_start) override {
|
||||
// Returning 0 from EatsAtLeast should ensure we never get here.
|
||||
UNREACHABLE();
|
||||
}
|
||||
void FillInBMInfo(Isolate* isolate, int offset, int budget,
|
||||
BoyerMooreLookahead* bm, bool not_at_start) override {
|
||||
// Returning 0 from EatsAtLeast should ensure we never get here.
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
private:
|
||||
Action action_;
|
||||
};
|
||||
|
||||
class NegativeSubmatchSuccess : public EndNode {
|
||||
public:
|
||||
NegativeSubmatchSuccess(int stack_pointer_reg, int position_reg,
|
||||
int clear_capture_count, int clear_capture_start,
|
||||
Zone* zone)
|
||||
: EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
|
||||
stack_pointer_register_(stack_pointer_reg),
|
||||
current_position_register_(position_reg),
|
||||
clear_capture_count_(clear_capture_count),
|
||||
clear_capture_start_(clear_capture_start) {}
|
||||
void Emit(RegExpCompiler* compiler, Trace* trace) override;
|
||||
|
||||
private:
|
||||
int stack_pointer_register_;
|
||||
int current_position_register_;
|
||||
int clear_capture_count_;
|
||||
int clear_capture_start_;
|
||||
};
|
||||
|
||||
// Stores a register number, a relation (LT or GEQ) and an integer value.
// NOTE(review): presumably a condition on a register that gates a guarded
// alternative -- confirm against ChoiceNode::GenerateGuard.
class Guard : public ZoneObject {
 public:
  enum Relation { LT, GEQ };

  Guard(int reg, Relation op, int value) : reg_(reg), op_(op), value_(value) {}

  // Read-only accessors; const-qualified so they are usable through const
  // Guard references and pointers as well.
  int reg() const { return reg_; }
  Relation op() const { return op_; }
  int value() const { return value_; }

 private:
  int reg_;
  Relation op_;
  int value_;
};
|
||||
|
||||
class GuardedAlternative {
|
||||
public:
|
||||
explicit GuardedAlternative(RegExpNode* node)
|
||||
: node_(node), guards_(nullptr) {}
|
||||
void AddGuard(Guard* guard, Zone* zone);
|
||||
RegExpNode* node() { return node_; }
|
||||
void set_node(RegExpNode* node) { node_ = node; }
|
||||
ZoneList<Guard*>* guards() { return guards_; }
|
||||
|
||||
private:
|
||||
RegExpNode* node_;
|
||||
ZoneList<Guard*>* guards_;
|
||||
};
|
||||
|
||||
class AlternativeGeneration;
|
||||
|
||||
class ChoiceNode : public RegExpNode {
|
||||
public:
|
||||
explicit ChoiceNode(int expected_size, Zone* zone)
|
||||
: RegExpNode(zone),
|
||||
alternatives_(new (zone)
|
||||
ZoneList<GuardedAlternative>(expected_size, zone)),
|
||||
not_at_start_(false),
|
||||
being_calculated_(false) {}
|
||||
void Accept(NodeVisitor* visitor) override;
|
||||
void AddAlternative(GuardedAlternative node) {
|
||||
alternatives()->Add(node, zone());
|
||||
}
|
||||
ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
|
||||
void Emit(RegExpCompiler* compiler, Trace* trace) override;
|
||||
void GetQuickCheckDetails(QuickCheckDetails* details,
|
||||
RegExpCompiler* compiler, int characters_filled_in,
|
||||
bool not_at_start) override;
|
||||
void FillInBMInfo(Isolate* isolate, int offset, int budget,
|
||||
BoyerMooreLookahead* bm, bool not_at_start) override;
|
||||
|
||||
bool being_calculated() { return being_calculated_; }
|
||||
bool not_at_start() { return not_at_start_; }
|
||||
void set_not_at_start() { not_at_start_ = true; }
|
||||
void set_being_calculated(bool b) { being_calculated_ = b; }
|
||||
virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
|
||||
return true;
|
||||
}
|
||||
RegExpNode* FilterOneByte(int depth) override;
|
||||
virtual bool read_backward() { return false; }
|
||||
|
||||
protected:
|
||||
int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
|
||||
ZoneList<GuardedAlternative>* alternatives_;
|
||||
|
||||
private:
|
||||
template <typename...>
|
||||
friend class Analysis;
|
||||
|
||||
void GenerateGuard(RegExpMacroAssembler* macro_assembler, Guard* guard,
|
||||
Trace* trace);
|
||||
int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
|
||||
void EmitOutOfLineContinuation(RegExpCompiler* compiler, Trace* trace,
|
||||
GuardedAlternative alternative,
|
||||
AlternativeGeneration* alt_gen,
|
||||
int preload_characters,
|
||||
bool next_expects_preload);
|
||||
void SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace,
|
||||
PreloadState* preloads);
|
||||
void AssertGuardsMentionRegisters(Trace* trace);
|
||||
int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
|
||||
Trace* EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace,
|
||||
AlternativeGenerationList* alt_gens,
|
||||
PreloadState* preloads,
|
||||
GreedyLoopState* greedy_loop_state, int text_length);
|
||||
void EmitChoices(RegExpCompiler* compiler,
|
||||
AlternativeGenerationList* alt_gens, int first_choice,
|
||||
Trace* trace, PreloadState* preloads);
|
||||
|
||||
// If true, this node is never checked at the start of the input.
|
||||
// Allows a new trace to start with at_start() set to false.
|
||||
bool not_at_start_;
|
||||
bool being_calculated_;
|
||||
};
|
||||
|
||||
class NegativeLookaroundChoiceNode : public ChoiceNode {
|
||||
public:
|
||||
explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
|
||||
GuardedAlternative then_do_this,
|
||||
Zone* zone)
|
||||
: ChoiceNode(2, zone) {
|
||||
AddAlternative(this_must_fail);
|
||||
AddAlternative(then_do_this);
|
||||
}
|
||||
void GetQuickCheckDetails(QuickCheckDetails* details,
|
||||
RegExpCompiler* compiler, int characters_filled_in,
|
||||
bool not_at_start) override;
|
||||
void FillInBMInfo(Isolate* isolate, int offset, int budget,
|
||||
BoyerMooreLookahead* bm, bool not_at_start) override {
|
||||
continue_node()->FillInBMInfo(isolate, offset, budget - 1, bm,
|
||||
not_at_start);
|
||||
if (offset == 0) set_bm_info(not_at_start, bm);
|
||||
}
|
||||
static constexpr int kLookaroundIndex = 0;
|
||||
static constexpr int kContinueIndex = 1;
|
||||
RegExpNode* lookaround_node() {
|
||||
return alternatives()->at(kLookaroundIndex).node();
|
||||
}
|
||||
RegExpNode* continue_node() {
|
||||
return alternatives()->at(kContinueIndex).node();
|
||||
}
|
||||
// For a negative lookahead we don't emit the quick check for the
|
||||
// alternative that is expected to fail. This is because quick check code
|
||||
// starts by loading enough characters for the alternative that takes fewest
|
||||
// characters, but on a negative lookahead the negative branch did not take
|
||||
// part in that calculation (EatsAtLeast) so the assumptions don't hold.
|
||||
bool try_to_emit_quick_check_for_alternative(bool is_first) override {
|
||||
return !is_first;
|
||||
}
|
||||
void Accept(NodeVisitor* visitor) override;
|
||||
RegExpNode* FilterOneByte(int depth) override;
|
||||
};
|
||||
|
||||
class LoopChoiceNode : public ChoiceNode {
|
||||
public:
|
||||
LoopChoiceNode(bool body_can_be_zero_length, bool read_backward,
|
||||
int min_loop_iterations, Zone* zone)
|
||||
: ChoiceNode(2, zone),
|
||||
loop_node_(nullptr),
|
||||
continue_node_(nullptr),
|
||||
body_can_be_zero_length_(body_can_be_zero_length),
|
||||
read_backward_(read_backward),
|
||||
traversed_loop_initialization_node_(false),
|
||||
min_loop_iterations_(min_loop_iterations) {}
|
||||
void AddLoopAlternative(GuardedAlternative alt);
|
||||
void AddContinueAlternative(GuardedAlternative alt);
|
||||
void Emit(RegExpCompiler* compiler, Trace* trace) override;
|
||||
void GetQuickCheckDetails(QuickCheckDetails* details,
|
||||
RegExpCompiler* compiler, int characters_filled_in,
|
||||
bool not_at_start) override;
|
||||
void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
|
||||
RegExpCompiler* compiler,
|
||||
int characters_filled_in,
|
||||
bool not_at_start) override;
|
||||
void FillInBMInfo(Isolate* isolate, int offset, int budget,
|
||||
BoyerMooreLookahead* bm, bool not_at_start) override;
|
||||
EatsAtLeastInfo EatsAtLeastFromLoopEntry() override;
|
||||
RegExpNode* loop_node() { return loop_node_; }
|
||||
RegExpNode* continue_node() { return continue_node_; }
|
||||
bool body_can_be_zero_length() { return body_can_be_zero_length_; }
|
||||
int min_loop_iterations() const { return min_loop_iterations_; }
|
||||
bool read_backward() override { return read_backward_; }
|
||||
void Accept(NodeVisitor* visitor) override;
|
||||
RegExpNode* FilterOneByte(int depth) override;
|
||||
|
||||
private:
|
||||
// AddAlternative is made private for loop nodes because alternatives
|
||||
// should not be added freely, we need to keep track of which node
|
||||
// goes back to the node itself.
|
||||
void AddAlternative(GuardedAlternative node) {
|
||||
ChoiceNode::AddAlternative(node);
|
||||
}
|
||||
|
||||
RegExpNode* loop_node_;
|
||||
RegExpNode* continue_node_;
|
||||
bool body_can_be_zero_length_;
|
||||
bool read_backward_;
|
||||
|
||||
// Temporary marker set only while generating quick check details. Represents
|
||||
// whether GetQuickCheckDetails traversed the initialization node for this
|
||||
// loop's counter. If so, we may be able to generate stricter quick checks
|
||||
// because we know the loop node must match at least min_loop_iterations_
|
||||
// times before the continuation node can match.
|
||||
bool traversed_loop_initialization_node_;
|
||||
|
||||
// The minimum number of times the loop_node_ must match before the
|
||||
// continue_node_ might be considered. This value can be temporarily decreased
|
||||
// while generating quick check details, to represent the remaining iterations
|
||||
// after the completed portion of the quick check details.
|
||||
int min_loop_iterations_;
|
||||
|
||||
friend class IterationDecrementer;
|
||||
friend class LoopInitializationMarker;
|
||||
};
|
||||
|
||||
class NodeVisitor {
|
||||
public:
|
||||
virtual ~NodeVisitor() = default;
|
||||
#define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that) = 0;
|
||||
FOR_EACH_NODE_TYPE(DECLARE_VISIT)
|
||||
#undef DECLARE_VISIT
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_REGEXP_NODES_H_
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,361 @@
|
|||
// Copyright 2016 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_REGEXP_PARSER_H_
|
||||
#define V8_REGEXP_REGEXP_PARSER_H_
|
||||
|
||||
#include "regexp/regexp-ast.h"
|
||||
#include "regexp/regexp-error.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
struct RegExpCompileData;
|
||||
|
||||
// A BufferedZoneList is an automatically growing list, just like (and backed
|
||||
// by) a ZoneList, that is optimized for the case of adding and removing
|
||||
// a single element. The last element added is stored outside the backing list,
|
||||
// and if no more than one element is ever added, the ZoneList isn't even
|
||||
// allocated.
|
||||
// Elements must not be null pointers.
|
||||
template <typename T, int initial_size>
|
||||
class BufferedZoneList {
|
||||
public:
|
||||
BufferedZoneList() : list_(nullptr), last_(nullptr) {}
|
||||
|
||||
// Adds element at end of list. This element is buffered and can
|
||||
// be read using last() or removed using RemoveLast until a new Add or until
|
||||
// RemoveLast or GetList has been called.
|
||||
void Add(T* value, Zone* zone) {
|
||||
if (last_ != nullptr) {
|
||||
if (list_ == nullptr) {
|
||||
list_ = new (zone) ZoneList<T*>(initial_size, zone);
|
||||
}
|
||||
list_->Add(last_, zone);
|
||||
}
|
||||
last_ = value;
|
||||
}
|
||||
|
||||
T* last() {
|
||||
DCHECK(last_ != nullptr);
|
||||
return last_;
|
||||
}
|
||||
|
||||
T* RemoveLast() {
|
||||
DCHECK(last_ != nullptr);
|
||||
T* result = last_;
|
||||
if ((list_ != nullptr) && (list_->length() > 0))
|
||||
last_ = list_->RemoveLast();
|
||||
else
|
||||
last_ = nullptr;
|
||||
return result;
|
||||
}
|
||||
|
||||
T* Get(int i) {
|
||||
DCHECK((0 <= i) && (i < length()));
|
||||
if (list_ == nullptr) {
|
||||
DCHECK_EQ(0, i);
|
||||
return last_;
|
||||
} else {
|
||||
if (i == list_->length()) {
|
||||
DCHECK(last_ != nullptr);
|
||||
return last_;
|
||||
} else {
|
||||
return list_->at(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Clear() {
|
||||
list_ = nullptr;
|
||||
last_ = nullptr;
|
||||
}
|
||||
|
||||
int length() {
|
||||
int length = (list_ == nullptr) ? 0 : list_->length();
|
||||
return length + ((last_ == nullptr) ? 0 : 1);
|
||||
}
|
||||
|
||||
ZoneList<T*>* GetList(Zone* zone) {
|
||||
if (list_ == nullptr) {
|
||||
list_ = new (zone) ZoneList<T*>(initial_size, zone);
|
||||
}
|
||||
if (last_ != nullptr) {
|
||||
list_->Add(last_, zone);
|
||||
last_ = nullptr;
|
||||
}
|
||||
return list_;
|
||||
}
|
||||
|
||||
private:
|
||||
ZoneList<T*>* list_;
|
||||
T* last_;
|
||||
};
|
||||
|
||||
|
||||
// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
|
||||
class RegExpBuilder : public ZoneObject {
|
||||
public:
|
||||
RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
|
||||
void AddCharacter(uc16 character);
|
||||
void AddUnicodeCharacter(uc32 character);
|
||||
void AddEscapedUnicodeCharacter(uc32 character);
|
||||
// "Adds" an empty expression. Does nothing except consume a
|
||||
// following quantifier
|
||||
void AddEmpty();
|
||||
void AddCharacterClass(RegExpCharacterClass* cc);
|
||||
void AddCharacterClassForDesugaring(uc32 c);
|
||||
void AddAtom(RegExpTree* tree);
|
||||
void AddTerm(RegExpTree* tree);
|
||||
void AddAssertion(RegExpTree* tree);
|
||||
void NewAlternative(); // '|'
|
||||
bool AddQuantifierToAtom(int min, int max,
|
||||
RegExpQuantifier::QuantifierType type);
|
||||
void FlushText();
|
||||
RegExpTree* ToRegExp();
|
||||
JSRegExp::Flags flags() const { return flags_; }
|
||||
void set_flags(JSRegExp::Flags flags) { flags_ = flags; }
|
||||
|
||||
bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
|
||||
bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
|
||||
bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; }
|
||||
|
||||
private:
|
||||
static const uc16 kNoPendingSurrogate = 0;
|
||||
void AddLeadSurrogate(uc16 lead_surrogate);
|
||||
void AddTrailSurrogate(uc16 trail_surrogate);
|
||||
void FlushPendingSurrogate();
|
||||
void FlushCharacters();
|
||||
void FlushTerms();
|
||||
bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
|
||||
bool NeedsDesugaringForIgnoreCase(uc32 c);
|
||||
Zone* zone() const { return zone_; }
|
||||
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
|
||||
|
||||
Zone* zone_;
|
||||
bool pending_empty_;
|
||||
JSRegExp::Flags flags_;
|
||||
ZoneList<uc16>* characters_;
|
||||
uc16 pending_surrogate_;
|
||||
BufferedZoneList<RegExpTree, 2> terms_;
|
||||
BufferedZoneList<RegExpTree, 2> text_;
|
||||
BufferedZoneList<RegExpTree, 2> alternatives_;
|
||||
#ifdef DEBUG
|
||||
enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
|
||||
#define LAST(x) last_added_ = x;
|
||||
#else
|
||||
#define LAST(x)
|
||||
#endif
|
||||
};
|
||||
|
||||
class V8_EXPORT_PRIVATE RegExpParser {
|
||||
public:
|
||||
RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate,
|
||||
Zone* zone);
|
||||
|
||||
static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
|
||||
JSRegExp::Flags flags, RegExpCompileData* result);
|
||||
|
||||
RegExpTree* ParsePattern();
|
||||
RegExpTree* ParseDisjunction();
|
||||
RegExpTree* ParseGroup();
|
||||
|
||||
// Parses a {...,...} quantifier and stores the range in the given
|
||||
// out parameters.
|
||||
bool ParseIntervalQuantifier(int* min_out, int* max_out);
|
||||
|
||||
// Parses and returns a single escaped character. The character
|
||||
// must not be 'b' or 'B' since they are usually handle specially.
|
||||
uc32 ParseClassCharacterEscape();
|
||||
|
||||
// Checks whether the following is a length-digit hexadecimal number,
|
||||
// and sets the value if it is.
|
||||
bool ParseHexEscape(int length, uc32* value);
|
||||
bool ParseUnicodeEscape(uc32* value);
|
||||
bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
|
||||
|
||||
bool ParsePropertyClassName(ZoneVector<char>* name_1,
|
||||
ZoneVector<char>* name_2);
|
||||
bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
|
||||
const ZoneVector<char>& name_1,
|
||||
const ZoneVector<char>& name_2);
|
||||
|
||||
RegExpTree* GetPropertySequence(const ZoneVector<char>& name_1);
|
||||
RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
|
||||
|
||||
uc32 ParseOctalLiteral();
|
||||
|
||||
// Tries to parse the input as a back reference. If successful it
|
||||
// stores the result in the output parameter and returns true. If
|
||||
// it fails it will push back the characters read so the same characters
|
||||
// can be reparsed.
|
||||
bool ParseBackReferenceIndex(int* index_out);
|
||||
|
||||
// Parse inside a class. Either add escaped class to the range, or return
|
||||
// false and pass parsed single character through |char_out|.
|
||||
void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
|
||||
bool add_unicode_case_equivalents, uc32* char_out,
|
||||
bool* is_class_escape);
|
||||
|
||||
char ParseClassEscape();
|
||||
|
||||
RegExpTree* ReportError(RegExpError error);
|
||||
void Advance();
|
||||
void Advance(int dist);
|
||||
void Reset(int pos);
|
||||
|
||||
// Reports whether the pattern might be used as a literal search string.
|
||||
// Only use if the result of the parse is a single atom node.
|
||||
bool simple();
|
||||
bool contains_anchor() { return contains_anchor_; }
|
||||
void set_contains_anchor() { contains_anchor_ = true; }
|
||||
int captures_started() { return captures_started_; }
|
||||
int position() { return next_pos_ - 1; }
|
||||
bool failed() { return failed_; }
|
||||
// The Unicode flag can't be changed using in-regexp syntax, so it's OK to
|
||||
// just read the initial flag value here.
|
||||
bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; }
|
||||
|
||||
static bool IsSyntaxCharacterOrSlash(uc32 c);
|
||||
|
||||
static const uc32 kEndMarker = (1 << 21);
|
||||
|
||||
private:
|
||||
enum SubexpressionType {
|
||||
INITIAL,
|
||||
CAPTURE, // All positive values represent captures.
|
||||
POSITIVE_LOOKAROUND,
|
||||
NEGATIVE_LOOKAROUND,
|
||||
GROUPING
|
||||
};
|
||||
|
||||
class RegExpParserState : public ZoneObject {
|
||||
public:
|
||||
// Push a state on the stack.
|
||||
RegExpParserState(RegExpParserState* previous_state,
|
||||
SubexpressionType group_type,
|
||||
RegExpLookaround::Type lookaround_type,
|
||||
int disjunction_capture_index,
|
||||
const ZoneVector<uc16>* capture_name,
|
||||
JSRegExp::Flags flags, Zone* zone)
|
||||
: previous_state_(previous_state),
|
||||
builder_(new (zone) RegExpBuilder(zone, flags)),
|
||||
group_type_(group_type),
|
||||
lookaround_type_(lookaround_type),
|
||||
disjunction_capture_index_(disjunction_capture_index),
|
||||
capture_name_(capture_name) {}
|
||||
// Parser state of containing expression, if any.
|
||||
RegExpParserState* previous_state() const { return previous_state_; }
|
||||
bool IsSubexpression() { return previous_state_ != nullptr; }
|
||||
// RegExpBuilder building this regexp's AST.
|
||||
RegExpBuilder* builder() const { return builder_; }
|
||||
// Type of regexp being parsed (parenthesized group or entire regexp).
|
||||
SubexpressionType group_type() const { return group_type_; }
|
||||
// Lookahead or Lookbehind.
|
||||
RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
|
||||
// Index in captures array of first capture in this sub-expression, if any.
|
||||
// Also the capture index of this sub-expression itself, if group_type
|
||||
// is CAPTURE.
|
||||
int capture_index() const { return disjunction_capture_index_; }
|
||||
// The name of the current sub-expression, if group_type is CAPTURE. Only
|
||||
// used for named captures.
|
||||
const ZoneVector<uc16>* capture_name() const { return capture_name_; }
|
||||
|
||||
bool IsNamedCapture() const { return capture_name_ != nullptr; }
|
||||
|
||||
// Check whether the parser is inside a capture group with the given index.
|
||||
bool IsInsideCaptureGroup(int index);
|
||||
// Check whether the parser is inside a capture group with the given name.
|
||||
bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
|
||||
|
||||
private:
|
||||
// Linked list implementation of stack of states.
|
||||
RegExpParserState* const previous_state_;
|
||||
// Builder for the stored disjunction.
|
||||
RegExpBuilder* const builder_;
|
||||
// Stored disjunction type (capture, look-ahead or grouping), if any.
|
||||
const SubexpressionType group_type_;
|
||||
// Stored read direction.
|
||||
const RegExpLookaround::Type lookaround_type_;
|
||||
// Stored disjunction's capture index (if any).
|
||||
const int disjunction_capture_index_;
|
||||
// Stored capture name (if any).
|
||||
const ZoneVector<uc16>* const capture_name_;
|
||||
};
|
||||
|
||||
// Return the 1-indexed RegExpCapture object, allocate if necessary.
|
||||
RegExpCapture* GetCapture(int index);
|
||||
|
||||
// Creates a new named capture at the specified index. Must be called exactly
|
||||
// once for each named capture. Fails if a capture with the same name is
|
||||
// encountered.
|
||||
bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
|
||||
|
||||
// Parses the name of a capture group (?<name>pattern). The name must adhere
|
||||
// to IdentifierName in the ECMAScript standard.
|
||||
const ZoneVector<uc16>* ParseCaptureGroupName();
|
||||
|
||||
bool ParseNamedBackReference(RegExpBuilder* builder,
|
||||
RegExpParserState* state);
|
||||
RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
|
||||
|
||||
// After the initial parsing pass, patch corresponding RegExpCapture objects
|
||||
// into all RegExpBackReferences. This is done after initial parsing in order
|
||||
// to avoid complicating cases in which references comes before the capture.
|
||||
void PatchNamedBackReferences();
|
||||
|
||||
Handle<FixedArray> CreateCaptureNameMap();
|
||||
|
||||
// Returns true iff the pattern contains named captures. May call
|
||||
// ScanForCaptures to look ahead at the remaining pattern.
|
||||
bool HasNamedCaptures();
|
||||
|
||||
Isolate* isolate() { return isolate_; }
|
||||
Zone* zone() const { return zone_; }
|
||||
|
||||
uc32 current() { return current_; }
|
||||
bool has_more() { return has_more_; }
|
||||
bool has_next() { return next_pos_ < in()->length(); }
|
||||
uc32 Next();
|
||||
template <bool update_position>
|
||||
uc32 ReadNext();
|
||||
FlatStringReader* in() { return in_; }
|
||||
void ScanForCaptures();
|
||||
|
||||
struct RegExpCaptureNameLess {
|
||||
bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
|
||||
DCHECK_NOT_NULL(lhs);
|
||||
DCHECK_NOT_NULL(rhs);
|
||||
return *lhs->name() < *rhs->name();
|
||||
}
|
||||
};
|
||||
|
||||
Isolate* isolate_;
|
||||
Zone* zone_;
|
||||
RegExpError error_ = RegExpError::kNone;
|
||||
int error_pos_ = 0;
|
||||
ZoneList<RegExpCapture*>* captures_;
|
||||
ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
|
||||
ZoneList<RegExpBackReference*>* named_back_references_;
|
||||
FlatStringReader* in_;
|
||||
uc32 current_;
|
||||
// These are the flags specified outside the regexp syntax ie after the
|
||||
// terminating '/' or in the second argument to the constructor. The current
|
||||
// flags are stored on the RegExpBuilder.
|
||||
JSRegExp::Flags top_level_flags_;
|
||||
int next_pos_;
|
||||
int captures_started_;
|
||||
int capture_count_; // Only valid after we have scanned for captures.
|
||||
bool has_more_;
|
||||
bool simple_;
|
||||
bool contains_anchor_;
|
||||
bool is_scanned_for_captures_;
|
||||
bool has_named_captures_; // Only valid after we have scanned for captures.
|
||||
bool failed_;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_REGEXP_PARSER_H_
|
|
@ -0,0 +1,212 @@
|
|||
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
|
||||
* vim: set ts=8 sts=2 et sw=2 tw=80:
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
// Copyright 2019 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include "regexp/regexp-shim.h"
|
||||
#include "regexp/regexp-stack.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// printf-style helper: expands |format| with the trailing arguments and
// writes the result to stdout through vprintf.
void PrintF(const char* format, ...) {
  va_list args;
  va_start(args, format);
  vprintf(format, args);
  va_end(args);
}
|
||||
|
||||
// printf-style helper: expands |format| with the trailing arguments and
// writes the result to the stream |out| through vfprintf.
void PrintF(FILE* out, const char* format, ...) {
  va_list args;
  va_start(args, format);
  vfprintf(out, format, args);
  va_end(args);
}
|
||||
|
||||
// Conversion to std::ostream&. Note that, despite the class name, the stream
// handed back is std::cerr.
StdoutStream::operator std::ostream&() const {
  return std::cerr;
}
|
||||
|
||||
template <typename T>
|
||||
std::ostream& StdoutStream::operator<<(T t) { return std::cerr << t; }
|
||||
|
||||
template std::ostream& StdoutStream::operator<<(char const* c);
|
||||
|
||||
// Origin:
|
||||
// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/utils/ostreams.cc#L120-L169
|
||||
// (This is a hand-simplified version.)
|
||||
// Writes the given character to the output escaping everything outside
|
||||
// of printable ASCII range.
|
||||
std::ostream& operator<<(std::ostream& os, const AsUC16& c) {
|
||||
uc16 v = c.value;
|
||||
bool isPrint = 0x20 < v && v <= 0x7e;
|
||||
char buf[10];
|
||||
const char* format = isPrint ? "%c" : (v <= 0xFF) ? "\\x%02x" : "\\u%04x";
|
||||
SprintfLiteral(buf, format, v);
|
||||
return os << buf;
|
||||
}
|
||||
std::ostream& operator<<(std::ostream& os, const AsUC32& c) {
|
||||
int32_t v = c.value;
|
||||
if (v <= String::kMaxUtf16CodeUnit) {
|
||||
return os << AsUC16(v);
|
||||
}
|
||||
char buf[13];
|
||||
SprintfLiteral(buf, "\\u{%06x}", v);
|
||||
return os << buf;
|
||||
}
|
||||
|
||||
// Stores the isolate and opens this scope on it.
HandleScope::HandleScope(Isolate* isolate) : isolate_(isolate) {
  isolate->openHandleScope(*this);
}
|
||||
|
||||
// Closes this scope on the isolate, passing along the level_ and
// non_gc_level_ values held by the scope.
HandleScope::~HandleScope() {
  isolate_->closeHandleScope(level_, non_gc_level_);
}
|
||||
|
||||
template <typename T>
|
||||
Handle<T>::Handle(T object, Isolate* isolate)
|
||||
: location_(isolate->getHandleLocation(JS::Value(object))) {}
|
||||
|
||||
template Handle<ByteArray>::Handle(ByteArray b, Isolate* isolate);
|
||||
template Handle<HeapObject>::Handle(JS::Value v, Isolate* isolate);
|
||||
template Handle<JSRegExp>::Handle(JSRegExp re, Isolate* isolate);
|
||||
template Handle<String>::Handle(String s, Isolate* isolate);
|
||||
|
||||
template <typename T>
|
||||
Handle<T>::Handle(JS::Value value, Isolate* isolate)
|
||||
: location_(isolate->getHandleLocation(value)) {
|
||||
T::cast(Object(value)); // Assert that value has the correct type.
|
||||
}
|
||||
|
||||
// Appends |value| to the handle arena and returns the address of the newly
// added last element. A failed append crashes through the OOM-unsafe region
// rather than being reported to the caller.
JS::Value* Isolate::getHandleLocation(JS::Value value) {
  js::AutoEnterOOMUnsafeRegion oomUnsafe;
  if (!handleArena_.Append(value)) {
    oomUnsafe.crash("Irregexp handle allocation");
  }
  auto& slot = handleArena_.GetLast();
  return &slot;
}
|
||||
|
||||
void* Isolate::allocatePseudoHandle(size_t bytes) {
|
||||
PseudoHandle<void> ptr;
|
||||
ptr.reset(js_malloc(bytes));
|
||||
if (!ptr) {
|
||||
return nullptr;
|
||||
}
|
||||
if (!uniquePtrArena_.Append(std::move(ptr))) {
|
||||
return nullptr;
|
||||
}
|
||||
return uniquePtrArena_.GetLast().get();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
PseudoHandle<T> Isolate::takeOwnership(void* ptr) {
|
||||
for (auto iter = uniquePtrArena_.IterFromLast(); !iter.Done(); iter.Prev()) {
|
||||
auto& entry = iter.Get();
|
||||
if (entry.get() == ptr) {
|
||||
PseudoHandle<T> result;
|
||||
result.reset(static_cast<T*>(entry.release()));
|
||||
return result;
|
||||
}
|
||||
}
|
||||
MOZ_CRASH("Tried to take ownership of pseudohandle that is not in the arena");
|
||||
}
|
||||
|
||||
// Takes the ByteArrayData referenced by value_ out of the isolate's arena
// and returns it; afterwards value_ holds a null private pointer.
PseudoHandle<ByteArrayData> ByteArray::takeOwnership(Isolate* isolate) {
  void* raw = value_.toPrivate();
  PseudoHandle<ByteArrayData> owned =
      isolate->takeOwnership<ByteArrayData>(raw);
  value_ = JS::PrivateValue(nullptr);
  return owned;
}
|
||||
|
||||
// Traces every JS::Value stored in the handle arena. Asserts that the tracer
// is in the root marking phase first.
void Isolate::trace(JSTracer* trc) {
  js::gc::AssertRootMarkingPhase(trc);

  auto cursor = handleArena_.Iter();
  while (!cursor.Done()) {
    auto& slot = cursor.Get();
    JS::GCPolicy<JS::Value>::trace(trc, &slot, "Isolate handle arena");
    cursor.Next();
  }
}
|
||||
|
||||
// Returns |string| itself if it is already flat. Otherwise linearizes the
// underlying string with ensureLinear and returns a new handle to the linear
// string; failure to linearize crashes through the OOM-unsafe region.
/*static*/ Handle<String> String::Flatten(Isolate* isolate,
                                          Handle<String> string) {
  if (!string->IsFlat()) {
    js::AutoEnterOOMUnsafeRegion oomUnsafe;
    JSLinearString* flat = string->str()->ensureLinear(isolate->cx());
    if (!flat) {
      oomUnsafe.crash("Irregexp String::Flatten");
    }
    return Handle<String>(JS::StringValue(flat), isolate);
  }
  return string;
}
|
||||
|
||||
// This is only used for trace messages printing the source of a
|
||||
// regular expression. To keep things simple, we just return an
|
||||
// empty string and don't print anything.
|
||||
std::unique_ptr<char[]> String::ToCString() {
|
||||
return std::unique_ptr<char[]>();
|
||||
}
|
||||
|
||||
// Returns the value of regexpStack_->memory_top_address_address(),
// reinterpreted as a byte pointer.
byte* Isolate::top_of_regexp_stack() const {
  auto topAddress = regexpStack_->memory_top_address_address();
  return reinterpret_cast<byte*>(topAddress);
}
|
||||
|
||||
// Allocates a ByteArrayData with room for |length| payload bytes after a
// uint32_t-sized header, records the length in it and returns a handle that
// wraps the data as a private value. |length| must be non-negative;
// allocation failure crashes. |alloc| is not used by this implementation.
Handle<ByteArray> Isolate::NewByteArray(int length, AllocationType alloc) {
  MOZ_RELEASE_ASSERT(length >= 0);

  js::AutoEnterOOMUnsafeRegion oomUnsafe;

  size_t totalBytes = sizeof(uint32_t) + length;
  void* raw = allocatePseudoHandle(totalBytes);
  ByteArrayData* data = static_cast<ByteArrayData*>(raw);
  if (!data) {
    oomUnsafe.crash("Irregexp NewByteArray");
  }
  data->length = length;

  return Handle<ByteArray>(JS::PrivateValue(data), this);
}
|
||||
|
||||
// Not implemented yet: after checking that |length| is non-negative this
// always crashes with "TODO".
Handle<FixedArray> Isolate::NewFixedArray(int length) {
  MOZ_RELEASE_ASSERT(length >= 0);
  MOZ_CRASH("TODO");
}
|
||||
|
||||
template <typename CharT>
|
||||
Handle<String> Isolate::InternalizeString(const Vector<const CharT>& str) {
|
||||
js::AutoEnterOOMUnsafeRegion oomUnsafe;
|
||||
JSAtom* atom = js::AtomizeChars(cx(), str.begin(), str.length());
|
||||
if (!atom) {
|
||||
oomUnsafe.crash("Irregexp InternalizeString");
|
||||
}
|
||||
return Handle<String>(JS::StringValue(atom), this);
|
||||
}
|
||||
|
||||
template Handle<String>
|
||||
Isolate::InternalizeString(const Vector<const uint8_t>& str);
|
||||
template Handle<String>
|
||||
Isolate::InternalizeString(const Vector<const char16_t>& str);
|
||||
|
||||
// Definitions of the FLAG_* globals with their default values. Only the two
// optimization flags default to true; everything else starts out disabled.
// TODO: Map flags to jitoptions
bool FLAG_correctness_fuzzer_suppressions = false;
bool FLAG_enable_regexp_unaligned_accesses = false;
bool FLAG_harmony_regexp_sequence = false;
bool FLAG_regexp_interpret_all = false;
bool FLAG_regexp_mode_modifiers = false;
bool FLAG_regexp_optimization = true;
bool FLAG_regexp_peephole_optimization = true;
bool FLAG_regexp_possessive_quantifier = false;
bool FLAG_regexp_tier_up = false;
bool FLAG_trace_regexp_assembler = false;
bool FLAG_trace_regexp_bytecodes = false;
bool FLAG_trace_regexp_parser = false;
bool FLAG_trace_regexp_peephole_optimization = false;
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,97 @@
|
|||
// Copyright 2009 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#include "regexp/regexp-stack.h"
|
||||
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// Binds the scope to the isolate's regexp stack and makes sure that stack is
// initialized (a request for zero bytes is enough to trigger that).
RegExpStackScope::RegExpStackScope(Isolate* isolate)
    : regexp_stack_(isolate->regexp_stack()) {
  // Initialize, if not already initialized.
  regexp_stack_->EnsureCapacity(0);
}
|
||||
|
||||
|
||||
// Resets the regexp stack when the scope ends, undoing any growth.
RegExpStackScope::~RegExpStackScope() {
  // Reset the buffer if it has grown.
  regexp_stack_->Reset();
}
|
||||
|
||||
// Sets up the thread-local state for this stack (ThreadLocal's constructor
// points it at the static stack) and leaves the isolate pointer null.
RegExpStack::RegExpStack()
    : thread_local_(this),
      isolate_(nullptr) {}
|
||||
|
||||
// Frees any owned stack memory and invalidates the bookkeeping.
RegExpStack::~RegExpStack() {
  thread_local_.FreeAndInvalidate();
}
|
||||
|
||||
char* RegExpStack::ArchiveStack(char* to) {
|
||||
if (!thread_local_.owns_memory_) {
|
||||
// Force dynamic stacks prior to archiving. Any growth will do. A dynamic
|
||||
// stack is needed because stack archival & restoration rely on `memory_`
|
||||
// pointing at a fixed-location backing store, whereas the static stack is
|
||||
// tied to a RegExpStack instance.
|
||||
EnsureCapacity(thread_local_.memory_size_ + 1);
|
||||
DCHECK(thread_local_.owns_memory_);
|
||||
}
|
||||
|
||||
size_t size = sizeof(thread_local_);
|
||||
MemCopy(reinterpret_cast<void*>(to), &thread_local_, size);
|
||||
thread_local_ = ThreadLocal(this);
|
||||
return to + size;
|
||||
}
|
||||
|
||||
|
||||
// Overwrites the thread-local state with the bytes stored at |from| and
// returns the position just past them.
char* RegExpStack::RestoreStack(char* from) {
  const size_t archivedBytes = sizeof(thread_local_);
  MemCopy(&thread_local_, reinterpret_cast<void*>(from), archivedBytes);
  return from + archivedBytes;
}
|
||||
|
||||
// Switches the thread-local state back to this instance's static stack.
void RegExpStack::Reset() {
  thread_local_.ResetToStaticStack(this);
}
|
||||
|
||||
// Frees the current buffer if it is owned, then points all bookkeeping at
// |regexp_stack|'s static stack. The limit sits kStackLimitSlack pointer
// sized slots above the start of that buffer.
void RegExpStack::ThreadLocal::ResetToStaticStack(RegExpStack* regexp_stack) {
  if (owns_memory_) {
    DeleteArray(memory_);
  }

  byte* base = regexp_stack->static_stack_;
  memory_ = base;
  memory_top_ = base + kStaticStackSize;
  memory_size_ = kStaticStackSize;
  limit_ = reinterpret_cast<Address>(base) +
           kStackLimitSlack * kSystemPointerSize;
  owns_memory_ = false;
}
|
||||
|
||||
void RegExpStack::ThreadLocal::FreeAndInvalidate() {
|
||||
if (owns_memory_) DeleteArray(memory_);
|
||||
|
||||
// This stack may not be used after being freed. Just reset to invalid values
|
||||
// to ensure we don't accidentally use old memory areas.
|
||||
memory_ = nullptr;
|
||||
memory_top_ = nullptr;
|
||||
memory_size_ = 0;
|
||||
limit_ = kMemoryTop;
|
||||
}
|
||||
|
||||
// Makes sure the stack buffer holds at least |size| bytes (never less than
// kMinimumDynamicStackSize) and returns the address of the top of the stack
// memory. Requests above kMaximumStackSize return kNullAddress. When the
// buffer has to grow, the old contents are copied to the top of the new one.
Address RegExpStack::EnsureCapacity(size_t size) {
  if (size > kMaximumStackSize) {
    return kNullAddress;
  }
  if (size < kMinimumDynamicStackSize) {
    size = kMinimumDynamicStackSize;
  }

  ThreadLocal& state = thread_local_;
  if (state.memory_size_ >= size) {
    // Already large enough; nothing to allocate.
    return reinterpret_cast<Address>(state.memory_top_);
  }

  byte* grown = NewArray<byte>(size);
  if (state.memory_size_ > 0) {
    // Copy original memory into top of new memory.
    MemCopy(grown + size - state.memory_size_, state.memory_,
            state.memory_size_);
    if (state.owns_memory_) {
      DeleteArray(state.memory_);
    }
  }
  state.memory_ = grown;
  state.memory_top_ = grown + size;
  state.memory_size_ = size;
  state.limit_ = reinterpret_cast<Address>(grown) +
                 kStackLimitSlack * kSystemPointerSize;
  state.owns_memory_ = true;
  return reinterpret_cast<Address>(state.memory_top_);
}
|
||||
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
|
@ -0,0 +1,141 @@
|
|||
// Copyright 2009 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_REGEXP_STACK_H_
|
||||
#define V8_REGEXP_REGEXP_STACK_H_
|
||||
|
||||
#include "regexp/regexp-shim.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
class RegExpStack;
|
||||
|
||||
// Maintains a per-v8thread stack area that can be used by irregexp
|
||||
// implementation for its backtracking stack.
|
||||
// Since there is only one stack area, the Irregexp implementation is not
|
||||
// re-entrant. I.e., no regular expressions may be executed in the same thread
|
||||
// during a preempted Irregexp execution.
|
||||
class RegExpStackScope {
|
||||
public:
|
||||
// Create and delete an instance to control the life-time of a growing stack.
|
||||
|
||||
// Initializes the stack memory area if necessary.
|
||||
explicit RegExpStackScope(Isolate* isolate);
|
||||
~RegExpStackScope(); // Releases the stack if it has grown.
|
||||
|
||||
RegExpStack* stack() const { return regexp_stack_; }
|
||||
|
||||
private:
|
||||
RegExpStack* regexp_stack_;
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(RegExpStackScope);
|
||||
};
|
||||
|
||||
|
||||
class RegExpStack {
|
||||
public:
|
||||
RegExpStack();
|
||||
~RegExpStack();
|
||||
|
||||
// Number of allocated locations on the stack below the limit.
|
||||
// No sequence of pushes must be longer that this without doing a stack-limit
|
||||
// check.
|
||||
static constexpr int kStackLimitSlack = 32;
|
||||
|
||||
// Gives the top of the memory used as stack.
|
||||
Address stack_base() {
|
||||
DCHECK_NE(0, thread_local_.memory_size_);
|
||||
DCHECK_EQ(thread_local_.memory_top_,
|
||||
thread_local_.memory_ + thread_local_.memory_size_);
|
||||
return reinterpret_cast<Address>(thread_local_.memory_top_);
|
||||
}
|
||||
|
||||
// The total size of the memory allocated for the stack.
|
||||
size_t stack_capacity() { return thread_local_.memory_size_; }
|
||||
|
||||
// If the stack pointer gets below the limit, we should react and
|
||||
// either grow the stack or report an out-of-stack exception.
|
||||
// There is only a limited number of locations below the stack limit,
|
||||
// so users of the stack should check the stack limit during any
|
||||
// sequence of pushes longer that this.
|
||||
Address* limit_address_address() { return &(thread_local_.limit_); }
|
||||
|
||||
// Ensures that there is a memory area with at least the specified size.
|
||||
// If passing zero, the default/minimum size buffer is allocated.
|
||||
Address EnsureCapacity(size_t size);
|
||||
|
||||
// Thread local archiving.
|
||||
static constexpr int ArchiveSpacePerThread() {
|
||||
return static_cast<int>(sizeof(ThreadLocal));
|
||||
}
|
||||
char* ArchiveStack(char* to);
|
||||
char* RestoreStack(char* from);
|
||||
void FreeThreadResources() { thread_local_.ResetToStaticStack(this); }
|
||||
|
||||
// Maximal size of allocated stack area.
|
||||
static constexpr size_t kMaximumStackSize = 64 * MB;
|
||||
|
||||
private:
|
||||
// Artificial limit used when the thread-local state has been destroyed.
|
||||
static const Address kMemoryTop =
|
||||
static_cast<Address>(static_cast<uintptr_t>(-1));
|
||||
|
||||
// Minimal size of dynamically-allocated stack area.
|
||||
static constexpr size_t kMinimumDynamicStackSize = 1 * KB;
|
||||
|
||||
// In addition to dynamically-allocated, variable-sized stacks, we also have
|
||||
// a statically allocated and sized area that is used whenever no dynamic
|
||||
// stack is allocated. This guarantees that a stack is always available and
|
||||
// we can skip availability-checks later on.
|
||||
// It's double the slack size to ensure that we have a bit of breathing room
|
||||
// before NativeRegExpMacroAssembler::GrowStack must be called.
|
||||
static constexpr size_t kStaticStackSize =
|
||||
2 * kStackLimitSlack * kSystemPointerSize;
|
||||
byte static_stack_[kStaticStackSize] = {0};
|
||||
|
||||
STATIC_ASSERT(kStaticStackSize <= kMaximumStackSize);
|
||||
|
||||
// Structure holding the allocated memory, size and limit.
|
||||
struct ThreadLocal {
|
||||
explicit ThreadLocal(RegExpStack* regexp_stack) {
|
||||
ResetToStaticStack(regexp_stack);
|
||||
}
|
||||
|
||||
// If memory_size_ > 0 then memory_ and memory_top_ must be non-nullptr
|
||||
// and memory_top_ = memory_ + memory_size_
|
||||
byte* memory_ = nullptr;
|
||||
byte* memory_top_ = nullptr;
|
||||
size_t memory_size_ = 0;
|
||||
Address limit_ = kNullAddress;
|
||||
bool owns_memory_ = false; // Whether memory_ is owned and must be freed.
|
||||
|
||||
void ResetToStaticStack(RegExpStack* regexp_stack);
|
||||
void FreeAndInvalidate();
|
||||
};
|
||||
|
||||
// Address of top of memory used as stack.
|
||||
Address memory_top_address_address() {
|
||||
return reinterpret_cast<Address>(&thread_local_.memory_top_);
|
||||
}
|
||||
|
||||
// Resets the buffer if it has grown beyond the default/minimum size.
|
||||
// After this, the buffer is either the default size, or it is empty, so
|
||||
// you have to call EnsureCapacity before using it again.
|
||||
void Reset();
|
||||
|
||||
ThreadLocal thread_local_;
|
||||
Isolate* isolate_;
|
||||
|
||||
friend class ExternalReference;
|
||||
friend class Isolate;
|
||||
friend class RegExpStackScope;
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(RegExpStack);
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_REGEXP_STACK_H_
|
|
@ -0,0 +1,195 @@
|
|||
// Copyright 2012 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_REGEXP_H_
|
||||
#define V8_REGEXP_REGEXP_H_
|
||||
|
||||
#include "regexp/regexp-error.h"
|
||||
#include "regexp/regexp-shim.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
class RegExpNode;
|
||||
class RegExpTree;
|
||||
|
||||
// What the regexp compiler produces: bytecode or native code.
enum class RegExpCompilationTarget : int {
  kBytecode,
  kNative
};
|
||||
|
||||
// TODO(jgruber): Do not expose in regexp.h.
|
||||
// TODO(jgruber): Consider splitting between ParseData and CompileData.
|
||||
struct RegExpCompileData {
|
||||
// The parsed AST as produced by the RegExpParser.
|
||||
RegExpTree* tree = nullptr;
|
||||
|
||||
// The compiled Node graph as produced by RegExpTree::ToNode methods.
|
||||
RegExpNode* node = nullptr;
|
||||
|
||||
// Either the generated code as produced by the compiler or a trampoline
|
||||
// to the interpreter.
|
||||
Object code;
|
||||
|
||||
// True, iff the pattern is a 'simple' atom with zero captures. In other
|
||||
// words, the pattern consists of a string with no metacharacters and special
|
||||
// regexp features, and can be implemented as a standard string search.
|
||||
bool simple = true;
|
||||
|
||||
// True, iff the pattern is anchored at the start of the string with '^'.
|
||||
bool contains_anchor = false;
|
||||
|
||||
// Only use if the pattern contains named captures. If so, this contains a
|
||||
// mapping of capture names to capture indices.
|
||||
Handle<FixedArray> capture_name_map;
|
||||
|
||||
// The error message. Only used if an error occurred during parsing or
|
||||
// compilation.
|
||||
RegExpError error = RegExpError::kNone;
|
||||
|
||||
// The position at which the error was detected. Only used if an
|
||||
// error occurred.
|
||||
int error_pos = 0;
|
||||
|
||||
// The number of capture groups, without the global capture \0.
|
||||
int capture_count = 0;
|
||||
|
||||
// The number of registers used by the generated code.
|
||||
int register_count = 0;
|
||||
|
||||
// The compilation target (bytecode or native code).
|
||||
RegExpCompilationTarget compilation_target;
|
||||
};
|
||||
|
||||
class RegExp final : public AllStatic {
|
||||
public:
|
||||
// Whether the irregexp engine generates interpreter bytecode.
|
||||
static bool CanGenerateBytecode() {
|
||||
return FLAG_regexp_interpret_all || FLAG_regexp_tier_up;
|
||||
}
|
||||
|
||||
// Parses the RegExp pattern and prepares the JSRegExp object with
|
||||
// generic data and choice of implementation - as well as what
|
||||
// the implementation wants to store in the data field.
|
||||
// Returns false if compilation fails.
|
||||
V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile(
|
||||
Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
|
||||
JSRegExp::Flags flags, uint32_t backtrack_limit);
|
||||
|
||||
enum CallOrigin : int {
|
||||
kFromRuntime = 0,
|
||||
kFromJs = 1,
|
||||
};
|
||||
|
||||
// See ECMA-262 section 15.10.6.2.
|
||||
// This function calls the garbage collector if necessary.
|
||||
V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Exec(
|
||||
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
|
||||
int index, Handle<RegExpMatchInfo> last_match_info);
|
||||
|
||||
// Integral return values used throughout regexp code layers.
|
||||
static constexpr int kInternalRegExpFailure = 0;
|
||||
static constexpr int kInternalRegExpSuccess = 1;
|
||||
static constexpr int kInternalRegExpException = -1;
|
||||
static constexpr int kInternalRegExpRetry = -2;
|
||||
|
||||
enum IrregexpResult : int32_t {
|
||||
RE_FAILURE = kInternalRegExpFailure,
|
||||
RE_SUCCESS = kInternalRegExpSuccess,
|
||||
RE_EXCEPTION = kInternalRegExpException,
|
||||
};
|
||||
|
||||
// Prepare a RegExp for being executed one or more times (using
|
||||
// IrregexpExecOnce) on the subject.
|
||||
// This ensures that the regexp is compiled for the subject, and that
|
||||
// the subject is flat.
|
||||
// Returns the number of integer spaces required by IrregexpExecOnce
|
||||
// as its "registers" argument. If the regexp cannot be compiled,
|
||||
// an exception is set as pending, and this function returns negative.
|
||||
static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
|
||||
Handle<String> subject);
|
||||
|
||||
// Set last match info. If match is nullptr, then setting captures is
|
||||
// omitted.
|
||||
static Handle<RegExpMatchInfo> SetLastMatchInfo(
|
||||
Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
|
||||
Handle<String> subject, int capture_count, int32_t* match);
|
||||
|
||||
V8_EXPORT_PRIVATE static bool CompileForTesting(Isolate* isolate, Zone* zone,
|
||||
RegExpCompileData* input,
|
||||
JSRegExp::Flags flags,
|
||||
Handle<String> pattern,
|
||||
Handle<String> sample_subject,
|
||||
bool is_one_byte);
|
||||
|
||||
V8_EXPORT_PRIVATE static void DotPrintForTesting(const char* label,
|
||||
RegExpNode* node);
|
||||
|
||||
static const int kRegExpTooLargeToOptimize = 20 * KB;
|
||||
};
|
||||
|
||||
// Uses a special global mode of irregexp-generated code to perform a global
|
||||
// search and return multiple results at once. As such, this is essentially an
|
||||
// iterator over multiple results (retrieved batch-wise in advance).
|
||||
class RegExpGlobalCache final {
|
||||
public:
|
||||
RegExpGlobalCache(Handle<JSRegExp> regexp, Handle<String> subject,
|
||||
Isolate* isolate);
|
||||
|
||||
~RegExpGlobalCache();
|
||||
|
||||
// Fetch the next entry in the cache for global regexp match results.
|
||||
// This does not set the last match info. Upon failure, nullptr is
|
||||
// returned. The cause can be checked with Result(). The previous result is
|
||||
// still in available in memory when a failure happens.
|
||||
int32_t* FetchNext();
|
||||
|
||||
int32_t* LastSuccessfulMatch();
|
||||
|
||||
bool HasException() { return num_matches_ < 0; }
|
||||
|
||||
private:
|
||||
int AdvanceZeroLength(int last_index);
|
||||
|
||||
int num_matches_;
|
||||
int max_matches_;
|
||||
int current_match_index_;
|
||||
int registers_per_match_;
|
||||
// Pointer to the last set of captures.
|
||||
int32_t* register_array_;
|
||||
int register_array_size_;
|
||||
Handle<JSRegExp> regexp_;
|
||||
Handle<String> subject_;
|
||||
Isolate* isolate_;
|
||||
};
|
||||
|
||||
// Caches results for specific regexp queries on the isolate. At the time of
|
||||
// writing, this is used during global calls to RegExp.prototype.exec and
|
||||
// @@split.
|
||||
class RegExpResultsCache final : public AllStatic {
|
||||
public:
|
||||
enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
|
||||
|
||||
// Attempt to retrieve a cached result. On failure, 0 is returned as a Smi.
|
||||
// On success, the returned result is guaranteed to be a COW-array.
|
||||
static Object Lookup(Heap* heap, String key_string, Object key_pattern,
|
||||
FixedArray* last_match_out, ResultsCacheType type);
|
||||
// Attempt to add value_array to the cache specified by type. On success,
|
||||
// value_array is turned into a COW-array.
|
||||
static void Enter(Isolate* isolate, Handle<String> key_string,
|
||||
Handle<Object> key_pattern, Handle<FixedArray> value_array,
|
||||
Handle<FixedArray> last_match_cache, ResultsCacheType type);
|
||||
static void Clear(FixedArray cache);
|
||||
|
||||
static constexpr int kRegExpResultsCacheSize = 0x100;
|
||||
|
||||
private:
|
||||
static constexpr int kStringOffset = 0;
|
||||
static constexpr int kPatternOffset = 1;
|
||||
static constexpr int kArrayOffset = 2;
|
||||
static constexpr int kLastMatchOffset = 3;
|
||||
static constexpr int kArrayEntriesPerCacheEntry = 4;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_REGEXP_REGEXP_H_
|
|
@ -0,0 +1,88 @@
|
|||
// Copyright 2020 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that
|
||||
// can be found in the LICENSE file.
|
||||
|
||||
// Automatically generated by regexp/gen-regexp-special-case.cc
|
||||
|
||||
// The following functions are used to build UnicodeSets
|
||||
// for special cases where the case-folding algorithm used by
|
||||
// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match
|
||||
// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime
|
||||
// Semantics: Canonicalize) step 3.
|
||||
|
||||
#ifdef V8_INTL_SUPPORT
|
||||
#include "regexp/special-case.h"
|
||||
|
||||
#include "unicode/uniset.h"
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// Builds the frozen set of code points returned by
// RegExpCaseFolding::IgnoreSet().
icu::UnicodeSet BuildIgnoreSet() {
  icu::UnicodeSet ignored;
  ignored.add(0xdf)
      .add(0x17f)
      .add(0x390)
      .add(0x3b0)
      .add(0x3f4)
      .add(0x1e9e)
      .add(0x1f80, 0x1faf)
      .add(0x1fb3)
      .add(0x1fbc)
      .add(0x1fc3)
      .add(0x1fcc)
      .add(0x1fd3)
      .add(0x1fe3)
      .add(0x1ff3)
      .add(0x1ffc)
      .add(0x2126)
      .add(0x212a, 0x212b)
      .add(0xfb05, 0xfb06);
  ignored.freeze();
  return ignored;
}
|
||||
|
||||
struct IgnoreSetData {
|
||||
IgnoreSetData() : set(BuildIgnoreSet()) {}
|
||||
const icu::UnicodeSet set;
|
||||
};
|
||||
|
||||
//static
|
||||
const icu::UnicodeSet& RegExpCaseFolding::IgnoreSet() {
  // Function-local lazy instance; the set is read through its Pointer().
  static base::LazyInstance<IgnoreSetData>::type holder =
      LAZY_INSTANCE_INITIALIZER;
  const IgnoreSetData* data = holder.Pointer();
  return data->set;
}
|
||||
|
||||
// Builds the frozen set of code points returned by
// RegExpCaseFolding::SpecialAddSet().
icu::UnicodeSet BuildSpecialAddSet() {
  icu::UnicodeSet special;
  special.add(0x4b)
      .add(0x53)
      .add(0x6b)
      .add(0x73)
      .add(0xc5)
      .add(0xe5)
      .add(0x398)
      .add(0x3a9)
      .add(0x3b8)
      .add(0x3c9)
      .add(0x3d1);
  special.freeze();
  return special;
}
|
||||
|
||||
struct SpecialAddSetData {
|
||||
SpecialAddSetData() : set(BuildSpecialAddSet()) {}
|
||||
const icu::UnicodeSet set;
|
||||
};
|
||||
|
||||
//static
|
||||
const icu::UnicodeSet& RegExpCaseFolding::SpecialAddSet() {
  // Function-local lazy instance; the set is read through its Pointer().
  static base::LazyInstance<SpecialAddSetData>::type holder =
      LAZY_INSTANCE_INITIALIZER;
  const SpecialAddSetData* data = holder.Pointer();
  return data->set;
}
|
||||
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
#endif // V8_INTL_SUPPORT
|
|
@ -0,0 +1,117 @@
|
|||
// Copyright 2019 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_REGEXP_SPECIAL_CASE_H_
|
||||
#define V8_REGEXP_SPECIAL_CASE_H_
|
||||
|
||||
#ifdef V8_INTL_SUPPORT
|
||||
#include "regexp/regexp-shim.h"
|
||||
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// Sets of Unicode characters that need special handling under "i" mode
|
||||
|
||||
// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262
|
||||
// defines slightly different case-folding rules than Unicode. An
|
||||
// input character should match a pattern character if the result of
|
||||
// the Canonicalize algorithm is the same for both characters.
|
||||
//
|
||||
// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as
|
||||
// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character
|
||||
// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See
|
||||
// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for
|
||||
// the precise definition.
|
||||
//
|
||||
// While compiling such regular expressions, we need to compute the
|
||||
// set of characters that should match a given input character. (See
|
||||
// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.)
|
||||
// For almost all characters, this can be efficiently computed using
|
||||
// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent
|
||||
// the remaining special cases.
|
||||
//
|
||||
// For a character c, the rules are as follows:
|
||||
//
|
||||
// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling
|
||||
// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet
|
||||
// containing c will produce the set of characters that should
|
||||
// match /c/i (or /[c]/i), and only those characters.
|
||||
//
|
||||
// 2. If c is in IgnoreSet, then the only character it should match is
|
||||
// itself. However, closeOver will add additional incorrect
|
||||
// matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ'
|
||||
// (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is
|
||||
// "SS". Step 3.e therefore requires that 'ß' canonicalizes to
|
||||
// itself, and should not match 'ẞ'. In these cases, we can skip
|
||||
// the closeOver entirely, because it will never add an equivalent
|
||||
// character.
|
||||
//
|
||||
// 3. If c is in SpecialAddSet, then it should match at least one
|
||||
// character other than itself. However, closeOver will add at
|
||||
// least one additional incorrect match. For example, consider the
|
||||
// letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase
|
||||
// K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN
|
||||
// SIGN should not match either of the other two characters. As a
|
||||
// result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in
|
||||
// IgnoreSet). To find the correct matches for characters in
|
||||
// SpecialAddSet, we closeOver the original character, but filter
|
||||
// out the results that do not have the same canonical value.
|
||||
//
|
||||
// The contents of these sets are calculated at build time by
|
||||
// src/regexp/gen-regexp-special-case.cc, which generates
|
||||
// gen/src/regexp/special-case.cc. This is done by iterating over the
|
||||
// result of closeOver for each BMP character, and finding sets for
|
||||
// which at least one character has a different canonical value than
|
||||
// another character. Characters that match no other characters in
|
||||
// their equivalence class are added to IgnoreSet. Characters that
|
||||
// match at least one other character are added to SpecialAddSet.
|
||||
|
||||
class RegExpCaseFolding final : public AllStatic {
|
||||
public:
|
||||
static const icu::UnicodeSet& IgnoreSet();
|
||||
static const icu::UnicodeSet& SpecialAddSet();
|
||||
|
||||
// This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
|
||||
// Canonicalize) step 3, which is used to determine whether
|
||||
// characters match when ignoreCase is true and unicode is false.
|
||||
static UChar32 Canonicalize(UChar32 ch) {
|
||||
// a. Assert: ch is a UTF-16 code unit.
|
||||
CHECK_LE(ch, 0xffff);
|
||||
|
||||
// b. Let s be the String value consisting of the single code unit ch.
|
||||
icu::UnicodeString s(ch);
|
||||
|
||||
// c. Let u be the same result produced as if by performing the algorithm
|
||||
// for String.prototype.toUpperCase using s as the this value.
|
||||
// d. Assert: Type(u) is String.
|
||||
icu::UnicodeString& u = s.toUpper();
|
||||
|
||||
// e. If u does not consist of a single code unit, return ch.
|
||||
if (u.length() != 1) {
|
||||
return ch;
|
||||
}
|
||||
|
||||
// f. Let cu be u's single code unit element.
|
||||
UChar32 cu = u.char32At(0);
|
||||
|
||||
// g. If the value of ch >= 128 and the value of cu < 128, return ch.
|
||||
if (ch >= 128 && cu < 128) {
|
||||
return ch;
|
||||
}
|
||||
|
||||
// h. Return cu.
|
||||
return cu;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_INTL_SUPPORT
|
||||
|
||||
#endif // V8_REGEXP_SPECIAL_CASE_H_
|
|
@ -0,0 +1,93 @@
|
|||
// Copyright 2014 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_UTIL_FLAGS_H_
|
||||
#define V8_UTIL_FLAGS_H_
|
||||
|
||||
// Origin: https://github.com/v8/v8/blob/1bafcc6b999b23ea1d394f5d267a08183e3c4e19/src/base/flags.h#L15-L90
|
||||
|
||||
namespace v8 {
|
||||
namespace base {
|
||||
|
||||
// The Flags class provides a type-safe way of storing OR-combinations of enum
|
||||
// values. The Flags<T, S> class is a template class, where T is an enum type,
|
||||
// and S is the underlying storage type (usually int).
|
||||
//
|
||||
// The traditional C++ approach for storing OR-combinations of enum values is to
|
||||
// use an int or unsigned int variable. The inconvenience with this approach is
|
||||
// that there's no type checking at all; any enum value can be OR'd with any
|
||||
// other enum value and passed on to a function that takes an int or unsigned
|
||||
// int.
|
||||
template <typename T, typename S = int>
class Flags final {
 public:
  using flag_type = T;
  using mask_type = S;

  // Empty set, single flag, or raw mask (explicit only).
  constexpr Flags() : mask_(0) {}
  constexpr Flags(flag_type flag) : mask_(static_cast<S>(flag)) {}
  constexpr explicit Flags(mask_type mask) : mask_(static_cast<S>(mask)) {}

  // Comparison against a single flag compares the whole mask.
  constexpr bool operator==(flag_type flag) const {
    return static_cast<S>(flag) == mask_;
  }
  constexpr bool operator!=(flag_type flag) const {
    return static_cast<S>(flag) != mask_;
  }

  // Compound assignment with another flag set.
  Flags& operator&=(const Flags& flags) {
    mask_ &= flags.mask_;
    return *this;
  }
  Flags& operator|=(const Flags& flags) {
    mask_ |= flags.mask_;
    return *this;
  }
  Flags& operator^=(const Flags& flags) {
    mask_ ^= flags.mask_;
    return *this;
  }

  // Binary operators with another flag set.
  constexpr Flags operator&(const Flags& flags) const {
    return Flags(mask_ & flags.mask_);
  }
  constexpr Flags operator|(const Flags& flags) const {
    return Flags(mask_ | flags.mask_);
  }
  constexpr Flags operator^(const Flags& flags) const {
    return Flags(mask_ ^ flags.mask_);
  }

  // Single-flag overloads forward to the flag-set versions above.
  Flags& operator&=(flag_type flag) { return *this &= Flags(flag); }
  Flags& operator|=(flag_type flag) { return *this |= Flags(flag); }
  Flags& operator^=(flag_type flag) { return *this ^= Flags(flag); }

  constexpr Flags operator&(flag_type flag) const {
    return *this & Flags(flag);
  }
  constexpr Flags operator|(flag_type flag) const {
    return *this | Flags(flag);
  }
  constexpr Flags operator^(flag_type flag) const {
    return *this ^ Flags(flag);
  }

  constexpr Flags operator~() const { return Flags(~mask_); }

  // Implicit conversion to the raw mask, and an "is empty" test.
  constexpr operator mask_type() const { return mask_; }
  constexpr bool operator!() const { return !mask_; }

  // Returns a copy of this set with |flag| cleared.
  Flags without(flag_type flag) { return *this & ~Flags(flag); }

  friend size_t hash_value(const Flags& flags) { return flags.mask_; }

 private:
  mask_type mask_;
};
|
||||
|
||||
} // namespace base
|
||||
} // namespace v8
|
||||
|
||||
#endif  // V8_UTIL_FLAGS_H_
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,204 @@
|
|||
// Copyright 2014 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_UTIL_VECTOR_H_
|
||||
#define V8_UTIL_VECTOR_H_
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <iterator>
|
||||
#include <memory>
|
||||
|
||||
#include "js/Utility.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
|
||||
// Adapted from: https://github.com/v8/v8/blob/5f69bbc233c2d1baf149faf869a7901603929914/src/utils/allocation.h#L36-L58
|
||||
|
||||
template <typename T>
|
||||
T* NewArray(size_t size) {
|
||||
static_assert(std::is_pod<T>::value, "");
|
||||
js::AutoEnterOOMUnsafeRegion oomUnsafe;
|
||||
T* result = static_cast<T*>(js_malloc(size * sizeof(T)));
|
||||
if (!result) {
|
||||
oomUnsafe.crash("Irregexp NewArray");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Hands |array| to js_free.
template <typename T>
void DeleteArray(T* array) {
  T* const storage = array;
  js_free(storage);
}
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
|
||||
// A non-resizable vector containing a pointer and a length.
|
||||
// The Vector may or may not own the pointer, depending on context.
|
||||
// Origin:
|
||||
// https://github.com/v8/v8/blob/5f69bbc233c2d1baf149faf869a7901603929914/src/utils/vector.h#L20-L134
|
||||
|
||||
template <typename T>
|
||||
class Vector {
|
||||
public:
|
||||
constexpr Vector() : start_(nullptr), length_(0) {}
|
||||
|
||||
constexpr Vector(T* data, size_t length) : start_(data), length_(length) {
|
||||
MOZ_ASSERT_IF(length != 0, data != nullptr);
|
||||
}
|
||||
|
||||
static Vector<T> New(size_t length) {
|
||||
return Vector<T>(NewArray<T>(length), length);
|
||||
}
|
||||
|
||||
// Returns a vector using the same backing storage as this one,
|
||||
// spanning from and including 'from', to but not including 'to'.
|
||||
Vector<T> SubVector(size_t from, size_t to) const {
|
||||
MOZ_ASSERT(from < to);
|
||||
MOZ_ASSERT(to < length_);
|
||||
return Vector<T>(begin() + from, to - from);
|
||||
}
|
||||
|
||||
// Returns the length of the vector. Only use this if you really need an
|
||||
// integer return value. Use {size()} otherwise.
|
||||
int length() const {
|
||||
MOZ_ASSERT(length_ <= std::numeric_limits<int>::max());
|
||||
return static_cast<int>(length_);
|
||||
}
|
||||
|
||||
// Returns the length of the vector as a size_t.
|
||||
constexpr size_t size() const { return length_; }
|
||||
|
||||
// Returns whether or not the vector is empty.
|
||||
constexpr bool empty() const { return length_ == 0; }
|
||||
|
||||
// Access individual vector elements - checks bounds in debug mode.
|
||||
T& operator[](size_t index) const {
|
||||
MOZ_ASSERT(index < length_);
|
||||
return start_[index];
|
||||
}
|
||||
|
||||
const T& at(size_t index) const { return operator[](index); }
|
||||
|
||||
T& first() { return start_[0]; }
|
||||
|
||||
T& last() {
|
||||
MOZ_ASSERT(length_ > 0);
|
||||
return start_[length_ - 1];
|
||||
}
|
||||
|
||||
// Returns a pointer to the start of the data in the vector.
|
||||
constexpr T* begin() const { return start_; }
|
||||
|
||||
// Returns a pointer past the end of the data in the vector.
|
||||
constexpr T* end() const { return start_ + length_; }
|
||||
|
||||
// Returns a clone of this vector with a new backing store.
|
||||
Vector<T> Clone() const {
|
||||
T* result = NewArray<T>(length_);
|
||||
for (size_t i = 0; i < length_; i++) result[i] = start_[i];
|
||||
return Vector<T>(result, length_);
|
||||
}
|
||||
|
||||
void Truncate(size_t length) {
|
||||
MOZ_ASSERT(length <= length_);
|
||||
length_ = length;
|
||||
}
|
||||
|
||||
// Releases the array underlying this vector. Once disposed the
|
||||
// vector is empty.
|
||||
void Dispose() {
|
||||
DeleteArray(start_);
|
||||
start_ = nullptr;
|
||||
length_ = 0;
|
||||
}
|
||||
|
||||
Vector<T> operator+(size_t offset) {
|
||||
MOZ_ASSERT(offset <= length_);
|
||||
return Vector<T>(start_ + offset, length_ - offset);
|
||||
}
|
||||
|
||||
Vector<T> operator+=(size_t offset) {
|
||||
MOZ_ASSERT(offset <= length_);
|
||||
start_ += offset;
|
||||
length_ -= offset;
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Implicit conversion from Vector<T> to Vector<const T>.
|
||||
inline operator Vector<const T>() const {
|
||||
return Vector<const T>::cast(*this);
|
||||
}
|
||||
|
||||
template <typename S>
|
||||
static constexpr Vector<T> cast(Vector<S> input) {
|
||||
return Vector<T>(reinterpret_cast<T*>(input.begin()),
|
||||
input.length() * sizeof(S) / sizeof(T));
|
||||
}
|
||||
|
||||
bool operator==(const Vector<const T> other) const {
|
||||
if (length_ != other.length_) return false;
|
||||
if (start_ == other.start_) return true;
|
||||
for (size_t i = 0; i < length_; ++i) {
|
||||
if (start_[i] != other.start_[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
T* start_;
|
||||
size_t length_;
|
||||
};
|
||||
|
||||
// The resulting vector does not contain a null-termination byte. If you want
|
||||
// the null byte, use ArrayVector("foo").
|
||||
inline Vector<const char> CStrVector(const char* data) {
  // strlen excludes the terminating NUL, so the view does too.
  const size_t charCount = strlen(data);
  return Vector<const char>(data, charCount);
}
|
||||
|
||||
} // namespace internal
|
||||
|
||||
namespace base {
|
||||
|
||||
// SmallVector uses inline storage first, and reallocates when full.
|
||||
// It is basically equivalent to js::Vector, and is implemented
|
||||
// as a thin wrapper.
|
||||
// V8's implementation: https://github.com/v8/v8/blob/master/src/base/small-vector.h
|
||||
template <typename T, size_t kSize>
|
||||
class SmallVector {
|
||||
public:
|
||||
inline bool empty() const { return inner_.empty(); }
|
||||
inline const T& back() const { return inner_.back(); }
|
||||
inline void pop_back() { inner_.popBack(); };
|
||||
template <typename... Args>
|
||||
inline void emplace_back(Args&&... args) {
|
||||
js::AutoEnterOOMUnsafeRegion oomUnsafe;
|
||||
if (!inner_.emplaceBack(args...)) {
|
||||
oomUnsafe.crash("Irregexp SmallVector emplace_back");
|
||||
}
|
||||
};
|
||||
inline size_t size() const { return inner_.length(); }
|
||||
inline const T& at(size_t index) const { return inner_[index]; }
|
||||
|
||||
void resize_no_init(size_t new_size) {
|
||||
js::AutoEnterOOMUnsafeRegion oomUnsafe;
|
||||
if (!inner_.resizeUninitialized(new_size)) {
|
||||
oomUnsafe.crash("Irregexp SmallVector resize");
|
||||
}
|
||||
}
|
||||
private:
|
||||
js::Vector<T, kSize, js::SystemAllocPolicy> inner_;
|
||||
};
|
||||
|
||||
|
||||
} // namespace base
|
||||
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_UTIL_VECTOR_H_
|
|
@ -0,0 +1,375 @@
|
|||
// Copyright 2019 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
#ifndef V8_UTIL_ZONE_H_
|
||||
#define V8_UTIL_ZONE_H_
|
||||
|
||||
#include <list>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "ds/LifoAlloc.h"
|
||||
#include "ds/Sort.h"
|
||||
#include "regexp/util/vector.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
// V8::Zone ~= LifoAlloc
|
||||
class Zone {
|
||||
public:
|
||||
Zone(size_t defaultChunkSize) : lifoAlloc_(defaultChunkSize) {
|
||||
lifoAlloc_.setAsInfallibleByDefault();
|
||||
}
|
||||
|
||||
void* New(size_t size) {
|
||||
js::LifoAlloc::AutoFallibleScope fallible(&lifoAlloc_);
|
||||
js::AutoEnterOOMUnsafeRegion oomUnsafe;
|
||||
void* result = lifoAlloc_.alloc(size);
|
||||
if (!result) {
|
||||
oomUnsafe.crash("Irregexp Zone::new");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void DeleteAll() { lifoAlloc_.freeAll(); }
|
||||
|
||||
// Returns true if the total memory allocated exceeds a threshold.
|
||||
static const size_t kExcessLimit = 256 * 1024 * 1024;
|
||||
bool excess_allocation() const {
|
||||
return lifoAlloc_.computedSizeOfExcludingThis() > kExcessLimit;
|
||||
}
|
||||
private:
|
||||
js::LifoAlloc lifoAlloc_;
|
||||
};
|
||||
|
||||
// Superclass for classes allocated in a Zone.
|
||||
// Origin: https://github.com/v8/v8/blob/7b3332844212d78ee87a9426f3a6f7f781a8fbfa/src/zone/zone.h#L138-L155
|
||||
class ZoneObject {
|
||||
public:
|
||||
// Allocate a new ZoneObject of 'size' bytes in the Zone.
|
||||
void* operator new(size_t size, Zone* zone) { return zone->New(size); }
|
||||
|
||||
// Ideally, the delete operator should be private instead of
|
||||
// public, but unfortunately the compiler sometimes synthesizes
|
||||
// (unused) destructors for classes derived from ZoneObject, which
|
||||
// require the operator to be visible. MSVC requires the delete
|
||||
// operator to be public.
|
||||
|
||||
// ZoneObjects should never be deleted individually; use
|
||||
// Zone::DeleteAll() to delete all zone objects in one go.
|
||||
void operator delete(void*, size_t) { MOZ_CRASH("unreachable"); }
|
||||
void operator delete(void* pointer, Zone* zone) { MOZ_CRASH("unreachable"); }
|
||||
};
|
||||
|
||||
// ZoneLists are growable lists with constant-time access to the
|
||||
// elements. The list itself and all its elements are allocated in the
|
||||
// Zone. ZoneLists cannot be deleted individually; you can delete all
|
||||
// objects in the Zone by calling Zone::DeleteAll().
|
||||
// Used throughout irregexp.
|
||||
// Origin: https://github.com/v8/v8/blob/5e514a969376dc63517d575b062758efd36cd757/src/zone/zone.h#L173-L318
|
||||
// Inlines: https://github.com/v8/v8/blob/5e514a969376dc63517d575b062758efd36cd757/src/zone/zone-list-inl.h#L17-L155
|
||||
template <typename T>
|
||||
class ZoneList final {
|
||||
public:
|
||||
// Construct a new ZoneList with the given capacity; the length is
|
||||
// always zero. The capacity must be non-negative.
|
||||
ZoneList(int capacity, Zone* zone) { Initialize(capacity, zone); }
|
||||
// Construct a new ZoneList from a std::initializer_list
|
||||
ZoneList(std::initializer_list<T> list, Zone* zone) {
|
||||
Initialize(static_cast<int>(list.size()), zone);
|
||||
for (auto& i : list) Add(i, zone);
|
||||
}
|
||||
// Construct a new ZoneList by copying the elements of the given ZoneList.
|
||||
ZoneList(const ZoneList<T>& other, Zone* zone) {
|
||||
Initialize(other.length(), zone);
|
||||
AddAll(other, zone);
|
||||
}
|
||||
|
||||
void* operator new(size_t size, Zone* zone) { return zone->New(size); }
|
||||
|
||||
// Returns a reference to the element at index i. This reference is not safe
|
||||
// to use after operations that can change the list's backing store
|
||||
// (e.g. Add).
|
||||
inline T& operator[](int i) const {
|
||||
MOZ_ASSERT(0 < i);
|
||||
MOZ_ASSERT(static_cast<unsigned>(i) < static_cast<unsigned>(length_));
|
||||
return data_[i];
|
||||
}
|
||||
inline T& at(int i) const { return operator[](i); }
|
||||
inline T& last() const { return at(length_ - 1); }
|
||||
inline T& first() const { return at(0); }
|
||||
|
||||
using iterator = T*;
|
||||
inline iterator begin() const { return &data_[0]; }
|
||||
inline iterator end() const { return &data_[length_]; }
|
||||
|
||||
inline bool is_empty() const { return length_ == 0; }
|
||||
inline int length() const { return length_; }
|
||||
inline int capacity() const { return capacity_; }
|
||||
|
||||
Vector<T> ToVector() const { return Vector<T>(data_, length_); }
|
||||
Vector<T> ToVector(int start, int length) const {
|
||||
return Vector<T>(data_ + start, std::min(length_ - start, length));
|
||||
}
|
||||
|
||||
Vector<const T> ToConstVector() const {
|
||||
return Vector<const T>(data_, length_);
|
||||
}
|
||||
|
||||
inline void Initialize(int capacity, Zone* zone) {
|
||||
MOZ_ASSERT(capacity >= 0);
|
||||
data_ = (capacity > 0) ? NewData(capacity, zone) : nullptr;
|
||||
capacity_ = capacity;
|
||||
length_ = 0;
|
||||
}
|
||||
|
||||
// Adds a copy of the given 'element' to the end of the list,
|
||||
// expanding the list if necessary.
|
||||
void Add(const T& element, Zone* zone) {
|
||||
if (length_ < capacity_) {
|
||||
data_[length_++] = element;
|
||||
} else {
|
||||
ZoneList<T>::ResizeAdd(element, zone);
|
||||
}
|
||||
}
|
||||
// Add all the elements from the argument list to this list.
|
||||
void AddAll(const ZoneList<T>& other, Zone* zone) {
|
||||
AddAll(other.ToVector(), zone);
|
||||
}
|
||||
// Add all the elements from the vector to this list.
|
||||
void AddAll(const Vector<T>& other, Zone* zone) {
|
||||
int result_length = length_ + other.length();
|
||||
if (capacity_ < result_length) {
|
||||
Resize(result_length, zone);
|
||||
}
|
||||
if (std::is_fundamental<T>()) {
|
||||
memcpy(data_ + length_, other.begin(), sizeof(*data_) * other.length());
|
||||
} else {
|
||||
for (int i = 0; i < other.length(); i++) {
|
||||
data_[length_ + i] = other.at(i);
|
||||
}
|
||||
}
|
||||
length_ = result_length;
|
||||
}
|
||||
|
||||
// Overwrites the element at the specific index.
|
||||
void Set(int index, const T& element) {
|
||||
MOZ_ASSERT(index >= 0 && index <= length_);
|
||||
data_[index] = element;
|
||||
}
|
||||
|
||||
// Removes the i'th element without deleting it even if T is a
|
||||
// pointer type; moves all elements above i "down". Returns the
|
||||
// removed element. This function's complexity is linear in the
|
||||
// size of the list.
|
||||
T Remove(int i) {
|
||||
T element = at(i);
|
||||
length_--;
|
||||
while (i < length_) {
|
||||
data_[i] = data_[i + 1];
|
||||
i++;
|
||||
}
|
||||
return element;
|
||||
}
|
||||
|
||||
// Removes the last element without deleting it even if T is a
|
||||
// pointer type. Returns the removed element.
|
||||
inline T RemoveLast() { return Remove(length_ - 1); }
|
||||
|
||||
// Clears the list by freeing the storage memory. If you want to keep the
|
||||
// memory, use Rewind(0) instead. Be aware, that even if T is a
|
||||
// pointer type, clearing the list doesn't delete the entries.
|
||||
inline void Clear() {
|
||||
data_ = nullptr;
|
||||
capacity_ = 0;
|
||||
length_ = 0;
|
||||
}
|
||||
|
||||
// Drops all but the first 'pos' elements from the list.
|
||||
inline void Rewind(int pos) {
|
||||
MOZ_ASSERT(0 <= pos && pos <= length_);
|
||||
length_ = pos;
|
||||
}
|
||||
|
||||
inline bool Contains(const T& elm) const {
|
||||
for (int i = 0; i < length_; i++) {
|
||||
if (data_[i] == elm) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename CompareFunction>
|
||||
void StableSort(CompareFunction cmp, size_t start, size_t length) {
|
||||
js::AutoEnterOOMUnsafeRegion oomUnsafe;
|
||||
T* scratch = static_cast<T*>(js_malloc(length * sizeof(T)));
|
||||
if (!scratch) {
|
||||
oomUnsafe.crash("Irregexp stable sort scratch space");
|
||||
}
|
||||
auto comparator = [cmp](const T& a, const T& b, bool* lessOrEqual) {
|
||||
*lessOrEqual = cmp(&a, &b) <= 0;
|
||||
return true;
|
||||
};
|
||||
MOZ_ALWAYS_TRUE(js::MergeSort(begin() + start, length, scratch,
|
||||
comparator));
|
||||
js_free(scratch);
|
||||
}
|
||||
|
||||
void operator delete(void* pointer) { MOZ_CRASH("unreachable"); }
|
||||
void operator delete(void* pointer, Zone* zone) { MOZ_CRASH("unreachable"); }
|
||||
|
||||
private:
|
||||
T* data_;
|
||||
int capacity_;
|
||||
int length_;
|
||||
|
||||
inline T* NewData(int n, Zone* zone) {
|
||||
return static_cast<T*>(zone->New(n * sizeof(T)));
|
||||
}
|
||||
|
||||
// Increase the capacity of a full list, and add an element.
|
||||
// List must be full already.
|
||||
void ResizeAdd(const T& element, Zone* zone) {
|
||||
MOZ_ASSERT(length_ >= capacity_);
|
||||
// Grow the list capacity by 100%, but make sure to let it grow
|
||||
// even when the capacity is zero (possible initial case).
|
||||
int new_capacity = 1 + 2 * capacity_;
|
||||
// Since the element reference could be an element of the list, copy
|
||||
// it out of the old backing storage before resizing.
|
||||
T temp = element;
|
||||
Resize(new_capacity, zone);
|
||||
data_[length_++] = temp;
|
||||
}
|
||||
|
||||
// Resize the list.
|
||||
void Resize(int new_capacity, Zone* zone) {
|
||||
MOZ_ASSERT(length_ <= new_capacity);
|
||||
T* new_data = NewData(new_capacity, zone);
|
||||
if (length_ > 0) {
|
||||
memcpy(new_data, data_, length_ * sizeof(T));
|
||||
}
|
||||
data_ = new_data;
|
||||
capacity_ = new_capacity;
|
||||
}
|
||||
|
||||
ZoneList& operator=(const ZoneList&) = delete;
|
||||
ZoneList() = delete;
|
||||
ZoneList(const ZoneList&) = delete;
|
||||
};
|
||||
|
||||
// Origin: https://github.com/v8/v8/blob/5e514a969376dc63517d575b062758efd36cd757/src/zone/zone-allocator.h#L14-L77
|
||||
template <typename T>
|
||||
class ZoneAllocator {
|
||||
public:
|
||||
using pointer = T*;
|
||||
using const_pointer = const T*;
|
||||
using reference = T&;
|
||||
using const_reference = const T&;
|
||||
using value_type = T;
|
||||
using size_type = size_t;
|
||||
using difference_type = ptrdiff_t;
|
||||
template <class O>
|
||||
struct rebind {
|
||||
using other = ZoneAllocator<O>;
|
||||
};
|
||||
|
||||
explicit ZoneAllocator(Zone* zone) : zone_(zone) {}
|
||||
template <typename U>
|
||||
ZoneAllocator(const ZoneAllocator<U>& other)
|
||||
: ZoneAllocator<T>(other.zone_) {}
|
||||
template <typename U>
|
||||
friend class ZoneAllocator;
|
||||
|
||||
T* allocate(size_t n) { return static_cast<T*>(zone_->New(n * sizeof(T))); }
|
||||
void deallocate(T* p, size_t) {} // noop for zones
|
||||
|
||||
bool operator==(ZoneAllocator const& other) const {
|
||||
return zone_ == other.zone_;
|
||||
}
|
||||
bool operator!=(ZoneAllocator const& other) const {
|
||||
return zone_ != other.zone_;
|
||||
}
|
||||
|
||||
private:
|
||||
Zone* zone_;
|
||||
};
|
||||
|
||||
// Zone wrappers for std containers:
|
||||
// Origin: https://github.com/v8/v8/blob/5e514a969376dc63517d575b062758efd36cd757/src/zone/zone-containers.h#L25-L169
|
||||
|
||||
// A wrapper subclass for std::vector to make it easy to construct one
|
||||
// that uses a zone allocator.
|
||||
// Used throughout irregexp
|
||||
template <typename T>
|
||||
class ZoneVector : public std::vector<T, ZoneAllocator<T>> {
|
||||
public:
|
||||
ZoneVector(Zone* zone)
|
||||
: std::vector<T, ZoneAllocator<T>>(ZoneAllocator<T>(zone)) {}
|
||||
|
||||
// Constructs a new vector and fills it with the contents of the range
|
||||
// [first, last).
|
||||
template <class Iter>
|
||||
ZoneVector(Iter first, Iter last, Zone* zone)
|
||||
: std::vector<T, ZoneAllocator<T>>(first, last, ZoneAllocator<T>(zone)) {}
|
||||
};
|
||||
|
||||
// A wrapper subclass for std::list to make it easy to construct one
|
||||
// that uses a zone allocator.
|
||||
// Used in regexp-bytecode-peephole.cc
|
||||
template <typename T>
|
||||
class ZoneLinkedList : public std::list<T, ZoneAllocator<T>> {
|
||||
public:
|
||||
// Constructs an empty list.
|
||||
explicit ZoneLinkedList(Zone* zone)
|
||||
: std::list<T, ZoneAllocator<T>>(ZoneAllocator<T>(zone)) {}
|
||||
};
|
||||
|
||||
// A wrapper subclass for std::set to make it easy to construct one that uses
|
||||
// a zone allocator.
|
||||
// Used in regexp-parser.cc
|
||||
template <typename K, typename Compare = std::less<K>>
|
||||
class ZoneSet : public std::set<K, Compare, ZoneAllocator<K>> {
|
||||
public:
|
||||
// Constructs an empty set.
|
||||
explicit ZoneSet(Zone* zone)
|
||||
: std::set<K, Compare, ZoneAllocator<K>>(Compare(),
|
||||
ZoneAllocator<K>(zone)) {}
|
||||
};
|
||||
|
||||
// A wrapper subclass for std::map to make it easy to construct one that uses
|
||||
// a zone allocator.
|
||||
// Used in regexp-bytecode-peephole.cc
|
||||
template <typename K, typename V, typename Compare = std::less<K>>
|
||||
class ZoneMap
|
||||
: public std::map<K, V, Compare, ZoneAllocator<std::pair<const K, V>>> {
|
||||
public:
|
||||
// Constructs an empty map.
|
||||
explicit ZoneMap(Zone* zone)
|
||||
: std::map<K, V, Compare, ZoneAllocator<std::pair<const K, V>>>(
|
||||
Compare(), ZoneAllocator<std::pair<const K, V>>(zone)) {}
|
||||
};
|
||||
|
||||
// A wrapper subclass for std::unordered_map to make it easy to construct one
|
||||
// that uses a zone allocator.
|
||||
// Used in regexp-bytecode-peephole.cc
|
||||
template <typename K, typename V, typename Hash = std::hash<K>,
|
||||
typename KeyEqual = std::equal_to<K>>
|
||||
class ZoneUnorderedMap
|
||||
: public std::unordered_map<K, V, Hash, KeyEqual,
|
||||
ZoneAllocator<std::pair<const K, V>>> {
|
||||
public:
|
||||
// Constructs an empty map.
|
||||
explicit ZoneUnorderedMap(Zone* zone, size_t bucket_count = 100)
|
||||
: std::unordered_map<K, V, Hash, KeyEqual,
|
||||
ZoneAllocator<std::pair<const K, V>>>(
|
||||
bucket_count, Hash(), KeyEqual(),
|
||||
ZoneAllocator<std::pair<const K, V>>(zone)) {}
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#endif // V8_UTIL_FLAG_H_
|
Loading…
Reference in New Issue