Import V8's regexp parser code and add build config.

master
Fedor 2020-11-26 23:40:46 +02:00
parent 8e4756b26b
commit 03415291a3
48 changed files with 22470 additions and 0 deletions

View File

@ -86,6 +86,7 @@ included_inclnames_to_ignore = set([
'unicode/udat.h', # ICU
'unicode/udatpg.h', # ICU
'unicode/uenum.h', # ICU
'unicode/uniset.h', # ICU
'unicode/unorm.h', # ICU
'unicode/unum.h', # ICU
'unicode/unumsys.h', # ICU

View File

@ -246,3 +246,16 @@ with only_when('--enable-compile-environment'):
set_config('LIBFUZZER', enable_libfuzzer)
set_define('LIBFUZZER', enable_libfuzzer)
# Initial support for new regexp engine
# ==================================================
js_option('--enable-new-regexp', default=False, help='Enable new regexp engine')
@depends('--enable-new-regexp')
def enable_new_regexp(value):
    # Report True only when the option was switched on; in every other case
    # the function yields None.
    return True if value else None
set_config('JS_NEW_REGEXP', enable_new_regexp)
set_define('JS_NEW_REGEXP', enable_new_regexp)

View File

@ -122,6 +122,9 @@ if CONFIG['JS_HAS_CTYPES']:
if CONFIG['JS_BUNDLED_EDITLINE']:
DIRS += ['editline']
if CONFIG['JS_NEW_REGEXP']:
DIRS += ['regexp']
if not CONFIG['JS_DISABLE_SHELL']:
DIRS += ['shell']

View File

@ -0,0 +1,51 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
* vim: set ts=8 sts=2 et sw=2 tw=80:
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// This file forward-defines Irregexp classes that need to be visible
// to the rest of Spidermonkey and re-exports them into js::irregexp.
#ifndef regexp_RegExpTypes_h
#define regexp_RegExpTypes_h

#include <stddef.h>  // size_t

namespace js {
class MatchPairs;
}

namespace v8 {
namespace internal {

// Bundles the arguments of one match attempt: the bounds of the input, the
// position at which matching starts, and the MatchPairs object handed in by
// the caller.
struct InputOutputData {
  // Bounds of the input. The character type is erased here; the constructor
  // accepts pointers to any character type.
  const void* inputStart;
  const void* inputEnd;

  // Index into inputStart (in chars) at which to begin matching.
  size_t startIndex;

  js::MatchPairs* matches;

  template <typename CharT>
  InputOutputData(const CharT* inputStart, const CharT* inputEnd,
                  size_t startIndex, js::MatchPairs* matches)
      : inputStart(inputStart),
        inputEnd(inputEnd),
        startIndex(startIndex),
        matches(matches) {}
};

}  // namespace internal
}  // namespace v8

namespace js {
namespace irregexp {

// Re-export under the SpiderMonkey-facing namespace.
using InputOutputData = v8::internal::InputOutputData;

}  // namespace irregexp
}  // namespace js

#endif  // regexp_RegExpTypes_h

2
js/src/regexp/VERSION Normal file
View File

@ -0,0 +1,2 @@
Imported using import-irregexp.py from:
https://github.com/v8/v8/tree/560f2d8bb3f3a72d78e1a7d7654235d53fdcc83c/src/regexp

View File

@ -0,0 +1,165 @@
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include "regexp/special-case.h"
namespace v8 {
namespace internal {
// Inclusive bounds of the code point range that PrintSpecial() skips as
// surrogates.
static const uc32 kSurrogateStart = 0xd800;
static const uc32 kSurrogateEnd = 0xdfff;
// Exclusive upper bound of the code points PrintSpecial() iterates over
// (i.e. the first code point outside the BMP).
static const uc32 kNonBmpStart = 0x10000;
// The following code generates "src/regexp/special-case.cc".
// Writes C++ source to |out| that rebuilds |set| at runtime. Three pieces are
// emitted for a given |name|: a Build<name>() function that adds every range
// of |set| and freezes the result, a <name>Data struct holding the built set,
// and the RegExpCaseFolding::<name>() accessor backed by a lazy instance.
// Range bounds are streamed as plain integers after a literal "0x", so the
// caller is responsible for having put |out| into hexadecimal mode.
void PrintSet(std::ofstream& out, const char* name,
              const icu::UnicodeSet& set) {
  // Builder function: one add() call per range.
  out << "icu::UnicodeSet Build" << name << "() {\n";
  out << " icu::UnicodeSet set;\n";
  const int32_t range_count = set.getRangeCount();
  for (int32_t range = 0; range < range_count; ++range) {
    const UChar32 first = set.getRangeStart(range);
    const UChar32 last = set.getRangeEnd(range);
    out << " set.add(0x" << first;
    if (first != last) {
      // Multi-element range: emit the two-argument form of add().
      out << ", 0x" << last;
    }
    out << ");\n";
  }
  out << " set.freeze();\n";
  out << " return set;\n";
  out << "}\n\n";

  // Wrapper struct whose constructor runs the builder.
  out << "struct " << name << "Data {\n";
  out << " " << name << "Data() : set(Build" << name << "()) {}\n";
  out << " const icu::UnicodeSet set;\n";
  out << "};\n\n";

  // Public accessor on RegExpCaseFolding.
  out << "//static\n";
  out << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n";
  out << " static base::LazyInstance<" << name << "Data>::type set =\n";
  out << " LAZY_INSTANCE_INITIALIZER;\n";
  out << " return set.Pointer()->set;\n";
  out << "}\n\n";
}
// Computes the two sets of BMP code points for which
// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) disagrees with
// RegExpCaseFolding::Canonicalize, and writes them to |out| via PrintSet()
// as "IgnoreSet" and "SpecialAddSet" (in that order).
void PrintSpecial(std::ofstream& out) {
// Scratch set, reused to hold the closeOver equivalence class of one
// character at a time.
icu::UnicodeSet current;
icu::UnicodeSet special_add;
icu::UnicodeSet ignore;
UErrorCode status = U_ZERO_ERROR;
// NOTE(review): |upper| is not referenced again in this function; only the
// status of its construction is checked.
icu::UnicodeSet upper("[\\p{Lu}]", status);
CHECK(U_SUCCESS(status));
// Iterate through all chars in BMP except surrogates.
for (UChar32 i = 0; i < kNonBmpStart; i++) {
if (i >= kSurrogateStart && i <= kSurrogateEnd) {
continue; // Ignore surrogate range
}
// Replace the scratch set with {i}, then expand it to i's full
// case-insensitive equivalence class.
current.set(i, i);
current.closeOver(USET_CASE_INSENSITIVE);
// Check to see if all characters in the case-folding equivalence
// class as defined by UnicodeSet::closeOver all map to the same
// canonical value.
UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
bool class_has_matching_canonical_char = false;
bool class_has_non_matching_canonical_char = false;
for (int32_t j = 0; j < current.getRangeCount(); j++) {
for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
c++) {
// Only the *other* members of the class are of interest.
if (c == i) {
continue;
}
UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
if (canonical == other_canonical) {
class_has_matching_canonical_char = true;
} else {
class_has_non_matching_canonical_char = true;
}
}
}
// If every other character in i's equivalence class shares i's
// canonical value, closeOver already gives the right answer and i is
// recorded in neither set. Otherwise at least one other character has
// a different canonical value: if no other character shares a
// canonical value with i, we can ignore i when adding alternatives
// for case-independent comparison (IgnoreSet); if at least one other
// character does share it, i needs special handling (SpecialAddSet).
if (class_has_non_matching_canonical_char) {
if (class_has_matching_canonical_char) {
special_add.add(i);
} else {
ignore.add(i);
}
}
}
// Verify that no Unicode equivalence class contains two non-trivial
// JS equivalence classes. Every character in SpecialAddSet has the
// same canonical value as every other non-IgnoreSet character in
// its Unicode equivalence class. Therefore, if we call closeOver on
// a set containing no IgnoreSet characters, the only characters
// that must be removed from the result are in IgnoreSet. This fact
// is used in CharacterRange::AddCaseEquivalents.
for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
for (UChar32 c = special_add.getRangeStart(i);
c <= special_add.getRangeEnd(i); c++) {
UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
// Rebuild c's equivalence class and drop the ignorable members; every
// survivor must canonicalize to the same value as c.
current.set(c, c);
current.closeOver(USET_CASE_INSENSITIVE);
current.removeAll(ignore);
for (int32_t j = 0; j < current.getRangeCount(); j++) {
for (UChar32 c2 = current.getRangeStart(j);
c2 <= current.getRangeEnd(j); c2++) {
CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
}
}
}
}
PrintSet(out, "IgnoreSet", ignore);
PrintSet(out, "SpecialAddSet", special_add);
}
void WriteHeader(const char* header_filename) {
std::ofstream out(header_filename);
out << std::hex << std::setfill('0') << std::setw(4);
out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
<< "// Use of this source code is governed by a BSD-style license that\n"
<< "// can be found in the LICENSE file.\n\n"
<< "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
<< "// The following functions are used to build UnicodeSets\n"
<< "// for special cases where the case-folding algorithm used by\n"
<< "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
<< "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
<< "// Semantics: Canonicalize) step 3.\n\n"
<< "#ifdef V8_INTL_SUPPORT\n"
<< "#include \"src/base/lazy-instance.h\"\n\n"
<< "#include \"src/regexp/special-case.h\"\n\n"
<< "#include \"unicode/uniset.h\"\n"
<< "namespace v8 {\n"
<< "namespace internal {\n\n";
PrintSpecial(out);
out << "\n"
<< "} // namespace internal\n"
<< "} // namespace v8\n"
<< "#endif // V8_INTL_SUPPORT\n";
}
} // namespace internal
} // namespace v8
// Entry point of the generator. Expects exactly one argument, the path of the
// file to generate; prints a usage line and returns 1 otherwise.
int main(int argc, const char** argv) {
  if (argc != 2) {
    std::cerr << "Usage: " << argv[0] << " <output filename>\n";
    // Returning from main is equivalent to exiting here and does not rely on
    // <cstdlib>, which this file never includes.
    return 1;
  }
  v8::internal::WriteHeader(argv[1]);
  return 0;
}

View File

@ -0,0 +1,143 @@
#!/usr/bin/env python3
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
# This script handles all the mechanical steps of importing irregexp from v8:
#
# 1. Acquire the source: either from github, or optionally from a local copy of v8.
# 2. Copy the contents of v8/src/regexp into js/src/regexp
# - Exclude files that we have chosen not to import.
# 3. While doing so, update #includes:
# - Change "src/regexp/*" to "regexp/*".
# - Remove other v8-specific headers completely.
# 4. Add '#include "regexp/regexp-shim.h"' in the necessary places.
# 5. Update the VERSION file to include the correct git hash.
#
# Usage:
# cd path/to/js/src/regexp
# ./import-irregexp.py --path path/to/v8/src/regexp
#
# Alternatively, without the --path argument, import-irregexp.py will
# clone v8 from github into a temporary directory.
#
# After running this script, changes to the shim code may be necessary
# to account for changes in upstream irregexp.
import os
import re
import subprocess
import sys
from pathlib import Path
def get_hash(path):
    """Return the hash of the git revision checked out at `path`.

    `git rev-parse HEAD` is run with `path` as its working directory, so the
    working directory of this process is never changed -- not even when git
    fails or `path` does not exist.

    Raises subprocess.CalledProcessError if git exits with a non-zero status.
    """
    command = ['git', 'rev-parse', 'HEAD']
    result = subprocess.check_output(command, cwd=str(path), encoding='utf-8')
    return result.rstrip()
def copy_and_update_includes(src_path, dst_path):
    """Copy `src_path` to `dst_path`, rewriting its #include lines.

    - '#include "src/regexp...' becomes '#include "regexp...'.
    - Every other '#include "src/...' line is dropped.
    - If the file name of `src_path` is listed in `need_shim`, an
      '#include "regexp/regexp-shim.h"' line is inserted once, after the
      first dropped include, at the position described below.

    `src_path` must provide a `.name` attribute (e.g. a pathlib.Path). Both
    files are closed before returning, including when an error occurs.
    """
    # List of header files that need to include the shim header
    need_shim = ['property-sequences.h',
                 'regexp-ast.h',
                 'regexp-bytecode-peephole.h',
                 'regexp-bytecodes.h',
                 'regexp-dotprinter.h',
                 'regexp.h',
                 'regexp-macro-assembler.h',
                 'regexp-stack.h',
                 'special-case.h']

    # 1. Rewrite includes of V8 regexp headers:
    regexp_include = re.compile('#include "src/regexp')
    regexp_include_new = '#include "regexp'

    # 2. Remove includes of other V8 headers
    other_include = re.compile('#include "src/')

    # 3. If needed, add '#include "regexp/regexp-shim.h"'.
    #    Note: We get a little fancy to ensure that header files are
    #    in alphabetic order. `need_to_add_shim` is true if we still
    #    have to add the shim header in this file. `adding_shim_now`
    #    is true if we have found a '#include "src/*' and we are just
    #    waiting to find something alphabetically smaller (or an empty
    #    line) so that we can insert the shim header in the right place.
    need_to_add_shim = src_path.name in need_shim
    adding_shim_now = False

    with open(str(src_path), 'r') as src, open(str(dst_path), 'w') as dst:
        for line in src:
            if adding_shim_now:
                if (line == '\n' or
                        line > '#include "src/regexp/regexp-shim.h"'):
                    dst.write('#include "regexp/regexp-shim.h"\n')
                    need_to_add_shim = False
                    adding_shim_now = False

            if regexp_include.search(line):
                dst.write(regexp_include.sub(regexp_include_new, line))
            elif other_include.search(line):
                # The include itself is dropped; it only marks the spot from
                # which we start looking for where the shim header belongs.
                if need_to_add_shim:
                    adding_shim_now = True
            else:
                dst.write(line)
def import_from(srcdir, dstdir):
    """Import every regular file of `srcdir` into `dstdir`.

    Files named in `excluded` and subdirectories are skipped; every other
    file goes through copy_and_update_includes(). Afterwards
    `dstdir`/VERSION is rewritten to record the git revision of `srcdir`.

    `srcdir` and `dstdir` are expected to be pathlib.Path objects.
    """
    excluded = ['OWNERS',
                'regexp.cc',
                'regexp-utils.cc',
                'regexp-utils.h',
                'regexp-macro-assembler-arch.h']

    for file in srcdir.iterdir():
        if file.is_dir():
            continue
        if str(file.name) in excluded:
            continue
        copy_and_update_includes(file, dstdir / file.name)

    # Update VERSION file
    revision = get_hash(srcdir)
    with open(str(dstdir / 'VERSION'), 'w') as version_file:
        version_file.write('Imported using import-irregexp.py from:\n')
        version_file.write('https://github.com/v8/v8/tree/%s/src/regexp\n'
                           % revision)
if __name__ == '__main__':
    import argparse
    import tempfile

    # This script should be run from js/src/regexp to work correctly.
    current_path = Path(os.getcwd())
    expected_path = 'js/src/regexp'
    if not current_path.match(expected_path):
        raise RuntimeError('%s must be run from %s' % (sys.argv[0],
                                                       expected_path))

    parser = argparse.ArgumentParser(description='Import irregexp from v8')
    parser.add_argument('-p', '--path', help='path to v8/src/regexp')
    args = parser.parse_args()

    if args.path:
        # Import from a local copy of v8.
        src_path = Path(args.path)

        if not (src_path / 'regexp.h').exists():
            print('Usage:\n import-irregexp.py --path <path/to/v8/src/regexp>')
            sys.exit(1)
        import_from(src_path, current_path)
        sys.exit(0)

    # No local copy given: import from a fresh shallow clone.
    with tempfile.TemporaryDirectory() as tempdir:
        v8_git = 'https://github.com/v8/v8.git'
        # An argument list avoids going through the shell, and check_call
        # raises CalledProcessError if the clone fails, so we never try to
        # import from a missing or partial checkout.
        subprocess.check_call(['git', 'clone', '--depth', '1', v8_git,
                               tempdir])
        src_path = Path(tempdir) / 'src/regexp'
        import_from(src_path, current_path)

37
js/src/regexp/moz.build Normal file
View File

@ -0,0 +1,37 @@
# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Pull in the two .mozbuild fragments from the parent directory.
include('../js-config.mozbuild')
include('../js-cxxflags.mozbuild')
FINAL_LIBRARY = "js"
# Includes should be relative to parent path
LOCAL_INCLUDES += ["!..", ".."]
# Sources that are compiled regardless of configuration.
SOURCES += [
'regexp-ast.cc',
'regexp-bytecode-generator.cc',
'regexp-bytecode-peephole.cc',
'regexp-bytecodes.cc',
'regexp-compiler-tonode.cc',
'regexp-compiler.cc',
'regexp-dotprinter.cc',
'regexp-interpreter.cc',
'regexp-macro-assembler-tracer.cc',
'regexp-macro-assembler.cc',
'regexp-native-macro-assembler.cc',
'regexp-parser.cc',
'regexp-shim.cc',
'regexp-stack.cc',
'util/unicode.cc'
]
# When ENABLE_INTL_API is set, add -DV8_INTL_SUPPORT to CXXFLAGS and compile
# two further sources.
if CONFIG['ENABLE_INTL_API']:
CXXFLAGS += ['-DV8_INTL_SUPPORT']
SOURCES += [
'property-sequences.cc',
'special-case.cc'
]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,27 @@
// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_PROPERTY_SEQUENCES_H_
#define V8_REGEXP_PROPERTY_SEQUENCES_H_
#ifdef V8_INTL_SUPPORT
#include "regexp/regexp-shim.h"
namespace v8 {
namespace internal {
// Static-only holder (derives from AllStatic) that declares three uc32
// arrays. Only the declarations live here; the array contents are defined
// elsewhere.
// NOTE(review): judging by the names, these hold the emoji flag, tag and ZWJ
// sequences; the encoding of the arrays is not visible from this header.
class UnicodePropertySequences : public AllStatic {
public:
static const uc32 kEmojiFlagSequences[];
static const uc32 kEmojiTagSequences[];
static const uc32 kEmojiZWJSequences[];
};
} // namespace internal
} // namespace v8
#endif // V8_INTL_SUPPORT
#endif // V8_REGEXP_PROPERTY_SEQUENCES_H_

342
js/src/regexp/regexp-ast.cc Normal file
View File

@ -0,0 +1,342 @@
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "regexp/regexp-ast.h"
namespace v8 {
namespace internal {
// Defines RegExp<Name>::Accept for every tree type in
// FOR_EACH_REG_EXP_TREE_TYPE. Each definition forwards to the visitor's
// matching Visit<Name> method and returns its result.
#define MAKE_ACCEPT(Name) \
void* RegExp##Name::Accept(RegExpVisitor* visitor, void* data) { \
return visitor->Visit##Name(this, data); \
}
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ACCEPT)
#undef MAKE_ACCEPT
// Base-class defaults: for every tree type, RegExpTree::As<Name>() returns
// nullptr and RegExpTree::Is<Name>() returns false.
#define MAKE_TYPE_CASE(Name) \
RegExp##Name* RegExpTree::As##Name() { return nullptr; } \
bool RegExpTree::Is##Name() { return false; }
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
#undef MAKE_TYPE_CASE
// Per-class definitions: RegExp<Name>::As<Name>() returns this and
// RegExp<Name>::Is<Name>() returns true, for the class's own type only.
#define MAKE_TYPE_CASE(Name) \
RegExp##Name* RegExp##Name::As##Name() { return this; } \
bool RegExp##Name::Is##Name() { return true; }
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
#undef MAKE_TYPE_CASE
// Returns the union of the capture-register intervals of every tree in
// |children|. The result is Interval::Empty() when the list has no entries.
static Interval ListCaptureRegisters(ZoneList<RegExpTree*>* children) {
  Interval combined = Interval::Empty();
  int index = 0;
  while (index < children->length()) {
    combined = combined.Union(children->at(index)->CaptureRegisters());
    ++index;
  }
  return combined;
}
// An alternative uses the registers of all of its nodes.
Interval RegExpAlternative::CaptureRegisters() {
  ZoneList<RegExpTree*>* children = nodes();
  return ListCaptureRegisters(children);
}

// A disjunction uses the registers of all of its alternatives.
Interval RegExpDisjunction::CaptureRegisters() {
  ZoneList<RegExpTree*>* branches = alternatives();
  return ListCaptureRegisters(branches);
}

// A lookaround contributes exactly the registers of its body.
Interval RegExpLookaround::CaptureRegisters() {
  Interval inner = body()->CaptureRegisters();
  return inner;
}

// A capture contributes its own start/end registers in addition to whatever
// its body uses.
Interval RegExpCapture::CaptureRegisters() {
  const auto first_register = StartRegister(index());
  const auto last_register = EndRegister(index());
  Interval own_registers(first_register, last_register);
  return own_registers.Union(body()->CaptureRegisters());
}

// A quantifier contributes exactly the registers of its body.
Interval RegExpQuantifier::CaptureRegisters() {
  Interval inner = body()->CaptureRegisters();
  return inner;
}
// Only a START_OF_INPUT assertion anchors at the start.
bool RegExpAssertion::IsAnchoredAtStart() {
  const bool at_input_start =
      (assertion_type() == RegExpAssertion::START_OF_INPUT);
  return at_input_start;
}

// Only an END_OF_INPUT assertion anchors at the end.
bool RegExpAssertion::IsAnchoredAtEnd() {
  const bool at_input_end =
      (assertion_type() == RegExpAssertion::END_OF_INPUT);
  return at_input_end;
}
bool RegExpAlternative::IsAnchoredAtStart() {
ZoneList<RegExpTree*>* nodes = this->nodes();
for (int i = 0; i < nodes->length(); i++) {
RegExpTree* node = nodes->at(i);
if (node->IsAnchoredAtStart()) {
return true;
}
if (node->max_match() > 0) {
return false;
}
}
return false;
}
bool RegExpAlternative::IsAnchoredAtEnd() {
ZoneList<RegExpTree*>* nodes = this->nodes();
for (int i = nodes->length() - 1; i >= 0; i--) {
RegExpTree* node = nodes->at(i);
if (node->IsAnchoredAtEnd()) {
return true;
}
if (node->max_match() > 0) {
return false;
}
}
return false;
}
bool RegExpDisjunction::IsAnchoredAtStart() {
ZoneList<RegExpTree*>* alternatives = this->alternatives();
for (int i = 0; i < alternatives->length(); i++) {
if (!alternatives->at(i)->IsAnchoredAtStart()) return false;
}
return true;
}
bool RegExpDisjunction::IsAnchoredAtEnd() {
ZoneList<RegExpTree*>* alternatives = this->alternatives();
for (int i = 0; i < alternatives->length(); i++) {
if (!alternatives->at(i)->IsAnchoredAtEnd()) return false;
}
return true;
}
bool RegExpLookaround::IsAnchoredAtStart() {
return is_positive() && type() == LOOKAHEAD && body()->IsAnchoredAtStart();
}
bool RegExpCapture::IsAnchoredAtStart() { return body()->IsAnchoredAtStart(); }
bool RegExpCapture::IsAnchoredAtEnd() { return body()->IsAnchoredAtEnd(); }
// Convert regular expression trees to a simple sexp representation.
// This representation should be different from the input grammar
// in as many cases as possible, to make it more difficult for incorrect
// parses to look like correct ones, which is likely if the input and
// output formats are alike.
// Visitor that writes a textual form of a regexp tree to an output stream.
// One Visit<Name> override is declared per tree type; each of the
// definitions in this file returns nullptr.
class RegExpUnparser final : public RegExpVisitor {
public:
// |os| receives the output; |zone| is passed on when a character class is
// asked for its ranges.
RegExpUnparser(std::ostream& os, Zone* zone) : os_(os), zone_(zone) {}
// Helper (not part of the RegExpVisitor interface) that prints one range.
void VisitCharacterRange(CharacterRange that);
#define MAKE_CASE(Name) void* Visit##Name(RegExp##Name*, void* data) override;
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
#undef MAKE_CASE
private:
std::ostream& os_;
Zone* zone_;
};
// Prints a disjunction as "(|" followed by each alternative, every one
// preceded by a single space, and a closing ")".
void* RegExpUnparser::VisitDisjunction(RegExpDisjunction* that, void* data) {
  os_ << "(|";
  ZoneList<RegExpTree*>* branches = that->alternatives();
  for (int pos = 0; pos < branches->length(); ++pos) {
    os_ << " ";
    branches->at(pos)->Accept(this, data);
  }
  os_ << ")";
  return nullptr;
}
// Prints an alternative as "(:" followed by each node, every one preceded by
// a single space, and a closing ")".
void* RegExpUnparser::VisitAlternative(RegExpAlternative* that, void* data) {
  os_ << "(:";
  ZoneList<RegExpTree*>* children = that->nodes();
  for (int pos = 0; pos < children->length(); ++pos) {
    os_ << " ";
    children->at(pos)->Accept(this, data);
  }
  os_ << ")";
  return nullptr;
}
// Prints a range as its lower bound, followed by "-" and the upper bound
// unless the range is a singleton.
void RegExpUnparser::VisitCharacterRange(CharacterRange that) {
  os_ << AsUC32(that.from());
  if (that.IsSingleton()) {
    return;
  }
  os_ << "-" << AsUC32(that.to());
}
// Prints a character class as "[" ranges "]", with the ranges separated by
// single spaces and the whole thing prefixed by "^" when negated.
void* RegExpUnparser::VisitCharacterClass(RegExpCharacterClass* that,
                                          void* data) {
  if (that->is_negated()) {
    os_ << "^";
  }
  os_ << "[";
  // The range list is requested anew on every access, as it always has been.
  for (int index = 0; index < that->ranges(zone_)->length(); ++index) {
    if (index != 0) {
      os_ << " ";
    }
    VisitCharacterRange(that->ranges(zone_)->at(index));
  }
  os_ << "]";
  return nullptr;
}
// Prints the fixed tag that corresponds to the assertion's type. Nothing is
// printed for a value that is not one of the enumerators handled below.
void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) {
  const char* tag = nullptr;
  switch (that->assertion_type()) {
    case RegExpAssertion::START_OF_INPUT:
      tag = "@^i";
      break;
    case RegExpAssertion::END_OF_INPUT:
      tag = "@$i";
      break;
    case RegExpAssertion::START_OF_LINE:
      tag = "@^l";
      break;
    case RegExpAssertion::END_OF_LINE:
      tag = "@$l";
      break;
    case RegExpAssertion::BOUNDARY:
      tag = "@b";
      break;
    case RegExpAssertion::NON_BOUNDARY:
      tag = "@B";
      break;
  }
  if (tag != nullptr) {
    os_ << tag;
  }
  return nullptr;
}
// Prints the atom's code units, each through AsUC16, between single quotes.
void* RegExpUnparser::VisitAtom(RegExpAtom* that, void* data) {
  os_ << "'";
  Vector<const uc16> code_units = that->data();
  int pos = 0;
  while (pos < code_units.length()) {
    os_ << AsUC16(code_units[pos]);
    ++pos;
  }
  os_ << "'";
  return nullptr;
}
// A text with exactly one element prints as that element alone. Any other
// text prints as "(!" followed by each element, every one preceded by a
// single space, and a closing ")".
void* RegExpUnparser::VisitText(RegExpText* that, void* data) {
  if (that->elements()->length() != 1) {
    os_ << "(!";
    for (int pos = 0; pos < that->elements()->length(); ++pos) {
      os_ << " ";
      that->elements()->at(pos).tree()->Accept(this, data);
    }
    os_ << ")";
    return nullptr;
  }
  that->elements()->at(0).tree()->Accept(this, data);
  return nullptr;
}
// Prints "(# <min> <max> <mode> <body>)". An unbounded maximum is written as
// "-"; the mode is "g" for greedy, "p" for possessive and "n" otherwise.
void* RegExpUnparser::VisitQuantifier(RegExpQuantifier* that, void* data) {
  os_ << "(# " << that->min() << " ";
  if (that->max() != RegExpTree::kInfinity) {
    os_ << that->max() << " ";
  } else {
    os_ << "- ";
  }
  const char* mode;
  if (that->is_greedy()) {
    mode = "g ";
  } else if (that->is_possessive()) {
    mode = "p ";
  } else {
    mode = "n ";
  }
  os_ << mode;
  that->body()->Accept(this, data);
  os_ << ")";
  return nullptr;
}
// Prints a capture as "(^ <body>)".
void* RegExpUnparser::VisitCapture(RegExpCapture* that, void* data) {
  os_ << "(^ ";
  auto* inner = that->body();
  inner->Accept(this, data);
  os_ << ")";
  return nullptr;
}

// Prints a non-capturing group as "(?: <body>)".
void* RegExpUnparser::VisitGroup(RegExpGroup* that, void* data) {
  os_ << "(?: ";
  auto* inner = that->body();
  inner->Accept(this, data);
  os_ << ")";
  return nullptr;
}
// Prints "(<dir> <sign> <body>)": the direction is "->" for a lookahead and
// "<-" otherwise, the sign is "+" when positive and "-" when negative.
void* RegExpUnparser::VisitLookaround(RegExpLookaround* that, void* data) {
  os_ << "(";
  if (that->type() == RegExpLookaround::LOOKAHEAD) {
    os_ << "->";
  } else {
    os_ << "<-";
  }
  if (that->is_positive()) {
    os_ << " + ";
  } else {
    os_ << " - ";
  }
  that->body()->Accept(this, data);
  os_ << ")";
  return nullptr;
}
// Prints a back reference as "(<- <index>)".
void* RegExpUnparser::VisitBackReference(RegExpBackReference* that,
                                         void* data) {
  os_ << "(<- ";
  os_ << that->index();
  os_ << ")";
  return nullptr;
}

// Prints the empty tree as a single '%'.
void* RegExpUnparser::VisitEmpty(RegExpEmpty* that, void* data) {
  const char marker = '%';
  os_ << marker;
  return nullptr;
}
std::ostream& RegExpTree::Print(std::ostream& os, Zone* zone) { // NOLINT
RegExpUnparser unparser(os, zone);
Accept(&unparser, nullptr);
return os;
}
// Stores |alternatives| and derives the match bounds of the disjunction: the
// smallest min_match() and the largest max_match() over all alternatives.
// More than one alternative is expected (DCHECK).
RegExpDisjunction::RegExpDisjunction(ZoneList<RegExpTree*>* alternatives)
    : alternatives_(alternatives) {
  DCHECK_LT(1, alternatives->length());
  // Seed the bounds from the first alternative ...
  RegExpTree* head = alternatives->at(0);
  min_match_ = head->min_match();
  max_match_ = head->max_match();
  // ... then widen them with each of the remaining ones.
  int pos = 1;
  while (pos < alternatives->length()) {
    RegExpTree* branch = alternatives->at(pos);
    min_match_ = Min(min_match_, branch->min_match());
    max_match_ = Max(max_match_, branch->max_match());
    ++pos;
  }
}
// Adds |increase| to |previous|, saturating at RegExpTree::kInfinity when the
// remaining headroom is smaller than |increase|.
static int IncreaseBy(int previous, int increase) {
  const int headroom = RegExpTree::kInfinity - previous;
  return (headroom < increase) ? RegExpTree::kInfinity : previous + increase;
}
// Stores |nodes| and derives the match bounds of the alternative by summing
// the bounds of every node through IncreaseBy. More than one node is expected
// (DCHECK).
RegExpAlternative::RegExpAlternative(ZoneList<RegExpTree*>* nodes)
    : nodes_(nodes) {
  DCHECK_LT(1, nodes->length());
  min_match_ = 0;
  max_match_ = 0;
  int pos = 0;
  while (pos < nodes->length()) {
    RegExpTree* child = nodes->at(pos);
    const int child_min = child->min_match();
    min_match_ = IncreaseBy(min_match_, child_min);
    const int child_max = child->max_match();
    max_match_ = IncreaseBy(max_match_, child_max);
    ++pos;
  }
}
} // namespace internal
} // namespace v8

615
js/src/regexp/regexp-ast.h Normal file
View File

@ -0,0 +1,615 @@
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_AST_H_
#define V8_REGEXP_REGEXP_AST_H_
#include "regexp/regexp-shim.h"
namespace v8 {
namespace internal {
// X-macro listing every concrete regexp tree type. VISIT is invoked once per
// entry with the type's name without its "RegExp" prefix.
#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
VISIT(Disjunction) \
VISIT(Alternative) \
VISIT(Assertion) \
VISIT(CharacterClass) \
VISIT(Atom) \
VISIT(Quantifier) \
VISIT(Capture) \
VISIT(Group) \
VISIT(Lookaround) \
VISIT(BackReference) \
VISIT(Empty) \
VISIT(Text)
// Forward-declare class RegExp<Name> for every entry of the list above.
#define FORWARD_DECLARE(Name) class RegExp##Name;
FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
#undef FORWARD_DECLARE
class RegExpCompiler;
class RegExpNode;
class RegExpTree;
// Abstract visitor over regexp trees: declares one pure virtual
// Visit<Name>(RegExp<Name>*, void* data) per tree type in
// FOR_EACH_REG_EXP_TREE_TYPE.
class RegExpVisitor {
public:
virtual ~RegExpVisitor() = default;
#define MAKE_CASE(Name) \
virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
#undef MAKE_CASE
};
// A simple closed interval [from, to]. The empty interval is the one whose
// lower bound is kNone. All queries are const so that they can be used on
// const intervals and through const references.
class Interval {
 public:
  Interval() : from_(kNone), to_(kNone - 1) {}  // '- 1' for branchless size().
  Interval(int from, int to) : from_(from), to_(to) {}

  // Returns the smallest interval covering both this interval and |that|. An
  // empty operand is ignored, i.e. the other operand is returned unchanged.
  Interval Union(Interval that) const {
    if (that.from_ == kNone) {
      return *this;
    }
    if (from_ == kNone) {
      return that;
    }
    return Interval(Min(from_, that.from_), Max(to_, that.to_));
  }

  // True if |value| lies within the (inclusive) bounds.
  bool Contains(int value) const {
    return (from_ <= value) && (value <= to_);
  }
  bool is_empty() const { return from_ == kNone; }
  int from() const { return from_; }
  int to() const { return to_; }
  // Number of values covered; evaluates to 0 for a default-constructed
  // interval because of the '- 1' in the default constructor.
  int size() const { return to_ - from_ + 1; }

  static Interval Empty() { return Interval(); }

  static constexpr int kNone = -1;

 private:
  int from_;
  int to_;
};
// Represents code units in the range from from_ to to_, both ends are
// inclusive.
class CharacterRange {
 public:
  CharacterRange() : from_(0), to_(0) {}
  // For compatibility with the CHECK_OK macro. Both bounds are zeroed so that
  // a range built this way never carries indeterminate values.
  CharacterRange(void* null) : from_(0), to_(0) {  // NOLINT
    DCHECK_NULL(null);
  }

  V8_EXPORT_PRIVATE static void AddClassEscape(char type,
                                               ZoneList<CharacterRange>* ranges,
                                               Zone* zone);
  // Add class escapes. Add case equivalent closure for \w and \W if necessary.
  V8_EXPORT_PRIVATE static void AddClassEscape(
      char type, ZoneList<CharacterRange>* ranges,
      bool add_unicode_case_equivalents, Zone* zone);
  static Vector<const int> GetWordBounds();

  // Range containing exactly |value|.
  static inline CharacterRange Singleton(uc32 value) {
    return CharacterRange(value, value);
  }
  // Range from |from| to |to|, both inclusive. The bounds must be ordered and
  // must not exceed String::kMaxCodePoint (DCHECKed).
  static inline CharacterRange Range(uc32 from, uc32 to) {
    DCHECK(0 <= from && to <= String::kMaxCodePoint);
    DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to));
    return CharacterRange(from, to);
  }
  // Range from 0 to String::kMaxCodePoint.
  static inline CharacterRange Everything() {
    return CharacterRange(0, String::kMaxCodePoint);
  }
  // Allocates a new list in |zone| that holds just |range|.
  static inline ZoneList<CharacterRange>* List(Zone* zone,
                                               CharacterRange range) {
    ZoneList<CharacterRange>* list =
        new (zone) ZoneList<CharacterRange>(1, zone);
    list->Add(range, zone);
    return list;
  }

  // The predicates below only read the bounds and are therefore const.
  bool Contains(uc32 i) const { return from_ <= i && i <= to_; }
  uc32 from() const { return from_; }
  void set_from(uc32 value) { from_ = value; }
  uc32 to() const { return to_; }
  void set_to(uc32 value) { to_ = value; }
  bool is_valid() const { return from_ <= to_; }
  bool IsEverything(uc32 max) const { return from_ == 0 && to_ >= max; }
  bool IsSingleton() const { return (from_ == to_); }

  V8_EXPORT_PRIVATE static void AddCaseEquivalents(
      Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges,
      bool is_one_byte);
  // Whether a range list is in canonical form: Ranges ordered by from value,
  // and ranges non-overlapping and non-adjacent.
  V8_EXPORT_PRIVATE static bool IsCanonical(ZoneList<CharacterRange>* ranges);
  // Convert range list to canonical form. The characters covered by the ranges
  // will still be the same, but no character is in more than one range, and
  // adjacent ranges are merged. The resulting list may be shorter than the
  // original, but cannot be longer.
  static void Canonicalize(ZoneList<CharacterRange>* ranges);
  // Negate the contents of a character range in canonical form.
  static void Negate(ZoneList<CharacterRange>* src,
                     ZoneList<CharacterRange>* dst, Zone* zone);

  static const int kStartMarker = (1 << 24);
  static const int kPayloadMask = (1 << 24) - 1;

 private:
  CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {}

  uc32 from_;
  uc32 to_;
};
// A set of characters, described either by an explicit range list or by the
// code of a standard set. Exactly one of the two is supplied at construction:
// the type constructor leaves the range list null, the range constructor
// leaves the standard set type at 0.
class CharacterSet final {
 public:
  explicit CharacterSet(uc16 standard_set_type)
      : ranges_(nullptr), standard_set_type_(standard_set_type) {}
  explicit CharacterSet(ZoneList<CharacterRange>* ranges)
      : ranges_(ranges), standard_set_type_(0) {}

  ZoneList<CharacterRange>* ranges(Zone* zone);
  uc16 standard_set_type() const { return standard_set_type_; }
  void set_standard_set_type(uc16 special_set_type) {
    standard_set_type_ = special_set_type;
  }
  // Read-only query, hence const.
  bool is_standard() const { return standard_set_type_ != 0; }
  V8_EXPORT_PRIVATE void Canonicalize();

 private:
  ZoneList<CharacterRange>* ranges_;
  // If non-zero, the value represents a standard set (e.g., all whitespace
  // characters) without having to expand the ranges.
  uc16 standard_set_type_;
};
// One element of a text: a tagged pointer to either a RegExpAtom or a
// RegExpCharacterClass, together with a cp_offset that starts out as -1.
// Instances are created through the Atom() and CharClass() factories; the
// constructor is private.
class TextElement final {
public:
enum TextType { ATOM, CHAR_CLASS };
static TextElement Atom(RegExpAtom* atom);
static TextElement CharClass(RegExpCharacterClass* char_class);
int cp_offset() const { return cp_offset_; }
void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
int length() const;
TextType text_type() const { return text_type_; }
RegExpTree* tree() const { return tree_; }
// Typed accessors. Each one DCHECKs that the element has the matching
// text_type() before casting the stored tree pointer.
RegExpAtom* atom() const {
DCHECK(text_type() == ATOM);
return reinterpret_cast<RegExpAtom*>(tree());
}
RegExpCharacterClass* char_class() const {
DCHECK(text_type() == CHAR_CLASS);
return reinterpret_cast<RegExpCharacterClass*>(tree());
}
private:
TextElement(TextType text_type, RegExpTree* tree)
: cp_offset_(-1), text_type_(text_type), tree_(tree) {}
int cp_offset_;
TextType text_type_;
RegExpTree* tree_;
};
// Abstract base class of all regexp tree nodes. Subclasses must implement
// Accept, ToNode, min_match and max_match; the remaining virtuals have
// defaults.
class RegExpTree : public ZoneObject {
public:
// Value used for an unbounded match length or repetition count.
static const int kInfinity = kMaxInt;
virtual ~RegExpTree() = default;
virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
virtual RegExpNode* ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) = 0;
// The three predicates below default to false.
virtual bool IsTextElement() { return false; }
virtual bool IsAnchoredAtStart() { return false; }
virtual bool IsAnchoredAtEnd() { return false; }
virtual int min_match() = 0;
virtual int max_match() = 0;
// Returns the interval of registers used for captures within this
// expression.
virtual Interval CaptureRegisters() { return Interval::Empty(); }
virtual void AppendToText(RegExpText* text, Zone* zone);
V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os,
Zone* zone); // NOLINT
// Declares a virtual As<Name>() / Is<Name>() pair for every tree type.
#define MAKE_ASTYPE(Name) \
virtual RegExp##Name* As##Name(); \
virtual bool Is##Name();
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
#undef MAKE_ASTYPE
};
// Tree node holding a list of alternatives, together with cached min/max
// match bounds that are returned by min_match() and max_match().
class RegExpDisjunction final : public RegExpTree {
public:
explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpDisjunction* AsDisjunction() override;
Interval CaptureRegisters() override;
bool IsDisjunction() override;
bool IsAnchoredAtStart() override;
bool IsAnchoredAtEnd() override;
int min_match() override { return min_match_; }
int max_match() override { return max_match_; }
ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
private:
// Private helpers that take the compiler; they are only declared here.
bool SortConsecutiveAtoms(RegExpCompiler* compiler);
void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
ZoneList<RegExpTree*>* alternatives_;
int min_match_;
int max_match_;
};
// Tree node holding a list of nodes, together with cached min/max match
// bounds that are returned by min_match() and max_match().
class RegExpAlternative final : public RegExpTree {
public:
explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpAlternative* AsAlternative() override;
Interval CaptureRegisters() override;
bool IsAlternative() override;
bool IsAnchoredAtStart() override;
bool IsAnchoredAtEnd() override;
int min_match() override { return min_match_; }
int max_match() override { return max_match_; }
ZoneList<RegExpTree*>* nodes() { return nodes_; }
private:
ZoneList<RegExpTree*>* nodes_;
int min_match_;
int max_match_;
};
// Tree node for an assertion. Both min_match() and max_match() return 0. The
// assertion type and the flags are fixed at construction.
class RegExpAssertion final : public RegExpTree {
public:
// The kinds of assertion. LAST_TYPE aliases the highest-valued enumerator.
enum AssertionType {
START_OF_LINE = 0,
START_OF_INPUT = 1,
END_OF_LINE = 2,
END_OF_INPUT = 3,
BOUNDARY = 4,
NON_BOUNDARY = 5,
LAST_TYPE = NON_BOUNDARY,
};
RegExpAssertion(AssertionType type, JSRegExp::Flags flags)
: assertion_type_(type), flags_(flags) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpAssertion* AsAssertion() override;
bool IsAssertion() override;
bool IsAnchoredAtStart() override;
bool IsAnchoredAtEnd() override;
int min_match() override { return 0; }
int max_match() override { return 0; }
AssertionType assertion_type() const { return assertion_type_; }
JSRegExp::Flags flags() const { return flags_; }
private:
const AssertionType assertion_type_;
const JSRegExp::Flags flags_;
};
// AST node for a character class: either an explicit list of character ranges
// or one of the standard sets identified by a single uc16 type code.
class RegExpCharacterClass final : public RegExpTree {
 public:
  // NEGATED: The character class is negated and should match everything but
  //     the specified ranges.
  // CONTAINS_SPLIT_SURROGATE: The character class contains part of a split
  //     surrogate and should not be unicode-desugared (crbug.com/641091).
  enum Flag {
    NEGATED = 1 << 0,
    CONTAINS_SPLIT_SURROGATE = 1 << 1,
  };
  using CharacterClassFlags = base::Flags<Flag>;

  // Builds a class from an explicit range list. `ranges` may be modified: an
  // empty list is replaced by the Everything() range with NEGATED toggled.
  RegExpCharacterClass(
      Zone* zone, ZoneList<CharacterRange>* ranges, JSRegExp::Flags flags,
      CharacterClassFlags character_class_flags = CharacterClassFlags())
      : set_(ranges),
        flags_(flags),
        character_class_flags_(character_class_flags) {
    // Nothing to normalise unless the caller handed us an empty range list.
    if (!ranges->is_empty()) return;

    // Represent the empty set as the negation of "everything".
    ranges->Add(CharacterRange::Everything(), zone);
    character_class_flags_ ^= NEGATED;
  }

  // Builds a class from a standard set type code (see standard_type()).
  // No character class flags are set.
  RegExpCharacterClass(uc16 type, JSRegExp::Flags flags)
      : set_(type),
        flags_(flags),
        character_class_flags_(CharacterClassFlags()) {}

  void* Accept(RegExpVisitor* visitor, void* data) override;
  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
  RegExpCharacterClass* AsCharacterClass() override;
  bool IsCharacterClass() override;
  bool IsTextElement() override { return true; }

  int min_match() override { return 1; }
  // The character class may match two code units for unicode regexps.
  // TODO(yangguo): we should split this class for usage in TextElement, and
  //                make max_match() dependent on the character class content.
  int max_match() override { return 2; }

  void AppendToText(RegExpText* text, Zone* zone) override;

  CharacterSet character_set() { return set_; }

  // TODO(lrn): Remove need for complex version if is_standard that
  // recognizes a mangled standard set and just do { return set_.is_special(); }
  bool is_standard(Zone* zone);

  // Returns a value representing the standard character set if is_standard()
  // returns true.
  // Currently used values are:
  // s : unicode whitespace
  // S : unicode non-whitespace
  // w : ASCII word character (digit, letter, underscore)
  // W : non-ASCII word character
  // d : ASCII digit
  // D : non-ASCII digit
  // . : non-newline
  // * : All characters, for advancing unanchored regexp
  uc16 standard_type() const { return set_.standard_set_type(); }

  ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }

  bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; }

  JSRegExp::Flags flags() const { return flags_; }

  bool contains_split_surrogate() const {
    return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
  }

 private:
  CharacterSet set_;
  const JSRegExp::Flags flags_;
  CharacterClassFlags character_class_flags_;
};
// AST node for a literal sequence of uc16 code units. The match length is
// fixed: min_match() and max_match() both return data_.length().
class RegExpAtom final : public RegExpTree {
public:
// Stores the given literal data and regexp flags.
explicit RegExpAtom(Vector<const uc16> data, JSRegExp::Flags flags)
: data_(data), flags_(flags) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpAtom* AsAtom() override;
bool IsAtom() override;
bool IsTextElement() override { return true; }
int min_match() override { return data_.length(); }
int max_match() override { return data_.length(); }
void AppendToText(RegExpText* text, Zone* zone) override;
Vector<const uc16> data() { return data_; }
int length() { return data_.length(); }
JSRegExp::Flags flags() const { return flags_; }
// True when the kIgnoreCase bit is set in the stored flags.
bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
private:
Vector<const uc16> data_;
const JSRegExp::Flags flags_;
};
// AST node holding a list of TextElements. length_ is the running sum of the
// lengths of all elements added so far and is reported as both the minimum
// and maximum match length.
class RegExpText final : public RegExpTree {
public:
// Starts with an empty element list allocated in |zone|.
explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpText* AsText() override;
bool IsText() override;
bool IsTextElement() override { return true; }
int min_match() override { return length_; }
int max_match() override { return length_; }
void AppendToText(RegExpText* text, Zone* zone) override;
// Appends |elm| to the list and adds its length to the cached total.
void AddElement(TextElement elm, Zone* zone) {
elements_.Add(elm, zone);
length_ += elm.length();
}
ZoneList<TextElement>* elements() { return &elements_; }
private:
ZoneList<TextElement> elements_;
int length_;
};
// AST node for a quantified sub-expression: |body| repeated between |min| and
// |max| times, with a greedy, non-greedy or possessive strategy.
class RegExpQuantifier final : public RegExpTree {
 public:
  enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };

  // Caches the overall minimum and maximum match lengths. Each is the
  // repetition count times the body's length, saturated at kInfinity when the
  // multiplication would exceed it.
  RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body)
      : body_(body),
        min_(min),
        max_(max),
        quantifier_type_(type) {
    // Dividing kInfinity by the count tests for overflow without multiplying;
    // the count > 0 guard also avoids a division by zero.
    const int body_min = body->min_match();
    const bool min_overflows = min > 0 && body_min > kInfinity / min;
    min_match_ = min_overflows ? kInfinity : min * body_min;

    const int body_max = body->max_match();
    const bool max_overflows = max > 0 && body_max > kInfinity / max;
    max_match_ = max_overflows ? kInfinity : max * body_max;
  }

  void* Accept(RegExpVisitor* visitor, void* data) override;
  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
  static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body,
                            RegExpCompiler* compiler, RegExpNode* on_success,
                            bool not_at_start = false);
  RegExpQuantifier* AsQuantifier() override;
  Interval CaptureRegisters() override;
  bool IsQuantifier() override;

  int min_match() override { return min_match_; }
  int max_match() override { return max_match_; }

  // Repetition bounds as given to the constructor.
  int min() { return min_; }
  int max() { return max_; }

  bool is_possessive() { return quantifier_type_ == POSSESSIVE; }
  bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
  bool is_greedy() { return quantifier_type_ == GREEDY; }

  RegExpTree* body() { return body_; }

 private:
  RegExpTree* body_;
  int min_;
  int max_;
  int min_match_;
  int max_match_;
  QuantifierType quantifier_type_;
};
// AST node for a capture group. The node is created with only its index; the
// body and the optional name are attached afterwards via set_body() and
// set_name().
class RegExpCapture final : public RegExpTree {
public:
// Creates a capture with no body, no name and match lengths of 0.
explicit RegExpCapture(int index)
: body_(nullptr),
index_(index),
min_match_(0),
max_match_(0),
name_(nullptr) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
static RegExpNode* ToNode(RegExpTree* body, int index,
RegExpCompiler* compiler, RegExpNode* on_success);
RegExpCapture* AsCapture() override;
bool IsAnchoredAtStart() override;
bool IsAnchoredAtEnd() override;
Interval CaptureRegisters() override;
bool IsCapture() override;
int min_match() override { return min_match_; }
int max_match() override { return max_match_; }
RegExpTree* body() { return body_; }
// Attaches the body and caches its match lengths. |body| is dereferenced,
// so it must not be null.
void set_body(RegExpTree* body) {
body_ = body;
min_match_ = body->min_match();
max_match_ = body->max_match();
}
int index() const { return index_; }
// Name of the capture; nullptr unless set_name() was called with a name.
const ZoneVector<uc16>* name() const { return name_; }
void set_name(const ZoneVector<uc16>* name) { name_ = name; }
// Each capture index maps to a pair of registers: 2*index for the start and
// 2*index + 1 for the end.
static int StartRegister(int index) { return index * 2; }
static int EndRegister(int index) { return index * 2 + 1; }
private:
RegExpTree* body_;
int index_;
int min_match_;
int max_match_;
const ZoneVector<uc16>* name_;
};
// AST node for a group that wraps a body without adding behaviour of its own:
// node generation, anchoring queries and capture registers are all forwarded
// to the body.
class RegExpGroup final : public RegExpTree {
public:
// Caches the body's match lengths at construction time; |body| is
// dereferenced here, so it must not be null.
explicit RegExpGroup(RegExpTree* body)
: body_(body),
min_match_(body->min_match()),
max_match_(body->max_match()) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) override {
return body_->ToNode(compiler, on_success);
}
RegExpGroup* AsGroup() override;
bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); }
bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); }
bool IsGroup() override;
int min_match() override { return min_match_; }
int max_match() override { return max_match_; }
Interval CaptureRegisters() override { return body_->CaptureRegisters(); }
RegExpTree* body() { return body_; }
private:
RegExpTree* body_;
int min_match_;
int max_match_;
};
// AST node for a lookahead or lookbehind, positive or negative. A lookaround
// is zero-width: min_match() and max_match() both return 0.
class RegExpLookaround final : public RegExpTree {
public:
// Direction of the lookaround.
enum Type { LOOKAHEAD, LOOKBEHIND };
// Stores the body, polarity, capture bookkeeping (|capture_count| and
// |capture_from|) and direction exactly as given.
RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count,
int capture_from, Type type)
: body_(body),
is_positive_(is_positive),
capture_count_(capture_count),
capture_from_(capture_from),
type_(type) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpLookaround* AsLookaround() override;
Interval CaptureRegisters() override;
bool IsLookaround() override;
bool IsAnchoredAtStart() override;
int min_match() override { return 0; }
int max_match() override { return 0; }
RegExpTree* body() { return body_; }
bool is_positive() { return is_positive_; }
int capture_count() { return capture_count_; }
int capture_from() { return capture_from_; }
Type type() { return type_; }
// Helper for building the node graph of a lookaround. Its constructor and
// ForMatch() are defined outside this header.
// NOTE(review): the constructor accepts capture register count/start, but no
// matching members are declared here — confirm how they are consumed.
class Builder {
public:
Builder(bool is_positive, RegExpNode* on_success,
int stack_pointer_register, int position_register,
int capture_register_count = 0, int capture_register_start = 0);
RegExpNode* on_match_success() { return on_match_success_; }
RegExpNode* ForMatch(RegExpNode* match);
private:
bool is_positive_;
RegExpNode* on_match_success_;
RegExpNode* on_success_;
int stack_pointer_register_;
int position_register_;
};
private:
RegExpTree* body_;
bool is_positive_;
int capture_count_;
int capture_from_;
Type type_;
};
// AST node for a back reference to a capture group. The referenced capture
// can be supplied at construction or attached later with set_capture().
class RegExpBackReference final : public RegExpTree {
public:
// Creates a back reference with no capture attached yet.
explicit RegExpBackReference(JSRegExp::Flags flags)
: capture_(nullptr), name_(nullptr), flags_(flags) {}
// Creates a back reference to |capture|.
RegExpBackReference(RegExpCapture* capture, JSRegExp::Flags flags)
: capture_(capture), name_(nullptr), flags_(flags) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpBackReference* AsBackReference() override;
bool IsBackReference() override;
int min_match() override { return 0; }
// The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
// recursion, we give up. Ignorance is bliss.
int max_match() override { return kInfinity; }
// Dereferences capture_, so a capture must have been attached (via the
// two-argument constructor or set_capture()) before calling this.
int index() { return capture_->index(); }
RegExpCapture* capture() { return capture_; }
void set_capture(RegExpCapture* capture) { capture_ = capture; }
// Name associated with this reference; nullptr unless set_name() was called.
const ZoneVector<uc16>* name() const { return name_; }
void set_name(const ZoneVector<uc16>* name) { name_ = name; }
private:
RegExpCapture* capture_;
const ZoneVector<uc16>* name_;
const JSRegExp::Flags flags_;
};
// AST node with no state of its own; it matches zero characters
// (min_match() and max_match() both return 0).
class RegExpEmpty final : public RegExpTree {
public:
RegExpEmpty() = default;
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpEmpty* AsEmpty() override;
bool IsEmpty() override;
int min_match() override { return 0; }
int max_match() override { return 0; }
};
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_AST_H_

View File

@ -0,0 +1,55 @@
// Copyright 2008-2009 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_BYTECODE_GENERATOR_INL_H_
#define V8_REGEXP_REGEXP_BYTECODE_GENERATOR_INL_H_
#include "regexp/regexp-bytecode-generator.h"
#include "regexp/regexp-bytecodes.h"
namespace v8 {
namespace internal {
// Appends one 4-byte instruction word: the opcode |byte| occupies the low
// BYTECODE_SHIFT bits and |twenty_four_bits| is stored above it.
void RegExpBytecodeGenerator::Emit(uint32_t byte, uint32_t twenty_four_bits) {
  const uint32_t packed = ((twenty_four_bits << BYTECODE_SHIFT) | byte);
  DCHECK(pc_ <= buffer_.length());

  // Grow the buffer when fewer than four bytes remain at the write position.
  const bool out_of_room = pc_ + 3 >= buffer_.length();
  if (out_of_room) Expand();

  auto* slot = buffer_.begin() + pc_;
  *reinterpret_cast<uint32_t*>(slot) = packed;
  pc_ += 4;
}
// Appends |word| as a 16-bit value at the current write position, growing the
// buffer first when fewer than two bytes remain.
void RegExpBytecodeGenerator::Emit16(uint32_t word) {
  DCHECK(pc_ <= buffer_.length());

  const bool out_of_room = pc_ + 1 >= buffer_.length();
  if (out_of_room) Expand();

  auto* slot = buffer_.begin() + pc_;
  *reinterpret_cast<uint16_t*>(slot) = word;
  pc_ += 2;
}
// Appends |word| as a single byte at the current write position, growing the
// buffer first when the write position is at the end of the buffer.
void RegExpBytecodeGenerator::Emit8(uint32_t word) {
  DCHECK(pc_ <= buffer_.length());

  const bool out_of_room = pc_ == buffer_.length();
  if (out_of_room) Expand();

  auto* slot = buffer_.begin() + pc_;
  *reinterpret_cast<unsigned char*>(slot) = word;
  pc_ += 1;
}
// Appends |word| as a 32-bit value at the current write position, growing the
// buffer first when fewer than four bytes remain.
void RegExpBytecodeGenerator::Emit32(uint32_t word) {
  DCHECK(pc_ <= buffer_.length());

  const bool out_of_room = pc_ + 3 >= buffer_.length();
  if (out_of_room) Expand();

  auto* slot = buffer_.begin() + pc_;
  *reinterpret_cast<uint32_t*>(slot) = word;
  pc_ += 4;
}
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_BYTECODE_GENERATOR_INL_H_

View File

@ -0,0 +1,395 @@
// Copyright 2008-2009 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "regexp/regexp-bytecode-generator.h"
#include "regexp/regexp-bytecode-generator-inl.h"
#include "regexp/regexp-bytecode-peephole.h"
#include "regexp/regexp-bytecodes.h"
#include "regexp/regexp-macro-assembler.h"
namespace v8 {
namespace internal {
// Creates a generator with a freshly allocated 1024-element byte buffer, the
// write position at 0 and no pending AdvanceCurrentPosition to fuse
// (advance_current_end_ == kInvalidPC). advance_current_start_ and
// advance_current_offset_ are left unset; GoTo() only reads them after
// AdvanceCurrentPosition() has written them.
RegExpBytecodeGenerator::RegExpBytecodeGenerator(Isolate* isolate, Zone* zone)
: RegExpMacroAssembler(isolate, zone),
buffer_(Vector<byte>::New(1024)),
pc_(0),
advance_current_end_(kInvalidPC),
jump_edges_(zone),
isolate_(isolate) {}
// Unuses the internal backtrack label if it is still linked, then disposes of
// the bytecode buffer owned by this generator.
RegExpBytecodeGenerator::~RegExpBytecodeGenerator() {
if (backtrack_.is_linked()) backtrack_.Unuse();
buffer_.Dispose();
}
// Identifies this macro assembler as the bytecode implementation.
RegExpBytecodeGenerator::IrregexpImplementation
RegExpBytecodeGenerator::Implementation() {
return kBytecodeImplementation;
}
// Binds |l| to the current write position. Every jump operand emitted earlier
// for this label forms a chain through the buffer (each slot holds the
// position of the next one, 0 terminates); each is patched to the bound
// position and recorded in jump_edges_.
void RegExpBytecodeGenerator::Bind(Label* l) {
  // A bind point ends any opportunity to fuse a preceding ADVANCE_CP.
  advance_current_end_ = kInvalidPC;
  DCHECK(!l->is_bound());

  if (l->is_linked()) {
    int next = 0;
    for (int fixup = l->pos(); fixup != 0; fixup = next) {
      auto* slot = buffer_.begin() + fixup;
      // Read the link to the following slot before overwriting it.
      next = *reinterpret_cast<int32_t*>(slot);
      *reinterpret_cast<uint32_t*>(slot) = pc_;
      jump_edges_.emplace(fixup, pc_);
    }
  }

  l->bind_to(pc_);
}
// Emits the 32-bit jump operand for |l|. A null label means the internal
// backtrack label. For a bound label the operand is its position and the edge
// is recorded in jump_edges_; otherwise the operand holds the previous link
// position (0 if none) and the label is linked to this slot so Bind() can
// patch it later.
void RegExpBytecodeGenerator::EmitOrLink(Label* l) {
  Label* target = (l != nullptr) ? l : &backtrack_;

  int operand = 0;
  if (!target->is_bound()) {
    if (target->is_linked()) {
      operand = target->pos();
    }
    target->link_to(pc_);
  } else {
    operand = target->pos();
    jump_edges_.emplace(pc_, operand);
  }

  Emit32(operand);
}
// Emits BC_POP_REGISTER with |register_index| as its packed argument.
// The index must lie in [0, kMaxRegister] (checked in debug builds).
void RegExpBytecodeGenerator::PopRegister(int register_index) {
DCHECK_LE(0, register_index);
DCHECK_GE(kMaxRegister, register_index);
Emit(BC_POP_REGISTER, register_index);
}
// Emits BC_PUSH_REGISTER with |register_index| as its packed argument.
// |check_stack_limit| is not used by this implementation.
void RegExpBytecodeGenerator::PushRegister(int register_index,
StackCheckFlag check_stack_limit) {
DCHECK_LE(0, register_index);
DCHECK_GE(kMaxRegister, register_index);
Emit(BC_PUSH_REGISTER, register_index);
}
// Emits BC_SET_REGISTER_TO_CP with |register_index| as its packed argument,
// followed by |cp_offset| as a 32-bit operand.
void RegExpBytecodeGenerator::WriteCurrentPositionToRegister(int register_index,
int cp_offset) {
DCHECK_LE(0, register_index);
DCHECK_GE(kMaxRegister, register_index);
Emit(BC_SET_REGISTER_TO_CP, register_index);
Emit32(cp_offset); // Current position offset.
}
// Emits one SetRegister(reg, -1) for every register in the inclusive range
// [reg_from, reg_to].
void RegExpBytecodeGenerator::ClearRegisters(int reg_from, int reg_to) {
  DCHECK(reg_from <= reg_to);
  int reg = reg_from;
  while (reg <= reg_to) {
    SetRegister(reg, -1);
    reg++;
  }
}
// Emits BC_SET_CP_TO_REGISTER with |register_index| as its packed argument.
void RegExpBytecodeGenerator::ReadCurrentPositionFromRegister(
int register_index) {
DCHECK_LE(0, register_index);
DCHECK_GE(kMaxRegister, register_index);
Emit(BC_SET_CP_TO_REGISTER, register_index);
}
// Emits BC_SET_REGISTER_TO_SP with |register_index| as its packed argument.
void RegExpBytecodeGenerator::WriteStackPointerToRegister(int register_index) {
DCHECK_LE(0, register_index);
DCHECK_GE(kMaxRegister, register_index);
Emit(BC_SET_REGISTER_TO_SP, register_index);
}
// Emits BC_SET_SP_TO_REGISTER with |register_index| as its packed argument.
void RegExpBytecodeGenerator::ReadStackPointerFromRegister(int register_index) {
DCHECK_LE(0, register_index);
DCHECK_GE(kMaxRegister, register_index);
Emit(BC_SET_SP_TO_REGISTER, register_index);
}
// Emits BC_SET_CURRENT_POSITION_FROM_END with |by| as its packed argument.
// |by| must pass is_uint24() (checked in debug builds).
void RegExpBytecodeGenerator::SetCurrentPositionFromEnd(int by) {
DCHECK(is_uint24(by));
Emit(BC_SET_CURRENT_POSITION_FROM_END, by);
}
// Emits BC_SET_REGISTER with |register_index| as its packed argument,
// followed by the value |to| as a 32-bit operand.
void RegExpBytecodeGenerator::SetRegister(int register_index, int to) {
DCHECK_LE(0, register_index);
DCHECK_GE(kMaxRegister, register_index);
Emit(BC_SET_REGISTER, register_index);
Emit32(to);
}
// Emits BC_ADVANCE_REGISTER with |register_index| as its packed argument,
// followed by the increment |by| as a 32-bit operand.
void RegExpBytecodeGenerator::AdvanceRegister(int register_index, int by) {
DCHECK_LE(0, register_index);
DCHECK_GE(kMaxRegister, register_index);
Emit(BC_ADVANCE_REGISTER, register_index);
Emit32(by);
}
// Emits BC_POP_CP with a zero argument.
void RegExpBytecodeGenerator::PopCurrentPosition() { Emit(BC_POP_CP, 0); }
// Emits BC_PUSH_CP with a zero argument.
void RegExpBytecodeGenerator::PushCurrentPosition() { Emit(BC_PUSH_CP, 0); }
// Emits BC_POP_BT with a zero argument.
void RegExpBytecodeGenerator::Backtrack() { Emit(BC_POP_BT, 0); }
// Emits a jump to |l|. If the instruction just emitted was an ADVANCE_CP
// (tracked via advance_current_end_), that instruction is overwritten with a
// single fused BC_ADVANCE_CP_AND_GOTO instead.
void RegExpBytecodeGenerator::GoTo(Label* l) {
  if (advance_current_end_ != pc_) {
    // Regular goto.
    Emit(BC_GOTO, 0);
    EmitOrLink(l);
    return;
  }

  // Combine advance current and goto: rewind to where the ADVANCE_CP started
  // and re-emit it as the fused form.
  pc_ = advance_current_start_;
  Emit(BC_ADVANCE_CP_AND_GOTO, advance_current_offset_);
  EmitOrLink(l);
  advance_current_end_ = kInvalidPC;
}
// Emits BC_PUSH_BT followed by the jump operand for |l|.
void RegExpBytecodeGenerator::PushBacktrack(Label* l) {
Emit(BC_PUSH_BT, 0);
EmitOrLink(l);
}
// Emits BC_SUCCEED. Always returns false because this generator does not
// support restarting the match for global regexps.
bool RegExpBytecodeGenerator::Succeed() {
Emit(BC_SUCCEED, 0);
return false; // Restart matching for global regexp not supported.
}
// Emits BC_FAIL with a zero argument.
void RegExpBytecodeGenerator::Fail() { Emit(BC_FAIL, 0); }
// Emits BC_ADVANCE_CP with |by| as its packed argument and remembers where
// the instruction starts and ends, and its offset, so that an immediately
// following GoTo() can replace it with a fused advance-and-goto.
// |by| must lie in [kMinCPOffset, kMaxCPOffset] (checked in debug builds).
void RegExpBytecodeGenerator::AdvanceCurrentPosition(int by) {
DCHECK_LE(kMinCPOffset, by);
DCHECK_GE(kMaxCPOffset, by);
advance_current_start_ = pc_;
advance_current_offset_ = by;
Emit(BC_ADVANCE_CP, by);
// Recorded after emitting: GoTo() fuses only if pc_ still equals this value.
advance_current_end_ = pc_;
}
// Emits BC_CHECK_GREEDY followed by the jump operand for
// |on_tos_equals_current_position|.
void RegExpBytecodeGenerator::CheckGreedyLoop(
Label* on_tos_equals_current_position) {
Emit(BC_CHECK_GREEDY, 0);
EmitOrLink(on_tos_equals_current_position);
}
// Emits a load of 1, 2 or 4 characters at |cp_offset|.
//
// When bounds must be checked and the caller knows more characters will be
// consumed than are loaded (|eats_at_least| > |characters|), a single
// BC_CHECK_CURRENT_POSITION for the larger span is emitted up front and the
// load itself uses the unchecked bytecode. Otherwise a checked load jumps to
// |on_failure| when out of range, and an unchecked load emits no jump operand.
void RegExpBytecodeGenerator::LoadCurrentCharacterImpl(int cp_offset,
                                                       Label* on_failure,
                                                       bool check_bounds,
                                                       int characters,
                                                       int eats_at_least) {
  DCHECK_GE(eats_at_least, characters);
  if (check_bounds && eats_at_least > characters) {
    DCHECK(is_uint24(cp_offset + eats_at_least));
    Emit(BC_CHECK_CURRENT_POSITION, cp_offset + eats_at_least);
    EmitOrLink(on_failure);
    check_bounds = false;  // Load below doesn't need to check.
  }

  DCHECK_LE(kMinCPOffset, cp_offset);
  DCHECK_GE(kMaxCPOffset, cp_offset);

  // Pick the bytecode by load width first, then by checked/unchecked variant.
  int bytecode;
  switch (characters) {
    case 4:
      bytecode = check_bounds ? BC_LOAD_4_CURRENT_CHARS
                              : BC_LOAD_4_CURRENT_CHARS_UNCHECKED;
      break;
    case 2:
      bytecode = check_bounds ? BC_LOAD_2_CURRENT_CHARS
                              : BC_LOAD_2_CURRENT_CHARS_UNCHECKED;
      break;
    default:
      DCHECK_EQ(1, characters);
      bytecode = check_bounds ? BC_LOAD_CURRENT_CHAR
                              : BC_LOAD_CURRENT_CHAR_UNCHECKED;
      break;
  }

  Emit(bytecode, cp_offset);
  if (check_bounds) EmitOrLink(on_failure);
}
// Emits BC_CHECK_LT with |limit| as its packed argument, followed by the
// jump operand for |on_less|.
void RegExpBytecodeGenerator::CheckCharacterLT(uc16 limit, Label* on_less) {
Emit(BC_CHECK_LT, limit);
EmitOrLink(on_less);
}
// Emits BC_CHECK_GT with |limit| as its packed argument, followed by the
// jump operand for |on_greater|.
void RegExpBytecodeGenerator::CheckCharacterGT(uc16 limit, Label* on_greater) {
Emit(BC_CHECK_GT, limit);
EmitOrLink(on_greater);
}
// Emits a comparison of the current character(s) against |c| that jumps to
// |on_equal|. Values up to MAX_FIRST_ARG are packed into the instruction word
// (BC_CHECK_CHAR); larger values use BC_CHECK_4_CHARS with |c| as a separate
// 32-bit operand.
void RegExpBytecodeGenerator::CheckCharacter(uint32_t c, Label* on_equal) {
  if (c <= MAX_FIRST_ARG) {
    Emit(BC_CHECK_CHAR, c);
  } else {
    Emit(BC_CHECK_4_CHARS, 0);
    Emit32(c);
  }
  EmitOrLink(on_equal);
}
// Emits BC_CHECK_AT_START with |cp_offset| as its packed argument, followed
// by the jump operand for |on_at_start|.
void RegExpBytecodeGenerator::CheckAtStart(int cp_offset, Label* on_at_start) {
Emit(BC_CHECK_AT_START, cp_offset);
EmitOrLink(on_at_start);
}
// Emits BC_CHECK_NOT_AT_START with |cp_offset| as its packed argument,
// followed by the jump operand for |on_not_at_start|.
void RegExpBytecodeGenerator::CheckNotAtStart(int cp_offset,
Label* on_not_at_start) {
Emit(BC_CHECK_NOT_AT_START, cp_offset);
EmitOrLink(on_not_at_start);
}
// Emits a comparison of the current character(s) against |c| that jumps to
// |on_not_equal|. Values up to MAX_FIRST_ARG are packed into the instruction
// word (BC_CHECK_NOT_CHAR); larger values use BC_CHECK_NOT_4_CHARS with |c|
// as a separate 32-bit operand.
void RegExpBytecodeGenerator::CheckNotCharacter(uint32_t c,
                                                Label* on_not_equal) {
  if (c <= MAX_FIRST_ARG) {
    Emit(BC_CHECK_NOT_CHAR, c);
  } else {
    Emit(BC_CHECK_NOT_4_CHARS, 0);
    Emit32(c);
  }
  EmitOrLink(on_not_equal);
}
// Emits a masked comparison against |c| that jumps to |on_equal|. |c| is
// packed into the instruction word when it fits in MAX_FIRST_ARG
// (BC_AND_CHECK_CHAR), otherwise it follows BC_AND_CHECK_4_CHARS as a 32-bit
// operand. In both forms |mask| and the jump operand come next.
void RegExpBytecodeGenerator::CheckCharacterAfterAnd(uint32_t c, uint32_t mask,
                                                     Label* on_equal) {
  if (c <= MAX_FIRST_ARG) {
    Emit(BC_AND_CHECK_CHAR, c);
  } else {
    Emit(BC_AND_CHECK_4_CHARS, 0);
    Emit32(c);
  }
  Emit32(mask);
  EmitOrLink(on_equal);
}
// Emits a masked comparison against |c| that jumps to |on_not_equal|. |c| is
// packed into the instruction word when it fits in MAX_FIRST_ARG
// (BC_AND_CHECK_NOT_CHAR), otherwise it follows BC_AND_CHECK_NOT_4_CHARS as a
// 32-bit operand. In both forms |mask| and the jump operand come next.
void RegExpBytecodeGenerator::CheckNotCharacterAfterAnd(uint32_t c,
                                                        uint32_t mask,
                                                        Label* on_not_equal) {
  if (c <= MAX_FIRST_ARG) {
    Emit(BC_AND_CHECK_NOT_CHAR, c);
  } else {
    Emit(BC_AND_CHECK_NOT_4_CHARS, 0);
    Emit32(c);
  }
  Emit32(mask);
  EmitOrLink(on_not_equal);
}
// Emits BC_MINUS_AND_CHECK_NOT_CHAR with |c| as its packed argument, followed
// by |minus| and |mask| as two 16-bit operands and the jump operand for
// |on_not_equal|.
void RegExpBytecodeGenerator::CheckNotCharacterAfterMinusAnd(
uc16 c, uc16 minus, uc16 mask, Label* on_not_equal) {
Emit(BC_MINUS_AND_CHECK_NOT_CHAR, c);
Emit16(minus);
Emit16(mask);
EmitOrLink(on_not_equal);
}
// Emits BC_CHECK_CHAR_IN_RANGE, followed by |from| and |to| as two 16-bit
// operands and the jump operand for |on_in_range|.
void RegExpBytecodeGenerator::CheckCharacterInRange(uc16 from, uc16 to,
Label* on_in_range) {
Emit(BC_CHECK_CHAR_IN_RANGE, 0);
Emit16(from);
Emit16(to);
EmitOrLink(on_in_range);
}
// Emits BC_CHECK_CHAR_NOT_IN_RANGE, followed by |from| and |to| as two 16-bit
// operands and the jump operand for |on_not_in_range|.
void RegExpBytecodeGenerator::CheckCharacterNotInRange(uc16 from, uc16 to,
Label* on_not_in_range) {
Emit(BC_CHECK_CHAR_NOT_IN_RANGE, 0);
Emit16(from);
Emit16(to);
EmitOrLink(on_not_in_range);
}
// Emits BC_CHECK_BIT_IN_TABLE, the jump operand for |on_bit_set|, and then
// the table itself packed as a bitmap: each run of kBitsPerByte entries of
// |table| becomes one emitted byte, with entry j of the run mapped to bit j
// (set when the entry is non-zero).
void RegExpBytecodeGenerator::CheckBitInTable(Handle<ByteArray> table,
                                              Label* on_bit_set) {
  Emit(BC_CHECK_BIT_IN_TABLE, 0);
  EmitOrLink(on_bit_set);

  for (int first = 0; first < kTableSize; first += kBitsPerByte) {
    int packed = 0;
    for (int bit = 0; bit < kBitsPerByte; bit++) {
      const bool is_set = table->get(first + bit) != 0;
      if (is_set) packed |= 1 << bit;
    }
    Emit8(packed);
  }
}
// Emits a back-reference check against the capture starting at register
// |start_reg|, choosing the backward-reading bytecode when |read_backward| is
// set, followed by the jump operand for |on_not_equal|.
void RegExpBytecodeGenerator::CheckNotBackReference(int start_reg,
                                                    bool read_backward,
                                                    Label* on_not_equal) {
  DCHECK_LE(0, start_reg);
  DCHECK_GE(kMaxRegister, start_reg);

  int bytecode = BC_CHECK_NOT_BACK_REF;
  if (read_backward) bytecode = BC_CHECK_NOT_BACK_REF_BACKWARD;

  Emit(bytecode, start_reg);
  EmitOrLink(on_not_equal);
}
// Case-insensitive variant of the back-reference check: emits the NO_CASE
// bytecode (backward-reading when |read_backward| is set) with |start_reg| as
// its packed argument, followed by the jump operand for |on_not_equal|.
void RegExpBytecodeGenerator::CheckNotBackReferenceIgnoreCase(
    int start_reg, bool read_backward, Label* on_not_equal) {
  DCHECK_LE(0, start_reg);
  DCHECK_GE(kMaxRegister, start_reg);

  int bytecode = BC_CHECK_NOT_BACK_REF_NO_CASE;
  if (read_backward) bytecode = BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD;

  Emit(bytecode, start_reg);
  EmitOrLink(on_not_equal);
}
// Emits BC_CHECK_REGISTER_LT with |register_index| as its packed argument,
// followed by |comparand| as a 32-bit operand and the jump operand for
// |on_less_than|.
void RegExpBytecodeGenerator::IfRegisterLT(int register_index, int comparand,
Label* on_less_than) {
DCHECK_LE(0, register_index);
DCHECK_GE(kMaxRegister, register_index);
Emit(BC_CHECK_REGISTER_LT, register_index);
Emit32(comparand);
EmitOrLink(on_less_than);
}
// Emits BC_CHECK_REGISTER_GE with |register_index| as its packed argument,
// followed by |comparand| as a 32-bit operand and the jump operand for
// |on_greater_or_equal|.
void RegExpBytecodeGenerator::IfRegisterGE(int register_index, int comparand,
Label* on_greater_or_equal) {
DCHECK_LE(0, register_index);
DCHECK_GE(kMaxRegister, register_index);
Emit(BC_CHECK_REGISTER_GE, register_index);
Emit32(comparand);
EmitOrLink(on_greater_or_equal);
}
// Emits BC_CHECK_REGISTER_EQ_POS with |register_index| as its packed
// argument, followed by the jump operand for |on_eq|.
void RegExpBytecodeGenerator::IfRegisterEqPos(int register_index,
Label* on_eq) {
DCHECK_LE(0, register_index);
DCHECK_GE(kMaxRegister, register_index);
Emit(BC_CHECK_REGISTER_EQ_POS, register_index);
EmitOrLink(on_eq);
}
// Finalises the bytecode and returns it as a ByteArray. The internal
// backtrack label is bound to a trailing BC_POP_BT first. The result is either
// the peephole-optimised bytecode (when FLAG_regexp_peephole_optimization is
// set) or a plain copy of the buffer.
Handle<HeapObject> RegExpBytecodeGenerator::GetCode(Handle<String> source) {
  Bind(&backtrack_);
  Emit(BC_POP_BT, 0);

  const bool run_peephole = FLAG_regexp_peephole_optimization;

  Handle<ByteArray> result;
  if (!run_peephole) {
    result = isolate_->factory()->NewByteArray(length());
    Copy(result->GetDataStartAddress());
  } else {
    result = RegExpBytecodePeepholeOptimization::OptimizeBytecode(
        isolate_, zone(), source, buffer_.begin(), length(), jump_edges_);
  }
  return result;
}
// Number of bytes of bytecode emitted so far (the current write position).
int RegExpBytecodeGenerator::length() { return pc_; }
// Copies the length() bytes emitted so far from the internal buffer to |a|.
// |a| must have room for at least length() bytes.
void RegExpBytecodeGenerator::Copy(byte* a) {
MemCopy(a, buffer_.begin(), length());
}
void RegExpBytecodeGenerator::Expand() {
Vector<byte> old_buffer = buffer_;
buffer_ = Vector<byte>::New(old_buffer.length() * 2);
MemCopy(buffer_.begin(), old_buffer.begin(), old_buffer.length());
old_buffer.Dispose();
}
} // namespace internal
} // namespace v8

View File

@ -0,0 +1,119 @@
// Copyright 2012 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_BYTECODE_GENERATOR_H_
#define V8_REGEXP_REGEXP_BYTECODE_GENERATOR_H_
#include "regexp/regexp-macro-assembler.h"
namespace v8 {
namespace internal {
// An assembler/generator for the Irregexp byte code.
// Implements the RegExpMacroAssembler operations by appending bytecode
// instructions to an internally owned, growable byte buffer.
class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler {
public:
// Create an assembler. Instructions are emitted into a buffer, starting from
// the beginning.
//
// The assembler allocates and grows its own buffer. The buffer is owned by
// the assembler and deallocated upon destruction of the assembler.
// NOTE(review): an earlier version of this comment mentioned a buffer_size
// argument and relocation information; this constructor takes neither.
RegExpBytecodeGenerator(Isolate* isolate, Zone* zone);
virtual ~RegExpBytecodeGenerator();
// The byte-code interpreter checks on each push anyway.
virtual int stack_limit_slack() { return 1; }
virtual bool CanReadUnaligned() { return false; }
// Control flow and position handling.
virtual void Bind(Label* label);
virtual void AdvanceCurrentPosition(int by); // Signed cp change.
virtual void PopCurrentPosition();
virtual void PushCurrentPosition();
virtual void Backtrack();
virtual void GoTo(Label* label);
virtual void PushBacktrack(Label* label);
virtual bool Succeed();
virtual void Fail();
// Register manipulation.
virtual void PopRegister(int register_index);
virtual void PushRegister(int register_index,
StackCheckFlag check_stack_limit);
virtual void AdvanceRegister(int reg, int by); // r[reg] += by.
virtual void SetCurrentPositionFromEnd(int by);
virtual void SetRegister(int register_index, int to);
virtual void WriteCurrentPositionToRegister(int reg, int cp_offset);
virtual void ClearRegisters(int reg_from, int reg_to);
virtual void ReadCurrentPositionFromRegister(int reg);
virtual void WriteStackPointerToRegister(int reg);
virtual void ReadStackPointerFromRegister(int reg);
// Character loading and tests. Each Check*/If* operation takes the label to
// jump to when its condition holds.
virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
bool check_bounds, int characters,
int eats_at_least);
virtual void CheckCharacter(unsigned c, Label* on_equal);
virtual void CheckCharacterAfterAnd(unsigned c, unsigned mask,
Label* on_equal);
virtual void CheckCharacterGT(uc16 limit, Label* on_greater);
virtual void CheckCharacterLT(uc16 limit, Label* on_less);
virtual void CheckGreedyLoop(Label* on_tos_equals_current_position);
virtual void CheckAtStart(int cp_offset, Label* on_at_start);
virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start);
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal);
virtual void CheckNotCharacterAfterAnd(unsigned c, unsigned mask,
Label* on_not_equal);
virtual void CheckNotCharacterAfterMinusAnd(uc16 c, uc16 minus, uc16 mask,
Label* on_not_equal);
virtual void CheckCharacterInRange(uc16 from, uc16 to, Label* on_in_range);
virtual void CheckCharacterNotInRange(uc16 from, uc16 to,
Label* on_not_in_range);
virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set);
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward,
Label* on_no_match);
virtual void IfRegisterLT(int register_index, int comparand, Label* if_lt);
virtual void IfRegisterGE(int register_index, int comparand, Label* if_ge);
virtual void IfRegisterEqPos(int register_index, Label* if_eq);
virtual IrregexpImplementation Implementation();
// Finalises and returns the generated bytecode.
virtual Handle<HeapObject> GetCode(Handle<String> source);
private:
// Grows buffer_.
void Expand();
// Code and bitmap emission.
inline void EmitOrLink(Label* label);
inline void Emit32(uint32_t x);
inline void Emit16(uint32_t x);
inline void Emit8(uint32_t x);
inline void Emit(uint32_t bc, uint32_t arg);
// Bytecode buffer.
int length();
void Copy(byte* a);
// The buffer into which code is generated.
Vector<byte> buffer_;
// The program counter.
int pc_;
// Label used whenever an operation is given a null label.
Label backtrack_;
// Bookkeeping for the most recently emitted "advance current position"
// instruction (its start, its offset argument and the position right after
// it), used by GoTo() to fuse it with a directly following jump.
int advance_current_start_;
int advance_current_offset_;
int advance_current_end_;
// Stores jump edges emitted for the bytecode (used by
// RegExpBytecodePeepholeOptimization).
// Key: jump source (offset in buffer_ where jump destination is stored).
// Value: jump destination (offset in buffer_ to jump to).
ZoneUnorderedMap<int, int> jump_edges_;
Isolate* isolate_;
// Sentinel for advance_current_end_ meaning "nothing to fuse".
static const int kInvalidPC = -1;
DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpBytecodeGenerator);
};
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_BYTECODE_GENERATOR_H_

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,30 @@
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_BYTECODE_PEEPHOLE_H_
#define V8_REGEXP_REGEXP_BYTECODE_PEEPHOLE_H_
#include "regexp/regexp-shim.h"
namespace v8 {
namespace internal {
class ByteArray;
// Peephole optimization for regexp interpreter bytecode.
// Pre-defined bytecode sequences occurring in the bytecode generated by the
// RegExpBytecodeGenerator can be optimized into a single bytecode.
class RegExpBytecodePeepholeOptimization : public AllStatic {
public:
// Performs peephole optimization on the given bytecode and returns the
// optimized bytecode.
// |bytecode| / |length| describe the input bytes; |jump_edges| maps each
// offset at which a jump destination is stored to that destination.
static Handle<ByteArray> OptimizeBytecode(
Isolate* isolate, Zone* zone, Handle<String> source, const byte* bytecode,
int length, const ZoneUnorderedMap<int, int>& jump_edges);
};
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_BYTECODE_PEEPHOLE_H_

View File

@ -0,0 +1,45 @@
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "regexp/regexp-bytecodes.h"
#include <cctype>
namespace v8 {
namespace internal {
// Prints one instruction starting at |pc|: its name, every byte of the
// instruction (opcode included) in hex, then the argument bytes as ASCII with
// non-printable bytes shown as '.'. |code_base| is not used.
void RegExpBytecodeDisassembleSingle(const byte* code_base, const byte* pc) {
  PrintF("%s", RegExpBytecodeName(*pc));

  // Args and the bytecode as hex.
  int hex_index = 0;
  while (hex_index < RegExpBytecodeLength(*pc)) {
    PrintF(", %02x", pc[hex_index]);
    hex_index++;
  }
  PrintF(" ");

  // Args as ascii; index 0 is the opcode itself and is skipped.
  int ascii_index = 1;
  while (ascii_index < RegExpBytecodeLength(*pc)) {
    unsigned char b = pc[ascii_index];
    PrintF("%c", std::isprint(b) ? b : '.');
    ascii_index++;
  }
  PrintF("\n");
}
// Prints a header naming |pattern|, then disassembles the |length| bytes at
// |code_base| one instruction at a time, prefixing each line with the
// instruction's address and its offset from |code_base|.
void RegExpBytecodeDisassemble(const byte* code_base, int length,
                               const char* pattern) {
  PrintF("[generated bytecode for regexp pattern: '%s']\n", pattern);

  // Each step advances by the length of the instruction just printed.
  for (ptrdiff_t offset = 0; offset < length;
       offset += RegExpBytecodeLength(code_base[offset])) {
    const byte* const pc = code_base + offset;
    PrintF("%p  %4" V8PRIxPTRDIFF "  ", pc, offset);
    RegExpBytecodeDisassembleSingle(code_base, pc);
  }
}
} // namespace internal
} // namespace v8

View File

@ -0,0 +1,251 @@
// Copyright 2011 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_BYTECODES_H_
#define V8_REGEXP_REGEXP_BYTECODES_H_
#include "regexp/regexp-shim.h"
namespace v8 {
namespace internal {
// Maximum number of bytecodes that will be used (next power of 2 of actually
// defined bytecodes).
// All slots between the last actually defined bytecode and maximum id will be
// filled with BREAKs, indicating an invalid operation. This way using
// BYTECODE_MASK guarantees no OOB access to the dispatch table.
constexpr int kRegExpPaddedBytecodeCount = 1 << 6;
constexpr int BYTECODE_MASK = kRegExpPaddedBytecodeCount - 1;
// The first argument is packed into the same word as the bytecode, so it has
// 24 bits. Because it can be positive or negative, only 23 bits are available
// for positive values.
const unsigned int MAX_FIRST_ARG = 0x7fffffu;
// Number of bits by which the first argument is shifted above the bytecode.
const int BYTECODE_SHIFT = 8;
// The bytecode id must fit entirely below the shifted first argument.
STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK);
// TODO(pthier): Argument offsets of bytecodes should be easily accessible by
// name or at least by position.
// X-macro listing every regexp bytecode as V(name, code, length): |name| is
// the mnemonic, |code| the opcode value and |length| the size of the whole
// instruction in bytes. The trailing comment on an entry (where present)
// sketches the operand layout, e.g. "bc8 reg_idx24" is an 8-bit bytecode
// followed by a 24-bit register index.
#define BYTECODE_ITERATOR(V) \
V(BREAK, 0, 4) /* bc8 */ \
V(PUSH_CP, 1, 4) /* bc8 pad24 */ \
V(PUSH_BT, 2, 8) /* bc8 pad24 offset32 */ \
V(PUSH_REGISTER, 3, 4) /* bc8 reg_idx24 */ \
V(SET_REGISTER_TO_CP, 4, 8) /* bc8 reg_idx24 offset32 */ \
V(SET_CP_TO_REGISTER, 5, 4) /* bc8 reg_idx24 */ \
V(SET_REGISTER_TO_SP, 6, 4) /* bc8 reg_idx24 */ \
V(SET_SP_TO_REGISTER, 7, 4) /* bc8 reg_idx24 */ \
V(SET_REGISTER, 8, 8) /* bc8 reg_idx24 value32 */ \
V(ADVANCE_REGISTER, 9, 8) /* bc8 reg_idx24 value32 */ \
V(POP_CP, 10, 4) /* bc8 pad24 */ \
V(POP_BT, 11, 4) /* bc8 pad24 */ \
V(POP_REGISTER, 12, 4) /* bc8 reg_idx24 */ \
V(FAIL, 13, 4) /* bc8 pad24 */ \
V(SUCCEED, 14, 4) /* bc8 pad24 */ \
V(ADVANCE_CP, 15, 4) /* bc8 offset24 */ \
/* Jump to another bytecode given its offset. */ \
/* Bit Layout: */ \
/* 0x00 - 0x07: 0x10 (fixed) Bytecode */ \
/* 0x08 - 0x1F: 0x00 (unused) Padding */ \
/* 0x20 - 0x3F: Address of bytecode to jump to */ \
V(GOTO, 16, 8) /* bc8 pad24 addr32 */ \
/* Check if offset is in range and load character at given offset. */ \
/* Bit Layout: */ \
/* 0x00 - 0x07: 0x11 (fixed) Bytecode */ \
/* 0x08 - 0x1F: Offset from current position */ \
/* 0x20 - 0x3F: Address of bytecode when load is out of range */ \
V(LOAD_CURRENT_CHAR, 17, 8) /* bc8 offset24 addr32 */ \
/* Load character at given offset without range checks. */ \
/* Bit Layout: */ \
/* 0x00 - 0x07: 0x12 (fixed) Bytecode */ \
/* 0x08 - 0x1F: Offset from current position */ \
V(LOAD_CURRENT_CHAR_UNCHECKED, 18, 4) /* bc8 offset24 */ \
V(LOAD_2_CURRENT_CHARS, 19, 8) /* bc8 offset24 addr32 */ \
V(LOAD_2_CURRENT_CHARS_UNCHECKED, 20, 4) /* bc8 offset24 */ \
V(LOAD_4_CURRENT_CHARS, 21, 8) /* bc8 offset24 addr32 */ \
V(LOAD_4_CURRENT_CHARS_UNCHECKED, 22, 4) /* bc8 offset24 */ \
V(CHECK_4_CHARS, 23, 12) /* bc8 pad24 uint32 addr32 */ \
/* Check if current character is equal to a given character */ \
/* Bit Layout: */ \
/* 0x00 - 0x07: 0x18 (fixed) Bytecode */ \
/* 0x08 - 0x0F: 0x00 (unused) Padding */ \
/* 0x10 - 0x1F: Character to check */ \
/* 0x20 - 0x3F: Address of bytecode when matched */ \
V(CHECK_CHAR, 24, 8) /* bc8 pad8 uint16 addr32 */ \
V(CHECK_NOT_4_CHARS, 25, 12) /* bc8 pad24 uint32 addr32 */ \
V(CHECK_NOT_CHAR, 26, 8) /* bc8 pad8 uint16 addr32 */ \
V(AND_CHECK_4_CHARS, 27, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
/* Checks if the current character combined with mask (bitwise and) */ \
/* matches a character (e.g. used when two characters in a disjunction */ \
/* differ by only a single bit) */ \
/* Bit Layout: */ \
/* 0x00 - 0x07: 0x1c (fixed) Bytecode */ \
/* 0x08 - 0x0F: 0x00 (unused) Padding */ \
/* 0x10 - 0x1F: Character to match against (after mask applied) */ \
/* 0x20 - 0x3F: Bitmask bitwise and combined with current character */ \
/* 0x40 - 0x5F: Address of bytecode when matched */ \
V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
V(MINUS_AND_CHECK_NOT_CHAR, 31, 12) /* bc8 pad8 uc16 uc16 uc16 addr32 */ \
V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
V(CHECK_CHAR_NOT_IN_RANGE, 33, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
/* Checks if the current character matches any of the characters encoded */ \
/* in a bit table. Similar to/inspired by boyer moore string search */ \
/* Bit Layout: */ \
/* 0x00 - 0x07: 0x22 (fixed) Bytecode */ \
/* 0x08 - 0x1F: 0x00 (unused) Padding */ \
/* 0x20 - 0x3F: Address of bytecode when bit is set */ \
/* 0x40 - 0xBF: Bit table */ \
V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \
V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \
V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \
V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) /* UNUSED */ \
V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) /* UNUSED */ \
V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */ \
V(CHECK_REGISTER_LT, 44, 12) /* bc8 reg_idx24 value32 addr32 */ \
V(CHECK_REGISTER_GE, 45, 12) /* bc8 reg_idx24 value32 addr32 */ \
V(CHECK_REGISTER_EQ_POS, 46, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_AT_START, 47, 8) /* bc8 pad24 addr32 */ \
V(CHECK_NOT_AT_START, 48, 8) /* bc8 offset24 addr32 */ \
/* Checks if the current position matches top of backtrack stack */ \
/* Bit Layout: */ \
/* 0x00 - 0x07: 0x31 (fixed) Bytecode */ \
/* 0x08 - 0x1F: 0x00 (unused) Padding */ \
/* 0x20 - 0x3F: Address of bytecode when current matches tos */ \
V(CHECK_GREEDY, 49, 8) /* bc8 pad24 addr32 */ \
/* Advance character pointer by given offset and jump to another bytecode.*/ \
/* Bit Layout: */ \
/* 0x00 - 0x07: 0x32 (fixed) Bytecode */ \
/* 0x08 - 0x1F: Number of characters to advance */ \
/* 0x20 - 0x3F: Address of bytecode to jump to */ \
V(ADVANCE_CP_AND_GOTO, 50, 8) /* bc8 offset24 addr32 */ \
V(SET_CURRENT_POSITION_FROM_END, 51, 4) /* bc8 idx24 */ \
/* Checks if current position + given offset is in range. */ \
/* Bit Layout: */ \
/* 0x00 - 0x07: 0x34 (fixed) Bytecode */ \
/* 0x08 - 0x1F: Offset from current position */ \
/* 0x20 - 0x3F: Address of bytecode when position is out of range */ \
V(CHECK_CURRENT_POSITION, 52, 8) /* bc8 idx24 addr32 */ \
/* Combination of: */ \
/* LOAD_CURRENT_CHAR, CHECK_BIT_IN_TABLE and ADVANCE_CP_AND_GOTO */ \
/* Emitted by RegExpBytecodePeepholeOptimization. */ \
/* Bit Layout: */ \
/* 0x00 - 0x07 0x35 (fixed) Bytecode */ \
/* 0x08 - 0x1F Load character offset from current position */ \
/* 0x20 - 0x3F Number of characters to advance */ \
/* 0x40 - 0xBF Bit Table */ \
/* 0xC0 - 0xDF Address of bytecode when character is matched */ \
/* 0xE0 - 0xFF Address of bytecode when no match */ \
V(SKIP_UNTIL_BIT_IN_TABLE, 53, 32) \
/* Combination of: */ \
/* CHECK_CURRENT_POSITION, LOAD_CURRENT_CHAR_UNCHECKED, AND_CHECK_CHAR */ \
/* and ADVANCE_CP_AND_GOTO */ \
/* Emitted by RegExpBytecodePeepholeOptimization. */ \
/* Bit Layout: */ \
/* 0x00 - 0x07 0x36 (fixed) Bytecode */ \
/* 0x08 - 0x1F Load character offset from current position */ \
/* 0x20 - 0x2F Number of characters to advance */ \
/* 0x30 - 0x3F Character to match against (after mask applied) */ \
/* 0x40 - 0x5F: Bitmask bitwise and combined with current character */ \
/* 0x60 - 0x7F Minimum number of characters this pattern consumes */ \
/* 0x80 - 0x9F Address of bytecode when character is matched */ \
/* 0xA0 - 0xBF Address of bytecode when no match */ \
V(SKIP_UNTIL_CHAR_AND, 54, 24) \
/* Combination of: */ \
/* LOAD_CURRENT_CHAR, CHECK_CHAR and ADVANCE_CP_AND_GOTO */ \
/* Emitted by RegExpBytecodePeepholeOptimization. */ \
/* Bit Layout: */ \
/* 0x00 - 0x07 0x37 (fixed) Bytecode */ \
/* 0x08 - 0x1F Load character offset from current position */ \
/* 0x20 - 0x2F Number of characters to advance */ \
/* 0x30 - 0x3F Character to match */ \
/* 0x40 - 0x5F Address of bytecode when character is matched */ \
/* 0x60 - 0x7F Address of bytecode when no match */ \
V(SKIP_UNTIL_CHAR, 55, 16) \
/* Combination of: */ \
/* CHECK_CURRENT_POSITION, LOAD_CURRENT_CHAR_UNCHECKED, CHECK_CHAR */ \
/* and ADVANCE_CP_AND_GOTO */ \
/* Emitted by RegExpBytecodePeepholeOptimization. */ \
/* Bit Layout: */ \
/* 0x00 - 0x07 0x38 (fixed) Bytecode */ \
/* 0x08 - 0x1F Load character offset from current position */ \
/* 0x20 - 0x2F Number of characters to advance */ \
/* 0x30 - 0x3F Character to match */ \
/* 0x40 - 0x5F Minimum number of characters this pattern consumes */ \
/* 0x60 - 0x7F Address of bytecode when character is matched */ \
/* 0x80 - 0x9F Address of bytecode when no match */ \
V(SKIP_UNTIL_CHAR_POS_CHECKED, 56, 20) \
/* Combination of: */ \
/* LOAD_CURRENT_CHAR, CHECK_CHAR, CHECK_CHAR and ADVANCE_CP_AND_GOTO */ \
/* Emitted by RegExpBytecodePeepholeOptimization. */ \
/* Bit Layout: */ \
/* 0x00 - 0x07 0x39 (fixed) Bytecode */ \
/* 0x08 - 0x1F Load character offset from current position */ \
/* 0x20 - 0x3F Number of characters to advance */ \
/* 0x40 - 0x4F Character to match */ \
/* 0x50 - 0x5F Other Character to match */ \
/* 0x60 - 0x7F Address of bytecode when either character is matched */ \
/* 0x80 - 0x9F Address of bytecode when no match */ \
V(SKIP_UNTIL_CHAR_OR_CHAR, 57, 20) \
/* Combination of: */ \
/* LOAD_CURRENT_CHAR, CHECK_GT, CHECK_BIT_IN_TABLE, GOTO and */ \
/* ADVANCE_CP_AND_GOTO */ \
/* Emitted by RegExpBytecodePeepholeOptimization. */ \
/* Bit Layout: */ \
/* 0x00 - 0x07 0x3A (fixed) Bytecode */ \
/* 0x08 - 0x1F Load character offset from current position */ \
/* 0x20 - 0x2F Number of characters to advance */ \
/* 0x30 - 0x3F Character to check if it is less than current char */ \
/* 0x40 - 0xBF Bit Table */ \
/* 0xC0 - 0xDF Address of bytecode when character is matched */ \
/* 0xE0 - 0xFF Address of bytecode when no match */ \
V(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE, 58, 32)
#define COUNT(...) +1
static constexpr int kRegExpBytecodeCount = BYTECODE_ITERATOR(COUNT);
#undef COUNT
// Just making sure we assigned values above properly. They should be
// contiguous, strictly increasing, and start at 0.
// TODO(jgruber): Do not explicitly assign values, instead generate them
// implicitly from the list order.
STATIC_ASSERT(kRegExpBytecodeCount == 59);
// Defines one integer constant BC_<name> per bytecode, holding its opcode
// value from BYTECODE_ITERATOR.
#define DEFINE_BYTECODE_CONSTANT(mnemonic, opcode, size) \
  static constexpr int BC_##mnemonic = opcode;
BYTECODE_ITERATOR(DEFINE_BYTECODE_CONSTANT)
#undef DEFINE_BYTECODE_CONSTANT
static constexpr int kRegExpBytecodeLengths[] = {
#define DECLARE_BYTECODE_LENGTH(name, code, length) length,
BYTECODE_ITERATOR(DECLARE_BYTECODE_LENGTH)
#undef DECLARE_BYTECODE_LENGTH
};
inline constexpr int RegExpBytecodeLength(int bytecode) {
return kRegExpBytecodeLengths[bytecode];
}
static const char* const kRegExpBytecodeNames[] = {
#define DECLARE_BYTECODE_NAME(name, ...) #name,
BYTECODE_ITERATOR(DECLARE_BYTECODE_NAME)
#undef DECLARE_BYTECODE_NAME
};
inline const char* RegExpBytecodeName(int bytecode) {
return kRegExpBytecodeNames[bytecode];
}
void RegExpBytecodeDisassembleSingle(const byte* code_base, const byte* pc);
void RegExpBytecodeDisassemble(const byte* code_base, int length,
const char* pattern);
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_BYTECODES_H_

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,621 @@
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_COMPILER_H_
#define V8_REGEXP_REGEXP_COMPILER_H_
#include <bitset>
#include "regexp/regexp-nodes.h"
namespace v8 {
namespace internal {
class DynamicBitSet;
class Isolate;
namespace regexp_compiler_constants {

// Each table below is a flat list of ranges written as (from, to) pairs with
// an inclusive |from| and an exclusive |to|. Every table ends with
// kRangeEndMarker.
constexpr uc32 kRangeEndMarker = 0x110000;

// \s as defined in ECMA-262 5.1, 15.10.2.12, which includes WhiteSpace (7.2)
// and LineTerminator (7.3) values.
constexpr int kSpaceRanges[] = {
    '\t',   '\r' + 1,  //
    ' ',    ' ' + 1,   //
    0x00A0, 0x00A1,    //
    0x1680, 0x1681,    //
    0x2000, 0x200B,    //
    0x2028, 0x202A,    //
    0x202F, 0x2030,    //
    0x205F, 0x2060,    //
    0x3000, 0x3001,    //
    0xFEFF, 0xFF00,    //
    kRangeEndMarker};
constexpr int kSpaceRangeCount = arraysize(kSpaceRanges);

// Digits, upper-case letters, underscore and lower-case letters.
constexpr int kWordRanges[] = {
    '0', '9' + 1,  //
    'A', 'Z' + 1,  //
    '_', '_' + 1,  //
    'a', 'z' + 1,  //
    kRangeEndMarker};
constexpr int kWordRangeCount = arraysize(kWordRanges);

constexpr int kDigitRanges[] = {
    '0', '9' + 1,  //
    kRangeEndMarker};
constexpr int kDigitRangeCount = arraysize(kDigitRanges);

constexpr int kSurrogateRanges[] = {
    kLeadSurrogateStart, kLeadSurrogateStart + 1,  //
    kRangeEndMarker};
constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges);

constexpr int kLineTerminatorRanges[] = {
    0x000A, 0x000B,  //
    0x000D, 0x000E,  //
    0x2028, 0x202A,  //
    kRangeEndMarker};
constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);

// More makes code generation slower, less makes V8 benchmark score lower.
constexpr int kMaxLookaheadForBoyerMoore = 8;

// In a 3-character pattern you can maximally step forwards 3 characters
// at a time, which is not always enough to pay for the extra logic.
constexpr int kPatternTooShortForBoyerMoore = 2;

}  // namespace regexp_compiler_constants
// True when |flags| has the JSRegExp::kIgnoreCase bit set.
inline bool IgnoreCase(JSRegExp::Flags flags) {
  const bool ignore_case = (flags & JSRegExp::kIgnoreCase) != 0;
  return ignore_case;
}
// True when |flags| has the JSRegExp::kUnicode bit set.
inline bool IsUnicode(JSRegExp::Flags flags) {
  const bool unicode = (flags & JSRegExp::kUnicode) != 0;
  return unicode;
}
// True when |flags| has the JSRegExp::kSticky bit set.
inline bool IsSticky(JSRegExp::Flags flags) {
  const bool sticky = (flags & JSRegExp::kSticky) != 0;
  return sticky;
}
// True when |flags| has the JSRegExp::kGlobal bit set.
inline bool IsGlobal(JSRegExp::Flags flags) {
  const bool global = (flags & JSRegExp::kGlobal) != 0;
  return global;
}
// True when |flags| has the JSRegExp::kDotAll bit set.
inline bool DotAll(JSRegExp::Flags flags) {
  const bool dot_all = (flags & JSRegExp::kDotAll) != 0;
  return dot_all;
}
// True when |flags| has the JSRegExp::kMultiline bit set.
inline bool Multiline(JSRegExp::Flags flags) {
  const bool multiline = (flags & JSRegExp::kMultiline) != 0;
  return multiline;
}
// True only when both the unicode and the ignore-case flags are set. In that
// case ICU is needed to find the closure over case equivalents.
inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
  if (!IsUnicode(flags)) return false;
  return IgnoreCase(flags);
}
// Describes a quick mask-and-compare check that can look ahead in the input
// stream.
class QuickCheckDetails {
 public:
  // Mask/value information for a single character position.
  struct Position {
    Position() : mask(0), value(0), determines_perfectly(false) {}
    uc16 mask;
    uc16 value;
    bool determines_perfectly;
  };

  // Starts with no characters covered.
  QuickCheckDetails() : QuickCheckDetails(0) {}
  explicit QuickCheckDetails(int characters)
      : characters_(characters), mask_(0), value_(0), cannot_match_(false) {}

  bool Rationalize(bool one_byte);
  // Merge in the information from another branch of an alternation.
  void Merge(QuickCheckDetails* other, int from_index);
  // Advance the current position by some amount.
  void Advance(int by, bool one_byte);
  void Clear();

  bool cannot_match() { return cannot_match_; }
  void set_cannot_match() { cannot_match_ = true; }

  int characters() { return characters_; }
  void set_characters(int characters) { characters_ = characters; }

  // Returns the entry for position |index|, which must lie in
  // [0, characters()).
  Position* positions(int index) {
    DCHECK_LE(0, index);
    DCHECK_GT(characters_, index);
    return &positions_[index];
  }

  uint32_t mask() { return mask_; }
  uint32_t value() { return value_; }

 private:
  // How many characters do we have quick check information from. This is
  // the same for all branches of a choice node.
  int characters_;
  Position positions_[4];
  // These values are the condensate of the above array after Rationalize().
  uint32_t mask_;
  uint32_t value_;
  // If set to true, there is no way this quick check can match at all.
  // E.g., if it requires to be at the start of the input, and isn't.
  bool cannot_match_;
};
// Improve the speed that we scan for an initial point where a non-anchored
// regexp can match by using a Boyer-Moore-like table. This is done by
// identifying non-greedy non-capturing loops in the nodes that eat any
// character one at a time. For example in the middle of the regexp
// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
// inserted at the start of any non-anchored regexp.
//
// When we have found such a loop we look ahead in the nodes to find the set of
// characters that can come at given distances. For example for the regexp
// /.?foo/ we know that there are at least 3 characters ahead of us, and the
// sets of characters that can occur are [any, [f, o], [o]]. We find a range in
// the lookahead info where the set of characters is reasonably constrained. In
// our example this is from index 1 to 2 (0 is not constrained). We can now
// look 3 characters ahead and if we don't find one of [f, o] (the union of
// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
//
// For Unicode input strings we do the same, but modulo 128.
//
// We also look at the first string fed to the regexp and use that to get a hint
// of the character frequencies in the inputs. This affects the assessment of
// whether the set of characters is 'reasonably constrained'.
//
// We also have another lookahead mechanism (called quick check in the code),
// which uses a wide load of multiple characters followed by a mask and compare
// to determine whether a match is possible at this point.
// Two-bit lattice: one bit records "in", the other "out". Having neither bit
// set means nothing is known yet; having both set means unknown (or both).
enum ContainedInLattice {
  kNotYet = 0,
  kLatticeIn = 1,
  kLatticeOut = 2,
  kLatticeUnknown = 3  // Can also mean both in and out.
};

// Joins two lattice values by taking the union of their bits.
inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
  const int joined = static_cast<int>(a) | static_cast<int>(b);
  return static_cast<ContainedInLattice>(joined);
}
// Character information for one lookahead position: a kMapSize-entry bit map,
// the number of bits set in it, and a lattice value for the \w class.
class BoyerMoorePositionInfo : public ZoneObject {
 public:
  static constexpr int kMapSize = 128;
  static constexpr int kMask = kMapSize - 1;
  using Bitset = std::bitset<kMapSize>;

  // Read access to the map.
  bool at(int i) const { return map_[i]; }
  int map_count() const { return map_count_; }
  Bitset raw_bitset() const { return map_; }

  // Mutators; defined out of line.
  void Set(int character);
  void SetInterval(const Interval& interval);
  void SetAll();

  bool is_non_word() { return kLatticeOut == w_; }
  bool is_word() { return kLatticeIn == w_; }

 private:
  Bitset map_;
  int map_count_ = 0;               // Number of set bits in the map.
  ContainedInLattice w_ = kNotYet;  // The \w character class.
};
// Holds one BoyerMoorePositionInfo per lookahead position and offers helpers
// to fill them in; characters above max_char() are ignored or clamped away.
class BoyerMooreLookahead : public ZoneObject {
 public:
  BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);

  int length() { return length_; }
  int max_char() { return max_char_; }
  RegExpCompiler* compiler() { return compiler_; }

  BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
  int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); }

  // Records |character| at position |map_number| unless it exceeds
  // max_char().
  void Set(int map_number, int character) {
    if (character <= max_char_) {
      bitmaps_->at(map_number)->Set(character);
    }
  }

  // Records |interval| at position |map_number|. An interval starting above
  // max_char() is dropped; one ending above it is cut off at max_char().
  void SetInterval(int map_number, const Interval& interval) {
    if (interval.from() > max_char_) return;
    BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
    if (interval.to() <= max_char_) {
      info->SetInterval(interval);
      return;
    }
    info->SetInterval(Interval(interval.from(), max_char_));
  }

  void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); }

  // Applies SetAll to every position from |from_map| up to length().
  void SetRest(int from_map) {
    int map_number = from_map;
    while (map_number < length_) {
      SetAll(map_number);
      map_number++;
    }
  }

  void EmitSkipInstructions(RegExpMacroAssembler* masm);

 private:
  int GetSkipTable(int min_lookahead, int max_lookahead,
                   Handle<ByteArray> boolean_skip_table);
  bool FindWorthwhileInterval(int* from, int* to);
  int FindBestInterval(int max_number_of_chars, int old_biggest_points,
                       int* from, int* to);

  // This is the value obtained by EatsAtLeast. If we do not have at least this
  // many characters left in the sample string then the match is bound to fail.
  // Therefore it is OK to read a character this far ahead of the current match
  // point.
  int length_;
  RegExpCompiler* compiler_;
  // 0xff for Latin1, 0xffff for UTF-16.
  int max_char_;
  ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
};
// There are many ways to generate code for a node. This class encapsulates
// the current way we should be generating. In other words it encapsulates
// the current state of the code generator. The effect of this is that we
// generate code for paths that the matcher can take through the regular
// expression. A given node in the regexp can be code-generated several times
// as it can be part of several traces. For example for the regexp:
// /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
// of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
// to match foo is generated only once (the traces have a common prefix). The
// code to store the capture is deferred and generated (twice) after the places
// where baz has been matched.
class Trace {
public:
// A value for a property that is either known to be true, know to be false,
// or not known.
enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 };
class DeferredAction {
public:
DeferredAction(ActionNode::ActionType action_type, int reg)
: action_type_(action_type), reg_(reg), next_(nullptr) {}
DeferredAction* next() { return next_; }
bool Mentions(int reg);
int reg() { return reg_; }
ActionNode::ActionType action_type() { return action_type_; }
private:
ActionNode::ActionType action_type_;
int reg_;
DeferredAction* next_;
friend class Trace;
};
class DeferredCapture : public DeferredAction {
public:
DeferredCapture(int reg, bool is_capture, Trace* trace)
: DeferredAction(ActionNode::STORE_POSITION, reg),
cp_offset_(trace->cp_offset()),
is_capture_(is_capture) {}
int cp_offset() { return cp_offset_; }
bool is_capture() { return is_capture_; }
private:
int cp_offset_;
bool is_capture_;
void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
};
class DeferredSetRegisterForLoop : public DeferredAction {
public:
DeferredSetRegisterForLoop(int reg, int value)
: DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg),
value_(value) {}
int value() { return value_; }
private:
int value_;
};
class DeferredClearCaptures : public DeferredAction {
public:
explicit DeferredClearCaptures(Interval range)
: DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {}
Interval range() { return range_; }
private:
Interval range_;
};
class DeferredIncrementRegister : public DeferredAction {
public:
explicit DeferredIncrementRegister(int reg)
: DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {}
};
Trace()
: cp_offset_(0),
actions_(nullptr),
backtrack_(nullptr),
stop_node_(nullptr),
loop_label_(nullptr),
characters_preloaded_(0),
bound_checked_up_to_(0),
flush_budget_(100),
at_start_(UNKNOWN) {}
// End the trace. This involves flushing the deferred actions in the trace
// and pushing a backtrack location onto the backtrack stack. Once this is
// done we can start a new trace or go to one that has already been
// generated.
void Flush(RegExpCompiler* compiler, RegExpNode* successor);
int cp_offset() { return cp_offset_; }
DeferredAction* actions() { return actions_; }
// A trivial trace is one that has no deferred actions or other state that
// affects the assumptions used when generating code. There is no recorded
// backtrack location in a trivial trace, so with a trivial trace we will
// generate code that, on a failure to match, gets the backtrack location
// from the backtrack stack rather than using a direct jump instruction. We
// always start code generation with a trivial trace and non-trivial traces
// are created as we emit code for nodes or add to the list of deferred
// actions in the trace. The location of the code generated for a node using
// a trivial trace is recorded in a label in the node so that gotos can be
// generated to that code.
bool is_trivial() {
return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
}
TriBool at_start() { return at_start_; }
void set_at_start(TriBool at_start) { at_start_ = at_start; }
Label* backtrack() { return backtrack_; }
Label* loop_label() { return loop_label_; }
RegExpNode* stop_node() { return stop_node_; }
int characters_preloaded() { return characters_preloaded_; }
int bound_checked_up_to() { return bound_checked_up_to_; }
int flush_budget() { return flush_budget_; }
QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
bool mentions_reg(int reg);
// Returns true if a deferred position store exists to the specified
// register and stores the offset in the out-parameter. Otherwise
// returns false.
bool GetStoredPosition(int reg, int* cp_offset);
// These set methods and AdvanceCurrentPositionInTrace should be used only on
// new traces - the intention is that traces are immutable after creation.
void add_action(DeferredAction* new_action) {
DCHECK(new_action->next_ == nullptr);
new_action->next_ = actions_;
actions_ = new_action;
}
void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
void set_stop_node(RegExpNode* node) { stop_node_ = node; }
void set_loop_label(Label* label) { loop_label_ = label; }
void set_characters_preloaded(int count) { characters_preloaded_ = count; }
void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
void set_flush_budget(int to) { flush_budget_ = to; }
void set_quick_check_performed(QuickCheckDetails* d) {
quick_check_performed_ = *d;
}
void InvalidateCurrentCharacter();
void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
private:
int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone);
void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register,
const DynamicBitSet& affected_registers,
DynamicBitSet* registers_to_pop,
DynamicBitSet* registers_to_clear, Zone* zone);
void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register,
const DynamicBitSet& registers_to_pop,
const DynamicBitSet& registers_to_clear);
int cp_offset_;
DeferredAction* actions_;
Label* backtrack_;
RegExpNode* stop_node_;
Label* loop_label_;
int characters_preloaded_;
int bound_checked_up_to_;
QuickCheckDetails quick_check_performed_;
int flush_budget_;
TriBool at_start_;
};
// Bundles a Label with a Trace and hands out pointers to both. The
// constructor is defined out of line, so how |not_at_start| influences the
// contained trace is not visible here.
class GreedyLoopState {
public:
explicit GreedyLoopState(bool not_at_start);
// Pointer to the label owned by this object.
Label* label() { return &label_; }
// Pointer to the trace owned by this object.
Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
private:
Label label_;
Trace counter_backtrack_trace_;
};
// Plain record of character-preload state. init() only marks eats_at_least_
// as not yet computed; the other fields are left untouched.
struct PreloadState {
  // Sentinel stored in eats_at_least_ by init().
  static const int kEatsAtLeastNotYetInitialized = -1;

  bool preload_is_current_;
  bool preload_has_checked_bounds_;
  int preload_characters_;
  int eats_at_least_;

  void init() { this->eats_at_least_ = kEatsAtLeastNotYetInitialized; }
};
// Analysis performs assertion propagation and computes eats_at_least_ values.
// See the comments on AssertionPropagator and EatsAtLeastPropagator for more
// details.
RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node);
// Counts how often characters occur, bucketed by
// (character & RegExpMacroAssembler::kTableMask).
class FrequencyCollator {
 public:
  // Gives every bucket a zero count and records its index as its character.
  FrequencyCollator() : total_samples_(0) {
    int character = 0;
    while (character < RegExpMacroAssembler::kTableSize) {
      frequencies_[character] = CharacterFrequency(character);
      character++;
    }
  }

  // Adds one sample for |character| to its bucket.
  void CountCharacter(int character) {
    frequencies_[character & RegExpMacroAssembler::kTableMask].Increment();
    total_samples_++;
  }

  // Does not measure in percent, but rather per-128 (the table size from the
  // regexp macro assembler). |in_character| must already be a bucket index.
  int Frequency(int in_character) {
    DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
    // Without samples there is nothing to divide by; report 1.
    if (total_samples_ < 1) return 1;
    const int scaled_count = frequencies_[in_character].counter() * 128;
    return scaled_count / total_samples_;
  }

 private:
  // One bucket: the character it stands for and how often it was counted.
  class CharacterFrequency {
   public:
    CharacterFrequency() : counter_(0), character_(-1) {}
    explicit CharacterFrequency(int character)
        : counter_(0), character_(character) {}

    void Increment() { counter_++; }
    int counter() { return counter_; }
    int character() { return character_; }

   private:
    int counter_;
    int character_;
  };

  CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
  int total_samples_;
};
// Holds the state shared while compiling one regexp: register allocation,
// the work list of nodes, recursion accounting and assorted settings.
class RegExpCompiler {
 public:
  static const int kNoRegister = -1;
  static const int kMaxRecursion = 100;
  static const int kImplementationOffset = 0;
  static const int kNumberOfRegistersOffset = 0;
  static const int kCodeOffset = 1;

  // Outcome of Assemble(): either an error, or code plus a register count.
  struct CompilationResult final {
    explicit CompilationResult(RegExpError err) : error(err) {}
    CompilationResult(Object code, int registers)
        : code(code), num_registers(registers) {}

    static CompilationResult RegExpTooBig() {
      return CompilationResult(RegExpError::kTooLarge);
    }

    bool Succeeded() const { return error == RegExpError::kNone; }

    const RegExpError error = RegExpError::kNone;
    Object code;
    int num_registers = 0;
  };

  RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
                 bool is_one_byte);

  // Hands out the next register index. Once
  // RegExpMacroAssembler::kMaxRegister has been reached the regexp is marked
  // as too big and the same index is returned without advancing.
  int AllocateRegister() {
    const int allocated = next_register_;
    if (allocated < RegExpMacroAssembler::kMaxRegister) {
      next_register_++;
    } else {
      reg_exp_too_big_ = true;
    }
    return allocated;
  }

  // Lookarounds to match lone surrogates for unicode character class matches
  // are never nested. We can therefore reuse registers: each of the two
  // registers below is allocated on first use and then handed out again.
  int UnicodeLookaroundStackRegister() {
    if (unicode_lookaround_stack_register_ != kNoRegister) {
      return unicode_lookaround_stack_register_;
    }
    unicode_lookaround_stack_register_ = AllocateRegister();
    return unicode_lookaround_stack_register_;
  }
  int UnicodeLookaroundPositionRegister() {
    if (unicode_lookaround_position_register_ != kNoRegister) {
      return unicode_lookaround_position_register_;
    }
    unicode_lookaround_position_register_ = AllocateRegister();
    return unicode_lookaround_position_register_;
  }

  CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler,
                             RegExpNode* start, int capture_count,
                             Handle<String> pattern);

  // If the regexp matching starts within a surrogate pair, step back to the
  // lead surrogate and start matching from there.
  static RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
                                                       RegExpNode* on_success,
                                                       JSRegExp::Flags flags);

  // Queues |node| unless it is already queued or its label is already bound.
  void AddWork(RegExpNode* node) {
    if (node->on_work_list()) return;
    if (node->label()->is_bound()) return;
    node->set_on_work_list(true);
    work_list_->push_back(node);
  }

  RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
  EndNode* accept() { return accept_; }

  int recursion_depth() { return recursion_depth_; }
  void IncrementRecursionDepth() { recursion_depth_++; }
  void DecrementRecursionDepth() { recursion_depth_--; }

  void SetRegExpTooBig() { reg_exp_too_big_ = true; }

  bool one_byte() { return one_byte_; }
  bool optimize() { return optimize_; }
  void set_optimize(bool value) { optimize_ = value; }
  bool limiting_recursion() { return limiting_recursion_; }
  void set_limiting_recursion(bool value) { limiting_recursion_ = value; }
  bool read_backward() { return read_backward_; }
  void set_read_backward(bool value) { read_backward_ = value; }
  FrequencyCollator* frequency_collator() { return &frequency_collator_; }

  int current_expansion_factor() { return current_expansion_factor_; }
  void set_current_expansion_factor(int value) {
    current_expansion_factor_ = value;
  }

  Isolate* isolate() const { return isolate_; }
  Zone* zone() const { return zone_; }

 private:
  EndNode* accept_;
  int next_register_;
  int unicode_lookaround_stack_register_;
  int unicode_lookaround_position_register_;
  ZoneVector<RegExpNode*>* work_list_;
  int recursion_depth_;
  RegExpMacroAssembler* macro_assembler_;
  bool one_byte_;
  bool reg_exp_too_big_;
  bool limiting_recursion_;
  bool optimize_;
  bool read_backward_;
  int current_expansion_factor_;
  FrequencyCollator frequency_collator_;
  Isolate* isolate_;
  Zone* zone_;
};
// Sorts character ranges into four buckets: BMP, lead surrogates, trail
// surrogates and non-BMP. Each bucket is exposed through a read-only
// accessor.
class UnicodeRangeSplitter {
 public:
  static constexpr int kInitialSize = 8;
  using CharacterRangeVector = base::SmallVector<CharacterRange, kInitialSize>;

  V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList<CharacterRange>* base);

  const CharacterRangeVector* bmp() const { return &bmp_; }
  const CharacterRangeVector* lead_surrogates() const {
    return &lead_surrogates_;
  }
  const CharacterRangeVector* trail_surrogates() const {
    return &trail_surrogates_;
  }
  const CharacterRangeVector* non_bmp() const { return &non_bmp_; }

 private:
  void AddRange(CharacterRange range);

  CharacterRangeVector bmp_;
  CharacterRangeVector lead_surrogates_;
  CharacterRangeVector trail_surrogates_;
  CharacterRangeVector non_bmp_;
};
// We need to check for the following characters: 0x39C 0x3BC 0x178.
// TODO(jgruber): Move to CharacterRange.
bool RangeContainsLatin1Equivalents(CharacterRange range);
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_COMPILER_H_

View File

@ -0,0 +1,252 @@
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "regexp/regexp-dotprinter.h"
#include "regexp/regexp-compiler.h"
namespace v8 {
namespace internal {
// -------------------------------------------------------------------
// Dot/dotty output
#ifdef DEBUG
// NodeVisitor that writes a regexp node graph to a stream in Graphviz DOT
// syntax. The stream is held by reference, so it must outlive the printer.
class DotPrinterImpl : public NodeVisitor {
public:
explicit DotPrinterImpl(std::ostream& os) : os_(os) {}
// Writes a complete "digraph" for the graph rooted at |node|, labelled with
// |label|.
void PrintNode(const char* label, RegExpNode* node);
// Visits |node| unless its info already has the visited flag set.
void Visit(RegExpNode* node);
// Writes the attribute record attached to node |from|.
void PrintAttributes(RegExpNode* from);
// Writes a dotted edge from |from| to |to| and then visits |to|.
void PrintOnFailure(RegExpNode* from, RegExpNode* to);
// One Visit<Type>(<Type>Node*) declaration per entry of FOR_EACH_NODE_TYPE.
#define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that);
FOR_EACH_NODE_TYPE(DECLARE_VISIT)
#undef DECLARE_VISIT
private:
std::ostream& os_;
};
// Writes a complete Graphviz "digraph" for the node graph rooted at |node|.
//
// |label| is emitted as the quoted graph label. Backslashes and double quotes
// inside it are escaped so that they cannot terminate the quoted string early
// or be misread as an escape sequence by dot.
void DotPrinterImpl::PrintNode(const char* label, RegExpNode* node) {
  os_ << "digraph G {\n graph [label=\"";
  for (const char* cursor = label; *cursor != '\0'; ++cursor) {
    switch (*cursor) {
      case '\\':
        os_ << "\\\\";
        break;
      case '"':
        // A bare quote would close the label string; dot expects \" here.
        os_ << "\\\"";
        break;
      default:
        os_ << *cursor;
        break;
    }
  }
  os_ << "\"];\n";
  Visit(node);
  // Flush deliberately: this is the end of one complete graph dump.
  os_ << "}" << std::endl;
}
// Visits |node| at most once: the node's info record carries a "visited"
// flag which is set before dispatching to the node-type-specific visitor.
void DotPrinterImpl::Visit(RegExpNode* node) {
  NodeInfo* info = node->info();
  if (!info->visited) {
    info->visited = true;
    node->Accept(this);
  }
}
// Draws a dotted edge from |from| to its failure target and then continues
// the traversal at that target.
void DotPrinterImpl::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
  os_ << " n" << from;
  os_ << " -> n" << on_failure;
  os_ << " [style=dotted];\n";
  Visit(on_failure);
}
// Writes record-style attribute fields to a stream, placing a "|" between
// consecutive fields (but not before the first one).
class AttributePrinter {
 public:
  explicit AttributePrinter(std::ostream& os)  // NOLINT
      : os_(os), first_(true) {}

  // Emits the "|" separator for every field except the first.
  void PrintSeparator() {
    if (!first_) {
      os_ << "|";
      return;
    }
    first_ = false;
  }

  // Emits "{name}" when |value| is true; emits nothing otherwise.
  void PrintBit(const char* name, bool value) {
    if (value) {
      PrintSeparator();
      os_ << "{" << name << "}";
    }
  }

  // Emits "{name|value}" for non-negative |value|; negative values are
  // skipped entirely.
  void PrintPositive(const char* name, int value) {
    if (value >= 0) {
      PrintSeparator();
      os_ << "{" << name << "|" << value << "}";
    }
  }

 private:
  std::ostream& os_;
  bool first_;
};
// Writes a grey attribute record node ("a<ptr>") for |that| and links it to
// the node itself ("n<ptr>") with a dashed, arrowless edge. The record lists
// the node's follows_*_interest flags and, if the label is bound, its
// position.
void DotPrinterImpl::PrintAttributes(RegExpNode* that) {
  os_ << " a" << that
      << " [shape=Mrecord, color=grey, fontcolor=grey, "
         "margin=0.1, fontsize=10, label=\"{";

  AttributePrinter fields(os_);
  NodeInfo* node_info = that->info();
  fields.PrintBit("NI", node_info->follows_newline_interest);
  fields.PrintBit("WI", node_info->follows_word_interest);
  fields.PrintBit("SI", node_info->follows_start_interest);

  Label* node_label = that->label();
  if (node_label->is_bound()) {
    fields.PrintPositive("@", node_label->pos());
  }

  os_ << "}\"];\n";
  os_ << " a" << that << " -> n" << that;
  os_ << " [style=dashed, color=grey, arrowhead=none];\n";
}
// Prints a choice node: first the node itself, then one edge per
// alternative, then the alternatives' own subgraphs.
//
// Each edge statement is terminated with ";\n" like every other edge this
// printer emits. Alternatives are traversed through Visit() rather than by
// calling Accept() directly, so the visited flag is honoured: a subgraph
// reachable along several paths is printed once, and a cycle that passes
// only through choice alternatives cannot recurse without bound.
void DotPrinterImpl::VisitChoice(ChoiceNode* that) {
  os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
  for (int i = 0; i < that->alternatives()->length(); i++) {
    GuardedAlternative alt = that->alternatives()->at(i);
    os_ << " n" << that << " -> n" << alt.node() << ";\n";
  }
  for (int i = 0; i < that->alternatives()->length(); i++) {
    GuardedAlternative alt = that->alternatives()->at(i);
    Visit(alt.node());
  }
}
// Loop choice nodes are rendered exactly like plain choice nodes.
void DotPrinterImpl::VisitLoopChoice(LoopChoiceNode* that) {
  this->VisitChoice(that);
}
// Negative lookaround choice nodes are rendered exactly like plain choice
// nodes.
void DotPrinterImpl::VisitNegativeLookaroundChoice(
    NegativeLookaroundChoiceNode* that) {
  this->VisitChoice(that);
}
// Prints a text node as a double-bordered box whose label lists its elements
// separated by spaces: atoms as their characters (each narrowed to char),
// character classes as "[...]" with an optional leading "^" and one
// "from-to" pair per range. Then prints the node's attributes and continues
// with its successor.
void DotPrinterImpl::VisitText(TextNode* that) {
  Zone* zone = that->zone();
  os_ << " n" << that << " [label=\"";
  for (int index = 0; index < that->elements()->length(); index++) {
    if (index != 0) os_ << " ";
    TextElement element = that->elements()->at(index);
    switch (element.text_type()) {
      case TextElement::ATOM: {
        Vector<const uc16> chars = element.atom()->data();
        for (int k = 0; k < chars.length(); k++) {
          const char narrowed = static_cast<char>(chars[k]);
          os_ << narrowed;
        }
        break;
      }
      case TextElement::CHAR_CLASS: {
        RegExpCharacterClass* char_class = element.char_class();
        os_ << "[";
        if (char_class->is_negated()) {
          os_ << "^";
        }
        for (int k = 0; k < char_class->ranges(zone)->length(); k++) {
          CharacterRange range = char_class->ranges(zone)->at(k);
          os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
        }
        os_ << "]";
        break;
      }
      default:
        UNREACHABLE();
    }
  }
  os_ << "\", shape=box, peripheries=2];\n";
  PrintAttributes(that);

  RegExpNode* successor = that->on_success();
  os_ << " n" << that << " -> n" << successor << ";\n";
  Visit(successor);
}
// Prints a back-reference node labelled "$<start>..$<end>" with its
// attributes, then follows the success edge.
void DotPrinterImpl::VisitBackReference(BackReferenceNode* that) {
  os_ << " n" << that << " [label=\"$" << that->start_register();
  os_ << "..$" << that->end_register() << "\", shape=doubleoctagon];\n";
  PrintAttributes(that);

  RegExpNode* successor = that->on_success();
  os_ << " n" << that << " -> n" << successor << ";\n";
  Visit(successor);
}
// Prints an end node as a bold point together with its attributes. End
// nodes have no outgoing edge to follow.
void DotPrinterImpl::VisitEnd(EndNode* that) {
  os_ << " n" << that;
  os_ << " [style=bold, shape=point];\n";
  PrintAttributes(that);
}
// Prints an assertion node as a septagon labelled with the regexp syntax of
// the assertion, followed by its attributes and its success edge. An
// assertion type not listed below yields an empty attribute list.
void DotPrinterImpl::VisitAssertion(AssertionNode* that) {
  // All assertion kinds share one shape; only the label text differs.
  const char* label_text = nullptr;
  switch (that->assertion_type()) {
    case AssertionNode::AT_END:
      label_text = "$";
      break;
    case AssertionNode::AT_START:
      label_text = "^";
      break;
    case AssertionNode::AT_BOUNDARY:
      label_text = "\\b";
      break;
    case AssertionNode::AT_NON_BOUNDARY:
      label_text = "\\B";
      break;
    case AssertionNode::AFTER_NEWLINE:
      label_text = "(?<=\\n)";
      break;
  }

  os_ << " n" << that << " [";
  if (label_text != nullptr) {
    os_ << "label=\"" << label_text << "\", shape=septagon";
  }
  os_ << "];\n";
  PrintAttributes(that);

  RegExpNode* next = that->on_success();
  os_ << " n" << that << " -> n" << next << ";\n";
  Visit(next);
}
// Prints an action node. The label describes the action using the payload
// stored in the node's data_ member for that action type; register-style
// actions are octagons, the rest are septagons. Afterwards the attributes
// and the success edge are printed.
void DotPrinterImpl::VisitAction(ActionNode* that) {
  const auto& payload = that->data_;
  os_ << " n" << that << " [";
  switch (that->action_type_) {
    case ActionNode::SET_REGISTER_FOR_LOOP:
      os_ << "label=\"$" << payload.u_store_register.reg;
      os_ << ":=" << payload.u_store_register.value << "\", shape=octagon";
      break;
    case ActionNode::INCREMENT_REGISTER:
      os_ << "label=\"$" << payload.u_increment_register.reg;
      os_ << "++\", shape=octagon";
      break;
    case ActionNode::STORE_POSITION:
      os_ << "label=\"$" << payload.u_position_register.reg;
      os_ << ":=$pos\", shape=octagon";
      break;
    case ActionNode::BEGIN_SUBMATCH:
      os_ << "label=\"$" << payload.u_submatch.current_position_register;
      os_ << ":=$pos,begin\", shape=septagon";
      break;
    case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
      os_ << "label=\"escape\", shape=septagon";
      break;
    case ActionNode::EMPTY_MATCH_CHECK:
      os_ << "label=\"$" << payload.u_empty_match_check.start_register;
      os_ << "=$pos?,$" << payload.u_empty_match_check.repetition_register;
      os_ << "<" << payload.u_empty_match_check.repetition_limit;
      os_ << "?\", shape=septagon";
      break;
    case ActionNode::CLEAR_CAPTURES: {
      os_ << "label=\"clear $" << payload.u_clear_captures.range_from;
      os_ << " to $" << payload.u_clear_captures.range_to;
      os_ << "\", shape=septagon";
      break;
    }
  }
  os_ << "];\n";
  PrintAttributes(that);

  RegExpNode* next = that->on_success();
  os_ << " n" << that << " -> n" << next << ";\n";
  Visit(next);
}
#endif // DEBUG
// Dumps the graph rooted at |node| to stdout in dot format, using |label| as
// the graph label. Outside DEBUG builds the body is compiled out and this
// does nothing.
void DotPrinter::DotPrint(const char* label, RegExpNode* node) {
#ifdef DEBUG
  StdoutStream stdout_stream;
  DotPrinterImpl impl(stdout_stream);
  impl.PrintNode(label, node);
#endif  // DEBUG
}
} // namespace internal
} // namespace v8

View File

@ -0,0 +1,23 @@
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_DOTPRINTER_H_
#define V8_REGEXP_REGEXP_DOTPRINTER_H_
#include "regexp/regexp-shim.h"
namespace v8 {
namespace internal {
class RegExpNode;
// Static-only entry point for dumping a regexp node graph in dot format.
class DotPrinter final : public AllStatic {
public:
// Prints the graph rooted at |node|; |label| is used as the graph label.
static void DotPrint(const char* label, RegExpNode* node);
};
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_DOTPRINTER_H_

View File

@ -0,0 +1,22 @@
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "regexp/regexp-error.h"
namespace v8 {
namespace internal {
// Message strings generated from REGEXP_ERROR_MESSAGES: the TEMPLATE macro
// keeps only the STRING half of every entry, so the array holds the messages
// in list order.
const char* kRegExpErrorStrings[] = {
#define TEMPLATE(NAME, STRING) STRING,
REGEXP_ERROR_MESSAGES(TEMPLATE)
#undef TEMPLATE
};
// Returns the message string for |error|. The DCHECK verifies that |error|
// is a real error code rather than the NumErrors sentinel or beyond.
const char* RegExpErrorString(RegExpError error) {
  DCHECK_LT(error, RegExpError::NumErrors);
  const int index = static_cast<int>(error);
  return kRegExpErrorStrings[index];
}
} // namespace internal
} // namespace v8

View File

@ -0,0 +1,56 @@
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_ERROR_H_
#define V8_REGEXP_REGEXP_ERROR_H_
namespace v8 {
namespace internal {
// X-macro listing every regexp error as T(Name, "message"). Callers pass a
// macro T that picks the part it needs from each entry; the order of the
// entries therefore determines the order of anything generated from the
// list.
#define REGEXP_ERROR_MESSAGES(T) \
T(None, "") \
T(StackOverflow, "Maximum call stack size exceeded") \
T(AnalysisStackOverflow, "Stack overflow") \
T(TooLarge, "Regular expression too large") \
T(UnterminatedGroup, "Unterminated group") \
T(UnmatchedParen, "Unmatched ')'") \
T(EscapeAtEndOfPattern, "\\ at end of pattern") \
T(InvalidPropertyName, "Invalid property name") \
T(InvalidEscape, "Invalid escape") \
T(InvalidDecimalEscape, "Invalid decimal escape") \
T(InvalidUnicodeEscape, "Invalid Unicode escape") \
T(NothingToRepeat, "Nothing to repeat") \
T(LoneQuantifierBrackets, "Lone quantifier brackets") \
T(RangeOutOfOrder, "numbers out of order in {} quantifier") \
T(IncompleteQuantifier, "Incomplete quantifier") \
T(InvalidQuantifier, "Invalid quantifier") \
T(InvalidGroup, "Invalid group") \
T(MultipleFlagDashes, "Multiple dashes in flag group") \
T(RepeatedFlag, "Repeated flag in flag group") \
T(InvalidFlagGroup, "Invalid flag group") \
T(TooManyCaptures, "Too many captures") \
T(InvalidCaptureGroupName, "Invalid capture group name") \
T(DuplicateCaptureGroupName, "Duplicate capture group name") \
T(InvalidNamedReference, "Invalid named reference") \
T(InvalidNamedCaptureReference, "Invalid named capture referenced") \
T(InvalidClassEscape, "Invalid class escape") \
T(InvalidClassPropertyName, "Invalid property name in character class") \
T(InvalidCharacterClass, "Invalid character class") \
T(UnterminatedCharacterClass, "Unterminated character class") \
T(OutOfOrderCharacterClass, "Range out of order in character class")
// Error codes generated from REGEXP_ERROR_MESSAGES: every entry Name becomes
// an enumerator kName, in list order. NumErrors follows the generated
// enumerators, so its value equals the number of entries in the list.
enum class RegExpError : uint32_t {
#define TEMPLATE(NAME, STRING) k##NAME,
REGEXP_ERROR_MESSAGES(TEMPLATE)
#undef TEMPLATE
NumErrors
};
V8_EXPORT_PRIVATE const char* RegExpErrorString(RegExpError error);
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_ERROR_H_

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,61 @@
// Copyright 2011 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// A simple interpreter for the Irregexp byte code.
#ifndef V8_REGEXP_REGEXP_INTERPRETER_H_
#define V8_REGEXP_REGEXP_INTERPRETER_H_
#include "regexp/regexp.h"
namespace v8 {
namespace internal {
// Static-only entry points of the Irregexp bytecode interpreter.
class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic {
public:
// Outcome of a match attempt; each value aliases the corresponding
// RegExp::kInternalRegExp* constant.
enum Result {
FAILURE = RegExp::kInternalRegExpFailure,
SUCCESS = RegExp::kInternalRegExpSuccess,
EXCEPTION = RegExp::kInternalRegExpException,
RETRY = RegExp::kInternalRegExpRetry,
};
// In case a StackOverflow occurs, a StackOverflowException is created and
// EXCEPTION is returned.
static Result MatchForCallFromRuntime(Isolate* isolate,
Handle<JSRegExp> regexp,
Handle<String> subject_string,
int* registers, int registers_length,
int start_position);
// In case a StackOverflow occurs, EXCEPTION is returned. The caller is
// responsible for creating the exception.
// RETRY is returned if a retry through the runtime is needed (e.g. when
// interrupts have been scheduled or the regexp is marked for tier-up).
// Arguments input_start, input_end and backtrack_stack are
// unused. They are only passed to match the signature of the native irregexp
// code.
static Result MatchForCallFromJs(Address subject, int32_t start_position,
Address input_start, Address input_end,
int* registers, int32_t registers_length,
Address backtrack_stack,
RegExp::CallOrigin call_origin,
Isolate* isolate, Address regexp);
// Variant that takes the bytecode array and subject string directly, plus an
// explicit backtrack limit.
// NOTE(review): exact semantics of |backtrack_limit| are not visible in this
// header — confirm against the definition.
static Result MatchInternal(Isolate* isolate, ByteArray code_array,
String subject_string, int* registers,
int registers_length, int start_position,
RegExp::CallOrigin call_origin,
uint32_t backtrack_limit);
private:
static Result Match(Isolate* isolate, JSRegExp regexp, String subject_string,
int* registers, int registers_length, int start_position,
RegExp::CallOrigin call_origin);
};
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_INTERPRETER_H_

View File

@ -0,0 +1,291 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
* vim: set ts=8 sts=2 et sw=2 tw=80:
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This file implements the NativeRegExpMacroAssembler interface for
// SpiderMonkey. It provides the same interface as each of V8's
// architecture-specific implementations.
#ifndef RegexpMacroAssemblerArch_h
#define RegexpMacroAssemblerArch_h
#include "jit/MacroAssembler.h"
#include "regexp/regexp-macro-assembler.h"
namespace v8 {
namespace internal {
// Fixed-layout block of per-invocation data. SMRegExpMacroAssembler
// addresses its fields relative to the stack pointer using offsetof, and
// places the regexp registers directly after it (at sizeof(FrameData)).
struct FrameData {
// Character position at the start of the input, stored as a
// negative offset from the end of the string (input_end_pointer_).
size_t inputStart;
// The backtrack_stack_pointer_ register points to the top of the stack.
// This points to the bottom of the backtrack stack.
void* backtrackStackBase;
// Copy of the input MatchPairs.
int32_t* matches; // pointer to capture array
int32_t numMatches; // size of capture array
};
// SpiderMonkey's implementation of NativeRegExpMacroAssembler. It holds a
// reference to a js::jit::StackMacroAssembler (masm_) and a JSContext, and
// declares the same virtual operations as V8's architecture-specific
// assemblers.
class SMRegExpMacroAssembler final : public NativeRegExpMacroAssembler {
public:
SMRegExpMacroAssembler(JSContext* cx, Isolate* isolate,
js::jit::StackMacroAssembler& masm, Zone* zone,
Mode mode, uint32_t num_capture_registers);
virtual ~SMRegExpMacroAssembler() {} // Nothing to do here
// Virtual operations of the macro-assembler interface.
virtual int stack_limit_slack();
virtual IrregexpImplementation Implementation();
virtual bool Succeed();
virtual void Fail();
virtual void AdvanceCurrentPosition(int by);
virtual void PopCurrentPosition();
virtual void PushCurrentPosition();
virtual void SetCurrentPositionFromEnd(int by);
virtual void Backtrack();
virtual void Bind(Label* label);
virtual void GoTo(Label* label);
virtual void PushBacktrack(Label* label);
virtual void CheckCharacter(uint32_t c, Label* on_equal);
virtual void CheckNotCharacter(uint32_t c, Label* on_not_equal);
virtual void CheckCharacterGT(uc16 limit, Label* on_greater);
virtual void CheckCharacterLT(uc16 limit, Label* on_less);
virtual void CheckCharacterAfterAnd(uint32_t c, uint32_t mask,
Label* on_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c, uint32_t mask,
Label* on_not_equal);
virtual void CheckNotCharacterAfterMinusAnd(uc16 c, uc16 minus, uc16 mask,
Label* on_not_equal);
virtual void CheckGreedyLoop(Label* on_tos_equals_current_position);
virtual void CheckCharacterInRange(uc16 from, uc16 to, Label* on_in_range);
virtual void CheckCharacterNotInRange(uc16 from, uc16 to,
Label* on_not_in_range);
virtual void CheckAtStart(int cp_offset, Label* on_at_start);
virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start);
virtual void CheckPosition(int cp_offset, Label* on_outside_input);
virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set);
virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match);
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward,
Label* on_no_match);
virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
bool check_bounds, int characters,
int eats_at_least);
virtual void AdvanceRegister(int reg, int by);
virtual void IfRegisterGE(int reg, int comparand, Label* if_ge);
virtual void IfRegisterLT(int reg, int comparand, Label* if_lt);
virtual void IfRegisterEqPos(int reg, Label* if_eq);
virtual void PopRegister(int register_index);
virtual void PushRegister(int register_index,
StackCheckFlag check_stack_limit);
virtual void ReadCurrentPositionFromRegister(int reg);
virtual void WriteCurrentPositionToRegister(int reg, int cp_offset);
virtual void ReadStackPointerFromRegister(int reg);
virtual void WriteStackPointerToRegister(int reg);
virtual void SetRegister(int register_index, int to);
virtual void ClearRegisters(int reg_from, int reg_to);
virtual Handle<HeapObject> GetCode(Handle<String> source);
private:
size_t frameSize_ = 0;
// NOTE(review): the helpers below presumably emit the individual sections
// of the generated code — confirm against their definitions.
void createStackFrame();
void initFrameAndRegs();
void successHandler();
void exitHandler();
void backtrackHandler();
void stackOverflowHandler();
// Push a register on the backtrack stack.
void Push(js::jit::Register value);
// Pop a value from the backtrack stack.
void Pop(js::jit::Register target);
// Shared implementations behind the public Check* pairs; the extra
// condition/flag argument selects the variant.
void CheckAtStartImpl(int cp_offset, Label* on_cond,
js::jit::Assembler::Condition cond);
void CheckCharacterImpl(js::jit::Imm32 c, Label* on_cond,
js::jit::Assembler::Condition cond);
void CheckCharacterAfterAndImpl(uint32_t c, uint32_t and_with, Label* on_cond,
bool negate);
void CheckCharacterInRangeImpl(uc16 from, uc16 to, Label* on_cond,
js::jit::Assembler::Condition cond);
void CheckNotBackReferenceImpl(int start_reg, bool read_backward,
Label* on_no_match, bool ignore_case);
void LoadCurrentCharacterUnchecked(int cp_offset, int characters);
void JumpOrBacktrack(Label* to);
// MacroAssembler methods that take a Label can be called with a
// null label, which means that we should backtrack if we would jump
// to that label. This is a helper to avoid writing out the same
// logic a dozen times.
inline js::jit::Label* LabelOrBacktrack(Label* to) {
return to ? to->inner() : &backtrack_label_;
}
void CheckBacktrackStackLimit();
static bool GrowBacktrackStack(RegExpStack* regexp_stack);
static uint32_t CaseInsensitiveCompareStrings(const char16_t* substring1,
const char16_t* substring2,
size_t byteLength);
static uint32_t CaseInsensitiveCompareUCStrings(const char16_t* substring1,
const char16_t* substring2,
size_t byteLength);
// Numeric value of mode_, used as the size of one character.
inline int char_size() { return static_cast<int>(mode_); }
// Address scale matching the character width: TimesTwo for UC16 mode,
// TimesOne otherwise.
inline js::jit::Scale factor() {
return mode_ == UC16 ? js::jit::TimesTwo : js::jit::TimesOne;
}
// Stack-pointer-relative addresses of the individual FrameData fields.
js::jit::Address inputStart() {
return js::jit::Address(masm_.getStackPointer(),
offsetof(FrameData, inputStart));
}
js::jit::Address backtrackStackBase() {
return js::jit::Address(masm_.getStackPointer(),
offsetof(FrameData, backtrackStackBase));
}
js::jit::Address matches() {
return js::jit::Address(masm_.getStackPointer(),
offsetof(FrameData, matches));
}
js::jit::Address numMatches() {
return js::jit::Address(masm_.getStackPointer(),
offsetof(FrameData, numMatches));
}
// The stack-pointer-relative location of a regexp register.
js::jit::Address register_location(int register_index) {
return js::jit::Address(masm_.getStackPointer(),
register_offset(register_index));
}
// Byte offset of a regexp register from the stack pointer. Registers are
// laid out after FrameData, one pointer-sized slot each. As a side effect,
// num_registers_ is raised so that it covers the highest index requested so
// far.
int32_t register_offset(int register_index) {
MOZ_ASSERT(register_index >= 0 && register_index <= kMaxRegister);
if (num_registers_ <= register_index) {
num_registers_ = register_index + 1;
}
static_assert(alignof(uintptr_t) <= alignof(FrameData));
return sizeof(FrameData) + register_index * sizeof(uintptr_t*);
}
JSContext* cx_;
js::jit::StackMacroAssembler& masm_;
/*
* This assembler uses the following registers:
*
* - current_character_:
* Contains the character (or characters) currently being examined.
* Must be loaded using LoadCurrentCharacter before using any of the
* dispatch methods. After a matching pass for a global regexp,
* temporarily stores the index of capture start.
* - current_position_:
* Current position in input *as negative byte offset from end of string*.
* - input_end_pointer_:
* Points to byte after last character in the input. current_position_ is
* relative to this.
* - backtrack_stack_pointer_:
* Points to tip of the (heap-allocated) backtrack stack. The stack grows
* downward (like the native stack).
* - temp0_, temp1_, temp2_:
* Scratch registers.
*
* The native stack pointer is used to access arguments (InputOutputData),
* local variables (FrameData), and irregexp's internal virtual registers
* (see register_location).
*/
js::jit::Register current_character_;
js::jit::Register current_position_;
js::jit::Register input_end_pointer_;
js::jit::Register backtrack_stack_pointer_;
js::jit::Register temp0_, temp1_, temp2_;
js::jit::Label entry_label_;
js::jit::Label start_label_;
js::jit::Label backtrack_label_;
js::jit::Label success_label_;
js::jit::Label exit_label_;
js::jit::Label stack_overflow_label_;
js::jit::Label exit_with_exception_label_;
// When we generate the code to push a backtrack label's address
// onto the backtrack stack, we don't know its final address. We
// have to patch it after linking. This is slightly delicate, as the
// Label itself (which is allocated on the stack) may not exist by
// the time we link. The approach is as follows:
//
// 1. When we push a label on the backtrack stack (PushBacktrack),
// we bind the label's patchOffset_ field to the offset within
// the code that should be overwritten. This works because each
// label is only pushed by a single instruction.
//
// 2. When we bind a label (Bind), we check to see if it has a
// bound patchOffset_. If it does, we create a LabelPatch mapping
// its patch offset to the offset of the label itself.
//
// 3. While linking the code, we walk the list of label patches
// and patch the code accordingly.
class LabelPatch {
public:
LabelPatch(js::jit::CodeOffset patchOffset, size_t labelOffset)
: patchOffset_(patchOffset), labelOffset_(labelOffset) {}
js::jit::CodeOffset patchOffset_;
size_t labelOffset_ = 0;
};
js::Vector<LabelPatch, 4, js::SystemAllocPolicy> labelPatches_;
// Appends a label patch; an allocation failure crashes instead of being
// reported to the caller.
void AddLabelPatch(js::jit::CodeOffset patchOffset, size_t labelOffset) {
js::AutoEnterOOMUnsafeRegion oomUnsafe;
if (!labelPatches_.emplaceBack(patchOffset, labelOffset)) {
oomUnsafe.crash("Irregexp label patch");
}
}
Mode mode_;
int num_registers_;
int num_capture_registers_;
js::jit::LiveGeneralRegisterSet savedRegisters_;
public:
using TableVector =
js::Vector<PseudoHandle<ByteArrayData>, 4, js::SystemAllocPolicy>;
// Gives callers mutable access to the tables collected by AddTable.
TableVector& tables() { return tables_; }
private:
TableVector tables_;
// Takes ownership of |table| by moving it into tables_; an allocation
// failure crashes instead of being reported to the caller.
void AddTable(PseudoHandle<ByteArrayData> table) {
js::AutoEnterOOMUnsafeRegion oomUnsafe;
if (!tables_.append(std::move(table))) {
oomUnsafe.crash("Irregexp table append");
}
}
};
} // namespace internal
} // namespace v8
#endif // RegexpMacroAssemblerArch_h

View File

@ -0,0 +1,418 @@
// Copyright 2012 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "regexp/regexp-macro-assembler-tracer.h"
namespace v8 {
namespace internal {
// Wraps |assembler| and announces which implementation is being traced. The
// name table below is indexed by the wrapped assembler's Implementation()
// value; the DCHECK guards the table bounds in debug builds.
RegExpMacroAssemblerTracer::RegExpMacroAssemblerTracer(
    Isolate* isolate, RegExpMacroAssembler* assembler)
    : RegExpMacroAssembler(isolate, assembler->zone()), assembler_(assembler) {
  const IrregexpImplementation impl = assembler->Implementation();
  DCHECK_LT(impl, 9);
  const char* names[] = {"IA32", "ARM", "ARM64", "MIPS", "S390",
                         "PPC",  "X64", "X87",   "Bytecode"};
  const char* impl_name = names[impl];
  PrintF("RegExpMacroAssembler%s();\n", impl_name);
}
// Defaulted destructor, defined out of line.
RegExpMacroAssemblerTracer::~RegExpMacroAssemblerTracer() = default;
// Logs the abort and passes it on to the wrapped assembler.
void RegExpMacroAssemblerTracer::AbortedCodeGeneration() {
  PrintF(" AbortedCodeGeneration\n");
  assembler_->AbortedCodeGeneration();
}
// Debug-printing helper: derives an int from the address of |label| so that
// trace lines referring to the same label show the same number. The address
// is narrowed to int, so the result is an identifier, not a usable pointer.
static int LabelToInt(Label* label) {
  const intptr_t address = reinterpret_cast<intptr_t>(label);
  return static_cast<int>(address);
}
// Logs the label being bound, then binds it in the wrapped assembler.
void RegExpMacroAssemblerTracer::Bind(Label* label) {
  const int label_id = LabelToInt(label);
  PrintF("label[%08x]: (Bind)\n", label_id);
  assembler_->Bind(label);
}
// Logs the call with its argument and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::AdvanceCurrentPosition(int by) {
  PrintF(" AdvanceCurrentPosition(by=%d);\n",
         by);
  assembler_->AdvanceCurrentPosition(by);
}
// Logs the call (followed by a blank line) and forwards it to the wrapped
// assembler.
void RegExpMacroAssemblerTracer::CheckGreedyLoop(Label* label) {
  const int label_id = LabelToInt(label);
  PrintF(" CheckGreedyLoop(label[%08x]);\n\n", label_id);
  assembler_->CheckGreedyLoop(label);
}
// Logs the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::PopCurrentPosition() {
  PrintF(" PopCurrentPosition();\n");
  assembler_->PopCurrentPosition();
}
// Logs the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::PushCurrentPosition() {
  PrintF(" PushCurrentPosition();\n");
  assembler_->PushCurrentPosition();
}
// Logs the call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::Backtrack() {
  PrintF(" Backtrack();\n");
  assembler_->Backtrack();
}
// Logs the jump target (followed by a blank line) and forwards the call to
// the wrapped assembler.
void RegExpMacroAssemblerTracer::GoTo(Label* label) {
  const int label_id = LabelToInt(label);
  PrintF(" GoTo(label[%08x]);\n\n", label_id);
  assembler_->GoTo(label);
}
// Logs the label being pushed and forwards the call to the wrapped
// assembler.
void RegExpMacroAssemblerTracer::PushBacktrack(Label* label) {
  const int label_id = LabelToInt(label);
  PrintF(" PushBacktrack(label[%08x]);\n", label_id);
  assembler_->PushBacktrack(label);
}
// Forwards to the wrapped assembler first, because the trace line includes
// the returned restart flag; then returns that flag unchanged.
bool RegExpMacroAssemblerTracer::Succeed() {
  const bool restart = assembler_->Succeed();
  const char* suffix = restart ? " [restart for global match]" : "";
  PrintF(" Succeed();%s\n", suffix);
  return restart;
}
// Logs the call and forwards it to the wrapped assembler. The trace line is
// newline-terminated like every other entry, so that the following entry
// does not end up on the same line.
void RegExpMacroAssemblerTracer::Fail() {
  PrintF(" Fail();\n");
  assembler_->Fail();
}
// Logs the register index and forwards the call to the wrapped assembler.
void RegExpMacroAssemblerTracer::PopRegister(int register_index) {
  PrintF(" PopRegister(register=%d);\n",
         register_index);
  assembler_->PopRegister(register_index);
}
// Logs the register index, noting whether a stack limit check was
// requested, and forwards the call to the wrapped assembler.
void RegExpMacroAssemblerTracer::PushRegister(
    int register_index, StackCheckFlag check_stack_limit) {
  const char* limit_note = check_stack_limit ? "check stack limit" : "";
  PrintF(" PushRegister(register=%d, %s);\n", register_index, limit_note);
  assembler_->PushRegister(register_index, check_stack_limit);
}
// Logs the register and increment, then forwards the call to the wrapped
// assembler.
void RegExpMacroAssemblerTracer::AdvanceRegister(int reg, int by) {
  PrintF(" AdvanceRegister(register=%d, by=%d);\n",
         reg, by);
  assembler_->AdvanceRegister(reg, by);
}
// Logs the call with its argument and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::SetCurrentPositionFromEnd(int by) {
  PrintF(" SetCurrentPositionFromEnd(by=%d);\n",
         by);
  assembler_->SetCurrentPositionFromEnd(by);
}
// Logs the register and the value assigned to it, then forwards the call to
// the wrapped assembler.
void RegExpMacroAssemblerTracer::SetRegister(int register_index, int to) {
  PrintF(" SetRegister(register=%d, to=%d);\n",
         register_index, to);
  assembler_->SetRegister(register_index, to);
}
// Logs the target register and offset, then forwards the call to the
// wrapped assembler.
void RegExpMacroAssemblerTracer::WriteCurrentPositionToRegister(
    int reg, int cp_offset) {
  PrintF(" WriteCurrentPositionToRegister(register=%d,cp_offset=%d);\n", reg,
         cp_offset);
  assembler_->WriteCurrentPositionToRegister(reg, cp_offset);
}
// Logs the register range and forwards the call to the wrapped assembler.
// The trace line uses the method's actual name ("ClearRegisters") so that it
// can be matched against the interface like all other trace entries.
void RegExpMacroAssemblerTracer::ClearRegisters(int reg_from, int reg_to) {
  PrintF(" ClearRegisters(from=%d, to=%d);\n", reg_from, reg_to);
  assembler_->ClearRegisters(reg_from, reg_to);
}
// Logs the source register and forwards the call to the wrapped assembler.
void RegExpMacroAssemblerTracer::ReadCurrentPositionFromRegister(int reg) {
  PrintF(" ReadCurrentPositionFromRegister(register=%d);\n",
         reg);
  assembler_->ReadCurrentPositionFromRegister(reg);
}
// Logs the target register and forwards the call to the wrapped assembler.
void RegExpMacroAssemblerTracer::WriteStackPointerToRegister(int reg) {
  PrintF(" WriteStackPointerToRegister(register=%d);\n",
         reg);
  assembler_->WriteStackPointerToRegister(reg);
}
// Logs the source register and forwards the call to the wrapped assembler.
void RegExpMacroAssemblerTracer::ReadStackPointerFromRegister(int reg) {
  PrintF(" ReadStackPointerFromRegister(register=%d);\n",
         reg);
  assembler_->ReadStackPointerFromRegister(reg);
}
// Logs all arguments (marking loads without a bounds check as "unchecked")
// and then calls LoadCurrentCharacter on the wrapped assembler with the same
// arguments.
void RegExpMacroAssemblerTracer::LoadCurrentCharacterImpl(
    int cp_offset, Label* on_end_of_input, bool check_bounds, int characters,
    int eats_at_least) {
  const char* bounds_note = " (unchecked)";
  if (check_bounds) bounds_note = "";
  const int label_id = LabelToInt(on_end_of_input);
  PrintF(
      " LoadCurrentCharacter(cp_offset=%d, label[%08x]%s (%d chars) "
      "(eats at least %d));\n",
      cp_offset, label_id, bounds_note, characters, eats_at_least);
  assembler_->LoadCurrentCharacter(cp_offset, on_end_of_input, check_bounds,
                                   characters, eats_at_least);
}
// Formats a character for trace output. Dereferencing yields "(c)" when the
// character lies in the printable ASCII range ' '..'~', and the empty string
// otherwise. The returned pointer refers to a buffer inside this object and
// is only valid while the object is alive.
class PrintablePrinter {
 public:
  explicit PrintablePrinter(uc16 character) : character_(character) {}

  const char* operator*() {
    const bool printable = character_ >= ' ' && character_ <= '~';
    if (!printable) {
      buffer_[0] = '\0';
      return buffer_;
    }
    buffer_[0] = '(';
    buffer_[1] = static_cast<char>(character_);
    buffer_[2] = ')';
    buffer_[3] = '\0';
    return buffer_;
  }

 private:
  uc16 character_;
  char buffer_[4];
};
// Logs the limit (with its printable form, if any) and the target label,
// then forwards the call to the wrapped assembler.
void RegExpMacroAssemblerTracer::CheckCharacterLT(uc16 limit, Label* on_less) {
  PrintablePrinter printable(limit);
  const char* shown = *printable;
  const int label_id = LabelToInt(on_less);
  PrintF(" CheckCharacterLT(c=0x%04x%s, label[%08x]);\n", limit, shown,
         label_id);
  assembler_->CheckCharacterLT(limit, on_less);
}
// Logs the limit (with its printable form, if any) and the target label,
// then forwards the call to the wrapped assembler.
void RegExpMacroAssemblerTracer::CheckCharacterGT(uc16 limit,
                                                  Label* on_greater) {
  PrintablePrinter printable(limit);
  const char* shown = *printable;
  const int label_id = LabelToInt(on_greater);
  PrintF(" CheckCharacterGT(c=0x%04x%s, label[%08x]);\n", limit, shown,
         label_id);
  assembler_->CheckCharacterGT(limit, on_greater);
}
// Logs the character (with its printable form, if any) and the target
// label, then forwards the call to the wrapped assembler.
void RegExpMacroAssemblerTracer::CheckCharacter(unsigned c, Label* on_equal) {
  PrintablePrinter printable(c);
  const char* shown = *printable;
  const int label_id = LabelToInt(on_equal);
  PrintF(" CheckCharacter(c=0x%04x%s, label[%08x]);\n", c, shown, label_id);
  assembler_->CheckCharacter(c, on_equal);
}
// Logs the offset and target label, then forwards the call to the wrapped
// assembler.
void RegExpMacroAssemblerTracer::CheckAtStart(int cp_offset,
                                              Label* on_at_start) {
  const int label_id = LabelToInt(on_at_start);
  PrintF(" CheckAtStart(cp_offset=%d, label[%08x]);\n", cp_offset, label_id);
  assembler_->CheckAtStart(cp_offset, on_at_start);
}
// Logs the offset and target label, then forwards the call to the wrapped
// assembler.
void RegExpMacroAssemblerTracer::CheckNotAtStart(int cp_offset,
                                                 Label* on_not_at_start) {
  const int label_id = LabelToInt(on_not_at_start);
  PrintF(" CheckNotAtStart(cp_offset=%d, label[%08x]);\n", cp_offset,
         label_id);
  assembler_->CheckNotAtStart(cp_offset, on_not_at_start);
}
// Logs the character (with its printable form, if any) and the target
// label, then forwards the call to the wrapped assembler.
void RegExpMacroAssemblerTracer::CheckNotCharacter(unsigned c,
                                                   Label* on_not_equal) {
  PrintablePrinter printable(c);
  const char* shown = *printable;
  const int label_id = LabelToInt(on_not_equal);
  PrintF(" CheckNotCharacter(c=0x%04x%s, label[%08x]);\n", c, shown,
         label_id);
  assembler_->CheckNotCharacter(c, on_not_equal);
}
// Logs the character (with its printable form, if any), the mask and the
// target label, then forwards the call to the wrapped assembler.
void RegExpMacroAssemblerTracer::CheckCharacterAfterAnd(unsigned c,
                                                        unsigned mask,
                                                        Label* on_equal) {
  PrintablePrinter printable(c);
  const char* shown = *printable;
  const int label_id = LabelToInt(on_equal);
  PrintF(" CheckCharacterAfterAnd(c=0x%04x%s, mask=0x%04x, label[%08x]);\n", c,
         shown, mask, label_id);
  assembler_->CheckCharacterAfterAnd(c, mask, on_equal);
}
// Logs the character (with its printable form, if any), the mask and the
// target label, then forwards the call to the wrapped assembler.
void RegExpMacroAssemblerTracer::CheckNotCharacterAfterAnd(
    unsigned c, unsigned mask, Label* on_not_equal) {
  PrintablePrinter printable(c);
  const char* shown = *printable;
  const int label_id = LabelToInt(on_not_equal);
  PrintF(" CheckNotCharacterAfterAnd(c=0x%04x%s, mask=0x%04x, label[%08x]);\n",
         c, shown, mask, label_id);
  assembler_->CheckNotCharacterAfterAnd(c, mask, on_not_equal);
}
// Logs the character, subtrahend, mask and target label, then forwards the
// call to the wrapped assembler. All three numeric fields are printed as
// "0x"-prefixed hex so that |minus| is not mistaken for a decimal value.
void RegExpMacroAssemblerTracer::CheckNotCharacterAfterMinusAnd(
    uc16 c, uc16 minus, uc16 mask, Label* on_not_equal) {
  PrintF(
      " CheckNotCharacterAfterMinusAnd(c=0x%04x, minus=0x%04x, mask=0x%04x, "
      "label[%08x]);\n",
      c, minus, mask, LabelToInt(on_not_equal));
  assembler_->CheckNotCharacterAfterMinusAnd(c, minus, mask, on_not_equal);
}
void RegExpMacroAssemblerTracer::CheckCharacterInRange(
uc16 from,
uc16 to,
Label* on_not_in_range) {
PrintablePrinter printable_from(from);
PrintablePrinter printable_to(to);
PrintF(" CheckCharacterInRange(from=0x%04x%s, to=0x%04x%s, label[%08x]);\n",
from,
*printable_from,
to,
*printable_to,
LabelToInt(on_not_in_range));
assembler_->CheckCharacterInRange(from, to, on_not_in_range);
}
void RegExpMacroAssemblerTracer::CheckCharacterNotInRange(
uc16 from,
uc16 to,
Label* on_in_range) {
PrintablePrinter printable_from(from);
PrintablePrinter printable_to(to);
PrintF(
" CheckCharacterNotInRange(from=0x%04x%s," " to=%04x%s, label[%08x]);\n",
from,
*printable_from,
to,
*printable_to,
LabelToInt(on_in_range));
assembler_->CheckCharacterNotInRange(from, to, on_in_range);
}
// Logs the target label followed by the table contents, one character per
// entry ('X' for non-zero, '.' for zero), broken into rows of 32 entries;
// no row break is written after the entry at index kTableMask. Then forwards
// the call to the wrapped assembler.
void RegExpMacroAssemblerTracer::CheckBitInTable(Handle<ByteArray> table,
                                                 Label* on_bit_set) {
  PrintF(" CheckBitInTable(label[%08x] ", LabelToInt(on_bit_set));
  for (int entry = 0; entry < kTableSize; entry++) {
    const char mark = table->get(entry) != 0 ? 'X' : '.';
    PrintF("%c", mark);
    const bool end_of_row = (entry % 32 == 31);
    if (end_of_row && entry != kTableMask) {
      PrintF("\n ");
    }
  }
  PrintF(");\n");
  assembler_->CheckBitInTable(table, on_bit_set);
}
// Logs the register, read direction and target label, then forwards the
// call to the wrapped assembler.
void RegExpMacroAssemblerTracer::CheckNotBackReference(int start_reg,
                                                       bool read_backward,
                                                       Label* on_no_match) {
  const char* direction = read_backward ? "backward" : "forward";
  const int label_id = LabelToInt(on_no_match);
  PrintF(" CheckNotBackReference(register=%d, %s, label[%08x]);\n", start_reg,
         direction, label_id);
  assembler_->CheckNotBackReference(start_reg, read_backward, on_no_match);
}
// Logs the register, read direction and target label, then forwards the
// call to the wrapped assembler.
void RegExpMacroAssemblerTracer::CheckNotBackReferenceIgnoreCase(
    int start_reg, bool read_backward, Label* on_no_match) {
  const char* direction = read_backward ? "backward" : "forward";
  const int label_id = LabelToInt(on_no_match);
  PrintF(" CheckNotBackReferenceIgnoreCase(register=%d, %s, label[%08x]);\n",
         start_reg, direction, label_id);
  assembler_->CheckNotBackReferenceIgnoreCase(start_reg, read_backward,
                                              on_no_match);
}
// Logs the offset and target label, then forwards the call to the wrapped
// assembler.
void RegExpMacroAssemblerTracer::CheckPosition(int cp_offset,
                                               Label* on_outside_input) {
  const int label_id = LabelToInt(on_outside_input);
  PrintF(" CheckPosition(cp_offset=%d, label[%08x]);\n", cp_offset, label_id);
  assembler_->CheckPosition(cp_offset, on_outside_input);
}
// Forwards to the wrapped assembler first, because the trace line includes
// the returned "supported" flag; then returns that flag unchanged.
bool RegExpMacroAssemblerTracer::CheckSpecialCharacterClass(
    uc16 type, Label* on_no_match) {
  const bool supported =
      assembler_->CheckSpecialCharacterClass(type, on_no_match);
  const char* outcome = supported ? "true" : "false";
  PrintF(" CheckSpecialCharacterClass(type='%c', label[%08x]): %s;\n", type,
         LabelToInt(on_no_match), outcome);
  return supported;
}
// Logs the register, comparand and target label, then forwards the call to
// the wrapped assembler.
void RegExpMacroAssemblerTracer::IfRegisterLT(int register_index,
                                              int comparand, Label* if_lt) {
  const int label_id = LabelToInt(if_lt);
  PrintF(" IfRegisterLT(register=%d, number=%d, label[%08x]);\n",
         register_index, comparand, label_id);
  assembler_->IfRegisterLT(register_index, comparand, if_lt);
}
// Traces an IfRegisterEqPos call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::IfRegisterEqPos(int register_index,
                                                 Label* if_eq) {
  const auto label_id = LabelToInt(if_eq);
  PrintF(" IfRegisterEqPos(register=%d, label[%08x]);\n", register_index,
         label_id);
  assembler_->IfRegisterEqPos(register_index, if_eq);
}
// Traces an IfRegisterGE call and forwards it to the wrapped assembler.
void RegExpMacroAssemblerTracer::IfRegisterGE(int register_index,
                                              int comparand, Label* if_ge) {
  const auto label_id = LabelToInt(if_ge);
  PrintF(" IfRegisterGE(register=%d, number=%d, label[%08x]);\n",
         register_index, comparand, label_id);
  assembler_->IfRegisterGE(register_index, comparand, if_ge);
}
// Reports the implementation kind of the wrapped assembler. This call is not
// traced.
RegExpMacroAssembler::IrregexpImplementation
RegExpMacroAssemblerTracer::Implementation() {
  const IrregexpImplementation wrapped_kind = assembler_->Implementation();
  return wrapped_kind;
}
// Traces a GetCode call with the regexp source text, then returns whatever the
// wrapped assembler produces.
Handle<HeapObject> RegExpMacroAssemblerTracer::GetCode(Handle<String> source) {
  {
    // Keep the C-string copy alive only for the duration of the trace output.
    auto source_text = source->ToCString();
    PrintF(" GetCode(%s);\n", source_text.get());
  }
  return assembler_->GetCode(source);
}
} // namespace internal
} // namespace v8

View File

@ -0,0 +1,80 @@
// Copyright 2008 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_
#define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_
#include "regexp/regexp-macro-assembler.h"
namespace v8 {
namespace internal {
// Decorator on a RegExpMacroAssembler that writes out all calls.
//
// Every RegExpMacroAssembler operation is overridden here. The wrapped
// assembler is held as a raw pointer in |assembler_|; the inline overrides
// below forward straight to it, the remaining ones are defined out of line.
class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
public:
// |assembler| is the assembler whose calls are to be traced.
RegExpMacroAssemblerTracer(Isolate* isolate, RegExpMacroAssembler* assembler);
~RegExpMacroAssemblerTracer() override;
void AbortedCodeGeneration() override;
// The two queries below are forwarded directly to the wrapped assembler.
int stack_limit_slack() override { return assembler_->stack_limit_slack(); }
bool CanReadUnaligned() override { return assembler_->CanReadUnaligned(); }
void AdvanceCurrentPosition(int by) override; // Signed cp change.
void AdvanceRegister(int reg, int by) override; // r[reg] += by.
void Backtrack() override;
void Bind(Label* label) override;
void CheckCharacter(unsigned c, Label* on_equal) override;
void CheckCharacterAfterAnd(unsigned c, unsigned and_with,
Label* on_equal) override;
void CheckCharacterGT(uc16 limit, Label* on_greater) override;
void CheckCharacterLT(uc16 limit, Label* on_less) override;
void CheckGreedyLoop(Label* on_tos_equals_current_position) override;
void CheckAtStart(int cp_offset, Label* on_at_start) override;
void CheckNotAtStart(int cp_offset, Label* on_not_at_start) override;
void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match) override;
void CheckNotBackReferenceIgnoreCase(int start_reg, bool read_backward,
Label* on_no_match) override;
void CheckNotCharacter(unsigned c, Label* on_not_equal) override;
void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with,
Label* on_not_equal) override;
void CheckNotCharacterAfterMinusAnd(uc16 c, uc16 minus, uc16 and_with,
Label* on_not_equal) override;
void CheckCharacterInRange(uc16 from, uc16 to, Label* on_in_range) override;
void CheckCharacterNotInRange(uc16 from, uc16 to,
Label* on_not_in_range) override;
void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) override;
void CheckPosition(int cp_offset, Label* on_outside_input) override;
bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match) override;
void Fail() override;
Handle<HeapObject> GetCode(Handle<String> source) override;
void GoTo(Label* label) override;
void IfRegisterGE(int reg, int comparand, Label* if_ge) override;
void IfRegisterLT(int reg, int comparand, Label* if_lt) override;
void IfRegisterEqPos(int reg, Label* if_eq) override;
IrregexpImplementation Implementation() override;
void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
bool check_bounds, int characters,
int eats_at_least) override;
void PopCurrentPosition() override;
void PopRegister(int register_index) override;
void PushBacktrack(Label* label) override;
void PushCurrentPosition() override;
void PushRegister(int register_index,
StackCheckFlag check_stack_limit) override;
void ReadCurrentPositionFromRegister(int reg) override;
void ReadStackPointerFromRegister(int reg) override;
void SetCurrentPositionFromEnd(int by) override;
void SetRegister(int register_index, int to) override;
bool Succeed() override;
void WriteCurrentPositionToRegister(int reg, int cp_offset) override;
void ClearRegisters(int reg_from, int reg_to) override;
void WriteStackPointerToRegister(int reg) override;
private:
// The assembler that the traced calls are forwarded to.
// NOTE(review): ownership is not visible in this header - confirm against the
// destructor before relying on it.
RegExpMacroAssembler* assembler_;
};
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_TRACER_H_

View File

@ -0,0 +1,344 @@
// Copyright 2012 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "regexp/regexp-macro-assembler.h"
#include "regexp/regexp-stack.h"
#ifdef V8_INTL_SUPPORT
#include "unicode/uchar.h"
#include "unicode/unistr.h"
#endif // V8_INTL_SUPPORT
namespace v8 {
namespace internal {
RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
: slow_safe_compiler_(false),
global_mode_(NOT_GLOBAL),
isolate_(isolate),
zone_(zone) {}
// Defaulted out of line; the class releases nothing explicitly.
RegExpMacroAssembler::~RegExpMacroAssembler() = default;
// Compares two two-byte (UC16) character sequences case-insensitively.
//
// byte_offset1 / byte_offset2 are the addresses of the first character of
// each sequence and byte_length is their common length in bytes (it must be
// even). Returns 1 if the sequences are equal ignoring case and 0 otherwise.
//
// With V8_INTL_SUPPORT the comparison is delegated to ICU case folding and
// |isolate| is not consulted; otherwise each differing character pair is
// compared after canonicalization through the isolate's mapping.
int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
                                                     Address byte_offset2,
                                                     size_t byte_length,
                                                     Isolate* isolate) {
  // This function is not allowed to cause a garbage collection.
  // A GC might move the calling generated code and invalidate the
  // return address on the stack.
  DCHECK_EQ(0, byte_length % 2);

#ifdef V8_INTL_SUPPORT
  // Length in UC16 code units; ICU takes a 32-bit length.
  int32_t length = static_cast<int32_t>(byte_length >> 1);
  icu::UnicodeString uni_str_1(reinterpret_cast<const char16_t*>(byte_offset1),
                               length);
  // caseCompare yields 0 for equal strings, so this returns 1 on a match.
  return uni_str_1.caseCompare(reinterpret_cast<const char16_t*>(byte_offset2),
                               length, U_FOLD_CASE_DEFAULT) == 0;
#else
  uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
  uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
  size_t length = byte_length >> 1;
  DCHECK_NOT_NULL(isolate);
  unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
      isolate->regexp_macro_assembler_canonicalize();
  for (size_t i = 0; i < length; i++) {
    unibrow::uchar c1 = substring1[i];
    unibrow::uchar c2 = substring2[i];
    if (c1 != c2) {
      // s1 starts out as c1 and is handed to the mapping as its output
      // buffer.
      unibrow::uchar s1[1] = {c1};
      canonicalize->get(c1, '\0', s1);
      if (s1[0] != c2) {
        // Canonical c1 differs from raw c2: canonicalize c2 as well before
        // declaring a mismatch.
        unibrow::uchar s2[1] = {c2};
        canonicalize->get(c2, '\0', s2);
        if (s1[0] != s2[0]) {
          return 0;
        }
      }
    }
  }
  return 1;
#endif  // V8_INTL_SUPPORT
}
void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
Label* on_failure) {
Label ok;
// Check that current character is not a trail surrogate.
LoadCurrentCharacter(cp_offset, &ok);
CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
// Check that previous character is not a lead surrogate.
LoadCurrentCharacter(cp_offset - 1, &ok);
CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
Bind(&ok);
}
// Default implementation: a bounds-checked character load at |cp_offset|,
// with |on_outside_input| as the end-of-input label.
void RegExpMacroAssembler::CheckPosition(int cp_offset,
                                         Label* on_outside_input) {
  const bool check_bounds = true;
  LoadCurrentCharacter(cp_offset, on_outside_input, check_bounds);
}
// Non-virtual front end for LoadCurrentCharacterImpl: resolves the
// kUseCharactersValue sentinel for |eats_at_least| to |characters| and then
// dispatches to the implementation.
void RegExpMacroAssembler::LoadCurrentCharacter(int cp_offset,
                                                Label* on_end_of_input,
                                                bool check_bounds,
                                                int characters,
                                                int eats_at_least) {
  const bool use_default = (eats_at_least == kUseCharactersValue);
  const int resolved_eats_at_least = use_default ? characters : eats_at_least;
  LoadCurrentCharacterImpl(cp_offset, on_end_of_input, check_bounds, characters,
                           resolved_eats_at_least);
}
// Default implementation: no special character class has custom support, so
// nothing is emitted and false is returned.
bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
                                                      Label* on_no_match) {
  constexpr bool kHasCustomSupport = false;
  return kHasCustomSupport;
}
// Forwards |isolate| and |zone| to the RegExpMacroAssembler base; no state of
// its own is initialized here.
NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
Zone* zone)
: RegExpMacroAssembler(isolate, zone) {}
// Defaulted out of line.
NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default;
// Unaligned reads are allowed only when the corresponding flag is enabled and
// the assembler is not in slow-safe mode.
bool NativeRegExpMacroAssembler::CanReadUnaligned() {
  if (!FLAG_enable_regexp_unaligned_accesses) {
    return false;
  }
  return !slow_safe();
}
#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
// This method may only be called after an interrupt.
//
// Decides how generated regexp code should proceed after a stack-guard
// check. Returns EXCEPTION, RETRY, or 0 to continue execution normally.
// For calls originating from the runtime it may run interrupt handlers with
// allocation allowed; afterwards it patches *return_address if the code
// object is no longer at the same address, and refreshes *subject,
// *input_start and *input_end when execution continues.
int NativeRegExpMacroAssembler::CheckStackGuardState(
Isolate* isolate, int start_index, RegExp::CallOrigin call_origin,
Address* return_address, Code re_code, Address* subject,
const byte** input_start, const byte** input_end) {
DisallowHeapAllocation no_gc;
// The return address must point into the regexp code object.
Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0);
DCHECK_LE(re_code.raw_instruction_start(), old_pc);
DCHECK_LE(old_pc, re_code.raw_instruction_end());
StackLimitCheck check(isolate);
bool js_has_overflowed = check.JsHasOverflowed();
if (call_origin == RegExp::CallOrigin::kFromJs) {
// Direct calls from JavaScript can be interrupted in two ways:
// 1. A real stack overflow, in which case we let the caller throw the
// exception.
// 2. The stack guard was used to interrupt execution for another purpose,
// forcing the call through the runtime system.
// Bug(v8:9540) Investigate why this method is called from JS although no
// stackoverflow or interrupt is pending on ARM64. We return 0 in this case
// to continue execution normally.
if (js_has_overflowed) {
return EXCEPTION;
} else if (check.InterruptRequested()) {
return RETRY;
} else {
return 0;
}
}
DCHECK(call_origin == RegExp::CallOrigin::kFromRuntime);
// Prepare for possible GC.
// Handles are taken for the code and the subject so that the values read
// through them after the interrupt handling can be compared with the raw
// ones passed in.
HandleScope handles(isolate);
Handle<Code> code_handle(re_code, isolate);
Handle<String> subject_handle(String::cast(Object(*subject)), isolate);
// Remember the representation so a change can be detected below.
bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle);
int return_value = 0;
if (js_has_overflowed) {
AllowHeapAllocation yes_gc;
isolate->StackOverflow();
return_value = EXCEPTION;
} else if (check.InterruptRequested()) {
AllowHeapAllocation yes_gc;
Object result = isolate->stack_guard()->HandleInterrupts();
if (result.IsException(isolate)) return_value = EXCEPTION;
}
if (*code_handle != re_code) { // Return address no longer valid
// Overwrite the return address on the stack.
intptr_t delta = code_handle->address() - re_code.address();
Address new_pc = old_pc + delta;
// TODO(v8:10026): avoid replacing a signed pointer.
PointerAuthentication::ReplacePC(return_address, new_pc, 0);
}
// If we continue, we need to update the subject string addresses.
if (return_value == 0) {
// String encoding might have changed.
if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
is_one_byte) {
// If we changed between a LATIN1 and a UC16 string, the specialized
// code cannot be used, and we need to restart regexp matching from
// scratch (including, potentially, compiling a new version of the code).
return_value = RETRY;
} else {
// Same encoding: recompute the input pointers from the subject handle,
// preserving the byte length of the input range.
*subject = subject_handle->ptr();
intptr_t byte_length = *input_end - *input_start;
*input_start = subject_handle->AddressOfCharacterAt(start_index, no_gc);
*input_end = *input_start + byte_length;
}
}
return return_value;
}
// Runs the native code of |regexp| against the flat string |subject|, starting
// at character index |previous_index|. The subject is unwrapped to its
// underlying sequential or external string to compute the raw input range.
// Returns a {Result} sentinel, or the number of successful matches.
int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
                                      Handle<String> subject,
                                      int* offsets_vector,
                                      int offsets_vector_length,
                                      int previous_index, Isolate* isolate) {
  DCHECK(subject->IsFlat());
  DCHECK_LE(0, previous_index);
  DCHECK_LE(previous_index, subject->length());

  // No allocations before calling the regexp, but we can't use
  // DisallowHeapAllocation, since regexps might be preempted, and another
  // thread might do allocation anyway.

  String underlying = *subject;
  // Character count from the start index to the end of the subject, taken
  // from the string as passed in (before any unwrapping below).
  const int chars_to_scan = underlying.length() - previous_index;
  // Additional character offset into the parent when the subject is a slice.
  int parent_offset = 0;

  // The string has been flattened, so if it is a cons string it contains the
  // full string in the first part.
  if (StringShape(underlying).IsCons()) {
    DCHECK_EQ(0, ConsString::cast(underlying).second().length());
    underlying = ConsString::cast(underlying).first();
  } else if (StringShape(underlying).IsSliced()) {
    SlicedString sliced = SlicedString::cast(underlying);
    underlying = sliced.parent();
    parent_offset = sliced.offset();
  }
  if (StringShape(underlying).IsThin()) {
    underlying = ThinString::cast(underlying).actual();
  }

  // Ensure that an underlying string has the same representation.
  const bool one_byte = underlying.IsOneByteRepresentation();
  DCHECK(underlying.IsExternalString() || underlying.IsSeqString());
  // String is now either Sequential or External
  const int char_size_shift = one_byte ? 0 : 1;

  DisallowHeapAllocation no_gc;
  const byte* input_start =
      underlying.AddressOfCharacterAt(previous_index + parent_offset, no_gc);
  const int input_byte_length = chars_to_scan << char_size_shift;
  const byte* input_end = input_start + input_byte_length;

  return Execute(*subject, previous_index, input_start, input_end,
                 offsets_vector, offsets_vector_length, isolate, *regexp);
}
// Calls the generated matcher code for |regexp| on the given raw input range.
// Returns a {Result} sentinel, or the number of successful matches.
// TODO(pthier): The JSRegExp object is passed to native irregexp code to match
// the signature of the interpreter. We should get rid of JS objects passed to
// internal methods.
int NativeRegExpMacroAssembler::Execute(
    String input,  // This needs to be the unpacked (sliced, cons) string.
    int start_offset, const byte* input_start, const byte* input_end,
    int* output, int output_size, Isolate* isolate, JSRegExp regexp) {
  // Ensure that the minimum stack has been allocated.
  RegExpStackScope stack_scope(isolate);
  Address stack_base = stack_scope.stack()->stack_base();

  // Pick the code variant that matches the input's character width.
  const bool one_byte = String::IsOneByteRepresentationUnderneath(input);
  Code matcher_code = Code::cast(regexp.Code(one_byte));
  RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime;

  using RegexpMatcherSig = int(
      Address input_string, int start_offset,  // NOLINT(readability/casting)
      const byte* input_start, const byte* input_end, int* output,
      int output_size, Address stack_base, int call_origin, Isolate* isolate,
      Address regexp);

  auto matcher = GeneratedCode<RegexpMatcherSig>::FromCode(matcher_code);
  int result =
      matcher.Call(input.ptr(), start_offset, input_start, input_end, output,
                   output_size, stack_base, call_origin, isolate, regexp.ptr());
  DCHECK(result >= RETRY);

  const bool exception_not_yet_created =
      result == EXCEPTION && !isolate->has_pending_exception();
  if (exception_not_yet_created) {
    // We detected a stack overflow (on the backtrack stack) in RegExp code,
    // but haven't created the exception yet. Additionally, we allow heap
    // allocation because even though it invalidates {input_start} and
    // {input_end}, we are about to return anyway.
    AllowHeapAllocation allow_allocation;
    isolate->StackOverflow();
  }
  return result;
}
#endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
// clang-format off
// Lookup table with one byte per character code 0x00-0xFF, eight entries per
// row. An entry is 0xFF for the word characters '0'-'9', 'A'-'Z', '_' and
// 'a'-'z', and 0x00 for every other code, including all codes from 0x80 up.
const byte NativeRegExpMacroAssembler::word_character_map[] = {
// 0x00 - 0x2F: no word characters.
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
// 0x30 - 0x7F: digits, letters and underscore.
0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // '0' - '7'
0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9'
0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'A' - 'G'
0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'H' - 'O'
0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'P' - 'W'
0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu, // 'X' - 'Z', '_'
0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'a' - 'g'
0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'h' - 'o'
0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'p' - 'w'
0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
// Latin-1 range
// (0x80 - 0xFF: none of these codes is marked as a word character.)
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
};
// clang-format on
// Doubles the capacity of the isolate's regexp stack. On success stores the
// new base in *stack_base and returns the stack pointer relocated to the new
// stack (same distance below the base as before); returns kNullAddress if the
// capacity could not be ensured.
Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
                                              Address* stack_base,
                                              Isolate* isolate) {
  RegExpStack* stack = isolate->regexp_stack();
  const size_t current_capacity = stack->stack_capacity();
  const Address previous_base = stack->stack_base();
  DCHECK(previous_base == *stack_base);
  DCHECK(stack_pointer <= previous_base);
  DCHECK(static_cast<size_t>(previous_base - stack_pointer) <=
         current_capacity);

  // Amount of stack in use, i.e. the distance from the pointer to the base.
  const intptr_t used_bytes = previous_base - stack_pointer;

  const Address grown_base = stack->EnsureCapacity(current_capacity * 2);
  if (grown_base == kNullAddress) {
    return kNullAddress;
  }
  *stack_base = grown_base;
  return grown_base - used_bytes;
}
} // namespace internal
} // namespace v8

View File

@ -0,0 +1,280 @@
// Copyright 2012 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_
#define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_
#include "regexp/regexp-ast.h"
#include "regexp/regexp-shim.h"
#include "regexp/regexp.h"
namespace v8 {
namespace internal {
// Inclusive bounds of the UTF-16 lead and trail surrogate ranges.
static constexpr uc32 kLeadSurrogateStart = 0xD800;
static constexpr uc32 kLeadSurrogateEnd = 0xDBFF;
static constexpr uc32 kTrailSurrogateStart = 0xDC00;
static constexpr uc32 kTrailSurrogateEnd = 0xDFFF;
// Inclusive bounds of the code points above the Basic Multilingual Plane.
static constexpr uc32 kNonBmpStart = 0x10000;
static constexpr uc32 kNonBmpEnd = 0x10FFFF;
// Pairs a character class with a label.
// NOTE(review): judging by the field names, on_match is presumably the jump
// target used when cc matches; no user of this struct is visible here -
// confirm against the callers.
struct DisjunctDecisionRow {
RegExpCharacterClass cc;
Label* on_match;
};
// Abstract interface for emitting regexp matching code. Concrete subclasses
// provide the pure virtual operations below; a few operations have default
// implementations defined out of line.
class RegExpMacroAssembler {
public:
// The implementation must be able to handle at least:
static const int kMaxRegister = (1 << 16) - 1;
static const int kMaxCPOffset = (1 << 15) - 1;
static const int kMinCPOffset = -(1 << 15);
// Size parameters of the byte table taken by CheckBitInTable: the table has
// 1 << kTableSizeBits entries and kTableMask is its last valid index.
static const int kTableSizeBits = 7;
static const int kTableSize = 1 << kTableSizeBits;
static const int kTableMask = kTableSize - 1;
// Sentinel default for LoadCurrentCharacter's eats_at_least argument; it is
// replaced by the value of the characters argument.
static constexpr int kUseCharactersValue = -1;
// Identifies the backend behind an assembler; returned by Implementation().
enum IrregexpImplementation {
kIA32Implementation,
kARMImplementation,
kARM64Implementation,
kMIPSImplementation,
kS390Implementation,
kPPCImplementation,
kX64Implementation,
kX87Implementation,
kBytecodeImplementation
};
// Argument to PushRegister selecting whether the stack limit is checked.
enum StackCheckFlag {
kNoStackLimitCheck = false,
kCheckStackLimit = true
};
RegExpMacroAssembler(Isolate* isolate, Zone* zone);
virtual ~RegExpMacroAssembler();
// This function is called when code generation is aborted, so that
// the assembler could clean up internal data structures.
virtual void AbortedCodeGeneration() {}
// The maximal number of pushes between stack checks. Users must supply
// kCheckStackLimit flag to push operations (instead of kNoStackLimitCheck)
// at least once for every stack_limit_slack() pushes that are executed.
virtual int stack_limit_slack() = 0;
virtual bool CanReadUnaligned() = 0;
virtual void AdvanceCurrentPosition(int by) = 0; // Signed cp change.
virtual void AdvanceRegister(int reg, int by) = 0; // r[reg] += by.
// Continues execution from the position pushed on the top of the backtrack
// stack by an earlier PushBacktrack(Label*).
virtual void Backtrack() = 0;
virtual void Bind(Label* label) = 0;
// NOTE(review): the comment previously attached here described a dispatch
// through a 2-bits-per-entry map with a vector of up to 4 destination
// labels, which does not match this signature. Presumably this checks the
// current character against c and goes to on_equal on a match - confirm
// against the implementations.
virtual void CheckCharacter(unsigned c, Label* on_equal) = 0;
// Bitwise and the current character with the given constant and then
// check for a match with c.
virtual void CheckCharacterAfterAnd(unsigned c,
unsigned and_with,
Label* on_equal) = 0;
virtual void CheckCharacterGT(uc16 limit, Label* on_greater) = 0;
virtual void CheckCharacterLT(uc16 limit, Label* on_less) = 0;
virtual void CheckGreedyLoop(Label* on_tos_equals_current_position) = 0;
virtual void CheckAtStart(int cp_offset, Label* on_at_start) = 0;
virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start) = 0;
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match) = 0;
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward,
Label* on_no_match) = 0;
// Check the current character for a match with a literal character. If we
// fail to match then goto the on_not_equal label. End of input always
// matches. If the label is nullptr then we should pop a backtrack address
// off the stack and go to that.
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal) = 0;
virtual void CheckNotCharacterAfterAnd(unsigned c,
unsigned and_with,
Label* on_not_equal) = 0;
// Subtract a constant from the current character, then and with the given
// constant and then check for a match with c.
virtual void CheckNotCharacterAfterMinusAnd(uc16 c,
uc16 minus,
uc16 and_with,
Label* on_not_equal) = 0;
virtual void CheckCharacterInRange(uc16 from,
uc16 to, // Both inclusive.
Label* on_in_range) = 0;
virtual void CheckCharacterNotInRange(uc16 from,
uc16 to, // Both inclusive.
Label* on_not_in_range) = 0;
// The current character (modulus the kTableSize) is looked up in the byte
// array, and if the found byte is non-zero, we jump to the on_bit_set label.
virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) = 0;
// Checks whether the given offset from the current position is before
// the end of the string. May overwrite the current character.
virtual void CheckPosition(int cp_offset, Label* on_outside_input);
// Check whether a standard/default character class matches the current
// character. Returns false if the type of special character class does
// not have custom support.
// May clobber the current loaded character.
virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match);
// Control-flow integrity:
// Define a jump target and bind a label.
// The default implementation simply binds the label.
virtual void BindJumpTarget(Label* label) { Bind(label); }
virtual void Fail() = 0;
virtual Handle<HeapObject> GetCode(Handle<String> source) = 0;
virtual void GoTo(Label* label) = 0;
// Check whether a register is >= a given constant and go to a label if it
// is. Backtracks instead if the label is nullptr.
virtual void IfRegisterGE(int reg, int comparand, Label* if_ge) = 0;
// Check whether a register is < a given constant and go to a label if it is.
// Backtracks instead if the label is nullptr.
virtual void IfRegisterLT(int reg, int comparand, Label* if_lt) = 0;
// Check whether a register is == to the current position and go to a
// label if it is.
virtual void IfRegisterEqPos(int reg, Label* if_eq) = 0;
virtual IrregexpImplementation Implementation() = 0;
// Non-virtual entry point for loading the current character; it resolves
// the eats_at_least default and calls LoadCurrentCharacterImpl, which is
// what subclasses implement.
V8_EXPORT_PRIVATE void LoadCurrentCharacter(
int cp_offset, Label* on_end_of_input, bool check_bounds = true,
int characters = 1, int eats_at_least = kUseCharactersValue);
virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
bool check_bounds, int characters,
int eats_at_least) = 0;
virtual void PopCurrentPosition() = 0;
virtual void PopRegister(int register_index) = 0;
// Pushes the label on the backtrack stack, so that a following Backtrack
// will go to this label. Always checks the backtrack stack limit.
virtual void PushBacktrack(Label* label) = 0;
virtual void PushCurrentPosition() = 0;
virtual void PushRegister(int register_index,
StackCheckFlag check_stack_limit) = 0;
virtual void ReadCurrentPositionFromRegister(int reg) = 0;
virtual void ReadStackPointerFromRegister(int reg) = 0;
virtual void SetCurrentPositionFromEnd(int by) = 0;
virtual void SetRegister(int register_index, int to) = 0;
// Return whether the matching (with a global regexp) will be restarted.
virtual bool Succeed() = 0;
virtual void WriteCurrentPositionToRegister(int reg, int cp_offset) = 0;
virtual void ClearRegisters(int reg_from, int reg_to) = 0;
virtual void WriteStackPointerToRegister(int reg) = 0;
// Compares two-byte strings case insensitively.
// Called from generated RegExp code.
// Returns 1 if the two ranges are equal ignoring case and 0 otherwise.
static int CaseInsensitiveCompareUC16(Address byte_offset1,
Address byte_offset2,
size_t byte_length, Isolate* isolate);
// Check that we are not in the middle of a surrogate pair.
void CheckNotInSurrogatePair(int cp_offset, Label* on_failure);
// Controls the generation of large inlined constants in the code.
void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; }
bool slow_safe() { return slow_safe_compiler_; }
void set_backtrack_limit(uint32_t backtrack_limit) {
backtrack_limit_ = backtrack_limit;
}
enum GlobalMode {
NOT_GLOBAL,
GLOBAL_NO_ZERO_LENGTH_CHECK,
GLOBAL,
GLOBAL_UNICODE
};
// Set whether the regular expression has the global flag. Exiting due to
// a failure in a global regexp may still mean success overall.
inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; }
// True for every mode other than NOT_GLOBAL.
inline bool global() { return global_mode_ != NOT_GLOBAL; }
// True for GLOBAL and GLOBAL_UNICODE, i.e. not for
// GLOBAL_NO_ZERO_LENGTH_CHECK.
inline bool global_with_zero_length_check() {
return global_mode_ == GLOBAL || global_mode_ == GLOBAL_UNICODE;
}
inline bool global_unicode() { return global_mode_ == GLOBAL_UNICODE; }
Isolate* isolate() const { return isolate_; }
Zone* zone() const { return zone_; }
protected:
// A backtrack limit is in effect unless it still holds the
// JSRegExp::kNoBacktrackLimit default.
bool has_backtrack_limit() const {
return backtrack_limit_ != JSRegExp::kNoBacktrackLimit;
}
uint32_t backtrack_limit() const { return backtrack_limit_; }
private:
bool slow_safe_compiler_;
uint32_t backtrack_limit_ = JSRegExp::kNoBacktrackLimit;
GlobalMode global_mode_;
Isolate* isolate_;
Zone* zone_;
};
// Base class for assemblers that produce native regexp code. Besides the
// assembler interface it declares the static helpers used to run the
// generated code (Match, Execute) and the ones called from it (GrowStack,
// CheckStackGuardState, word_character_map).
class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
public:
// Type of input string to generate code for.
enum Mode { LATIN1 = 1, UC16 = 2 };
// Result of calling generated native RegExp code.
// RETRY: Something significant changed during execution, and the matching
// should be retried from scratch.
// EXCEPTION: Something failed during execution. If no exception has been
// thrown, it's an internal out-of-memory, and the caller should
// throw the exception.
// FAILURE: Matching failed.
// SUCCESS: Matching succeeded, and the output array has been filled with
// capture positions.
enum Result {
FAILURE = RegExp::kInternalRegExpFailure,
SUCCESS = RegExp::kInternalRegExpSuccess,
EXCEPTION = RegExp::kInternalRegExpException,
RETRY = RegExp::kInternalRegExpRetry,
};
NativeRegExpMacroAssembler(Isolate* isolate, Zone* zone);
~NativeRegExpMacroAssembler() override;
bool CanReadUnaligned() override;
// Returns a {Result} sentinel, or the number of successful matches.
// |previous_index| is the character index in |subject| at which matching
// starts; |subject| must be flat.
static int Match(Handle<JSRegExp> regexp, Handle<String> subject,
int* offsets_vector, int offsets_vector_length,
int previous_index, Isolate* isolate);
// Called from RegExp if the backtrack stack limit is hit.
// Tries to expand the stack. Returns the new stack-pointer if
// successful, and updates the stack_top address, or returns 0 if unable
// to grow the stack.
// This function must not trigger a garbage collection.
static Address GrowStack(Address stack_pointer, Address* stack_top,
Isolate* isolate);
// Called after an interrupt to decide how the generated code proceeds.
// Returns EXCEPTION, RETRY, or 0 to continue; may update *return_address,
// *subject, *input_start and *input_end.
static int CheckStackGuardState(Isolate* isolate, int start_index,
RegExp::CallOrigin call_origin,
Address* return_address, Code re_code,
Address* subject, const byte** input_start,
const byte** input_end);
// Byte map of one byte characters with a 0xff if the character is a word
// character (digit, letter or underscore) and 0x00 otherwise.
// Used by generated RegExp code.
static const byte word_character_map[256];
// Address of the first entry of word_character_map.
static Address word_character_map_address() {
return reinterpret_cast<Address>(&word_character_map[0]);
}
// Returns a {Result} sentinel, or the number of successful matches.
V8_EXPORT_PRIVATE static int Execute(String input, int start_offset,
const byte* input_start,
const byte* input_end, int* output,
int output_size, Isolate* isolate,
JSRegExp regexp);
};
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,750 @@
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_NODES_H_
#define V8_REGEXP_REGEXP_NODES_H_
#include "regexp/regexp-macro-assembler.h"
namespace v8 {
namespace internal {
class AlternativeGenerationList;
class BoyerMooreLookahead;
class GreedyLoopState;
class Label;
class NodeVisitor;
class QuickCheckDetails;
class RegExpCompiler;
class Trace;
struct PreloadState;
class ChoiceNode;
// Applies the macro VISIT once to each node type name listed below (End,
// Action, Choice, ...), so that per-node-type code can be generated from a
// single list.
#define FOR_EACH_NODE_TYPE(VISIT) \
VISIT(End) \
VISIT(Action) \
VISIT(Choice) \
VISIT(LoopChoice) \
VISIT(NegativeLookaroundChoice) \
VISIT(BackReference) \
VISIT(Assertion) \
VISIT(Text)
// Per-node bookkeeping flags, packed into single-bit fields. All flags start
// out cleared.
struct NodeInfo final {
  NodeInfo()
      : being_analyzed(false),
        been_analyzed(false),
        follows_word_interest(false),
        follows_newline_interest(false),
        follows_start_interest(false),
        at_end(false),
        visited(false),
        replacement_calculated(false) {}

  // Returns true if the interests and assumptions of this node match those
  // of |that| (at_end plus the three follows_*_interest flags).
  bool Matches(NodeInfo* that) {
    if (at_end != that->at_end) return false;
    if (follows_word_interest != that->follows_word_interest) return false;
    if (follows_newline_interest != that->follows_newline_interest) {
      return false;
    }
    return follows_start_interest == that->follows_start_interest;
  }

  // Merges in the interests of the node preceding this one, including its
  // at_end flag.
  void AddFromPreceding(NodeInfo* that) {
    at_end = at_end || that->at_end;
    follows_word_interest =
        follows_word_interest || that->follows_word_interest;
    follows_newline_interest =
        follows_newline_interest || that->follows_newline_interest;
    follows_start_interest =
        follows_start_interest || that->follows_start_interest;
  }

  // True if any of the follows_*_interest flags is set.
  bool HasLookbehind() {
    if (follows_word_interest) return true;
    if (follows_newline_interest) return true;
    return follows_start_interest;
  }

  // Merges in the interests of the node following this one. Unlike
  // AddFromPreceding, at_end is left untouched.
  void AddFromFollowing(NodeInfo* that) {
    follows_word_interest =
        follows_word_interest || that->follows_word_interest;
    follows_newline_interest =
        follows_newline_interest || that->follows_newline_interest;
    follows_start_interest =
        follows_start_interest || that->follows_start_interest;
  }

  // Clears only the two analysis-progress flags.
  void ResetCompilationState() {
    been_analyzed = false;
    being_analyzed = false;
  }

  bool being_analyzed : 1;
  bool been_analyzed : 1;

  // These bits are set if this node has to know what the preceding
  // character was.
  bool follows_word_interest : 1;
  bool follows_newline_interest : 1;
  bool follows_start_interest : 1;

  bool at_end : 1;
  bool visited : 1;
  bool replacement_calculated : 1;
};
// Lower bounds on how many characters a successful match must consume, kept
// separately for the "possibly at start" and "known not at start" cases.
struct EatsAtLeastInfo final {
  // Both bounds start at zero.
  EatsAtLeastInfo() : EatsAtLeastInfo(0) {}

  // Sets both bounds to |eats|.
  explicit EatsAtLeastInfo(uint8_t eats)
      : eats_at_least_from_possibly_start(eats),
        eats_at_least_from_not_start(eats) {}

  // Lowers each bound to the corresponding bound of |other| when that one is
  // smaller; bounds are never raised.
  void SetMin(const EatsAtLeastInfo& other) {
    const uint8_t other_possibly_start =
        other.eats_at_least_from_possibly_start;
    const uint8_t other_not_start = other.eats_at_least_from_not_start;

    eats_at_least_from_possibly_start =
        (other_possibly_start < eats_at_least_from_possibly_start)
            ? other_possibly_start
            : eats_at_least_from_possibly_start;
    eats_at_least_from_not_start =
        (other_not_start < eats_at_least_from_not_start)
            ? other_not_start
            : eats_at_least_from_not_start;
  }

  // Any successful match starting from the current node will consume at least
  // this many characters. This does not necessarily mean that there is a
  // possible match with exactly this many characters, but we generally try to
  // get this number as high as possible to allow for early exit on failure.
  uint8_t eats_at_least_from_possibly_start;

  // Like eats_at_least_from_possibly_start, but with the additional assumption
  // that start-of-string assertions (^) can't match. This value is greater than
  // or equal to eats_at_least_from_possibly_start.
  uint8_t eats_at_least_from_not_start;
};
// Base class for the regexp nodes declared in this file (SeqRegExpNode,
// EndNode and ChoiceNode derive from it directly). Besides the virtual
// interface used for analysis and code generation, every node stores its
// analysis state (NodeInfo, EatsAtLeastInfo), a label, and saved Boyer-Moore
// lookahead data.
class RegExpNode : public ZoneObject {
public:
// Creates a node associated with |zone|. It starts with no replacement, off
// the work list, with a trace count of zero and with no saved Boyer-Moore
// info for either start state.
explicit RegExpNode(Zone* zone)
: replacement_(nullptr),
on_work_list_(false),
trace_count_(0),
zone_(zone) {
bm_info_[0] = bm_info_[1] = nullptr;
}
virtual ~RegExpNode();
// Visitor entry point; implemented by each concrete node type.
virtual void Accept(NodeVisitor* visitor) = 0;
// Generates a goto to this node or actually generates the code at this point.
virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
// How many characters must this node consume at a minimum in order to
// succeed.  The not_at_start argument is used to indicate that we know we are
// not at the start of the input.  In this case anchored branches will always
// fail and can be ignored when determining how many characters are consumed
// on success.  If this node has not been analyzed yet, EatsAtLeast returns 0.
int EatsAtLeast(bool not_at_start);
// Returns how many characters this node must consume in order to succeed,
// given that this is a LoopChoiceNode whose counter register is in a
// newly-initialized state at the current position in the generated code. For
// example, consider /a{6,8}/. Absent any extra information, the
// LoopChoiceNode for the repetition must report that it consumes at least
// zero characters, because it may have already looped several times. However,
// with a newly-initialized counter, it can report that it consumes at least
// six characters.
virtual EatsAtLeastInfo EatsAtLeastFromLoopEntry();
// Emits some quick code that checks whether the preloaded characters match.
// Falls through on certain failure, jumps to the label on possible success.
// If the node cannot make a quick check it does nothing and returns false.
bool EmitQuickCheck(RegExpCompiler* compiler, Trace* bounds_check_trace,
Trace* trace, bool preload_has_checked_bounds,
Label* on_possible_success,
QuickCheckDetails* details_return,
bool fall_through_on_failure, ChoiceNode* predecessor);
// For a given number of characters this returns a mask and a value.  The
// next n characters are anded with the mask and compared with the value.
// A comparison failure indicates the node cannot match the next n characters.
// A comparison success indicates the node may match.
virtual void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start) = 0;
// Fills in quick check details for this node, given that this is a
// LoopChoiceNode whose counter register is in a newly-initialized state at
// the current position in the generated code. For example, consider /a{6,8}/.
// Absent any extra information, the LoopChoiceNode for the repetition cannot
// generate any useful quick check because a match might be the (empty)
// continuation node. However, with a newly-initialized counter, it can
// generate a quick check for several 'a' characters at once.
virtual void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start);
// Value returned by the default GreedyLoopTextLength() implementation, and
// by overrides that opt out of greedy loops (e.g. ActionNode).
static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
// Only returns the successor for a text node of length 1 that matches any
// character and that has no guards on it.
virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
RegExpCompiler* compiler) {
return nullptr;
}
// Collects information on the possible code units (mod 128) that can match if
// we look forward.  This is used for a Boyer-Moore-like string searching
// implementation.  TODO(erikcorry):  This should share more code with
// EatsAtLeast, GetQuickCheckDetails.  The budget argument is used to limit
// the number of nodes we are willing to look at in order to create this data.
static const int kRecursionBudget = 200;
// NOTE(review): defined elsewhere; presumably decides whether analysis may
// keep recursing given the budget above -- confirm against the definition.
bool KeepRecursing(RegExpCompiler* compiler);
// The base implementation must never be reached; node types that take part
// in the Boyer-Moore analysis override it.
virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) {
UNREACHABLE();
}
// If we know that the input is one-byte then there are some nodes that can
// never match.  This method returns a node that can be substituted for
// itself, or nullptr if the node can never match.
virtual RegExpNode* FilterOneByte(int depth) { return this; }
// Helper for FilterOneByte.
// Returns the stored replacement; may only be called once a replacement has
// been recorded through set_replacement().
RegExpNode* replacement() {
DCHECK(info()->replacement_calculated);
return replacement_;
}
// Records |replacement| (which may be nullptr), marks it as calculated and
// returns it.
RegExpNode* set_replacement(RegExpNode* replacement) {
info()->replacement_calculated = true;
replacement_ = replacement;
return replacement;  // For convenience.
}
// We want to avoid recalculating the lookahead info, so we store it on the
// node.  Only info that is for this node is stored.  We can tell that the
// info is for this node when offset == 0, so the information is calculated
// relative to this node.
void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
if (offset == 0) set_bm_info(not_at_start, bm);
}
// Label stored in this node.
Label* label() { return &label_; }
// If non-generic code is generated for a node (i.e. the node is not at the
// start of the trace) then it cannot be reused.  This variable sets a limit
// on how often we allow that to happen before we insist on starting a new
// trace and generating generic code for a node that can be reused by flushing
// the deferred actions in the current trace and generating a goto.
static const int kMaxCopiesCodeGenerated = 10;
// Accessors for the work-list membership flag.
bool on_work_list() { return on_work_list_; }
void set_on_work_list(bool value) { on_work_list_ = value; }
// Mutable access to this node's analysis flags.
NodeInfo* info() { return &info_; }
// Accessors for the saved EatsAtLeast results.
const EatsAtLeastInfo* eats_at_least_info() const { return &eats_at_least_; }
void set_eats_at_least_info(const EatsAtLeastInfo& eats_at_least) {
eats_at_least_ = eats_at_least;
}
// Returns the lookahead info saved for the given start state (slot 1 when
// not_at_start is true, slot 0 otherwise), or nullptr if none was saved.
BoyerMooreLookahead* bm_info(bool not_at_start) {
return bm_info_[not_at_start ? 1 : 0];
}
// The zone passed to the constructor.
Zone* zone() const { return zone_; }
protected:
// Result type of LimitVersions().
// NOTE(review): the meaning of DONE/CONTINUE is defined by LimitVersions,
// which is implemented elsewhere -- confirm there.
enum LimitResult { DONE, CONTINUE };
// Replacement node recorded by set_replacement(); nullptr until then.
RegExpNode* replacement_;
LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
// Stores |bm| in the slot selected by |not_at_start| (see bm_info()).
void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
bm_info_[not_at_start ? 1 : 0] = bm;
}
private:
static const int kFirstCharBudget = 10;
Label label_;
bool on_work_list_;
NodeInfo info_;
// Saved values for EatsAtLeast results, to avoid recomputation. Filled in
// during analysis (valid if info_.been_analyzed is true).
EatsAtLeastInfo eats_at_least_;
// This variable keeps track of how many times code has been generated for
// this node (in different traces).  We don't keep track of where the
// generated code is located unless the code is generated at the start of
// a trace, in which case it is generic and can be reused by flushing the
// deferred operations in the current trace and generating a goto.
int trace_count_;
// Saved lookahead info, indexed by start state: [0] possibly at start,
// [1] known not to be at start.
BoyerMooreLookahead* bm_info_[2];
Zone* zone_;
};
class SeqRegExpNode : public RegExpNode {
public:
explicit SeqRegExpNode(RegExpNode* on_success)
: RegExpNode(on_success->zone()), on_success_(on_success) {}
RegExpNode* on_success() { return on_success_; }
void set_on_success(RegExpNode* node) { on_success_ = node; }
RegExpNode* FilterOneByte(int depth) override;
void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) override {
on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
if (offset == 0) set_bm_info(not_at_start, bm);
}
protected:
RegExpNode* FilterSuccessor(int depth);
private:
RegExpNode* on_success_;
};
// A sequential node tagged with an ActionType. Instances are obtained from
// the static factory functions below (the constructor is private); the
// per-action parameters are kept in the data_ union.
class ActionNode : public SeqRegExpNode {
public:
// Kind of action. Each factory function below corresponds by name to one of
// these values (BeginSubmatch -> BEGIN_SUBMATCH, and so on).
enum ActionType {
SET_REGISTER_FOR_LOOP,
INCREMENT_REGISTER,
STORE_POSITION,
BEGIN_SUBMATCH,
POSITIVE_SUBMATCH_SUCCESS,
EMPTY_MATCH_CHECK,
CLEAR_CAPTURES
};
// Factory functions. Their definitions are not in this header.
// NOTE(review): each presumably fills the matching member of data_ from its
// arguments -- confirm against the definitions.
static ActionNode* SetRegisterForLoop(int reg, int val,
RegExpNode* on_success);
static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
static ActionNode* StorePosition(int reg, bool is_capture,
RegExpNode* on_success);
static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
static ActionNode* BeginSubmatch(int stack_pointer_reg, int position_reg,
RegExpNode* on_success);
static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
int restore_reg,
int clear_capture_count,
int clear_capture_from,
RegExpNode* on_success);
static ActionNode* EmptyMatchCheck(int start_register,
int repetition_register,
int repetition_limit,
RegExpNode* on_success);
void Accept(NodeVisitor* visitor) override;
void Emit(RegExpCompiler* compiler, Trace* trace) override;
void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler, int filled_in,
bool not_at_start) override;
void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) override;
ActionType action_type() { return action_type_; }
// TODO(erikcorry): We should allow some action nodes in greedy loops.
int GreedyLoopTextLength() override {
return kNodeIsTooComplexForGreedyLoops;
}
private:
// Per-action payload. Only one member is meaningful for a given node.
// NOTE(review): which member applies presumably follows from action_type_;
// the code that reads and writes the union is not in this header.
union {
struct {
int reg;
int value;
} u_store_register;
struct {
int reg;
} u_increment_register;
struct {
int reg;
bool is_capture;
} u_position_register;
struct {
int stack_pointer_register;
int current_position_register;
int clear_register_count;
int clear_register_from;
} u_submatch;
struct {
int start_register;
int repetition_register;
int repetition_limit;
} u_empty_match_check;
struct {
int range_from;
int range_to;
} u_clear_captures;
} data_;
// Private: only sets the action type; data_ is left uninitialized here.
ActionNode(ActionType action_type, RegExpNode* on_success)
: SeqRegExpNode(on_success), action_type_(action_type) {}
ActionType action_type_;
friend class DotPrinterImpl;
};
// A sequential node that holds a list of TextElements, together with the
// direction (forward or backward) in which they are read.
class TextNode : public SeqRegExpNode {
public:
// Uses |elms| as the element list; the list is not copied.
TextNode(ZoneList<TextElement>* elms, bool read_backward,
RegExpNode* on_success)
: SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
// Builds a one-element list, allocated in this node's zone, that holds the
// character class |that|.
TextNode(RegExpCharacterClass* that, bool read_backward,
RegExpNode* on_success)
: SeqRegExpNode(on_success),
elms_(new (zone()) ZoneList<TextElement>(1, zone())),
read_backward_(read_backward) {
elms_->Add(TextElement::CharClass(that), zone());
}
// Create TextNode for a single character class for the given ranges.
static TextNode* CreateForCharacterRanges(Zone* zone,
ZoneList<CharacterRange>* ranges,
bool read_backward,
RegExpNode* on_success,
JSRegExp::Flags flags);
// Create TextNode for a surrogate pair with a range given for the
// lead and the trail surrogate each.
static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
CharacterRange trail,
bool read_backward,
RegExpNode* on_success,
JSRegExp::Flags flags);
void Accept(NodeVisitor* visitor) override;
void Emit(RegExpCompiler* compiler, Trace* trace) override;
void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler, int characters_filled_in,
bool not_at_start) override;
// The element list given to (or created by) the constructor.
ZoneList<TextElement>* elements() { return elms_; }
bool read_backward() { return read_backward_; }
void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
int GreedyLoopTextLength() override;
RegExpNode* GetSuccessorOfOmnivorousTextNode(
RegExpCompiler* compiler) override;
void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) override;
void CalculateOffsets();
RegExpNode* FilterOneByte(int depth) override;
int Length();
private:
// Passes made over the text elements when emitting code; see TextEmitPass.
enum TextEmitPassType {
NON_LATIN1_MATCH,            // Check for characters that can't match.
SIMPLE_CHARACTER_MATCH,      // Case-dependent single character check.
NON_LETTER_CHARACTER_MATCH,  // Check characters that have no case equivs.
CASE_CHARACTER_MATCH,        // Case-independent single character check.
CHARACTER_CLASS_MATCH        // Character class.
};
static bool SkipPass(TextEmitPassType pass, bool ignore_case);
// Bounds of the pass range; NON_LATIN1_MATCH precedes kFirstRealPass.
static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
static const int kLastPass = CHARACTER_CLASS_MATCH;
void TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
bool preloaded, Trace* trace, bool first_element_checked,
int* checked_up_to);
ZoneList<TextElement>* elms_;
bool read_backward_;
};
// A sequential node that carries one of the assertion kinds listed in
// AssertionType. Nodes are created through the static factories, one per
// kind; the constructor is private.
class AssertionNode : public SeqRegExpNode {
 public:
  enum AssertionType {
    AT_END,
    AT_START,
    AT_BOUNDARY,
    AT_NON_BOUNDARY,
    AFTER_NEWLINE
  };

  // Each factory allocates the node in the zone of |on_success|.
  static AssertionNode* AtEnd(RegExpNode* on_success) {
    return Create(AT_END, on_success);
  }
  static AssertionNode* AtStart(RegExpNode* on_success) {
    return Create(AT_START, on_success);
  }
  static AssertionNode* AtBoundary(RegExpNode* on_success) {
    return Create(AT_BOUNDARY, on_success);
  }
  static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
    return Create(AT_NON_BOUNDARY, on_success);
  }
  static AssertionNode* AfterNewline(RegExpNode* on_success) {
    return Create(AFTER_NEWLINE, on_success);
  }

  void Accept(NodeVisitor* visitor) override;
  void Emit(RegExpCompiler* compiler, Trace* trace) override;
  void GetQuickCheckDetails(QuickCheckDetails* details,
                            RegExpCompiler* compiler, int filled_in,
                            bool not_at_start) override;
  void FillInBMInfo(Isolate* isolate, int offset, int budget,
                    BoyerMooreLookahead* bm, bool not_at_start) override;
  AssertionType assertion_type() { return assertion_type_; }

 private:
  // Shared implementation of the public factories.
  static AssertionNode* Create(AssertionType type, RegExpNode* on_success) {
    return new (on_success->zone()) AssertionNode(type, on_success);
  }

  void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
  enum IfPrevious { kIsNonWord, kIsWord };
  void BacktrackIfPrevious(RegExpCompiler* compiler, Trace* trace,
                           IfPrevious backtrack_if_previous);
  AssertionNode(AssertionType t, RegExpNode* on_success)
      : SeqRegExpNode(on_success), assertion_type_(t) {}

  AssertionType assertion_type_;
};
// A sequential node for a back reference. It stores a start register, an end
// register, the regexp flags and the read direction.
class BackReferenceNode : public SeqRegExpNode {
 public:
  BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags,
                    bool read_backward, RegExpNode* on_success)
      : SeqRegExpNode(on_success),
        start_reg_(start_reg),
        end_reg_(end_reg),
        flags_(flags),
        read_backward_(read_backward) {}

  void Accept(NodeVisitor* visitor) override;

  int start_register() { return start_reg_; }
  int end_register() { return end_reg_; }
  bool read_backward() { return read_backward_; }

  void Emit(RegExpCompiler* compiler, Trace* trace) override;

  // Deliberately a no-op: |details| is left exactly as it was passed in.
  void GetQuickCheckDetails(QuickCheckDetails* details,
                            RegExpCompiler* compiler, int characters_filled_in,
                            bool not_at_start) override {}

  void FillInBMInfo(Isolate* isolate, int offset, int budget,
                    BoyerMooreLookahead* bm, bool not_at_start) override;

 private:
  int start_reg_;
  int end_reg_;
  JSRegExp::Flags flags_;
  bool read_backward_;
};
// A node without a successor, tagged with the Action it stands for.
class EndNode : public RegExpNode {
public:
// NEGATIVE_SUBMATCH_SUCCESS is the action used by the
// NegativeSubmatchSuccess subclass.
enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
void Accept(NodeVisitor* visitor) override;
void Emit(RegExpCompiler* compiler, Trace* trace) override;
// Must never be called on an end node.
void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler, int characters_filled_in,
bool not_at_start) override {
// Returning 0 from EatsAtLeast should ensure we never get here.
UNREACHABLE();
}
// Must never be called on an end node.
void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) override {
// Returning 0 from EatsAtLeast should ensure we never get here.
UNREACHABLE();
}
private:
Action action_;
};
// End node whose action is NEGATIVE_SUBMATCH_SUCCESS. It remembers two
// registers (stack pointer and current position) and a range of captures
// given as a count and a start index.
// NOTE(review): how Emit uses these values is defined elsewhere; presumably
// it restores the saved state and clears the captures -- confirm there.
class NegativeSubmatchSuccess : public EndNode {
public:
NegativeSubmatchSuccess(int stack_pointer_reg, int position_reg,
int clear_capture_count, int clear_capture_start,
Zone* zone)
: EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
stack_pointer_register_(stack_pointer_reg),
current_position_register_(position_reg),
clear_capture_count_(clear_capture_count),
clear_capture_start_(clear_capture_start) {}
void Emit(RegExpCompiler* compiler, Trace* trace) override;
private:
int stack_pointer_register_;
int current_position_register_;
int clear_capture_count_;
int clear_capture_start_;
};
// An immutable (register, relation, value) triple attached to a
// GuardedAlternative.
// NOTE(review): presumably the alternative may only be taken when
// "register <relation> value" holds -- confirm against the code that emits
// guards.
class Guard : public ZoneObject {
 public:
  // Comparison between the register and the value: less-than, or
  // greater-than-or-equal.
  enum Relation { LT, GEQ };

  Guard(int reg, Relation op, int value) : reg_(reg), op_(op), value_(value) {}

  // Read-only accessors. They are const-qualified so that they can also be
  // used through a const Guard; existing non-const callers are unaffected.
  int reg() const { return reg_; }
  Relation op() const { return op_; }
  int value() const { return value_; }

 private:
  int reg_;
  Relation op_;
  int value_;
};
// An alternative of a choice: a target node plus an optional list of guards.
class GuardedAlternative {
public:
// Starts without a guard list (guards() returns nullptr).
explicit GuardedAlternative(RegExpNode* node)
: node_(node), guards_(nullptr) {}
// Defined elsewhere. NOTE(review): presumably allocates the guard list in
// |zone| on first use and appends |guard| -- confirm against the definition.
void AddGuard(Guard* guard, Zone* zone);
RegExpNode* node() { return node_; }
void set_node(RegExpNode* node) { node_ = node; }
// May be nullptr when no guard list has been set up.
ZoneList<Guard*>* guards() { return guards_; }
private:
RegExpNode* node_;
ZoneList<Guard*>* guards_;
};
class AlternativeGeneration;
// A node that owns a list of guarded alternatives.
class ChoiceNode : public RegExpNode {
public:
// Allocates the alternatives list in |zone| with room for |expected_size|
// entries. Both flags start out false.
explicit ChoiceNode(int expected_size, Zone* zone)
: RegExpNode(zone),
alternatives_(new (zone)
ZoneList<GuardedAlternative>(expected_size, zone)),
not_at_start_(false),
being_calculated_(false) {}
void Accept(NodeVisitor* visitor) override;
// Appends |node| to the list of alternatives.
void AddAlternative(GuardedAlternative node) {
alternatives()->Add(node, zone());
}
ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
void Emit(RegExpCompiler* compiler, Trace* trace) override;
void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler, int characters_filled_in,
bool not_at_start) override;
void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) override;
bool being_calculated() { return being_calculated_; }
bool not_at_start() { return not_at_start_; }
// One-way switch: not_at_start_ can be set but never cleared through this
// interface.
void set_not_at_start() { not_at_start_ = true; }
void set_being_calculated(bool b) { being_calculated_ = b; }
// Whether a quick check should be attempted for an alternative; |is_first|
// tells whether it is the first one. The base class always answers yes.
virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
return true;
}
RegExpNode* FilterOneByte(int depth) override;
// The base class always reads forward; LoopChoiceNode overrides this.
virtual bool read_backward() { return false; }
protected:
int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
ZoneList<GuardedAlternative>* alternatives_;
private:
template <typename...>
friend class Analysis;
// Code generation helpers; all of them are defined elsewhere.
void GenerateGuard(RegExpMacroAssembler* macro_assembler, Guard* guard,
Trace* trace);
int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
void EmitOutOfLineContinuation(RegExpCompiler* compiler, Trace* trace,
GuardedAlternative alternative,
AlternativeGeneration* alt_gen,
int preload_characters,
bool next_expects_preload);
void SetUpPreLoad(RegExpCompiler* compiler, Trace* current_trace,
PreloadState* preloads);
void AssertGuardsMentionRegisters(Trace* trace);
int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
Trace* EmitGreedyLoop(RegExpCompiler* compiler, Trace* trace,
AlternativeGenerationList* alt_gens,
PreloadState* preloads,
GreedyLoopState* greedy_loop_state, int text_length);
void EmitChoices(RegExpCompiler* compiler,
AlternativeGenerationList* alt_gens, int first_choice,
Trace* trace, PreloadState* preloads);
// If true, this node is never checked at the start of the input.
// Allows a new trace to start with at_start() set to false.
bool not_at_start_;
// Flag exposed through being_calculated()/set_being_calculated().
// NOTE(review): presumably a re-entrancy marker for an analysis pass --
// confirm against its users.
bool being_calculated_;
};
class NegativeLookaroundChoiceNode : public ChoiceNode {
public:
explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
GuardedAlternative then_do_this,
Zone* zone)
: ChoiceNode(2, zone) {
AddAlternative(this_must_fail);
AddAlternative(then_do_this);
}
void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler, int characters_filled_in,
bool not_at_start) override;
void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) override {
continue_node()->FillInBMInfo(isolate, offset, budget - 1, bm,
not_at_start);
if (offset == 0) set_bm_info(not_at_start, bm);
}
static constexpr int kLookaroundIndex = 0;
static constexpr int kContinueIndex = 1;
RegExpNode* lookaround_node() {
return alternatives()->at(kLookaroundIndex).node();
}
RegExpNode* continue_node() {
return alternatives()->at(kContinueIndex).node();
}
// For a negative lookahead we don't emit the quick check for the
// alternative that is expected to fail. This is because quick check code
// starts by loading enough characters for the alternative that takes fewest
// characters, but on a negative lookahead the negative branch did not take
// part in that calculation (EatsAtLeast) so the assumptions don't hold.
bool try_to_emit_quick_check_for_alternative(bool is_first) override {
return !is_first;
}
void Accept(NodeVisitor* visitor) override;
RegExpNode* FilterOneByte(int depth) override;
};
// A choice node for a loop. It has room for two alternatives and keeps
// separate pointers to the loop body (loop_node_) and to the continuation
// (continue_node_).
class LoopChoiceNode : public ChoiceNode {
public:
// Both node pointers start out as nullptr.
// NOTE(review): presumably AddLoopAlternative/AddContinueAlternative set
// them -- their definitions are not in this header.
LoopChoiceNode(bool body_can_be_zero_length, bool read_backward,
int min_loop_iterations, Zone* zone)
: ChoiceNode(2, zone),
loop_node_(nullptr),
continue_node_(nullptr),
body_can_be_zero_length_(body_can_be_zero_length),
read_backward_(read_backward),
traversed_loop_initialization_node_(false),
min_loop_iterations_(min_loop_iterations) {}
void AddLoopAlternative(GuardedAlternative alt);
void AddContinueAlternative(GuardedAlternative alt);
void Emit(RegExpCompiler* compiler, Trace* trace) override;
void GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler, int characters_filled_in,
bool not_at_start) override;
void GetQuickCheckDetailsFromLoopEntry(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in,
bool not_at_start) override;
void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) override;
EatsAtLeastInfo EatsAtLeastFromLoopEntry() override;
RegExpNode* loop_node() { return loop_node_; }
RegExpNode* continue_node() { return continue_node_; }
bool body_can_be_zero_length() { return body_can_be_zero_length_; }
int min_loop_iterations() const { return min_loop_iterations_; }
// Reports the direction given to the constructor.
bool read_backward() override { return read_backward_; }
void Accept(NodeVisitor* visitor) override;
RegExpNode* FilterOneByte(int depth) override;
private:
// AddAlternative is made private for loop nodes because alternatives
// should not be added freely, we need to keep track of which node
// goes back to the node itself.
void AddAlternative(GuardedAlternative node) {
ChoiceNode::AddAlternative(node);
}
RegExpNode* loop_node_;
RegExpNode* continue_node_;
bool body_can_be_zero_length_;
bool read_backward_;
// Temporary marker set only while generating quick check details. Represents
// whether GetQuickCheckDetails traversed the initialization node for this
// loop's counter. If so, we may be able to generate stricter quick checks
// because we know the loop node must match at least min_loop_iterations_
// times before the continuation node can match.
bool traversed_loop_initialization_node_;
// The minimum number of times the loop_node_ must match before the
// continue_node_ might be considered. This value can be temporarily decreased
// while generating quick check details, to represent the remaining iterations
// after the completed portion of the quick check details.
int min_loop_iterations_;
// Granted access to the private members above.
// NOTE(review): presumably these helpers adjust the two temporary fields
// described above -- confirm against their definitions.
friend class IterationDecrementer;
friend class LoopInitializationMarker;
};
// Abstract visitor over regexp nodes. FOR_EACH_NODE_TYPE applies
// DECLARE_VISIT once per node type, which declares a pure virtual
// Visit<Type>(<Type>Node* that) for each of them.
class NodeVisitor {
public:
virtual ~NodeVisitor() = default;
#define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that) = 0;
FOR_EACH_NODE_TYPE(DECLARE_VISIT)
// The helper macro is only needed for the expansion above.
#undef DECLARE_VISIT
};
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_NODES_H_

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,361 @@
// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_PARSER_H_
#define V8_REGEXP_REGEXP_PARSER_H_
#include "regexp/regexp-ast.h"
#include "regexp/regexp-error.h"
namespace v8 {
namespace internal {
struct RegExpCompileData;
// A BufferedZoneList is an automatically growing list, just like (and backed
// by) a ZoneList, that is optimized for the case of adding and removing
// a single element. The last element added is stored outside the backing list,
// and if no more than one element is ever added, the ZoneList isn't even
// allocated.
// Elements must not be nullptr pointers.
template <typename T, int initial_size>
class BufferedZoneList {
public:
BufferedZoneList() : list_(nullptr), last_(nullptr) {}
// Adds element at end of list. This element is buffered and can
// be read using last() or removed using RemoveLast until a new Add or until
// RemoveLast or GetList has been called.
void Add(T* value, Zone* zone) {
if (last_ != nullptr) {
if (list_ == nullptr) {
list_ = new (zone) ZoneList<T*>(initial_size, zone);
}
list_->Add(last_, zone);
}
last_ = value;
}
T* last() {
DCHECK(last_ != nullptr);
return last_;
}
T* RemoveLast() {
DCHECK(last_ != nullptr);
T* result = last_;
if ((list_ != nullptr) && (list_->length() > 0))
last_ = list_->RemoveLast();
else
last_ = nullptr;
return result;
}
T* Get(int i) {
DCHECK((0 <= i) && (i < length()));
if (list_ == nullptr) {
DCHECK_EQ(0, i);
return last_;
} else {
if (i == list_->length()) {
DCHECK(last_ != nullptr);
return last_;
} else {
return list_->at(i);
}
}
}
void Clear() {
list_ = nullptr;
last_ = nullptr;
}
int length() {
int length = (list_ == nullptr) ? 0 : list_->length();
return length + ((last_ == nullptr) ? 0 : 1);
}
ZoneList<T*>* GetList(Zone* zone) {
if (list_ == nullptr) {
list_ = new (zone) ZoneList<T*>(initial_size, zone);
}
if (last_ != nullptr) {
list_->Add(last_, zone);
last_ = nullptr;
}
return list_;
}
private:
ZoneList<T*>* list_;
T* last_;
};
// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
// Accumulates RegExp atoms and assertions into lists of terms and
// alternatives, and carries the flags currently in effect.
class RegExpBuilder : public ZoneObject {
public:
RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
// The Add* functions below are defined elsewhere; each takes one piece of
// the pattern being built.
void AddCharacter(uc16 character);
void AddUnicodeCharacter(uc32 character);
void AddEscapedUnicodeCharacter(uc32 character);
// "Adds" an empty expression. Does nothing except consume a
// following quantifier
void AddEmpty();
void AddCharacterClass(RegExpCharacterClass* cc);
void AddCharacterClassForDesugaring(uc32 c);
void AddAtom(RegExpTree* tree);
void AddTerm(RegExpTree* tree);
void AddAssertion(RegExpTree* tree);
void NewAlternative();  // '|'
bool AddQuantifierToAtom(int min, int max,
RegExpQuantifier::QuantifierType type);
void FlushText();
RegExpTree* ToRegExp();
// Current flags and convenience tests for individual flag bits.
JSRegExp::Flags flags() const { return flags_; }
void set_flags(JSRegExp::Flags flags) { flags_ = flags; }
bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; }
private:
// Value of pending_surrogate_ meaning that no surrogate is pending.
// NOTE(review): inferred from the names only -- confirm against the
// surrogate helpers below, which are defined elsewhere.
static const uc16 kNoPendingSurrogate = 0;
void AddLeadSurrogate(uc16 lead_surrogate);
void AddTrailSurrogate(uc16 trail_surrogate);
void FlushPendingSurrogate();
void FlushCharacters();
void FlushTerms();
bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
bool NeedsDesugaringForIgnoreCase(uc32 c);
Zone* zone() const { return zone_; }
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
Zone* zone_;
bool pending_empty_;
JSRegExp::Flags flags_;
ZoneList<uc16>* characters_;
uc16 pending_surrogate_;
BufferedZoneList<RegExpTree, 2> terms_;
BufferedZoneList<RegExpTree, 2> text_;
BufferedZoneList<RegExpTree, 2> alternatives_;
// In DEBUG builds the kind of the most recent addition is recorded through
// the LAST(x) macro; in other builds LAST(x) expands to nothing. The macro
// is defined here and stays defined after the class.
#ifdef DEBUG
enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
#define LAST(x) last_added_ = x;
#else
#define LAST(x)
#endif
};
class V8_EXPORT_PRIVATE RegExpParser {
public:
RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate,
Zone* zone);
static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
JSRegExp::Flags flags, RegExpCompileData* result);
RegExpTree* ParsePattern();
RegExpTree* ParseDisjunction();
RegExpTree* ParseGroup();
// Parses a {...,...} quantifier and stores the range in the given
// out parameters.
bool ParseIntervalQuantifier(int* min_out, int* max_out);
// Parses and returns a single escaped character. The character
// must not be 'b' or 'B' since they are usually handle specially.
uc32 ParseClassCharacterEscape();
// Checks whether the following is a length-digit hexadecimal number,
// and sets the value if it is.
bool ParseHexEscape(int length, uc32* value);
bool ParseUnicodeEscape(uc32* value);
bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
bool ParsePropertyClassName(ZoneVector<char>* name_1,
ZoneVector<char>* name_2);
bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
const ZoneVector<char>& name_1,
const ZoneVector<char>& name_2);
RegExpTree* GetPropertySequence(const ZoneVector<char>& name_1);
RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
uc32 ParseOctalLiteral();
// Tries to parse the input as a back reference. If successful it
// stores the result in the output parameter and returns true. If
// it fails it will push back the characters read so the same characters
// can be reparsed.
bool ParseBackReferenceIndex(int* index_out);
// Parse inside a class. Either add escaped class to the range, or return
// false and pass parsed single character through |char_out|.
void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
bool add_unicode_case_equivalents, uc32* char_out,
bool* is_class_escape);
char ParseClassEscape();
RegExpTree* ReportError(RegExpError error);
void Advance();
void Advance(int dist);
void Reset(int pos);
// Reports whether the pattern might be used as a literal search string.
// Only use if the result of the parse is a single atom node.
bool simple();
bool contains_anchor() { return contains_anchor_; }
void set_contains_anchor() { contains_anchor_ = true; }
int captures_started() { return captures_started_; }
int position() { return next_pos_ - 1; }
bool failed() { return failed_; }
// The Unicode flag can't be changed using in-regexp syntax, so it's OK to
// just read the initial flag value here.
bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; }
static bool IsSyntaxCharacterOrSlash(uc32 c);
static const uc32 kEndMarker = (1 << 21);
private:
enum SubexpressionType {
INITIAL,
CAPTURE, // All positive values represent captures.
POSITIVE_LOOKAROUND,
NEGATIVE_LOOKAROUND,
GROUPING
};
class RegExpParserState : public ZoneObject {
public:
// Push a state on the stack.
RegExpParserState(RegExpParserState* previous_state,
SubexpressionType group_type,
RegExpLookaround::Type lookaround_type,
int disjunction_capture_index,
const ZoneVector<uc16>* capture_name,
JSRegExp::Flags flags, Zone* zone)
: previous_state_(previous_state),
builder_(new (zone) RegExpBuilder(zone, flags)),
group_type_(group_type),
lookaround_type_(lookaround_type),
disjunction_capture_index_(disjunction_capture_index),
capture_name_(capture_name) {}
// Parser state of containing expression, if any.
RegExpParserState* previous_state() const { return previous_state_; }
bool IsSubexpression() { return previous_state_ != nullptr; }
// RegExpBuilder building this regexp's AST.
RegExpBuilder* builder() const { return builder_; }
// Type of regexp being parsed (parenthesized group or entire regexp).
SubexpressionType group_type() const { return group_type_; }
// Lookahead or Lookbehind.
RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
// Index in captures array of first capture in this sub-expression, if any.
// Also the capture index of this sub-expression itself, if group_type
// is CAPTURE.
int capture_index() const { return disjunction_capture_index_; }
// The name of the current sub-expression, if group_type is CAPTURE. Only
// used for named captures.
const ZoneVector<uc16>* capture_name() const { return capture_name_; }
bool IsNamedCapture() const { return capture_name_ != nullptr; }
// Check whether the parser is inside a capture group with the given index.
bool IsInsideCaptureGroup(int index);
// Check whether the parser is inside a capture group with the given name.
bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
private:
// Linked list implementation of stack of states.
RegExpParserState* const previous_state_;
// Builder for the stored disjunction.
RegExpBuilder* const builder_;
// Stored disjunction type (capture, look-ahead or grouping), if any.
const SubexpressionType group_type_;
// Stored read direction.
const RegExpLookaround::Type lookaround_type_;
// Stored disjunction's capture index (if any).
const int disjunction_capture_index_;
// Stored capture name (if any).
const ZoneVector<uc16>* const capture_name_;
};
// Return the 1-indexed RegExpCapture object, allocate if necessary.
RegExpCapture* GetCapture(int index);
// Creates a new named capture at the specified index. Must be called exactly
// once for each named capture. Fails if a capture with the same name is
// encountered.
bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
// Parses the name of a capture group (?<name>pattern). The name must adhere
// to IdentifierName in the ECMAScript standard.
const ZoneVector<uc16>* ParseCaptureGroupName();
bool ParseNamedBackReference(RegExpBuilder* builder,
RegExpParserState* state);
RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
// After the initial parsing pass, patch corresponding RegExpCapture objects
// into all RegExpBackReferences. This is done after initial parsing in order
// to avoid complicating cases in which references come before the capture.
void PatchNamedBackReferences();
Handle<FixedArray> CreateCaptureNameMap();
// Returns true iff the pattern contains named captures. May call
// ScanForCaptures to look ahead at the remaining pattern.
bool HasNamedCaptures();
// Trivial inline accessors over the parser's stored state.
// Returns the stored isolate_ pointer.
Isolate* isolate() { return isolate_; }
// Returns the stored zone_ pointer.
Zone* zone() const { return zone_; }
// Returns the cached current_ value.
uc32 current() { return current_; }
// Returns the cached has_more_ flag.
bool has_more() { return has_more_; }
// True while next_pos_ is still below the length reported by the input reader.
bool has_next() { return next_pos_ < in()->length(); }
uc32 Next();
template <bool update_position>
uc32 ReadNext();
FlatStringReader* in() { return in_; }
void ScanForCaptures();
// Comparator over RegExpCapture pointers: orders two captures by applying
// operator< to the objects returned by name(). Both pointers must be non-null.
// Used as the ordering of the named_captures_ set.
struct RegExpCaptureNameLess {
  bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
    DCHECK_NOT_NULL(lhs);
    DCHECK_NOT_NULL(rhs);
    const auto& lhs_name = *lhs->name();
    const auto& rhs_name = *rhs->name();
    return lhs_name < rhs_name;
  }
};
Isolate* isolate_;
Zone* zone_;
RegExpError error_ = RegExpError::kNone;
int error_pos_ = 0;
ZoneList<RegExpCapture*>* captures_;
ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
ZoneList<RegExpBackReference*>* named_back_references_;
FlatStringReader* in_;
uc32 current_;
// These are the flags specified outside the regexp syntax ie after the
// terminating '/' or in the second argument to the constructor. The current
// flags are stored on the RegExpBuilder.
JSRegExp::Flags top_level_flags_;
int next_pos_;
int captures_started_;
int capture_count_; // Only valid after we have scanned for captures.
bool has_more_;
bool simple_;
bool contains_anchor_;
bool is_scanned_for_captures_;
bool has_named_captures_; // Only valid after we have scanned for captures.
bool failed_;
};
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_PARSER_H_

View File

@ -0,0 +1,212 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
* vim: set ts=8 sts=2 et sw=2 tw=80:
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <iostream>
#include "regexp/regexp-shim.h"
#include "regexp/regexp-stack.h"
namespace v8 {
namespace internal {
// printf-style helper: formats the variadic arguments according to |format|
// and writes the result to stdout.
void PrintF(const char* format, ...) {
  va_list args;
  va_start(args, format);
  // vprintf(format, args) is defined as vfprintf(stdout, format, args).
  vfprintf(stdout, format, args);
  va_end(args);
}
// printf-style helper: formats the variadic arguments according to |format|
// and writes the result to the stream |out|.
void PrintF(FILE* out, const char* format, ...) {
  va_list args;
  va_start(args, format);
  vfprintf(out, format, args);
  va_end(args);
}
// Note: despite the class name, this shim routes all output to std::cerr.
StdoutStream::operator std::ostream&() const {
  return std::cerr;
}

// Streams |value| into std::cerr and hands that stream back so further
// insertions can be chained.
template <typename T>
std::ostream& StdoutStream::operator<<(T value) {
  return std::cerr << value;
}

// The template body lives in this file, so it is explicitly instantiated here
// for `char const*` arguments.
template std::ostream& StdoutStream::operator<<(char const* c);
// Origin:
// https://github.com/v8/v8/blob/855591a54d160303349a5f0a32fab15825c708d1/src/utils/ostreams.cc#L120-L169
// (This is a hand-simplified version.)
// Writes the given character to the output escaping everything outside
// of printable ASCII range.
std::ostream& operator<<(std::ostream& os, const AsUC16& c) {
uc16 v = c.value;
bool isPrint = 0x20 < v && v <= 0x7e;
char buf[10];
const char* format = isPrint ? "%c" : (v <= 0xFF) ? "\\x%02x" : "\\u%04x";
SprintfLiteral(buf, format, v);
return os << buf;
}
// Writes one code point to |os|. Values above String::kMaxUtf16CodeUnit are
// escaped as \u{NNNNNN}; all other values are delegated to the AsUC16 printer.
std::ostream& operator<<(std::ostream& os, const AsUC32& c) {
  const int32_t code_point = c.value;
  if (code_point > String::kMaxUtf16CodeUnit) {
    char buf[13];
    SprintfLiteral(buf, "\\u{%06x}", code_point);
    return os << buf;
  }
  return os << AsUC16(code_point);
}
// Remembers |isolate| and asks it to open this scope.
// NOTE(review): openHandleScope presumably fills in level_ / non_gc_level_,
// which the destructor passes back — confirm in regexp-shim.h.
HandleScope::HandleScope(Isolate* isolate) : isolate_(isolate) {
  isolate->openHandleScope(*this);
}

// Asks the isolate to close the scope, passing the two levels stored on it.
HandleScope::~HandleScope() {
  isolate_->closeHandleScope(level_, non_gc_level_);
}
// Builds a handle from a shim object: wraps |object| in a JS::Value, stores
// that value through Isolate::getHandleLocation, and keeps the returned slot
// address in location_.
template <typename T>
Handle<T>::Handle(T object, Isolate* isolate)
: location_(isolate->getHandleLocation(JS::Value(object))) {}
// Explicit instantiations of the constructors defined in this file
// (presumably the specializations other translation units need).
template Handle<ByteArray>::Handle(ByteArray b, Isolate* isolate);
// Note: this line instantiates the JS::Value overload that is defined below,
// not the T overload above.
template Handle<HeapObject>::Handle(JS::Value v, Isolate* isolate);
template Handle<JSRegExp>::Handle(JSRegExp re, Isolate* isolate);
template Handle<String>::Handle(String s, Isolate* isolate);
// Builds a handle directly from a JS::Value: stores |value| through
// Isolate::getHandleLocation and then calls T::cast on it as a type check.
template <typename T>
Handle<T>::Handle(JS::Value value, Isolate* isolate)
: location_(isolate->getHandleLocation(value)) {
T::cast(Object(value)); // Assert that value has the correct type.
}
// Appends |value| to the handle arena and returns the address of the newly
// appended element. A failed append is treated as fatal (OOM crash).
JS::Value* Isolate::getHandleLocation(JS::Value value) {
  js::AutoEnterOOMUnsafeRegion oomUnsafe;
  if (!handleArena_.Append(value)) {
    oomUnsafe.crash("Irregexp handle allocation");
  }
  JS::Value& slot = handleArena_.GetLast();
  return &slot;
}
void* Isolate::allocatePseudoHandle(size_t bytes) {
PseudoHandle<void> ptr;
ptr.reset(js_malloc(bytes));
if (!ptr) {
return nullptr;
}
if (!uniquePtrArena_.Append(std::move(ptr))) {
return nullptr;
}
return uniquePtrArena_.GetLast().get();
}
// Finds the arena entry that currently owns |ptr|, releases it from the arena
// and returns it as a PseudoHandle<T>. The arena is scanned from the most
// recently appended entry backwards. Crashes if no entry owns |ptr|.
template <typename T>
PseudoHandle<T> Isolate::takeOwnership(void* ptr) {
  for (auto iter = uniquePtrArena_.IterFromLast(); !iter.Done(); iter.Prev()) {
    auto& candidate = iter.Get();
    if (candidate.get() != ptr) {
      continue;
    }
    PseudoHandle<T> owned;
    owned.reset(static_cast<T*>(candidate.release()));
    return owned;
  }
  MOZ_CRASH("Tried to take ownership of pseudohandle that is not in the arena");
}
// Moves ownership of the backing ByteArrayData out of the isolate's arena and
// into the returned handle, then resets this ByteArray's stored value to a
// null private pointer.
PseudoHandle<ByteArrayData> ByteArray::takeOwnership(Isolate* isolate) {
  void* raw = value_.toPrivate();
  PseudoHandle<ByteArrayData> owned =
      isolate->takeOwnership<ByteArrayData>(raw);
  value_ = JS::PrivateValue(nullptr);
  return owned;
}
// Traces every JS::Value stored in the handle arena. Asserts that it is
// called during the root marking phase.
void Isolate::trace(JSTracer* trc) {
  js::gc::AssertRootMarkingPhase(trc);
  auto iter = handleArena_.Iter();
  while (!iter.Done()) {
    auto& slot = iter.Get();
    JS::GCPolicy<JS::Value>::trace(trc, &slot, "Isolate handle arena");
    iter.Next();
  }
}
// Returns a flat version of |string|. A string that is already flat is
// returned unchanged; otherwise the underlying JSString is linearized and a
// new handle to the linear string is returned. Failure to linearize is
// treated as fatal (OOM crash).
/*static*/ Handle<String> String::Flatten(Isolate* isolate,
                                          Handle<String> string) {
  if (!string->IsFlat()) {
    js::AutoEnterOOMUnsafeRegion oomUnsafe;
    JSLinearString* linear = string->str()->ensureLinear(isolate->cx());
    if (!linear) {
      oomUnsafe.crash("Irregexp String::Flatten");
    }
    return Handle<String>(JS::StringValue(linear), isolate);
  }
  return string;
}
// This is only used for trace messages printing the source of a
// regular expression. To keep things simple, nothing is produced: the
// returned unique_ptr holds no buffer (it is null, not a pointer to "").
// NOTE(review): callers must tolerate get() == nullptr — confirm at use sites.
std::unique_ptr<char[]> String::ToCString() {
  return nullptr;
}
// Returns, as a byte pointer, the address that
// RegExpStack::memory_top_address_address() reports for the isolate's stack.
byte* Isolate::top_of_regexp_stack() const {
  const Address slot = regexpStack_->memory_top_address_address();
  return reinterpret_cast<byte*>(slot);
}
// Allocates a ByteArrayData with room for |length| payload bytes, records the
// length in it, and returns a handle wrapping the allocation as a private
// value. |length| must be non-negative; allocation failure is fatal.
// |alloc| is not used by this implementation.
Handle<ByteArray> Isolate::NewByteArray(int length, AllocationType alloc) {
  MOZ_RELEASE_ASSERT(length >= 0);
  js::AutoEnterOOMUnsafeRegion oomUnsafe;
  // sizeof(uint32_t) extra bytes precede the payload (presumably the length
  // field of ByteArrayData — confirm against its declaration).
  const size_t total_bytes = sizeof(uint32_t) + length;
  void* raw = allocatePseudoHandle(total_bytes);
  ByteArrayData* data = static_cast<ByteArrayData*>(raw);
  if (!data) {
    oomUnsafe.crash("Irregexp NewByteArray");
  }
  data->length = length;
  return Handle<ByteArray>(JS::PrivateValue(data), this);
}
// Not implemented yet: after checking that |length| is non-negative, this
// unconditionally crashes with the message "TODO" and never returns a handle.
Handle<FixedArray> Isolate::NewFixedArray(int length) {
MOZ_RELEASE_ASSERT(length >= 0);
MOZ_CRASH("TODO");
}
// Atomizes the characters of |str| via js::AtomizeChars and returns a handle
// to the resulting atom. Failure to atomize is treated as fatal (OOM crash).
template <typename CharT>
Handle<String> Isolate::InternalizeString(const Vector<const CharT>& str) {
  js::AutoEnterOOMUnsafeRegion oomUnsafe;
  JSAtom* atom = js::AtomizeChars(cx(), str.begin(), str.length());
  if (atom) {
    return Handle<String>(JS::StringValue(atom), this);
  }
  oomUnsafe.crash("Irregexp InternalizeString");
}

// Explicit instantiations for the two character widths defined in this file.
template Handle<String>
Isolate::InternalizeString(const Vector<const uint8_t>& str);
template Handle<String>
Isolate::InternalizeString(const Vector<const char16_t>& str);
// Definitions of the V8-style FLAG_* globals consulted by the imported regexp
// code (for example, RegExp::CanGenerateBytecode reads
// FLAG_regexp_interpret_all and FLAG_regexp_tier_up). They are plain mutable
// bools initialised to fixed defaults here.
// TODO: Map flags to jitoptions
bool FLAG_correctness_fuzzer_suppressions = false;
bool FLAG_enable_regexp_unaligned_accesses = false;
bool FLAG_harmony_regexp_sequence = false;
bool FLAG_regexp_interpret_all = false;
bool FLAG_regexp_mode_modifiers = false;
// The two optimization flags are the only ones that default to true.
bool FLAG_regexp_optimization = true;
bool FLAG_regexp_peephole_optimization = true;
bool FLAG_regexp_possessive_quantifier = false;
bool FLAG_regexp_tier_up = false;
// Tracing flags, all off by default.
bool FLAG_trace_regexp_assembler = false;
bool FLAG_trace_regexp_bytecodes = false;
bool FLAG_trace_regexp_parser = false;
bool FLAG_trace_regexp_peephole_optimization = false;
} // namespace internal
} // namespace v8

1174
js/src/regexp/regexp-shim.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,97 @@
// Copyright 2009 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "regexp/regexp-stack.h"
namespace v8 {
namespace internal {
// Grabs the isolate's RegExpStack for the lifetime of this scope.
RegExpStackScope::RegExpStackScope(Isolate* isolate)
    : regexp_stack_(isolate->regexp_stack()) {
  // Initialize, if not already initialized. EnsureCapacity raises a request
  // of 0 to its minimum dynamic size.
  regexp_stack_->EnsureCapacity(0);
}

RegExpStackScope::~RegExpStackScope() {
  // Reset the buffer if it has grown.
  regexp_stack_->Reset();
}
// Starts out on the static stack (ThreadLocal's constructor resets to it) and
// with no isolate attached.
RegExpStack::RegExpStack()
    : thread_local_(this),
      isolate_(nullptr) {}

// Frees any dynamically allocated stack and invalidates the thread-local
// state.
RegExpStack::~RegExpStack() {
  thread_local_.FreeAndInvalidate();
}
// Copies the thread-local stack state into the buffer at |to|, resets this
// object's state to a fresh static stack, and returns the address just past
// the bytes written (sizeof(ThreadLocal) of them).
char* RegExpStack::ArchiveStack(char* to) {
  const bool using_static_stack = !thread_local_.owns_memory_;
  if (using_static_stack) {
    // Force dynamic stacks prior to archiving. Any growth will do. A dynamic
    // stack is needed because stack archival & restoration rely on `memory_`
    // pointing at a fixed-location backing store, whereas the static stack is
    // tied to a RegExpStack instance.
    EnsureCapacity(thread_local_.memory_size_ + 1);
    DCHECK(thread_local_.owns_memory_);
  }

  const size_t archive_bytes = sizeof(thread_local_);
  MemCopy(reinterpret_cast<void*>(to), &thread_local_, archive_bytes);
  thread_local_ = ThreadLocal(this);
  return to + archive_bytes;
}
// Overwrites the thread-local stack state with the bytes archived at |from|
// and returns the address just past the bytes consumed.
char* RegExpStack::RestoreStack(char* from) {
  const size_t archive_bytes = sizeof(thread_local_);
  MemCopy(&thread_local_, reinterpret_cast<void*>(from), archive_bytes);
  return from + archive_bytes;
}
// Releases any dynamically allocated stack and switches back to the static
// stack embedded in this object.
void RegExpStack::Reset() {
  thread_local_.ResetToStaticStack(this);
}
// Frees the current buffer if it is owned, then points every field at
// |regexp_stack|'s embedded static stack and marks the memory as not owned.
void RegExpStack::ThreadLocal::ResetToStaticStack(RegExpStack* regexp_stack) {
  if (owns_memory_) {
    DeleteArray(memory_);
  }

  byte* const static_base = regexp_stack->static_stack_;
  memory_ = static_base;
  memory_top_ = static_base + kStaticStackSize;
  memory_size_ = kStaticStackSize;
  // The limit sits kStackLimitSlack pointer-sized slots above the base.
  limit_ = reinterpret_cast<Address>(static_base) +
           kStackLimitSlack * kSystemPointerSize;
  owns_memory_ = false;
}
// Frees the current buffer if it is owned and leaves the state pointing at
// nothing. Note that owns_memory_ is left as it was.
void RegExpStack::ThreadLocal::FreeAndInvalidate() {
  if (owns_memory_) {
    DeleteArray(memory_);
  }

  // This stack may not be used after being freed. Just reset to invalid values
  // to ensure we don't accidentally use old memory areas.
  memory_size_ = 0;
  memory_top_ = nullptr;
  memory_ = nullptr;
  limit_ = kMemoryTop;
}
// Makes sure the stack has at least |size| bytes, growing it into a newly
// allocated buffer if needed, and returns the address of the top of the stack
// memory. Requests above kMaximumStackSize are refused with kNullAddress;
// requests below kMinimumDynamicStackSize are rounded up to it.
Address RegExpStack::EnsureCapacity(size_t size) {
  if (size > kMaximumStackSize) {
    return kNullAddress;
  }
  if (size < kMinimumDynamicStackSize) {
    size = kMinimumDynamicStackSize;
  }

  ThreadLocal& state = thread_local_;
  if (state.memory_size_ >= size) {
    // Already large enough: nothing to do.
    return reinterpret_cast<Address>(state.memory_top_);
  }

  byte* const grown = NewArray<byte>(size);
  if (state.memory_size_ > 0) {
    // Copy original memory into top of new memory.
    MemCopy(grown + size - state.memory_size_, state.memory_,
            state.memory_size_);
    if (state.owns_memory_) {
      DeleteArray(state.memory_);
    }
  }
  state.memory_ = grown;
  state.memory_top_ = grown + size;
  state.memory_size_ = size;
  state.limit_ = reinterpret_cast<Address>(grown) +
                 kStackLimitSlack * kSystemPointerSize;
  state.owns_memory_ = true;
  return reinterpret_cast<Address>(state.memory_top_);
}
} // namespace internal
} // namespace v8

View File

@ -0,0 +1,141 @@
// Copyright 2009 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_STACK_H_
#define V8_REGEXP_REGEXP_STACK_H_
#include "regexp/regexp-shim.h"
namespace v8 {
namespace internal {
class RegExpStack;
// Maintains a per-v8thread stack area that can be used by irregexp
// implementation for its backtracking stack.
// Since there is only one stack area, the Irregexp implementation is not
// re-entrant. I.e., no regular expressions may be executed in the same thread
// during a preempted Irregexp execution.
// Scope object tied to an isolate's RegExpStack: construction makes sure the
// stack memory is set up, destruction releases it if it has grown.
// Copying and assignment are disallowed.
class RegExpStackScope {
public:
// Create and delete an instance to control the life-time of a growing stack.
// Initializes the stack memory area if necessary.
explicit RegExpStackScope(Isolate* isolate);
~RegExpStackScope(); // Releases the stack if it has grown.
// The stack this scope controls.
RegExpStack* stack() const { return regexp_stack_; }
private:
RegExpStack* regexp_stack_;
DISALLOW_COPY_AND_ASSIGN(RegExpStackScope);
};
// Backtracking stack storage for irregexp. The stack lives either in the
// embedded static_stack_ array or in a dynamically allocated buffer; the
// current location, size and limit are tracked in thread_local_.
class RegExpStack {
public:
RegExpStack();
~RegExpStack();
// Number of allocated locations on the stack below the limit.
// No sequence of pushes must be longer than this without doing a stack-limit
// check.
static constexpr int kStackLimitSlack = 32;
// Gives the top of the memory used as stack.
Address stack_base() {
DCHECK_NE(0, thread_local_.memory_size_);
DCHECK_EQ(thread_local_.memory_top_,
thread_local_.memory_ + thread_local_.memory_size_);
return reinterpret_cast<Address>(thread_local_.memory_top_);
}
// The total size of the memory allocated for the stack.
size_t stack_capacity() { return thread_local_.memory_size_; }
// If the stack pointer gets below the limit, we should react and
// either grow the stack or report an out-of-stack exception.
// There is only a limited number of locations below the stack limit,
// so users of the stack should check the stack limit during any
// sequence of pushes longer than this.
Address* limit_address_address() { return &(thread_local_.limit_); }
// Ensures that there is a memory area with at least the specified size.
// If passing zero, the default/minimum size buffer is allocated.
Address EnsureCapacity(size_t size);
// Thread local archiving.
// Number of bytes ArchiveStack writes and RestoreStack reads.
static constexpr int ArchiveSpacePerThread() {
return static_cast<int>(sizeof(ThreadLocal));
}
char* ArchiveStack(char* to);
char* RestoreStack(char* from);
// Drops any owned buffer and returns to the static stack.
void FreeThreadResources() { thread_local_.ResetToStaticStack(this); }
// Maximal size of allocated stack area.
static constexpr size_t kMaximumStackSize = 64 * MB;
private:
// Artificial limit used when the thread-local state has been destroyed.
static const Address kMemoryTop =
static_cast<Address>(static_cast<uintptr_t>(-1));
// Minimal size of dynamically-allocated stack area.
static constexpr size_t kMinimumDynamicStackSize = 1 * KB;
// In addition to dynamically-allocated, variable-sized stacks, we also have
// a statically allocated and sized area that is used whenever no dynamic
// stack is allocated. This guarantees that a stack is always available and
// we can skip availability-checks later on.
// It's double the slack size to ensure that we have a bit of breathing room
// before NativeRegExpMacroAssembler::GrowStack must be called.
static constexpr size_t kStaticStackSize =
2 * kStackLimitSlack * kSystemPointerSize;
byte static_stack_[kStaticStackSize] = {0};
STATIC_ASSERT(kStaticStackSize <= kMaximumStackSize);
// Structure holding the allocated memory, size and limit.
struct ThreadLocal {
// Starts out pointing at |regexp_stack|'s static stack.
explicit ThreadLocal(RegExpStack* regexp_stack) {
ResetToStaticStack(regexp_stack);
}
// If memory_size_ > 0 then memory_ and memory_top_ must be non-nullptr
// and memory_top_ = memory_ + memory_size_
byte* memory_ = nullptr;
byte* memory_top_ = nullptr;
size_t memory_size_ = 0;
Address limit_ = kNullAddress;
bool owns_memory_ = false; // Whether memory_ is owned and must be freed.
void ResetToStaticStack(RegExpStack* regexp_stack);
void FreeAndInvalidate();
};
// Address of top of memory used as stack.
// (Returns the address of the memory_top_ field itself, as an Address.)
Address memory_top_address_address() {
return reinterpret_cast<Address>(&thread_local_.memory_top_);
}
// Resets the buffer if it has grown beyond the default/minimum size.
// After this, the buffer is either the default size, or it is empty, so
// you have to call EnsureCapacity before using it again.
// NOTE(review): the definition visible in regexp-stack.cc simply calls
// ThreadLocal::ResetToStaticStack, i.e. it always returns to the static
// stack; the "or it is empty" wording above may be stale — confirm.
void Reset();
ThreadLocal thread_local_;
Isolate* isolate_;
friend class ExternalReference;
friend class Isolate;
friend class RegExpStackScope;
DISALLOW_COPY_AND_ASSIGN(RegExpStack);
};
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_STACK_H_

195
js/src/regexp/regexp.h Normal file
View File

@ -0,0 +1,195 @@
// Copyright 2012 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_REGEXP_H_
#define V8_REGEXP_REGEXP_H_
#include "regexp/regexp-error.h"
#include "regexp/regexp-shim.h"
namespace v8 {
namespace internal {
class RegExpNode;
class RegExpTree;
// What the regexp compiler is asked to produce: bytecode or native code.
enum class RegExpCompilationTarget : int { kBytecode, kNative };
// Plain data bundle carrying the inputs and outputs of parsing and compiling
// one regular expression.
// TODO(jgruber): Do not expose in regexp.h.
// TODO(jgruber): Consider splitting between ParseData and CompileData.
struct RegExpCompileData {
// The parsed AST as produced by the RegExpParser.
RegExpTree* tree = nullptr;
// The compiled Node graph as produced by RegExpTree::ToNode methods.
RegExpNode* node = nullptr;
// Either the generated code as produced by the compiler or a trampoline
// to the interpreter.
Object code;
// True, iff the pattern is a 'simple' atom with zero captures. In other
// words, the pattern consists of a string with no metacharacters and special
// regexp features, and can be implemented as a standard string search.
bool simple = true;
// True, iff the pattern is anchored at the start of the string with '^'.
bool contains_anchor = false;
// Only use if the pattern contains named captures. If so, this contains a
// mapping of capture names to capture indices.
Handle<FixedArray> capture_name_map;
// The error message. Only used if an error occurred during parsing or
// compilation.
RegExpError error = RegExpError::kNone;
// The position at which the error was detected. Only used if an
// error occurred.
int error_pos = 0;
// The number of capture groups, without the global capture \0.
int capture_count = 0;
// The number of registers used by the generated code.
int register_count = 0;
// The compilation target (bytecode or native code).
// Note: unlike the fields above, this one has no default member initializer.
RegExpCompilationTarget compilation_target;
};
// Static-only entry points of the regexp engine (derives from AllStatic).
class RegExp final : public AllStatic {
public:
// Whether the irregexp engine generates interpreter bytecode.
// True when either FLAG_regexp_interpret_all or FLAG_regexp_tier_up is set.
static bool CanGenerateBytecode() {
return FLAG_regexp_interpret_all || FLAG_regexp_tier_up;
}
// Parses the RegExp pattern and prepares the JSRegExp object with
// generic data and choice of implementation - as well as what
// the implementation wants to store in the data field.
// Returns false if compilation fails.
// NOTE(review): the declared return type is MaybeHandle<Object>, so
// "returns false" presumably means an empty handle — confirm.
V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile(
Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
JSRegExp::Flags flags, uint32_t backtrack_limit);
// Identifies where a call came from: the runtime or JS code.
enum CallOrigin : int {
kFromRuntime = 0,
kFromJs = 1,
};
// See ECMA-262 section 15.10.6.2.
// This function calls the garbage collector if necessary.
V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Exec(
Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
int index, Handle<RegExpMatchInfo> last_match_info);
// Integral return values used throughout regexp code layers.
static constexpr int kInternalRegExpFailure = 0;
static constexpr int kInternalRegExpSuccess = 1;
static constexpr int kInternalRegExpException = -1;
static constexpr int kInternalRegExpRetry = -2;
// Enum view of the integral results above; kInternalRegExpRetry has no
// enumerator here.
enum IrregexpResult : int32_t {
RE_FAILURE = kInternalRegExpFailure,
RE_SUCCESS = kInternalRegExpSuccess,
RE_EXCEPTION = kInternalRegExpException,
};
// Prepare a RegExp for being executed one or more times (using
// IrregexpExecOnce) on the subject.
// This ensures that the regexp is compiled for the subject, and that
// the subject is flat.
// Returns the number of integer spaces required by IrregexpExecOnce
// as its "registers" argument. If the regexp cannot be compiled,
// an exception is set as pending, and this function returns negative.
static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
Handle<String> subject);
// Set last match info. If match is nullptr, then setting captures is
// omitted.
static Handle<RegExpMatchInfo> SetLastMatchInfo(
Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
Handle<String> subject, int capture_count, int32_t* match);
// Test-only entry points.
V8_EXPORT_PRIVATE static bool CompileForTesting(Isolate* isolate, Zone* zone,
RegExpCompileData* input,
JSRegExp::Flags flags,
Handle<String> pattern,
Handle<String> sample_subject,
bool is_one_byte);
V8_EXPORT_PRIVATE static void DotPrintForTesting(const char* label,
RegExpNode* node);
// Size threshold of 20 KB; judging by the name, regexps above it are not
// optimized (the use site is not visible here).
static const int kRegExpTooLargeToOptimize = 20 * KB;
};
// Uses a special global mode of irregexp-generated code to perform a global
// search and return multiple results at once. As such, this is essentially an
// iterator over multiple results (retrieved batch-wise in advance).
class RegExpGlobalCache final {
public:
RegExpGlobalCache(Handle<JSRegExp> regexp, Handle<String> subject,
Isolate* isolate);
~RegExpGlobalCache();
// Fetch the next entry in the cache for global regexp match results.
// This does not set the last match info. Upon failure, nullptr is
// returned. The cause can be checked with HasException() (this class
// declares no Result() method). The previous result is still available in
// memory when a failure happens.
int32_t* FetchNext();
int32_t* LastSuccessfulMatch();
// A negative num_matches_ signals that an exception occurred.
bool HasException() { return num_matches_ < 0; }
private:
int AdvanceZeroLength(int last_index);
int num_matches_;
int max_matches_;
int current_match_index_;
int registers_per_match_;
// Pointer to the last set of captures.
int32_t* register_array_;
int register_array_size_;
Handle<JSRegExp> regexp_;
Handle<String> subject_;
Isolate* isolate_;
};
// Caches results for specific regexp queries on the isolate. At the time of
// writing, this is used during global calls to RegExp.prototype.exec and
// @@split.
class RegExpResultsCache final : public AllStatic {
public:
// Selects which of the two result caches an operation applies to.
enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
// Attempt to retrieve a cached result. On failure, 0 is returned as a Smi.
// On success, the returned result is guaranteed to be a COW-array.
static Object Lookup(Heap* heap, String key_string, Object key_pattern,
FixedArray* last_match_out, ResultsCacheType type);
// Attempt to add value_array to the cache specified by type. On success,
// value_array is turned into a COW-array.
static void Enter(Isolate* isolate, Handle<String> key_string,
Handle<Object> key_pattern, Handle<FixedArray> value_array,
Handle<FixedArray> last_match_cache, ResultsCacheType type);
static void Clear(FixedArray cache);
static constexpr int kRegExpResultsCacheSize = 0x100;
private:
// Presumably the slot offsets of one cache entry, which spans
// kArrayEntriesPerCacheEntry slots — confirm against the implementation.
static constexpr int kStringOffset = 0;
static constexpr int kPatternOffset = 1;
static constexpr int kArrayOffset = 2;
static constexpr int kLastMatchOffset = 3;
static constexpr int kArrayEntriesPerCacheEntry = 4;
};
} // namespace internal
} // namespace v8
#endif // V8_REGEXP_REGEXP_H_

View File

@ -0,0 +1,88 @@
// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that
// can be found in the LICENSE file.
// Automatically generated by regexp/gen-regexp-special-case.cc
// The following functions are used to build UnicodeSets
// for special cases where the case-folding algorithm used by
// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match
// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime
// Semantics: Canonicalize) step 3.
#ifdef V8_INTL_SUPPORT
#include "regexp/special-case.h"
#include "unicode/uniset.h"
namespace v8 {
namespace internal {
// Builds and freezes the set of code points returned by
// RegExpCaseFolding::IgnoreSet(). The contents are listed as inclusive
// [first, last] ranges; a single code point has first == last.
icu::UnicodeSet BuildIgnoreSet() {
  struct CodePointRange {
    UChar32 first;
    UChar32 last;
  };
  const CodePointRange kIgnored[] = {
      {0xdf, 0xdf},     {0x17f, 0x17f},   {0x390, 0x390},   {0x3b0, 0x3b0},
      {0x3f4, 0x3f4},   {0x1e9e, 0x1e9e}, {0x1f80, 0x1faf}, {0x1fb3, 0x1fb3},
      {0x1fbc, 0x1fbc}, {0x1fc3, 0x1fc3}, {0x1fcc, 0x1fcc}, {0x1fd3, 0x1fd3},
      {0x1fe3, 0x1fe3}, {0x1ff3, 0x1ff3}, {0x1ffc, 0x1ffc}, {0x2126, 0x2126},
      {0x212a, 0x212b}, {0xfb05, 0xfb06},
  };

  icu::UnicodeSet set;
  for (const CodePointRange& range : kIgnored) {
    if (range.first == range.last) {
      set.add(range.first);
    } else {
      set.add(range.first, range.last);
    }
  }
  set.freeze();
  return set;
}
// Holder whose constructor builds the (frozen) ignore set once and keeps it
// as a const member.
struct IgnoreSetData {
IgnoreSetData() : set(BuildIgnoreSet()) {}
const icu::UnicodeSet set;
};
// Returns a reference to the shared ignore set held in a function-local
// base::LazyInstance (presumably constructed on the first Pointer() call —
// confirm against the LazyInstance implementation).
//static
const icu::UnicodeSet& RegExpCaseFolding::IgnoreSet() {
static base::LazyInstance<IgnoreSetData>::type set =
LAZY_INSTANCE_INITIALIZER;
return set.Pointer()->set;
}
// Builds and freezes the set of code points returned by
// RegExpCaseFolding::SpecialAddSet().
icu::UnicodeSet BuildSpecialAddSet() {
  const UChar32 kSpecial[] = {
      0x4b,  0x53,  0x6b,  0x73,  0xc5,  0xe5,
      0x398, 0x3a9, 0x3b8, 0x3c9, 0x3d1,
  };

  icu::UnicodeSet set;
  for (const UChar32 code_point : kSpecial) {
    set.add(code_point);
  }
  set.freeze();
  return set;
}
// Holder whose constructor builds the (frozen) special-add set once and keeps
// it as a const member.
struct SpecialAddSetData {
SpecialAddSetData() : set(BuildSpecialAddSet()) {}
const icu::UnicodeSet set;
};
// Returns a reference to the shared special-add set held in a function-local
// base::LazyInstance (presumably constructed on the first Pointer() call —
// confirm against the LazyInstance implementation).
//static
const icu::UnicodeSet& RegExpCaseFolding::SpecialAddSet() {
static base::LazyInstance<SpecialAddSetData>::type set =
LAZY_INSTANCE_INITIALIZER;
return set.Pointer()->set;
}
} // namespace internal
} // namespace v8
#endif // V8_INTL_SUPPORT

View File

@ -0,0 +1,117 @@
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_REGEXP_SPECIAL_CASE_H_
#define V8_REGEXP_SPECIAL_CASE_H_
#ifdef V8_INTL_SUPPORT
#include "regexp/regexp-shim.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
namespace v8 {
namespace internal {
// Sets of Unicode characters that need special handling under "i" mode
// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262
// defines slightly different case-folding rules than Unicode. An
// input character should match a pattern character if the result of
// the Canonicalize algorithm is the same for both characters.
//
// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as
// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character
// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See
// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for
// the precise definition.
//
// While compiling such regular expressions, we need to compute the
// set of characters that should match a given input character. (See
// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.)
// For almost all characters, this can be efficiently computed using
// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent
// the remaining special cases.
//
// For a character c, the rules are as follows:
//
// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling
// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet
// containing c will produce the set of characters that should
// match /c/i (or /[c]/i), and only those characters.
//
// 2. If c is in IgnoreSet, then the only character it should match is
// itself. However, closeOver will add additional incorrect
// matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ'
// (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is
// "SS". Step 3.e therefore requires that 'ß' canonicalizes to
// itself, and should not match 'ẞ'. In these cases, we can skip
// the closeOver entirely, because it will never add an equivalent
// character.
//
// 3. If c is in SpecialAddSet, then it should match at least one
// character other than itself. However, closeOver will add at
// least one additional incorrect match. For example, consider the
// letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase
// K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN
// SIGN should not match either of the other two characters. As a
// result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in
// IgnoreSet). To find the correct matches for characters in
// SpecialAddSet, we closeOver the original character, but filter
// out the results that do not have the same canonical value.
//
// The contents of these sets are calculated at build time by
// src/regexp/gen-regexp-special-case.cc, which generates
// gen/src/regexp/special-case.cc. This is done by iterating over the
// result of closeOver for each BMP character, and finding sets for
// which at least one character has a different canonical value than
// another character. Characters that match no other characters in
// their equivalence class are added to IgnoreSet. Characters that
// match at least one other character are added to SpecialAddSet.
// Static-only helpers for non-unicode ignoreCase ("i" without "u") matching.
class RegExpCaseFolding final : public AllStatic {
 public:
  // Accessors for the two special-case sets (defined in special-case.cc).
  static const icu::UnicodeSet& IgnoreSet();
  static const icu::UnicodeSet& SpecialAddSet();

  // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
  // Canonicalize) step 3, which is used to determine whether
  // characters match when ignoreCase is true and unicode is false.
  //
  // Returns the upper-cased code unit for |ch|, or |ch| itself when the
  // upper-casing is not a single code unit or would map a non-ASCII
  // character to an ASCII one.
  static UChar32 Canonicalize(UChar32 ch) {
    // a. Assert: ch is a UTF-16 code unit.
    CHECK_LE(ch, 0xffff);

    // b. Let s be the String value consisting of the single code unit ch.
    icu::UnicodeString s(ch);

    // c. Let u be the same result produced as if by performing the algorithm
    //    for String.prototype.toUpperCase using s as the this value.
    // d. Assert: Type(u) is String.
    // NOTE(review): ICU documents the no-argument toUpper() as following the
    // default locale, while String.prototype.toUpperCase is locale-independent
    // — confirm the default locale cannot change this mapping (e.g. tr/az/lt).
    icu::UnicodeString& u = s.toUpper();

    if (u.length() == 1) {
      // f. Let cu be u's single code unit element.
      const UChar32 cu = u.char32At(0);
      // g. If the value of ch >= 128 and the value of cu < 128, return ch.
      const bool non_ascii_to_ascii = ch >= 128 && cu < 128;
      if (!non_ascii_to_ascii) {
        // h. Return cu.
        return cu;
      }
    }
    // e. If u does not consist of a single code unit, return ch (also reached
    //    for step g).
    return ch;
  }
};
} // namespace internal
} // namespace v8
#endif // V8_INTL_SUPPORT
#endif // V8_REGEXP_SPECIAL_CASE_H_

View File

@ -0,0 +1,93 @@
// Copyright 2014 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_UTIL_FLAGS_H_
#define V8_UTIL_FLAGS_H_
// Origin: https://github.com/v8/v8/blob/1bafcc6b999b23ea1d394f5d267a08183e3c4e19/src/base/flags.h#L15-L90
namespace v8 {
namespace base {
// The Flags class provides a type-safe way of storing OR-combinations of enum
// values. The Flags<T, S> class is a template class, where T is an enum type,
// and S is the underlying storage type (usually int).
//
// The traditional C++ approach for storing OR-combinations of enum values is to
// use an int or unsigned int variable. The inconvenience with this approach is
// that there's no type checking at all; any enum value can be OR'd with any
// other enum value and passed on to a function that takes an int or unsigned
// int.
template <typename T, typename S = int>
class Flags final {
 public:
  using flag_type = T;
  using mask_type = S;

  // Empty set: no bits set.
  constexpr Flags() : mask_(0) {}
  // Converting constructor from a single enumerator (intentionally not
  // explicit; the flag_type operator overloads below go through it).
  constexpr Flags(flag_type flag)
      : mask_(static_cast<S>(flag)) {}
  // Builds a set from a raw mask; explicit so plain integers do not convert
  // silently.
  constexpr explicit Flags(mask_type mask) : mask_(static_cast<S>(mask)) {}

  // Equality against a single enumerator compares the whole mask, i.e. it is
  // true only when exactly that flag's bits are set.
  constexpr bool operator==(flag_type flag) const {
    return mask_ == static_cast<S>(flag);
  }
  constexpr bool operator!=(flag_type flag) const {
    return mask_ != static_cast<S>(flag);
  }

  // In-place bitwise combination with another set.
  Flags& operator&=(const Flags& flags) {
    mask_ &= flags.mask_;
    return *this;
  }
  Flags& operator|=(const Flags& flags) {
    mask_ |= flags.mask_;
    return *this;
  }
  Flags& operator^=(const Flags& flags) {
    mask_ ^= flags.mask_;
    return *this;
  }

  // Bitwise combination with another set, returning a new set.
  constexpr Flags operator&(const Flags& flags) const {
    return Flags(mask_ & flags.mask_);
  }
  constexpr Flags operator|(const Flags& flags) const {
    return Flags(mask_ | flags.mask_);
  }
  constexpr Flags operator^(const Flags& flags) const {
    return Flags(mask_ ^ flags.mask_);
  }

  // Same operations taking a single enumerator.
  Flags& operator&=(flag_type flag) { return operator&=(Flags(flag)); }
  Flags& operator|=(flag_type flag) { return operator|=(Flags(flag)); }
  Flags& operator^=(flag_type flag) { return operator^=(Flags(flag)); }

  constexpr Flags operator&(flag_type flag) const {
    return operator&(Flags(flag));
  }
  constexpr Flags operator|(flag_type flag) const {
    return operator|(Flags(flag));
  }
  constexpr Flags operator^(flag_type flag) const {
    return operator^(Flags(flag));
  }

  // Bitwise complement of the stored mask.
  constexpr Flags operator~() const { return Flags(~mask_); }

  // Implicit conversion to the raw mask.
  constexpr operator mask_type() const { return mask_; }
  // True when no bit is set.
  constexpr bool operator!() const { return !mask_; }

  // Returns a copy of this set with |flag|'s bits cleared; *this is left
  // untouched. Declared const (and constexpr) so it can be called on const
  // and constant-expression Flags values.
  constexpr Flags without(flag_type flag) const {
    return *this & (~Flags(flag));
  }

  // Hash hook: the hash of a set is its raw mask.
  friend size_t hash_value(const Flags& flags) { return flags.mask_; }

 private:
  mask_type mask_;
};
} // namespace base
} // namespace v8
#endif // V8_UTIL_FLAG_H_

File diff suppressed because it is too large Load Diff

204
js/src/regexp/util/vector.h Normal file
View File

@ -0,0 +1,204 @@
// Copyright 2014 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_UTIL_VECTOR_H_
#define V8_UTIL_VECTOR_H_
#include <algorithm>
#include <cstring>
#include <iterator>
#include <limits>
#include <memory>
#include <utility>

#include "js/Utility.h"
namespace v8 {
namespace internal {
//////////////////////////////////////////////////
// Adapted from: https://github.com/v8/v8/blob/5f69bbc233c2d1baf149faf869a7901603929914/src/utils/allocation.h#L36-L58
// Allocates an uninitialized array of |size| elements of POD type T with
// js_malloc.
//
// The function never returns nullptr: a failed allocation is reported through
// AutoEnterOOMUnsafeRegion::crash. A request whose byte count
// size * sizeof(T) does not fit in size_t is reported the same way, because
// it cannot be satisfied either.
template <typename T>
T* NewArray(size_t size) {
  static_assert(std::is_pod<T>::value, "");
  js::AutoEnterOOMUnsafeRegion oomUnsafe;
  // Guard the multiplication below: a wrapped byte count would make
  // js_malloc return a buffer smaller than the caller believes it has.
  if (size > std::numeric_limits<size_t>::max() / sizeof(T)) {
    oomUnsafe.crash("Irregexp NewArray");
  }
  T* result = static_cast<T*>(js_malloc(size * sizeof(T)));
  if (!result) {
    oomUnsafe.crash("Irregexp NewArray");
  }
  return result;
}
// Releases |array| by passing it to js_free. NewArray obtains its storage
// from js_malloc, so this is the matching release call for those arrays.
template <typename T>
void DeleteArray(T* array) {
js_free(array);
}
//////////////////////////////////////////////////
// A non-resizable vector containing a pointer and a length.
// The Vector may or may not own the pointer, depending on context.
// Origin:
// https://github.com/v8/v8/blob/5f69bbc233c2d1baf149faf869a7901603929914/src/utils/vector.h#L20-L134
template <typename T>
class Vector {
 public:
  // Creates an empty vector with no backing store.
  constexpr Vector() : start_(nullptr), length_(0) {}

  // Wraps the |length| elements starting at |data| without copying them.
  // A null |data| is only accepted together with a zero |length|.
  constexpr Vector(T* data, size_t length) : start_(data), length_(length) {
    MOZ_ASSERT_IF(length != 0, data != nullptr);
  }

  // Allocates uninitialized storage for |length| elements with NewArray and
  // wraps it. This class defines no destructor, so the storage is only
  // released by an explicit Dispose().
  static Vector<T> New(size_t length) {
    return Vector<T>(NewArray<T>(length), length);
  }

  // Returns a vector using the same backing storage as this one,
  // spanning from and including 'from', to but not including 'to'.
  // Because 'to' is exclusive, from == to (an empty result) and
  // to == size() (a range reaching the last element) are both valid.
  Vector<T> SubVector(size_t from, size_t to) const {
    MOZ_ASSERT(from <= to);
    MOZ_ASSERT(to <= length_);
    return Vector<T>(begin() + from, to - from);
  }

  // Returns the length of the vector. Only use this if you really need an
  // integer return value. Use {size()} otherwise.
  int length() const {
    MOZ_ASSERT(length_ <= std::numeric_limits<int>::max());
    return static_cast<int>(length_);
  }

  // Returns the length of the vector as a size_t.
  constexpr size_t size() const { return length_; }

  // Returns whether or not the vector is empty.
  constexpr bool empty() const { return length_ == 0; }

  // Access individual vector elements - checks bounds in debug mode.
  T& operator[](size_t index) const {
    MOZ_ASSERT(index < length_);
    return start_[index];
  }

  // Read-only element access with the same bounds check as operator[].
  const T& at(size_t index) const { return operator[](index); }

  // First element. Unlike last(), this performs no emptiness check.
  T& first() { return start_[0]; }

  // Last element; asserts that the vector is not empty.
  T& last() {
    MOZ_ASSERT(length_ > 0);
    return start_[length_ - 1];
  }

  // Returns a pointer to the start of the data in the vector.
  constexpr T* begin() const { return start_; }

  // Returns a pointer past the end of the data in the vector.
  constexpr T* end() const { return start_ + length_; }

  // Returns a clone of this vector with a new backing store.
  Vector<T> Clone() const {
    T* result = NewArray<T>(length_);
    for (size_t i = 0; i < length_; i++) result[i] = start_[i];
    return Vector<T>(result, length_);
  }

  // Shrinks the visible length to |length|; the storage is left as it is.
  void Truncate(size_t length) {
    MOZ_ASSERT(length <= length_);
    length_ = length;
  }

  // Releases the array underlying this vector. Once disposed the
  // vector is empty.
  void Dispose() {
    DeleteArray(start_);
    start_ = nullptr;
    length_ = 0;
  }

  // Returns a view of the same storage with the first |offset| elements
  // skipped.
  Vector<T> operator+(size_t offset) {
    MOZ_ASSERT(offset <= length_);
    return Vector<T>(start_ + offset, length_ - offset);
  }

  // Drops the first |offset| elements from this vector and returns a copy of
  // the adjusted vector.
  Vector<T> operator+=(size_t offset) {
    MOZ_ASSERT(offset <= length_);
    start_ += offset;
    length_ -= offset;
    return *this;
  }

  // Implicit conversion from Vector<T> to Vector<const T>.
  inline operator Vector<const T>() const {
    return Vector<const T>::cast(*this);
  }

  // Reinterprets the storage of |input| as elements of T. The element count
  // is rescaled by sizeof(S) / sizeof(T).
  template <typename S>
  static constexpr Vector<T> cast(Vector<S> input) {
    return Vector<T>(reinterpret_cast<T*>(input.begin()),
                     input.length() * sizeof(S) / sizeof(T));
  }

  // Element-wise equality. |other| is read through its public accessors:
  // for a non-const T, Vector<const T> is a different class whose private
  // members are not accessible from here.
  bool operator==(const Vector<const T> other) const {
    if (length_ != other.size()) return false;
    // Same storage and same length: equal without looking at the elements.
    if (start_ == other.begin()) return true;
    for (size_t i = 0; i < length_; ++i) {
      if (start_[i] != other.begin()[i]) {
        return false;
      }
    }
    return true;
  }

 private:
  T* start_;
  size_t length_;
};
// Wraps the NUL-terminated string |data| in a Vector<const char> without
// copying it. The length comes from strlen, so the terminating NUL byte is
// not part of the result; use ArrayVector("foo") if you need it included.
inline Vector<const char> CStrVector(const char* data) {
  const size_t length_without_nul = strlen(data);
  return Vector<const char>(data, length_without_nul);
}
} // namespace internal
namespace base {
// SmallVector uses inline storage first, and reallocates when full.
// It is basically equivalent to js::Vector, and is implemented
// as a thin wrapper.
// V8's implementation: https://github.com/v8/v8/blob/master/src/base/small-vector.h
template <typename T, size_t kSize>
class SmallVector {
 public:
  inline bool empty() const { return inner_.empty(); }
  inline const T& back() const { return inner_.back(); }
  inline void pop_back() { inner_.popBack(); }

  // Constructs a new element at the end from |args|. The arguments are
  // perfectly forwarded so that rvalues are moved instead of copied.
  // A failed append is reported through AutoEnterOOMUnsafeRegion::crash
  // rather than returned to the caller.
  template <typename... Args>
  inline void emplace_back(Args&&... args) {
    js::AutoEnterOOMUnsafeRegion oomUnsafe;
    if (!inner_.emplaceBack(std::forward<Args>(args)...)) {
      oomUnsafe.crash("Irregexp SmallVector emplace_back");
    }
  }

  inline size_t size() const { return inner_.length(); }
  inline const T& at(size_t index) const { return inner_[index]; }

  // Changes the length to |new_size| via resizeUninitialized on the wrapped
  // vector; a failure is reported through AutoEnterOOMUnsafeRegion::crash.
  void resize_no_init(size_t new_size) {
    js::AutoEnterOOMUnsafeRegion oomUnsafe;
    if (!inner_.resizeUninitialized(new_size)) {
      oomUnsafe.crash("Irregexp SmallVector resize");
    }
  }

 private:
  js::Vector<T, kSize, js::SystemAllocPolicy> inner_;
};
} // namespace base
} // namespace v8
#endif // V8_UTIL_VECTOR_H_

375
js/src/regexp/util/zone.h Normal file
View File

@ -0,0 +1,375 @@
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef V8_UTIL_ZONE_H_
#define V8_UTIL_ZONE_H_
#include <list>
#include <map>
#include <set>
#include <unordered_map>
#include <vector>
#include "ds/LifoAlloc.h"
#include "ds/Sort.h"
#include "regexp/util/vector.h"
namespace v8 {
namespace internal {
// V8::Zone ~= LifoAlloc
//
// Adapter that gives Irregexp the Zone interface it expects on top of a
// js::LifoAlloc owned by this object.
class Zone {
 public:
  // Builds the underlying LifoAlloc with |defaultChunkSize| and switches it
  // to its infallible-by-default mode.
  Zone(size_t defaultChunkSize) : lifoAlloc_(defaultChunkSize) {
    lifoAlloc_.setAsInfallibleByDefault();
  }

  // Returns |size| bytes from the LifoAlloc. A null result from alloc() is
  // turned into a crash, so callers never see nullptr.
  // NOTE(review): the AutoFallibleScope presumably lets alloc() report
  // failure as nullptr despite the infallible default - confirm against
  // ds/LifoAlloc.h.
  void* New(size_t size) {
    js::LifoAlloc::AutoFallibleScope fallible(&lifoAlloc_);
    js::AutoEnterOOMUnsafeRegion oomUnsafe;
    if (void* memory = lifoAlloc_.alloc(size)) {
      return memory;
    }
    oomUnsafe.crash("Irregexp Zone::new");
    return nullptr;
  }

  // Releases everything allocated from this zone in one go.
  void DeleteAll() { lifoAlloc_.freeAll(); }

  // Returns true if the total memory allocated exceeds a threshold.
  static const size_t kExcessLimit = 256 * 1024 * 1024;
  bool excess_allocation() const {
    const auto allocated = lifoAlloc_.computedSizeOfExcludingThis();
    return allocated > kExcessLimit;
  }

 private:
  js::LifoAlloc lifoAlloc_;
};
// Superclass for classes allocated in a Zone.
// Origin: https://github.com/v8/v8/blob/7b3332844212d78ee87a9426f3a6f7f781a8fbfa/src/zone/zone.h#L138-L155
class ZoneObject {
 public:
  // Placement-style allocation: `new (zone) Foo(...)` takes |size| bytes
  // from |zone|.
  void* operator new(size_t size, Zone* zone) {
    void* storage = zone->New(size);
    return storage;
  }

  // Zone objects are never freed one at a time; Zone::DeleteAll() releases
  // them all together. The delete operators below therefore crash if they
  // are ever reached. They have to stay public: compilers sometimes
  // synthesize (unused) destructors for classes derived from ZoneObject,
  // which need the operator to be visible, and MSVC requires it to be
  // public.
  void operator delete(void*, size_t) { MOZ_CRASH("unreachable"); }
  void operator delete(void*, Zone*) { MOZ_CRASH("unreachable"); }
};
// ZoneLists are growable lists with constant-time access to the
// elements. The list itself and all its elements are allocated in the
// Zone. ZoneLists cannot be deleted individually; you can delete all
// objects in the Zone by calling Zone::DeleteAll().
// Used throughout irregexp.
// Origin: https://github.com/v8/v8/blob/5e514a969376dc63517d575b062758efd36cd757/src/zone/zone.h#L173-L318
// Inlines: https://github.com/v8/v8/blob/5e514a969376dc63517d575b062758efd36cd757/src/zone/zone-list-inl.h#L17-L155
template <typename T>
class ZoneList final {
 public:
  // Construct a new ZoneList with the given capacity; the length is
  // always zero. The capacity must be non-negative.
  ZoneList(int capacity, Zone* zone) { Initialize(capacity, zone); }

  // Construct a new ZoneList from a std::initializer_list
  ZoneList(std::initializer_list<T> list, Zone* zone) {
    Initialize(static_cast<int>(list.size()), zone);
    for (auto& i : list) Add(i, zone);
  }

  // Construct a new ZoneList by copying the elements of the given ZoneList.
  ZoneList(const ZoneList<T>& other, Zone* zone) {
    Initialize(other.length(), zone);
    AddAll(other, zone);
  }

  // The list object itself is allocated from a zone: `new (zone) ZoneList`.
  void* operator new(size_t size, Zone* zone) { return zone->New(size); }

  // Returns a reference to the element at index i. This reference is not safe
  // to use after operations that can change the list's backing store
  // (e.g. Add).
  inline T& operator[](int i) const {
    // Valid indices are 0 .. length_ - 1. Index 0 must be accepted: first()
    // is implemented as at(0).
    MOZ_ASSERT(i >= 0);
    MOZ_ASSERT(static_cast<unsigned>(i) < static_cast<unsigned>(length_));
    return data_[i];
  }
  inline T& at(int i) const { return operator[](i); }
  inline T& last() const { return at(length_ - 1); }
  inline T& first() const { return at(0); }

  using iterator = T*;
  inline iterator begin() const { return &data_[0]; }
  inline iterator end() const { return &data_[length_]; }

  inline bool is_empty() const { return length_ == 0; }
  inline int length() const { return length_; }
  inline int capacity() const { return capacity_; }

  // Views over the list's current backing store; no elements are copied.
  Vector<T> ToVector() const { return Vector<T>(data_, length_); }
  // View starting at |start| holding at most |length| elements, clamped to
  // the end of the list.
  Vector<T> ToVector(int start, int length) const {
    return Vector<T>(data_ + start, std::min(length_ - start, length));
  }
  Vector<const T> ToConstVector() const {
    return Vector<const T>(data_, length_);
  }

  // Resets the list to length zero with room for |capacity| elements.
  // A zero capacity allocates nothing and leaves data_ null.
  inline void Initialize(int capacity, Zone* zone) {
    MOZ_ASSERT(capacity >= 0);
    data_ = (capacity > 0) ? NewData(capacity, zone) : nullptr;
    capacity_ = capacity;
    length_ = 0;
  }

  // Adds a copy of the given 'element' to the end of the list,
  // expanding the list if necessary.
  void Add(const T& element, Zone* zone) {
    if (length_ < capacity_) {
      data_[length_++] = element;
    } else {
      ZoneList<T>::ResizeAdd(element, zone);
    }
  }

  // Add all the elements from the argument list to this list.
  void AddAll(const ZoneList<T>& other, Zone* zone) {
    AddAll(other.ToVector(), zone);
  }

  // Add all the elements from the vector to this list.
  void AddAll(const Vector<T>& other, Zone* zone) {
    const int added = other.length();
    // Nothing to append. Returning here also keeps memcpy away from the null
    // data_ / begin() pointers of lists and vectors without a backing store.
    if (added == 0) {
      return;
    }
    int result_length = length_ + added;
    if (capacity_ < result_length) {
      Resize(result_length, zone);
    }
    if (std::is_fundamental<T>()) {
      memcpy(data_ + length_, other.begin(), sizeof(*data_) * added);
    } else {
      for (int i = 0; i < added; i++) {
        data_[length_ + i] = other.at(i);
      }
    }
    length_ = result_length;
  }

  // Overwrites the element at the specific index.
  void Set(int index, const T& element) {
    // Only existing elements can be overwritten; index == length_ would
    // write one slot past the last element (and past the allocation when
    // the list is full).
    MOZ_ASSERT(index >= 0 && index < length_);
    data_[index] = element;
  }

  // Removes the i'th element without deleting it even if T is a
  // pointer type; moves all elements above i "down". Returns the
  // removed element. This function's complexity is linear in the
  // size of the list.
  T Remove(int i) {
    T element = at(i);
    length_--;
    while (i < length_) {
      data_[i] = data_[i + 1];
      i++;
    }
    return element;
  }

  // Removes the last element without deleting it even if T is a
  // pointer type. Returns the removed element.
  inline T RemoveLast() { return Remove(length_ - 1); }

  // Clears the list by dropping its reference to the storage memory (the
  // memory itself stays in the zone). If you want to keep the storage, use
  // Rewind(0) instead. Be aware, that even if T is a pointer type, clearing
  // the list doesn't delete the entries.
  inline void Clear() {
    data_ = nullptr;
    capacity_ = 0;
    length_ = 0;
  }

  // Drops all but the first 'pos' elements from the list.
  inline void Rewind(int pos) {
    MOZ_ASSERT(0 <= pos && pos <= length_);
    length_ = pos;
  }

  // Linear search using operator==.
  inline bool Contains(const T& elm) const {
    for (int i = 0; i < length_; i++) {
      if (data_[i] == elm) return true;
    }
    return false;
  }

  // Sorts the |length| elements starting at index |start| with
  // js::MergeSort. |cmp| receives pointers to two elements and returns a
  // value <= 0 when the first may stay in front of the second. The scratch
  // buffer MergeSort needs comes from js_malloc and is freed before
  // returning; failing to obtain it crashes.
  template <typename CompareFunction>
  void StableSort(CompareFunction cmp, size_t start, size_t length) {
    js::AutoEnterOOMUnsafeRegion oomUnsafe;
    T* scratch = static_cast<T*>(js_malloc(length * sizeof(T)));
    if (!scratch) {
      oomUnsafe.crash("Irregexp stable sort scratch space");
    }
    // Adapt the three-way |cmp| to MergeSort's comparator shape; the
    // adapter itself never reports failure.
    auto comparator = [cmp](const T& a, const T& b, bool* lessOrEqual) {
      *lessOrEqual = cmp(&a, &b) <= 0;
      return true;
    };
    MOZ_ALWAYS_TRUE(js::MergeSort(begin() + start, length, scratch,
                                  comparator));
    js_free(scratch);
  }

  // Lists live in a zone and are never deleted individually.
  void operator delete(void* pointer) { MOZ_CRASH("unreachable"); }
  void operator delete(void* pointer, Zone* zone) { MOZ_CRASH("unreachable"); }

 private:
  T* data_;
  int capacity_;
  int length_;

  // Raw zone storage for |n| elements.
  inline T* NewData(int n, Zone* zone) {
    return static_cast<T*>(zone->New(n * sizeof(T)));
  }

  // Increase the capacity of a full list, and add an element.
  // List must be full already.
  void ResizeAdd(const T& element, Zone* zone) {
    MOZ_ASSERT(length_ >= capacity_);
    // Grow the list capacity by 100%, but make sure to let it grow
    // even when the capacity is zero (possible initial case).
    int new_capacity = 1 + 2 * capacity_;
    // Since the element reference could be an element of the list, copy
    // it out of the old backing storage before resizing.
    T temp = element;
    Resize(new_capacity, zone);
    data_[length_++] = temp;
  }

  // Resize the list. The old storage is not freed here; it remains in the
  // zone.
  void Resize(int new_capacity, Zone* zone) {
    MOZ_ASSERT(length_ <= new_capacity);
    T* new_data = NewData(new_capacity, zone);
    if (length_ > 0) {
      memcpy(new_data, data_, length_ * sizeof(T));
    }
    data_ = new_data;
    capacity_ = new_capacity;
  }

  ZoneList& operator=(const ZoneList&) = delete;
  ZoneList() = delete;
  ZoneList(const ZoneList&) = delete;
};
// Origin: https://github.com/v8/v8/blob/5e514a969376dc63517d575b062758efd36cd757/src/zone/zone-allocator.h#L14-L77
template <typename T>
class ZoneAllocator {
public:
using pointer = T*;
using const_pointer = const T*;
using reference = T&;
using const_reference = const T&;
using value_type = T;
using size_type = size_t;
using difference_type = ptrdiff_t;
template <class O>
struct rebind {
using other = ZoneAllocator<O>;
};
explicit ZoneAllocator(Zone* zone) : zone_(zone) {}
template <typename U>
ZoneAllocator(const ZoneAllocator<U>& other)
: ZoneAllocator<T>(other.zone_) {}
template <typename U>
friend class ZoneAllocator;
T* allocate(size_t n) { return static_cast<T*>(zone_->New(n * sizeof(T))); }
void deallocate(T* p, size_t) {} // noop for zones
bool operator==(ZoneAllocator const& other) const {
return zone_ == other.zone_;
}
bool operator!=(ZoneAllocator const& other) const {
return zone_ != other.zone_;
}
private:
Zone* zone_;
};
// Zone wrappers for std containers:
// Origin: https://github.com/v8/v8/blob/5e514a969376dc63517d575b062758efd36cd757/src/zone/zone-containers.h#L25-L169
// std::vector whose storage comes from a Zone. The only thing this subclass
// adds is constructors that build the ZoneAllocator from a Zone*.
// Used throughout irregexp
template <typename T>
class ZoneVector : public std::vector<T, ZoneAllocator<T>> {
  using StdVector = std::vector<T, ZoneAllocator<T>>;

 public:
  // Constructs an empty vector allocating from |zone|.
  ZoneVector(Zone* zone) : StdVector(ZoneAllocator<T>(zone)) {}

  // Constructs a new vector and fills it with the contents of the range
  // [first, last).
  template <class Iter>
  ZoneVector(Iter first, Iter last, Zone* zone)
      : StdVector(first, last, ZoneAllocator<T>(zone)) {}
};
// std::list whose nodes come from a Zone. The only thing this subclass adds
// is a constructor that builds the ZoneAllocator from a Zone*.
// Used in regexp-bytecode-peephole.cc
template <typename T>
class ZoneLinkedList : public std::list<T, ZoneAllocator<T>> {
  using StdList = std::list<T, ZoneAllocator<T>>;

 public:
  // Constructs an empty list.
  explicit ZoneLinkedList(Zone* zone) : StdList(ZoneAllocator<T>(zone)) {}
};
// std::set whose nodes come from a Zone. The only thing this subclass adds
// is a constructor that builds the ZoneAllocator from a Zone*.
// Used in regexp-parser.cc
template <typename K, typename Compare = std::less<K>>
class ZoneSet : public std::set<K, Compare, ZoneAllocator<K>> {
  using KeyAllocator = ZoneAllocator<K>;
  using StdSet = std::set<K, Compare, KeyAllocator>;

 public:
  // Constructs an empty set.
  explicit ZoneSet(Zone* zone) : StdSet(Compare(), KeyAllocator(zone)) {}
};
// std::map whose nodes come from a Zone. The only thing this subclass adds
// is a constructor that builds the ZoneAllocator from a Zone*.
// Used in regexp-bytecode-peephole.cc
template <typename K, typename V, typename Compare = std::less<K>>
class ZoneMap
    : public std::map<K, V, Compare, ZoneAllocator<std::pair<const K, V>>> {
  using EntryAllocator = ZoneAllocator<std::pair<const K, V>>;
  using StdMap = std::map<K, V, Compare, EntryAllocator>;

 public:
  // Constructs an empty map.
  explicit ZoneMap(Zone* zone) : StdMap(Compare(), EntryAllocator(zone)) {}
};
// std::unordered_map whose storage comes from a Zone. The only thing this
// subclass adds is a constructor that builds the ZoneAllocator from a Zone*.
// Used in regexp-bytecode-peephole.cc
template <typename K, typename V, typename Hash = std::hash<K>,
          typename KeyEqual = std::equal_to<K>>
class ZoneUnorderedMap
    : public std::unordered_map<K, V, Hash, KeyEqual,
                                ZoneAllocator<std::pair<const K, V>>> {
  using EntryAllocator = ZoneAllocator<std::pair<const K, V>>;
  using StdUnorderedMap =
      std::unordered_map<K, V, Hash, KeyEqual, EntryAllocator>;

 public:
  // Constructs an empty map; |bucket_count| is passed to the
  // std::unordered_map constructor as its initial bucket count.
  explicit ZoneUnorderedMap(Zone* zone, size_t bucket_count = 100)
      : StdUnorderedMap(bucket_count, Hash(), KeyEqual(),
                        EntryAllocator(zone)) {}
};
} // namespace internal
} // namespace v8
#endif // V8_UTIL_ZONE_H_