diff --git a/src/regexp/regexp-ast.h b/src/regexp/regexp-ast.h index a9106d3d30..643e1fc983 100644 --- a/src/regexp/regexp-ast.h +++ b/src/regexp/regexp-ast.h @@ -76,9 +76,8 @@ class Interval { int to_; }; - -// Represents code units in the range from from_ to to_, both ends are -// inclusive. +// Represents code points (with values up to 0x10FFFF) in the range from from_ +// to to_, both ends are inclusive. class CharacterRange { public: CharacterRange() : from_(0), to_(0) {} diff --git a/src/regexp/regexp-compiler.cc b/src/regexp/regexp-compiler.cc index 6e0f3bf4ea..8dc79667d8 100644 --- a/src/regexp/regexp-compiler.cc +++ b/src/regexp/regexp-compiler.cc @@ -174,6 +174,24 @@ using namespace regexp_compiler_constants; // NOLINT(build/namespaces) // trace is not recorded in the node and so it cannot currently be reused in // the event that code generation is requested for an identical trace. +namespace { + +constexpr uc32 MaxCodeUnit(const bool one_byte) { + STATIC_ASSERT(String::kMaxOneByteCharCodeU <= + std::numeric_limits<uint16_t>::max()); + STATIC_ASSERT(String::kMaxUtf16CodeUnitU <= + std::numeric_limits<uint16_t>::max()); + return one_byte ?
String::kMaxOneByteCharCodeU : String::kMaxUtf16CodeUnitU; +} + +constexpr uint32_t CharMask(const bool one_byte) { + STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1)); + STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1)); + return MaxCodeUnit(one_byte); +} + +} // namespace + void RegExpTree::AppendToText(RegExpText* text, Zone* zone) { UNREACHABLE(); } void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) { @@ -386,9 +404,7 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, int pushes = 0; for (int reg = 0; reg <= max_register; reg++) { - if (!affected_registers.Get(reg)) { - continue; - } + if (!affected_registers.Get(reg)) continue; // The chronologically first deferred action in the trace // is used to infer the action needed to restore a register @@ -710,6 +726,20 @@ void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, } } +namespace { + +#ifdef DEBUG +bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) { + STATIC_ASSERT(sizeof(unibrow::uchar) == 4); + for (int i = 0; i < length; i++) { + if (chars[i] > String::kMaxUtf16CodeUnit) return false; + } + return true; +} +#endif // DEBUG + +} // namespace + // Returns the number of characters in the equivalence class, omitting those // that cannot occur in the source string because it is Latin1. 
static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, @@ -719,6 +749,7 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, #ifdef V8_INTL_SUPPORT if (RegExpCaseFolding::IgnoreSet().contains(character)) { letters[0] = character; + DCHECK(ContainsOnlyUtf16CodeUnits(letters, 1)); return 1; } bool in_special_add_set = @@ -747,6 +778,7 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, letters[items++] = (unibrow::uchar)(cu); } } + DCHECK(ContainsOnlyUtf16CodeUnits(letters, items)); return items; #else int length = @@ -768,6 +800,7 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, length = new_length; } + DCHECK(ContainsOnlyUtf16CodeUnits(letters, length)); return length; #endif // V8_INTL_SUPPORT } @@ -820,12 +853,7 @@ static inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler, static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, bool one_byte, uc16 c1, uc16 c2, Label* on_failure) { - uc16 char_mask; - if (one_byte) { - char_mask = String::kMaxOneByteCharCode; - } else { - char_mask = String::kMaxUtf16CodeUnit; - } + const uint32_t char_mask = CharMask(one_byte); uc16 exor = c1 ^ c2; // Check whether exor has only one bit set. 
if (((exor - 1) & exor) == 0) { @@ -1185,21 +1213,13 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, ZoneList<CharacterRange>* ranges = cc->ranges(zone); CharacterRange::Canonicalize(ranges); - int max_char; - if (one_byte) { - max_char = String::kMaxOneByteCharCode; - } else { - max_char = String::kMaxUtf16CodeUnit; - } - + const uc32 max_char = MaxCodeUnit(one_byte); int range_count = ranges->length(); int last_valid_range = range_count - 1; while (last_valid_range >= 0) { CharacterRange& range = ranges->at(last_valid_range); - if (static_cast<int>(range.from()) <= max_char) { - break; - } + if (range.from() <= max_char) break; last_valid_range--; } @@ -1257,7 +1277,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, range_boundaries->Add(range.to() + 1, zone); } int end_index = range_boundaries->length() - 1; - if (range_boundaries->at(end_index) > max_char) { + if (static_cast<uc32>(range_boundaries->at(end_index)) > max_char) { end_index--; } @@ -1371,12 +1391,7 @@ static inline uint32_t SmearBitsRight(uint32_t v) { bool QuickCheckDetails::Rationalize(bool asc) { bool found_useful_op = false; - uint32_t char_mask; - if (asc) { - char_mask = String::kMaxOneByteCharCode; - } else { - char_mask = String::kMaxUtf16CodeUnit; - } + const uint32_t char_mask = CharMask(asc); mask_ = 0; value_ = 0; int char_shift = 0; @@ -1496,12 +1511,7 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, if (details->characters() == 1) { // If number of characters preloaded is 1 then we used a byte or 16 bit // load so the value is already masked down.
- uint32_t char_mask; - if (compiler->one_byte()) { - char_mask = String::kMaxOneByteCharCode; - } else { - char_mask = String::kMaxUtf16CodeUnit; - } + const uint32_t char_mask = CharMask(compiler->one_byte()); if ((mask & char_mask) == char_mask) need_mask = false; mask &= char_mask; } else { @@ -1552,12 +1562,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, Isolate* isolate = compiler->macro_assembler()->isolate(); DCHECK(characters_filled_in < details->characters()); int characters = details->characters(); - int char_mask; - if (compiler->one_byte()) { - char_mask = String::kMaxOneByteCharCode; - } else { - char_mask = String::kMaxUtf16CodeUnit; - } + const uint32_t char_mask = CharMask(compiler->one_byte()); for (int k = 0; k < elements()->length(); k++) { TextElement elm = elements()->at(k); if (elm.text_type() == TextElement::ATOM) { @@ -1637,7 +1642,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, pos->value = 0; } else { int first_range = 0; - while (static_cast<int>(ranges->at(first_range).from()) > char_mask) { + while (ranges->at(first_range).from() > char_mask) { first_range++; if (first_range == ranges->length()) { details->set_cannot_match(); @@ -1646,26 +1651,22 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, } } CharacterRange range = ranges->at(first_range); - uc16 from = range.from(); - uc16 to = range.to(); - if (to > char_mask) { - to = char_mask; - } - uint32_t differing_bits = (from ^ to); + const uc32 first_from = range.from(); + const uc32 first_to = (range.to() > char_mask) ? char_mask : range.to(); + const uint32_t differing_bits = (first_from ^ first_to); // A mask and compare is only perfect if the differing bits form a // number like 00011111 with one single block of trailing 1s.
if ((differing_bits & (differing_bits + 1)) == 0 && - from + differing_bits == to) { + first_from + differing_bits == first_to) { pos->determines_perfectly = true; } uint32_t common_bits = ~SmearBitsRight(differing_bits); - uint32_t bits = (from & common_bits); + uint32_t bits = (first_from & common_bits); for (int i = first_range + 1; i < ranges->length(); i++) { CharacterRange range = ranges->at(i); - uc16 from = range.from(); - uc16 to = range.to(); + const uc32 from = range.from(); if (from > char_mask) continue; - if (to > char_mask) to = char_mask; + const uc32 to = (range.to() > char_mask) ? char_mask : range.to(); // Here we are combining more ranges into the mask and compare // value. With each new range the mask becomes more sparse and // so the chances of a false positive rise. A character class @@ -1685,9 +1686,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, } characters_filled_in++; DCHECK(characters_filled_in <= details->characters()); - if (characters_filled_in == details->characters()) { - return; - } + if (characters_filled_in == details->characters()) return; } } DCHECK(characters_filled_in != details->characters()); @@ -1749,7 +1748,7 @@ void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) { pos->mask &= other_pos->mask; pos->value &= pos->mask; other_pos->value &= pos->mask; - uc16 differing_bits = (pos->value ^ other_pos->value); + uint32_t differing_bits = (pos->value ^ other_pos->value); pos->mask &= ~differing_bits; pos->value &= pos->mask; } @@ -1859,16 +1858,20 @@ RegExpNode* TextNode::FilterOneByte(int depth) { if (range_count != 0 && ranges->at(0).from() == 0 && ranges->at(0).to() >= String::kMaxOneByteCharCode) { // This will be handled in a later filter. 
- if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges)) + if (IgnoreCase(cc->flags()) && + RangesContainLatin1Equivalents(ranges)) { continue; + } return set_replacement(nullptr); } } else { if (range_count == 0 || ranges->at(0).from() > String::kMaxOneByteCharCode) { // This will be handled in a later filter. - if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges)) + if (IgnoreCase(cc->flags()) && + RangesContainLatin1Equivalents(ranges)) { continue; + } return set_replacement(nullptr); } } @@ -2505,12 +2508,7 @@ RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( return ranges->length() == 0 ? on_success() : nullptr; } if (ranges->length() != 1) return nullptr; - uint32_t max_char; - if (compiler->one_byte()) { - max_char = String::kMaxOneByteCharCode; - } else { - max_char = String::kMaxUtf16CodeUnit; - } + const uc32 max_char = MaxCodeUnit(compiler->one_byte()); return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr; } @@ -2720,12 +2718,9 @@ void BoyerMoorePositionInfo::SetAll() { BoyerMooreLookahead::BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone) - : length_(length), compiler_(compiler) { - if (compiler->one_byte()) { - max_char_ = String::kMaxOneByteCharCode; - } else { - max_char_ = String::kMaxUtf16CodeUnit; - } + : length_(length), + compiler_(compiler), + max_char_(MaxCodeUnit(compiler->one_byte())) { bitmaps_ = new (zone) ZoneList<BoyerMoorePositionInfo*>(length, zone); for (int i = 0; i < length; i++) { bitmaps_->Add(new (zone) BoyerMoorePositionInfo(), zone); diff --git a/src/regexp/regexp-compiler.h b/src/regexp/regexp-compiler.h index a35ffcd01a..4e7652883c 100644 --- a/src/regexp/regexp-compiler.h +++ b/src/regexp/regexp-compiler.h @@ -96,8 +96,8 @@ class QuickCheckDetails { void set_cannot_match() { cannot_match_ = true; } struct Position { Position() : mask(0), value(0), determines_perfectly(false) {} - uc16 mask; - uc16 value; + uc32 mask; + uc32 value; bool determines_perfectly; }; int
characters() { return characters_; } diff --git a/src/regexp/regexp-dotprinter.cc b/src/regexp/regexp-dotprinter.cc index b6640626f2..7cf1e82c4d 100644 --- a/src/regexp/regexp-dotprinter.cc +++ b/src/regexp/regexp-dotprinter.cc @@ -143,7 +143,7 @@ void DotPrinterImpl::VisitText(TextNode* that) { if (node->is_negated()) os_ << "^"; for (int j = 0; j < node->ranges(zone)->length(); j++) { CharacterRange range = node->ranges(zone)->at(j); - os_ << AsUC16(range.from()) << "-" << AsUC16(range.to()); + os_ << AsUC32(range.from()) << "-" << AsUC32(range.to()); } os_ << "]"; break; diff --git a/test/mjsunit/regress/regress-v8-10568.js b/test/mjsunit/regress/regress-v8-10568.js new file mode 100644 index 0000000000..4f31e7f8c2 --- /dev/null +++ b/test/mjsunit/regress/regress-v8-10568.js @@ -0,0 +1,6 @@ +// Copyright 2020 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +assertEquals(/[--𐀾]/u.exec("Hr3QoS3KCWXQ2yjBoDIK")[0], "H"); +assertEquals(/[0-\u{10000}]/u.exec("A0")[0], "A");