[regexp] Fix integer overflows in TextNode::GetQuickCheckDetails
Several uc32 (= int32_t) fields were incorrectly treated as uc16 (= uint16_t):

- CharacterRange::from()
- CharacterRange::to()
- QuickCheckDetails::Position::mask
- QuickCheckDetails::Position::value

Bug: v8:10568
Change-Id: I9ea7d76e4a0cbc6ee681de2136c398cdc622bca2
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2230527
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: Leszek Swirski <leszeks@chromium.org>
Cr-Commit-Position: refs/heads/master@{#68290}
This commit is contained in:
parent
abe6ce3dbf
commit
a305d2de5c
@ -76,9 +76,8 @@ class Interval {
|
||||
int to_;
|
||||
};
|
||||
|
||||
|
||||
// Represents code units in the range from from_ to to_, both ends are
|
||||
// inclusive.
|
||||
// Represents code points (with values up to 0x10FFFF) in the range from from_
|
||||
// to to_, both ends are inclusive.
|
||||
class CharacterRange {
|
||||
public:
|
||||
CharacterRange() : from_(0), to_(0) {}
|
||||
|
@ -174,6 +174,24 @@ using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
|
||||
// trace is not recorded in the node and so it cannot currently be reused in
|
||||
// the event that code generation is requested for an identical trace.
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr uc32 MaxCodeUnit(const bool one_byte) {
|
||||
STATIC_ASSERT(String::kMaxOneByteCharCodeU <=
|
||||
std::numeric_limits<uint16_t>::max());
|
||||
STATIC_ASSERT(String::kMaxUtf16CodeUnitU <=
|
||||
std::numeric_limits<uint16_t>::max());
|
||||
return one_byte ? String::kMaxOneByteCharCodeU : String::kMaxUtf16CodeUnitU;
|
||||
}
|
||||
|
||||
constexpr uint32_t CharMask(const bool one_byte) {
|
||||
STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1));
|
||||
STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1));
|
||||
return MaxCodeUnit(one_byte);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Reaching this implementation is treated as a programming error: the body
// consists solely of UNREACHABLE(), and both arguments are ignored.
void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
  UNREACHABLE();
}
|
||||
|
||||
void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
|
||||
@ -386,9 +404,7 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
|
||||
int pushes = 0;
|
||||
|
||||
for (int reg = 0; reg <= max_register; reg++) {
|
||||
if (!affected_registers.Get(reg)) {
|
||||
continue;
|
||||
}
|
||||
if (!affected_registers.Get(reg)) continue;
|
||||
|
||||
// The chronologically first deferred action in the trace
|
||||
// is used to infer the action needed to restore a register
|
||||
@ -710,6 +726,20 @@ void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
|
||||
}
|
||||
}
|
||||
|
||||
namespace {

#ifdef DEBUG
// Debug-only helper: returns true iff none of the first |length| entries of
// |chars| exceeds String::kMaxUtf16CodeUnit. An empty (or non-positive)
// length trivially yields true.
bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) {
  // Entries are 32 bits wide, so values beyond 16 bits can actually occur.
  STATIC_ASSERT(sizeof(unibrow::uchar) == 4);
  int index = 0;
  while (index < length) {
    const unibrow::uchar code_unit = chars[index++];
    if (code_unit > String::kMaxUtf16CodeUnit) return false;
  }
  return true;
}
#endif  // DEBUG

}  // namespace
|
||||
|
||||
// Returns the number of characters in the equivalence class, omitting those
|
||||
// that cannot occur in the source string because it is Latin1.
|
||||
static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
|
||||
@ -719,6 +749,7 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
|
||||
#ifdef V8_INTL_SUPPORT
|
||||
if (RegExpCaseFolding::IgnoreSet().contains(character)) {
|
||||
letters[0] = character;
|
||||
DCHECK(ContainsOnlyUtf16CodeUnits(letters, 1));
|
||||
return 1;
|
||||
}
|
||||
bool in_special_add_set =
|
||||
@ -747,6 +778,7 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
|
||||
letters[items++] = (unibrow::uchar)(cu);
|
||||
}
|
||||
}
|
||||
DCHECK(ContainsOnlyUtf16CodeUnits(letters, items));
|
||||
return items;
|
||||
#else
|
||||
int length =
|
||||
@ -768,6 +800,7 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
|
||||
length = new_length;
|
||||
}
|
||||
|
||||
DCHECK(ContainsOnlyUtf16CodeUnits(letters, length));
|
||||
return length;
|
||||
#endif // V8_INTL_SUPPORT
|
||||
}
|
||||
@ -820,12 +853,7 @@ static inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler,
|
||||
static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
|
||||
bool one_byte, uc16 c1, uc16 c2,
|
||||
Label* on_failure) {
|
||||
uc16 char_mask;
|
||||
if (one_byte) {
|
||||
char_mask = String::kMaxOneByteCharCode;
|
||||
} else {
|
||||
char_mask = String::kMaxUtf16CodeUnit;
|
||||
}
|
||||
const uint32_t char_mask = CharMask(one_byte);
|
||||
uc16 exor = c1 ^ c2;
|
||||
// Check whether exor has only one bit set.
|
||||
if (((exor - 1) & exor) == 0) {
|
||||
@ -1185,21 +1213,13 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
|
||||
ZoneList<CharacterRange>* ranges = cc->ranges(zone);
|
||||
CharacterRange::Canonicalize(ranges);
|
||||
|
||||
int max_char;
|
||||
if (one_byte) {
|
||||
max_char = String::kMaxOneByteCharCode;
|
||||
} else {
|
||||
max_char = String::kMaxUtf16CodeUnit;
|
||||
}
|
||||
|
||||
const uc32 max_char = MaxCodeUnit(one_byte);
|
||||
int range_count = ranges->length();
|
||||
|
||||
int last_valid_range = range_count - 1;
|
||||
while (last_valid_range >= 0) {
|
||||
CharacterRange& range = ranges->at(last_valid_range);
|
||||
if (static_cast<int>(range.from()) <= max_char) {
|
||||
break;
|
||||
}
|
||||
if (range.from() <= max_char) break;
|
||||
last_valid_range--;
|
||||
}
|
||||
|
||||
@ -1257,7 +1277,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
|
||||
range_boundaries->Add(range.to() + 1, zone);
|
||||
}
|
||||
int end_index = range_boundaries->length() - 1;
|
||||
if (range_boundaries->at(end_index) > max_char) {
|
||||
if (static_cast<uc32>(range_boundaries->at(end_index)) > max_char) {
|
||||
end_index--;
|
||||
}
|
||||
|
||||
@ -1371,12 +1391,7 @@ static inline uint32_t SmearBitsRight(uint32_t v) {
|
||||
|
||||
bool QuickCheckDetails::Rationalize(bool asc) {
|
||||
bool found_useful_op = false;
|
||||
uint32_t char_mask;
|
||||
if (asc) {
|
||||
char_mask = String::kMaxOneByteCharCode;
|
||||
} else {
|
||||
char_mask = String::kMaxUtf16CodeUnit;
|
||||
}
|
||||
const uint32_t char_mask = CharMask(asc);
|
||||
mask_ = 0;
|
||||
value_ = 0;
|
||||
int char_shift = 0;
|
||||
@ -1496,12 +1511,7 @@ bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
|
||||
if (details->characters() == 1) {
|
||||
// If number of characters preloaded is 1 then we used a byte or 16 bit
|
||||
// load so the value is already masked down.
|
||||
uint32_t char_mask;
|
||||
if (compiler->one_byte()) {
|
||||
char_mask = String::kMaxOneByteCharCode;
|
||||
} else {
|
||||
char_mask = String::kMaxUtf16CodeUnit;
|
||||
}
|
||||
const uint32_t char_mask = CharMask(compiler->one_byte());
|
||||
if ((mask & char_mask) == char_mask) need_mask = false;
|
||||
mask &= char_mask;
|
||||
} else {
|
||||
@ -1552,12 +1562,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
|
||||
Isolate* isolate = compiler->macro_assembler()->isolate();
|
||||
DCHECK(characters_filled_in < details->characters());
|
||||
int characters = details->characters();
|
||||
int char_mask;
|
||||
if (compiler->one_byte()) {
|
||||
char_mask = String::kMaxOneByteCharCode;
|
||||
} else {
|
||||
char_mask = String::kMaxUtf16CodeUnit;
|
||||
}
|
||||
const uint32_t char_mask = CharMask(compiler->one_byte());
|
||||
for (int k = 0; k < elements()->length(); k++) {
|
||||
TextElement elm = elements()->at(k);
|
||||
if (elm.text_type() == TextElement::ATOM) {
|
||||
@ -1637,7 +1642,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
|
||||
pos->value = 0;
|
||||
} else {
|
||||
int first_range = 0;
|
||||
while (static_cast<int>(ranges->at(first_range).from()) > char_mask) {
|
||||
while (ranges->at(first_range).from() > char_mask) {
|
||||
first_range++;
|
||||
if (first_range == ranges->length()) {
|
||||
details->set_cannot_match();
|
||||
@ -1646,26 +1651,22 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
|
||||
}
|
||||
}
|
||||
CharacterRange range = ranges->at(first_range);
|
||||
uc16 from = range.from();
|
||||
uc16 to = range.to();
|
||||
if (to > char_mask) {
|
||||
to = char_mask;
|
||||
}
|
||||
uint32_t differing_bits = (from ^ to);
|
||||
const uc32 first_from = range.from();
|
||||
const uc32 first_to = (range.to() > char_mask) ? char_mask : range.to();
|
||||
const uint32_t differing_bits = (first_from ^ first_to);
|
||||
// A mask and compare is only perfect if the differing bits form a
|
||||
// number like 00011111 with one single block of trailing 1s.
|
||||
if ((differing_bits & (differing_bits + 1)) == 0 &&
|
||||
from + differing_bits == to) {
|
||||
first_from + differing_bits == first_to) {
|
||||
pos->determines_perfectly = true;
|
||||
}
|
||||
uint32_t common_bits = ~SmearBitsRight(differing_bits);
|
||||
uint32_t bits = (from & common_bits);
|
||||
uint32_t bits = (first_from & common_bits);
|
||||
for (int i = first_range + 1; i < ranges->length(); i++) {
|
||||
CharacterRange range = ranges->at(i);
|
||||
uc16 from = range.from();
|
||||
uc16 to = range.to();
|
||||
const uc32 from = range.from();
|
||||
if (from > char_mask) continue;
|
||||
if (to > char_mask) to = char_mask;
|
||||
const uc32 to = (range.to() > char_mask) ? char_mask : range.to();
|
||||
// Here we are combining more ranges into the mask and compare
|
||||
// value. With each new range the mask becomes more sparse and
|
||||
// so the chances of a false positive rise. A character class
|
||||
@ -1685,9 +1686,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
|
||||
}
|
||||
characters_filled_in++;
|
||||
DCHECK(characters_filled_in <= details->characters());
|
||||
if (characters_filled_in == details->characters()) {
|
||||
return;
|
||||
}
|
||||
if (characters_filled_in == details->characters()) return;
|
||||
}
|
||||
}
|
||||
DCHECK(characters_filled_in != details->characters());
|
||||
@ -1749,7 +1748,7 @@ void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
|
||||
pos->mask &= other_pos->mask;
|
||||
pos->value &= pos->mask;
|
||||
other_pos->value &= pos->mask;
|
||||
uc16 differing_bits = (pos->value ^ other_pos->value);
|
||||
uint32_t differing_bits = (pos->value ^ other_pos->value);
|
||||
pos->mask &= ~differing_bits;
|
||||
pos->value &= pos->mask;
|
||||
}
|
||||
@ -1859,16 +1858,20 @@ RegExpNode* TextNode::FilterOneByte(int depth) {
|
||||
if (range_count != 0 && ranges->at(0).from() == 0 &&
|
||||
ranges->at(0).to() >= String::kMaxOneByteCharCode) {
|
||||
// This will be handled in a later filter.
|
||||
if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
|
||||
if (IgnoreCase(cc->flags()) &&
|
||||
RangesContainLatin1Equivalents(ranges)) {
|
||||
continue;
|
||||
}
|
||||
return set_replacement(nullptr);
|
||||
}
|
||||
} else {
|
||||
if (range_count == 0 ||
|
||||
ranges->at(0).from() > String::kMaxOneByteCharCode) {
|
||||
// This will be handled in a later filter.
|
||||
if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
|
||||
if (IgnoreCase(cc->flags()) &&
|
||||
RangesContainLatin1Equivalents(ranges)) {
|
||||
continue;
|
||||
}
|
||||
return set_replacement(nullptr);
|
||||
}
|
||||
}
|
||||
@ -2505,12 +2508,7 @@ RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
|
||||
return ranges->length() == 0 ? on_success() : nullptr;
|
||||
}
|
||||
if (ranges->length() != 1) return nullptr;
|
||||
uint32_t max_char;
|
||||
if (compiler->one_byte()) {
|
||||
max_char = String::kMaxOneByteCharCode;
|
||||
} else {
|
||||
max_char = String::kMaxUtf16CodeUnit;
|
||||
}
|
||||
const uc32 max_char = MaxCodeUnit(compiler->one_byte());
|
||||
return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
|
||||
}
|
||||
|
||||
@ -2720,12 +2718,9 @@ void BoyerMoorePositionInfo::SetAll() {
|
||||
|
||||
BoyerMooreLookahead::BoyerMooreLookahead(int length, RegExpCompiler* compiler,
|
||||
Zone* zone)
|
||||
: length_(length), compiler_(compiler) {
|
||||
if (compiler->one_byte()) {
|
||||
max_char_ = String::kMaxOneByteCharCode;
|
||||
} else {
|
||||
max_char_ = String::kMaxUtf16CodeUnit;
|
||||
}
|
||||
: length_(length),
|
||||
compiler_(compiler),
|
||||
max_char_(MaxCodeUnit(compiler->one_byte())) {
|
||||
bitmaps_ = new (zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
|
||||
for (int i = 0; i < length; i++) {
|
||||
bitmaps_->Add(new (zone) BoyerMoorePositionInfo(), zone);
|
||||
|
@ -96,8 +96,8 @@ class QuickCheckDetails {
|
||||
void set_cannot_match() { cannot_match_ = true; }
|
||||
struct Position {
|
||||
Position() : mask(0), value(0), determines_perfectly(false) {}
|
||||
uc16 mask;
|
||||
uc16 value;
|
||||
uc32 mask;
|
||||
uc32 value;
|
||||
bool determines_perfectly;
|
||||
};
|
||||
int characters() { return characters_; }
|
||||
|
@ -143,7 +143,7 @@ void DotPrinterImpl::VisitText(TextNode* that) {
|
||||
if (node->is_negated()) os_ << "^";
|
||||
for (int j = 0; j < node->ranges(zone)->length(); j++) {
|
||||
CharacterRange range = node->ranges(zone)->at(j);
|
||||
os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
|
||||
os_ << AsUC32(range.from()) << "-" << AsUC32(range.to());
|
||||
}
|
||||
os_ << "]";
|
||||
break;
|
||||
|
6
test/mjsunit/regress/regress-v8-10568.js
Normal file
6
test/mjsunit/regress/regress-v8-10568.js
Normal file
@ -0,0 +1,6 @@
|
||||
// Copyright 2020 the V8 project authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
// Character class ranges whose upper bound lies above U+FFFF must still match
// BMP characters that fall inside the range. The upper bound of the first
// range is written as an escape so it survives re-encoding of this file.
assertEquals(/[--\u{1003e}]/u.exec("Hr3QoS3KCWXQ2yjBoDIK")[0], "H");
assertEquals(/[0-\u{10000}]/u.exec("A0")[0], "A");
|
Loading…
Reference in New Issue
Block a user