[regexp] Handle marker value 0x10ffff in MakeRangeArray

Unfortunately, CharacterRanges may use 0x10ffff as a marker value
signifying 'highest possible code unit' irrespective of whether the
regexp instance has the unicode flag or not. This value makes it
through RegExpCharacterClass::ToNode unmodified (since no surrogate
desugaring takes place without /u). Correctly mask the marker down to
0xffff for purposes of building our uint16_t range array.

Note: It'd be better to never introduce 0x10ffff in the first place,
but given the irregexp pipeline's lack of hackability I hesitate to
change this - we are sure to rely on it implicitly in other spots.

Drive-by: Refactors.

Fixed: chromium:1264508
Bug: v8:11069
Change-Id: Ib3c5780e91f682f1a6d15f26eb4cf03636d93c25
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3256549
Commit-Queue: Jakob Gruber <jgruber@chromium.org>
Reviewed-by: Mathias Bynens <mathias@chromium.org>
Cr-Commit-Position: refs/heads/main@{#77673}
This commit is contained in:
Jakob Gruber 2021-11-03 09:43:16 +01:00 committed by V8 LUCI CQ
parent d0df1ebce8
commit bfa681ffb9
5 changed files with 32 additions and 26 deletions

View File

@ -112,9 +112,6 @@ class CharacterRange {
return list;
}
V8_EXPORT_PRIVATE static void AddClassEscape(
StandardCharacterSet standard_character_set,
ZoneList<CharacterRange>* ranges, Zone* zone);
// Add class escapes. Add case equivalent closure for \w and \W if necessary.
V8_EXPORT_PRIVATE static void AddClassEscape(
StandardCharacterSet standard_character_set,

View File

@ -50,17 +50,17 @@ namespace {
bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
const int* special_class, int length) {
length--; // Remove final marker.
DCHECK_EQ(kRangeEndMarker, special_class[length]);
DCHECK_NE(0, ranges->length());
DCHECK_NE(0, length);
DCHECK_NE(0, special_class[0]);
if (ranges->length() != (length >> 1) + 1) {
return false;
}
if (ranges->length() != (length >> 1) + 1) return false;
CharacterRange range = ranges->at(0);
if (range.from() != 0) {
return false;
}
if (range.from() != 0) return false;
for (int i = 0; i < length; i += 2) {
if (static_cast<base::uc32>(special_class[i]) != (range.to() + 1)) {
return false;
@ -70,19 +70,17 @@ bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
return false;
}
}
if (range.to() != kMaxCodePoint) {
return false;
}
return true;
return range.to() == kMaxCodePoint;
}
bool CompareRanges(ZoneList<CharacterRange>* ranges, const int* special_class,
int length) {
length--; // Remove final marker.
DCHECK_EQ(kRangeEndMarker, special_class[length]);
if (ranges->length() * 2 != length) {
return false;
}
if (ranges->length() * 2 != length) return false;
for (int i = 0; i < length; i += 2) {
CharacterRange range = ranges->at(i >> 1);
if (range.from() != static_cast<base::uc32>(special_class[i]) ||
@ -1157,12 +1155,7 @@ void CharacterRange::AddClassEscape(StandardCharacterSet standard_character_set,
ranges->AddAll(*new_ranges, zone);
return;
}
AddClassEscape(standard_character_set, ranges, zone);
}
void CharacterRange::AddClassEscape(StandardCharacterSet standard_character_set,
ZoneList<CharacterRange>* ranges,
Zone* zone) {
switch (standard_character_set) {
case StandardCharacterSet::kWhitespace:
AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);

View File

@ -115,9 +115,16 @@ uint32_t Hash(const ZoneList<CharacterRange>* ranges) {
return static_cast<uint32_t>(seed);
}
constexpr base::uc32 MaskEndOfRangeMarker(base::uc32 c) {
  // A CharacterRange can carry 0x10ffff as its end-of-range marker whether or
  // not the regexp IsUnicode. The only value above kMaxUInt16 expected here is
  // that marker (checked against String::kMaxCodePoint below); keeping just
  // the low 16 bits maps it to 0xffff and leaves any value that already fits
  // in 16 bits unchanged.
  DCHECK_IMPLIES(c > kMaxUInt16, c == String::kMaxCodePoint);
  constexpr base::uc32 kLow16Bits = 0xffff;
  return c & kLow16Bits;
}
int RangeArrayLengthFor(const ZoneList<CharacterRange>* ranges) {
const int ranges_length = ranges->length();
return ranges->at(ranges_length - 1).to() == kMaxUInt16
return MaskEndOfRangeMarker(ranges->at(ranges_length - 1).to()) == kMaxUInt16
? ranges_length * 2 - 1
: ranges_length * 2;
}
@ -146,11 +153,13 @@ Handle<ByteArray> MakeRangeArray(Isolate* isolate,
const CharacterRange& r = ranges->at(i);
DCHECK_LE(r.from(), kMaxUInt16);
range_array->set_uint16(i * 2 + 0, r.from());
if (i == ranges_length - 1 && r.to() == kMaxUInt16) {
const base::uc32 to = MaskEndOfRangeMarker(r.to());
if (i == ranges_length - 1 && to == kMaxUInt16) {
DCHECK_EQ(byte_array_length, ranges_length * 2 - 1);
break; // Avoid overflow by leaving the last range open-ended.
}
DCHECK_LT(r.to(), kMaxUInt16);
range_array->set_uint16(i * 2 + 1, r.to() + 1); // Exclusive.
DCHECK_LT(to, kMaxUInt16);
range_array->set_uint16(i * 2 + 1, to + 1); // Exclusive.
}
return range_array;
}

View File

@ -510,7 +510,7 @@ static void TestCharacterClassEscapes(StandardCharacterSet c,
Zone zone(CcTest::i_isolate()->allocator(), ZONE_NAME);
ZoneList<CharacterRange>* ranges =
zone.New<ZoneList<CharacterRange>>(2, &zone);
CharacterRange::AddClassEscape(c, ranges, &zone);
CharacterRange::AddClassEscape(c, ranges, false, &zone);
for (base::uc32 i = 0; i < (1 << 16); i++) {
bool in_class = false;
for (int j = 0; !in_class && j < ranges->length(); j++) {

View File

@ -0,0 +1,7 @@
// Copyright 2021 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// Flags: --no-regexp-tier-up
// Regression test for chromium:1264508. The character class contains \W and
// the regexp has no /u flag; U+2603 is a non-word character, so exec() must
// return a match rather than null.
// NOTE(review): presumably \W is what produces the 0x10ffff end-of-range
// marker described in the accompanying commit message — confirm.
assertNotNull(/[nyreekp\W]/isy.exec("\u2603"));