diff --git a/src/regexp/jsregexp.cc b/src/regexp/jsregexp.cc index a26a1d77ce..30e39f7c5c 100644 --- a/src/regexp/jsregexp.cc +++ b/src/regexp/jsregexp.cc @@ -2768,16 +2768,13 @@ RegExpNode* TextNode::FilterOneByte(int depth) { Vector quarks = elm.atom()->data(); for (int j = 0; j < quarks.length(); j++) { uint16_t c = quarks[j]; - if (c <= String::kMaxOneByteCharCode) continue; - if (!IgnoreCase(elm.atom()->flags())) return set_replacement(nullptr); - // Here, we need to check for characters whose upper and lower cases - // are outside the Latin-1 range. - uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); - // Character is outside Latin-1 completely - if (converted == 0) return set_replacement(nullptr); - // Convert quark to Latin-1 in place. - uint16_t* copy = const_cast(quarks.start()); - copy[j] = converted; + if (elm.atom()->ignore_case()) { + c = unibrow::Latin1::TryConvertToLatin1(c); + } + if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr); + // Replace quark in case we converted to Latin-1. + uint16_t* writable_quarks = const_cast(quarks.start()); + writable_quarks[j] = c; } } else { DCHECK(elm.text_type() == TextElement::CHAR_CLASS); @@ -3209,10 +3206,17 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, if (first_element_checked && i == 0 && j == 0) continue; if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; EmitCharacterFunction* emit_function = nullptr; + uc16 quark = quarks[j]; + if (elm.atom()->ignore_case()) { + // Everywhere else we assume that a non-Latin-1 character cannot match + // a Latin-1 character. Avoid the cases where this assumption is + // invalid by using the Latin1 equivalent instead. 
+ quark = unibrow::Latin1::TryConvertToLatin1(quark); + } switch (pass) { case NON_LATIN1_MATCH: DCHECK(one_byte); - if (quarks[j] > String::kMaxOneByteCharCode) { + if (quark > String::kMaxOneByteCharCode) { assembler->GoTo(backtrack); return; } @@ -3232,8 +3236,8 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, if (emit_function != nullptr) { bool bounds_check = *checked_up_to < cp_offset + j || read_backward(); bool bound_checked = - emit_function(isolate, compiler, quarks[j], backtrack, - cp_offset + j, bounds_check, preloaded); + emit_function(isolate, compiler, quark, backtrack, cp_offset + j, + bounds_check, preloaded); if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); } } diff --git a/src/unicode-decoder.h b/src/unicode-decoder.h index 38a1837af3..97f551f4c6 100644 --- a/src/unicode-decoder.h +++ b/src/unicode-decoder.h @@ -93,15 +93,11 @@ size_t Utf8Decoder::WriteUtf16(uint16_t* data, class Latin1 { public: static const unsigned kMaxChar = 0xff; - // Returns 0 if character does not convert to single latin-1 character - // or if the character doesn't not convert back to latin-1 via inverse - // operation (upper to lower, etc). - static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t); + // Convert the character to Latin-1 case equivalent if possible. + static inline uint16_t TryConvertToLatin1(uint16_t); }; - -uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) { - DCHECK_GT(c, Latin1::kMaxChar); +uint16_t Latin1::TryConvertToLatin1(uint16_t c) { switch (c) { // This are equivalent characters in unicode. 
case 0x39c: @@ -112,7 +108,7 @@ uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) { case 0x178: return 0xff; } - return 0; + return c; } diff --git a/test/cctest/test-strings.cc b/test/cctest/test-strings.cc index ba6186828d..1cdcf77fa7 100644 --- a/test/cctest/test-strings.cc +++ b/test/cctest/test-strings.cc @@ -1505,7 +1505,7 @@ static uint16_t ConvertLatin1(uint16_t c) { #ifndef V8_INTL_SUPPORT static void CheckCanonicalEquivalence(uint16_t c, uint16_t test) { uint16_t expect = ConvertLatin1(c); - if (expect > unibrow::Latin1::kMaxChar) expect = 0; + if (expect > unibrow::Latin1::kMaxChar || expect == 0) expect = c; CHECK_EQ(expect, test); } @@ -1514,7 +1514,7 @@ TEST(Latin1IgnoreCase) { for (uint16_t c = unibrow::Latin1::kMaxChar + 1; c != 0; c++) { uint16_t lower = ConvertLatin1(c); uint16_t upper = ConvertLatin1(c); - uint16_t test = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); + uint16_t test = unibrow::Latin1::TryConvertToLatin1(c); // Filter out all character whose upper is not their lower or vice versa. if (lower == 0 && upper == 0) { CheckCanonicalEquivalence(c, test); diff --git a/test/mjsunit/regress/regress-6703.js b/test/mjsunit/regress/regress-6703.js new file mode 100644 index 0000000000..82bf21d55e --- /dev/null +++ b/test/mjsunit/regress/regress-6703.js @@ -0,0 +1,23 @@ +// Copyright 2018 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +assertTrue(/(\u039C)/i.test("\xB5")); +assertTrue(/(\u039C)+/i.test("\xB5")); +assertTrue(/(\u039C)/ui.test("\xB5")); +assertTrue(/(\u039C)+/ui.test("\xB5")); + +assertTrue(/(\u03BC)/i.test("\xB5")); +assertTrue(/(\u03BC)+/i.test("\xB5")); +assertTrue(/(\u03BC)/ui.test("\xB5")); +assertTrue(/(\u03BC)+/ui.test("\xB5")); + +assertTrue(/(\u03BC)/i.test("\u039C")); +assertTrue(/(\u03BC)+/i.test("\u039C")); +assertTrue(/(\u03BC)/ui.test("\u039C")); +assertTrue(/(\u03BC)+/ui.test("\u039C")); + +assertTrue(/(\u0178)/i.test("\xFF")); +assertTrue(/(\u0178)+/i.test("\xFF")); +assertTrue(/(\u0178)/ui.test("\xFF")); +assertTrue(/(\u0178)+/ui.test("\xFF"));