Reland "[regexp] fix Latin1 ignore-case bug."

Bug: v8:6703 Change-Id: I225cd78bedf2c0c123aedd3deeb1cd6d442f7697 Reviewed-on: https://chromium-review.googlesource.com/901522 Reviewed-by: Jakob Gruber <jgruber@chromium.org> Commit-Queue: Yang Guo <yangguo@chromium.org> Cr-Commit-Position: refs/heads/master@{#51114}
2018-02-05 15:29:59 +01:00 · 2018-02-05 15:29:59 +01:00 · d17b4bfb27
commit d17b4bfb27
parent 8f96f66f66
4 changed files with 46 additions and 23 deletions
--- a/src/regexp/jsregexp.cc
+++ b/src/regexp/jsregexp.cc
@ -2768,16 +2768,13 @@ RegExpNode* TextNode::FilterOneByte(int depth) {
      Vector<const uc16> quarks = elm.atom()->data();
      for (int j = 0; j < quarks.length(); j++) {
        uint16_t c = quarks[j];
-        if (c <= String::kMaxOneByteCharCode) continue;
-        if (!IgnoreCase(elm.atom()->flags())) return set_replacement(nullptr);
-        // Here, we need to check for characters whose upper and lower cases
-        // are outside the Latin-1 range.
-        uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
-        // Character is outside Latin-1 completely
-        if (converted == 0) return set_replacement(nullptr);
-        // Convert quark to Latin-1 in place.
-        uint16_t* copy = const_cast<uint16_t*>(quarks.start());
-        copy[j] = converted;
+        if (elm.atom()->ignore_case()) {
+          c = unibrow::Latin1::TryConvertToLatin1(c);
+        }
+        if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
+        // Replace quark in case we converted to Latin-1.
+        uint16_t* writable_quarks = const_cast<uint16_t*>(quarks.start());
+        writable_quarks[j] = c;
      }
    } else {
      DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
@ -3209,10 +3206,17 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
        if (first_element_checked && i == 0 && j == 0) continue;
        if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
        EmitCharacterFunction* emit_function = nullptr;
+        uc16 quark = quarks[j];
+        if (elm.atom()->ignore_case()) {
+          // Everywhere else we assume that a non-Latin-1 character cannot match
+          // a Latin-1 character. Avoid the cases where this assumption is
+          // invalid by using the Latin1 equivalent instead.
+          quark = unibrow::Latin1::TryConvertToLatin1(quark);
+        }
        switch (pass) {
          case NON_LATIN1_MATCH:
            DCHECK(one_byte);
-            if (quarks[j] > String::kMaxOneByteCharCode) {
+            if (quark > String::kMaxOneByteCharCode) {
              assembler->GoTo(backtrack);
              return;
            }
@ -3232,8 +3236,8 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
        if (emit_function != nullptr) {
          bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
          bool bound_checked =
-              emit_function(isolate, compiler, quarks[j], backtrack,
-                            cp_offset + j, bounds_check, preloaded);
+              emit_function(isolate, compiler, quark, backtrack, cp_offset + j,
+                            bounds_check, preloaded);
          if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
        }
      }
--- a/src/unicode-decoder.h
+++ b/src/unicode-decoder.h
@ -93,15 +93,11 @@ size_t Utf8Decoder<kBufferSize>::WriteUtf16(uint16_t* data,
 class Latin1 {
 public:
  static const unsigned kMaxChar = 0xff;
-  // Returns 0 if character does not convert to single latin-1 character
-  // or if the character doesn't not convert back to latin-1 via inverse
-  // operation (upper to lower, etc).
-  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
+  // Returns the Latin-1 case equivalent if one exists, else the input.
+  static inline uint16_t TryConvertToLatin1(uint16_t);
 };

-
-uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
-  DCHECK_GT(c, Latin1::kMaxChar);
+uint16_t Latin1::TryConvertToLatin1(uint16_t c) {
  switch (c) {
    // This are equivalent characters in unicode.
    case 0x39c:
@ -112,7 +108,7 @@ uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
    case 0x178:
      return 0xff;
  }
-  return 0;
+  return c;
 }


--- a/test/cctest/test-strings.cc
+++ b/test/cctest/test-strings.cc
@ -1505,7 +1505,7 @@ static uint16_t ConvertLatin1(uint16_t c) {
 #ifndef V8_INTL_SUPPORT
 static void CheckCanonicalEquivalence(uint16_t c, uint16_t test) {
  uint16_t expect = ConvertLatin1<unibrow::Ecma262UnCanonicalize, true>(c);
-  if (expect > unibrow::Latin1::kMaxChar) expect = 0;
+  if (expect > unibrow::Latin1::kMaxChar || expect == 0) expect = c;
  CHECK_EQ(expect, test);
 }

@ -1514,7 +1514,7 @@ TEST(Latin1IgnoreCase) {
  for (uint16_t c = unibrow::Latin1::kMaxChar + 1; c != 0; c++) {
    uint16_t lower = ConvertLatin1<unibrow::ToLowercase, false>(c);
    uint16_t upper = ConvertLatin1<unibrow::ToUppercase, false>(c);
-    uint16_t test = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
+    uint16_t test = unibrow::Latin1::TryConvertToLatin1(c);
    // Filter out all character whose upper is not their lower or vice versa.
    if (lower == 0 && upper == 0) {
      CheckCanonicalEquivalence(c, test);
--- a/test/mjsunit/regress/regress-6703.js
+++ b/test/mjsunit/regress/regress-6703.js
@ -0,0 +1,23 @@
+// Copyright 2018 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+assertTrue(/(\u039C)/i.test("\xB5"));
+assertTrue(/(\u039C)+/i.test("\xB5"));
+assertTrue(/(\u039C)/ui.test("\xB5"));
+assertTrue(/(\u039C)+/ui.test("\xB5"));
+
+assertTrue(/(\u03BC)/i.test("\xB5"));
+assertTrue(/(\u03BC)+/i.test("\xB5"));
+assertTrue(/(\u03BC)/ui.test("\xB5"));
+assertTrue(/(\u03BC)+/ui.test("\xB5"));
+
+assertTrue(/(\u03BC)/i.test("\u039C"));
+assertTrue(/(\u03BC)+/i.test("\u039C"));
+assertTrue(/(\u03BC)/ui.test("\u039C"));
+assertTrue(/(\u03BC)+/ui.test("\u039C"));
+
+assertTrue(/(\u0178)/i.test("\xFF"));
+assertTrue(/(\u0178)+/i.test("\xFF"));
+assertTrue(/(\u0178)/ui.test("\xFF"));
+assertTrue(/(\u0178)+/ui.test("\xFF"));