Fix some latin-1 webkit unit tests

R=yangguo@chromium.org BUG= Review URL: https://chromiumcodereview.appspot.com/11962035 Patch from Dan Carney <dcarney@google.com>. git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13455 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
2013-01-21 16:11:31 +00:00 · 2013-01-21 16:11:31 +00:00 · 0c822b21cb
commit 0c822b21cb
parent 7f331f6280
8 changed files with 146 additions and 65 deletions
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@ -2855,6 +2855,29 @@ RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
 }
 // Reports whether |range| covers any of the code points 0x39c, 0x3bc or
 // 0x178. Always answers false when ENABLE_LATIN_1 is not defined.
 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
 #ifdef ENABLE_LATIN_1
  // TODO(dcarney): this could be a lot more efficient.
  if (range.Contains(0x39c)) return true;
  if (range.Contains(0x3bc)) return true;
  return range.Contains(0x178);
 #else
  return false;
 #endif
 }
 #ifdef ENABLE_LATIN_1
 // Returns true as soon as one entry of |ranges| satisfies
 // RangeContainsLatin1Equivalents; returns false if no entry does.
 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
  // TODO(dcarney): this could be a lot more efficient.
  int index = 0;
  while (index < ranges->length()) {
    if (RangeContainsLatin1Equivalents(ranges->at(index))) return true;
    ++index;
  }
  return false;
 }
 #endif
 RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
  if (info()->replacement_calculated) return replacement();
  if (depth < 0) return this;
@ -2871,21 +2894,21 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
          return set_replacement(NULL);
        }
 #else
-        if (quarks[j] <= String::kMaxOneByteCharCode) continue;
+        uint16_t c = quarks[j];
        if (c <= String::kMaxOneByteCharCode) continue;
        if (!ignore_case) return set_replacement(NULL);
        // Here, we need to check for characters whose upper and lower cases
        // are outside the Latin-1 range.
-        if (!unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(quarks[j])) {
+        uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
-          return set_replacement(NULL);
+        // Character is outside Latin-1 completely
-        }
+        if (converted == 0) return set_replacement(NULL);
        // Convert quark to Latin-1 in place.
        uint16_t* copy = const_cast<uint16_t*>(quarks.start());
        copy[j] = converted;
 #endif
      }
    } else {
      ASSERT(elm.type == TextElement::CHAR_CLASS);
 #ifdef ENABLE_LATIN_1
      // TODO(dcarney): Can this be improved?
      if (ignore_case) continue;
 #endif
      RegExpCharacterClass* cc = elm.data.u_char_class;
      ZoneList<CharacterRange>* ranges = cc->ranges(zone());
      if (!CharacterRange::IsCanonical(ranges)) {
@ -2897,11 +2920,19 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
        if (range_count != 0 &&
            ranges->at(0).from() == 0 &&
            ranges->at(0).to() >= String::kMaxOneByteCharCode) {
 #ifdef ENABLE_LATIN_1
          // This will be handled in a later filter.
          if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
 #endif
          return set_replacement(NULL);
        }
      } else {
        if (range_count == 0 ||
            ranges->at(0).from() > String::kMaxOneByteCharCode) {
 #ifdef ENABLE_LATIN_1
          // This will be handled in a later filter.
          if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
 #endif
          return set_replacement(NULL);
        }
      }
@ -5354,7 +5385,7 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
  Isolate* isolate = Isolate::Current();
  uc16 bottom = from();
  uc16 top = to();
-  if (is_ascii) {
+  if (is_ascii && !RangeContainsLatin1Equivalents(*this)) {
    if (bottom > String::kMaxOneByteCharCode) return;
    if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
  }
--- a/src/regexp-macro-assembler.cc
+++ b/src/regexp-macro-assembler.cc
@ -210,6 +210,26 @@ const byte NativeRegExpMacroAssembler::word_character_map[] = {
    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'h' - 'o'
    0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'p' - 'w'
    0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
    // Latin-1 range
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
 };
--- a/src/regexp-macro-assembler.h
+++ b/src/regexp-macro-assembler.h
@ -244,10 +244,10 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
  static const byte* StringCharacterPosition(String* subject, int start_index);
-  // Byte map of ASCII characters with a 0xff if the character is a word
+  // Byte map of one byte characters with a 0xff if the character is a word
  // character (digit, letter or underscore) and 0x00 otherwise.
  // Used by generated RegExp code.
-  static const byte word_character_map[128];
+  static const byte word_character_map[256];
  static Address word_character_map_address() {
    return const_cast<Address>(&word_character_map[0]);
--- a/src/runtime.cc
+++ b/src/runtime.cc
@ -5051,8 +5051,8 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_StringToNumber) {
      // Fast check for a junk value. A valid string may start from a
      // whitespace, a sign ('+' or '-'), the decimal point, a decimal digit or
      // the 'I' character ('Infinity'). All of that have codes not greater than
-      // '9' except 'I'.
+      // '9' except 'I' and &nbsp;.
-      if (data[start_pos] != 'I') {
+      if (data[start_pos] != 'I' && data[start_pos] != 0xa0) {
        return isolate->heap()->nan_value();
      }
    } else if (len - start_pos < 10 && AreDigits(data, start_pos, len)) {
--- a/src/unicode-inl.h
+++ b/src/unicode-inl.h
@ -79,33 +79,19 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
 }
-bool Latin1::NonLatin1CanBeConvertedToLatin1(uint16_t c) {
+uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
  ASSERT(c > Latin1::kMaxChar);
  switch (c) {
-    case 0x130:
+    // These are equivalent characters in unicode.
-    case 0x131:
+    case 0x39c:
-    case 0x149:
+    case 0x3bc:
      return 0xb5;
    // This is an uppercase of a Latin-1 character
    // outside of Latin-1.
    case 0x178:
-    case 0x17f:
+      return 0xff;
    case 0x1f0:
    case 0x1e96:
    case 0x1e97:
    case 0x1e98:
    case 0x1e99:
    case 0x1e9a:
    case 0x1e9e:
    case 0x212a:
    case 0x212b:
    case 0xfb00:
    case 0xfb01:
    case 0xfb02:
    case 0xfb03:
    case 0xfb04:
    case 0xfb05:
    case 0xfb06:
      return true;
  }
-  return false;
+  return 0;
 }
--- a/src/unicode.h
+++ b/src/unicode.h
@ -140,7 +140,10 @@ class Latin1 {
 #else
  static const unsigned kMaxChar = 0xff;
 #endif
-  static inline bool NonLatin1CanBeConvertedToLatin1(uint16_t);
+  // Returns 0 if character does not convert to single latin-1 character
  // or if the character does not convert back to latin-1 via inverse
  // operation (upper to lower, etc).
  static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
 };
 class Utf8 {
--- a/test/cctest/test-strings.cc
+++ b/test/cctest/test-strings.cc
@ -1277,38 +1277,60 @@ TEST(IsAscii) {
 }
-static bool CanBeConvertedToLatin1(uint16_t c) {
+
-  CHECK(c > unibrow::Latin1::kMaxChar);
+#ifdef ENABLE_LATIN_1
-  uint32_t result[4];
+template<typename Op, bool return_first>
 static uint16_t ConvertLatin1(uint16_t c) {
  uint32_t result[Op::kMaxWidth];
  int chars;
-  chars = unibrow::ToLowercase::Convert(c, 0, result, NULL);
+  chars = Op::Convert(c, 0, result, NULL);
-  if (chars > 0) {
+  if (chars == 0) return 0;
-    CHECK_LE(chars, static_cast<int>(sizeof(result)));
+  CHECK_LE(chars, static_cast<int>(sizeof(result)));
-    for (int i = 0; i < chars; i++) {
+  if (!return_first && chars > 1) {
-      if (result[i] <= unibrow::Latin1::kMaxChar) {
+    return 0;
        return true;
      }
    }
  }
-  chars = unibrow::ToUppercase::Convert(c, 0, result, NULL);
+  return result[0];
  if (chars > 0) {
    CHECK_LE(chars, static_cast<int>(sizeof(result)));
    for (int i = 0; i < chars; i++) {
      if (result[i] <= unibrow::Latin1::kMaxChar) {
        return true;
      }
    }
  }
  return false;
 }
-TEST(Latin1) {
+static void CheckCanonicalEquivalence(uint16_t c, uint16_t test) {
-#ifndef ENABLE_LATIN_1
+  uint16_t expect = ConvertLatin1<unibrow::Ecma262UnCanonicalize, true>(c);
-    if (true) return;
+  if (expect > unibrow::Latin1::kMaxChar) expect = 0;
-#endif
+  CHECK_EQ(expect, test);
-  for (uint16_t c = unibrow::Latin1::kMaxChar + 1; c != 0; c++) {
+}
-    CHECK_EQ(CanBeConvertedToLatin1(c),
+
-             unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(c));
+
 // Cross-checks Latin1::ConvertNonLatin1ToLatin1 against the unibrow
 // ToLowercase / ToUppercase mappings for every 16-bit code unit above
 // Latin1::kMaxChar.
 TEST(Latin1IgnoreCase) {
  // NOTE: this unconditional return means the body below never runs.
  if (true) return;
  using namespace unibrow;
  // c is a uint16_t, so the loop ends when c++ wraps around to 0.
  for (uint16_t c = Latin1::kMaxChar + 1; c != 0; c++) {
    // NOTE(review): ConvertLatin1<Op, false> appears to yield 0 when Op has
    // no mapping or maps to more than one character -- confirm against the
    // helper's definition.
    uint16_t lower = ConvertLatin1<ToLowercase, false>(c);
    uint16_t upper = ConvertLatin1<ToUppercase, false>(c);
    uint16_t test = Latin1::ConvertNonLatin1ToLatin1(c);
    // Filter out all characters whose upper is not their lower or vice versa.
    // Each filtered character is checked via CheckCanonicalEquivalence instead.
    if (lower == 0 && upper == 0) {
      CheckCanonicalEquivalence(c, test);
      continue;
    }
    // Both conversions land outside Latin-1.
    if (lower > Latin1::kMaxChar && upper > Latin1::kMaxChar) {
      CheckCanonicalEquivalence(c, test);
      continue;
    }
    // Only an uppercase result exists: derive lower by lowercasing it.
    if (lower == 0 && upper != 0) {
      lower = ConvertLatin1<ToLowercase, false>(upper);
    }
    // No uppercase result and lower differs from c: derive upper from lower.
    if (upper == 0 && lower != c) {
      upper = ConvertLatin1<ToUppercase, false>(lower);
    }
    // Repeat the range check now that lower/upper may have been recomputed.
    if (lower > Latin1::kMaxChar && upper > Latin1::kMaxChar) {
      CheckCanonicalEquivalence(c, test);
      continue;
    }
    // Neither conversion result is c itself.
    if (upper != c && lower != c) {
      CheckCanonicalEquivalence(c, test);
      continue;
    }
    // Otherwise the expected conversion is the smaller of the two results.
    CHECK_EQ(Min(upper, lower), test);
  }
 }
 #endif  // ENABLE_LATIN_1
--- a/test/mjsunit/regress/regress-latin-1.js
+++ b/test/mjsunit/regress/regress-latin-1.js
@ -57,3 +57,22 @@ for (var i = 0; i < 0xff; i++) {
 // Should have hit the branch for the following char codes:
 // [A-Z], [192-222] but not 215
 assertEquals((90-65+1)+(222-192-1+1), total_lo);
 // Latin-1 whitespace character
 assertEquals( 1, +(String.fromCharCode(0xA0) + '1') );
 // Latin-1 \W characters
 assertEquals(["+\u00a3", "=="], "+\u00a3==".match(/\W\W/g));
 // Latin-1 character that uppercases out of Latin-1.
 assertTrue(/\u0178/i.test('\u00ff'));
 // Unicode equivalence
 assertTrue(/\u039c/i.test('\u00b5'));
 assertTrue(/\u039c/i.test('\u03bc'));
 assertTrue(/\u00b5/i.test('\u03bc'));
 // Unicode equivalence ranges
 assertTrue(/[\u039b-\u039d]/i.test('\u00b5'));
 assertFalse(/[^\u039b-\u039d]/i.test('\u00b5'));
 assertFalse(/[\u039b-\u039d]/.test('\u00b5'));
 assertTrue(/[^\u039b-\u039d]/.test('\u00b5'));