Fix some latin-1 webkit unit tests

R=yangguo@chromium.org
BUG=

Review URL: https://chromiumcodereview.appspot.com/11962035
Patch from Dan Carney <dcarney@google.com>.

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13455 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
yangguo@chromium.org 2013-01-21 16:11:31 +00:00
parent 7f331f6280
commit 0c822b21cb
8 changed files with 146 additions and 65 deletions

View File

@ -2855,6 +2855,29 @@ RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
} }
// Reports whether |range| covers at least one of the three code points that
// lie outside Latin-1 but are treated as equivalents of a Latin-1 character:
// 0x39c, 0x3bc and 0x178. Without ENABLE_LATIN_1 the answer is always false.
static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
#ifdef ENABLE_LATIN_1
  // TODO(dcarney): this could be a lot more efficient.
  if (range.Contains(0x39c)) return true;
  if (range.Contains(0x3bc)) return true;
  return range.Contains(0x178);
#else
  return false;
#endif
}
#ifdef ENABLE_LATIN_1
// List form of RangeContainsLatin1Equivalents: scans |ranges| front to back
// and stops at the first range for which that predicate holds. Returns false
// when no range (or an empty list) matches.
static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
  // TODO(dcarney): this could be a lot more efficient.
  int index = 0;
  while (index < ranges->length()) {
    if (RangeContainsLatin1Equivalents(ranges->at(index))) {
      return true;
    }
    index++;
  }
  return false;
}
#endif
RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) { RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
if (info()->replacement_calculated) return replacement(); if (info()->replacement_calculated) return replacement();
if (depth < 0) return this; if (depth < 0) return this;
@ -2871,21 +2894,21 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
return set_replacement(NULL); return set_replacement(NULL);
} }
#else #else
if (quarks[j] <= String::kMaxOneByteCharCode) continue; uint16_t c = quarks[j];
if (c <= String::kMaxOneByteCharCode) continue;
if (!ignore_case) return set_replacement(NULL); if (!ignore_case) return set_replacement(NULL);
// Here, we need to check for characters whose upper and lower cases // Here, we need to check for characters whose upper and lower cases
// are outside the Latin-1 range. // are outside the Latin-1 range.
if (!unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(quarks[j])) { uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
return set_replacement(NULL); // Character is outside Latin-1 completely
} if (converted == 0) return set_replacement(NULL);
// Convert quark to Latin-1 in place.
uint16_t* copy = const_cast<uint16_t*>(quarks.start());
copy[j] = converted;
#endif #endif
} }
} else { } else {
ASSERT(elm.type == TextElement::CHAR_CLASS); ASSERT(elm.type == TextElement::CHAR_CLASS);
#ifdef ENABLE_LATIN_1
// TODO(dcarney): Can this be improved?
if (ignore_case) continue;
#endif
RegExpCharacterClass* cc = elm.data.u_char_class; RegExpCharacterClass* cc = elm.data.u_char_class;
ZoneList<CharacterRange>* ranges = cc->ranges(zone()); ZoneList<CharacterRange>* ranges = cc->ranges(zone());
if (!CharacterRange::IsCanonical(ranges)) { if (!CharacterRange::IsCanonical(ranges)) {
@ -2897,11 +2920,19 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
if (range_count != 0 && if (range_count != 0 &&
ranges->at(0).from() == 0 && ranges->at(0).from() == 0 &&
ranges->at(0).to() >= String::kMaxOneByteCharCode) { ranges->at(0).to() >= String::kMaxOneByteCharCode) {
#ifdef ENABLE_LATIN_1
// This will be handled in a later filter.
if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
#endif
return set_replacement(NULL); return set_replacement(NULL);
} }
} else { } else {
if (range_count == 0 || if (range_count == 0 ||
ranges->at(0).from() > String::kMaxOneByteCharCode) { ranges->at(0).from() > String::kMaxOneByteCharCode) {
#ifdef ENABLE_LATIN_1
// This will be handled in a later filter.
if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
#endif
return set_replacement(NULL); return set_replacement(NULL);
} }
} }
@ -5354,7 +5385,7 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
Isolate* isolate = Isolate::Current(); Isolate* isolate = Isolate::Current();
uc16 bottom = from(); uc16 bottom = from();
uc16 top = to(); uc16 top = to();
if (is_ascii) { if (is_ascii && !RangeContainsLatin1Equivalents(*this)) {
if (bottom > String::kMaxOneByteCharCode) return; if (bottom > String::kMaxOneByteCharCode) return;
if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
} }

View File

@ -210,6 +210,26 @@ const byte NativeRegExpMacroAssembler::word_character_map[] = {
0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'h' - 'o' 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'h' - 'o'
0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'p' - 'w' 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'p' - 'w'
0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z' 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
// Latin-1 range
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
}; };

View File

@ -244,10 +244,10 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
static const byte* StringCharacterPosition(String* subject, int start_index); static const byte* StringCharacterPosition(String* subject, int start_index);
// Byte map of ASCII characters with a 0xff if the character is a word // Byte map of one byte characters with a 0xff if the character is a word
// character (digit, letter or underscore) and 0x00 otherwise. // character (digit, letter or underscore) and 0x00 otherwise.
// Used by generated RegExp code. // Used by generated RegExp code.
static const byte word_character_map[128]; static const byte word_character_map[256];
static Address word_character_map_address() { static Address word_character_map_address() {
return const_cast<Address>(&word_character_map[0]); return const_cast<Address>(&word_character_map[0]);

View File

@ -5051,8 +5051,8 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_StringToNumber) {
// Fast check for a junk value. A valid string may start from a // Fast check for a junk value. A valid string may start from a
// whitespace, a sign ('+' or '-'), the decimal point, a decimal digit or // whitespace, a sign ('+' or '-'), the decimal point, a decimal digit or
// the 'I' character ('Infinity'). All of that have codes not greater than // the 'I' character ('Infinity'). All of that have codes not greater than
// '9' except 'I'. // '9' except 'I' and &nbsp;.
if (data[start_pos] != 'I') { if (data[start_pos] != 'I' && data[start_pos] != 0xa0) {
return isolate->heap()->nan_value(); return isolate->heap()->nan_value();
} }
} else if (len - start_pos < 10 && AreDigits(data, start_pos, len)) { } else if (len - start_pos < 10 && AreDigits(data, start_pos, len)) {

View File

@ -79,33 +79,19 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
} }
bool Latin1::NonLatin1CanBeConvertedToLatin1(uint16_t c) { uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
ASSERT(c > Latin1::kMaxChar); ASSERT(c > Latin1::kMaxChar);
switch (c) { switch (c) {
case 0x130: // This are equivalent characters in unicode.
case 0x131: case 0x39c:
case 0x149: case 0x3bc:
return 0xb5;
// This is an uppercase of a Latin-1 character
// outside of Latin-1.
case 0x178: case 0x178:
case 0x17f: return 0xff;
case 0x1f0:
case 0x1e96:
case 0x1e97:
case 0x1e98:
case 0x1e99:
case 0x1e9a:
case 0x1e9e:
case 0x212a:
case 0x212b:
case 0xfb00:
case 0xfb01:
case 0xfb02:
case 0xfb03:
case 0xfb04:
case 0xfb05:
case 0xfb06:
return true;
} }
return false; return 0;
} }

View File

@ -140,7 +140,10 @@ class Latin1 {
#else #else
static const unsigned kMaxChar = 0xff; static const unsigned kMaxChar = 0xff;
#endif #endif
static inline bool NonLatin1CanBeConvertedToLatin1(uint16_t); // Returns 0 if character does not convert to single latin-1 character
// or if the character does not convert back to latin-1 via inverse
// operation (upper to lower, etc).
static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
}; };
class Utf8 { class Utf8 {

View File

@ -1277,38 +1277,60 @@ TEST(IsAscii) {
} }
static bool CanBeConvertedToLatin1(uint16_t c) {
CHECK(c > unibrow::Latin1::kMaxChar); #ifdef ENABLE_LATIN_1
uint32_t result[4]; template<typename Op, bool return_first>
static uint16_t ConvertLatin1(uint16_t c) {
uint32_t result[Op::kMaxWidth];
int chars; int chars;
chars = unibrow::ToLowercase::Convert(c, 0, result, NULL); chars = Op::Convert(c, 0, result, NULL);
if (chars > 0) { if (chars == 0) return 0;
CHECK_LE(chars, static_cast<int>(sizeof(result))); CHECK_LE(chars, static_cast<int>(sizeof(result)));
for (int i = 0; i < chars; i++) { if (!return_first && chars > 1) {
if (result[i] <= unibrow::Latin1::kMaxChar) { return 0;
return true;
}
}
} }
chars = unibrow::ToUppercase::Convert(c, 0, result, NULL); return result[0];
if (chars > 0) {
CHECK_LE(chars, static_cast<int>(sizeof(result)));
for (int i = 0; i < chars; i++) {
if (result[i] <= unibrow::Latin1::kMaxChar) {
return true;
}
}
}
return false;
} }
TEST(Latin1) { static void CheckCanonicalEquivalence(uint16_t c, uint16_t test) {
#ifndef ENABLE_LATIN_1 uint16_t expect = ConvertLatin1<unibrow::Ecma262UnCanonicalize, true>(c);
if (true) return; if (expect > unibrow::Latin1::kMaxChar) expect = 0;
#endif CHECK_EQ(expect, test);
for (uint16_t c = unibrow::Latin1::kMaxChar + 1; c != 0; c++) { }
CHECK_EQ(CanBeConvertedToLatin1(c),
unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(c));
TEST(Latin1IgnoreCase) {
if (true) return;
using namespace unibrow;
for (uint16_t c = Latin1::kMaxChar + 1; c != 0; c++) {
uint16_t lower = ConvertLatin1<ToLowercase, false>(c);
uint16_t upper = ConvertLatin1<ToUppercase, false>(c);
uint16_t test = Latin1::ConvertNonLatin1ToLatin1(c);
// Filter out all characters whose upper is not their lower or vice versa.
if (lower == 0 && upper == 0) {
CheckCanonicalEquivalence(c, test);
continue;
}
if (lower > Latin1::kMaxChar && upper > Latin1::kMaxChar) {
CheckCanonicalEquivalence(c, test);
continue;
}
if (lower == 0 && upper != 0) {
lower = ConvertLatin1<ToLowercase, false>(upper);
}
if (upper == 0 && lower != c) {
upper = ConvertLatin1<ToUppercase, false>(lower);
}
if (lower > Latin1::kMaxChar && upper > Latin1::kMaxChar) {
CheckCanonicalEquivalence(c, test);
continue;
}
if (upper != c && lower != c) {
CheckCanonicalEquivalence(c, test);
continue;
}
CHECK_EQ(Min(upper, lower), test);
} }
} }
#endif // ENABLE_LATIN_1

View File

@ -57,3 +57,22 @@ for (var i = 0; i < 0xff; i++) {
// Should have hit the branch for the following char codes: // Should have hit the branch for the following char codes:
// [A-Z], [192-222] but not 215 // [A-Z], [192-222] but not 215
assertEquals((90-65+1)+(222-192-1+1), total_lo); assertEquals((90-65+1)+(222-192-1+1), total_lo);
// Latin-1 whitespace character: a leading U+00A0 must be skipped when the
// string is converted to a number, so "\u00a0" + "1" yields 1.
assertEquals( 1, +(String.fromCharCode(0xA0) + '1') );
// Latin-1 \W characters: U+00A3 is above the ASCII range and must be
// classified as a non-word character, so "+\u00a3" matches /\W\W/.
assertEquals(["+\u00a3", "=="], "+\u00a3==".match(/\W\W/g));
// Latin-1 character that uppercases out of Latin-1: U+00FF and U+0178 must
// match each other under the ignore-case flag.
assertTrue(/\u0178/i.test('\u00ff'));
// Unicode equivalence: U+039C, U+03BC and the Latin-1 character U+00B5 must
// all match one another when case is ignored.
assertTrue(/\u039c/i.test('\u00b5'));
assertTrue(/\u039c/i.test('\u03bc'));
assertTrue(/\u00b5/i.test('\u03bc'));
// Unicode equivalence ranges: the same equivalence must hold when U+039C is
// only reachable through a character class range, for both plain and negated
// classes; without the ignore-case flag U+00B5 must stay outside the range.
assertTrue(/[\u039b-\u039d]/i.test('\u00b5'));
assertFalse(/[^\u039b-\u039d]/i.test('\u00b5'));
assertFalse(/[\u039b-\u039d]/.test('\u00b5'));
assertTrue(/[^\u039b-\u039d]/.test('\u00b5'));