Fix some latin-1 webkit units tests
R=yangguo@chromium.org BUG= Review URL: https://chromiumcodereview.appspot.com/11962035 Patch from Dan Carney <dcarney@google.com>. git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13455 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
parent
7f331f6280
commit
0c822b21cb
@ -2855,6 +2855,29 @@ RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// We need to check for the following characters: 0x39c 0x3bc 0x178.
|
||||||
|
// Reports whether |range| contains at least one of the code points 0x39c,
// 0x3bc or 0x178. These are the only inputs that
// unibrow::Latin1::ConvertNonLatin1ToLatin1 maps to a Latin-1 character
// (0xb5, 0xb5 and 0xff respectively).
// When ENABLE_LATIN_1 is not defined this always returns false.
static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
|
||||||
|
#ifdef ENABLE_LATIN_1
|
||||||
|
// TODO(dcarney): this could be a lot more efficient.
|
||||||
|
return range.Contains(0x39c) ||
|
||||||
|
range.Contains(0x3bc) || range.Contains(0x178);
|
||||||
|
#else
|
||||||
|
return false;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef ENABLE_LATIN_1
|
||||||
|
// Returns true as soon as any range in |ranges| satisfies
// RangeContainsLatin1Equivalents; returns false if none does (including
// when the list is empty). The list is scanned linearly from index 0.
static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
|
||||||
|
for (int i = 0; i < ranges->length(); i++) {
|
||||||
|
// TODO(dcarney): this could be a lot more efficient.
|
||||||
|
if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
|
RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
|
||||||
if (info()->replacement_calculated) return replacement();
|
if (info()->replacement_calculated) return replacement();
|
||||||
if (depth < 0) return this;
|
if (depth < 0) return this;
|
||||||
@ -2871,21 +2894,21 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
|
|||||||
return set_replacement(NULL);
|
return set_replacement(NULL);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
if (quarks[j] <= String::kMaxOneByteCharCode) continue;
|
uint16_t c = quarks[j];
|
||||||
|
if (c <= String::kMaxOneByteCharCode) continue;
|
||||||
if (!ignore_case) return set_replacement(NULL);
|
if (!ignore_case) return set_replacement(NULL);
|
||||||
// Here, we need to check for characters whose upper and lower cases
|
// Here, we need to check for characters whose upper and lower cases
|
||||||
// are outside the Latin-1 range.
|
// are outside the Latin-1 range.
|
||||||
if (!unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(quarks[j])) {
|
uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
|
||||||
return set_replacement(NULL);
|
// Character is outside Latin-1 completely
|
||||||
}
|
if (converted == 0) return set_replacement(NULL);
|
||||||
|
// Convert quark to Latin-1 in place.
|
||||||
|
uint16_t* copy = const_cast<uint16_t*>(quarks.start());
|
||||||
|
copy[j] = converted;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
ASSERT(elm.type == TextElement::CHAR_CLASS);
|
ASSERT(elm.type == TextElement::CHAR_CLASS);
|
||||||
#ifdef ENABLE_LATIN_1
|
|
||||||
// TODO(dcarney): Can this be improved?
|
|
||||||
if (ignore_case) continue;
|
|
||||||
#endif
|
|
||||||
RegExpCharacterClass* cc = elm.data.u_char_class;
|
RegExpCharacterClass* cc = elm.data.u_char_class;
|
||||||
ZoneList<CharacterRange>* ranges = cc->ranges(zone());
|
ZoneList<CharacterRange>* ranges = cc->ranges(zone());
|
||||||
if (!CharacterRange::IsCanonical(ranges)) {
|
if (!CharacterRange::IsCanonical(ranges)) {
|
||||||
@ -2897,11 +2920,19 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
|
|||||||
if (range_count != 0 &&
|
if (range_count != 0 &&
|
||||||
ranges->at(0).from() == 0 &&
|
ranges->at(0).from() == 0 &&
|
||||||
ranges->at(0).to() >= String::kMaxOneByteCharCode) {
|
ranges->at(0).to() >= String::kMaxOneByteCharCode) {
|
||||||
|
#ifdef ENABLE_LATIN_1
|
||||||
|
// This will be handled in a later filter.
|
||||||
|
if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
|
||||||
|
#endif
|
||||||
return set_replacement(NULL);
|
return set_replacement(NULL);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (range_count == 0 ||
|
if (range_count == 0 ||
|
||||||
ranges->at(0).from() > String::kMaxOneByteCharCode) {
|
ranges->at(0).from() > String::kMaxOneByteCharCode) {
|
||||||
|
#ifdef ENABLE_LATIN_1
|
||||||
|
// This will be handled in a later filter.
|
||||||
|
if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
|
||||||
|
#endif
|
||||||
return set_replacement(NULL);
|
return set_replacement(NULL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -5354,7 +5385,7 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
|
|||||||
Isolate* isolate = Isolate::Current();
|
Isolate* isolate = Isolate::Current();
|
||||||
uc16 bottom = from();
|
uc16 bottom = from();
|
||||||
uc16 top = to();
|
uc16 top = to();
|
||||||
if (is_ascii) {
|
if (is_ascii && !RangeContainsLatin1Equivalents(*this)) {
|
||||||
if (bottom > String::kMaxOneByteCharCode) return;
|
if (bottom > String::kMaxOneByteCharCode) return;
|
||||||
if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
|
if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
|
||||||
}
|
}
|
||||||
|
@ -210,6 +210,26 @@ const byte NativeRegExpMacroAssembler::word_character_map[] = {
|
|||||||
0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'h' - 'o'
|
0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'h' - 'o'
|
||||||
0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'p' - 'w'
|
0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'p' - 'w'
|
||||||
0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
|
0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
|
||||||
|
// Latin-1 range
|
||||||
|
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||||
|
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||||
|
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||||
|
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||||
|
|
||||||
|
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||||
|
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||||
|
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||||
|
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||||
|
|
||||||
|
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||||
|
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||||
|
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||||
|
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||||
|
|
||||||
|
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||||
|
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||||
|
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||||
|
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -244,10 +244,10 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
|
|||||||
|
|
||||||
static const byte* StringCharacterPosition(String* subject, int start_index);
|
static const byte* StringCharacterPosition(String* subject, int start_index);
|
||||||
|
|
||||||
// Byte map of ASCII characters with a 0xff if the character is a word
|
// Byte map of one byte characters with a 0xff if the character is a word
|
||||||
// character (digit, letter or underscore) and 0x00 otherwise.
|
// character (digit, letter or underscore) and 0x00 otherwise.
|
||||||
// Used by generated RegExp code.
|
// Used by generated RegExp code.
|
||||||
static const byte word_character_map[128];
|
static const byte word_character_map[256];
|
||||||
|
|
||||||
static Address word_character_map_address() {
|
static Address word_character_map_address() {
|
||||||
return const_cast<Address>(&word_character_map[0]);
|
return const_cast<Address>(&word_character_map[0]);
|
||||||
|
@ -5051,8 +5051,8 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_StringToNumber) {
|
|||||||
// Fast check for a junk value. A valid string may start from a
|
// Fast check for a junk value. A valid string may start from a
|
||||||
// whitespace, a sign ('+' or '-'), the decimal point, a decimal digit or
|
// whitespace, a sign ('+' or '-'), the decimal point, a decimal digit or
|
||||||
// the 'I' character ('Infinity'). All of that have codes not greater than
|
// the 'I' character ('Infinity'). All of that have codes not greater than
|
||||||
// '9' except 'I'.
|
// '9' except 'I' and the no-break space (0xa0).
|
||||||
if (data[start_pos] != 'I') {
|
if (data[start_pos] != 'I' && data[start_pos] != 0xa0) {
|
||||||
return isolate->heap()->nan_value();
|
return isolate->heap()->nan_value();
|
||||||
}
|
}
|
||||||
} else if (len - start_pos < 10 && AreDigits(data, start_pos, len)) {
|
} else if (len - start_pos < 10 && AreDigits(data, start_pos, len)) {
|
||||||
|
@ -79,33 +79,19 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool Latin1::NonLatin1CanBeConvertedToLatin1(uint16_t c) {
|
uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
|
||||||
ASSERT(c > Latin1::kMaxChar);
|
ASSERT(c > Latin1::kMaxChar);
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case 0x130:
|
// These are equivalent characters in Unicode.
|
||||||
case 0x131:
|
case 0x39c:
|
||||||
case 0x149:
|
case 0x3bc:
|
||||||
|
return 0xb5;
|
||||||
|
// This is an uppercase of a Latin-1 character
|
||||||
|
// outside of Latin-1.
|
||||||
case 0x178:
|
case 0x178:
|
||||||
case 0x17f:
|
return 0xff;
|
||||||
case 0x1f0:
|
|
||||||
case 0x1e96:
|
|
||||||
case 0x1e97:
|
|
||||||
case 0x1e98:
|
|
||||||
case 0x1e99:
|
|
||||||
case 0x1e9a:
|
|
||||||
case 0x1e9e:
|
|
||||||
case 0x212a:
|
|
||||||
case 0x212b:
|
|
||||||
case 0xfb00:
|
|
||||||
case 0xfb01:
|
|
||||||
case 0xfb02:
|
|
||||||
case 0xfb03:
|
|
||||||
case 0xfb04:
|
|
||||||
case 0xfb05:
|
|
||||||
case 0xfb06:
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
return false;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -140,7 +140,10 @@ class Latin1 {
|
|||||||
#else
|
#else
|
||||||
static const unsigned kMaxChar = 0xff;
|
static const unsigned kMaxChar = 0xff;
|
||||||
#endif
|
#endif
|
||||||
static inline bool NonLatin1CanBeConvertedToLatin1(uint16_t);
|
// Returns 0 if character does not convert to single latin-1 character
|
||||||
|
// or if the character does not convert back to latin-1 via the inverse
|
||||||
|
// operation (upper to lower, etc).
|
||||||
|
static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
|
||||||
};
|
};
|
||||||
|
|
||||||
class Utf8 {
|
class Utf8 {
|
||||||
|
@ -1277,38 +1277,60 @@ TEST(IsAscii) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static bool CanBeConvertedToLatin1(uint16_t c) {
|
|
||||||
CHECK(c > unibrow::Latin1::kMaxChar);
|
#ifdef ENABLE_LATIN_1
|
||||||
uint32_t result[4];
|
template<typename Op, bool return_first>
|
||||||
|
static uint16_t ConvertLatin1(uint16_t c) {
|
||||||
|
uint32_t result[Op::kMaxWidth];
|
||||||
int chars;
|
int chars;
|
||||||
chars = unibrow::ToLowercase::Convert(c, 0, result, NULL);
|
chars = Op::Convert(c, 0, result, NULL);
|
||||||
if (chars > 0) {
|
if (chars == 0) return 0;
|
||||||
CHECK_LE(chars, static_cast<int>(sizeof(result)));
|
CHECK_LE(chars, static_cast<int>(sizeof(result)));
|
||||||
for (int i = 0; i < chars; i++) {
|
if (!return_first && chars > 1) {
|
||||||
if (result[i] <= unibrow::Latin1::kMaxChar) {
|
return 0;
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
chars = unibrow::ToUppercase::Convert(c, 0, result, NULL);
|
return result[0];
|
||||||
if (chars > 0) {
|
|
||||||
CHECK_LE(chars, static_cast<int>(sizeof(result)));
|
|
||||||
for (int i = 0; i < chars; i++) {
|
|
||||||
if (result[i] <= unibrow::Latin1::kMaxChar) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
TEST(Latin1) {
|
static void CheckCanonicalEquivalence(uint16_t c, uint16_t test) {
|
||||||
#ifndef ENABLE_LATIN_1
|
uint16_t expect = ConvertLatin1<unibrow::Ecma262UnCanonicalize, true>(c);
|
||||||
if (true) return;
|
if (expect > unibrow::Latin1::kMaxChar) expect = 0;
|
||||||
#endif
|
CHECK_EQ(expect, test);
|
||||||
for (uint16_t c = unibrow::Latin1::kMaxChar + 1; c != 0; c++) {
|
}
|
||||||
CHECK_EQ(CanBeConvertedToLatin1(c),
|
|
||||||
unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(c));
|
|
||||||
|
TEST(Latin1IgnoreCase) {
|
||||||
|
if (true) return;
|
||||||
|
using namespace unibrow;
|
||||||
|
for (uint16_t c = Latin1::kMaxChar + 1; c != 0; c++) {
|
||||||
|
uint16_t lower = ConvertLatin1<ToLowercase, false>(c);
|
||||||
|
uint16_t upper = ConvertLatin1<ToUppercase, false>(c);
|
||||||
|
uint16_t test = Latin1::ConvertNonLatin1ToLatin1(c);
|
||||||
|
// Filter out all characters whose upper is not their lower or vice versa.
|
||||||
|
if (lower == 0 && upper == 0) {
|
||||||
|
CheckCanonicalEquivalence(c, test);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (lower > Latin1::kMaxChar && upper > Latin1::kMaxChar) {
|
||||||
|
CheckCanonicalEquivalence(c, test);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (lower == 0 && upper != 0) {
|
||||||
|
lower = ConvertLatin1<ToLowercase, false>(upper);
|
||||||
|
}
|
||||||
|
if (upper == 0 && lower != c) {
|
||||||
|
upper = ConvertLatin1<ToUppercase, false>(lower);
|
||||||
|
}
|
||||||
|
if (lower > Latin1::kMaxChar && upper > Latin1::kMaxChar) {
|
||||||
|
CheckCanonicalEquivalence(c, test);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (upper != c && lower != c) {
|
||||||
|
CheckCanonicalEquivalence(c, test);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
CHECK_EQ(Min(upper, lower), test);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif // ENABLE_LATIN_1
|
||||||
|
@ -57,3 +57,22 @@ for (var i = 0; i < 0xff; i++) {
|
|||||||
// Should have hit the branch for the following char codes:
|
// Should have hit the branch for the following char codes:
|
||||||
// [A-Z], [192-222] but not 215
|
// [A-Z], [192-222] but not 215
|
||||||
assertEquals((90-65+1)+(222-192-1+1), total_lo);
|
assertEquals((90-65+1)+(222-192-1+1), total_lo);
|
||||||
|
|
||||||
|
// Latin-1 whitespace character
|
||||||
|
assertEquals( 1, +(String.fromCharCode(0xA0) + '1') );
|
||||||
|
|
||||||
|
// Latin-1 \W characters
|
||||||
|
assertEquals(["+\u00a3", "=="], "+\u00a3==".match(/\W\W/g));
|
||||||
|
|
||||||
|
// Latin-1 character that uppercases out of Latin-1.
|
||||||
|
assertTrue(/\u0178/i.test('\u00ff'));
|
||||||
|
|
||||||
|
// Unicode equivalence
|
||||||
|
assertTrue(/\u039c/i.test('\u00b5'));
|
||||||
|
assertTrue(/\u039c/i.test('\u03bc'));
|
||||||
|
assertTrue(/\u00b5/i.test('\u03bc'));
|
||||||
|
// Unicode equivalence ranges
|
||||||
|
assertTrue(/[\u039b-\u039d]/i.test('\u00b5'));
|
||||||
|
assertFalse(/[^\u039b-\u039d]/i.test('\u00b5'));
|
||||||
|
assertFalse(/[\u039b-\u039d]/.test('\u00b5'));
|
||||||
|
assertTrue(/[^\u039b-\u039d]/.test('\u00b5'));
|
||||||
|
Loading…
Reference in New Issue
Block a user