Fix some Latin-1 WebKit unit tests
R=yangguo@chromium.org BUG= Review URL: https://chromiumcodereview.appspot.com/11962035 Patch from Dan Carney <dcarney@google.com>. git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13455 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
parent
7f331f6280
commit
0c822b21cb
@ -2855,6 +2855,29 @@ RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
|
||||
}
|
||||
|
||||
|
||||
// Returns true if range contains at least one of the characters 0x39c, 0x3bc
// or 0x178. These are the non-Latin-1 characters that
// Latin1::ConvertNonLatin1ToLatin1 maps to a Latin-1 character (0xb5 for
// 0x39c and 0x3bc, 0xff for 0x178).
// Always returns false when ENABLE_LATIN_1 is not defined.
|
||||
static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
|
||||
#ifdef ENABLE_LATIN_1
|
||||
// TODO(dcarney): this could be a lot more efficient.
|
||||
return range.Contains(0x39c) ||
|
||||
range.Contains(0x3bc) || range.Contains(0x178);
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
#ifdef ENABLE_LATIN_1
|
||||
// Returns true if any range in the list satisfies
// RangeContainsLatin1Equivalents, i.e. contains a character outside Latin-1
// that has a Latin-1 equivalent. Returns false for an empty list.
// Only compiled when ENABLE_LATIN_1 is defined.
static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
|
||||
for (int i = 0; i < ranges->length(); i++) {
|
||||
// TODO(dcarney): this could be a lot more efficient.
|
||||
// Stop at the first range that contains one of the special characters.
if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
|
||||
if (info()->replacement_calculated) return replacement();
|
||||
if (depth < 0) return this;
|
||||
@ -2871,21 +2894,21 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
|
||||
return set_replacement(NULL);
|
||||
}
|
||||
#else
|
||||
if (quarks[j] <= String::kMaxOneByteCharCode) continue;
|
||||
uint16_t c = quarks[j];
|
||||
if (c <= String::kMaxOneByteCharCode) continue;
|
||||
if (!ignore_case) return set_replacement(NULL);
|
||||
// Here, we need to check for characters whose upper and lower cases
|
||||
// are outside the Latin-1 range.
|
||||
if (!unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(quarks[j])) {
|
||||
return set_replacement(NULL);
|
||||
}
|
||||
uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
|
||||
// Character is outside Latin-1 completely
|
||||
if (converted == 0) return set_replacement(NULL);
|
||||
// Convert quark to Latin-1 in place.
|
||||
uint16_t* copy = const_cast<uint16_t*>(quarks.start());
|
||||
copy[j] = converted;
|
||||
#endif
|
||||
}
|
||||
} else {
|
||||
ASSERT(elm.type == TextElement::CHAR_CLASS);
|
||||
#ifdef ENABLE_LATIN_1
|
||||
// TODO(dcarney): Can this be improved?
|
||||
if (ignore_case) continue;
|
||||
#endif
|
||||
RegExpCharacterClass* cc = elm.data.u_char_class;
|
||||
ZoneList<CharacterRange>* ranges = cc->ranges(zone());
|
||||
if (!CharacterRange::IsCanonical(ranges)) {
|
||||
@ -2897,11 +2920,19 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
|
||||
if (range_count != 0 &&
|
||||
ranges->at(0).from() == 0 &&
|
||||
ranges->at(0).to() >= String::kMaxOneByteCharCode) {
|
||||
#ifdef ENABLE_LATIN_1
|
||||
// This will be handled in a later filter.
|
||||
if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
|
||||
#endif
|
||||
return set_replacement(NULL);
|
||||
}
|
||||
} else {
|
||||
if (range_count == 0 ||
|
||||
ranges->at(0).from() > String::kMaxOneByteCharCode) {
|
||||
#ifdef ENABLE_LATIN_1
|
||||
// This will be handled in a later filter.
|
||||
if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
|
||||
#endif
|
||||
return set_replacement(NULL);
|
||||
}
|
||||
}
|
||||
@ -5354,7 +5385,7 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
|
||||
Isolate* isolate = Isolate::Current();
|
||||
uc16 bottom = from();
|
||||
uc16 top = to();
|
||||
if (is_ascii) {
|
||||
if (is_ascii && !RangeContainsLatin1Equivalents(*this)) {
|
||||
if (bottom > String::kMaxOneByteCharCode) return;
|
||||
if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
|
||||
}
|
||||
|
@ -210,6 +210,26 @@ const byte NativeRegExpMacroAssembler::word_character_map[] = {
|
||||
0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'h' - 'o'
|
||||
0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'p' - 'w'
|
||||
0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
|
||||
// Latin-1 range
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
|
||||
};
|
||||
|
||||
|
||||
|
@ -244,10 +244,10 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
|
||||
|
||||
static const byte* StringCharacterPosition(String* subject, int start_index);
|
||||
|
||||
// Byte map of ASCII characters with a 0xff if the character is a word
|
||||
// Byte map of one byte characters with a 0xff if the character is a word
|
||||
// character (digit, letter or underscore) and 0x00 otherwise.
|
||||
// Used by generated RegExp code.
|
||||
static const byte word_character_map[128];
|
||||
static const byte word_character_map[256];
|
||||
|
||||
static Address word_character_map_address() {
|
||||
return const_cast<Address>(&word_character_map[0]);
|
||||
|
@ -5051,8 +5051,8 @@ RUNTIME_FUNCTION(MaybeObject*, Runtime_StringToNumber) {
|
||||
// Fast check for a junk value. A valid string may start from a
|
||||
// whitespace, a sign ('+' or '-'), the decimal point, a decimal digit or
|
||||
// the 'I' character ('Infinity'). All of that have codes not greater than
|
||||
// '9' except 'I'.
|
||||
if (data[start_pos] != 'I') {
|
||||
// '9' except 'I' and the no-break space (0xA0).
|
||||
if (data[start_pos] != 'I' && data[start_pos] != 0xa0) {
|
||||
return isolate->heap()->nan_value();
|
||||
}
|
||||
} else if (len - start_pos < 10 && AreDigits(data, start_pos, len)) {
|
||||
|
@ -79,33 +79,19 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
|
||||
}
|
||||
|
||||
|
||||
bool Latin1::NonLatin1CanBeConvertedToLatin1(uint16_t c) {
|
||||
uint16_t Latin1::ConvertNonLatin1ToLatin1(uint16_t c) {
|
||||
ASSERT(c > Latin1::kMaxChar);
|
||||
switch (c) {
|
||||
case 0x130:
|
||||
case 0x131:
|
||||
case 0x149:
|
||||
// These are equivalent characters in Unicode.
|
||||
case 0x39c:
|
||||
case 0x3bc:
|
||||
return 0xb5;
|
||||
// This is an uppercase of a Latin-1 character
|
||||
// outside of Latin-1.
|
||||
case 0x178:
|
||||
case 0x17f:
|
||||
case 0x1f0:
|
||||
case 0x1e96:
|
||||
case 0x1e97:
|
||||
case 0x1e98:
|
||||
case 0x1e99:
|
||||
case 0x1e9a:
|
||||
case 0x1e9e:
|
||||
case 0x212a:
|
||||
case 0x212b:
|
||||
case 0xfb00:
|
||||
case 0xfb01:
|
||||
case 0xfb02:
|
||||
case 0xfb03:
|
||||
case 0xfb04:
|
||||
case 0xfb05:
|
||||
case 0xfb06:
|
||||
return true;
|
||||
return 0xff;
|
||||
}
|
||||
return false;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
@ -140,7 +140,10 @@ class Latin1 {
|
||||
#else
|
||||
static const unsigned kMaxChar = 0xff;
|
||||
#endif
|
||||
static inline bool NonLatin1CanBeConvertedToLatin1(uint16_t);
|
||||
// Returns 0 if character does not convert to single latin-1 character
|
||||
// or if the character does not convert back to latin-1 via inverse
|
||||
// operation (upper to lower, etc).
|
||||
static inline uint16_t ConvertNonLatin1ToLatin1(uint16_t);
|
||||
};
|
||||
|
||||
class Utf8 {
|
||||
|
@ -1277,38 +1277,60 @@ TEST(IsAscii) {
|
||||
}
|
||||
|
||||
|
||||
static bool CanBeConvertedToLatin1(uint16_t c) {
|
||||
CHECK(c > unibrow::Latin1::kMaxChar);
|
||||
uint32_t result[4];
|
||||
|
||||
#ifdef ENABLE_LATIN_1
|
||||
template<typename Op, bool return_first>
|
||||
static uint16_t ConvertLatin1(uint16_t c) {
|
||||
uint32_t result[Op::kMaxWidth];
|
||||
int chars;
|
||||
chars = unibrow::ToLowercase::Convert(c, 0, result, NULL);
|
||||
if (chars > 0) {
|
||||
chars = Op::Convert(c, 0, result, NULL);
|
||||
if (chars == 0) return 0;
|
||||
CHECK_LE(chars, static_cast<int>(sizeof(result)));
|
||||
for (int i = 0; i < chars; i++) {
|
||||
if (result[i] <= unibrow::Latin1::kMaxChar) {
|
||||
return true;
|
||||
if (!return_first && chars > 1) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
chars = unibrow::ToUppercase::Convert(c, 0, result, NULL);
|
||||
if (chars > 0) {
|
||||
CHECK_LE(chars, static_cast<int>(sizeof(result)));
|
||||
for (int i = 0; i < chars; i++) {
|
||||
if (result[i] <= unibrow::Latin1::kMaxChar) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
return result[0];
|
||||
}
|
||||
|
||||
|
||||
TEST(Latin1) {
|
||||
#ifndef ENABLE_LATIN_1
|
||||
// Test helper: checks that test equals the value expected for c.
// The expectation is computed by ConvertLatin1 using
// unibrow::Ecma262UnCanonicalize; an expectation above Latin1::kMaxChar is
// replaced by 0 before the comparison.
// NOTE(review): 0 presumably stands for "no Latin-1 equivalent", matching
// the documented return value of Latin1::ConvertNonLatin1ToLatin1 - confirm.
static void CheckCanonicalEquivalence(uint16_t c, uint16_t test) {
|
||||
uint16_t expect = ConvertLatin1<unibrow::Ecma262UnCanonicalize, true>(c);
|
||||
// A result that is itself outside Latin-1 is clamped to 0.
if (expect > unibrow::Latin1::kMaxChar) expect = 0;
|
||||
CHECK_EQ(expect, test);
|
||||
}
|
||||
|
||||
|
||||
TEST(Latin1IgnoreCase) {
|
||||
if (true) return;
|
||||
#endif
|
||||
for (uint16_t c = unibrow::Latin1::kMaxChar + 1; c != 0; c++) {
|
||||
CHECK_EQ(CanBeConvertedToLatin1(c),
|
||||
unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(c));
|
||||
using namespace unibrow;
|
||||
for (uint16_t c = Latin1::kMaxChar + 1; c != 0; c++) {
|
||||
uint16_t lower = ConvertLatin1<ToLowercase, false>(c);
|
||||
uint16_t upper = ConvertLatin1<ToUppercase, false>(c);
|
||||
uint16_t test = Latin1::ConvertNonLatin1ToLatin1(c);
|
||||
// Filter out all characters whose upper is not their lower or vice versa.
|
||||
if (lower == 0 && upper == 0) {
|
||||
CheckCanonicalEquivalence(c, test);
|
||||
continue;
|
||||
}
|
||||
if (lower > Latin1::kMaxChar && upper > Latin1::kMaxChar) {
|
||||
CheckCanonicalEquivalence(c, test);
|
||||
continue;
|
||||
}
|
||||
if (lower == 0 && upper != 0) {
|
||||
lower = ConvertLatin1<ToLowercase, false>(upper);
|
||||
}
|
||||
if (upper == 0 && lower != c) {
|
||||
upper = ConvertLatin1<ToUppercase, false>(lower);
|
||||
}
|
||||
if (lower > Latin1::kMaxChar && upper > Latin1::kMaxChar) {
|
||||
CheckCanonicalEquivalence(c, test);
|
||||
continue;
|
||||
}
|
||||
if (upper != c && lower != c) {
|
||||
CheckCanonicalEquivalence(c, test);
|
||||
continue;
|
||||
}
|
||||
CHECK_EQ(Min(upper, lower), test);
|
||||
}
|
||||
}
|
||||
#endif // ENABLE_LATIN_1
|
||||
|
@ -57,3 +57,22 @@ for (var i = 0; i < 0xff; i++) {
|
||||
// Should have hit the branch for the following char codes:
|
||||
// [A-Z], [192-222] but not 215
|
||||
assertEquals((90-65+1)+(222-192-1+1), total_lo);
|
||||
|
||||
// Latin-1 whitespace character
|
||||
assertEquals( 1, +(String.fromCharCode(0xA0) + '1') );
|
||||
|
||||
// Latin-1 \W characters
|
||||
assertEquals(["+\u00a3", "=="], "+\u00a3==".match(/\W\W/g));
|
||||
|
||||
// Latin-1 character that uppercases out of Latin-1.
|
||||
assertTrue(/\u0178/i.test('\u00ff'));
|
||||
|
||||
// Unicode equivalence
|
||||
assertTrue(/\u039c/i.test('\u00b5'));
|
||||
assertTrue(/\u039c/i.test('\u03bc'));
|
||||
assertTrue(/\u00b5/i.test('\u03bc'));
|
||||
// Unicode equivalence ranges
|
||||
assertTrue(/[\u039b-\u039d]/i.test('\u00b5'));
|
||||
assertFalse(/[^\u039b-\u039d]/i.test('\u00b5'));
|
||||
assertFalse(/[\u039b-\u039d]/.test('\u00b5'));
|
||||
assertTrue(/[^\u039b-\u039d]/.test('\u00b5'));
|
||||
|
Loading…
Reference in New Issue
Block a user