Cleanup latin-1 conversion check in regexp engine
R=yangguo@chromium.org BUG= Review URL: https://chromiumcodereview.appspot.com/11880045 Patch from Dan Carney <dcarney@google.com>. git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13400 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
parent
451ed2f2cf
commit
a8d59243b9
@ -2875,23 +2875,9 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
|
||||
if (!ignore_case) return set_replacement(NULL);
|
||||
// Here, we need to check for characters whose upper and lower cases
|
||||
// are outside the Latin-1 range.
|
||||
// TODO(dcarney): Replace this code with a simple
|
||||
// table lookup in unibrow::Latin-1.
|
||||
// TODO(dcarney): Test cases!.
|
||||
unibrow::uchar result;
|
||||
int chars;
|
||||
chars = unibrow::ToLowercase::Convert(quarks[j], 0, &result, NULL);
|
||||
if (chars > 1 ||
|
||||
(chars == 1 && result <= String::kMaxOneByteCharCodeU)) {
|
||||
continue;
|
||||
if (!unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(quarks[j])) {
|
||||
return set_replacement(NULL);
|
||||
}
|
||||
chars = unibrow::ToUppercase::Convert(quarks[j], 0, &result, NULL);
|
||||
if (chars > 1 ||
|
||||
(chars == 1 && result <= String::kMaxOneByteCharCodeU)) {
|
||||
continue;
|
||||
}
|
||||
// This character is definitely not in the Latin-1 range.
|
||||
return set_replacement(NULL);
|
||||
#endif
|
||||
}
|
||||
} else {
|
||||
|
@ -79,6 +79,36 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
|
||||
}
|
||||
|
||||
|
||||
// Returns true when |c|, a 16-bit code unit that must lie above
// Latin1::kMaxChar, is one of a fixed set of characters that are treated as
// convertible to Latin-1. The accompanying cctest compares this set against
// the characters whose lowercase or uppercase conversion yields at least one
// code unit within the Latin-1 range.
bool Latin1::NonLatin1CanBeConvertedToLatin1(uint16_t c) {
  ASSERT(c > Latin1::kMaxChar);

  // Contiguous runs of accepted code units.
  if (0x1e96 <= c && c <= 0x1e9a) return true;
  if (0xfb00 <= c && c <= 0xfb06) return true;

  // Adjacent pairs.
  if (c == 0x130 || c == 0x131) return true;
  if (c == 0x212a || c == 0x212b) return true;

  // Remaining isolated code units.
  return c == 0x149 ||
         c == 0x178 ||
         c == 0x17f ||
         c == 0x1f0 ||
         c == 0x1e9e;
}
|
||||
|
||||
|
||||
unsigned Utf8::Encode(char* str, uchar c, int previous) {
|
||||
static const int kMask = ~(1 << 6);
|
||||
if (c <= kMaxOneByteChar) {
|
||||
|
@ -140,6 +140,7 @@ class Latin1 {
|
||||
#else
|
||||
static const unsigned kMaxChar = 0xff;
|
||||
#endif
|
||||
static inline bool NonLatin1CanBeConvertedToLatin1(uint16_t);
|
||||
};
|
||||
|
||||
class Utf8 {
|
||||
|
@ -1275,3 +1275,40 @@ TEST(IsAscii) {
|
||||
CHECK(String::IsAscii(static_cast<char*>(NULL), 0));
|
||||
CHECK(String::IsOneByte(static_cast<uc16*>(NULL), 0));
|
||||
}
|
||||
|
||||
|
||||
static bool CanBeConvertedToLatin1(uint16_t c) {
|
||||
CHECK(c > unibrow::Latin1::kMaxChar);
|
||||
uint32_t result[4];
|
||||
int chars;
|
||||
chars = unibrow::ToLowercase::Convert(c, 0, result, NULL);
|
||||
if (chars > 0) {
|
||||
CHECK_LE(chars, static_cast<int>(sizeof(result)));
|
||||
for (int i = 0; i < chars; i++) {
|
||||
if (result[i] <= unibrow::Latin1::kMaxChar) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
chars = unibrow::ToUppercase::Convert(c, 0, result, NULL);
|
||||
if (chars > 0) {
|
||||
CHECK_LE(chars, static_cast<int>(sizeof(result)));
|
||||
for (int i = 0; i < chars; i++) {
|
||||
if (result[i] <= unibrow::Latin1::kMaxChar) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
// Exhaustive consistency check: for every 16-bit code unit above
// unibrow::Latin1::kMaxChar, the table-based
// unibrow::Latin1::NonLatin1CanBeConvertedToLatin1 must agree with the
// case-conversion based reference CanBeConvertedToLatin1.
TEST(Latin1) {
#ifndef ENABLE_LATIN_1
  // Nothing to verify when Latin-1 support is compiled out.
  if (true) return;
#endif
  uint16_t c = unibrow::Latin1::kMaxChar + 1;
  // The 16-bit counter wraps to 0 after 0xffff, which terminates the loop.
  while (c != 0) {
    bool expected = CanBeConvertedToLatin1(c);
    bool actual = unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(c);
    CHECK_EQ(expected, actual);
    c++;
  }
}
|
||||
|
Loading…
Reference in New Issue
Block a user