Clean up Latin-1 conversion check in regexp engine

R=yangguo@chromium.org
BUG=

Review URL: https://chromiumcodereview.appspot.com/11880045
Patch from Dan Carney <dcarney@google.com>.

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@13400 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
yangguo@chromium.org 2013-01-16 13:04:07 +00:00
parent 451ed2f2cf
commit a8d59243b9
4 changed files with 70 additions and 16 deletions

View File

@ -2875,23 +2875,9 @@ RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
if (!ignore_case) return set_replacement(NULL);
// Here, we need to check for characters whose upper and lower cases
// are outside the Latin-1 range.
// TODO(dcarney): Replace this code with a simple
// table lookup in unibrow::Latin-1.
// TODO(dcarney): Test cases!.
unibrow::uchar result;
int chars;
chars = unibrow::ToLowercase::Convert(quarks[j], 0, &result, NULL);
if (chars > 1 ||
(chars == 1 && result <= String::kMaxOneByteCharCodeU)) {
continue;
if (!unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(quarks[j])) {
return set_replacement(NULL);
}
chars = unibrow::ToUppercase::Convert(quarks[j], 0, &result, NULL);
if (chars > 1 ||
(chars == 1 && result <= String::kMaxOneByteCharCodeU)) {
continue;
}
// This character is definitely not in the Latin-1 range.
return set_replacement(NULL);
#endif
}
} else {

View File

@ -79,6 +79,36 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
}
// Returns true when |c| is one of a fixed set of code points above the
// Latin-1 range, and false for every other value.
//
// |c| must be greater than Latin1::kMaxChar (asserted in debug builds).
//
// NOTE(review): judging by the function name, the listed code points are
// presumably the non-Latin-1 characters whose case mapping reaches into
// Latin-1; confirm against the Unicode case tables before editing the list.
bool Latin1::NonLatin1CanBeConvertedToLatin1(uint16_t c) {
  ASSERT(c > Latin1::kMaxChar);
  // Inclusive [first, last] ranges of accepted code points. Isolated code
  // points are written as single-element ranges.
  static const uint16_t kAccepted[][2] = {
    { 0x130, 0x131 },
    { 0x149, 0x149 },
    { 0x178, 0x178 },
    { 0x17f, 0x17f },
    { 0x1f0, 0x1f0 },
    { 0x1e96, 0x1e9a },
    { 0x1e9e, 0x1e9e },
    { 0x212a, 0x212b },
    { 0xfb00, 0xfb06 }
  };
  const int range_count =
      static_cast<int>(sizeof(kAccepted) / sizeof(kAccepted[0]));
  for (int i = 0; i < range_count; i++) {
    const uint16_t first = kAccepted[i][0];
    const uint16_t last = kAccepted[i][1];
    if (first <= c && c <= last) return true;
  }
  return false;
}
unsigned Utf8::Encode(char* str, uchar c, int previous) {
static const int kMask = ~(1 << 6);
if (c <= kMaxOneByteChar) {

View File

@ -140,6 +140,7 @@ class Latin1 {
#else
static const unsigned kMaxChar = 0xff;
#endif
static inline bool NonLatin1CanBeConvertedToLatin1(uint16_t);
};
class Utf8 {

View File

@ -1275,3 +1275,40 @@ TEST(IsAscii) {
CHECK(String::IsAscii(static_cast<char*>(NULL), 0));
CHECK(String::IsOneByte(static_cast<uc16*>(NULL), 0));
}
// Returns true if any of the first |length| entries of |chars| is within the
// Latin-1 range (<= unibrow::Latin1::kMaxChar).
static bool ContainsLatin1Char(const uint32_t* chars, int length) {
  for (int i = 0; i < length; i++) {
    if (chars[i] <= unibrow::Latin1::kMaxChar) return true;
  }
  return false;
}


// Reference implementation used by the test: decides, via the unibrow case
// converters, whether the non-Latin-1 character |c| has a lower-case or
// upper-case conversion that contains at least one Latin-1 character.
//
// |c| must be greater than unibrow::Latin1::kMaxChar.
static bool CanBeConvertedToLatin1(uint16_t c) {
  CHECK(c > unibrow::Latin1::kMaxChar);
  // Capacity of the conversion buffer, counted in characters.
  const int kResultCapacity = 4;
  uint32_t result[kResultCapacity];

  // The converter's return value is treated as the number of entries it
  // wrote into |result|.
  int chars = unibrow::ToLowercase::Convert(c, 0, result, NULL);
  if (chars > 0) {
    // Compare against the element count; sizeof(result) is a byte count and
    // would let an overrun of the 4-entry buffer go unnoticed.
    CHECK_LE(chars, kResultCapacity);
    if (ContainsLatin1Char(result, chars)) return true;
  }

  chars = unibrow::ToUppercase::Convert(c, 0, result, NULL);
  if (chars > 0) {
    CHECK_LE(chars, kResultCapacity);
    if (ContainsLatin1Char(result, chars)) return true;
  }
  return false;
}
// Exhaustively compares the hard-coded table in
// unibrow::Latin1::NonLatin1CanBeConvertedToLatin1 with the answer computed
// by CanBeConvertedToLatin1 for every 16-bit code unit above
// unibrow::Latin1::kMaxChar. The body is skipped entirely unless
// ENABLE_LATIN_1 is defined.
TEST(Latin1) {
#ifndef ENABLE_LATIN_1
  if (true) return;
#endif
  // Walk the range with a wider counter so the loop ends on an explicit
  // upper bound instead of on 16-bit wrap-around.
  const unsigned kFirst = unibrow::Latin1::kMaxChar + 1;
  const unsigned kLast = 0xffff;
  for (unsigned code = kFirst; code <= kLast; code++) {
    const uint16_t c = static_cast<uint16_t>(code);
    CHECK_EQ(CanBeConvertedToLatin1(c),
             unibrow::Latin1::NonLatin1CanBeConvertedToLatin1(c));
  }
}