Fix bug 486: Cyrillic character ranges in case-independent regexps.
http://code.google.com/p/v8/issues/detail?id=486 Review URL: http://codereview.chromium.org/361033 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@3236 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
This commit is contained in:
parent
8f53c139d3
commit
57c919e414
@ -2440,8 +2440,8 @@ void TextNode::MakeCaseIndependent() {
|
||||
RegExpCharacterClass* cc = elm.data.u_char_class;
|
||||
ZoneList<CharacterRange>* ranges = cc->ranges();
|
||||
int range_count = ranges->length();
|
||||
for (int i = 0; i < range_count; i++) {
|
||||
ranges->at(i).AddCaseEquivalents(ranges);
|
||||
for (int j = 0; j < range_count; j++) {
|
||||
ranges->at(j).AddCaseEquivalents(ranges);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -3961,7 +3961,7 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
|
||||
} else {
|
||||
start = pos;
|
||||
}
|
||||
// Then we add the ranges on at a time, incrementing the current
|
||||
// Then we add the ranges one at a time, incrementing the current
|
||||
// position to be after the last block each time. The position
|
||||
// always points to the start of a block.
|
||||
while (pos < to()) {
|
||||
@ -3987,8 +3987,45 @@ void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
|
||||
}
|
||||
start = pos = block_end + 1;
|
||||
}
|
||||
} else {
|
||||
// TODO(plesner) when we've fixed the 2^11 bug in unibrow.
|
||||
} else if (from() > 0 || to() < String::kMaxUC16CharCode) {
|
||||
// Unibrow ranges don't work for high characters due to the "2^11 bug".
|
||||
// Therefore we do something dumber for these ranges. We don't bother
|
||||
// if the range is 0-max (as encountered at the start of an unanchored
|
||||
// regexp).
|
||||
ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100);
|
||||
int bottom = from();
|
||||
int top = to();
|
||||
for (int i = bottom; i <= top; i++) {
|
||||
int length = uncanonicalize.get(i, '\0', chars);
|
||||
for (int j = 0; j < length; j++) {
|
||||
uc32 chr = chars[j];
|
||||
if (chr != i && chr < bottom || chr > top) {
|
||||
characters->Add(chr);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (characters->length() > 0) {
|
||||
int new_from = characters->at(0);
|
||||
int new_to = new_from;
|
||||
for (int i = 1; i < characters->length(); i++) {
|
||||
int chr = characters->at(i);
|
||||
if (chr == new_to + 1) {
|
||||
new_to++;
|
||||
} else {
|
||||
if (new_to == new_from) {
|
||||
ranges->Add(CharacterRange::Singleton(new_from));
|
||||
} else {
|
||||
ranges->Add(CharacterRange(new_from, new_to));
|
||||
}
|
||||
new_from = new_to = chr;
|
||||
}
|
||||
}
|
||||
if (new_to == new_from) {
|
||||
ranges->Add(CharacterRange::Singleton(new_from));
|
||||
} else {
|
||||
ranges->Add(CharacterRange(new_from, new_to));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
169
test/mjsunit/cyrillic.js
Normal file
169
test/mjsunit/cyrillic.js
Normal file
@ -0,0 +1,169 @@
|
||||
// Copyright 2009 the V8 project authors. All rights reserved.
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above
|
||||
// copyright notice, this list of conditions and the following
|
||||
// disclaimer in the documentation and/or other materials provided
|
||||
// with the distribution.
|
||||
// * Neither the name of Google Inc. nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
// Test Unicode character ranges in regexps.
|
||||
|
||||
|
||||
// Cyrillic sample characters.  Upper-case keys hold capital letters and the
// matching lower-case keys hold the corresponding small letters.
var cyrillic = {
  FIRST: "\u0410",   // A
  first: "\u0430",   // a
  LAST: "\u042f",    // YA
  last: "\u044f",    // ya
  MIDDLE: "\u0427",  // CHE
  middle: "\u0447",  // che
  // No characters lie between the two cases in Cyrillic, so there is no
  // sample character for that slot.
  BetweenCases: false
};
|
||||
|
||||
// The three sigma characters used by the tests below: the capital letter,
// the small letter, and the alternative small form.
var SIGMA = "\u03a3",
    sigma = "\u03c3",
    alternative_sigma = "\u03c2";
|
||||
|
||||
// Greek sample characters, laid out with the same keys as the Cyrillic
// table.  The middle letter is sigma.
var greek = {
  FIRST: "\u0391",   // ALPHA
  first: "\u03b1",   // alpha
  LAST: "\u03a9",    // OMEGA
  last: "\u03c9",    // omega
  MIDDLE: SIGMA,     // SIGMA
  middle: sigma,     // sigma
  // Epsilon acute sits between the ALPHA-OMEGA block and the alpha-omega
  // block, i.e. between OMEGA and alpha.
  BetweenCases: "\u03ad"
};
|
||||
|
||||
|
||||
// Builds a regular expression consisting of the single character class
// [from-to].  |flags| is passed to the RegExp constructor unchanged and may
// be omitted.
function Range(from, to, flags) {
  var char_class = "[" + from + "-" + to + "]";
  return new RegExp(char_class, flags);
}
|
||||
|
||||
// For each alphabet, build a [first-last] range from the letters of one case
// and test it against letters of both cases, with and without the "i" flag.
for (var lang = 0; lang < 2; lang++) {
  var chars;
  if (lang == 0) {
    chars = cyrillic;
  } else {
    chars = greek;
  }

  for (var i = 0; i < 2; i++) {
    // The first pass builds the range from lower-case letters, the second
    // pass from upper-case letters.
    var lc = (i == 0);  // Lower case.
    var first, middle, last;
    var first_other_case, middle_other_case, last_other_case;
    if (lc) {
      first = chars.first;
      middle = chars.middle;
      last = chars.last;
      first_other_case = chars.FIRST;
      middle_other_case = chars.MIDDLE;
      last_other_case = chars.LAST;
    } else {
      first = chars.FIRST;
      middle = chars.MIDDLE;
      last = chars.LAST;
      first_other_case = chars.first;
      middle_other_case = chars.middle;
      last_other_case = chars.last;
    }

    // Case-sensitive range: matches its own case...
    assertTrue(Range(first, last).test(first), 1);
    assertTrue(Range(first, last).test(middle), 2);
    assertTrue(Range(first, last).test(last), 3);

    // ...but not the other case.
    assertFalse(Range(first, last).test(first_other_case), 4);
    assertFalse(Range(first, last).test(middle_other_case), 5);
    assertFalse(Range(first, last).test(last_other_case), 6);

    // Case-independent range: matches its own case...
    assertTrue(Range(first, last, "i").test(first), 7);
    assertTrue(Range(first, last, "i").test(middle), 8);
    assertTrue(Range(first, last, "i").test(last), 9);

    // ...and the other case as well.
    assertTrue(Range(first, last, "i").test(first_other_case), 10);
    assertTrue(Range(first, last, "i").test(middle_other_case), 11);
    assertTrue(Range(first, last, "i").test(last_other_case), 12);

    // A character lying between the two case blocks is in neither range.
    if (chars.BetweenCases) {
      assertFalse(Range(first, last).test(chars.BetweenCases), 13);
      assertFalse(Range(first, last, "i").test(chars.BetweenCases), 14);
    }
  }

  // A range spanning both case blocks does contain the in-between character.
  if (chars.BetweenCases) {
    assertTrue(Range(chars.FIRST, chars.last).test(chars.BetweenCases), 15);
    assertTrue(Range(chars.FIRST, chars.last, "i").test(chars.BetweenCases), 16);
  }
}
|
||||
|
||||
// Every Greek sample character, and every Cyrillic one that is defined, must
// fall inside the range running from Greek ALPHA to Cyrillic ya.
// |key| is declared with var so the loop does not assign to an undeclared
// variable, which creates an implicit global and is a ReferenceError in
// strict mode.
for (var key in greek) {
  assertTrue(Range(greek.FIRST, cyrillic.last).test(greek[key]), 17 + key);
  // cyrillic.BetweenCases is false, so that key is skipped here.
  if (cyrillic[key]) {
    assertTrue(Range(greek.FIRST, cyrillic.last).test(cyrillic[key]), 18 + key);
  }
}
|
||||
|
||||
|
||||
// A range from Greek alpha to Cyrillic YA holds lower-case Greek and
// upper-case Cyrillic letters.  The opposite cases must match only when the
// "i" flag is given.
for (var i = 0; i < 2; i++) {
  var ignore_case = (i == 0);
  var flag;
  if (ignore_case) {
    flag = "i";
  } else {
    flag = "";
  }

  // Letters that are literally inside the range always match.
  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(greek.first), 19);
  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(greek.middle), 20);
  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(greek.last), 21);

  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(cyrillic.FIRST), 22);
  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(cyrillic.MIDDLE), 23);
  assertTrue(Range(greek.first, cyrillic.LAST, flag).test(cyrillic.LAST), 24);

  // Their case counterparts match exactly when case is ignored.
  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(greek.FIRST), 25);
  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(greek.MIDDLE), 26);
  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(greek.LAST), 27);

  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(cyrillic.first), 28);
  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(cyrillic.middle), 29);
  assertEquals(ignore_case, Range(greek.first, cyrillic.LAST, flag).test(cyrillic.last), 30);
}
|
||||
|
||||
|
||||
// Sigma has three forms.  Each one is tested both as a character class
// ("[x]", first pass) and as a plain one-character pattern (second pass),
// with and without the "i" flag.
for (var i = 0; i < 2; i++) {
  var simple = (i != 0);
  var name;
  var regex;
  if (simple) {
    name = "";
    regex = SIGMA;
  } else {
    name = "[]";
    regex = "[" + SIGMA + "]";
  }

  // Pattern built from capital sigma.
  assertFalse(new RegExp(regex).test(sigma), 31 + name);
  assertFalse(new RegExp(regex).test(alternative_sigma), 32 + name);
  assertTrue(new RegExp(regex).test(SIGMA), 33 + name);

  assertTrue(new RegExp(regex, "i").test(sigma), 34 + name);
  // JSC and Tracemonkey fail this one.
  assertTrue(new RegExp(regex, "i").test(alternative_sigma), 35 + name);
  assertTrue(new RegExp(regex, "i").test(SIGMA), 36 + name);

  // Pattern built from small sigma.
  if (simple) {
    regex = sigma;
  } else {
    regex = "[" + sigma + "]";
  }

  assertTrue(new RegExp(regex).test(sigma), 41 + name);
  assertFalse(new RegExp(regex).test(alternative_sigma), 42 + name);
  assertFalse(new RegExp(regex).test(SIGMA), 43 + name);

  assertTrue(new RegExp(regex, "i").test(sigma), 44 + name);
  // JSC and Tracemonkey fail this one.
  assertTrue(new RegExp(regex, "i").test(alternative_sigma), 45 + name);
  assertTrue(new RegExp(regex, "i").test(SIGMA), 46 + name);

  // Pattern built from the alternative small sigma.
  if (simple) {
    regex = alternative_sigma;
  } else {
    regex = "[" + alternative_sigma + "]";
  }

  assertFalse(new RegExp(regex).test(sigma), 51 + name);
  assertTrue(new RegExp(regex).test(alternative_sigma), 52 + name);
  assertFalse(new RegExp(regex).test(SIGMA), 53 + name);

  // JSC and Tracemonkey fail this one.
  assertTrue(new RegExp(regex, "i").test(sigma), 54 + name);
  assertTrue(new RegExp(regex, "i").test(alternative_sigma), 55 + name);
  // JSC and Tracemonkey fail this one.
  assertTrue(new RegExp(regex, "i").test(SIGMA), 56 + name);
}

print("ok");
|
30
test/mjsunit/regress/regress-486.js
Normal file
30
test/mjsunit/regress/regress-486.js
Normal file
@ -0,0 +1,30 @@
|
||||
// Copyright 2009 the V8 project authors. All rights reserved.
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above
|
||||
// copyright notice, this list of conditions and the following
|
||||
// disclaimer in the documentation and/or other materials provided
|
||||
// with the distribution.
|
||||
// * Neither the name of Google Inc. nor the names of its
|
||||
// contributors may be used to endorse or promote products derived
|
||||
// from this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
// Regression test for issue 486: a case-independent character class that
// lists only lower-case Cyrillic and Latin letters must still match a word
// that starts with an upper-case Cyrillic letter.
var st = "\u0422\u0435\u0441\u0442";  // Test in Cyrillic characters.
var cyrillicMatch =
    /^[\u0430-\u044fa-z]+$/i.test(st);  // a-ja a-z.
assertTrue(cyrillicMatch);
|
Loading…
Reference in New Issue
Block a user