From be20c55b3e00c189e532c960091b29cf4a50c303 Mon Sep 17 00:00:00 2001 From: "lrn@chromium.org" Date: Thu, 2 Dec 2010 08:02:37 +0000 Subject: [PATCH] Change RegExp syntax to fail on invalid ranges like [\d-x], [x-\d] and [\d-\d]. The previous behavior was to treat the "-" as verbatim if the range was invalid. This change matches the JSC changeset http://trac.webkit.org/changeset/72813/ Review URL: http://codereview.chromium.org/5464001 git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@5911 ce2b1a6d-e550-0410-aec6-3dcde31c8c00 --- src/parser.cc | 22 +++++-- test/cctest/test-regexp.cc | 3 - test/mjsunit/regexp.js | 82 +++++++++++++------------ test/mjsunit/third_party/regexp-pcre.js | 24 ++++---- test/mozilla/mozilla.status | 4 ++ 5 files changed, 79 insertions(+), 56 deletions(-) diff --git a/src/parser.cc b/src/parser.cc index d147dff939..db36297a5d 100644 --- a/src/parser.cc +++ b/src/parser.cc @@ -4404,6 +4404,7 @@ CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { RegExpTree* RegExpParser::ParseCharacterClass() { static const char* kUnterminated = "Unterminated character class"; static const char* kRangeOutOfOrder = "Range out of order in character class"; + static const char* kInvalidRange = "Invalid character range"; ASSERT_EQ(current(), '['); Advance(); @@ -4412,12 +4413,28 @@ RegExpTree* RegExpParser::ParseCharacterClass() { is_negated = true; Advance(); } + // A CharacterClass is a sequence of single characters, character class + // escapes or ranges. Ranges are on the form "x-y" where x and y are + // single characters (and not character class escapes like \s). + // A "-" may occur at the start or end of the character class (just after + // "[" or "[^", or just before "]") without being considered part of a + // range. A "-" may also appear as the beginning or end of a range. + // I.e., [--+] is valid, so is [!--]. + ZoneList* ranges = new ZoneList(2); while (has_more() && current() != ']') { uc16 char_class = 0; CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED); if (char_class) { CharacterRange::AddClassEscape(char_class, ranges); + if (current() == '-') { + Advance(); + ranges->Add(CharacterRange::Singleton('-')); + if (current() != ']') { + ReportError(CStrVector(kInvalidRange) CHECK_FAILED); + } + break; + } continue; } if (current() == '-') { @@ -4433,10 +4450,7 @@ RegExpTree* RegExpParser::ParseCharacterClass() { } CharacterRange next = ParseClassAtom(&char_class CHECK_FAILED); if (char_class) { - ranges->Add(first); - ranges->Add(CharacterRange::Singleton('-')); - CharacterRange::AddClassEscape(char_class, ranges); - continue; + ReportError(CStrVector(kInvalidRange) CHECK_FAILED); } if (first.from() > next.to()) { return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED); diff --git a/test/cctest/test-regexp.cc b/test/cctest/test-regexp.cc index 3e6709aef4..3bf24a8ab1 100644 --- a/test/cctest/test-regexp.cc +++ b/test/cctest/test-regexp.cc @@ -173,9 +173,6 @@ TEST(Parser) { CHECK_PARSE_EQ("[a-b-c]", "[a-b - c]"); CHECK_PARSE_EQ("[\\d]", "[0-9]"); CHECK_PARSE_EQ("[x\\dz]", "[x 0-9 z]"); - CHECK_PARSE_EQ("[\\d-z]", "[0-9 - z]"); - CHECK_PARSE_EQ("[\\d-\\d]", "[0-9 - 0-9]"); - CHECK_PARSE_EQ("[z-\\d]", "[z - 0-9]"); CHECK_PARSE_EQ("\\cj\\cJ\\ci\\cI\\ck\\cK", "'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'"); CHECK_PARSE_EQ("\\c!", "'c!'"); diff --git a/test/mjsunit/regexp.js b/test/mjsunit/regexp.js index b57b86d2d8..59c3ba8d28 100644 --- a/test/mjsunit/regexp.js +++ b/test/mjsunit/regexp.js @@ -110,44 +110,6 @@ assertFalse(re.test("\\]")); assertFalse(re.test("\x03]")); // I.e., read as \cc -// Test that we handle \s and \S correctly inside some bizarre -// character classes. -re = /[\s-:]/; -assertTrue(re.test('-')); -assertTrue(re.test(':')); -assertTrue(re.test(' ')); -assertTrue(re.test('\t')); -assertTrue(re.test('\n')); -assertFalse(re.test('a')); -assertFalse(re.test('Z')); - -re = /[\S-:]/; -assertTrue(re.test('-')); -assertTrue(re.test(':')); -assertFalse(re.test(' ')); -assertFalse(re.test('\t')); -assertFalse(re.test('\n')); -assertTrue(re.test('a')); -assertTrue(re.test('Z')); - -re = /[^\s-:]/; -assertFalse(re.test('-')); -assertFalse(re.test(':')); -assertFalse(re.test(' ')); -assertFalse(re.test('\t')); -assertFalse(re.test('\n')); -assertTrue(re.test('a')); -assertTrue(re.test('Z')); - -re = /[^\S-:]/; -assertFalse(re.test('-')); -assertFalse(re.test(':')); -assertTrue(re.test(' ')); -assertTrue(re.test('\t')); -assertTrue(re.test('\n')); -assertFalse(re.test('a')); -assertFalse(re.test('Z')); - re = /[\s]/; assertFalse(re.test('-')); assertFalse(re.test(':')); @@ -647,3 +609,47 @@ assertEquals(4, re.exec("zimzamzumba").index); assertEquals(["bc"], re.exec("zimzomzumbc")); assertFalse(re.test("c")); assertFalse(re.test("")); + + +function testInvalidRange(str) { + try { + RegExp(str).test("x"); + } catch (e) { + return; + } + assetUnreachable("Allowed invalid range in " + str); +} + +function testValidRange(str) { + try { + RegExp(str).test("x"); + } catch (e) { + assertUnreachable("Shouldn't fail parsing: " + str + ", was: " + e); + } +} + +testInvalidRange("[\\d-z]"); +testInvalidRange("[z-\\d]"); +testInvalidRange("[\\d-\\d]"); +testInvalidRange("[z-x]"); // Larger value first. +testInvalidRange("[x-\\d-\\d]"); + +testValidRange("[x-z]"); +testValidRange("[!--\d]"); // Second "-" is end of range. +testValidRange("[\d-]"); +testValidRange("[-\d]"); +testValidRange("[-\d-]"); +testValidRange("[^-\d-]"); +testValidRange("[^-\d-]"); +testValidRange("[0-9-\w]"); + +// Escaped dashes do not count as range operators. +testValidRange("[\\d\\-z]"); +testValidRange("[z\\-\\d]"); +testValidRange("[\\d\\-\\d]"); +testValidRange("[z\\-x]"); +testValidRange("[x\\-\\d\\-\\d]"); + + + + diff --git a/test/mjsunit/third_party/regexp-pcre.js b/test/mjsunit/third_party/regexp-pcre.js index dcb1b320fd..d9fa976855 100644 --- a/test/mjsunit/third_party/regexp-pcre.js +++ b/test/mjsunit/third_party/regexp-pcre.js @@ -962,7 +962,7 @@ res[882] = /[az-]+/; res[883] = /[a\-z]+/; res[884] = /[a-z]+/; res[885] = /[\d-]+/; -res[886] = /[\d-z]+/; +// res[886] - Disabled after making [\d-z] invalid to be compatible with JSC. res[887] = /\x5c/; res[888] = /\x20Z/; res[889] = /ab{3cd/; @@ -1346,7 +1346,7 @@ res[1266] = /((Z)+|A)*/; res[1267] = /(Z()|A)*/; res[1268] = /(Z(())|A)*/; res[1269] = /a*/g; -res[1270] = /^[\d-a]/; +// res[1270] disabled after making /^[\d-a]/ invalid to be compatible with JSC. res[1271] = /[[:space:]]+/; res[1272] = /[[:blank:]]+/; res[1273] = /[\s]+/; @@ -2530,7 +2530,7 @@ assertEquals(null, res[431].exec("a\x0db ", 882)); assertEquals(null, res[431].exec("a\x85b", 883)); assertThrows("var re = /(?-+a)/;", 884); assertEquals(null, res[443].exec("aaaa", 885)); -assertEquals(null, res[443].exec("bacxxx", 886)); +// assertEquals(null, res[443].exec("bacxxx", 886)); assertEquals(null, res[443].exec("bbaccxxx ", 887)); assertEquals(null, res[443].exec("bbbacccxx", 888)); assertEquals(null, res[443].exec("aaaa", 889)); @@ -4391,9 +4391,10 @@ assertEquals("abcdxyz", res[884].exec("abcdxyz"), 2743); assertEquals("12-34", res[885].exec("12-34"), 2744); assertEquals(null, res[885].exec("*** Failers", 2745)); assertEquals(null, res[885].exec("aaa", 2746)); -assertEquals("12-34z", res[886].exec("12-34z"), 2747); -assertEquals(null, res[886].exec("*** Failers", 2748)); -assertEquals(null, res[886].exec("aaa", 2749)); +// Disabled. To be compatible with JSC, the regexp is no longer valid. +// assertEquals("12-34z", res[886].exec("12-34z"), 2747); +// assertEquals(null, res[886].exec("*** Failers", 2748)); +// assertEquals(null, res[886].exec("aaa", 2749)); assertEquals("\\", res[887].exec("\\\\"), 2750); assertEquals(" Z", res[888].exec("the Zoo"), 2751); assertEquals(null, res[888].exec("*** Failers", 2752)); @@ -5355,11 +5356,12 @@ assertEquals("", res[1269].exec("-things"), 3707); assertEquals("", res[1269].exec("0digit"), 3708); assertEquals("", res[1269].exec("*** Failers"), 3709); assertEquals("", res[1269].exec("bcdef "), 3710); -assertEquals("a", res[1270].exec("abcde"), 3711); -assertEquals("-", res[1270].exec("-things"), 3712); -assertEquals("0", res[1270].exec("0digit"), 3713); -assertEquals(null, res[1270].exec("*** Failers", 3714)); -assertEquals(null, res[1270].exec("bcdef ", 3715)); +// Disabled. To be compatible with JSC, the RegExp is no longer valid. +// assertEquals("a", res[1270].exec("abcde"), 3711); +// assertEquals("-", res[1270].exec("-things"), 3712); +// assertEquals("0", res[1270].exec("0digit"), 3713); +// assertEquals(null, res[1270].exec("*** Failers", 3714)); +// assertEquals(null, res[1270].exec("bcdef ", 3715)); assertEquals(null, res[1271].exec("> \x09\n\x0c\x0d\x0b<", 3716)); assertEquals(null, res[1271].exec(" ", 3717)); assertEquals(null, res[1272].exec("> \x09\n\x0c\x0d\x0b<", 3718)); diff --git a/test/mozilla/mozilla.status b/test/mozilla/mozilla.status index 1768c3975d..38d83493d5 100644 --- a/test/mozilla/mozilla.status +++ b/test/mozilla/mozilla.status @@ -288,6 +288,10 @@ js1_2/regexp/RegExp_multiline_as_array: FAIL_OK js1_2/regexp/beginLine: FAIL_OK js1_2/regexp/endLine: FAIL_OK +# To be compatible with JSC, we no longer accept [\d-x], [x-\d] or +# [\d-\d] as valid ranges. +ecma_3/RegExp/regress-375715-02: FAIL +js1_5/extensions/regress-351463-01: FAIL # To be compatible with safari typeof a regexp yields 'function'; # in firefox it yields 'object'.