From 92bfd13457c80f02be01551f4ea9a5badfe0e4c4 Mon Sep 17 00:00:00 2001 From: yangguo Date: Tue, 14 Jun 2016 06:53:19 -0700 Subject: [PATCH] [regexp] implement \p{Any}, \p{Ascii}, and \p{Assigned}. R=littledan@chromium.org, mathias@qiwi.be BUG=v8:4743 Review-Url: https://codereview.chromium.org/2059113002 Cr-Commit-Position: refs/heads/master@{#36969} --- src/regexp/regexp-parser.cc | 65 +++++++++++++------ src/regexp/regexp-parser.h | 2 +- .../regexp-property-general-category.js | 16 ++--- test/mjsunit/harmony/regexp-property-lu-ui.js | 13 ++++ .../harmony/regexp-property-special.js | 51 +++++++++++++++ 5 files changed, 119 insertions(+), 28 deletions(-) create mode 100644 test/mjsunit/harmony/regexp-property-lu-ui.js create mode 100644 test/mjsunit/harmony/regexp-property-special.js diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc index bdfa13f719..323c378cb6 100644 --- a/src/regexp/regexp-parser.cc +++ b/src/regexp/regexp-parser.cc @@ -362,11 +362,11 @@ RegExpTree* RegExpParser::ParseDisjunction() { if (FLAG_harmony_regexp_property) { ZoneList* ranges = new (zone()) ZoneList(2, zone()); - if (!ParsePropertyClass(ranges)) { + if (!ParsePropertyClass(ranges, p == 'P')) { return ReportError(CStrVector("Invalid property name")); } RegExpCharacterClass* cc = - new (zone()) RegExpCharacterClass(ranges, p == 'P'); + new (zone()) RegExpCharacterClass(ranges, false); builder->AddCharacterClass(cc); } else { // With /u, no identity escapes except for syntax characters @@ -845,6 +845,9 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) { } #ifdef V8_I18N_SUPPORT + +namespace { + bool IsExactPropertyAlias(const char* property_name, UProperty property) { const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME); if (short_name != NULL && strcmp(property_name, short_name) == 0) return true; @@ -875,7 +878,7 @@ bool IsExactPropertyValueAlias(const char* property_value_name, } bool LookupPropertyValueName(UProperty property, - const char* property_value_name, + const char* property_value_name, bool negate, ZoneList* result, Zone* zone) { int32_t property_value = u_getPropertyValueEnum(property, property_value_name); @@ -895,6 +898,7 @@ bool LookupPropertyValueName(UProperty property, if (success) { uset_removeAllStrings(set); + if (negate) uset_complement(set); int item_count = uset_getItemCount(set); int item_result = 0; for (int i = 0; i < item_count; i++) { @@ -910,7 +914,33 @@ bool LookupPropertyValueName(UProperty property, return success; } -bool RegExpParser::ParsePropertyClass(ZoneList* result) { +template +inline bool NameEquals(const char* name, const char (&literal)[N]) { + return strncmp(name, literal, N + 1) == 0; +} + +bool LookupSpecialPropertyValueName(const char* name, + ZoneList* result, + bool negate, Zone* zone) { + if (NameEquals(name, "Any")) { + if (!negate) result->Add(CharacterRange::Everything(), zone); + } else if (NameEquals(name, "ASCII")) { + result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint) + : CharacterRange::Range(0x0, 0x7f), + zone); + } else if (NameEquals(name, "Assigned")) { + return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned", + !negate, result, zone); + } else { + return false; + } + return true; +} + +} // anonymous namespace + +bool RegExpParser::ParsePropertyClass(ZoneList* result, + bool negate) { // Parse the property class as follows: // - In \p{name}, 'name' is interpreted // - either as a general category property value name. @@ -943,8 +973,12 @@ bool RegExpParser::ParsePropertyClass(ZoneList* result) { if (second_part.is_empty()) { // First attempt to interpret as general category property value name. const char* name = first_part.ToConstVector().start(); - if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, result, - zone())) { + if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate, + result, zone())) { + return true; + } + // Interpret "Any", "ASCII", and "Assigned". + if (LookupSpecialPropertyValueName(name, result, negate, zone())) { return true; } // Then attempt to interpret as binary property name with value name 'Y'. @@ -952,7 +986,8 @@ bool RegExpParser::ParsePropertyClass(ZoneList* result) { if (property < UCHAR_BINARY_START) return false; if (property >= UCHAR_BINARY_LIMIT) return false; if (!IsExactPropertyAlias(name, property)) return false; - return LookupPropertyValueName(property, "Y", result, zone()); + return LookupPropertyValueName(property, negate ? "N" : "Y", false, result, + zone()); } else { // Both property name and value name are specified. Attempt to interpret // the property name as enumerated property. @@ -962,7 +997,8 @@ bool RegExpParser::ParsePropertyClass(ZoneList* result) { if (property < UCHAR_INT_START) return false; if (property >= UCHAR_INT_LIMIT) return false; if (!IsExactPropertyAlias(property_name, property)) return false; - return LookupPropertyValueName(property, value_name, result, zone()); + return LookupPropertyValueName(property, value_name, negate, result, + zone()); } } @@ -1159,19 +1195,10 @@ bool RegExpParser::ParseClassProperty(ZoneList* ranges) { bool parse_success = false; if (next == 'p') { Advance(2); - parse_success = ParsePropertyClass(ranges); + parse_success = ParsePropertyClass(ranges, false); } else if (next == 'P') { Advance(2); - ZoneList* property_class = - new (zone()) ZoneList(2, zone()); - parse_success = ParsePropertyClass(property_class); - if (parse_success) { - ZoneList* negated = - new (zone()) ZoneList(2, zone()); - CharacterRange::Negate(property_class, negated, zone()); - const Vector negated_vector = negated->ToVector(); - ranges->AddAll(negated_vector, zone()); - } + parse_success = ParsePropertyClass(ranges, true); } else { return false; } diff --git a/src/regexp/regexp-parser.h b/src/regexp/regexp-parser.h index 6142a9ea53..93e4bf7a87 100644 --- a/src/regexp/regexp-parser.h +++ b/src/regexp/regexp-parser.h @@ -174,7 +174,7 @@ class RegExpParser BASE_EMBEDDED { bool ParseHexEscape(int length, uc32* value); bool ParseUnicodeEscape(uc32* value); bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value); - bool ParsePropertyClass(ZoneList* result); + bool ParsePropertyClass(ZoneList* result, bool negate); uc32 ParseOctalLiteral(); diff --git a/test/mjsunit/harmony/regexp-property-general-category.js b/test/mjsunit/harmony/regexp-property-general-category.js index 7ef63117df..03a526844d 100644 --- a/test/mjsunit/harmony/regexp-property-general-category.js +++ b/test/mjsunit/harmony/regexp-property-general-category.js @@ -30,10 +30,10 @@ assertTrue(/\p{Ll}/iu.test("a")); assertTrue(/\p{Ll}/iu.test("\u{118D4}")); assertTrue(/\p{Ll}/iu.test("A")); assertTrue(/\p{Ll}/iu.test("\u{118B4}")); -assertFalse(/\P{Ll}/iu.test("a")); -assertFalse(/\P{Ll}/iu.test("\u{118D4}")); -assertFalse(/\P{Ll}/iu.test("A")); -assertFalse(/\P{Ll}/iu.test("\u{118B4}")); +assertTrue(/\P{Ll}/iu.test("a")); +assertTrue(/\P{Ll}/iu.test("\u{118D4}")); +assertTrue(/\P{Ll}/iu.test("A")); +assertTrue(/\P{Ll}/iu.test("\u{118B4}")); assertTrue(/\p{Lu}/u.test("A")); assertFalse(/\P{Lu}/u.test("A")); @@ -48,10 +48,10 @@ assertTrue(/\p{Lu}/iu.test("a")); assertTrue(/\p{Lu}/iu.test("\u{118D4}")); assertTrue(/\p{Lu}/iu.test("A")); assertTrue(/\p{Lu}/iu.test("\u{118B4}")); -assertFalse(/\P{Lu}/iu.test("a")); -assertFalse(/\P{Lu}/iu.test("\u{118D4}")); -assertFalse(/\P{Lu}/iu.test("A")); -assertFalse(/\P{Lu}/iu.test("\u{118B4}")); +assertTrue(/\P{Lu}/iu.test("a")); +assertTrue(/\P{Lu}/iu.test("\u{118D4}")); +assertTrue(/\P{Lu}/iu.test("A")); +assertTrue(/\P{Lu}/iu.test("\u{118B4}")); assertTrue(/\p{Sm}/u.test("+")); assertFalse(/\P{Sm}/u.test("+")); diff --git a/test/mjsunit/harmony/regexp-property-lu-ui.js b/test/mjsunit/harmony/regexp-property-lu-ui.js new file mode 100644 index 0000000000..56ec53d2f4 --- /dev/null +++ b/test/mjsunit/harmony/regexp-property-lu-ui.js @@ -0,0 +1,13 @@ +// Copyright 2016 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Flags: --harmony-regexp-property --harmony-unicode-regexps + +const regexp = /\P{Lu}/ui; +const regexpu = /[\0-@\[-\xBF\xD7\xDF-\xFF\u0101\u0103\u0105\u0107\u0109\u010B\u010D\u010F\u0111\u0113\u0115\u0117\u0119\u011B\u011D\u011F\u0121\u0123\u0125\u0127\u0129\u012B\u012D\u012F\u0131\u0133\u0135\u0137\u0138\u013A\u013C\u013E\u0140\u0142\u0144\u0146\u0148\u0149\u014B\u014D\u014F\u0151\u0153\u0155\u0157\u0159\u015B\u015D\u015F\u0161\u0163\u0165\u0167\u0169\u016B\u016D\u016F\u0171\u0173\u0175\u0177\u017A\u017C\u017E-\u0180\u0183\u0185\u0188\u018C\u018D\u0192\u0195\u0199-\u019B\u019E\u01A1\u01A3\u01A5\u01A8\u01AA\u01AB\u01AD\u01B0\u01B4\u01B6\u01B9-\u01BB\u01BD-\u01C3\u01C5\u01C6\u01C8\u01C9\u01CB\u01CC\u01CE\u01D0\u01D2\u01D4\u01D6\u01D8\u01DA\u01DC\u01DD\u01DF\u01E1\u01E3\u01E5\u01E7\u01E9\u01EB\u01ED\u01EF\u01F0\u01F2\u01F3\u01F5\u01F9\u01FB\u01FD\u01FF\u0201\u0203\u0205\u0207\u0209\u020B\u020D\u020F\u0211\u0213\u0215\u0217\u0219\u021B\u021D\u021F\u0221\u0223\u0225\u0227\u0229\u022B\u022D\u022F\u0231\u0233-\u0239\u023C\u023F\u0240\u0242\u0247\u0249\u024B\u024D\u024F-\u036F\u0371\u0373-\u0375\u0377-\u037E\u0380-\u0385\u0387\u038B\u038D\u0390\u03A2\u03AC-\u03CE\u03D0\u03D1\u03D5-\u03D7\u03D9\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F3\u03F5\u03F6\u03F8\u03FB\u03FC\u0430-\u045F\u0461\u0463\u0465\u0467\u0469\u046B\u046D\u046F\u0471\u0473\u0475\u0477\u0479\u047B\u047D\u047F\u0481-\u0489\u048B\u048D\u048F\u0491\u0493\u0495\u0497\u0499\u049B\u049D\u049F\u04A1\u04A3\u04A5\u04A7\u04A9\u04AB\u04AD\u04AF\u04B1\u04B3\u04B5\u04B7\u04B9\u04BB\u04BD\u04BF\u04C2\u04C4\u04C6\u04C8\u04CA\u04CC\u04CE\u04CF\u04D1\u04D3\u04D5\u04D7\u04D9\u04DB\u04DD\u04DF\u04E1\u04E3\u04E5\u04E7\u04E9\u04EB\u04ED\u04EF\u04F1\u04F3\u04F5\u04F7\u04F9\u04FB\u04FD\u04FF\u0501\u0503\u0505\u0507\u0509\u050B\u050D\u050F\u0511\u0513\u0515\u0517\u0519\u051B\u051D\u051F\u0521\u0523\u0525\u0527\u0529\u052B\u052D\u052F\u0530\u0557-\u109F\u10C6\u10C8-\u10CC\u10CE-\u139F\u13F6-\u1DFF\u1E01\u1E03\u1E05\u1E07\u1E09\u1E0B\u1E0D\u1E0F\u1E11\u1E13\u1E15\u1E17\u1E19\u1E1B\u1E1D\u1E1F\u1E21\u1E23\u1E25\u1E27\u1E29\u1E2B\u1E2D\u1E2F\u1E31\u1E33\u1E35\u1E37\u1E39\u1E3B\u1E3D\u1E3F\u1E41\u1E43\u1E45\u1E47\u1E49\u1E4B\u1E4D\u1E4F\u1E51\u1E53\u1E55\u1E57\u1E59\u1E5B\u1E5D\u1E5F\u1E61\u1E63\u1E65\u1E67\u1E69\u1E6B\u1E6D\u1E6F\u1E71\u1E73\u1E75\u1E77\u1E79\u1E7B\u1E7D\u1E7F\u1E81\u1E83\u1E85\u1E87\u1E89\u1E8B\u1E8D\u1E8F\u1E91\u1E93\u1E95-\u1E9D\u1E9F\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7\u1EB9\u1EBB\u1EBD\u1EBF\u1EC1\u1EC3\u1EC5\u1EC7\u1EC9\u1ECB\u1ECD\u1ECF\u1ED1\u1ED3\u1ED5\u1ED7\u1ED9\u1EDB\u1EDD\u1EDF\u1EE1\u1EE3\u1EE5\u1EE7\u1EE9\u1EEB\u1EED\u1EEF\u1EF1\u1EF3\u1EF5\u1EF7\u1EF9\u1EFB\u1EFD\u1EFF-\u1F07\u1F10-\u1F17\u1F1E-\u1F27\u1F30-\u1F37\u1F40-\u1F47\u1F4E-\u1F58\u1F5A\u1F5C\u1F5E\u1F60-\u1F67\u1F70-\u1FB7\u1FBC-\u1FC7\u1FCC-\u1FD7\u1FDC-\u1FE7\u1FED-\u1FF7\u1FFC-\u2101\u2103-\u2106\u2108-\u210A\u210E\u210F\u2113\u2114\u2116-\u2118\u211E-\u2123\u2125\u2127\u2129\u212E\u212F\u2134-\u213D\u2140-\u2144\u2146-\u2182\u2184-\u2BFF\u2C2F-\u2C5F\u2C61\u2C65\u2C66\u2C68\u2C6A\u2C6C\u2C71\u2C73\u2C74\u2C76-\u2C7D\u2C81\u2C83\u2C85\u2C87\u2C89\u2C8B\u2C8D\u2C8F\u2C91\u2C93\u2C95\u2C97\u2C99\u2C9B\u2C9D\u2C9F\u2CA1\u2CA3\u2CA5\u2CA7\u2CA9\u2CAB\u2CAD\u2CAF\u2CB1\u2CB3\u2CB5\u2CB7\u2CB9\u2CBB\u2CBD\u2CBF\u2CC1\u2CC3\u2CC5\u2CC7\u2CC9\u2CCB\u2CCD\u2CCF\u2CD1\u2CD3\u2CD5\u2CD7\u2CD9\u2CDB\u2CDD\u2CDF\u2CE1\u2CE3-\u2CEA\u2CEC\u2CEE-\u2CF1\u2CF3-\uA63F\uA641\uA643\uA645\uA647\uA649\uA64B\uA64D\uA64F\uA651\uA653\uA655\uA657\uA659\uA65B\uA65D\uA65F\uA661\uA663\uA665\uA667\uA669\uA66B\uA66D-\uA67F\uA681\uA683\uA685\uA687\uA689\uA68B\uA68D\uA68F\uA691\uA693\uA695\uA697\uA699\uA69B-\uA721\uA723\uA725\uA727\uA729\uA72B\uA72D\uA72F-\uA731\uA733\uA735\uA737\uA739\uA73B\uA73D\uA73F\uA741\uA743\uA745\uA747\uA749\uA74B\uA74D\uA74F\uA751\uA753\uA755\uA757\uA759\uA75B\uA75D\uA75F\uA761\uA763\uA765\uA767\uA769\uA76B\uA76D\uA76F-\uA778\uA77A\uA77C\uA77F\uA781\uA783\uA785\uA787-\uA78A\uA78C\uA78E\uA78F\uA791\uA793-\uA795\uA797\uA799\uA79B\uA79D\uA79F\uA7A1\uA7A3\uA7A5\uA7A7\uA7A9\uA7AE\uA7AF\uA7B5\uA7B7-\uFF20\uFF3B-\u{103FF}\u{10428}-\u{10C7F}\u{10CB3}-\u{1189F}\u{118C0}-\u{1D3FF}\u{1D41A}-\u{1D433}\u{1D44E}-\u{1D467}\u{1D482}-\u{1D49B}\u{1D49D}\u{1D4A0}\u{1D4A1}\u{1D4A3}\u{1D4A4}\u{1D4A7}\u{1D4A8}\u{1D4AD}\u{1D4B6}-\u{1D4CF}\u{1D4EA}-\u{1D503}\u{1D506}\u{1D50B}\u{1D50C}\u{1D515}\u{1D51D}-\u{1D537}\u{1D53A}\u{1D53F}\u{1D545}\u{1D547}-\u{1D549}\u{1D551}-\u{1D56B}\u{1D586}-\u{1D59F}\u{1D5BA}-\u{1D5D3}\u{1D5EE}-\u{1D607}\u{1D622}-\u{1D63B}\u{1D656}-\u{1D66F}\u{1D68A}-\u{1D6A7}\u{1D6C1}-\u{1D6E1}\u{1D6FB}-\u{1D71B}\u{1D735}-\u{1D755}\u{1D76F}-\u{1D78F}\u{1D7A9}-\u{1D7C9}\u{1D7CB}-\u{10FFFF}]/ui; + +for (let codePoint = 0; codePoint <= 0x10FFFF; codePoint++) { + const string = String.fromCodePoint(codePoint); + assertEquals(regexp.test(string), regexpu.test(string)); +} diff --git a/test/mjsunit/harmony/regexp-property-special.js b/test/mjsunit/harmony/regexp-property-special.js new file mode 100644 index 0000000000..99ffe07498 --- /dev/null +++ b/test/mjsunit/harmony/regexp-property-special.js @@ -0,0 +1,51 @@ +// Copyright 2016 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Flags: --harmony-regexp-property --harmony-unicode-regexps + +function t(re, s) { assertTrue(re.test(s)); } +function f(re, s) { assertFalse(re.test(s)); } + +t(/\p{ASCII}+/u, "abc123"); +f(/\p{ASCII}+/u, "ⓐⓑⓒ①②③"); +f(/\p{ASCII}+/u, "🄰🄱🄲①②③"); +f(/\P{ASCII}+/u, "abcd123"); +t(/\P{ASCII}+/u, "ⓐⓑⓒ①②③"); +t(/\P{ASCII}+/u, "🄰🄱🄲①②③"); + +f(/[^\p{ASCII}]+/u, "abc123"); +f(/[\p{ASCII}]+/u, "ⓐⓑⓒ①②③"); +f(/[\p{ASCII}]+/u, "🄰🄱🄲①②③"); +t(/[^\P{ASCII}]+/u, "abcd123"); +t(/[\P{ASCII}]+/u, "ⓐⓑⓒ①②③"); +f(/[^\P{ASCII}]+/u, "🄰🄱🄲①②③"); + +t(/\p{Any}+/u, "🄰🄱🄲①②③"); + +assertEquals(["\ud800"], /\p{Any}/u.exec("\ud800\ud801")); +assertEquals(["\udc00"], /\p{Any}/u.exec("\udc00\udc01")); +assertEquals(["\ud800\udc01"], /\p{Any}/u.exec("\ud800\udc01")); +assertEquals(["\udc01"], /\p{Any}/u.exec("\udc01")); + +f(/\P{Any}+/u, "123"); +f(/[\P{Any}]+/u, "123"); +t(/[\P{Any}\d]+/u, "123"); +t(/[^\P{Any}]+/u, "123"); + +t(/\p{Assigned}+/u, "123"); +t(/\p{Assigned}+/u, "🄰🄱🄲"); +f(/\p{Assigned}+/u, "\ufdd0"); +f(/\p{Assigned}+/u, "\u{fffff}"); + +f(/\P{Assigned}+/u, "123"); +f(/\P{Assigned}+/u, "🄰🄱🄲"); +t(/\P{Assigned}+/u, "\ufdd0"); +t(/\P{Assigned}+/u, "\u{fffff}"); +f(/\P{Assigned}/u, ""); + +t(/[^\P{Assigned}]+/u, "123"); +f(/[\P{Assigned}]+/u, "🄰🄱🄲"); +f(/[^\P{Assigned}]+/u, "\ufdd0"); +t(/[\P{Assigned}]+/u, "\u{fffff}"); +f(/[\P{Assigned}]/u, "");