0fec70aeb1
Add support for properties of strings in unicode sets mode (/v). Bug: v8:11935 Change-Id: Iae2f0182b1c42bb900c524ca406784b7b1b52842 Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4051247 Commit-Queue: Patrick Thier <pthier@chromium.org> Reviewed-by: Mathias Bynens <mathias@chromium.org> Cr-Commit-Position: refs/heads/main@{#84481}
242 lines
9.5 KiB
JavaScript
242 lines
9.5 KiB
JavaScript
// Copyright 2022 the V8 project authors. All rights reserved.
|
||
// Use of this source code is governed by a BSD-style license that can be
|
||
// found in the LICENSE file.
|
||
|
||
// Flags: --harmony-regexp-unicode-sets
|
||
|
||
// u and v are not allowed together.
|
||
assertEarlyError('/./uv');
|
||
assertThrowsAtRuntime("new RegExp('.','uv')", SyntaxError);
|
||
|
||
assertEquals('v', /./v.flags);
|
||
assertTrue(/./v.unicodeSets);
|
||
|
||
// Characters that require escaping within a character class in /v mode
|
||
assertEarlyError('/[(]/v');
|
||
assertEarlyError('/[)]/v');
|
||
assertEarlyError('/[[]/v');
|
||
assertEarlyError('/[]]/v');
|
||
assertEarlyError('/[{]/v');
|
||
assertEarlyError('/[}]/v');
|
||
assertEarlyError('/[/]/v');
|
||
assertEarlyError('/[-]/v');
|
||
// Need to escape the backslash, as assertEarlyError uses eval().
|
||
assertEarlyError('/[\\]/v');
|
||
assertEarlyError('/[|]/v');
|
||
|
||
assertEarlyError('/[&&]/v');
|
||
assertEarlyError('/[!!]/v');
|
||
assertEarlyError('/[##]/v');
|
||
assertEarlyError('/[$$]/v');
|
||
assertEarlyError('/[%%]/v');
|
||
assertEarlyError('/[**]/v');
|
||
assertEarlyError('/[++]/v');
|
||
assertEarlyError('/[,,]/v');
|
||
assertEarlyError('/[..]/v');
|
||
assertEarlyError('/[::]/v');
|
||
assertEarlyError('/[;;]/v');
|
||
assertEarlyError('/[<<]/v');
|
||
assertEarlyError('/[==]/v');
|
||
assertEarlyError('/[>>]/v');
|
||
assertEarlyError('/[??]/v');
|
||
assertEarlyError('/[@@]/v');
|
||
// The first ^ negates the class. The following two are not valid.
|
||
assertEarlyError('/[^^^]/v');
|
||
assertEarlyError('/[``]/v');
|
||
assertEarlyError('/[~~]/v');
|
||
|
||
assertEarlyError('/[a&&&]/v');
|
||
assertEarlyError('/[&&&a]/v');
|
||
|
||
// Unterminated string disjunction.
|
||
assertEarlyError('/[\q{foo]/v');
|
||
assertEarlyError('/[\q{foo|]/v');
|
||
|
||
// Negating classes containing strings is not allowed.
|
||
assertEarlyError('/[^\q{foo}]/v');
|
||
assertEarlyError('/[^\q{}]/v'); // Empty string counts as string.
|
||
assertEarlyError('/[^[\q{foo}]]/v');
|
||
assertEarlyError('/[^[\p{Basic_Emoji}]/v');
|
||
assertEarlyError('/[^\q{foo}&&\q{bar}]/v');
|
||
assertEarlyError('/[^\q{foo}--\q{bar}]/v');
|
||
// Exceptions when negating the class is allowed:
|
||
// The "string" contains only single characters.
|
||
/[^\q{a|b|c}]/v;
|
||
// Not all operands of an intersection contain strings.
|
||
/[^\q{foo}&&\q{bar}&&a]/v;
|
||
// The first operand of a subtraction doesn't contain strings.
|
||
/[^a--\q{foo}--\q{bar}]/v;
|
||
|
||
// Negated properties of strings are not allowed.
|
||
assertEarlyError('/\P{Basic_Emoji}/v');
|
||
assertEarlyError('/\P{Emoji_Keycap_Sequence}/v');
|
||
assertEarlyError('/\P{RGI_Emoji_Modifier_Sequence}/v');
|
||
assertEarlyError('/\P{RGI_Emoji_Flag_Sequence}/v');
|
||
assertEarlyError('/\P{RGI_Emoji_Tag_Sequence}/v');
|
||
assertEarlyError('/\P{RGI_Emoji_ZWJ_Sequence}/v');
|
||
assertEarlyError('/\P{RGI_Emoji}/v');
|
||
|
||
const allAscii = Array.from(
|
||
{length: 127}, (v, i) => { return String.fromCharCode(i); });
|
||
|
||
function check(re, expectMatch, expectNoMatch = [], negationValid = true) {
|
||
if (expectNoMatch === undefined) {
|
||
const expectSet = new Set(expectMatch.map(val => {
|
||
return (typeof val == 'number') ? String(val) : val; }));
|
||
expectNoMatch = allAscii.filter(val => !expectSet.has(val));
|
||
}
|
||
for (const match of expectMatch) {
|
||
assertTrue(re.test(match), `${re}.test(${match})`);
|
||
}
|
||
for (const noMatch of expectNoMatch) {
|
||
assertFalse(re.test(noMatch), `${re}.test(${noMatch})`);
|
||
}
|
||
if (!negationValid) {
|
||
// Negation of classes containing strings is an error.
|
||
const negated = `[^${re.source}]`;
|
||
assertThrows(() => { new RegExp(negated, `${re.flags}`); }, SyntaxError,
|
||
`Invalid regular expression: /${negated}/: ` +
|
||
`Negated character class may contain strings`);
|
||
} else {
|
||
// Nest the current RegExp in a negated class and check expectations are
|
||
// inversed.
|
||
const inverted = new RegExp(`[^${re.source}]`, re.flags);
|
||
for (const match of expectMatch) {
|
||
assertFalse(inverted.test(match), `${inverted}.test(${match})`);
|
||
}
|
||
for (const noMatch of expectNoMatch) {
|
||
assertTrue(inverted.test(noMatch), `${inverted}.test(${noMatch})`);
|
||
}
|
||
}
|
||
}
|
||
|
||
// Union with nested class
|
||
check(
|
||
/[\da-f[xy][^[^z]]]/v, Array.from('0123456789abcdefxyz'),
|
||
Array.from('ghijklmnopqrstuv!?'));
|
||
|
||
// Intersections
|
||
check(/[\d&&[0-9]]/v, Array.from('0123456789'), []);
|
||
check(/[\d&&0]/v, [0], Array.from('123456789'));
|
||
check(/[\d&&9]/v, [9], Array.from('012345678'));
|
||
check(/[\d&&[02468]]/v, Array.from('02468'), Array.from('13579'));
|
||
check(/[\d&&[13579]]/v, Array.from('13579'), Array.from('02468'));
|
||
check(
|
||
/[\w&&[^a-zA-Z_]]/v, Array.from('0123456789'),
|
||
Array.from('abcdxyzABCDXYZ_!?'));
|
||
check(
|
||
/[^\w&&[a-zA-Z_]]/v, Array.from('0123456789!?'),
|
||
Array.from('abcdxyzABCDXYZ_'));
|
||
|
||
// Subtractions
|
||
check(/[\d--[!-%]]/v, Array.from('0123456789'));
|
||
check(/[\d--[A-Z]]/v, Array.from('0123456789'));
|
||
check(/[\d--[0-9]]/v, []);
|
||
check(/[\d--[\w]]/v, []);
|
||
check(/[\d--0]/v, Array.from('123456789'));
|
||
check(/[\d--9]/v, Array.from('012345678'));
|
||
check(/[[\d[a-c]]--9]/v, Array.from('012345678abc'));
|
||
check(/[\d--[02468]]/v, Array.from('13579'));
|
||
check(/[\d--[13579]]/v, Array.from('02468'));
|
||
check(/[[3-7]--[0-9]]/v, []);
|
||
check(/[[3-7]--[0-7]]/v, []);
|
||
check(/[[3-7]--[3-9]]/v, []);
|
||
check(/[[3-79]--[0-7]]/v, [9]);
|
||
check(/[[3-79]--[3-9]]/v, []);
|
||
check(/[[3-7]--[0-3]]/v, Array.from('4567'));
|
||
check(/[[3-7]--[0-5]]/v, Array.from('67'));
|
||
check(/[[3-7]--[7-9]]/v, Array.from('3456'));
|
||
check(/[[3-7]--[5-9]]/v, Array.from('34'));
|
||
check(/[[3-7a-c]--[0-3]]/v, Array.from('4567abc'));
|
||
check(/[[3-7a-c]--[0-5]]/v, Array.from('67abc'));
|
||
check(/[[3-7a-c]--[7-9]]/v, Array.from('3456abc'));
|
||
check(/[[3-7a-c]--[5-9]]/v, Array.from('34abc'));
|
||
check(/[[2-8]--[0-3]--5--[7-9]]/v, Array.from('46'));
|
||
check(/[[2-57-8]--[0-3]--[5-7]]/v, Array.from('48'));
|
||
check(/[[0-57-8]--[1-34]--[5-7]]/v, Array.from('08'));
|
||
check(/[\d--[^02468]]/v, Array.from('02468'));
|
||
check(/[\d--[^13579]]/v, Array.from('13579'));
|
||
|
||
// Ignore-Case
|
||
check(/[Ā-č]/v, Array.from('ĀāĂ㥹Ćć'), Array.from('abc'));
|
||
check(/[ĀĂĄĆ]/vi, Array.from('ĀāĂ㥹Ćć'), Array.from('abc'));
|
||
check(/[āăąć]/vi, Array.from('ĀāĂ㥹Ćć'), Array.from('abc'));
|
||
|
||
// String disjunctions
|
||
check(/[\q{foo|bar|0|5}]/v, ['foo', 'bar', 0, 5], ['fo', 'baz'], false)
|
||
check(/[\q{foo|bar}[05]]/v, ['foo', 'bar', 0, 5], ['fo', 'baz'], false)
|
||
check(/[\q{foo|bar|0|5}&&\q{bar}]/v, ['bar'], ['foo', 0, 5, 'fo', 'baz'], false)
|
||
// The second operand of the intersection doesn't contain strings, so the result
|
||
// will not contain strings and therefore negation is valid.
|
||
check(/[\q{foo|bar|0|5}&&\d]/v, [0, 5], ['foo', 'bar', 'fo', 'baz'], true)
|
||
check(/[\q{foo|bar|0|5}--\q{foo}]/v, ['bar', 0, 5], ['foo', 'fo', 'baz'], false)
|
||
check(/[\q{foo|bar|0|5}--\d]/v, ['foo', 'bar'], [0, 5, 'fo', 'baz'], false)
|
||
|
||
check(
|
||
/[\q{foo|bar|0|5}&&\q{bAr}]/vi, ['bar', 'bAr', 'BAR'],
|
||
['foo', 0, 5, 'fo', 'baz'], false)
|
||
check(
|
||
/[\q{foo|bar|0|5}--\q{FoO}]/vi, ['bar', 'bAr', 'BAR', 0, 5],
|
||
['foo', 'FOO', 'fo', 'baz'], false)
|
||
|
||
check(/[\q{ĀĂĄĆ|AaAc}&&\q{āăąć}]/vi, ['ĀĂĄĆ', 'āăąć'], ['AaAc'], false);
|
||
check(
|
||
/[\q{ĀĂĄĆ|AaAc}--\q{āăąć}]/vi, ['AaAc', 'aAaC'], ['ĀĂĄĆ', 'āăąć'],
|
||
false);
|
||
|
||
// Empty string disjunctions matches nothing, but succeeds.
|
||
let res = /[\q{}]/v.exec('foo');
|
||
assertNotNull(res);
|
||
assertEquals(1, res.length);
|
||
assertEquals('', res[0]);
|
||
|
||
// Ensure longest strings are matched first.
|
||
assertEquals(['xyz'], /[a-c\q{W|xy|xyz}]/v.exec('xyzabc'))
|
||
assertEquals(['xyz'], /[a-c\q{W|xyz|xy}]/v.exec('xyzabc'))
|
||
assertEquals(['xyz'], /[\q{W|xyz|xy}a-c]/v.exec('xyzabc'))
|
||
// Empty string is last.
|
||
assertEquals(['a'], /[\q{W|}a-c]/v.exec('abc'))
|
||
|
||
// Some more sophisticated tests taken from
|
||
// https://v8.dev/features/regexp-v-flag
|
||
assertTrue(/^\p{RGI_Emoji}$/v.test('⚽'));
|
||
assertTrue(/^\p{RGI_Emoji}$/v.test('👨🏾⚕️'));
|
||
assertFalse(/[\p{Script_Extensions=Greek}--π]/v.test('π'));
|
||
assertFalse(/[\p{Script_Extensions=Greek}--[αβγ]]/v.test('α'));
|
||
assertFalse(/[\p{Script_Extensions=Greek}--[α-γ]]/v.test('β'));
|
||
assertTrue(/[\p{Decimal_Number}--[0-9]]/v.test('𑜹'));
|
||
assertFalse(/[\p{Decimal_Number}--[0-9]]/v.test('4'));
|
||
assertTrue(
|
||
/^\p{RGI_Emoji_Tag_Sequence}$/v.test('🏴'));
|
||
assertFalse(
|
||
/^[\p{RGI_Emoji_Tag_Sequence}--\q{🏴}]$/v.test(
|
||
'🏴'));
|
||
assertTrue(/[\p{Script_Extensions=Greek}&&\p{Letter}]/v.test('π'));
|
||
assertFalse(/[\p{Script_Extensions=Greek}&&\p{Letter}]/v.test('𐆊'));
|
||
assertTrue(/[\p{White_Space}&&\p{ASCII}]/v.test('\n'));
|
||
assertFalse(/[\p{White_Space}&&\p{ASCII}]/v.test('\u2028'));
|
||
assertTrue(/[\p{Script_Extensions=Mongolian}&&\p{Number}]/v.test('᠗'));
|
||
assertFalse(/[\p{Script_Extensions=Mongolian}&&\p{Number}]/v.test('ᠴ'));
|
||
assertTrue(/^[\p{Emoji_Keycap_Sequence}\p{ASCII}\q{🇧🇪|abc}xyz0-9]$/v.test(
|
||
'4️⃣'));
|
||
assertTrue(
|
||
/^[\p{Emoji_Keycap_Sequence}\p{ASCII}\q{🇧🇪|abc}xyz0-9]$/v.test('_'));
|
||
assertTrue(
|
||
/^[\p{Emoji_Keycap_Sequence}\p{ASCII}\q{🇧🇪|abc}xyz0-9]$/v.test('🇧🇪'));
|
||
assertTrue(/^[\p{Emoji_Keycap_Sequence}\p{ASCII}\q{🇧🇪|abc}xyz0-9]$/v.test(
|
||
'abc'));
|
||
assertTrue(
|
||
/^[\p{Emoji_Keycap_Sequence}\p{ASCII}\q{🇧🇪|abc}xyz0-9]$/v.test('x'));
|
||
assertTrue(
|
||
/^[\p{Emoji_Keycap_Sequence}\p{ASCII}\q{🇧🇪|abc}xyz0-9]$/v.test('4'));
|
||
assertTrue(
|
||
/[\p{RGI_Emoji_Flag_Sequence}\p{RGI_Emoji_Tag_Sequence}]/v.test('🇧🇪'));
|
||
assertTrue(/[\p{RGI_Emoji_Flag_Sequence}\p{RGI_Emoji_Tag_Sequence}]/v.test(
|
||
'🏴'));
|
||
assertTrue(
|
||
/[\p{RGI_Emoji_Flag_Sequence}\p{RGI_Emoji_Tag_Sequence}]/v.test('🇨🇭'));
|
||
assertTrue(/[\p{RGI_Emoji_Flag_Sequence}\p{RGI_Emoji_Tag_Sequence}]/v.test(
|
||
'🏴'));
|
||
assertEquals('XXXXXX4#', 'aAbBcC4#'.replaceAll(/\p{Lowercase_Letter}/giv, 'X'));
|
||
assertEquals('XXXXXX4#', 'aAbBcC4#'.replaceAll(/[^\P{Lowercase_Letter}]/giv, 'X'));
|