v8/test/mjsunit/harmony/regexp-unicode-sets.js
pthier 0fec70aeb1 [regexp] Support properties of strings in unicode sets mode
Add support for properties of strings in unicode sets mode (/v).

Bug: v8:11935
Change-Id: Iae2f0182b1c42bb900c524ca406784b7b1b52842
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4051247
Commit-Queue: Patrick Thier <pthier@chromium.org>
Reviewed-by: Mathias Bynens <mathias@chromium.org>
Cr-Commit-Position: refs/heads/main@{#84481}
2022-11-25 10:29:16 +00:00

242 lines
9.5 KiB
JavaScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright 2022 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Flags: --harmony-regexp-unicode-sets
// u and v are not allowed together.
// The combination is rejected both in a regex literal (early error) and when
// passed to the RegExp constructor (SyntaxError thrown at runtime).
assertEarlyError('/./uv');
assertThrowsAtRuntime("new RegExp('.','uv')", SyntaxError);
// The v flag on its own is reflected by `flags` and by the `unicodeSets`
// accessor.
assertEquals('v', /./v.flags);
assertTrue(/./v.unicodeSets);
// Characters that require escaping within a character class in /v mode.
// Each one is placed unescaped, on its own, inside a class.
// The backslash entry is written as '\\' because assertEarlyError uses eval(),
// so the evaluated source has to contain a single literal backslash.
for (const syntaxChar of ['(', ')', '[', ']', '{', '}', '/', '-', '\\', '|']) {
  assertEarlyError(`/[${syntaxChar}]/v`);
}
// Punctuators that are rejected when they appear doubled inside a class.
for (const punctuator of '&!#$%*+,.:;<=>?@') {
  assertEarlyError(`/[${punctuator}${punctuator}]/v`);
}
// The first ^ negates the class. The following two are not valid.
assertEarlyError('/[^^^]/v');
for (const punctuator of ['`', '~']) {
  assertEarlyError(`/[${punctuator}${punctuator}]/v`);
}
// An intersection operator with a stray third ampersand on either side.
for (const body of ['a&&&', '&&&a']) {
  assertEarlyError(`/[${body}]/v`);
}
// The backslashes below are doubled because assertEarlyError uses eval():
// a single '\q' or '\p' inside a quoted string collapses to a plain 'q' or
// 'p', so the evaluated regex would never contain the escape under test.
// Unterminated string disjunction.
assertEarlyError('/[\\q{foo]/v');
assertEarlyError('/[\\q{foo|]/v');
// Negating classes containing strings is not allowed.
assertEarlyError('/[^\\q{foo}]/v');
assertEarlyError('/[^\\q{}]/v'); // Empty string counts as string.
assertEarlyError('/[^[\\q{foo}]]/v');
// The nested class is closed properly so that the error comes from the
// negation and not from an unterminated class.
assertEarlyError('/[^[\\p{Basic_Emoji}]]/v');
assertEarlyError('/[^\\q{foo}&&\\q{bar}]/v');
assertEarlyError('/[^\\q{foo}--\\q{bar}]/v');
// Exceptions when negating the class is allowed:
// Each case below is a bare regex literal statement, so it passes as long as
// the literal is accepted without throwing.
// The "string" contains only single characters.
/[^\q{a|b|c}]/v;
// Not all operands of an intersection contain strings.
/[^\q{foo}&&\q{bar}&&a]/v;
// The first operand of a subtraction doesn't contain strings.
/[^a--\q{foo}--\q{bar}]/v;
// Negated properties of strings are not allowed.
// The backslash is doubled because assertEarlyError uses eval(); a single
// '\P' inside a quoted string would collapse to a plain 'P' and the evaluated
// regex would not contain a property escape at all.
assertEarlyError('/\\P{Basic_Emoji}/v');
assertEarlyError('/\\P{Emoji_Keycap_Sequence}/v');
assertEarlyError('/\\P{RGI_Emoji_Modifier_Sequence}/v');
assertEarlyError('/\\P{RGI_Emoji_Flag_Sequence}/v');
assertEarlyError('/\\P{RGI_Emoji_Tag_Sequence}/v');
assertEarlyError('/\\P{RGI_Emoji_ZWJ_Sequence}/v');
assertEarlyError('/\\P{RGI_Emoji}/v');
// Every ASCII character, U+0000 through U+007F inclusive, as one-character
// strings. ASCII has 128 code points, so a length of 127 would omit U+007F.
const allAscii =
    Array.from({length: 128}, (_, code) => String.fromCharCode(code));
/**
 * Asserts that `re` matches every entry of `expectMatch` and none of the
 * non-matching inputs, then repeats the expectations against the negated
 * class `[^<re.source>]`.
 *
 * @param {RegExp} re Regular expression under test. Its `source` is nested
 *     inside `[^...]` and compiled with the same flags.
 * @param {Array<string|number>} expectMatch Inputs `re` must match.
 * @param {Array<string|number>=} expectNoMatch Inputs `re` must not match.
 *     When omitted, every entry of `allAscii` that is not listed in
 *     `expectMatch` is used instead.
 * @param {boolean=} negationValid Whether wrapping the source in a negated
 *     class is expected to compile. When false, constructing the negated
 *     RegExp must throw a SyntaxError.
 */
function check(re, expectMatch, expectNoMatch, negationValid = true) {
  // `expectNoMatch` deliberately has no default value: a default of `[]`
  // would replace `undefined` before the body runs, making the fallback
  // below unreachable and leaving two-argument calls without any negative
  // checks.
  let noMatchInputs = expectNoMatch;
  if (noMatchInputs === undefined) {
    // Numbers are normalized to strings so that they compare equal to the
    // one-character strings stored in allAscii.
    const expected = new Set(expectMatch.map(val => String(val)));
    noMatchInputs = allAscii.filter(ch => !expected.has(ch));
  }

  for (const match of expectMatch) {
    assertTrue(re.test(match), `${re}.test(${match})`);
  }
  for (const noMatch of noMatchInputs) {
    assertFalse(re.test(noMatch), `${re}.test(${noMatch})`);
  }

  const negated = `[^${re.source}]`;
  if (!negationValid) {
    // Negation of classes containing strings is an error.
    assertThrows(() => { new RegExp(negated, re.flags); }, SyntaxError,
        `Invalid regular expression: /${negated}/: ` +
        `Negated character class may contain strings`);
    return;
  }

  // Nest the current RegExp in a negated class and check expectations are
  // inversed.
  const inverted = new RegExp(negated, re.flags);
  for (const match of expectMatch) {
    assertFalse(inverted.test(match), `${inverted}.test(${match})`);
  }
  for (const noMatch of noMatchInputs) {
    assertTrue(inverted.test(noMatch), `${inverted}.test(${noMatch})`);
  }
}
// Union with nested class
// [^[^z]] is a doubly negated class; the expectations treat it like plain z.
check(
/[\da-f[xy][^[^z]]]/v, Array.from('0123456789abcdefxyz'),
Array.from('ghijklmnopqrstuv!?'));
// Intersections
check(/[\d&&[0-9]]/v, Array.from('0123456789'), []);
check(/[\d&&0]/v, [0], Array.from('123456789'));
check(/[\d&&9]/v, [9], Array.from('012345678'));
check(/[\d&&[02468]]/v, Array.from('02468'), Array.from('13579'));
check(/[\d&&[13579]]/v, Array.from('13579'), Array.from('02468'));
// Intersecting \w with "not a letter or underscore" leaves only the digits.
check(
/[\w&&[^a-zA-Z_]]/v, Array.from('0123456789'),
Array.from('abcdxyzABCDXYZ_!?'));
// The leading ^ negates the whole intersection, so everything except letters
// and the underscore is expected to match.
check(
/[^\w&&[a-zA-Z_]]/v, Array.from('0123456789!?'),
Array.from('abcdxyzABCDXYZ_'));
// Subtractions
// Subtracting a set that shares no members with \d leaves all digits.
check(/[\d--[!-%]]/v, Array.from('0123456789'));
check(/[\d--[A-Z]]/v, Array.from('0123456789'));
// Subtracting a superset leaves nothing.
check(/[\d--[0-9]]/v, []);
check(/[\d--[\w]]/v, []);
check(/[\d--0]/v, Array.from('123456789'));
check(/[\d--9]/v, Array.from('012345678'));
check(/[[\d[a-c]]--9]/v, Array.from('012345678abc'));
check(/[\d--[02468]]/v, Array.from('13579'));
check(/[\d--[13579]]/v, Array.from('02468'));
// Range operands that overlap fully, partially at either end, or not at all.
check(/[[3-7]--[0-9]]/v, []);
check(/[[3-7]--[0-7]]/v, []);
check(/[[3-7]--[3-9]]/v, []);
check(/[[3-79]--[0-7]]/v, [9]);
check(/[[3-79]--[3-9]]/v, []);
check(/[[3-7]--[0-3]]/v, Array.from('4567'));
check(/[[3-7]--[0-5]]/v, Array.from('67'));
check(/[[3-7]--[7-9]]/v, Array.from('3456'));
check(/[[3-7]--[5-9]]/v, Array.from('34'));
check(/[[3-7a-c]--[0-3]]/v, Array.from('4567abc'));
check(/[[3-7a-c]--[0-5]]/v, Array.from('67abc'));
check(/[[3-7a-c]--[7-9]]/v, Array.from('3456abc'));
check(/[[3-7a-c]--[5-9]]/v, Array.from('34abc'));
// Chained subtractions: every right-hand operand is removed from the first.
check(/[[2-8]--[0-3]--5--[7-9]]/v, Array.from('46'));
check(/[[2-57-8]--[0-3]--[5-7]]/v, Array.from('48'));
check(/[[0-57-8]--[1-34]--[5-7]]/v, Array.from('08'));
// Subtracting a negated class keeps only the members listed inside it.
check(/[\d--[^02468]]/v, Array.from('02468'));
check(/[\d--[^13579]]/v, Array.from('13579'));
// Ignore-Case
// NOTE(review): the first case uses /v without the i flag; the range Ā-č
// already contains both cases of the tested letters, so it matches them
// without case folding. Confirm whether /vi was intended.
check(/[Ā-č]/v, Array.from('ĀāĂ㥹Ćć'), Array.from('abc'));
check(/[ĀĂĄĆ]/vi, Array.from('ĀāĂ㥹Ćć'), Array.from('abc'));
check(/[āăąć]/vi, Array.from('ĀāĂ㥹Ćć'), Array.from('abc'));
// String disjunctions
check(/[\q{foo|bar|0|5}]/v, ['foo', 'bar', 0, 5], ['fo', 'baz'], false)
check(/[\q{foo|bar}[05]]/v, ['foo', 'bar', 0, 5], ['fo', 'baz'], false)
check(/[\q{foo|bar|0|5}&&\q{bar}]/v, ['bar'], ['foo', 0, 5, 'fo', 'baz'], false)
// The second operand of the intersection doesn't contain strings, so the result
// will not contain strings and therefore negation is valid.
check(/[\q{foo|bar|0|5}&&\d]/v, [0, 5], ['foo', 'bar', 'fo', 'baz'], true)
check(/[\q{foo|bar|0|5}--\q{foo}]/v, ['bar', 0, 5], ['foo', 'fo', 'baz'], false)
check(/[\q{foo|bar|0|5}--\d]/v, ['foo', 'bar'], [0, 5, 'fo', 'baz'], false)
check(
/[\q{foo|bar|0|5}&&\q{bAr}]/vi, ['bar', 'bAr', 'BAR'],
['foo', 0, 5, 'fo', 'baz'], false)
check(
/[\q{foo|bar|0|5}--\q{FoO}]/vi, ['bar', 'bAr', 'BAR', 0, 5],
['foo', 'FOO', 'fo', 'baz'], false)
check(/[\q{ĀĂĄĆ|AaAc}&&\q{āăąć}]/vi, ['ĀĂĄĆ', 'āăąć'], ['AaAc'], false);
check(
/[\q{ĀĂĄĆ|AaAc}--\q{āăąć}]/vi, ['AaAc', 'aAaC'], ['ĀĂĄĆ', 'āăąć'],
false);
// Empty string disjunctions matches nothing, but succeeds.
let res = /[\q{}]/v.exec('foo');
assertNotNull(res);
assertEquals(1, res.length);
assertEquals('', res[0]);
// Ensure longest strings are matched first.
assertEquals(['xyz'], /[a-c\q{W|xy|xyz}]/v.exec('xyzabc'))
assertEquals(['xyz'], /[a-c\q{W|xyz|xy}]/v.exec('xyzabc'))
assertEquals(['xyz'], /[\q{W|xyz|xy}a-c]/v.exec('xyzabc'))
// Empty string is last.
assertEquals(['a'], /[\q{W|}a-c]/v.exec('abc'))
// Some more sophisticated tests taken from
// https://v8.dev/features/regexp-v-flag
assertTrue(/^\p{RGI_Emoji}$/v.test(''));
assertTrue(/^\p{RGI_Emoji}$/v.test('👨🏾'));
assertFalse(/[\p{Script_Extensions=Greek}--π]/v.test('π'));
assertFalse(/[\p{Script_Extensions=Greek}--[αβγ]]/v.test('α'));
assertFalse(/[\p{Script_Extensions=Greek}--[α-γ]]/v.test('β'));
assertTrue(/[\p{Decimal_Number}--[0-9]]/v.test('𑜹'));
assertFalse(/[\p{Decimal_Number}--[0-9]]/v.test('4'));
assertTrue(
/^\p{RGI_Emoji_Tag_Sequence}$/v.test('🏴󠁧󠁢󠁳󠁣󠁴󠁿'));
assertFalse(
/^[\p{RGI_Emoji_Tag_Sequence}--\q{🏴󠁧󠁢󠁳󠁣󠁴󠁿}]$/v.test(
'🏴󠁧󠁢󠁳󠁣󠁴󠁿'));
assertTrue(/[\p{Script_Extensions=Greek}&&\p{Letter}]/v.test('π'));
assertFalse(/[\p{Script_Extensions=Greek}&&\p{Letter}]/v.test('𐆊'));
assertTrue(/[\p{White_Space}&&\p{ASCII}]/v.test('\n'));
assertFalse(/[\p{White_Space}&&\p{ASCII}]/v.test('\u2028'));
assertTrue(/[\p{Script_Extensions=Mongolian}&&\p{Number}]/v.test(''));
assertFalse(/[\p{Script_Extensions=Mongolian}&&\p{Number}]/v.test(''));
assertTrue(/^[\p{Emoji_Keycap_Sequence}\p{ASCII}\q{🇧🇪|abc}xyz0-9]$/v.test(
'4⃣'));
assertTrue(
/^[\p{Emoji_Keycap_Sequence}\p{ASCII}\q{🇧🇪|abc}xyz0-9]$/v.test('_'));
assertTrue(
/^[\p{Emoji_Keycap_Sequence}\p{ASCII}\q{🇧🇪|abc}xyz0-9]$/v.test('🇧🇪'));
assertTrue(/^[\p{Emoji_Keycap_Sequence}\p{ASCII}\q{🇧🇪|abc}xyz0-9]$/v.test(
'abc'));
assertTrue(
/^[\p{Emoji_Keycap_Sequence}\p{ASCII}\q{🇧🇪|abc}xyz0-9]$/v.test('x'));
assertTrue(
/^[\p{Emoji_Keycap_Sequence}\p{ASCII}\q{🇧🇪|abc}xyz0-9]$/v.test('4'));
assertTrue(
/[\p{RGI_Emoji_Flag_Sequence}\p{RGI_Emoji_Tag_Sequence}]/v.test('🇧🇪'));
assertTrue(/[\p{RGI_Emoji_Flag_Sequence}\p{RGI_Emoji_Tag_Sequence}]/v.test(
'🏴󠁧󠁢󠁥󠁮󠁧󠁿'));
assertTrue(
/[\p{RGI_Emoji_Flag_Sequence}\p{RGI_Emoji_Tag_Sequence}]/v.test('🇨🇭'));
assertTrue(/[\p{RGI_Emoji_Flag_Sequence}\p{RGI_Emoji_Tag_Sequence}]/v.test(
'🏴󠁧󠁢󠁷󠁬󠁳󠁿'));
assertEquals('XXXXXX4#', 'aAbBcC4#'.replaceAll(/\p{Lowercase_Letter}/giv, 'X'));
assertEquals('XXXXXX4#', 'aAbBcC4#'.replaceAll(/[^\P{Lowercase_Letter}]/giv, 'X'));