diff --git a/src/isolate.h b/src/isolate.h index 1823764e75..01a1e402e0 100644 --- a/src/isolate.h +++ b/src/isolate.h @@ -970,6 +970,7 @@ class Isolate final : private HiddenFactory { ThreadManager* thread_manager() { return thread_manager_; } +#ifndef V8_INTL_SUPPORT unibrow::Mapping* jsregexp_uncanonicalize() { return &jsregexp_uncanonicalize_; } @@ -978,14 +979,15 @@ class Isolate final : private HiddenFactory { return &jsregexp_canonrange_; } - RuntimeState* runtime_state() { return &runtime_state_; } - - Builtins* builtins() { return &builtins_; } - unibrow::Mapping* regexp_macro_assembler_canonicalize() { return &regexp_macro_assembler_canonicalize_; } +#endif // !V8_INTL_SUPPORT + + RuntimeState* runtime_state() { return &runtime_state_; } + + Builtins* builtins() { return &builtins_; } RegExpStack* regexp_stack() { return regexp_stack_; } @@ -996,11 +998,6 @@ class Isolate final : private HiddenFactory { std::vector* regexp_indices() { return &regexp_indices_; } - unibrow::Mapping* - interp_canonicalize_mapping() { - return &regexp_macro_assembler_canonicalize_; - } - Debug* debug() { return debug_; } bool* is_profiling_address() { return &is_profiling_; } @@ -1642,10 +1639,12 @@ class Isolate final : private HiddenFactory { RuntimeState runtime_state_; Builtins builtins_; SetupIsolateDelegate* setup_delegate_ = nullptr; +#ifndef V8_INTL_SUPPORT unibrow::Mapping jsregexp_uncanonicalize_; unibrow::Mapping jsregexp_canonrange_; unibrow::Mapping regexp_macro_assembler_canonicalize_; +#endif // !V8_INTL_SUPPORT RegExpStack* regexp_stack_ = nullptr; std::vector regexp_indices_; DateCache* date_cache_ = nullptr; diff --git a/src/regexp/interpreter-irregexp.cc b/src/regexp/interpreter-irregexp.cc index 060c373476..55b862dc56 100644 --- a/src/regexp/interpreter-irregexp.cc +++ b/src/regexp/interpreter-irregexp.cc @@ -21,8 +21,6 @@ namespace v8 { namespace internal { -using Canonicalize = unibrow::Mapping; - static bool BackRefMatchesNoCase(Isolate* isolate, int from, int 
current, int len, Vector subject, bool unicode) { diff --git a/src/regexp/jsregexp.cc b/src/regexp/jsregexp.cc index 8d608492bd..7995505226 100644 --- a/src/regexp/jsregexp.cc +++ b/src/regexp/jsregexp.cc @@ -1543,7 +1543,26 @@ void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, // that cannot occur in the source string because it is Latin1. static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, bool one_byte_subject, - unibrow::uchar* letters) { + unibrow::uchar* letters, + int letter_length) { +#ifdef V8_INTL_SUPPORT + icu::UnicodeSet set; + set.add(character); + set.closeOver(USET_CASE_INSENSITIVE); + int32_t range_count = set.getRangeCount(); + int items = 0; + for (int32_t i = 0; i < range_count; i++) { + UChar32 start = set.getRangeStart(i); + UChar32 end = set.getRangeEnd(i); + CHECK(end - start + 1 + items <= letter_length); // Range is inclusive. + while (start <= end) { + if (one_byte_subject && start > String::kMaxOneByteCharCode) break; + letters[items++] = static_cast<unibrow::uchar>(start); + start++; + } + } + return items; +#else int length = isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); // Unibrow returns 0 or 1 for characters where case independence is @@ -1564,9 +1583,9 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, } return length; +#endif // V8_INTL_SUPPORT } - static inline bool EmitSimpleCharacter(Isolate* isolate, RegExpCompiler* compiler, uc16 c, @@ -1599,8 +1618,8 @@ static inline bool EmitAtomNonLetter(Isolate* isolate, bool preloaded) { RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); bool one_byte = compiler->one_byte(); - unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; - int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); + unibrow::uchar chars[4]; + int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4); if (length < 1) { // This can't match. Must be an one-byte subject and a non-one-byte // character. 
We do not need to do anything since the one-byte pass @@ -1675,8 +1694,8 @@ static inline bool EmitAtomLetter(Isolate* isolate, bool preloaded) { RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); bool one_byte = compiler->one_byte(); - unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; - int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); + unibrow::uchar chars[4]; + int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4); if (length <= 1) return false; // We may not need to check against the end of the input string // if this character lies before a character that matched. @@ -1684,7 +1703,6 @@ static inline bool EmitAtomLetter(Isolate* isolate, macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); } Label ok; - DCHECK_EQ(4, unibrow::Ecma262UnCanonicalize::kMaxWidth); switch (length) { case 2: { if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0], @@ -2480,9 +2498,9 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, details->positions(characters_filled_in); uc16 c = quarks[i]; if (elm.atom()->ignore_case()) { - unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; - int length = GetCaseIndependentLetters(isolate, c, - compiler->one_byte(), chars); + unibrow::uchar chars[4]; + int length = GetCaseIndependentLetters( + isolate, c, compiler->one_byte(), chars, 4); if (length == 0) { // This can happen because all case variants are non-Latin1, but we // know the input is Latin1. 
@@ -5110,6 +5128,17 @@ int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { return 0; } +#ifdef V8_INTL_SUPPORT + +// Case-insensitive comparison of the first characters of two atoms. +int CompareFirstCharCaseInsensitve(RegExpTree* const* a, RegExpTree* const* b) { + RegExpAtom* atom1 = (*a)->AsAtom(); + RegExpAtom* atom2 = (*b)->AsAtom(); + icu::UnicodeString character1(atom1->data().at(0)); + return character1.caseCompare(atom2->data().at(0), U_FOLD_CASE_DEFAULT); +} + +#else static unibrow::uchar Canonical( unibrow::Mapping* canonicalize, @@ -5122,7 +5151,6 @@ static unibrow::uchar Canonical( return canonical; } - int CompareFirstCharCaseIndependent( unibrow::Mapping* canonicalize, RegExpTree* const* a, RegExpTree* const* b) { @@ -5137,7 +5165,7 @@ int CompareFirstCharCaseIndependent( } return static_cast(character1) - static_cast(character2); } - +#endif // V8_INTL_SUPPORT // We can stable sort runs of atoms, since the order does not matter if they // start with different characters. @@ -5173,6 +5201,10 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { DCHECK_LE(i, alternatives->length()); DCHECK_LE(first_atom, i); if (IgnoreCase(flags)) { +#ifdef V8_INTL_SUPPORT + alternatives->StableSort(CompareFirstCharCaseInsensitve, first_atom, + i - first_atom); +#else unibrow::Mapping* canonicalize = compiler->isolate()->regexp_macro_assembler_canonicalize(); auto compare_closure = @@ -5180,6 +5212,7 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { return CompareFirstCharCaseIndependent(canonicalize, a, b); }; alternatives->StableSort(compare_closure, first_atom, i - first_atom); +#endif // V8_INTL_SUPPORT } else { alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); } @@ -5206,7 +5239,11 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { } RegExpAtom* const atom = alternative->AsAtom(); JSRegExp::Flags flags = atom->flags(); +#ifdef V8_INTL_SUPPORT + icu::UnicodeString 
common_prefix(atom->data().at(0)); +#else unibrow::uchar common_prefix = atom->data().at(0); +#endif // V8_INTL_SUPPORT int first_with_prefix = i; int prefix_length = atom->length(); i++; @@ -5215,6 +5252,14 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { if (!alternative->IsAtom()) break; RegExpAtom* const atom = alternative->AsAtom(); if (atom->flags() != flags) break; +#ifdef V8_INTL_SUPPORT + icu::UnicodeString new_prefix(atom->data().at(0)); + if (new_prefix != common_prefix) { + if (!IgnoreCase(flags)) break; + if (common_prefix.caseCompare(new_prefix, U_FOLD_CASE_DEFAULT) != 0) + break; + } +#else unibrow::uchar new_prefix = atom->data().at(0); if (new_prefix != common_prefix) { if (!IgnoreCase(flags)) break; @@ -5224,6 +5269,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { common_prefix = Canonical(canonicalize, common_prefix); if (new_prefix != common_prefix) break; } +#endif // V8_INTL_SUPPORT prefix_length = Min(prefix_length, atom->length()); i++; } @@ -5889,6 +5935,53 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, bool is_one_byte) { CharacterRange::Canonicalize(ranges); int range_count = ranges->length(); +#ifdef V8_INTL_SUPPORT + icu::UnicodeSet already_added; + icu::UnicodeSet others; + for (int i = 0; i < range_count; i++) { + CharacterRange range = ranges->at(i); + uc32 bottom = range.from(); + if (bottom > String::kMaxUtf16CodeUnit) continue; + uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit); + // Nothing to be done for surrogates. 
+ if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue; + if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { + if (bottom > String::kMaxOneByteCharCode) continue; + if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; + } + already_added.add(bottom, top); + while (bottom <= top) { + icu::UnicodeString upper(bottom); + upper.toUpper(); + icu::UnicodeSet expanded(bottom, bottom); + expanded.closeOver(USET_CASE_INSENSITIVE); + for (int32_t i = 0; i < expanded.getRangeCount(); i++) { + UChar32 start = expanded.getRangeStart(i); + UChar32 end = expanded.getRangeEnd(i); + while (start <= end) { + icu::UnicodeString upper2(start); + upper2.toUpper(); + // Only add if the upper case are the same. + if (upper[0] == upper2[0]) { + others.add(start); + } + start++; + } + } + bottom++; + } + } + others.removeAll(already_added); + for (int32_t i = 0; i < others.getRangeCount(); i++) { + UChar32 start = others.getRangeStart(i); + UChar32 end = others.getRangeEnd(i); + if (start == end) { + ranges->Add(CharacterRange::Singleton(start), zone); + } else { + ranges->Add(CharacterRange::Range(start, end), zone); + } + } +#else for (int i = 0; i < range_count; i++) { CharacterRange range = ranges->at(i); uc32 bottom = range.from(); @@ -5954,9 +6047,9 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, } } } +#endif // V8_INTL_SUPPORT } - bool CharacterRange::IsCanonical(ZoneList* ranges) { DCHECK_NOT_NULL(ranges); int n = ranges->length(); @@ -6434,10 +6527,10 @@ void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget, } uc16 character = atom->data()[j]; if (IgnoreCase(atom->flags())) { - unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; + unibrow::uchar chars[4]; int length = GetCaseIndependentLetters( isolate, character, bm->max_char() == String::kMaxOneByteCharCode, - chars); + chars, 4); for (int j = 0; j < length; j++) { bm->Set(offset, chars[j]); } diff --git 
a/src/regexp/regexp-macro-assembler.cc b/src/regexp/regexp-macro-assembler.cc index 323e805fcd..a75c45d24e 100644 --- a/src/regexp/regexp-macro-assembler.cc +++ b/src/regexp/regexp-macro-assembler.cc @@ -12,6 +12,7 @@ #ifdef V8_INTL_SUPPORT #include "unicode/uchar.h" +#include "unicode/unistr.h" #endif // V8_INTL_SUPPORT namespace v8 { @@ -33,37 +34,17 @@ int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1, // A GC might move the calling generated code and invalidate the // return address on the stack. DCHECK_EQ(0, byte_length % 2); + +#ifdef V8_INTL_SUPPORT + int32_t length = (int32_t)(byte_length >> 1); + icu::UnicodeString uni_str_1(reinterpret_cast(byte_offset1), + length); + return uni_str_1.caseCompare(reinterpret_cast(byte_offset2), + length, U_FOLD_CASE_DEFAULT) == 0; +#else uc16* substring1 = reinterpret_cast(byte_offset1); uc16* substring2 = reinterpret_cast(byte_offset2); size_t length = byte_length >> 1; - -#ifdef V8_INTL_SUPPORT - if (isolate == nullptr) { - for (size_t i = 0; i < length; i++) { - uc32 c1 = substring1[i]; - uc32 c2 = substring2[i]; - if (unibrow::Utf16::IsLeadSurrogate(c1)) { - // Non-BMP characters do not have case-equivalents in the BMP. - // Both have to be non-BMP for them to be able to match. 
- if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0; - if (i + 1 < length) { - uc16 c1t = substring1[i + 1]; - uc16 c2t = substring2[i + 1]; - if (unibrow::Utf16::IsTrailSurrogate(c1t) && - unibrow::Utf16::IsTrailSurrogate(c2t)) { - c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t); - c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t); - i++; - } - } - } - c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT); - c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT); - if (c1 != c2) return 0; - } - return 1; - } -#endif // V8_INTL_SUPPORT DCHECK_NOT_NULL(isolate); unibrow::Mapping* canonicalize = isolate->regexp_macro_assembler_canonicalize(); @@ -83,6 +64,7 @@ int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1, } } return 1; +#endif // V8_INTL_SUPPORT } diff --git a/src/unicode-inl.h b/src/unicode-inl.h index c96d78438a..21292ca59c 100644 --- a/src/unicode-inl.h +++ b/src/unicode-inl.h @@ -11,6 +11,7 @@ namespace unibrow { +#ifndef V8_INTL_SUPPORT template bool Predicate::get(uchar code_point) { CacheEntry entry = entries_[code_point & kMask]; if (entry.code_point() == code_point) return entry.value(); @@ -55,6 +56,7 @@ template int Mapping::CalculateValue(uchar c, uchar n, return length; } } +#endif // !V8_INTL_SUPPORT // Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they // stream in. This **must** be followed by a call to ValueOfIncrementalFinish diff --git a/src/unicode.cc b/src/unicode.cc index c7818dbaa0..70a084bc22 100644 --- a/src/unicode.cc +++ b/src/unicode.cc @@ -15,8 +15,11 @@ namespace unibrow { +#ifndef V8_INTL_SUPPORT static const int kStartBit = (1 << 30); static const int kChunkBits = (1 << 13); +#endif // !V8_INTL_SUPPORT + static const uchar kSentinel = static_cast(-1); /** @@ -28,7 +31,7 @@ typedef signed short int16_t; // NOLINT typedef unsigned short uint16_t; // NOLINT typedef int int32_t; // NOLINT - +#ifndef V8_INTL_SUPPORT // All access to the character table should go through this function. 
template static inline uchar TableGet(const int32_t* table, int index) { @@ -44,7 +47,6 @@ static inline bool IsStart(int32_t entry) { return (entry & kStartBit) != 0; } -#ifndef V8_INTL_SUPPORT /** * Look up a character in the Unicode table using a mix of binary and * interpolation search. For a uniformly distributed array @@ -92,6 +94,7 @@ struct MultiCharacterSpecialCase { uchar chars[kW]; }; +#ifndef V8_INTL_SUPPORT // Look up the mapping for the given character in the specified table, // which is of the specified length and uses the specified special case // mapping for multi-char mappings. The next parameter is the character @@ -192,6 +195,7 @@ static int LookupMapping(const int32_t* table, return 0; } } +#endif // !V8_INTL_SUPPORT // This method decodes an UTF-8 value according to RFC 3629 and // https://encoding.spec.whatwg.org/#utf-8-decoder . @@ -1596,7 +1600,6 @@ int ToUppercase::Convert(uchar c, default: return 0; } } -#endif // !V8_INTL_SUPPORT static const MultiCharacterSpecialCase<1> kEcma262CanonicalizeMultiStrings0[1] = { // NOLINT {{kSentinel}} }; // NOLINT @@ -3072,98 +3075,75 @@ int CanonicalizationRange::Convert(uchar c, } } - const uchar UnicodeData::kMaxCodePoint = 0xFFFD; int UnicodeData::GetByteCount() { -#ifndef V8_INTL_SUPPORT // NOLINT - return kUppercaseTable0Size * sizeof(int32_t) // NOLINT - + kUppercaseTable1Size * sizeof(int32_t) // NOLINT - + kUppercaseTable5Size * sizeof(int32_t) // NOLINT - + kUppercaseTable7Size * sizeof(int32_t) // NOLINT - + kLetterTable0Size * sizeof(int32_t) // NOLINT - + kLetterTable1Size * sizeof(int32_t) // NOLINT - + kLetterTable2Size * sizeof(int32_t) // NOLINT - + kLetterTable3Size * sizeof(int32_t) // NOLINT - + kLetterTable4Size * sizeof(int32_t) // NOLINT - + kLetterTable5Size * sizeof(int32_t) // NOLINT - + kLetterTable6Size * sizeof(int32_t) // NOLINT - + kLetterTable7Size * sizeof(int32_t) // NOLINT - + kID_StartTable0Size * sizeof(int32_t) // NOLINT - + kID_StartTable1Size * sizeof(int32_t) // 
NOLINT - + kID_StartTable2Size * sizeof(int32_t) // NOLINT - + kID_StartTable3Size * sizeof(int32_t) // NOLINT - + kID_StartTable4Size * sizeof(int32_t) // NOLINT - + kID_StartTable5Size * sizeof(int32_t) // NOLINT - + kID_StartTable6Size * sizeof(int32_t) // NOLINT - + kID_StartTable7Size * sizeof(int32_t) // NOLINT - + kID_ContinueTable0Size * sizeof(int32_t) // NOLINT - + kID_ContinueTable1Size * sizeof(int32_t) // NOLINT - + kID_ContinueTable5Size * sizeof(int32_t) // NOLINT - + kID_ContinueTable7Size * sizeof(int32_t) // NOLINT - + kWhiteSpaceTable0Size * sizeof(int32_t) // NOLINT - + kWhiteSpaceTable1Size * sizeof(int32_t) // NOLINT - + kWhiteSpaceTable7Size * sizeof(int32_t) // NOLINT - + - kToLowercaseMultiStrings0Size * - sizeof(MultiCharacterSpecialCase<2>) // NOLINT - + - kToLowercaseMultiStrings1Size * - sizeof(MultiCharacterSpecialCase<1>) // NOLINT - + - kToLowercaseMultiStrings5Size * - sizeof(MultiCharacterSpecialCase<1>) // NOLINT - + - kToLowercaseMultiStrings7Size * - sizeof(MultiCharacterSpecialCase<1>) // NOLINT - + - kToUppercaseMultiStrings0Size * - sizeof(MultiCharacterSpecialCase<3>) // NOLINT - + - kToUppercaseMultiStrings1Size * - sizeof(MultiCharacterSpecialCase<1>) // NOLINT - + - kToUppercaseMultiStrings5Size * - sizeof(MultiCharacterSpecialCase<1>) // NOLINT - + - kToUppercaseMultiStrings7Size * - sizeof(MultiCharacterSpecialCase<3>) // NOLINT -#else - return -#endif // !V8_INTL_SUPPORT - + - kEcma262CanonicalizeMultiStrings0Size * - sizeof(MultiCharacterSpecialCase<1>) // NOLINT - + - kEcma262CanonicalizeMultiStrings1Size * - sizeof(MultiCharacterSpecialCase<1>) // NOLINT - + - kEcma262CanonicalizeMultiStrings5Size * - sizeof(MultiCharacterSpecialCase<1>) // NOLINT - + - kEcma262CanonicalizeMultiStrings7Size * - sizeof(MultiCharacterSpecialCase<1>) // NOLINT - + - kEcma262UnCanonicalizeMultiStrings0Size * - sizeof(MultiCharacterSpecialCase<4>) // NOLINT - + - kEcma262UnCanonicalizeMultiStrings1Size * - 
sizeof(MultiCharacterSpecialCase<2>) // NOLINT - + - kEcma262UnCanonicalizeMultiStrings5Size * - sizeof(MultiCharacterSpecialCase<2>) // NOLINT - + - kEcma262UnCanonicalizeMultiStrings7Size * - sizeof(MultiCharacterSpecialCase<2>) // NOLINT - + - kCanonicalizationRangeMultiStrings0Size * - sizeof(MultiCharacterSpecialCase<1>) // NOLINT - + - kCanonicalizationRangeMultiStrings1Size * - sizeof(MultiCharacterSpecialCase<1>) // NOLINT - + - kCanonicalizationRangeMultiStrings7Size * - sizeof(MultiCharacterSpecialCase<1>); // NOLINT + return kUppercaseTable0Size * sizeof(int32_t) // NOLINT + + kUppercaseTable1Size * sizeof(int32_t) // NOLINT + + kUppercaseTable5Size * sizeof(int32_t) // NOLINT + + kUppercaseTable7Size * sizeof(int32_t) // NOLINT + + kLetterTable0Size * sizeof(int32_t) // NOLINT + + kLetterTable1Size * sizeof(int32_t) // NOLINT + + kLetterTable2Size * sizeof(int32_t) // NOLINT + + kLetterTable3Size * sizeof(int32_t) // NOLINT + + kLetterTable4Size * sizeof(int32_t) // NOLINT + + kLetterTable5Size * sizeof(int32_t) // NOLINT + + kLetterTable6Size * sizeof(int32_t) // NOLINT + + kLetterTable7Size * sizeof(int32_t) // NOLINT + + kID_StartTable0Size * sizeof(int32_t) // NOLINT + + kID_StartTable1Size * sizeof(int32_t) // NOLINT + + kID_StartTable2Size * sizeof(int32_t) // NOLINT + + kID_StartTable3Size * sizeof(int32_t) // NOLINT + + kID_StartTable4Size * sizeof(int32_t) // NOLINT + + kID_StartTable5Size * sizeof(int32_t) // NOLINT + + kID_StartTable6Size * sizeof(int32_t) // NOLINT + + kID_StartTable7Size * sizeof(int32_t) // NOLINT + + kID_ContinueTable0Size * sizeof(int32_t) // NOLINT + + kID_ContinueTable1Size * sizeof(int32_t) // NOLINT + + kID_ContinueTable5Size * sizeof(int32_t) // NOLINT + + kID_ContinueTable7Size * sizeof(int32_t) // NOLINT + + kWhiteSpaceTable0Size * sizeof(int32_t) // NOLINT + + kWhiteSpaceTable1Size * sizeof(int32_t) // NOLINT + + kWhiteSpaceTable7Size * sizeof(int32_t) // NOLINT + + kToLowercaseMultiStrings0Size * + 
sizeof(MultiCharacterSpecialCase<2>) // NOLINT + + kToLowercaseMultiStrings1Size * + sizeof(MultiCharacterSpecialCase<1>) // NOLINT + + kToLowercaseMultiStrings5Size * + sizeof(MultiCharacterSpecialCase<1>) // NOLINT + + kToLowercaseMultiStrings7Size * + sizeof(MultiCharacterSpecialCase<1>) // NOLINT + + kToUppercaseMultiStrings0Size * + sizeof(MultiCharacterSpecialCase<3>) // NOLINT + + kToUppercaseMultiStrings1Size * + sizeof(MultiCharacterSpecialCase<1>) // NOLINT + + kToUppercaseMultiStrings5Size * + sizeof(MultiCharacterSpecialCase<1>) // NOLINT + + kToUppercaseMultiStrings7Size * + sizeof(MultiCharacterSpecialCase<3>) // NOLINT + + kEcma262CanonicalizeMultiStrings0Size * + sizeof(MultiCharacterSpecialCase<1>) // NOLINT + + kEcma262CanonicalizeMultiStrings1Size * + sizeof(MultiCharacterSpecialCase<1>) // NOLINT + + kEcma262CanonicalizeMultiStrings5Size * + sizeof(MultiCharacterSpecialCase<1>) // NOLINT + + kEcma262CanonicalizeMultiStrings7Size * + sizeof(MultiCharacterSpecialCase<1>) // NOLINT + + kEcma262UnCanonicalizeMultiStrings0Size * + sizeof(MultiCharacterSpecialCase<4>) // NOLINT + + kEcma262UnCanonicalizeMultiStrings1Size * + sizeof(MultiCharacterSpecialCase<2>) // NOLINT + + kEcma262UnCanonicalizeMultiStrings5Size * + sizeof(MultiCharacterSpecialCase<2>) // NOLINT + + kEcma262UnCanonicalizeMultiStrings7Size * + sizeof(MultiCharacterSpecialCase<2>) // NOLINT + + kCanonicalizationRangeMultiStrings0Size * + sizeof(MultiCharacterSpecialCase<1>) // NOLINT + + kCanonicalizationRangeMultiStrings1Size * + sizeof(MultiCharacterSpecialCase<1>) // NOLINT + + kCanonicalizationRangeMultiStrings7Size * + sizeof(MultiCharacterSpecialCase<1>); // NOLINT } +#endif // !V8_INTL_SUPPORT } // namespace unibrow diff --git a/src/unicode.h b/src/unicode.h index 1bebfe3e8a..7a6e848017 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -25,6 +25,7 @@ typedef unsigned char byte; */ const int kMaxMappingSize = 4; +#ifndef V8_INTL_SUPPORT template class Predicate { public: @@ 
-87,7 +88,6 @@ class Mapping { CacheEntry entries_[kSize]; }; - class UnicodeData { private: friend class Test; @@ -95,6 +95,7 @@ class UnicodeData { static const uchar kMaxCodePoint; }; +#endif // !V8_INTL_SUPPORT class Utf16 { public: @@ -227,7 +228,6 @@ struct ToUppercase { uchar* result, bool* allow_caching_ptr); }; -#endif struct Ecma262Canonicalize { static const int kMaxWidth = 1; static int Convert(uchar c, @@ -249,6 +249,7 @@ struct CanonicalizationRange { uchar* result, bool* allow_caching_ptr); }; +#endif // !V8_INTL_SUPPORT } // namespace unibrow diff --git a/test/cctest/test-regexp.cc b/test/cctest/test-regexp.cc index 49dcc49c3e..f5889dd3d6 100644 --- a/test/cctest/test-regexp.cc +++ b/test/cctest/test-regexp.cc @@ -1488,7 +1488,7 @@ TEST(AddInverseToTable) { CHECK(table.Get(0xFFFF)->Get(0)); } - +#ifndef V8_INTL_SUPPORT static uc32 canonicalize(uc32 c) { unibrow::uchar canon[unibrow::Ecma262Canonicalize::kMaxWidth]; int count = unibrow::Ecma262Canonicalize::Convert(c, '\0', canon, nullptr); @@ -1500,7 +1500,6 @@ static uc32 canonicalize(uc32 c) { } } - TEST(LatinCanonicalize) { unibrow::Mapping un_canonicalize; for (unibrow::uchar lower = 'a'; lower <= 'z'; lower++) { @@ -1514,7 +1513,6 @@ TEST(LatinCanonicalize) { } for (uc32 c = 128; c < (1 << 21); c++) CHECK_GE(canonicalize(c), 128); -#ifndef V8_INTL_SUPPORT unibrow::Mapping to_upper; // Canonicalization is only defined for the Basic Multilingual Plane. 
for (uc32 c = 0; c < (1 << 16); c++) { @@ -1529,10 +1527,8 @@ TEST(LatinCanonicalize) { u = c; CHECK_EQ(u, canonicalize(c)); } -#endif } - static uc32 CanonRangeEnd(uc32 c) { unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth]; int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, nullptr); @@ -1588,6 +1584,7 @@ TEST(UncanonicalizeEquivalence) { } } +#endif static void TestRangeCaseIndependence(Isolate* isolate, CharacterRange input, Vector expected) { @@ -1621,21 +1618,26 @@ TEST(CharacterRangeCaseIndependence) { CharacterRange::Singleton('A')); TestSimpleRangeCaseIndependence(isolate, CharacterRange::Singleton('z'), CharacterRange::Singleton('Z')); +#ifndef V8_INTL_SUPPORT TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('a', 'z'), CharacterRange::Range('A', 'Z')); +#endif // !V8_INTL_SUPPORT TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('c', 'f'), CharacterRange::Range('C', 'F')); TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('a', 'b'), CharacterRange::Range('A', 'B')); TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('y', 'z'), CharacterRange::Range('Y', 'Z')); +#ifndef V8_INTL_SUPPORT TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('a' - 1, 'z' + 1), CharacterRange::Range('A', 'Z')); TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('A', 'Z'), CharacterRange::Range('a', 'z')); +#endif // !V8_INTL_SUPPORT TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('C', 'F'), CharacterRange::Range('c', 'f')); +#ifndef V8_INTL_SUPPORT TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('A' - 1, 'Z' + 1), CharacterRange::Range('a', 'z')); @@ -1644,6 +1646,7 @@ TEST(CharacterRangeCaseIndependence) { // whole block at a time. 
TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('A', 'k'), CharacterRange::Range('a', 'z')); +#endif // !V8_INTL_SUPPORT } diff --git a/test/intl/regress-8348.js b/test/intl/regress-8348.js new file mode 100644 index 0000000000..c243ea44e8 --- /dev/null +++ b/test/intl/regress-8348.js @@ -0,0 +1,19 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +function testRegExpI(text, msg) { + assertTrue(new RegExp(text, 'i').test(text.toUpperCase()), msg + ': ' + text); +} + +testRegExpI('abc', 'ASCII'); +testRegExpI('ABC', 'ASCII'); +testRegExpI('rst', 'ASCII'); +testRegExpI('RST', 'ASCII'); + +testRegExpI('αβψδεφ', 'Greek'); + +testRegExpI('\u1c80\u1c81', 'Historic Cyrillic added in Unicode 9'); +testRegExpI('\u026A', 'Dotless I, uppercase form added in Unicode 9'); + +testRegExpI('ოქტ', 'Georgian Mtavruli added in Unicode 11');