[Intl] Cutting 43K by removing Unibrow when ICU available

This frees about 43K of binary size to make room for landing ICU 64.

Size change (x64.release):
d8 before: 23,683,192 bytes
d8 after: 23,639,296 bytes
Reduction: 43,896 bytes

Bugs: v8:8348

Change-Id: I057f7d59e955a2e5e017873e5b3b5daf5b142ae2
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1478710
Commit-Queue: Frank Tang <ftang@chromium.org>
Reviewed-by: Yang Guo <yangguo@chromium.org>
Cr-Commit-Position: refs/heads/master@{#60616}
This commit is contained in:
Frank Tang 2019-04-03 10:23:57 -07:00 committed by Commit Bot
parent 46e944dbad
commit bb24140cb3
9 changed files with 230 additions and 153 deletions

View File

@ -970,6 +970,7 @@ class Isolate final : private HiddenFactory {
ThreadManager* thread_manager() { return thread_manager_; }
#ifndef V8_INTL_SUPPORT
unibrow::Mapping<unibrow::Ecma262UnCanonicalize>* jsregexp_uncanonicalize() {
return &jsregexp_uncanonicalize_;
}
@ -978,14 +979,15 @@ class Isolate final : private HiddenFactory {
return &jsregexp_canonrange_;
}
RuntimeState* runtime_state() { return &runtime_state_; }
Builtins* builtins() { return &builtins_; }
unibrow::Mapping<unibrow::Ecma262Canonicalize>*
regexp_macro_assembler_canonicalize() {
return &regexp_macro_assembler_canonicalize_;
}
#endif // !V8_INTL_SUPPORT
RuntimeState* runtime_state() { return &runtime_state_; }
Builtins* builtins() { return &builtins_; }
RegExpStack* regexp_stack() { return regexp_stack_; }
@ -996,11 +998,6 @@ class Isolate final : private HiddenFactory {
std::vector<int>* regexp_indices() { return &regexp_indices_; }
unibrow::Mapping<unibrow::Ecma262Canonicalize>*
interp_canonicalize_mapping() {
return &regexp_macro_assembler_canonicalize_;
}
Debug* debug() { return debug_; }
bool* is_profiling_address() { return &is_profiling_; }
@ -1642,10 +1639,12 @@ class Isolate final : private HiddenFactory {
RuntimeState runtime_state_;
Builtins builtins_;
SetupIsolateDelegate* setup_delegate_ = nullptr;
#ifndef V8_INTL_SUPPORT
unibrow::Mapping<unibrow::Ecma262UnCanonicalize> jsregexp_uncanonicalize_;
unibrow::Mapping<unibrow::CanonicalizationRange> jsregexp_canonrange_;
unibrow::Mapping<unibrow::Ecma262Canonicalize>
regexp_macro_assembler_canonicalize_;
#endif // !V8_INTL_SUPPORT
RegExpStack* regexp_stack_ = nullptr;
std::vector<int> regexp_indices_;
DateCache* date_cache_ = nullptr;

View File

@ -21,8 +21,6 @@
namespace v8 {
namespace internal {
using Canonicalize = unibrow::Mapping<unibrow::Ecma262Canonicalize>;
static bool BackRefMatchesNoCase(Isolate* isolate, int from, int current,
int len, Vector<const uc16> subject,
bool unicode) {

View File

@ -1543,7 +1543,26 @@ void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
// that cannot occur in the source string because it is Latin1.
static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
bool one_byte_subject,
unibrow::uchar* letters) {
unibrow::uchar* letters,
int letter_length) {
#ifdef V8_INTL_SUPPORT
// ICU path: expand |character| to its case-insensitive closure and copy the
// resulting code points into |letters|, which has room for |letter_length|
// entries. Returns the number of entries written.
icu::UnicodeSet set;
set.add(character);
// closeOver() modifies the set in place; assigning its result back to the
// same set is unnecessary.
set.closeOver(USET_CASE_INSENSITIVE);
const int32_t range_count = set.getRangeCount();
int items = 0;
for (int32_t i = 0; i < range_count; i++) {
  UChar32 start = set.getRangeStart(i);
  const UChar32 end = set.getRangeEnd(i);
  // Ranges are inclusive, so [start, end] holds end - start + 1 code points.
  while (start <= end) {
    // For one-byte subjects, code points above the one-byte limit cannot
    // occur in the subject, so stop copying from this range.
    if (one_byte_subject && start > String::kMaxOneByteCharCode) break;
    // Guard every write individually: this bounds exactly what is stored and
    // avoids the off-by-one of checking `end - start + items` up front.
    CHECK(items < letter_length);
    letters[items++] = static_cast<unibrow::uchar>(start);
    start++;
  }
}
return items;
#else
int length =
isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
// Unibrow returns 0 or 1 for characters where case independence is
@ -1564,9 +1583,9 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
}
return length;
#endif // V8_INTL_SUPPORT
}
static inline bool EmitSimpleCharacter(Isolate* isolate,
RegExpCompiler* compiler,
uc16 c,
@ -1599,8 +1618,8 @@ static inline bool EmitAtomNonLetter(Isolate* isolate,
bool preloaded) {
RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
bool one_byte = compiler->one_byte();
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
unibrow::uchar chars[4];
int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
if (length < 1) {
// This can't match. Must be a one-byte subject and a non-one-byte
// character. We do not need to do anything since the one-byte pass
@ -1675,8 +1694,8 @@ static inline bool EmitAtomLetter(Isolate* isolate,
bool preloaded) {
RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
bool one_byte = compiler->one_byte();
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
unibrow::uchar chars[4];
int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
if (length <= 1) return false;
// We may not need to check against the end of the input string
// if this character lies before a character that matched.
@ -1684,7 +1703,6 @@ static inline bool EmitAtomLetter(Isolate* isolate,
macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
}
Label ok;
DCHECK_EQ(4, unibrow::Ecma262UnCanonicalize::kMaxWidth);
switch (length) {
case 2: {
if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
@ -2480,9 +2498,9 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
details->positions(characters_filled_in);
uc16 c = quarks[i];
if (elm.atom()->ignore_case()) {
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
int length = GetCaseIndependentLetters(isolate, c,
compiler->one_byte(), chars);
unibrow::uchar chars[4];
int length = GetCaseIndependentLetters(
isolate, c, compiler->one_byte(), chars, 4);
if (length == 0) {
// This can happen because all case variants are non-Latin1, but we
// know the input is Latin1.
@ -5110,6 +5128,17 @@ int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
return 0;
}
#ifdef V8_INTL_SUPPORT
// Case-insensitive comparison of the first code unit of two atoms, using
// ICU's default case folding (U_FOLD_CASE_DEFAULT). The return value is the
// result of icu::UnicodeString::caseCompare.
int CompareFirstCharCaseInsensitve(RegExpTree* const* a, RegExpTree* const* b) {
  RegExpAtom* const lhs = (*a)->AsAtom();
  RegExpAtom* const rhs = (*b)->AsAtom();
  icu::UnicodeString lhs_first(lhs->data().at(0));
  const int ordering =
      lhs_first.caseCompare(rhs->data().at(0), U_FOLD_CASE_DEFAULT);
  return ordering;
}
#else
static unibrow::uchar Canonical(
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
@ -5122,7 +5151,6 @@ static unibrow::uchar Canonical(
return canonical;
}
int CompareFirstCharCaseIndependent(
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
RegExpTree* const* a, RegExpTree* const* b) {
@ -5137,7 +5165,7 @@ int CompareFirstCharCaseIndependent(
}
return static_cast<int>(character1) - static_cast<int>(character2);
}
#endif // V8_INTL_SUPPORT
// We can stable sort runs of atoms, since the order does not matter if they
// start with different characters.
@ -5173,6 +5201,10 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
DCHECK_LE(i, alternatives->length());
DCHECK_LE(first_atom, i);
if (IgnoreCase(flags)) {
#ifdef V8_INTL_SUPPORT
alternatives->StableSort(CompareFirstCharCaseInsensitve, first_atom,
i - first_atom);
#else
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
compiler->isolate()->regexp_macro_assembler_canonicalize();
auto compare_closure =
@ -5180,6 +5212,7 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
return CompareFirstCharCaseIndependent(canonicalize, a, b);
};
alternatives->StableSort(compare_closure, first_atom, i - first_atom);
#endif // V8_INTL_SUPPORT
} else {
alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
}
@ -5206,7 +5239,11 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
}
RegExpAtom* const atom = alternative->AsAtom();
JSRegExp::Flags flags = atom->flags();
#ifdef V8_INTL_SUPPORT
icu::UnicodeString common_prefix(atom->data().at(0));
#else
unibrow::uchar common_prefix = atom->data().at(0);
#endif // V8_INTL_SUPPORT
int first_with_prefix = i;
int prefix_length = atom->length();
i++;
@ -5215,6 +5252,14 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
if (!alternative->IsAtom()) break;
RegExpAtom* const atom = alternative->AsAtom();
if (atom->flags() != flags) break;
#ifdef V8_INTL_SUPPORT
icu::UnicodeString new_prefix(atom->data().at(0));
if (new_prefix != common_prefix) {
if (!IgnoreCase(flags)) break;
if (common_prefix.caseCompare(new_prefix, U_FOLD_CASE_DEFAULT) != 0)
break;
}
#else
unibrow::uchar new_prefix = atom->data().at(0);
if (new_prefix != common_prefix) {
if (!IgnoreCase(flags)) break;
@ -5224,6 +5269,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
common_prefix = Canonical(canonicalize, common_prefix);
if (new_prefix != common_prefix) break;
}
#endif // V8_INTL_SUPPORT
prefix_length = Min(prefix_length, atom->length());
i++;
}
@ -5889,6 +5935,53 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
bool is_one_byte) {
CharacterRange::Canonicalize(ranges);
int range_count = ranges->length();
#ifdef V8_INTL_SUPPORT
// ICU path: for every code unit covered by the input ranges, compute its
// case-insensitive closure and collect the members whose upper-cased form
// starts with the same code unit. Anything the input ranges already cover is
// removed again before the result is appended to |ranges|.
// |already_added| records the (clamped) input ranges; |others| accumulates
// the newly found case equivalents.
icu::UnicodeSet already_added;
icu::UnicodeSet others;
for (int i = 0; i < range_count; i++) {
CharacterRange range = ranges->at(i);
uc32 bottom = range.from();
// Ranges that start above the largest UTF-16 code unit are skipped; the
// upper bound of the remaining ranges is clamped to that limit.
if (bottom > String::kMaxUtf16CodeUnit) continue;
uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
// Nothing to be done for surrogates.
if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
// For one-byte subjects, unless RangeContainsLatin1Equivalents(range) is
// true, confine the scan to the one-byte character range.
if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
if (bottom > String::kMaxOneByteCharCode) continue;
if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
}
already_added.add(bottom, top);
while (bottom <= top) {
// Upper-cased form of the current code unit; used as the comparison key.
icu::UnicodeString upper(bottom);
upper.toUpper();
// Case-insensitive closure of the single code unit.
icu::UnicodeSet expanded(bottom, bottom);
expanded.closeOver(USET_CASE_INSENSITIVE);
// Note: this inner `i` shadows the outer loop index. The outer index is not
// used again inside the enclosing loop body after this point.
for (int32_t i = 0; i < expanded.getRangeCount(); i++) {
UChar32 start = expanded.getRangeStart(i);
UChar32 end = expanded.getRangeEnd(i);
while (start <= end) {
icu::UnicodeString upper2(start);
upper2.toUpper();
// Only add if the upper case are the same.
// Only the first code unit of each upper-cased string is compared.
if (upper[0] == upper2[0]) {
others.add(start);
}
start++;
}
}
bottom++;
}
}
// Drop everything the input ranges already contained.
others.removeAll(already_added);
// Append each remaining range, using Singleton when it holds one code point.
for (int32_t i = 0; i < others.getRangeCount(); i++) {
UChar32 start = others.getRangeStart(i);
UChar32 end = others.getRangeEnd(i);
if (start == end) {
ranges->Add(CharacterRange::Singleton(start), zone);
} else {
ranges->Add(CharacterRange::Range(start, end), zone);
}
}
#else
for (int i = 0; i < range_count; i++) {
CharacterRange range = ranges->at(i);
uc32 bottom = range.from();
@ -5954,9 +6047,9 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
}
}
}
#endif // V8_INTL_SUPPORT
}
bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
DCHECK_NOT_NULL(ranges);
int n = ranges->length();
@ -6434,10 +6527,10 @@ void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
}
uc16 character = atom->data()[j];
if (IgnoreCase(atom->flags())) {
unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
unibrow::uchar chars[4];
int length = GetCaseIndependentLetters(
isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
chars);
chars, 4);
for (int j = 0; j < length; j++) {
bm->Set(offset, chars[j]);
}

View File

@ -12,6 +12,7 @@
#ifdef V8_INTL_SUPPORT
#include "unicode/uchar.h"
#include "unicode/unistr.h"
#endif // V8_INTL_SUPPORT
namespace v8 {
@ -33,37 +34,17 @@ int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
// A GC might move the calling generated code and invalidate the
// return address on the stack.
DCHECK_EQ(0, byte_length % 2);
#ifdef V8_INTL_SUPPORT
int32_t length = (int32_t)(byte_length >> 1);
icu::UnicodeString uni_str_1(reinterpret_cast<const char16_t*>(byte_offset1),
length);
return uni_str_1.caseCompare(reinterpret_cast<const char16_t*>(byte_offset2),
length, U_FOLD_CASE_DEFAULT) == 0;
#else
uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
size_t length = byte_length >> 1;
#ifdef V8_INTL_SUPPORT
if (isolate == nullptr) {
for (size_t i = 0; i < length; i++) {
uc32 c1 = substring1[i];
uc32 c2 = substring2[i];
if (unibrow::Utf16::IsLeadSurrogate(c1)) {
// Non-BMP characters do not have case-equivalents in the BMP.
// Both have to be non-BMP for them to be able to match.
if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
if (i + 1 < length) {
uc16 c1t = substring1[i + 1];
uc16 c2t = substring2[i + 1];
if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
unibrow::Utf16::IsTrailSurrogate(c2t)) {
c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
i++;
}
}
}
c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
if (c1 != c2) return 0;
}
return 1;
}
#endif // V8_INTL_SUPPORT
DCHECK_NOT_NULL(isolate);
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
isolate->regexp_macro_assembler_canonicalize();
@ -83,6 +64,7 @@ int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
}
}
return 1;
#endif // V8_INTL_SUPPORT
}

View File

@ -11,6 +11,7 @@
namespace unibrow {
#ifndef V8_INTL_SUPPORT
template <class T, int s> bool Predicate<T, s>::get(uchar code_point) {
CacheEntry entry = entries_[code_point & kMask];
if (entry.code_point() == code_point) return entry.value();
@ -55,6 +56,7 @@ template <class T, int s> int Mapping<T, s>::CalculateValue(uchar c, uchar n,
return length;
}
}
#endif // !V8_INTL_SUPPORT
// Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they
// stream in. This **must** be followed by a call to ValueOfIncrementalFinish

View File

@ -15,8 +15,11 @@
namespace unibrow {
#ifndef V8_INTL_SUPPORT
static const int kStartBit = (1 << 30);
static const int kChunkBits = (1 << 13);
#endif // !V8_INTL_SUPPORT
static const uchar kSentinel = static_cast<uchar>(-1);
/**
@ -28,7 +31,7 @@ typedef signed short int16_t; // NOLINT
typedef unsigned short uint16_t; // NOLINT
typedef int int32_t; // NOLINT
#ifndef V8_INTL_SUPPORT
// All access to the character table should go through this function.
template <int D>
static inline uchar TableGet(const int32_t* table, int index) {
@ -44,7 +47,6 @@ static inline bool IsStart(int32_t entry) {
return (entry & kStartBit) != 0;
}
#ifndef V8_INTL_SUPPORT
/**
* Look up a character in the Unicode table using a mix of binary and
* interpolation search. For a uniformly distributed array
@ -92,6 +94,7 @@ struct MultiCharacterSpecialCase {
uchar chars[kW];
};
#ifndef V8_INTL_SUPPORT
// Look up the mapping for the given character in the specified table,
// which is of the specified length and uses the specified special case
// mapping for multi-char mappings. The next parameter is the character
@ -192,6 +195,7 @@ static int LookupMapping(const int32_t* table,
return 0;
}
}
#endif // !V8_INTL_SUPPORT
// This method decodes an UTF-8 value according to RFC 3629 and
// https://encoding.spec.whatwg.org/#utf-8-decoder .
@ -1596,7 +1600,6 @@ int ToUppercase::Convert(uchar c,
default: return 0;
}
}
#endif // !V8_INTL_SUPPORT
static const MultiCharacterSpecialCase<1> kEcma262CanonicalizeMultiStrings0[1] = { // NOLINT
{{kSentinel}} }; // NOLINT
@ -3072,98 +3075,75 @@ int CanonicalizationRange::Convert(uchar c,
}
}
const uchar UnicodeData::kMaxCodePoint = 0xFFFD;
int UnicodeData::GetByteCount() {
#ifndef V8_INTL_SUPPORT // NOLINT
return kUppercaseTable0Size * sizeof(int32_t) // NOLINT
+ kUppercaseTable1Size * sizeof(int32_t) // NOLINT
+ kUppercaseTable5Size * sizeof(int32_t) // NOLINT
+ kUppercaseTable7Size * sizeof(int32_t) // NOLINT
+ kLetterTable0Size * sizeof(int32_t) // NOLINT
+ kLetterTable1Size * sizeof(int32_t) // NOLINT
+ kLetterTable2Size * sizeof(int32_t) // NOLINT
+ kLetterTable3Size * sizeof(int32_t) // NOLINT
+ kLetterTable4Size * sizeof(int32_t) // NOLINT
+ kLetterTable5Size * sizeof(int32_t) // NOLINT
+ kLetterTable6Size * sizeof(int32_t) // NOLINT
+ kLetterTable7Size * sizeof(int32_t) // NOLINT
+ kID_StartTable0Size * sizeof(int32_t) // NOLINT
+ kID_StartTable1Size * sizeof(int32_t) // NOLINT
+ kID_StartTable2Size * sizeof(int32_t) // NOLINT
+ kID_StartTable3Size * sizeof(int32_t) // NOLINT
+ kID_StartTable4Size * sizeof(int32_t) // NOLINT
+ kID_StartTable5Size * sizeof(int32_t) // NOLINT
+ kID_StartTable6Size * sizeof(int32_t) // NOLINT
+ kID_StartTable7Size * sizeof(int32_t) // NOLINT
+ kID_ContinueTable0Size * sizeof(int32_t) // NOLINT
+ kID_ContinueTable1Size * sizeof(int32_t) // NOLINT
+ kID_ContinueTable5Size * sizeof(int32_t) // NOLINT
+ kID_ContinueTable7Size * sizeof(int32_t) // NOLINT
+ kWhiteSpaceTable0Size * sizeof(int32_t) // NOLINT
+ kWhiteSpaceTable1Size * sizeof(int32_t) // NOLINT
+ kWhiteSpaceTable7Size * sizeof(int32_t) // NOLINT
+
kToLowercaseMultiStrings0Size *
sizeof(MultiCharacterSpecialCase<2>) // NOLINT
+
kToLowercaseMultiStrings1Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+
kToLowercaseMultiStrings5Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+
kToLowercaseMultiStrings7Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+
kToUppercaseMultiStrings0Size *
sizeof(MultiCharacterSpecialCase<3>) // NOLINT
+
kToUppercaseMultiStrings1Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+
kToUppercaseMultiStrings5Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+
kToUppercaseMultiStrings7Size *
sizeof(MultiCharacterSpecialCase<3>) // NOLINT
#else
return
#endif // !V8_INTL_SUPPORT
+
kEcma262CanonicalizeMultiStrings0Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+
kEcma262CanonicalizeMultiStrings1Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+
kEcma262CanonicalizeMultiStrings5Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+
kEcma262CanonicalizeMultiStrings7Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+
kEcma262UnCanonicalizeMultiStrings0Size *
sizeof(MultiCharacterSpecialCase<4>) // NOLINT
+
kEcma262UnCanonicalizeMultiStrings1Size *
sizeof(MultiCharacterSpecialCase<2>) // NOLINT
+
kEcma262UnCanonicalizeMultiStrings5Size *
sizeof(MultiCharacterSpecialCase<2>) // NOLINT
+
kEcma262UnCanonicalizeMultiStrings7Size *
sizeof(MultiCharacterSpecialCase<2>) // NOLINT
+
kCanonicalizationRangeMultiStrings0Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+
kCanonicalizationRangeMultiStrings1Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+
kCanonicalizationRangeMultiStrings7Size *
sizeof(MultiCharacterSpecialCase<1>); // NOLINT
return kUppercaseTable0Size * sizeof(int32_t) // NOLINT
+ kUppercaseTable1Size * sizeof(int32_t) // NOLINT
+ kUppercaseTable5Size * sizeof(int32_t) // NOLINT
+ kUppercaseTable7Size * sizeof(int32_t) // NOLINT
+ kLetterTable0Size * sizeof(int32_t) // NOLINT
+ kLetterTable1Size * sizeof(int32_t) // NOLINT
+ kLetterTable2Size * sizeof(int32_t) // NOLINT
+ kLetterTable3Size * sizeof(int32_t) // NOLINT
+ kLetterTable4Size * sizeof(int32_t) // NOLINT
+ kLetterTable5Size * sizeof(int32_t) // NOLINT
+ kLetterTable6Size * sizeof(int32_t) // NOLINT
+ kLetterTable7Size * sizeof(int32_t) // NOLINT
+ kID_StartTable0Size * sizeof(int32_t) // NOLINT
+ kID_StartTable1Size * sizeof(int32_t) // NOLINT
+ kID_StartTable2Size * sizeof(int32_t) // NOLINT
+ kID_StartTable3Size * sizeof(int32_t) // NOLINT
+ kID_StartTable4Size * sizeof(int32_t) // NOLINT
+ kID_StartTable5Size * sizeof(int32_t) // NOLINT
+ kID_StartTable6Size * sizeof(int32_t) // NOLINT
+ kID_StartTable7Size * sizeof(int32_t) // NOLINT
+ kID_ContinueTable0Size * sizeof(int32_t) // NOLINT
+ kID_ContinueTable1Size * sizeof(int32_t) // NOLINT
+ kID_ContinueTable5Size * sizeof(int32_t) // NOLINT
+ kID_ContinueTable7Size * sizeof(int32_t) // NOLINT
+ kWhiteSpaceTable0Size * sizeof(int32_t) // NOLINT
+ kWhiteSpaceTable1Size * sizeof(int32_t) // NOLINT
+ kWhiteSpaceTable7Size * sizeof(int32_t) // NOLINT
+ kToLowercaseMultiStrings0Size *
sizeof(MultiCharacterSpecialCase<2>) // NOLINT
+ kToLowercaseMultiStrings1Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+ kToLowercaseMultiStrings5Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+ kToLowercaseMultiStrings7Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+ kToUppercaseMultiStrings0Size *
sizeof(MultiCharacterSpecialCase<3>) // NOLINT
+ kToUppercaseMultiStrings1Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+ kToUppercaseMultiStrings5Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+ kToUppercaseMultiStrings7Size *
sizeof(MultiCharacterSpecialCase<3>) // NOLINT
+ kEcma262CanonicalizeMultiStrings0Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+ kEcma262CanonicalizeMultiStrings1Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+ kEcma262CanonicalizeMultiStrings5Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+ kEcma262CanonicalizeMultiStrings7Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+ kEcma262UnCanonicalizeMultiStrings0Size *
sizeof(MultiCharacterSpecialCase<4>) // NOLINT
+ kEcma262UnCanonicalizeMultiStrings1Size *
sizeof(MultiCharacterSpecialCase<2>) // NOLINT
+ kEcma262UnCanonicalizeMultiStrings5Size *
sizeof(MultiCharacterSpecialCase<2>) // NOLINT
+ kEcma262UnCanonicalizeMultiStrings7Size *
sizeof(MultiCharacterSpecialCase<2>) // NOLINT
+ kCanonicalizationRangeMultiStrings0Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+ kCanonicalizationRangeMultiStrings1Size *
sizeof(MultiCharacterSpecialCase<1>) // NOLINT
+ kCanonicalizationRangeMultiStrings7Size *
sizeof(MultiCharacterSpecialCase<1>); // NOLINT
}
#endif // !V8_INTL_SUPPORT
} // namespace unibrow

View File

@ -25,6 +25,7 @@ typedef unsigned char byte;
*/
const int kMaxMappingSize = 4;
#ifndef V8_INTL_SUPPORT
template <class T, int size = 256>
class Predicate {
public:
@ -87,7 +88,6 @@ class Mapping {
CacheEntry entries_[kSize];
};
class UnicodeData {
private:
friend class Test;
@ -95,6 +95,7 @@ class UnicodeData {
static const uchar kMaxCodePoint;
};
#endif // !V8_INTL_SUPPORT
class Utf16 {
public:
@ -227,7 +228,6 @@ struct ToUppercase {
uchar* result,
bool* allow_caching_ptr);
};
#endif
struct Ecma262Canonicalize {
static const int kMaxWidth = 1;
static int Convert(uchar c,
@ -249,6 +249,7 @@ struct CanonicalizationRange {
uchar* result,
bool* allow_caching_ptr);
};
#endif // !V8_INTL_SUPPORT
} // namespace unibrow

View File

@ -1488,7 +1488,7 @@ TEST(AddInverseToTable) {
CHECK(table.Get(0xFFFF)->Get(0));
}
#ifndef V8_INTL_SUPPORT
static uc32 canonicalize(uc32 c) {
unibrow::uchar canon[unibrow::Ecma262Canonicalize::kMaxWidth];
int count = unibrow::Ecma262Canonicalize::Convert(c, '\0', canon, nullptr);
@ -1500,7 +1500,6 @@ static uc32 canonicalize(uc32 c) {
}
}
TEST(LatinCanonicalize) {
unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
for (unibrow::uchar lower = 'a'; lower <= 'z'; lower++) {
@ -1514,7 +1513,6 @@ TEST(LatinCanonicalize) {
}
for (uc32 c = 128; c < (1 << 21); c++)
CHECK_GE(canonicalize(c), 128);
#ifndef V8_INTL_SUPPORT
unibrow::Mapping<unibrow::ToUppercase> to_upper;
// Canonicalization is only defined for the Basic Multilingual Plane.
for (uc32 c = 0; c < (1 << 16); c++) {
@ -1529,10 +1527,8 @@ TEST(LatinCanonicalize) {
u = c;
CHECK_EQ(u, canonicalize(c));
}
#endif
}
static uc32 CanonRangeEnd(uc32 c) {
unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth];
int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, nullptr);
@ -1588,6 +1584,7 @@ TEST(UncanonicalizeEquivalence) {
}
}
#endif
static void TestRangeCaseIndependence(Isolate* isolate, CharacterRange input,
Vector<CharacterRange> expected) {
@ -1621,21 +1618,26 @@ TEST(CharacterRangeCaseIndependence) {
CharacterRange::Singleton('A'));
TestSimpleRangeCaseIndependence(isolate, CharacterRange::Singleton('z'),
CharacterRange::Singleton('Z'));
#ifndef V8_INTL_SUPPORT
TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('a', 'z'),
CharacterRange::Range('A', 'Z'));
#endif // !V8_INTL_SUPPORT
TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('c', 'f'),
CharacterRange::Range('C', 'F'));
TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('a', 'b'),
CharacterRange::Range('A', 'B'));
TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('y', 'z'),
CharacterRange::Range('Y', 'Z'));
#ifndef V8_INTL_SUPPORT
TestSimpleRangeCaseIndependence(isolate,
CharacterRange::Range('a' - 1, 'z' + 1),
CharacterRange::Range('A', 'Z'));
TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('A', 'Z'),
CharacterRange::Range('a', 'z'));
#endif // !V8_INTL_SUPPORT
TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('C', 'F'),
CharacterRange::Range('c', 'f'));
#ifndef V8_INTL_SUPPORT
TestSimpleRangeCaseIndependence(isolate,
CharacterRange::Range('A' - 1, 'Z' + 1),
CharacterRange::Range('a', 'z'));
@ -1644,6 +1646,7 @@ TEST(CharacterRangeCaseIndependence) {
// whole block at a time.
TestSimpleRangeCaseIndependence(isolate, CharacterRange::Range('A', 'k'),
CharacterRange::Range('a', 'z'));
#endif // !V8_INTL_SUPPORT
}

19
test/intl/regress-8348.js Normal file
View File

@ -0,0 +1,19 @@
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Asserts that a case-insensitive RegExp built from |text| matches the
// upper-cased form of |text|. |msg| is prefixed to the failure message.
function testRegExpI(text, msg) {
  const pattern = new RegExp(text, 'i');
  const upperCased = text.toUpperCase();
  const matched = pattern.test(upperCased);
  assertTrue(matched, msg + ': ' + text);
}
// Each call builds a case-insensitive RegExp from its first argument and
// asserts that it matches that argument's toUpperCase() form.
// Plain ASCII sources, both lower- and upper-case.
testRegExpI('abc', 'ASCII');
testRegExpI('ABC', 'ASCII');
testRegExpI('rst', 'ASCII');
testRegExpI('RST', 'ASCII');
// Non-ASCII sources; several labels note the Unicode version they refer to.
testRegExpI('αβψδεφ', 'Greek');
testRegExpI('\u1c80\u1c81', 'Historic Cyrillic added in Unicode 9');
// NOTE(review): U+026A appears to be LATIN LETTER SMALL CAPITAL I rather than
// dotless i (U+0131) — confirm the label text.
testRegExpI('\u026A', 'Dotless I, uppercase form added in Unicode 9');
testRegExpI('ოქტ', 'Georgian Mtavruli added in Unicode 11');