diff --git a/BUILD.gn b/BUILD.gn index 91fe3a854f..43b93c4a92 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -2708,6 +2708,7 @@ v8_source_set("v8_base_without_compiler") { "src/regexp/regexp-nodes.h", "src/regexp/regexp-parser.cc", "src/regexp/regexp-parser.h", + "src/regexp/special-case.h", "src/regexp/regexp-stack.cc", "src/regexp/regexp-stack.h", "src/regexp/regexp-utils.cc", @@ -3240,6 +3241,8 @@ v8_source_set("v8_base_without_compiler") { ] if (v8_enable_i18n_support) { + deps += [ ":run_gen-regexp-special-case" ] + sources += [ "$target_gen_dir/src/regexp/special-case.cc" ] if (is_win) { deps += [ "//third_party/icu:icudata" ] } @@ -3907,6 +3910,50 @@ v8_executable("torque-language-server") { } } +if (current_toolchain == v8_generator_toolchain) { + v8_executable("gen-regexp-special-case") { + visibility = [ ":*" ] # Only targets in this file can depend on this. + + sources = [ + "src/regexp/gen-regexp-special-case.cc", + ] + + deps = [ + ":v8_libbase", + "//build/win:default_exe_manifest", + "//third_party/icu", + ] + + configs = [ ":internal_config" ] + } +} + +action("run_gen-regexp-special-case") { + visibility = [ ":*" ] # Only targets in this file can depend on this. 
+ + script = "tools/run.py" + + sources = v8_extra_library_files + + deps = [ + ":gen-regexp-special-case($v8_generator_toolchain)", + ] + + output_file = "$target_gen_dir/src/regexp/special-case.cc" + + outputs = [ + output_file, + ] + + args = [ + "./" + rebase_path( + get_label_info(":gen-regexp-special-case($v8_generator_toolchain)", + "root_out_dir") + "/gen-regexp-special-case", + root_build_dir), + rebase_path(output_file, root_build_dir), + ] +} + ############################################################################### # Public targets # diff --git a/src/regexp/gen-regexp-special-case.cc b/src/regexp/gen-regexp-special-case.cc new file mode 100644 index 0000000000..8aace6ab88 --- /dev/null +++ b/src/regexp/gen-regexp-special-case.cc @@ -0,0 +1,125 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include +#include +#include +#include + +#include "src/base/logging.h" +#include "unicode/uchar.h" +#include "unicode/uniset.h" + +namespace v8 { +namespace internal { + +// The following code generates BuildSpecialAddSet() and BuildIgnoreSet() +// functions into "src/regexp/special-case.cc". +// See more details in http://shorturl.at/adfO5 +void PrintSet(std::ofstream& out, const char* func_name, + const icu::UnicodeSet& set) { + out << "icu::UnicodeSet " << func_name << "() {\n" + << " icu::UnicodeSet set;\n"; + for (int32_t i = 0; i < set.getRangeCount(); i++) { + if (set.getRangeStart(i) == set.getRangeEnd(i)) { + out << " set.add(0x" << set.getRangeStart(i) << ");\n"; + } else { + out << " set.add(0x" << set.getRangeStart(i) << ", 0x" + << set.getRangeEnd(i) << ");\n"; + } + } + out << " set.freeze();\n" + << " return set;\n" + << "}\n"; +} + +void PrintSpecial(std::ofstream& out) { + icu::UnicodeSet current; + icu::UnicodeSet processed(0xd800, 0xdbff); // Skip lead surrogates (U+D800..U+DBFF). 
+ icu::UnicodeSet special_add; + icu::UnicodeSet ignore; + UErrorCode status = U_ZERO_ERROR; + icu::UnicodeSet upper("[\\p{Lu}]", status); + CHECK(U_SUCCESS(status)); + // Iterate through all chars in BMP except ASCII and the pre-marked surrogates. + for (UChar32 i = 0x80; i < 0x010000; i++) { + // Skip characters that have already been processed. + if (!processed.contains(i)) { + current.set(i, i); + current.closeOver(USET_CASE_INSENSITIVE); + + // Remember we already processed current. + processed.addAll(current); + + // All uppercase characters in current. + icu::UnicodeSet keep_upper(current); + keep_upper.retainAll(upper); + + // Check if we have more than one uppercase character in current. + // If there is more than one uppercase character, then it is a special + // set which needs to be added into either "Special Add" set or "Ignore" + // set. Inspecting at most the first two ranges is enough to decide that. + int32_t number_of_upper = 0; + for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) { + number_of_upper += + keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1; + } + if (number_of_upper > 1) { + // Add all non uppercase characters (could be Ll or Mn) to special add + // set. + current.removeAll(upper); + special_add.addAll(current); + + // Add the uppercase form of the first non-uppercase character to the + // special add set. + CHECK_GT(current.getRangeCount(), 0); + UChar32 main_upper = u_toupper(current.getRangeStart(0)); + special_add.add(main_upper); + + // Add all uppercase except the main upper to ignore set. 
+ keep_upper.remove(main_upper); + ignore.addAll(keep_upper); + } + } + } + + // Remove any ASCII characters from the special add set. + special_add.remove(0x0000, 0x007f); + PrintSet(out, "BuildIgnoreSet", ignore); + PrintSet(out, "BuildSpecialAddSet", special_add); +} + +void WriteHeader(const char* header_filename) { + std::ofstream out(header_filename); + out << std::hex << std::setfill('0') << std::setw(4); + + out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n" + << "// The following functions are used to build icu::UnicodeSet\n" + << "// for specical cases different between Unicode and ECMA262.\n" + << "#ifdef V8_INTL_SUPPORT\n" + << "#include \"src/regexp/special-case.h\"\n\n" + << "#include \"unicode/uniset.h\"\n" + << "namespace v8 {\n" + << "namespace internal {\n\n"; + + PrintSpecial(out); + + out << "\n" + << "} // namespace internal\n" + << "} // namespace v8\n" + << "#endif // V8_INTL_SUPPORT\n"; +} + +} // namespace internal +} // namespace v8 + +int main(int argc, const char** argv) { + if (argc != 2) { + std::cerr << "Usage: " << argv[0] << " \n"; + std::exit(1); + } + v8::internal::WriteHeader(argv[1]); + + return 0; +} diff --git a/src/regexp/regexp-compiler-tonode.cc b/src/regexp/regexp-compiler-tonode.cc index e2dc13ce49..d12c35682e 100644 --- a/src/regexp/regexp-compiler-tonode.cc +++ b/src/regexp/regexp-compiler-tonode.cc @@ -6,6 +6,9 @@ #include "src/execution/isolate.h" #include "src/regexp/regexp.h" +#ifdef V8_INTL_SUPPORT +#include "src/regexp/special-case.h" +#endif // V8_INTL_SUPPORT #include "src/strings/unicode-inl.h" #include "src/zone/zone-list-inl.h" @@ -1137,6 +1140,39 @@ Vector CharacterRange::GetWordBounds() { return Vector(kWordRanges, kWordRangeCount - 1); } +#ifdef V8_INTL_SUPPORT +struct IgnoreSet { + IgnoreSet() : set(BuildIgnoreSet()) {} + const icu::UnicodeSet set; +}; + +struct SpecialAddSet { + SpecialAddSet() : set(BuildSpecialAddSet()) {} + const icu::UnicodeSet set; +}; + +icu::UnicodeSet BuildAsciiAToZSet() { + icu::UnicodeSet 
set('a', 'z'); + set.add('A', 'Z'); + set.freeze(); + return set; +} + +struct AsciiAToZSet { + AsciiAToZSet() : set(BuildAsciiAToZSet()) {} + const icu::UnicodeSet set; +}; + +static base::LazyInstance::type ignore_set = + LAZY_INSTANCE_INITIALIZER; + +static base::LazyInstance::type special_add_set = + LAZY_INSTANCE_INITIALIZER; + +static base::LazyInstance::type ascii_a_to_z_set = + LAZY_INSTANCE_INITIALIZER; +#endif // V8_INTL_SUPPORT + // static void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, ZoneList* ranges, @@ -1144,58 +1180,100 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, CharacterRange::Canonicalize(ranges); int range_count = ranges->length(); #ifdef V8_INTL_SUPPORT - icu::UnicodeSet already_added; icu::UnicodeSet others; for (int i = 0; i < range_count; i++) { CharacterRange range = ranges->at(i); - uc32 bottom = range.from(); - if (bottom > String::kMaxUtf16CodeUnit) continue; - uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit); + uc32 from = range.from(); + if (from > String::kMaxUtf16CodeUnit) continue; + uc32 to = Min(range.to(), String::kMaxUtf16CodeUnit); // Nothing to be done for surrogates. 
- if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue; + if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue; if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { - if (bottom > String::kMaxOneByteCharCode) continue; - if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; + if (from > String::kMaxOneByteCharCode) continue; + if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode; } - already_added.add(bottom, top); - icu::Locale locale = icu::Locale::getRoot(); - while (bottom <= top) { - icu::UnicodeString upper(bottom); - upper.toUpper(locale); - icu::UnicodeSet expanded(bottom, bottom); - expanded.closeOver(USET_CASE_INSENSITIVE); - for (int32_t i = 0; i < expanded.getRangeCount(); i++) { - UChar32 start = expanded.getRangeStart(i); - UChar32 end = expanded.getRangeEnd(i); - while (start <= end) { - icu::UnicodeString upper2(start); - upper2.toUpper(locale); - // Only add if the upper case are the same. - if (upper[0] == upper2[0]) { - // #sec-runtime-semantics-canonicalize-ch - // 3.g. If the numeric value of ch ≥ 128 and the numeric value of - // cu < 128, return ch. - if (bottom >= 128 && start < 128) { - others.add(bottom); - } else { - // 3.h. 3.h. 3.h. Return cu. - others.add(start); - } - } - start++; - } - } - bottom++; + others.add(from, to); + } + + // Set of characters already added to ranges that do not need to be added + // again. + icu::UnicodeSet already_added(others); + + // Set of characters in ranges that are in the 52 ASCII characters [a-zA-Z]. + icu::UnicodeSet in_ascii_a_to_z(others); + in_ascii_a_to_z.retainAll(ascii_a_to_z_set.Pointer()->set); + + // Remove all chars in [a-zA-Z] from others. + others.removeAll(in_ascii_a_to_z); + + // Set of characters in ranges that are overlapping with special add set. 
+ icu::UnicodeSet in_special_add(others); + in_special_add.retainAll(special_add_set.Pointer()->set); + + others.removeAll(in_special_add); + + // Ignore all chars in ignore set. + others.removeAll(ignore_set.Pointer()->set); + + // For most of the chars in ranges that are still in others, find the case + // equivalent set by calling closeOver(USET_CASE_INSENSITIVE). + others.closeOver(USET_CASE_INSENSITIVE); + + // Because closeOver(USET_CASE_INSENSITIVE) may add ASCII [a-zA-Z] to others, + // but ECMA262 "i" mode won't consider that, remove them from others. + // Ex: U+017F adds 'S' and 's' to others. + others.removeAll(ascii_a_to_z_set.Pointer()->set); + + // Special handling for in_ascii_a_to_z. + for (int32_t i = 0; i < in_ascii_a_to_z.getRangeCount(); i++) { + UChar32 start = in_ascii_a_to_z.getRangeStart(i); + UChar32 end = in_ascii_a_to_z.getRangeEnd(i); + // Bit 0x20 is set for lowercase a-z and clear for uppercase A-Z. + if (start & 0x0020) { + // Lowercase range: add the uppercase counterparts. + others.add(start & 0x005F, end & 0x005F); + } else { + // Uppercase range: add the lowercase counterparts. + others.add(start | 0x0020, end | 0x0020); + } } + + // Special handling for chars in "Special Add" set. + for (int32_t i = 0; i < in_special_add.getRangeCount(); i++) { + UChar32 end = in_special_add.getRangeEnd(i); + for (UChar32 ch = in_special_add.getRangeStart(i); ch <= end; ch++) { + // Add the uppercase of this character if itself is not an uppercase + // character. + // Note: The if condition cannot be u_islower(ch) because ch could be + // neither uppercase nor lowercase but Mn. + if (!u_isupper(ch)) { + others.add(u_toupper(ch)); + } + icu::UnicodeSet candidates(ch, ch); + candidates.closeOver(USET_CASE_INSENSITIVE); + for (int32_t j = 0; j < candidates.getRangeCount(); j++) { + UChar32 end2 = candidates.getRangeEnd(j); + for (UChar32 ch2 = candidates.getRangeStart(j); ch2 <= end2; ch2++) { + // Add character that is not uppercase to others. 
+ if (!u_isupper(ch2)) { + others.add(ch2); + } + } + } + } + } + + // Remove all characters which are already in the ranges. others.removeAll(already_added); + + // Add others to the ranges. for (int32_t i = 0; i < others.getRangeCount(); i++) { - UChar32 start = others.getRangeStart(i); - UChar32 end = others.getRangeEnd(i); - if (start == end) { - ranges->Add(CharacterRange::Singleton(start), zone); + UChar32 from = others.getRangeStart(i); + UChar32 to = others.getRangeEnd(i); + if (from == to) { + ranges->Add(CharacterRange::Singleton(from), zone); } else { - ranges->Add(CharacterRange::Range(start, end), zone); + ranges->Add(CharacterRange::Range(from, to), zone); } } #else diff --git a/src/regexp/special-case.h b/src/regexp/special-case.h new file mode 100644 index 0000000000..1ccec5d31a --- /dev/null +++ b/src/regexp/special-case.h @@ -0,0 +1,79 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_SPECIAL_CASE_H_ +#define V8_REGEXP_SPECIAL_CASE_H_ + +#ifdef V8_INTL_SUPPORT +#include "unicode/uversion.h" +namespace U_ICU_NAMESPACE { +class UnicodeSet; +} // namespace U_ICU_NAMESPACE + +namespace v8 { +namespace internal { + +// Functions to build special sets of Unicode characters that need special +// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE). +// +// For the characters in the "ignore set", the process should not treat other +// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case +// equivalent under the ECMA262 RegExp "i" mode because these characters are +// uppercase themselves that no other characters in the set uppercase to. 
+// +// For the characters in the "special add set", the process should add only +// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which are +// not uppercase characters as case equivalent under the ECMA262 RegExp "i" mode +// and also the ONE uppercase character that the other non-uppercase characters +// uppercase into, to the set. Other uppercase characters in the result of +// closeOver(USET_CASE_INSENSITIVE) should not be considered because ECMA262 +// RegExp "i" mode considers two characters as "case equivalent" if both +// characters uppercase to the same character. +// +// For example, consider the following case equivalent set defined by Unicode +// standard. Notice there is more than one uppercase character in this set: +// U+212B Å Angstrom Sign - an uppercase character. +// U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character. +// U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which +// uppercases to U+00C5. +// This case equivalent set is a special set and needs special handling while +// considering "case equivalent" under the ECMA262 RegExp "i" mode which is +// different from the Unicode Standard: +// * U+212B should be included into the "ignore" set because there are no other +// characters that, under the ECMA262 "i" mode, are considered "case equivalent" +// to it because U+212B is itself uppercase but neither U+00C5 nor U+00E5 +// uppercases to U+212B. +// * U+00C5 and U+00E5 will both be included into the "special add" set. When +// calculating the "equivalent set" under ECMA262 "i" mode, the process will +// add U+00E5, because it is not an uppercase character in the set. The +// process will also add U+00C5, because it is the uppercase character which +// the other non-uppercase character, U+00E5, uppercases into. +// +// For characters not included in "ignore set" and "special add set", the +// process will just use closeOver(USET_CASE_INSENSITIVE) to calculate, which is +// much faster. 
+// +// Under Unicode 12.0, there are only 7 characters in the "special add set" and +// 4 characters in "ignore set" so even though the special add process is +// slower, it is limited to a small set of cases only. +// +// The implementation of these two functions will be generated by calling ICU +// icu::UnicodeSet during the build time into gen/src/regexp/special-case.cc by +// the code in src/regexp/gen-regexp-special-case.cc. +// +// These two functions will be used with LazyInstance<> template to generate +// globally shared sets to reduce memory usage and speed up performance. + +// Function to build and return the Ignore set. +icu::UnicodeSet BuildIgnoreSet(); + +// Function to build and return the Special Add set. +icu::UnicodeSet BuildSpecialAddSet(); + +} // namespace internal +} // namespace v8 + +#endif // V8_INTL_SUPPORT + +#endif // V8_REGEXP_SPECIAL_CASE_H_