54a60fe6f4
Compiled regular expression patterns make use of several shared common UnicodeSets. This change simplifies the creation and use of these static UnicodeSets. - Pointer fields to the static sets are removed from the compiled patterns, and the static variables are accessed directly. The deleted pointers were a hold-over from earlier code that did not use shared statics. - The UnicodeSet pattern literals are changed from hex constants to u"string literals". - The size of fRuleSets (from regexst.h) is changed from a hard-coded 10 to the number of UnicodeSets actually required. Doing this required a change to regexcst.pl to export the required size. Changing and rerunning this perl code resulted in massive but benign changes to the generated file regexcst.h, the result of perl having changed its order of enumeration of hashes since the file was last regenerated. - UnicodeSets are frozen when possible. Should result in faster matching.
173 lines
6.6 KiB
C++
173 lines
6.6 KiB
C++
// © 2016 and later: Unicode, Inc. and others.
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
//
|
|
// regexst.cpp
|
|
//
|
|
// Copyright (C) 2004-2015, International Business Machines Corporation and others.
|
|
// All Rights Reserved.
|
|
//
|
|
// This file contains class RegexStaticSets
|
|
//
|
|
// This class is internal to the regular expression implementation.
|
|
// For the public Regular Expression API, see the file "unicode/regex.h"
|
|
//
|
|
// RegexStaticSets groups together the common UnicodeSets that are needed
|
|
// for compiling or executing RegularExpressions. This grouping simplifies
|
|
// the thread safe lazy creation and sharing of these sets across
|
|
// all instances of regular expressions.
|
|
//
|
|
#include "unicode/utypes.h"
|
|
|
|
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
|
|
|
#include "unicode/unistr.h"
|
|
#include "unicode/uniset.h"
|
|
#include "unicode/uchar.h"
|
|
#include "unicode/regex.h"
|
|
#include "uprops.h"
|
|
#include "cmemory.h"
|
|
#include "cstring.h"
|
|
#include "uassert.h"
|
|
#include "ucln_in.h"
|
|
#include "umutex.h"
|
|
|
|
#include "regexcst.h" // Contains state table for the regex pattern parser.
|
|
// generated by a Perl script.
|
|
#include "regexst.h"
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
// "Rule Char" characters: the characters that carry special meaning in a
// regular expression and so must be escaped to appear as literals.
constexpr const char16_t *gRuleSet_rule_chars = u"*?+[(){}^$|\\.";
|
|
|
|
// Backslash escape characters that are handled by ICU's unescape() function.
constexpr const char16_t *gUnescapeChars = u"acefnrtuUx";
|
|
|
|
// UnicodeSet pattern implementing the regular expression \w (word characters).
constexpr const char16_t *gIsWordPattern =
    u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]";
|
|
|
|
// UnicodeSet pattern implementing the regular expression \s (white space).
constexpr const char16_t *gIsSpacePattern = u"[\\p{WhiteSpace}]";
|
|
|
|
// UnicodeSet patterns used by the implementation of Grapheme Cluster
// detection, \X: control and extend characters, plus one pattern per
// Hangul_Syllable_Type value (L, V, T, LV, LVT).
constexpr const char16_t *gGC_ControlPattern = u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]";
constexpr const char16_t *gGC_ExtendPattern  = u"[\\p{Grapheme_Extend}]";
constexpr const char16_t *gGC_LPattern       = u"[\\p{Hangul_Syllable_Type=L}]";
constexpr const char16_t *gGC_VPattern       = u"[\\p{Hangul_Syllable_Type=V}]";
constexpr const char16_t *gGC_TPattern       = u"[\\p{Hangul_Syllable_Type=T}]";
constexpr const char16_t *gGC_LVPattern      = u"[\\p{Hangul_Syllable_Type=LV}]";
constexpr const char16_t *gGC_LVTPattern     = u"[\\p{Hangul_Syllable_Type=LVT}]";
|
|
|
|
|
|
// The single shared RegexStaticSets instance. It is allocated by
// initStaticSets() and deleted (and set back to nullptr) by regex_cleanup().
RegexStaticSets *RegexStaticSets::gStaticSets = nullptr;
|
|
// Init-once guard handed to umtx_initOnce() by RegexStaticSets::initGlobals();
// regex_cleanup() calls reset() on it.
UInitOnce gStaticSetsInitOnce = U_INITONCE_INITIALIZER;
|
|
|
|
|
|
// Build every shared UnicodeSet used while compiling or matching regular
// expressions, and freeze each set once it has its final contents.
//
//   status   In/out ICU error code. It is passed to every applyPattern() call
//            and to utext_openUChars(), so a failure in any of them is
//            reported to the caller through it.
RegexStaticSets::RegexStaticSets(UErrorCode *status) {
    // Initialize the shared static sets to their correct values.
    // Every pattern constant is wrapped in a read-only aliasing UnicodeString
    // (isTerminated = true, length = -1), so the literals are not copied.
    fUnescapeCharSet.addAll(UnicodeString(true, gUnescapeChars, -1)).freeze();
    fPropSets[URX_ISWORD_SET].applyPattern(UnicodeString(true, gIsWordPattern, -1), *status).freeze();
    fPropSets[URX_ISSPACE_SET].applyPattern(UnicodeString(true, gIsSpacePattern, -1), *status).freeze();
    fPropSets[URX_GC_EXTEND].applyPattern(UnicodeString(true, gGC_ExtendPattern, -1), *status).freeze();
    fPropSets[URX_GC_CONTROL].applyPattern(UnicodeString(true, gGC_ControlPattern, -1), *status).freeze();
    fPropSets[URX_GC_L].applyPattern(UnicodeString(true, gGC_LPattern, -1), *status).freeze();
    fPropSets[URX_GC_V].applyPattern(UnicodeString(true, gGC_VPattern, -1), *status).freeze();
    fPropSets[URX_GC_T].applyPattern(UnicodeString(true, gGC_TPattern, -1), *status).freeze();
    fPropSets[URX_GC_LV].applyPattern(UnicodeString(true, gGC_LVPattern, -1), *status).freeze();
    fPropSets[URX_GC_LVT].applyPattern(UnicodeString(true, gGC_LVTPattern, -1), *status).freeze();

    //
    //  "Normal" is the set of characters that don't need special handling
    //            when finding grapheme cluster boundaries.
    //
    //  It is built by complementing the set and then removing the Hangul
    //  syllable range and the Control, L, V and T sets.
    //  NOTE(review): Hangul syllables end at U+D7A3, while the range removed
    //  below ends at U+D7A4 - confirm whether the extra code point is intended.
    //
    fPropSets[URX_GC_NORMAL].complement();
    fPropSets[URX_GC_NORMAL].remove(0xac00, 0xd7a4);
    fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_CONTROL]);
    fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_L]);
    fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_V]);
    fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_T]);
    fPropSets[URX_GC_NORMAL].freeze();

    // Initialize the 8-bit fast bit sets from the parallel full
    //   UnicodeSets.
    //
    // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping?
    //       Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x"
    //       This runs in exponential time, making it easy to adjust the time for
    //       convenient measuring.
    //
    //       This 8 bit optimization dates from the early days of ICU,
    //       with a less optimized UnicodeSet. At the time, the difference
    //       was substantial.
    for (int32_t i = 0; i < URX_LAST_SET; i++) {
        fPropSets8[i].init(&fPropSets[i]);
    }

    // Sets used while parsing rules, but not referenced from the parse state table.
    // fRuleSets is indexed by (kRuleSet_xxx - 128).
    // The rule_char set is the complement of the special "rule chars".
    fRuleSets[kRuleSet_rule_char-128]
        .addAll(UnicodeString(true, gRuleSet_rule_chars, -1)).complement().freeze();

    fRuleSets[kRuleSet_digit_char-128].add(u'0', u'9').freeze();
    fRuleSets[kRuleSet_ascii_letter-128].add(u'A', u'Z').add(u'a', u'z').freeze();
    // Non-owning alias to the digit set stored in fRuleSets.
    fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128];

    // Finally, initialize an empty UText string for utility purposes
    fEmptyText = utext_openUChars(nullptr, nullptr, 0, status);
}
|
|
|
|
|
|
// Release the resources acquired by the constructor: close the utility UText
// and drop the digit-set alias. The alias points into fRuleSets, so it is
// cleared rather than deleted.
RegexStaticSets::~RegexStaticSets() {
    utext_close(fEmptyText);
    fRuleDigitsAlias = nullptr;
}
|
|
|
|
|
|
//------------------------------------------------------------------------------
|
|
//
|
|
// regex_cleanup Memory cleanup function, free/delete all
|
|
// cached memory. Called by ICU's u_cleanup() function.
|
|
//
|
|
//------------------------------------------------------------------------------
|
|
|
|
U_CDECL_BEGIN
|
|
// Cleanup callback, registered with ucln_i18n_registerCleanup() by
// initStaticSets(). Deletes the shared RegexStaticSets instance, clears the
// global pointer and resets the init-once guard (presumably so that the sets
// can be rebuilt on a later initGlobals() call - confirm against umutex.h).
// Always returns true.
static UBool U_CALLCONV
regex_cleanup(void) {
    delete RegexStaticSets::gStaticSets;
    RegexStaticSets::gStaticSets = nullptr;
    gStaticSetsInitOnce.reset();
    return true;
}
|
|
|
|
// One-time initializer run through umtx_initOnce() from initGlobals().
// Registers regex_cleanup() and then creates the shared RegexStaticSets.
//
//   status   Receives any error from the RegexStaticSets constructor, or
//            U_MEMORY_ALLOCATION_ERROR when the allocation yields nullptr.
//
// On any failure gStaticSets is left as nullptr.
static void U_CALLCONV initStaticSets(UErrorCode &status) {
    U_ASSERT(RegexStaticSets::gStaticSets == nullptr);
    ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup);

    RegexStaticSets *newSets = new RegexStaticSets(&status);
    if (U_FAILURE(status)) {
        // Construction reported an error: discard the partially built object.
        delete newSets;
        newSets = nullptr;
    } else if (newSets == nullptr) {
        // No error was reported but no object was produced: allocation failed.
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    RegexStaticSets::gStaticSets = newSets;
}
|
|
U_CDECL_END
|
|
|
|
// Ensure the shared static sets exist, running initStaticSets() through the
// gStaticSetsInitOnce guard.
//
//   status   In/out ICU error code, forwarded by reference to umtx_initOnce().
void RegexStaticSets::initGlobals(UErrorCode *status) {
    UErrorCode &errorCode = *status;
    umtx_initOnce(gStaticSetsInitOnce, initStaticSets, errorCode);
}
|
|
|
|
U_NAMESPACE_END
|
|
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
|