ICU-11548 Improve regex static UnicodeSets handling
Compiled regular expression patterns make use of several shared common UnicodeSets. This change simplifies the creation and use of these static UnicodeSets.
- Pointer fields to the static sets are removed from the compiled patterns, and the static variables are accessed directly. The deleted pointers were a holdover from earlier code that did not use shared statics.
- The UnicodeSet pattern literals are changed from hex constants to u"string literals".
- The size of fRuleSets (from regexst.h) is changed from a hard-coded 10 to the number of UnicodeSets actually required. Doing this required a change to regexcst.pl to export the required size. Changing and rerunning this Perl code resulted in massive but benign changes to the generated file regexcst.h, the result of Perl having changed its order of enumeration of hashes since the file was last regenerated.
- UnicodeSets are frozen when possible. This should result in faster matching.
This commit is contained in:
parent
b9bb61259f
commit
54a60fe6f4
@ -148,9 +148,6 @@ void RegexCompile::compile(
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return;
|
||||
}
|
||||
fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
|
||||
fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;
|
||||
|
||||
|
||||
// Initialize the pattern scanning state machine
|
||||
fPatternLength = utext_nativeLength(pat);
|
||||
@ -1565,15 +1562,15 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
case doSetBackslash_s:
|
||||
{
|
||||
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
|
||||
set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
|
||||
set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
|
||||
break;
|
||||
}
|
||||
|
||||
case doSetBackslash_S:
|
||||
{
|
||||
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
|
||||
UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
|
||||
SSet.complement();
|
||||
UnicodeSet SSet;
|
||||
SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement();
|
||||
set->addAll(SSet);
|
||||
break;
|
||||
}
|
||||
@ -1642,15 +1639,15 @@ UBool RegexCompile::doParseActions(int32_t action)
|
||||
case doSetBackslash_w:
|
||||
{
|
||||
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
|
||||
set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
|
||||
set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
|
||||
break;
|
||||
}
|
||||
|
||||
case doSetBackslash_W:
|
||||
{
|
||||
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
|
||||
UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
|
||||
SSet.complement();
|
||||
UnicodeSet SSet;
|
||||
SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement();
|
||||
set->addAll(SSet);
|
||||
break;
|
||||
}
|
||||
@ -2425,6 +2422,7 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
|
||||
{
|
||||
// The set contains two or more chars. (the normal case)
|
||||
// Put it into the compiled pattern as a set.
|
||||
theSet->freeze();
|
||||
int32_t setNumber = fRXPat->fSets->size();
|
||||
fRXPat->fSets->addElement(theSet, *fStatus);
|
||||
appendOp(URX_SETREF, setNumber);
|
||||
@ -2818,8 +2816,8 @@ void RegexCompile::matchStartType() {
|
||||
if (currentLen == 0) {
|
||||
int32_t sn = URX_VAL(op);
|
||||
U_ASSERT(sn>0 && sn<URX_LAST_SET);
|
||||
const UnicodeSet *s = fRXPat->fStaticSets[sn];
|
||||
fRXPat->fInitialChars->addAll(*s);
|
||||
const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[sn];
|
||||
fRXPat->fInitialChars->addAll(s);
|
||||
numInitialStrings += 2;
|
||||
}
|
||||
currentLen = safeIncrement(currentLen, 1);
|
||||
@ -2831,9 +2829,8 @@ void RegexCompile::matchStartType() {
|
||||
case URX_STAT_SETREF_N:
|
||||
if (currentLen == 0) {
|
||||
int32_t sn = URX_VAL(op);
|
||||
const UnicodeSet *s = fRXPat->fStaticSets[sn];
|
||||
UnicodeSet sc(*s);
|
||||
sc.complement();
|
||||
UnicodeSet sc;
|
||||
sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement();
|
||||
fRXPat->fInitialChars->addAll(sc);
|
||||
numInitialStrings += 2;
|
||||
}
|
||||
@ -4420,7 +4417,8 @@ UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
if (propName.caseCompare(u"word", -1, 0) == 0) {
|
||||
set.adoptInsteadAndCheckErrorCode(new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])), status);
|
||||
set.adoptInsteadAndCheckErrorCode(
|
||||
RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status);
|
||||
break;
|
||||
}
|
||||
if (propName.compare(u"all", -1) == 0) {
|
||||
|
@ -20,117 +20,117 @@ U_NAMESPACE_BEGIN
|
||||
//
|
||||
// Character classes for regex pattern scanning.
|
||||
//
|
||||
static const uint8_t kRuleSet_ascii_letter = 128;
|
||||
static const uint8_t kRuleSet_digit_char = 129;
|
||||
static const uint8_t kRuleSet_digit_char = 128;
|
||||
static const uint8_t kRuleSet_ascii_letter = 129;
|
||||
static const uint8_t kRuleSet_rule_char = 130;
|
||||
|
||||
constexpr uint32_t kRuleSet_count = 131-128;
|
||||
|
||||
enum Regex_PatternParseAction {
|
||||
doSetBackslash_V,
|
||||
doSetBackslash_h,
|
||||
doBeginNamedBackRef,
|
||||
doSetMatchMode,
|
||||
doEnterQuoteMode,
|
||||
doOpenCaptureParen,
|
||||
doContinueNamedCapture,
|
||||
doSetBackslash_d,
|
||||
doBeginMatchMode,
|
||||
doBackslashX,
|
||||
doSetPosixProp,
|
||||
doIntervalError,
|
||||
doSetLiteralEscaped,
|
||||
doSetBackslash_s,
|
||||
doNOP,
|
||||
doBackslashv,
|
||||
doOpenLookBehind,
|
||||
doPatStart,
|
||||
doPossessiveInterval,
|
||||
doOpenAtomicParen,
|
||||
doOpenLookAheadNeg,
|
||||
doBackslashd,
|
||||
doBackslashZ,
|
||||
doIntervalUpperDigit,
|
||||
doBadNamedCapture,
|
||||
doSetDifference2,
|
||||
doSetAddAmp,
|
||||
doSetNamedChar,
|
||||
doNamedChar,
|
||||
doSetBackslash_H,
|
||||
doBackslashb,
|
||||
doBackslashz,
|
||||
doSetBeginDifference1,
|
||||
doOpenLookAhead,
|
||||
doMatchModeParen,
|
||||
doBackslashV,
|
||||
doIntevalLowerDigit,
|
||||
doCaret,
|
||||
doSetEnd,
|
||||
doSetNegate,
|
||||
doBackslashS,
|
||||
doOrOperator,
|
||||
doBackslashB,
|
||||
doBackslashw,
|
||||
doBackslashR,
|
||||
doRuleError,
|
||||
doDotAny,
|
||||
doMatchMode,
|
||||
doSetBackslash_W,
|
||||
doNGPlus,
|
||||
doSetBackslash_D,
|
||||
doPossessiveOpt,
|
||||
doSetNamedRange,
|
||||
doConditionalExpr,
|
||||
doBackslashs,
|
||||
doPossessiveStar,
|
||||
doPlus,
|
||||
doBadOpenParenType,
|
||||
doCloseParen,
|
||||
doNGInterval,
|
||||
doSetProp,
|
||||
doBackRef,
|
||||
doSetBeginUnion,
|
||||
doEscapeError,
|
||||
doOpt,
|
||||
doSetBeginIntersection1,
|
||||
doPossessivePlus,
|
||||
doBackslashD,
|
||||
doOpenLookBehindNeg,
|
||||
doSetBegin,
|
||||
doSetIntersection2,
|
||||
doCompleteNamedBackRef,
|
||||
doSetRange,
|
||||
doDollar,
|
||||
doBackslashH,
|
||||
doExit,
|
||||
doNGOpt,
|
||||
doOpenNonCaptureParen,
|
||||
doBackslashA,
|
||||
doSetBackslash_v,
|
||||
doBackslashh,
|
||||
doBadModeFlag,
|
||||
doSetNoCloseError,
|
||||
doIntervalSame,
|
||||
doSetAddDash,
|
||||
doBackslashW,
|
||||
doPerlInline,
|
||||
doSetOpError,
|
||||
doBackslashH,
|
||||
doSetLiteralEscaped,
|
||||
doOpenLookAheadNeg,
|
||||
doCompleteNamedBackRef,
|
||||
doPatStart,
|
||||
doBackslashS,
|
||||
doBackslashD,
|
||||
doNGStar,
|
||||
doNOP,
|
||||
doBackslashX,
|
||||
doSetLiteral,
|
||||
doPatFinish,
|
||||
doBeginNamedCapture,
|
||||
doContinueNamedCapture,
|
||||
doBackslashG,
|
||||
doBackslashR,
|
||||
doSetBegin,
|
||||
doSetBackslash_v,
|
||||
doPossessivePlus,
|
||||
doPerlInline,
|
||||
doBackslashZ,
|
||||
doSetAddAmp,
|
||||
doSetBeginDifference1,
|
||||
doIntervalError,
|
||||
doSetNegate,
|
||||
doIntervalInit,
|
||||
doSetIntersection2,
|
||||
doPossessiveInterval,
|
||||
doRuleError,
|
||||
doBackslashW,
|
||||
doContinueNamedBackRef,
|
||||
doOpenNonCaptureParen,
|
||||
doExit,
|
||||
doSetNamedChar,
|
||||
doSetBackslash_V,
|
||||
doConditionalExpr,
|
||||
doEscapeError,
|
||||
doBadOpenParenType,
|
||||
doPossessiveStar,
|
||||
doSetAddDash,
|
||||
doEscapedLiteralChar,
|
||||
doSetBackslash_w,
|
||||
doIntervalUpperDigit,
|
||||
doBackslashv,
|
||||
doSetBackslash_S,
|
||||
doSetNoCloseError,
|
||||
doSetProp,
|
||||
doBackslashB,
|
||||
doSetEnd,
|
||||
doSetRange,
|
||||
doMatchModeParen,
|
||||
doPlus,
|
||||
doBackslashV,
|
||||
doSetMatchMode,
|
||||
doBackslashz,
|
||||
doSetNamedRange,
|
||||
doOpenLookBehindNeg,
|
||||
doInterval,
|
||||
doBadNamedCapture,
|
||||
doBeginMatchMode,
|
||||
doBackslashd,
|
||||
doPatFinish,
|
||||
doNamedChar,
|
||||
doNGPlus,
|
||||
doSetDifference2,
|
||||
doSetBackslash_H,
|
||||
doCloseParen,
|
||||
doDotAny,
|
||||
doOpenCaptureParen,
|
||||
doEnterQuoteMode,
|
||||
doOpenAtomicParen,
|
||||
doBadModeFlag,
|
||||
doSetBackslash_d,
|
||||
doSetFinish,
|
||||
doProperty,
|
||||
doBeginNamedBackRef,
|
||||
doBackRef,
|
||||
doOpt,
|
||||
doDollar,
|
||||
doBeginNamedCapture,
|
||||
doNGInterval,
|
||||
doSetOpError,
|
||||
doSetPosixProp,
|
||||
doSetBeginIntersection1,
|
||||
doBackslashb,
|
||||
doSetBeginUnion,
|
||||
doIntevalLowerDigit,
|
||||
doSetBackslash_h,
|
||||
doStar,
|
||||
doMatchMode,
|
||||
doBackslashA,
|
||||
doOpenLookBehind,
|
||||
doPossessiveOpt,
|
||||
doOrOperator,
|
||||
doBackslashw,
|
||||
doBackslashs,
|
||||
doLiteralChar,
|
||||
doSuppressComments,
|
||||
doCaret,
|
||||
doIntervalSame,
|
||||
doNGOpt,
|
||||
doOpenLookAhead,
|
||||
doSetBackslash_W,
|
||||
doMismatchedParenErr,
|
||||
doNGStar,
|
||||
doSetFinish,
|
||||
doInterval,
|
||||
doBackslashG,
|
||||
doStar,
|
||||
doSetBackslash_w,
|
||||
doSetBackslash_S,
|
||||
doProperty,
|
||||
doContinueNamedBackRef,
|
||||
doIntervalInit,
|
||||
doSetBackslash_s,
|
||||
rbbiLastAction};
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
@ -197,7 +197,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
||||
, {doBadOpenParenType, 255, 206,0, FALSE} // 45
|
||||
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 46 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 47
|
||||
, {doBeginNamedCapture, 128, 64,0, FALSE} // 48
|
||||
, {doBeginNamedCapture, 129, 64,0, FALSE} // 48
|
||||
, {doBadOpenParenType, 255, 206,0, FALSE} // 49
|
||||
, {doNOP, 41 /* ) */, 255,0, TRUE} // 50 paren-comment
|
||||
, {doMismatchedParenErr, 253, 206,0, FALSE} // 51
|
||||
@ -213,8 +213,8 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
||||
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 61
|
||||
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 62
|
||||
, {doBadModeFlag, 255, 206,0, FALSE} // 63
|
||||
, {doContinueNamedCapture, 128, 64,0, TRUE} // 64 named-capture
|
||||
, {doContinueNamedCapture, 129, 64,0, TRUE} // 65
|
||||
, {doContinueNamedCapture, 129, 64,0, TRUE} // 64 named-capture
|
||||
, {doContinueNamedCapture, 128, 64,0, TRUE} // 65
|
||||
, {doOpenCaptureParen, 62 /* > */, 2, 14, TRUE} // 66
|
||||
, {doBadNamedCapture, 255, 206,0, FALSE} // 67
|
||||
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 68 quant-star
|
||||
@ -226,13 +226,13 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
||||
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 74 quant-opt
|
||||
, {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 75
|
||||
, {doOpt, 255, 20,0, FALSE} // 76
|
||||
, {doNOP, 129, 79,0, FALSE} // 77 interval-open
|
||||
, {doNOP, 128, 79,0, FALSE} // 77 interval-open
|
||||
, {doIntervalError, 255, 206,0, FALSE} // 78
|
||||
, {doIntevalLowerDigit, 129, 79,0, TRUE} // 79 interval-lower
|
||||
, {doIntevalLowerDigit, 128, 79,0, TRUE} // 79 interval-lower
|
||||
, {doNOP, 44 /* , */, 83,0, TRUE} // 80
|
||||
, {doIntervalSame, 125 /* } */, 86,0, TRUE} // 81
|
||||
, {doIntervalError, 255, 206,0, FALSE} // 82
|
||||
, {doIntervalUpperDigit, 129, 83,0, TRUE} // 83 interval-upper
|
||||
, {doIntervalUpperDigit, 128, 83,0, TRUE} // 83 interval-upper
|
||||
, {doNOP, 125 /* } */, 86,0, TRUE} // 84
|
||||
, {doIntervalError, 255, 206,0, FALSE} // 85
|
||||
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 86 interval-type
|
||||
@ -261,15 +261,15 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
||||
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 109
|
||||
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 110
|
||||
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 111
|
||||
, {doBackRef, 129, 14,0, TRUE} // 112
|
||||
, {doBackRef, 128, 14,0, TRUE} // 112
|
||||
, {doEscapeError, 253, 206,0, FALSE} // 113
|
||||
, {doEscapedLiteralChar, 255, 14,0, TRUE} // 114
|
||||
, {doBeginNamedBackRef, 60 /* < */, 117,0, TRUE} // 115 named-backref
|
||||
, {doBadNamedCapture, 255, 206,0, FALSE} // 116
|
||||
, {doContinueNamedBackRef, 128, 119,0, TRUE} // 117 named-backref-2
|
||||
, {doContinueNamedBackRef, 129, 119,0, TRUE} // 117 named-backref-2
|
||||
, {doBadNamedCapture, 255, 206,0, FALSE} // 118
|
||||
, {doContinueNamedBackRef, 128, 119,0, TRUE} // 119 named-backref-3
|
||||
, {doContinueNamedBackRef, 129, 119,0, TRUE} // 120
|
||||
, {doContinueNamedBackRef, 129, 119,0, TRUE} // 119 named-backref-3
|
||||
, {doContinueNamedBackRef, 128, 119,0, TRUE} // 120
|
||||
, {doCompleteNamedBackRef, 62 /* > */, 14,0, TRUE} // 121
|
||||
, {doBadNamedCapture, 255, 206,0, FALSE} // 122
|
||||
, {doSetNegate, 94 /* ^ */, 126,0, TRUE} // 123 set-open
|
||||
|
@ -10,13 +10,13 @@
|
||||
# regexcst.pl
|
||||
# Compile the regular expression parser state table data into initialized C data.
|
||||
# Usage:
|
||||
# cd icu/source/i18n
|
||||
# cd icu4c/source/i18n
|
||||
# perl regexcst.pl < regexcst.txt > regexcst.h
|
||||
#
|
||||
# The output file, regexcst.h, is included by some of the .cpp regex
|
||||
# implementation files. This perl script is NOT run as part
|
||||
# of a normal ICU build. It is run by hand when needed, and the
|
||||
# regexcst.h generated file is put back into cvs.
|
||||
# regexcst.h generated file is put back into the source code repository.
|
||||
#
|
||||
# See regexcst.txt for a description of the input format for this script.
|
||||
#
|
||||
@ -201,6 +201,8 @@ for ($state=1; $state<$num_states; $state++) {
|
||||
|
||||
die if ($errors>0);
|
||||
|
||||
print "// © 2016 and later: Unicode, Inc. and others.\n";
|
||||
print "// License & terms of use: http://www.unicode.org/copyright.html\n";
|
||||
print "//---------------------------------------------------------------------------------\n";
|
||||
print "//\n";
|
||||
print "// Generated Header File. Do not edit by hand.\n";
|
||||
@ -246,6 +248,7 @@ foreach $setName (keys %charClasses) {
|
||||
$i++;
|
||||
}
|
||||
}
|
||||
print " constexpr uint32_t kRuleSet_count = $i-128;";
|
||||
print "\n\n";
|
||||
|
||||
#
|
||||
|
@ -35,216 +35,101 @@
|
||||
// generated by a Perl script.
|
||||
#include "regexst.h"
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// Unicode Set pattern strings for all of the required constant sets.
|
||||
// Initialized with hex values for portability to EBCDIC based machines.
|
||||
// Really ugly, but there's no good way to avoid it.
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
// "Rule Char" Characters are those with no special meaning, and therefore do not
|
||||
// need to be escaped to appear as literals in a regexp. Expressed
|
||||
// as the inverse of those needing escaping -- [^\*\?\+\[\(\)\{\}\^\$\|\\\.]
|
||||
static const UChar gRuleSet_rule_char_pattern[] = {
|
||||
// [ ^ \ * \ ? \ + \ [ \ ( / )
|
||||
0x5b, 0x5e, 0x5c, 0x2a, 0x5c, 0x3f, 0x5c, 0x2b, 0x5c, 0x5b, 0x5c, 0x28, 0x5c, 0x29,
|
||||
// \ { \ } \ ^ \ $ \ | \ \ \ . ]
|
||||
0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0};
|
||||
// gRuleSet_rule_chars lists the characters that have special meaning, and therefore
|
||||
// need to be escaped to appear as literals in a regexp. "Rule Char" characters are the complement of this list.
|
||||
constexpr char16_t const *gRuleSet_rule_chars = u"*?+[(){}^$|\\.";
|
||||
|
||||
//
|
||||
// Here are the backslash escape characters that ICU's unescape() function
|
||||
// will handle.
|
||||
// The backslash escape characters that ICU's unescape() function will handle.
|
||||
//
|
||||
static const UChar gUnescapeCharPattern[] = {
|
||||
// [ a c e f n r t u U x ]
|
||||
0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x78, 0x5d, 0};
|
||||
|
||||
constexpr char16_t const *gUnescapeChars = u"acefnrtuUx";
|
||||
|
||||
//
|
||||
// Unicode Set Definitions for Regular Expression \w
|
||||
// Unicode Set pattern for Regular Expression \w
|
||||
//
|
||||
static const UChar gIsWordPattern[] = {
|
||||
// [ \ p { A l p h a b e t i c }
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x62, 0x65, 0x74, 0x69, 0x63, 0x7d,
|
||||
// \ p { M } Mark
|
||||
0x5c, 0x70, 0x7b, 0x4d, 0x7d,
|
||||
// \ p { N d } Digit_Numeric
|
||||
0x5c, 0x70, 0x7b, 0x4e, 0x64, 0x7d,
|
||||
// \ p { P c } Connector_Punctuation
|
||||
0x5c, 0x70, 0x7b, 0x50, 0x63, 0x7d,
|
||||
// \ u 2 0 0 c \ u 2 0 0 d ]
|
||||
0x5c, 0x75, 0x32, 0x30, 0x30, 0x63, 0x5c, 0x75, 0x32, 0x30, 0x30, 0x64, 0x5d, 0};
|
||||
|
||||
constexpr char16_t const *gIsWordPattern = u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]";
|
||||
|
||||
//
|
||||
// Unicode Set Definitions for Regular Expression \s
|
||||
//
|
||||
static const UChar gIsSpacePattern[] = {
|
||||
// [ \ p { W h i t e S p a c e } ]
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x57, 0x68, 0x69, 0x74, 0x65, 0x53, 0x70, 0x61, 0x63, 0x65, 0x7d, 0x5d, 0};
|
||||
|
||||
constexpr char16_t const *gIsSpacePattern = u"[\\p{WhiteSpace}]";
|
||||
|
||||
//
|
||||
// UnicodeSets used in implementation of Grapheme Cluster detection, \X
|
||||
//
|
||||
static const UChar gGC_ControlPattern[] = {
|
||||
// [ [ : Z l : ] [ : Z p : ]
|
||||
0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d,
|
||||
// [ : C c : ] [ : C f : ] -
|
||||
0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x2d,
|
||||
// [ : G r a p h e m e _
|
||||
0x5b, 0x3a, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f,
|
||||
// E x t e n d : ] ]
|
||||
0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x3a, 0x5d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_ExtendPattern[] = {
|
||||
// [ \ p { G r a p h e m e _
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f,
|
||||
// E x t e n d } ]
|
||||
0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_LPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = L } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_VPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = V } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_TPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = T } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_LVPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = L V } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_LVTPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = L V T } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x54, 0x7d, 0x5d, 0};
|
||||
constexpr char16_t const *gGC_ControlPattern = u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]";
|
||||
constexpr char16_t const *gGC_ExtendPattern = u"[\\p{Grapheme_Extend}]";
|
||||
constexpr char16_t const *gGC_LPattern = u"[\\p{Hangul_Syllable_Type=L}]";
|
||||
constexpr char16_t const *gGC_VPattern = u"[\\p{Hangul_Syllable_Type=V}]";
|
||||
constexpr char16_t const *gGC_TPattern = u"[\\p{Hangul_Syllable_Type=T}]";
|
||||
constexpr char16_t const *gGC_LVPattern = u"[\\p{Hangul_Syllable_Type=LV}]";
|
||||
constexpr char16_t const *gGC_LVTPattern = u"[\\p{Hangul_Syllable_Type=LVT}]";
|
||||
|
||||
|
||||
RegexStaticSets *RegexStaticSets::gStaticSets = NULL;
|
||||
RegexStaticSets *RegexStaticSets::gStaticSets = nullptr;
|
||||
UInitOnce gStaticSetsInitOnce = U_INITONCE_INITIALIZER;
|
||||
|
||||
RegexStaticSets::RegexStaticSets(UErrorCode *status)
|
||||
:
|
||||
fUnescapeCharSet(UnicodeString(TRUE, gUnescapeCharPattern, -1), *status),
|
||||
fRuleDigitsAlias(NULL),
|
||||
fEmptyText(NULL)
|
||||
{
|
||||
// First zero out everything
|
||||
int i;
|
||||
for (i=0; i<URX_LAST_SET; i++) {
|
||||
fPropSets[i] = NULL;
|
||||
}
|
||||
// Then init the sets to their correct values.
|
||||
fPropSets[URX_ISWORD_SET] = new UnicodeSet(UnicodeString(TRUE, gIsWordPattern, -1), *status);
|
||||
fPropSets[URX_ISSPACE_SET] = new UnicodeSet(UnicodeString(TRUE, gIsSpacePattern, -1), *status);
|
||||
fPropSets[URX_GC_EXTEND] = new UnicodeSet(UnicodeString(TRUE, gGC_ExtendPattern, -1), *status);
|
||||
fPropSets[URX_GC_CONTROL] = new UnicodeSet(UnicodeString(TRUE, gGC_ControlPattern, -1), *status);
|
||||
fPropSets[URX_GC_L] = new UnicodeSet(UnicodeString(TRUE, gGC_LPattern, -1), *status);
|
||||
fPropSets[URX_GC_V] = new UnicodeSet(UnicodeString(TRUE, gGC_VPattern, -1), *status);
|
||||
fPropSets[URX_GC_T] = new UnicodeSet(UnicodeString(TRUE, gGC_TPattern, -1), *status);
|
||||
fPropSets[URX_GC_LV] = new UnicodeSet(UnicodeString(TRUE, gGC_LVPattern, -1), *status);
|
||||
fPropSets[URX_GC_LVT] = new UnicodeSet(UnicodeString(TRUE, gGC_LVTPattern, -1), *status);
|
||||
|
||||
RegexStaticSets::RegexStaticSets(UErrorCode *status) {
|
||||
// Initialize the shared static sets to their correct values.
|
||||
fUnescapeCharSet.addAll(UnicodeString(true, gUnescapeChars, -1)).freeze();
|
||||
fPropSets[URX_ISWORD_SET].applyPattern(UnicodeString(true, gIsWordPattern, -1), *status).freeze();
|
||||
fPropSets[URX_ISSPACE_SET].applyPattern(UnicodeString(true, gIsSpacePattern, -1), *status).freeze();
|
||||
fPropSets[URX_GC_EXTEND].applyPattern(UnicodeString(TRUE, gGC_ExtendPattern, -1), *status).freeze();
|
||||
fPropSets[URX_GC_CONTROL].applyPattern(UnicodeString(TRUE, gGC_ControlPattern, -1), *status).freeze();
|
||||
fPropSets[URX_GC_L].applyPattern(UnicodeString(TRUE, gGC_LPattern, -1), *status).freeze();
|
||||
fPropSets[URX_GC_V].applyPattern(UnicodeString(TRUE, gGC_VPattern, -1), *status).freeze();
|
||||
fPropSets[URX_GC_T].applyPattern(UnicodeString(TRUE, gGC_TPattern, -1), *status).freeze();
|
||||
fPropSets[URX_GC_LV].applyPattern(UnicodeString(TRUE, gGC_LVPattern, -1), *status).freeze();
|
||||
fPropSets[URX_GC_LVT].applyPattern(UnicodeString(TRUE, gGC_LVTPattern, -1), *status).freeze();
|
||||
|
||||
// Check for null pointers
|
||||
if (fPropSets[URX_ISWORD_SET] == NULL || fPropSets[URX_ISSPACE_SET] == NULL || fPropSets[URX_GC_EXTEND] == NULL ||
|
||||
fPropSets[URX_GC_CONTROL] == NULL || fPropSets[URX_GC_L] == NULL || fPropSets[URX_GC_V] == NULL ||
|
||||
fPropSets[URX_GC_T] == NULL || fPropSets[URX_GC_LV] == NULL || fPropSets[URX_GC_LVT] == NULL) {
|
||||
goto ExitConstrDeleteAll;
|
||||
}
|
||||
if (U_FAILURE(*status)) {
|
||||
// Bail out if we were unable to create the above sets.
|
||||
// The rest of the initialization needs them, so we cannot proceed.
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// The following sets are dynamically constructed, because their
|
||||
// initialization strings would be unreasonable.
|
||||
//
|
||||
|
||||
|
||||
//
|
||||
// "Normal" is the set of characters that don't need special handling
|
||||
// when finding grapheme cluster boundaries.
|
||||
//
|
||||
fPropSets[URX_GC_NORMAL] = new UnicodeSet(0, UnicodeSet::MAX_VALUE);
|
||||
// Null pointer check
|
||||
if (fPropSets[URX_GC_NORMAL] == NULL) {
|
||||
goto ExitConstrDeleteAll;
|
||||
}
|
||||
fPropSets[URX_GC_NORMAL]->remove(0xac00, 0xd7a4);
|
||||
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_CONTROL]);
|
||||
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_L]);
|
||||
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_V]);
|
||||
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_T]);
|
||||
fPropSets[URX_GC_NORMAL].complement();
|
||||
fPropSets[URX_GC_NORMAL].remove(0xac00, 0xd7a4);
|
||||
fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_CONTROL]);
|
||||
fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_L]);
|
||||
fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_V]);
|
||||
fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_T]);
|
||||
fPropSets[URX_GC_NORMAL].freeze();
|
||||
|
||||
// Initialize the 8-bit fast bit sets from the parallel full
|
||||
// UnicodeSets.
|
||||
for (i=0; i<URX_LAST_SET; i++) {
|
||||
if (fPropSets[i]) {
|
||||
fPropSets[i]->compact();
|
||||
fPropSets8[i].init(fPropSets[i]);
|
||||
}
|
||||
//
|
||||
// TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping?
|
||||
// Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x"
|
||||
// This runs in exponential time, making it easy to adjust the time for
|
||||
// convenient measuring.
|
||||
//
|
||||
// This 8 bit optimization dates from the early days of ICU,
|
||||
// with a less optimized UnicodeSet. At the time, the difference
|
||||
// was substantial.
|
||||
|
||||
for (int32_t i=0; i<URX_LAST_SET; i++) {
|
||||
fPropSets8[i].init(&fPropSets[i]);
|
||||
}
|
||||
|
||||
// Sets used while parsing rules, but not referenced from the parse state table
|
||||
fRuleSets[kRuleSet_rule_char-128] = UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1), *status);
|
||||
fRuleSets[kRuleSet_digit_char-128].add((UChar)0x30, (UChar)0x39); // [0-9]
|
||||
fRuleSets[kRuleSet_ascii_letter-128].add((UChar)0x41, (UChar)0x5A); // [A-Z]
|
||||
fRuleSets[kRuleSet_ascii_letter-128].add((UChar)0x61, (UChar)0x7A); // [a-z]
|
||||
fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128];
|
||||
for (i=0; i<UPRV_LENGTHOF(fRuleSets); i++) {
|
||||
fRuleSets[i].compact();
|
||||
}
|
||||
|
||||
// Finally, initialize an empty string for utility purposes
|
||||
fEmptyText = utext_openUChars(NULL, NULL, 0, status);
|
||||
|
||||
if (U_SUCCESS(*status)) {
|
||||
return;
|
||||
}
|
||||
fRuleSets[kRuleSet_rule_char-128]
|
||||
.addAll(UnicodeString(gRuleSet_rule_chars)).complement().freeze();
|
||||
|
||||
ExitConstrDeleteAll: // Remove fPropSets and fRuleSets and return error
|
||||
for (i=0; i<URX_LAST_SET; i++) {
|
||||
delete fPropSets[i];
|
||||
fPropSets[i] = NULL;
|
||||
}
|
||||
if (U_SUCCESS(*status)) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
fRuleSets[kRuleSet_digit_char-128].add(u'0', u'9').freeze();
|
||||
fRuleSets[kRuleSet_ascii_letter-128].add(u'A', u'Z').add(u'a', u'z').freeze();
|
||||
fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128];
|
||||
|
||||
// Finally, initialize an empty UText string for utility purposes
|
||||
fEmptyText = utext_openUChars(nullptr, nullptr, 0, status);
|
||||
|
||||
}
|
||||
|
||||
|
||||
RegexStaticSets::~RegexStaticSets() {
|
||||
int32_t i;
|
||||
|
||||
for (i=0; i<URX_LAST_SET; i++) {
|
||||
delete fPropSets[i];
|
||||
fPropSets[i] = NULL;
|
||||
}
|
||||
fRuleDigitsAlias = NULL;
|
||||
|
||||
fRuleDigitsAlias = nullptr;
|
||||
utext_close(fEmptyText);
|
||||
}
|
||||
|
||||
@ -255,29 +140,25 @@ RegexStaticSets::~RegexStaticSets() {
|
||||
// cached memory. Called by ICU's u_cleanup() function.
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
UBool
|
||||
RegexStaticSets::cleanup(void) {
|
||||
delete RegexStaticSets::gStaticSets;
|
||||
RegexStaticSets::gStaticSets = NULL;
|
||||
gStaticSetsInitOnce.reset();
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
U_CDECL_BEGIN
|
||||
static UBool U_CALLCONV
|
||||
regex_cleanup(void) {
|
||||
return RegexStaticSets::cleanup();
|
||||
delete RegexStaticSets::gStaticSets;
|
||||
RegexStaticSets::gStaticSets = nullptr;
|
||||
gStaticSetsInitOnce.reset();
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
static void U_CALLCONV initStaticSets(UErrorCode &status) {
|
||||
U_ASSERT(RegexStaticSets::gStaticSets == NULL);
|
||||
U_ASSERT(RegexStaticSets::gStaticSets == nullptr);
|
||||
ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup);
|
||||
RegexStaticSets::gStaticSets = new RegexStaticSets(&status);
|
||||
if (U_FAILURE(status)) {
|
||||
delete RegexStaticSets::gStaticSets;
|
||||
RegexStaticSets::gStaticSets = NULL;
|
||||
RegexStaticSets::gStaticSets = nullptr;
|
||||
}
|
||||
if (RegexStaticSets::gStaticSets == NULL && U_SUCCESS(status)) {
|
||||
if (RegexStaticSets::gStaticSets == nullptr && U_SUCCESS(status)) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
}
|
||||
|
@ -25,6 +25,7 @@
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
#include "regeximp.h"
|
||||
#include "regexcst.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
@ -39,17 +40,16 @@ public:
|
||||
RegexStaticSets(UErrorCode *status);
|
||||
~RegexStaticSets();
|
||||
static void initGlobals(UErrorCode *status);
|
||||
static UBool cleanup();
|
||||
|
||||
UnicodeSet *fPropSets[URX_LAST_SET]; // The sets for common regex items, e.g. \s
|
||||
Regex8BitSet fPropSets8[URX_LAST_SET]; // Fast bitmap sets for latin-1 range for above.
|
||||
UnicodeSet fPropSets[URX_LAST_SET] {}; // The sets for common regex items, e.g. \s
|
||||
Regex8BitSet fPropSets8[URX_LAST_SET] {}; // Fast bitmap sets for latin-1 range for above.
|
||||
|
||||
UnicodeSet fRuleSets[10]; // Sets used while parsing regexp patterns.
|
||||
UnicodeSet fUnescapeCharSet; // Set of chars handled by unescape when
|
||||
// encountered with a \ in a pattern.
|
||||
UnicodeSet *fRuleDigitsAlias;
|
||||
UText *fEmptyText; // An empty string, to be used when a matcher
|
||||
// is created with no input.
|
||||
UnicodeSet fRuleSets[kRuleSet_count] {}; // Sets used while parsing regexp patterns.
|
||||
UnicodeSet fUnescapeCharSet {}; // Set of chars handled by unescape when
|
||||
// encountered with a \ in a pattern.
|
||||
UnicodeSet *fRuleDigitsAlias {};
|
||||
UText *fEmptyText {}; // An empty string, to be used when a matcher
|
||||
// is created with no input.
|
||||
|
||||
};
|
||||
|
||||
|
@ -2542,7 +2542,7 @@ UBool RegexMatcher::isWordBoundary(int64_t pos) {
|
||||
// Current char is a combining one. Not a boundary.
|
||||
return FALSE;
|
||||
}
|
||||
cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
|
||||
cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
|
||||
}
|
||||
|
||||
// Back up until we come to a non-combining char, determine whether
|
||||
@ -2555,7 +2555,7 @@ UBool RegexMatcher::isWordBoundary(int64_t pos) {
|
||||
UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
|
||||
if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
|
||||
|| u_charType(prevChar) == U_FORMAT_CHAR)) {
|
||||
prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
|
||||
prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -2580,7 +2580,7 @@ UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
|
||||
// Current char is a combining one. Not a boundary.
|
||||
return FALSE;
|
||||
}
|
||||
cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
|
||||
cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
|
||||
}
|
||||
|
||||
// Back up until we come to a non-combining char, determine whether
|
||||
@ -2594,7 +2594,7 @@ UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
|
||||
U16_PREV(inputBuf, fLookStart, pos, prevChar);
|
||||
if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
|
||||
|| u_charType(prevChar) == U_FORMAT_CHAR)) {
|
||||
prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
|
||||
prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -3203,14 +3203,14 @@ void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
|
||||
UChar32 c;
|
||||
c = UTEXT_NEXT32(fInputText);
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
UnicodeSet **sets = fPattern->fStaticSets;
|
||||
if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
|
||||
if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
|
||||
if (sets[URX_GC_L]->contains(c)) goto GC_L;
|
||||
if (sets[URX_GC_LV]->contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_V]->contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_T]->contains(c)) goto GC_T;
|
||||
UnicodeSet *sets = RegexStaticSets::gStaticSets->fPropSets;
|
||||
if (sets[URX_GC_NORMAL].contains(c)) goto GC_Extend;
|
||||
if (sets[URX_GC_CONTROL].contains(c)) goto GC_Control;
|
||||
if (sets[URX_GC_L].contains(c)) goto GC_L;
|
||||
if (sets[URX_GC_LV].contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_LVT].contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_V].contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_T].contains(c)) goto GC_T;
|
||||
goto GC_Extend;
|
||||
|
||||
|
||||
@ -3219,10 +3219,10 @@ GC_L:
|
||||
if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
|
||||
c = UTEXT_NEXT32(fInputText);
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
if (sets[URX_GC_L]->contains(c)) goto GC_L;
|
||||
if (sets[URX_GC_LV]->contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_V]->contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_L].contains(c)) goto GC_L;
|
||||
if (sets[URX_GC_LV].contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_LVT].contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_V].contains(c)) goto GC_V;
|
||||
(void)UTEXT_PREVIOUS32(fInputText);
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
goto GC_Extend;
|
||||
@ -3231,8 +3231,8 @@ GC_V:
|
||||
if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
|
||||
c = UTEXT_NEXT32(fInputText);
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
if (sets[URX_GC_V]->contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_T]->contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_V].contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_T].contains(c)) goto GC_T;
|
||||
(void)UTEXT_PREVIOUS32(fInputText);
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
goto GC_Extend;
|
||||
@ -3241,7 +3241,7 @@ GC_T:
|
||||
if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
|
||||
c = UTEXT_NEXT32(fInputText);
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
if (sets[URX_GC_T]->contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_T].contains(c)) goto GC_T;
|
||||
(void)UTEXT_PREVIOUS32(fInputText);
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
goto GC_Extend;
|
||||
@ -3253,7 +3253,7 @@ GC_Extend:
|
||||
break;
|
||||
}
|
||||
c = UTEXT_CURRENT32(fInputText);
|
||||
if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
|
||||
if (sets[URX_GC_EXTEND].contains(c) == FALSE) {
|
||||
break;
|
||||
}
|
||||
(void)UTEXT_NEXT32(fInputText);
|
||||
@ -3310,13 +3310,13 @@ GC_Done:
|
||||
UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
|
||||
UChar32 c = UTEXT_NEXT32(fInputText);
|
||||
if (c < 256) {
|
||||
Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
|
||||
if (s8->contains(c)) {
|
||||
Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
|
||||
if (s8.contains(c)) {
|
||||
success = !success;
|
||||
}
|
||||
} else {
|
||||
const UnicodeSet *s = fPattern->fStaticSets[opValue];
|
||||
if (s->contains(c)) {
|
||||
const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
|
||||
if (s.contains(c)) {
|
||||
success = !success;
|
||||
}
|
||||
}
|
||||
@ -3346,14 +3346,14 @@ GC_Done:
|
||||
|
||||
UChar32 c = UTEXT_NEXT32(fInputText);
|
||||
if (c < 256) {
|
||||
Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
|
||||
if (s8->contains(c) == FALSE) {
|
||||
Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
|
||||
if (s8.contains(c) == FALSE) {
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
const UnicodeSet *s = fPattern->fStaticSets[opValue];
|
||||
if (s->contains(c) == FALSE) {
|
||||
const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
|
||||
if (s.contains(c) == FALSE) {
|
||||
fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
|
||||
break;
|
||||
}
|
||||
@ -4778,14 +4778,14 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
||||
// Dispatch into a little state machine, based on the char.
|
||||
UChar32 c;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
||||
UnicodeSet **sets = fPattern->fStaticSets;
|
||||
if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
|
||||
if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
|
||||
if (sets[URX_GC_L]->contains(c)) goto GC_L;
|
||||
if (sets[URX_GC_LV]->contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_V]->contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_T]->contains(c)) goto GC_T;
|
||||
UnicodeSet *sets = RegexStaticSets::gStaticSets->fPropSets;
|
||||
if (sets[URX_GC_NORMAL].contains(c)) goto GC_Extend;
|
||||
if (sets[URX_GC_CONTROL].contains(c)) goto GC_Control;
|
||||
if (sets[URX_GC_L].contains(c)) goto GC_L;
|
||||
if (sets[URX_GC_LV].contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_LVT].contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_V].contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_T].contains(c)) goto GC_T;
|
||||
goto GC_Extend;
|
||||
|
||||
|
||||
@ -4793,25 +4793,25 @@ void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &statu
|
||||
GC_L:
|
||||
if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
||||
if (sets[URX_GC_L]->contains(c)) goto GC_L;
|
||||
if (sets[URX_GC_LV]->contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_V]->contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_L].contains(c)) goto GC_L;
|
||||
if (sets[URX_GC_LV].contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_LVT].contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_V].contains(c)) goto GC_V;
|
||||
U16_PREV(inputBuf, 0, fp->fInputIdx, c);
|
||||
goto GC_Extend;
|
||||
|
||||
GC_V:
|
||||
if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
||||
if (sets[URX_GC_V]->contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_T]->contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_V].contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_T].contains(c)) goto GC_T;
|
||||
U16_PREV(inputBuf, 0, fp->fInputIdx, c);
|
||||
goto GC_Extend;
|
||||
|
||||
GC_T:
|
||||
if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
||||
if (sets[URX_GC_T]->contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_T].contains(c)) goto GC_T;
|
||||
U16_PREV(inputBuf, 0, fp->fInputIdx, c);
|
||||
goto GC_Extend;
|
||||
|
||||
@ -4822,7 +4822,7 @@ GC_Extend:
|
||||
break;
|
||||
}
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
||||
if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
|
||||
if (sets[URX_GC_EXTEND].contains(c) == FALSE) {
|
||||
U16_BACK_1(inputBuf, 0, fp->fInputIdx);
|
||||
break;
|
||||
}
|
||||
@ -4877,13 +4877,13 @@ GC_Done:
|
||||
UChar32 c;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
||||
if (c < 256) {
|
||||
Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
|
||||
if (s8->contains(c)) {
|
||||
Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
|
||||
if (s8.contains(c)) {
|
||||
success = !success;
|
||||
}
|
||||
} else {
|
||||
const UnicodeSet *s = fPattern->fStaticSets[opValue];
|
||||
if (s->contains(c)) {
|
||||
const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
|
||||
if (s.contains(c)) {
|
||||
success = !success;
|
||||
}
|
||||
}
|
||||
@ -4909,13 +4909,13 @@ GC_Done:
|
||||
UChar32 c;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
|
||||
if (c < 256) {
|
||||
Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
|
||||
if (s8->contains(c) == FALSE) {
|
||||
Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
|
||||
if (s8.contains(c) == FALSE) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
const UnicodeSet *s = fPattern->fStaticSets[opValue];
|
||||
if (s->contains(c) == FALSE) {
|
||||
const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
|
||||
if (s.contains(c) == FALSE) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -97,8 +97,6 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
||||
fMinMatchLen = other.fMinMatchLen;
|
||||
fFrameSize = other.fFrameSize;
|
||||
fDataSize = other.fDataSize;
|
||||
fStaticSets = other.fStaticSets;
|
||||
fStaticSets8 = other.fStaticSets8;
|
||||
|
||||
fStartType = other.fStartType;
|
||||
fInitialStringIdx = other.fInitialStringIdx;
|
||||
@ -175,8 +173,6 @@ void RegexPattern::init() {
|
||||
fFrameSize = 0;
|
||||
fDataSize = 0;
|
||||
fGroupMap = NULL;
|
||||
fStaticSets = NULL;
|
||||
fStaticSets8 = NULL;
|
||||
fStartType = START_NO_INFO;
|
||||
fInitialStringIdx = 0;
|
||||
fInitialStringLen = 0;
|
||||
@ -805,8 +801,8 @@ void RegexPattern::dumpOp(int32_t index) const {
|
||||
printf("NOT ");
|
||||
val &= ~URX_NEG_SET;
|
||||
}
|
||||
UnicodeSet *set = fStaticSets[val];
|
||||
set->toPattern(s, TRUE);
|
||||
UnicodeSet &set = RegexStaticSets::gStaticSets->fPropSets[val];
|
||||
set.toPattern(s, TRUE);
|
||||
printf("%s", CStr(s)());
|
||||
}
|
||||
break;
|
||||
|
@ -612,12 +612,6 @@ private:
|
||||
UVector32 *fGroupMap; // Map from capture group number to position of
|
||||
// the group's variables in the matcher stack frame.
|
||||
|
||||
UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
|
||||
// regex character classes, e.g. Word.
|
||||
|
||||
Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
|
||||
// sets for predefined regex classes.
|
||||
|
||||
int32_t fStartType; // Info on how a match must start.
|
||||
int32_t fInitialStringIdx; //
|
||||
int32_t fInitialStringLen;
|
||||
|
@ -3500,11 +3500,15 @@ void RegexTest::regex_find(const UnicodeString &pattern,
|
||||
// positions.
|
||||
//
|
||||
parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
|
||||
REGEX_CHECK_STATUS_L(line);
|
||||
if (!assertSuccess(WHERE, status) ) {
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
|
||||
unEscapedInput = inputString.unescape();
|
||||
parseMatcher = parsePat->matcher(unEscapedInput, status);
|
||||
REGEX_CHECK_STATUS_L(line);
|
||||
if (!assertSuccess(WHERE, status) ) {
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
while(parseMatcher->find()) {
|
||||
parseMatcher->appendReplacement(deTaggedInput, "", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
@ -4203,6 +4207,8 @@ void RegexTest::PerlTests() {
|
||||
if (expected != found) {
|
||||
errln("line %d: Expected %smatch, got %smatch",
|
||||
lineNum, expected?"":"no ", found?"":"no " );
|
||||
delete testMat;
|
||||
delete testPat;
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -4598,6 +4604,8 @@ void RegexTest::PerlTestsUTF8() {
|
||||
if (expected != found) {
|
||||
errln("line %d: Expected %smatch, got %smatch",
|
||||
lineNum, expected?"":"no ", found?"":"no " );
|
||||
delete testMat;
|
||||
delete testPat;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user