ICU-105 Regular Expressions, ongoing development
X-SVN-Rev: 10063
This commit is contained in:
parent
08ca9c365b
commit
425ac49187
@ -503,6 +503,7 @@ typedef enum UErrorCode {
|
||||
U_REGEX_ERROR_START=0x10300,
|
||||
U_REGEX_INTERNAL_ERROR,
|
||||
U_REGEX_INVALID_STATE,
|
||||
U_REGEX_BAD_ESCAPE_SEQUENCE,
|
||||
U_REGEX_ERROR_LIMIT,
|
||||
|
||||
U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
|
||||
|
@ -59,22 +59,21 @@ static const UChar gRuleSet_rule_char_pattern[] = {
|
||||
0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0};
|
||||
|
||||
|
||||
static const UChar gRuleSet_name_char_pattern[] = {
|
||||
// [ _ \ p { L } \ p { N } ]
|
||||
0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gRuleSet_digit_char_pattern[] = {
|
||||
// [ 0 - 9 ]
|
||||
0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
|
||||
|
||||
static const UChar gRuleSet_name_start_char_pattern[] = {
|
||||
// [ _ \ p { L } ]
|
||||
0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 };
|
||||
|
||||
static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any"
|
||||
|
||||
static UnicodeSet *gRuleSets[10]; // Array of ptrs to the actual UnicodeSet objects.
|
||||
static UnicodeSet *gUnescapeCharSet;
|
||||
|
||||
//
|
||||
// These are the backslash escape characters that ICU's unescape
|
||||
// will handle.
|
||||
//
|
||||
// UnicodeSet pattern listing the characters that may follow a backslash
// for the escape to be handed to unescapeAt().  The array is passed as a
// bare UChar pointer (no length) when gUnescapeCharSet is constructed, so,
// like the other pattern arrays in this file, it must end with a 0.
static const UChar gUnescapeCharPattern[] = {
//    [     a     b     c     e     f     n     r     t     u     U     ]
    0x5b, 0x61, 0x62, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x5d, 0};
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
@ -88,7 +87,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
|
||||
|
||||
fScanIndex = 0;
|
||||
fNextIndex = 0;
|
||||
|
||||
fPeekChar = -1;
|
||||
fLineNum = 1;
|
||||
fCharNum = 0;
|
||||
fQuoteMode = FALSE;
|
||||
@ -110,13 +109,16 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
|
||||
gRuleSets[kRuleSet_rule_char-128] = new UnicodeSet(gRuleSet_rule_char_pattern, status);
|
||||
gRuleSets[kRuleSet_white_space-128] = new UnicodeSet(UnicodePropertySet::getRuleWhiteSpaceSet(status));
|
||||
gRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(gRuleSet_digit_char_pattern, status);
|
||||
gUnescapeCharSet = new UnicodeSet(gUnescapeCharPattern, status);
|
||||
if (U_FAILURE(status)) {
|
||||
delete gRuleSets[kRuleSet_rule_char-128];
|
||||
delete gRuleSets[kRuleSet_white_space-128];
|
||||
delete gRuleSets[kRuleSet_digit_char-128];
|
||||
delete gUnescapeCharSet;
|
||||
gRuleSets[kRuleSet_rule_char-128] = NULL;
|
||||
gRuleSets[kRuleSet_white_space-128] = NULL;
|
||||
gRuleSets[kRuleSet_digit_char-128] = NULL;
|
||||
gUnescapeCharSet = NULL;
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -218,7 +220,7 @@ void RegexCompile::compile(
|
||||
// Table row specified "quoted" and the char was quoted.
|
||||
break;
|
||||
}
|
||||
if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) {
|
||||
if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) {
|
||||
// Table row specified eof and we hit eof on the input.
|
||||
break;
|
||||
}
|
||||
@ -605,14 +607,15 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
break;
|
||||
|
||||
|
||||
|
||||
case doDotAny:
|
||||
// scanned a ".", match any single character.
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOTANY, 0), *fStatus);
|
||||
break;
|
||||
|
||||
|
||||
case doExprFinished:
|
||||
case doBackslashA:
|
||||
// Scanned a "\A".
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_A, 0), *fStatus);
|
||||
break;
|
||||
|
||||
case doExit:
|
||||
@ -816,6 +819,11 @@ UChar32 RegexCompile::nextCharLL() {
|
||||
UChar32 ch;
|
||||
UnicodeString &pattern = fRXPat->fPattern;
|
||||
|
||||
if (fPeekChar != -1) {
|
||||
ch = fPeekChar;
|
||||
fPeekChar = -1;
|
||||
return ch;
|
||||
}
|
||||
if (fPatternLength==0 || fNextIndex >= fPatternLength) {
|
||||
return (UChar32)-1;
|
||||
}
|
||||
@ -846,12 +854,25 @@ UChar32 RegexCompile::nextCharLL() {
|
||||
return ch;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------
|
||||
//
|
||||
// peekCharLL Low Level Character Scanning, sneak a peek at the next
|
||||
// character without actually getting it.
|
||||
//
|
||||
//---------------------------------------------------------------------------------
|
||||
// Look at the next low-level character without consuming it.  The character
// is pulled from nextCharLL() and parked in fPeekChar; nextCharLL() hands
// the parked character back (and clears fPeekChar) on its next call.
UChar32 RegexCompile::peekCharLL() {
    if (fPeekChar != -1) {
        // Something is already parked from an earlier peek.
        return fPeekChar;
    }
    fPeekChar = nextCharLL();
    return fPeekChar;
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------
|
||||
//
|
||||
// nextChar for rules scanning. At this level, we handle stripping
|
||||
// out comments and processing backslash character escapes.
|
||||
// The rest of the rules grammar is handled at the next level up.
|
||||
// nextChar for pattern scanning. At this level, we handle stripping
|
||||
// out comments and processing some backslash character escapes.
|
||||
// The rest of the pattern grammar is handled at the next level up.
|
||||
//
|
||||
//---------------------------------------------------------------------------------
|
||||
void RegexCompile::nextChar(RegexPatternChar &c) {
|
||||
@ -870,7 +891,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
|
||||
{
|
||||
// We are not in a 'quoted region' of the source.
|
||||
//
|
||||
if (c.fChar == chPound) {
|
||||
if (fFreeForm && c.fChar == chPound) {
|
||||
// Start of a comment. Consume the rest of it.
|
||||
// The new-line char that terminates the comment is always returned.
|
||||
// It will be treated as white-space, and serves to break up anything
|
||||
@ -891,16 +912,22 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
|
||||
|
||||
//
|
||||
// check for backslash escaped characters.
|
||||
// Use UnicodeString::unescapeAt() to handle them.
|
||||
// Use UnicodeString::unescapeAt() to handle those that it can.
|
||||
// Otherwise just return the '\', and let the pattern parser deal with it.
|
||||
//
|
||||
int32_t startX = fNextIndex; // start and end positions of the
|
||||
int32_t endX = fNextIndex; // sequence following the '\'
|
||||
if (c.fChar == chBackSlash) {
|
||||
c.fQuoted = TRUE;
|
||||
int32_t startX = fNextIndex;
|
||||
c.fChar = fRXPat->fPattern.unescapeAt(fNextIndex);
|
||||
if (fNextIndex == startX) {
|
||||
error(U_BRK_HEX_DIGITS_EXPECTED);
|
||||
if (gUnescapeCharSet->contains(peekCharLL())) {
|
||||
nextCharLL(); // get & discard the peeked char.
|
||||
c.fQuoted = TRUE;
|
||||
c.fChar = fRXPat->fPattern.unescapeAt(endX);
|
||||
if (startX == endX) {
|
||||
error(U_REGEX_BAD_ESCAPE_SEQUENCE);
|
||||
}
|
||||
fCharNum += endX - startX;
|
||||
fNextIndex = endX;
|
||||
}
|
||||
fCharNum += fNextIndex-startX;
|
||||
}
|
||||
}
|
||||
// putc(c.fChar, stdout);
|
||||
|
@ -65,8 +65,6 @@ public:
|
||||
|
||||
void nextChar(RegexPatternChar &c); // Get the next char from the input stream.
|
||||
|
||||
UBool push(const RegexPatternChar &c); // Push (unget) one character.
|
||||
// Only a single character may be pushed.
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
||||
@ -88,6 +86,7 @@ private:
|
||||
void error(UErrorCode e); // error reporting convenience function.
|
||||
|
||||
UChar32 nextCharLL();
|
||||
UChar32 peekCharLL();
|
||||
UnicodeSet *scanSet();
|
||||
void handleCloseParen();
|
||||
int32_t blockTopLoc(); // Locate a position in the compiled pattern
|
||||
@ -99,6 +98,9 @@ private:
|
||||
RegexPattern *fRXPat;
|
||||
UParseError *fParseErr;
|
||||
|
||||
//
|
||||
// Data associated with low level character scanning
|
||||
//
|
||||
int32_t fScanIndex; // Index of current character being processed
|
||||
// in the rule input string.
|
||||
int32_t fNextIndex; // Index of the next character, which
|
||||
@ -109,6 +111,8 @@ private:
|
||||
int fCharNum; // Char position within the line.
|
||||
UChar32 fLastChar; // Previous char, needed to count CR-LF
|
||||
// as a single line, not two.
|
||||
UChar32 fPeekChar; // Saved char, if we've scanned ahead.
|
||||
|
||||
|
||||
RegexPatternChar fC; // Current char for parse state machine
|
||||
// processing.
|
||||
|
@ -40,6 +40,7 @@ enum Regex_PatternParseAction {
|
||||
doOpenLookAheadNeg,
|
||||
doPlus,
|
||||
doOpenNonCaptureParen,
|
||||
doBackslashA,
|
||||
doNGPlus,
|
||||
doPatFinish,
|
||||
doIntervalMinValue,
|
||||
@ -51,7 +52,6 @@ enum Regex_PatternParseAction {
|
||||
doOpenLookAhead,
|
||||
doNumberExpectedError,
|
||||
doDotAny,
|
||||
doExprFinished,
|
||||
doScanUnicodeSet,
|
||||
doNOP,
|
||||
doExit,
|
||||
@ -80,71 +80,65 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
||||
{doNOP, 0, 0, 0, TRUE}
|
||||
, {doPatStart, 255, 3, 2, FALSE} // 1 start
|
||||
, {doPatFinish, 255, 2,0, FALSE} // 2 finish
|
||||
, {doStartString, 254, 10,0, TRUE} // 3 term
|
||||
, {doStartString, 130, 10,0, TRUE} // 4
|
||||
, {doScanUnicodeSet, 91 /* [ */, 17,0, TRUE} // 5
|
||||
, {doNOP, 40 /* ( */, 29, 17, TRUE} // 6
|
||||
, {doDotAny, 46 /* . */, 17,0, TRUE} // 7
|
||||
, {doNOP, 253, 255,0, FALSE} // 8
|
||||
, {doRuleError, 255, 67,0, FALSE} // 9
|
||||
, {doStringChar, 254, 10,0, TRUE} // 10 string
|
||||
, {doStringChar, 130, 10,0, TRUE} // 11
|
||||
, {doSplitString, 63 /* ? */, 17,0, FALSE} // 12
|
||||
, {doSplitString, 43 /* + */, 17,0, FALSE} // 13
|
||||
, {doSplitString, 42 /* * */, 17,0, FALSE} // 14
|
||||
, {doSplitString, 123 /* { */, 17,0, FALSE} // 15
|
||||
, {doEndString, 255, 17,0, FALSE} // 16
|
||||
, {doNOP, 42 /* * */, 40,0, TRUE} // 17 expr-quant
|
||||
, {doNOP, 43 /* + */, 43,0, TRUE} // 18
|
||||
, {doNOP, 63 /* ? */, 46,0, TRUE} // 19
|
||||
, {doNOP, 255, 21,0, FALSE} // 20
|
||||
, {doNOP, 254, 3,0, FALSE} // 21 expr-cont
|
||||
, {doNOP, 130, 3,0, FALSE} // 22
|
||||
, {doNOP, 91 /* [ */, 3,0, FALSE} // 23
|
||||
, {doNOP, 40 /* ( */, 3,0, FALSE} // 24
|
||||
, {doNOP, 46 /* . */, 3,0, FALSE} // 25
|
||||
, {doOrOperator, 124 /* | */, 3,0, TRUE} // 26
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 27
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 28
|
||||
, {doNOP, 63 /* ? */, 31,0, TRUE} // 29 open-paren
|
||||
, {doOpenCaptureParen, 255, 3, 17, FALSE} // 30
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 3, 17, TRUE} // 31 open-paren-extended
|
||||
, {doOpenAtomicParen, 62 /* > */, 3, 17, TRUE} // 32
|
||||
, {doOpenLookAhead, 61 /* = */, 3, 21, TRUE} // 33
|
||||
, {doOpenLookAheadNeg, 33 /* ! */, 3, 21, TRUE} // 34
|
||||
, {doNOP, 60 /* < */, 37,0, TRUE} // 35
|
||||
, {doBadOpenParenType, 255, 67,0, FALSE} // 36
|
||||
, {doOpenLookBehind, 61 /* = */, 3, 21, TRUE} // 37 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 3, 21, TRUE} // 38
|
||||
, {doBadOpenParenType, 255, 67,0, FALSE} // 39
|
||||
, {doNGStar, 63 /* ? */, 21,0, TRUE} // 40 quant-star
|
||||
, {doPossesiveStar, 43 /* + */, 21,0, TRUE} // 41
|
||||
, {doStar, 255, 21,0, FALSE} // 42
|
||||
, {doNGPlus, 63 /* ? */, 21,0, TRUE} // 43 quant-plus
|
||||
, {doPossesivePlus, 43 /* + */, 21,0, TRUE} // 44
|
||||
, {doPlus, 255, 21,0, FALSE} // 45
|
||||
, {doNGOpt, 63 /* ? */, 21,0, TRUE} // 46 quant-opt
|
||||
, {doPossesiveOpt, 43 /* + */, 21,0, TRUE} // 47
|
||||
, {doOpt, 255, 21,0, FALSE} // 48
|
||||
, {doNOP, 129, 49,0, TRUE} // 49 interval-open
|
||||
, {doIntervalMinValue, 128, 52,0, FALSE} // 50
|
||||
, {doNumberExpectedError, 255, 67,0, FALSE} // 51
|
||||
, {doNOP, 129, 56,0, TRUE} // 52 interval-value
|
||||
, {doNOP, 125 /* } */, 56,0, FALSE} // 53
|
||||
, {doIntervalDigit, 128, 52,0, TRUE} // 54
|
||||
, {doNumberExpectedError, 255, 67,0, FALSE} // 55
|
||||
, {doNOP, 129, 56,0, TRUE} // 56 interval-close
|
||||
, {doTagValue, 125 /* } */, 59,0, TRUE} // 57
|
||||
, {doNumberExpectedError, 255, 67,0, FALSE} // 58
|
||||
, {doNOP, 254, 3,0, FALSE} // 59 expr-cont-no-interval
|
||||
, {doNOP, 130, 3,0, FALSE} // 60
|
||||
, {doNOP, 91 /* [ */, 3,0, FALSE} // 61
|
||||
, {doNOP, 40 /* ( */, 3,0, FALSE} // 62
|
||||
, {doNOP, 46 /* . */, 3,0, FALSE} // 63
|
||||
, {doExprOrOperator, 124 /* | */, 3,0, TRUE} // 64
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 65
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 66
|
||||
, {doExit, 255, 67,0, TRUE} // 67 errorDeath
|
||||
, {doStartString, 254, 11,0, TRUE} // 3 term
|
||||
, {doStartString, 130, 11,0, TRUE} // 4
|
||||
, {doScanUnicodeSet, 91 /* [ */, 18,0, TRUE} // 5
|
||||
, {doNOP, 40 /* ( */, 25, 18, TRUE} // 6
|
||||
, {doDotAny, 46 /* . */, 18,0, TRUE} // 7
|
||||
, {doNOP, 92 /* \ */, 59,0, TRUE} // 8
|
||||
, {doNOP, 253, 2,0, FALSE} // 9
|
||||
, {doRuleError, 255, 61,0, FALSE} // 10
|
||||
, {doStringChar, 254, 11,0, TRUE} // 11 string
|
||||
, {doStringChar, 130, 11,0, TRUE} // 12
|
||||
, {doSplitString, 63 /* ? */, 18,0, FALSE} // 13
|
||||
, {doSplitString, 43 /* + */, 18,0, FALSE} // 14
|
||||
, {doSplitString, 42 /* * */, 18,0, FALSE} // 15
|
||||
, {doSplitString, 123 /* { */, 18,0, FALSE} // 16
|
||||
, {doEndString, 255, 18,0, FALSE} // 17
|
||||
, {doNOP, 42 /* * */, 36,0, TRUE} // 18 expr-quant
|
||||
, {doNOP, 43 /* + */, 39,0, TRUE} // 19
|
||||
, {doNOP, 63 /* ? */, 42,0, TRUE} // 20
|
||||
, {doNOP, 255, 22,0, FALSE} // 21
|
||||
, {doOrOperator, 124 /* | */, 3,0, TRUE} // 22 expr-cont
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 23
|
||||
, {doNOP, 255, 3,0, FALSE} // 24
|
||||
, {doNOP, 63 /* ? */, 27,0, TRUE} // 25 open-paren
|
||||
, {doOpenCaptureParen, 255, 3, 18, FALSE} // 26
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 3, 18, TRUE} // 27 open-paren-extended
|
||||
, {doOpenAtomicParen, 62 /* > */, 3, 18, TRUE} // 28
|
||||
, {doOpenLookAhead, 61 /* = */, 3, 22, TRUE} // 29
|
||||
, {doOpenLookAheadNeg, 33 /* ! */, 3, 22, TRUE} // 30
|
||||
, {doNOP, 60 /* < */, 33,0, TRUE} // 31
|
||||
, {doBadOpenParenType, 255, 61,0, FALSE} // 32
|
||||
, {doOpenLookBehind, 61 /* = */, 3, 22, TRUE} // 33 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 3, 22, TRUE} // 34
|
||||
, {doBadOpenParenType, 255, 61,0, FALSE} // 35
|
||||
, {doNGStar, 63 /* ? */, 22,0, TRUE} // 36 quant-star
|
||||
, {doPossesiveStar, 43 /* + */, 22,0, TRUE} // 37
|
||||
, {doStar, 255, 22,0, FALSE} // 38
|
||||
, {doNGPlus, 63 /* ? */, 22,0, TRUE} // 39 quant-plus
|
||||
, {doPossesivePlus, 43 /* + */, 22,0, TRUE} // 40
|
||||
, {doPlus, 255, 22,0, FALSE} // 41
|
||||
, {doNGOpt, 63 /* ? */, 22,0, TRUE} // 42 quant-opt
|
||||
, {doPossesiveOpt, 43 /* + */, 22,0, TRUE} // 43
|
||||
, {doOpt, 255, 22,0, FALSE} // 44
|
||||
, {doNOP, 129, 45,0, TRUE} // 45 interval-open
|
||||
, {doIntervalMinValue, 128, 48,0, FALSE} // 46
|
||||
, {doNumberExpectedError, 255, 61,0, FALSE} // 47
|
||||
, {doNOP, 129, 52,0, TRUE} // 48 interval-value
|
||||
, {doNOP, 125 /* } */, 52,0, FALSE} // 49
|
||||
, {doIntervalDigit, 128, 48,0, TRUE} // 50
|
||||
, {doNumberExpectedError, 255, 61,0, FALSE} // 51
|
||||
, {doNOP, 129, 52,0, TRUE} // 52 interval-close
|
||||
, {doTagValue, 125 /* } */, 55,0, TRUE} // 53
|
||||
, {doNumberExpectedError, 255, 61,0, FALSE} // 54
|
||||
, {doNOP, 254, 3,0, FALSE} // 55 expr-cont-no-interval
|
||||
, {doExprOrOperator, 124 /* | */, 3,0, TRUE} // 56
|
||||
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 57
|
||||
, {doNOP, 255, 3,0, FALSE} // 58
|
||||
, {doBackslashA, 65 /* A */, 3,0, TRUE} // 59 backslash
|
||||
, {doStartString, 255, 11,0, TRUE} // 60
|
||||
, {doExit, 255, 61,0, TRUE} // 61 errorDeath
|
||||
};
|
||||
static const char *RegexStateNames[] = { 0,
|
||||
"start",
|
||||
@ -155,6 +149,7 @@ static const char *RegexStateNames[] = { 0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"string",
|
||||
0,
|
||||
@ -169,11 +164,6 @@ static const char *RegexStateNames[] = { 0,
|
||||
0,
|
||||
"expr-cont",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"open-paren",
|
||||
0,
|
||||
@ -209,9 +199,7 @@ static const char *RegexStateNames[] = { 0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"backslash",
|
||||
0,
|
||||
"errorDeath",
|
||||
0};
|
||||
|
@ -77,7 +77,8 @@ term:
|
||||
'[' n expr-quant doScanUnicodeSet
|
||||
'(' n open-paren ^expr-quant
|
||||
'.' n expr-quant doDotAny
|
||||
eof pop
|
||||
'\' n backslash
|
||||
eof finish
|
||||
default errorDeath doRuleError
|
||||
|
||||
|
||||
@ -110,17 +111,12 @@ expr-quant:
|
||||
|
||||
#
|
||||
# expr-cont Expression, continuation. At a point where additional terms are
|
||||
# allowed, but not required.
|
||||
# allowed, but not required. No Quantifiers
|
||||
#
|
||||
expr-cont:
|
||||
quoted term
|
||||
rule_char term
|
||||
'[' term
|
||||
'(' term
|
||||
'.' term
|
||||
'|' n term doOrOperator
|
||||
')' n pop doCloseParen
|
||||
default pop doExprFinished
|
||||
default term
|
||||
|
||||
|
||||
#
|
||||
@ -205,16 +201,18 @@ interval-close:
|
||||
#
|
||||
expr-cont-no-interval:
|
||||
quoted term
|
||||
rule_char term
|
||||
'[' term
|
||||
'(' term
|
||||
'.' term
|
||||
'|' n term doExprOrOperator
|
||||
')' n pop doExprRParen
|
||||
default pop doExprFinished
|
||||
default term
|
||||
|
||||
|
||||
|
||||
#
|
||||
# backslash # Backslash. Figure out which of the \thingies we have encountered.
|
||||
# The low level next-char function will have preprocessed
|
||||
# some of them already; those won't come here.
|
||||
backslash:
|
||||
'A' n term doBackslashA
|
||||
default n string doStartString
|
||||
|
||||
|
||||
|
||||
|
@ -26,7 +26,7 @@ static const uint32_t URX_STATE_SAVE = 6; // Value field is pattern po
|
||||
static const uint32_t URX_NOP = 7;
|
||||
static const uint32_t URX_START_CAPTURE = 8; // Value field is capture group number.
|
||||
static const uint32_t URX_END_CAPTURE = 9; // Value field is capture group number
|
||||
static const uint32_t URX_UNUSED10 = 10; // Value field is index in pattern to
|
||||
static const uint32_t URX_BACKSLASH_A = 10; // Value field is index in pattern to
|
||||
// loop back to.
|
||||
static const uint32_t URX_SETREF = 11; // Value field is index of set in array of sets.
|
||||
static const uint32_t URX_DOTANY = 12;
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/regex.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "uassert.h"
|
||||
#include "uvector.h"
|
||||
#include "regeximp.h"
|
||||
@ -54,20 +55,126 @@ RegexMatcher::~RegexMatcher() {
|
||||
|
||||
|
||||
|
||||
|
||||
static const UChar BACKSLASH = 0x5c;
|
||||
static const UChar DOLLARSIGN = 0x24;
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// appendReplacement
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
|
||||
const UnicodeString &replacement) {
|
||||
const UnicodeString &replacement,
|
||||
UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return *this;
|
||||
}
|
||||
if (fMatch == FALSE) {
|
||||
status = U_REGEX_INVALID_STATE;
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Copy input string from the end of previous match to start of current match
|
||||
int32_t len = fMatchStart-fLastMatchEnd;
|
||||
if (len > 0) {
|
||||
dest.append(*fInput, fLastMatchEnd, len);
|
||||
}
|
||||
|
||||
|
||||
// scan the replacement text, looking for substitutions ($n) and \escapes.
|
||||
int32_t replLen = replacement.length();
|
||||
int32_t replIdx;
|
||||
for (replIdx = 0; replIdx<replLen; replIdx++) {
|
||||
UChar c = replacement.charAt(replIdx);
|
||||
if (c == BACKSLASH) {
|
||||
// Backslash Escape. Copy the following char out without further checks.
|
||||
replIdx++;
|
||||
if (replIdx >= replLen) {
|
||||
break;
|
||||
}
|
||||
c = replacement.charAt(replIdx);
|
||||
dest.append(c);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (c != DOLLARSIGN) {
|
||||
// Normal char, not a $. Copy it out without further checks.
|
||||
dest.append(c);
|
||||
continue;
|
||||
}
|
||||
|
||||
// We've got a $. Pick up a capture group number if one follows.
|
||||
// Consume at most the number of digits necessary for the largest capture
|
||||
// number that is valid for this pattern.
|
||||
if (++replIdx >= replLen) {
|
||||
// $ was at the end of the replacement string. Dump it out and be done.
|
||||
dest.append(c);
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t numDigits = 0;
|
||||
int32_t groupNum = 0;
|
||||
for (;;) {
|
||||
c = replacement.charAt(replIdx);
|
||||
if (u_isdigit(c) == FALSE) {
|
||||
break;
|
||||
}
|
||||
groupNum=groupNum*10 + u_charDigitValue(c);
|
||||
numDigits++;
|
||||
if (++replIdx >= replLen) {
|
||||
break;
|
||||
}
|
||||
if (numDigits >= fPattern->fMaxCaptureDigits) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// We've scanned one char ahead in the pattern. Back up so the
|
||||
// next iteration of the loop picks the char again.
|
||||
--replIdx;
|
||||
|
||||
if (numDigits == 0) {
|
||||
// The $ didn't introduce a group number at all.
|
||||
// Treat it as just part of the substitution text.
|
||||
dest.append(DOLLARSIGN);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Finally, append the capture group data to the destination.
|
||||
dest.append(group(groupNum, status));
|
||||
if (U_FAILURE(status)) {
|
||||
// Can fail if group number is out of range.
|
||||
return *this;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// appendTail Intended to be used in conjunction with appendReplacement()
|
||||
// To the destination string, append everything following
|
||||
// the last match position from the input string.
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
// Append to dest whatever part of the input lies between fMatchEnd and the
// end of the input (fInputLength).  Nothing is appended when that span is
// empty.  Returns dest.
UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
    const int32_t tailLength = fInputLength - fMatchEnd;
    if (tailLength <= 0) {
        return dest;
    }
    dest.append(*fInput, fMatchEnd, tailLength);
    return dest;
}
|
||||
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// end
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
// Convenience overload: forwards to end(int, UErrorCode &) with group 0,
// passing err through unchanged.
int32_t RegexMatcher::end(UErrorCode &err) const {
|
||||
return end(0, err);
|
||||
}
|
||||
@ -78,7 +185,7 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
|
||||
if (U_FAILURE(err)) {
|
||||
return 0;
|
||||
}
|
||||
if (fLastMatch == FALSE) {
|
||||
if (fMatch == FALSE) {
|
||||
err = U_REGEX_INVALID_STATE;
|
||||
return 0;
|
||||
}
|
||||
@ -88,7 +195,7 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
|
||||
}
|
||||
int32_t e = 0;
|
||||
if (group == 0) {
|
||||
e = fLastMatchEnd;
|
||||
e = fMatchEnd;
|
||||
} else {
|
||||
int32_t s = fCaptureEnds->elementAti(group);
|
||||
// TODO: what to do if no match on this specific group?
|
||||
@ -101,11 +208,16 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
|
||||
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// find()
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
UBool RegexMatcher::find() {
|
||||
// Start at the position of the last match end. (Will be zero if the
|
||||
// matcher has been reset.)
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
return find(fLastMatchEnd, status);
|
||||
return find(fMatchEnd, status);
|
||||
}
|
||||
|
||||
|
||||
@ -128,16 +240,20 @@ UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return FALSE;
|
||||
}
|
||||
if (fLastMatch) {
|
||||
if (fMatch) {
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
fLastMatchStart = fLastMatchEnd = fInputLength;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// group()
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
// Convenience overload: forwards to group(int, UErrorCode &) with group 0,
// passing status through unchanged.
UnicodeString RegexMatcher::group(UErrorCode &status) const {
|
||||
return group(0, status);
|
||||
}
|
||||
@ -181,7 +297,7 @@ UBool RegexMatcher::lookingAt(UErrorCode &status) {
|
||||
}
|
||||
reset();
|
||||
MatchAt(0, status);
|
||||
return fLastMatch;
|
||||
return fMatch;
|
||||
}
|
||||
|
||||
|
||||
@ -192,7 +308,7 @@ UBool RegexMatcher::matches(UErrorCode &status) {
|
||||
}
|
||||
reset();
|
||||
MatchAt(0, status);
|
||||
UBool success = (fLastMatch && fLastMatchEnd==fInputLength);
|
||||
UBool success = (fMatch && fMatchEnd==fInputLength);
|
||||
return success;
|
||||
}
|
||||
|
||||
@ -205,23 +321,58 @@ const RegexPattern &RegexMatcher::pattern() const {
|
||||
|
||||
|
||||
|
||||
UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &err) {
|
||||
return UnicodeString();
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// replaceAll
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return *fInput;
|
||||
}
|
||||
UnicodeString destString;
|
||||
for (reset(); find(); ) {
|
||||
appendReplacement(destString, replacement, status);
|
||||
}
|
||||
appendTail(destString);
|
||||
return destString;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &err) {
|
||||
return UnicodeString();
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// replaceFirst
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return *fInput;
|
||||
}
|
||||
reset();
|
||||
if (!find()) {
|
||||
return *fInput;
|
||||
}
|
||||
|
||||
UnicodeString destString;
|
||||
appendReplacement(destString, replacement, status);
|
||||
appendTail(destString);
|
||||
return destString;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// reset
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
RegexMatcher &RegexMatcher::reset() {
|
||||
fLastMatchStart = 0;
|
||||
fLastMatchEnd = 0;
|
||||
fLastMatch = FALSE;
|
||||
fMatchStart = 0;
|
||||
fMatchEnd = 0;
|
||||
fLastMatchEnd = 0;
|
||||
fMatch = FALSE;
|
||||
int i;
|
||||
for (i=0; i<=fPattern->fNumCaptureGroups; i++) {
|
||||
fCaptureStarts->setElementAt(i, -1);
|
||||
@ -252,7 +403,7 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
|
||||
if (U_FAILURE(err)) {
|
||||
return 0;
|
||||
}
|
||||
if (fLastMatch == FALSE) {
|
||||
if (fMatch == FALSE) {
|
||||
err = U_REGEX_INVALID_STATE;
|
||||
return 0;
|
||||
}
|
||||
@ -262,7 +413,7 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
|
||||
}
|
||||
int32_t s;
|
||||
if (group == 0) {
|
||||
s = fLastMatchStart;
|
||||
s = fMatchStart;
|
||||
} else {
|
||||
s = fCaptureStarts->elementAti(group);
|
||||
// TODO: what to do if no match on this specific group?
|
||||
@ -272,6 +423,26 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
|
||||
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// getCaptureText We have encountered a '\' that might precede a
|
||||
// capture group specification.
|
||||
// If a valid capture group number follows the '\',
|
||||
// return the indices to the start & end of the captured
|
||||
// text, and update the patIdx to the position following the
|
||||
// \n sequence.
|
||||
//
|
||||
// This function is used during find and replace operations when
|
||||
// processing capture references in the replacement text.
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
// NOTE: placeholder implementation.  The body does not inspect rep, does not
// modify repIdx, textStart or textEnd, and unconditionally returns FALSE.
UBool RegexMatcher::getCaptureText(const UnicodeString &rep,
|
||||
int32_t &repIdx,
|
||||
int32_t &textStart,
|
||||
int32_t &textEnd)
|
||||
{
|
||||
// Not implemented yet: always report that no capture reference was found.
return FALSE;
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
@ -408,6 +579,12 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
fCaptureEnds->setElementAt(inputIdx, opValue);
|
||||
break;
|
||||
|
||||
case URX_BACKSLASH_A:
|
||||
if (inputIdx != 0) {
|
||||
backTrack(inputIdx, patIdx);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_SETREF:
|
||||
if (inputIdx < fInputLength) {
|
||||
@ -449,7 +626,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
default:
|
||||
// Trouble. The compiled pattern contains an entry with an
|
||||
// unrecognized type tag.
|
||||
U_ASSERT(false);
|
||||
U_ASSERT(FALSE);
|
||||
}
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
@ -458,10 +635,11 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
}
|
||||
|
||||
breakFromLoop:
|
||||
fLastMatch = isMatch;
|
||||
fMatch = isMatch;
|
||||
if (isMatch) {
|
||||
fLastMatchStart = startIdx;
|
||||
fLastMatchEnd = inputIdx;
|
||||
fLastMatchEnd = fMatchEnd;
|
||||
fMatchStart = startIdx;
|
||||
fMatchEnd = inputIdx;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -65,6 +65,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
||||
fLiteralText = other.fLiteralText;
|
||||
fBadState = other.fBadState;
|
||||
fNumCaptureGroups = other.fNumCaptureGroups;
|
||||
fMaxCaptureDigits = other.fMaxCaptureDigits;
|
||||
if (fBadState) {
|
||||
return *this;
|
||||
}
|
||||
@ -108,6 +109,7 @@ void RegexPattern::init() {
|
||||
fFlags = 0;
|
||||
fBadState = FALSE;
|
||||
fNumCaptureGroups = 0;
|
||||
fMaxCaptureDigits = 1; // TODO: calculate for real.
|
||||
fMatcher = NULL;
|
||||
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
@ -301,6 +303,8 @@ UnicodeString RegexPattern::pattern() const {
|
||||
//---------------------------------------------------------------------
|
||||
//
|
||||
// split
|
||||
// TODO: perl returns captured strings intermixed with the
|
||||
// fields. Should we do this too?
|
||||
//
|
||||
//---------------------------------------------------------------------
|
||||
int32_t RegexPattern::split(const UnicodeString &input,
|
||||
@ -359,9 +363,9 @@ int32_t RegexPattern::split(const UnicodeString &input,
|
||||
if (fMatcher->find()) {
|
||||
// We found another delimiter. Move everything from where we started looking
|
||||
// up until the start of the delimiter into the next output string.
|
||||
int32_t fieldLen = fMatcher->fLastMatchStart - nextOutputStringStart;
|
||||
int32_t fieldLen = fMatcher->fMatchStart - nextOutputStringStart;
|
||||
dest[i].setTo(input, nextOutputStringStart, fieldLen);
|
||||
nextOutputStringStart = fMatcher->fLastMatchEnd;
|
||||
nextOutputStringStart = fMatcher->fMatchEnd;
|
||||
if (nextOutputStringStart == inputLen) {
|
||||
// The delimiter was at the end of the string. We're done.
|
||||
break;
|
||||
@ -407,7 +411,7 @@ static char *opNames[] = {
|
||||
"NOP",
|
||||
"START_CAPTURE",
|
||||
"END_CAPTURE",
|
||||
"?10",
|
||||
"URX_BACKSLASH_A",
|
||||
"SETREF",
|
||||
"DOTANY",
|
||||
"JMP",
|
||||
|
@ -178,6 +178,7 @@ private:
|
||||
// make new ones on each call.
|
||||
|
||||
int32_t fNumCaptureGroups;
|
||||
int32_t fMaxCaptureDigits;
|
||||
|
||||
friend class RegexCompile;
|
||||
friend class RegexMatcher;
|
||||
@ -226,13 +227,16 @@ public:
|
||||
* The append position is set to the position of the first
|
||||
* character following the match in the input string.
|
||||
*
|
||||
* For complete, prepackaged, non-incremental find-and-replace
|
||||
* operations, see replaceFirst() or replaceAll().
|
||||
*
|
||||
* Returns: This Matcher
|
||||
*
|
||||
* error: Illegal state - no match yet attempted, or last match failed.
|
||||
* IndexOutOfBounds - capture string number from replacement string.
|
||||
*/
|
||||
virtual RegexMatcher &appendReplacement(UnicodeString &dest,
|
||||
const UnicodeString &replacement);
|
||||
const UnicodeString &replacement, UErrorCode &status);
|
||||
|
||||
|
||||
/*
|
||||
@ -329,7 +333,8 @@ public:
|
||||
|
||||
/*
|
||||
* Replaces every subsequence of the input sequence that matches the pattern
|
||||
* with the given replacement string.
|
||||
* with the given replacement string. This is a convenience function that
|
||||
* provides a complete find-and-replace-all operation.
|
||||
*
|
||||
* This method first resets this matcher. It then scans the input sequence
|
||||
* looking for matches of the pattern. Characters that are not part of any
|
||||
@ -337,10 +342,7 @@ public:
|
||||
* replacement string. The replacement string may contain references to
|
||||
* captured subsequences as in the appendReplacement method.
|
||||
*
|
||||
* @return The target string. Depending on how the RegexMatcher was
|
||||
* created, this may either be the original input string or a copy
|
||||
*
|
||||
* Error: Index out of bounds (replacement string capture group)
|
||||
* @return A string containing the results of the find and replace.
|
||||
*
|
||||
*/
|
||||
virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &err);
|
||||
@ -348,16 +350,15 @@ public:
|
||||
|
||||
/*
|
||||
* Replaces the first subsequence of the input sequence that matches
|
||||
* the pattern with the given replacement string.
|
||||
* the pattern with the given replacement string. This is a convenience
|
||||
* function that provides a complete find-and-replace operation.
|
||||
*
|
||||
* This method first resets this matcher. It then scans the input sequence
|
||||
* looking for a match of the pattern. Characters that are not part
|
||||
* of the match are appended directly to the result string; the match is replaced
|
||||
* in the result by the replacement string. The replacement string may contain
|
||||
* references to captured subsequences as in the appendReplacement method.
|
||||
*
|
||||
* Error: Index out of bounds (replacement string capture group)
|
||||
* Illegal state (no match)
|
||||
* Note: Javadoc doesn't list exceptions, but they must be there for consistency
|
||||
*/
|
||||
virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &err);
|
||||
|
||||
@ -409,27 +410,33 @@ public:
|
||||
|
||||
private:
|
||||
// Constructors and other object boilerplate are private.
|
||||
// Creation by users is through factory method in RegexPattern
|
||||
// Instances of RegexMatcher can not be assigned, copied, cloned, etc.
|
||||
// Creation by users is only through the factory method in class RegexPattern
|
||||
RegexMatcher(const RegexPattern *pat);
|
||||
RegexMatcher(const RegexMatcher &other);
|
||||
RegexMatcher &operator =(const RegexMatcher &rhs);
|
||||
friend class RegexPattern;
|
||||
|
||||
inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
|
||||
|
||||
//
|
||||
// MatchAt This is the internal interface to the match engine itself.
|
||||
// Match status comes back in matcher member variables.
|
||||
//
|
||||
virtual void MatchAt(int32_t startIdx, UErrorCode &status);
|
||||
void MatchAt(int32_t startIdx, UErrorCode &status);
|
||||
inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
|
||||
UBool getCaptureText(const UnicodeString &rep,
|
||||
int32_t &repIdx,
|
||||
int32_t &textStart,
|
||||
int32_t &textEnd);
|
||||
|
||||
|
||||
const RegexPattern *fPattern;
|
||||
const UnicodeString *fInput;
|
||||
int32_t fInputLength;
|
||||
UBool fLastMatch; // True if the last match was successful.
|
||||
int32_t fLastMatchStart;
|
||||
int32_t fLastMatchEnd;
|
||||
UBool fMatch; // True if the last match was successful.
|
||||
int32_t fMatchStart; // Position of the start of the most recent match
|
||||
int32_t fMatchEnd; // First position after the end of the most recent match
|
||||
int32_t fLastMatchEnd; // First position after the end of the previous match.
|
||||
UStack *fBackTrackStack;
|
||||
UVector *fCaptureStarts;
|
||||
UVector *fCaptureEnds;
|
||||
|
@ -31,12 +31,12 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
|
||||
if (exec) logln("TestSuite RegexTest: ");
|
||||
switch (index) {
|
||||
|
||||
case 0: name = "API_Match";
|
||||
if (exec) API_Match();
|
||||
break;
|
||||
case 1: name = "Basic";
|
||||
case 0: name = "Basic";
|
||||
if (exec) Basic();
|
||||
break;
|
||||
case 1: name = "API_Match";
|
||||
if (exec) API_Match();
|
||||
break;
|
||||
case 2: name = "API_Replace";
|
||||
if (exec) API_Replace();
|
||||
break;
|
||||
@ -87,6 +87,7 @@ UBool RegexTest::doRegexLMTest(char *pat, char *text, UBool looking, UBool match
|
||||
errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %d\n", line, status);
|
||||
return FALSE;
|
||||
}
|
||||
// REPattern->dump();
|
||||
|
||||
UnicodeString inputString(inputText);
|
||||
UnicodeString unEscapedInput = inputString.unescape();
|
||||
@ -295,6 +296,101 @@ void RegexTest::API_Match() {
|
||||
delete matcher;
|
||||
delete pat;
|
||||
}
|
||||
|
||||
//
|
||||
// Replace
|
||||
//
|
||||
{
|
||||
int32_t flags=0;
|
||||
UParseError pe;
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
|
||||
UnicodeString re("abc");
|
||||
RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
UnicodeString data = ".abc..abc...abc..";
|
||||
// 012345678901234567
|
||||
RegexMatcher *matcher = pat->matcher(data, status);
|
||||
|
||||
//
|
||||
// Plain vanilla matches.
|
||||
//
|
||||
UnicodeString dest;
|
||||
dest = matcher->replaceFirst("yz", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == ".yz..abc...abc..");
|
||||
|
||||
dest = matcher->replaceAll("yz", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == ".yz..yz...yz..");
|
||||
|
||||
//
|
||||
// Plain vanilla non-matches.
|
||||
//
|
||||
UnicodeString d2 = ".abx..abx...abx..";
|
||||
matcher->reset(d2);
|
||||
dest = matcher->replaceFirst("yz", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == ".abx..abx...abx..");
|
||||
|
||||
dest = matcher->replaceAll("yz", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == ".abx..abx...abx..");
|
||||
|
||||
//
|
||||
// Empty source string
|
||||
//
|
||||
UnicodeString d3 = "";
|
||||
matcher->reset(d3);
|
||||
dest = matcher->replaceFirst("yz", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == "");
|
||||
|
||||
dest = matcher->replaceAll("yz", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == "");
|
||||
|
||||
//
|
||||
// Empty substitution string
|
||||
//
|
||||
matcher->reset(data); // ".abc..abc...abc.."
|
||||
dest = matcher->replaceFirst("", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == "...abc...abc..");
|
||||
|
||||
dest = matcher->replaceAll("", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == "........");
|
||||
|
||||
//
|
||||
// match whole string
|
||||
//
|
||||
UnicodeString d4 = "abc";
|
||||
matcher->reset(d4);
|
||||
dest = matcher->replaceFirst("xyz", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == "xyz");
|
||||
|
||||
dest = matcher->replaceAll("xyz", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == "xyz");
|
||||
|
||||
//
|
||||
// Capture Group, simple case
|
||||
//
|
||||
UnicodeString re2("a(..)");
|
||||
RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
UnicodeString d5 = "abcdefg";
|
||||
RegexMatcher *matcher2 = pat2->matcher(d5, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
dest = matcher2->replaceFirst("$1$1", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(dest == "bcbcdefg");
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -314,6 +410,7 @@ void RegexTest::Basic() {
|
||||
//
|
||||
#if 0
|
||||
{
|
||||
REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
|
||||
}
|
||||
return;
|
||||
#endif
|
||||
@ -419,6 +516,26 @@ void RegexTest::Basic() {
|
||||
REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
|
||||
REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
|
||||
|
||||
//
|
||||
// Escape sequences that become single literal chars, handled internally
|
||||
// by ICU's Unescape.
|
||||
//
|
||||
|
||||
// REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
|
||||
REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
|
||||
REGEX_TESTLM("\\b", "\\u0008", TRUE, TRUE); // BS
|
||||
// REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L (or whatever) TODO: bug in Unescape
|
||||
// REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape TODO: bug in Unescape
|
||||
REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
|
||||
REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
|
||||
REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
|
||||
REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
|
||||
REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
|
||||
REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
|
||||
|
||||
REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
|
||||
REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user