ICU-105 Regular Expressions, ongoing development

X-SVN-Rev: 10063
This commit is contained in:
Andy Heninger 2002-10-24 22:16:07 +00:00
parent 08ca9c365b
commit 425ac49187
10 changed files with 484 additions and 160 deletions

View File

@ -503,6 +503,7 @@ typedef enum UErrorCode {
U_REGEX_ERROR_START=0x10300,
U_REGEX_INTERNAL_ERROR,
U_REGEX_INVALID_STATE,
U_REGEX_BAD_ESCAPE_SEQUENCE,
U_REGEX_ERROR_LIMIT,
U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */

View File

@ -59,22 +59,21 @@ static const UChar gRuleSet_rule_char_pattern[] = {
0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0};
// Set pattern text spelled out as individual UChar code units and ended with
// a terminating zero.  The comment inside each initializer shows the
// characters that the hex values encode.
// NOTE(review): presumably used to build UnicodeSet objects in the same way
// as the rule_char and digit_char patterns - confirm against the constructor.
static const UChar gRuleSet_name_char_pattern[] = {
// [ _ \ p { L } \ p { N } ]
0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d, 0};
// Pattern text for the ASCII digits 0 through 9.
static const UChar gRuleSet_digit_char_pattern[] = {
// [ 0 - 9 ]
0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
// Pattern text for an underscore or any character with the L (letter) property.
static const UChar gRuleSet_name_start_char_pattern[] = {
// [ _ \ p { L } ]
0x5b, 0x5f, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d, 0 };
static const UChar kAny[] = {0x61, 0x6e, 0x79, 0x00}; // "any"
static UnicodeSet *gRuleSets[10]; // Array of ptrs to the actual UnicodeSet objects.
static UnicodeSet *gUnescapeCharSet;
//
// These are the backslash escape characters that ICU's unescape
// will handle.
//
// The array is handed to the UnicodeSet pattern constructor as a plain
// UChar string, so, like the other pattern arrays in this file, it must
// end with a terminating zero.  Without it the pattern text would be read
// past the end of the array.
//
static const UChar gUnescapeCharPattern[] = {
// [ a b c e f n r t u U ]
0x5b, 0x61, 0x62, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x5d, 0};
//----------------------------------------------------------------------------------------
@ -88,7 +87,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
fScanIndex = 0;
fNextIndex = 0;
fPeekChar = -1;
fLineNum = 1;
fCharNum = 0;
fQuoteMode = FALSE;
@ -110,13 +109,16 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
gRuleSets[kRuleSet_rule_char-128] = new UnicodeSet(gRuleSet_rule_char_pattern, status);
gRuleSets[kRuleSet_white_space-128] = new UnicodeSet(UnicodePropertySet::getRuleWhiteSpaceSet(status));
gRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(gRuleSet_digit_char_pattern, status);
gUnescapeCharSet = new UnicodeSet(gUnescapeCharPattern, status);
if (U_FAILURE(status)) {
delete gRuleSets[kRuleSet_rule_char-128];
delete gRuleSets[kRuleSet_white_space-128];
delete gRuleSets[kRuleSet_digit_char-128];
delete gUnescapeCharSet;
gRuleSets[kRuleSet_rule_char-128] = NULL;
gRuleSets[kRuleSet_white_space-128] = NULL;
gRuleSets[kRuleSet_digit_char-128] = NULL;
gUnescapeCharSet = NULL;
return;
}
}
@ -218,7 +220,7 @@ void RegexCompile::compile(
// Table row specified "quoted" and the char was quoted.
break;
}
if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1) {
if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) {
// Table row specified eof and we hit eof on the input.
break;
}
@ -605,14 +607,15 @@ UBool RegexCompile::doParseActions(EParseAction action)
break;
case doDotAny:
// scanned a ".", match any single character.
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOTANY, 0), *fStatus);
break;
case doExprFinished:
case doBackslashA:
// Scanned a "\A".
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_A, 0), *fStatus);
break;
case doExit:
@ -816,6 +819,11 @@ UChar32 RegexCompile::nextCharLL() {
UChar32 ch;
UnicodeString &pattern = fRXPat->fPattern;
if (fPeekChar != -1) {
ch = fPeekChar;
fPeekChar = -1;
return ch;
}
if (fPatternLength==0 || fNextIndex >= fPatternLength) {
return (UChar32)-1;
}
@ -846,12 +854,25 @@ UChar32 RegexCompile::nextCharLL() {
return ch;
}
//---------------------------------------------------------------------------------
//
// peekCharLL Low Level Character Scanning, sneak a peek at the next
// character without actually getting it.
//
//---------------------------------------------------------------------------------
UChar32 RegexCompile::peekCharLL() {
    // A value other than -1 in fPeekChar means a character has already been
    // scanned ahead and is waiting to be handed out; just report it again.
    if (fPeekChar != -1) {
        return fPeekChar;
    }

    // Nothing is being held.  Fetch one character from the low level scanner
    // and keep it in fPeekChar, from where nextCharLL() will deliver it.
    fPeekChar = nextCharLL();
    return fPeekChar;
}
//---------------------------------------------------------------------------------
//
// nextChar for rules scanning. At this level, we handle stripping
// out comments and processing backslash character escapes.
// The rest of the rules grammar is handled at the next level up.
// nextChar for pattern scanning. At this level, we handle stripping
// out comments and processing some backslash character escapes.
// The rest of the pattern grammar is handled at the next level up.
//
//---------------------------------------------------------------------------------
void RegexCompile::nextChar(RegexPatternChar &c) {
@ -870,7 +891,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
{
// We are not in a 'quoted region' of the source.
//
if (c.fChar == chPound) {
if (fFreeForm && c.fChar == chPound) {
// Start of a comment. Consume the rest of it.
// The new-line char that terminates the comment is always returned.
// It will be treated as white-space, and serves to break up anything
@ -891,16 +912,22 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
//
// check for backslash escaped characters.
// Use UnicodeString::unescapeAt() to handle them.
// Use UnicodeString::unescapeAt() to handle those that it can.
// Otherwise just return the '\', and let the pattern parser deal with it.
//
int32_t startX = fNextIndex; // start and end positions of the
int32_t endX = fNextIndex; // sequence following the '\'
if (c.fChar == chBackSlash) {
c.fQuoted = TRUE;
int32_t startX = fNextIndex;
c.fChar = fRXPat->fPattern.unescapeAt(fNextIndex);
if (fNextIndex == startX) {
error(U_BRK_HEX_DIGITS_EXPECTED);
if (gUnescapeCharSet->contains(peekCharLL())) {
nextCharLL(); // get & discard the peeked char.
c.fQuoted = TRUE;
c.fChar = fRXPat->fPattern.unescapeAt(endX);
if (startX == endX) {
error(U_REGEX_BAD_ESCAPE_SEQUENCE);
}
fCharNum += endX - startX;
fNextIndex = endX;
}
fCharNum += fNextIndex-startX;
}
}
// putc(c.fChar, stdout);

View File

@ -65,8 +65,6 @@ public:
void nextChar(RegexPatternChar &c); // Get the next char from the input stream.
UBool push(const RegexPatternChar &c); // Push (unget) one character.
// Only a single character may be pushed.
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
@ -88,6 +86,7 @@ private:
void error(UErrorCode e); // error reporting convenience function.
UChar32 nextCharLL();
UChar32 peekCharLL();
UnicodeSet *scanSet();
void handleCloseParen();
int32_t blockTopLoc(); // Locate a position in the compiled pattern
@ -99,6 +98,9 @@ private:
RegexPattern *fRXPat;
UParseError *fParseErr;
//
// Data associated with low level character scanning
//
int32_t fScanIndex; // Index of current character being processed
// in the rule input string.
int32_t fNextIndex; // Index of the next character, which
@ -109,6 +111,8 @@ private:
int fCharNum; // Char position within the line.
UChar32 fLastChar; // Previous char, needed to count CR-LF
// as a single line, not two.
UChar32 fPeekChar; // Saved char, if we've scanned ahead.
RegexPatternChar fC; // Current char for parse state machine
// processing.

View File

@ -40,6 +40,7 @@ enum Regex_PatternParseAction {
doOpenLookAheadNeg,
doPlus,
doOpenNonCaptureParen,
doBackslashA,
doNGPlus,
doPatFinish,
doIntervalMinValue,
@ -51,7 +52,6 @@ enum Regex_PatternParseAction {
doOpenLookAhead,
doNumberExpectedError,
doDotAny,
doExprFinished,
doScanUnicodeSet,
doNOP,
doExit,
@ -80,71 +80,65 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
{doNOP, 0, 0, 0, TRUE}
, {doPatStart, 255, 3, 2, FALSE} // 1 start
, {doPatFinish, 255, 2,0, FALSE} // 2 finish
, {doStartString, 254, 10,0, TRUE} // 3 term
, {doStartString, 130, 10,0, TRUE} // 4
, {doScanUnicodeSet, 91 /* [ */, 17,0, TRUE} // 5
, {doNOP, 40 /* ( */, 29, 17, TRUE} // 6
, {doDotAny, 46 /* . */, 17,0, TRUE} // 7
, {doNOP, 253, 255,0, FALSE} // 8
, {doRuleError, 255, 67,0, FALSE} // 9
, {doStringChar, 254, 10,0, TRUE} // 10 string
, {doStringChar, 130, 10,0, TRUE} // 11
, {doSplitString, 63 /* ? */, 17,0, FALSE} // 12
, {doSplitString, 43 /* + */, 17,0, FALSE} // 13
, {doSplitString, 42 /* * */, 17,0, FALSE} // 14
, {doSplitString, 123 /* { */, 17,0, FALSE} // 15
, {doEndString, 255, 17,0, FALSE} // 16
, {doNOP, 42 /* * */, 40,0, TRUE} // 17 expr-quant
, {doNOP, 43 /* + */, 43,0, TRUE} // 18
, {doNOP, 63 /* ? */, 46,0, TRUE} // 19
, {doNOP, 255, 21,0, FALSE} // 20
, {doNOP, 254, 3,0, FALSE} // 21 expr-cont
, {doNOP, 130, 3,0, FALSE} // 22
, {doNOP, 91 /* [ */, 3,0, FALSE} // 23
, {doNOP, 40 /* ( */, 3,0, FALSE} // 24
, {doNOP, 46 /* . */, 3,0, FALSE} // 25
, {doOrOperator, 124 /* | */, 3,0, TRUE} // 26
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 27
, {doExprFinished, 255, 255,0, FALSE} // 28
, {doNOP, 63 /* ? */, 31,0, TRUE} // 29 open-paren
, {doOpenCaptureParen, 255, 3, 17, FALSE} // 30
, {doOpenNonCaptureParen, 58 /* : */, 3, 17, TRUE} // 31 open-paren-extended
, {doOpenAtomicParen, 62 /* > */, 3, 17, TRUE} // 32
, {doOpenLookAhead, 61 /* = */, 3, 21, TRUE} // 33
, {doOpenLookAheadNeg, 33 /* ! */, 3, 21, TRUE} // 34
, {doNOP, 60 /* < */, 37,0, TRUE} // 35
, {doBadOpenParenType, 255, 67,0, FALSE} // 36
, {doOpenLookBehind, 61 /* = */, 3, 21, TRUE} // 37 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 3, 21, TRUE} // 38
, {doBadOpenParenType, 255, 67,0, FALSE} // 39
, {doNGStar, 63 /* ? */, 21,0, TRUE} // 40 quant-star
, {doPossesiveStar, 43 /* + */, 21,0, TRUE} // 41
, {doStar, 255, 21,0, FALSE} // 42
, {doNGPlus, 63 /* ? */, 21,0, TRUE} // 43 quant-plus
, {doPossesivePlus, 43 /* + */, 21,0, TRUE} // 44
, {doPlus, 255, 21,0, FALSE} // 45
, {doNGOpt, 63 /* ? */, 21,0, TRUE} // 46 quant-opt
, {doPossesiveOpt, 43 /* + */, 21,0, TRUE} // 47
, {doOpt, 255, 21,0, FALSE} // 48
, {doNOP, 129, 49,0, TRUE} // 49 interval-open
, {doIntervalMinValue, 128, 52,0, FALSE} // 50
, {doNumberExpectedError, 255, 67,0, FALSE} // 51
, {doNOP, 129, 56,0, TRUE} // 52 interval-value
, {doNOP, 125 /* } */, 56,0, FALSE} // 53
, {doIntervalDigit, 128, 52,0, TRUE} // 54
, {doNumberExpectedError, 255, 67,0, FALSE} // 55
, {doNOP, 129, 56,0, TRUE} // 56 interval-close
, {doTagValue, 125 /* } */, 59,0, TRUE} // 57
, {doNumberExpectedError, 255, 67,0, FALSE} // 58
, {doNOP, 254, 3,0, FALSE} // 59 expr-cont-no-interval
, {doNOP, 130, 3,0, FALSE} // 60
, {doNOP, 91 /* [ */, 3,0, FALSE} // 61
, {doNOP, 40 /* ( */, 3,0, FALSE} // 62
, {doNOP, 46 /* . */, 3,0, FALSE} // 63
, {doExprOrOperator, 124 /* | */, 3,0, TRUE} // 64
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 65
, {doExprFinished, 255, 255,0, FALSE} // 66
, {doExit, 255, 67,0, TRUE} // 67 errorDeath
, {doStartString, 254, 11,0, TRUE} // 3 term
, {doStartString, 130, 11,0, TRUE} // 4
, {doScanUnicodeSet, 91 /* [ */, 18,0, TRUE} // 5
, {doNOP, 40 /* ( */, 25, 18, TRUE} // 6
, {doDotAny, 46 /* . */, 18,0, TRUE} // 7
, {doNOP, 92 /* \ */, 59,0, TRUE} // 8
, {doNOP, 253, 2,0, FALSE} // 9
, {doRuleError, 255, 61,0, FALSE} // 10
, {doStringChar, 254, 11,0, TRUE} // 11 string
, {doStringChar, 130, 11,0, TRUE} // 12
, {doSplitString, 63 /* ? */, 18,0, FALSE} // 13
, {doSplitString, 43 /* + */, 18,0, FALSE} // 14
, {doSplitString, 42 /* * */, 18,0, FALSE} // 15
, {doSplitString, 123 /* { */, 18,0, FALSE} // 16
, {doEndString, 255, 18,0, FALSE} // 17
, {doNOP, 42 /* * */, 36,0, TRUE} // 18 expr-quant
, {doNOP, 43 /* + */, 39,0, TRUE} // 19
, {doNOP, 63 /* ? */, 42,0, TRUE} // 20
, {doNOP, 255, 22,0, FALSE} // 21
, {doOrOperator, 124 /* | */, 3,0, TRUE} // 22 expr-cont
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 23
, {doNOP, 255, 3,0, FALSE} // 24
, {doNOP, 63 /* ? */, 27,0, TRUE} // 25 open-paren
, {doOpenCaptureParen, 255, 3, 18, FALSE} // 26
, {doOpenNonCaptureParen, 58 /* : */, 3, 18, TRUE} // 27 open-paren-extended
, {doOpenAtomicParen, 62 /* > */, 3, 18, TRUE} // 28
, {doOpenLookAhead, 61 /* = */, 3, 22, TRUE} // 29
, {doOpenLookAheadNeg, 33 /* ! */, 3, 22, TRUE} // 30
, {doNOP, 60 /* < */, 33,0, TRUE} // 31
, {doBadOpenParenType, 255, 61,0, FALSE} // 32
, {doOpenLookBehind, 61 /* = */, 3, 22, TRUE} // 33 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 3, 22, TRUE} // 34
, {doBadOpenParenType, 255, 61,0, FALSE} // 35
, {doNGStar, 63 /* ? */, 22,0, TRUE} // 36 quant-star
, {doPossesiveStar, 43 /* + */, 22,0, TRUE} // 37
, {doStar, 255, 22,0, FALSE} // 38
, {doNGPlus, 63 /* ? */, 22,0, TRUE} // 39 quant-plus
, {doPossesivePlus, 43 /* + */, 22,0, TRUE} // 40
, {doPlus, 255, 22,0, FALSE} // 41
, {doNGOpt, 63 /* ? */, 22,0, TRUE} // 42 quant-opt
, {doPossesiveOpt, 43 /* + */, 22,0, TRUE} // 43
, {doOpt, 255, 22,0, FALSE} // 44
, {doNOP, 129, 45,0, TRUE} // 45 interval-open
, {doIntervalMinValue, 128, 48,0, FALSE} // 46
, {doNumberExpectedError, 255, 61,0, FALSE} // 47
, {doNOP, 129, 52,0, TRUE} // 48 interval-value
, {doNOP, 125 /* } */, 52,0, FALSE} // 49
, {doIntervalDigit, 128, 48,0, TRUE} // 50
, {doNumberExpectedError, 255, 61,0, FALSE} // 51
, {doNOP, 129, 52,0, TRUE} // 52 interval-close
, {doTagValue, 125 /* } */, 55,0, TRUE} // 53
, {doNumberExpectedError, 255, 61,0, FALSE} // 54
, {doNOP, 254, 3,0, FALSE} // 55 expr-cont-no-interval
, {doExprOrOperator, 124 /* | */, 3,0, TRUE} // 56
, {doExprRParen, 41 /* ) */, 255,0, TRUE} // 57
, {doNOP, 255, 3,0, FALSE} // 58
, {doBackslashA, 65 /* A */, 3,0, TRUE} // 59 backslash
, {doStartString, 255, 11,0, TRUE} // 60
, {doExit, 255, 61,0, TRUE} // 61 errorDeath
};
static const char *RegexStateNames[] = { 0,
"start",
@ -155,6 +149,7 @@ static const char *RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
"string",
0,
@ -169,11 +164,6 @@ static const char *RegexStateNames[] = { 0,
0,
"expr-cont",
0,
0,
0,
0,
0,
0,
0,
"open-paren",
0,
@ -209,9 +199,7 @@ static const char *RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
0,
"backslash",
0,
"errorDeath",
0};

View File

@ -77,7 +77,8 @@ term:
'[' n expr-quant doScanUnicodeSet
'(' n open-paren ^expr-quant
'.' n expr-quant doDotAny
eof pop
'\' n backslash
eof finish
default errorDeath doRuleError
@ -110,17 +111,12 @@ expr-quant:
#
# expr-cont Expression, continuation. At a point where additional terms are
# allowed, but not required.
# allowed, but not required. No Quantifiers
#
expr-cont:
quoted term
rule_char term
'[' term
'(' term
'.' term
'|' n term doOrOperator
')' n pop doCloseParen
default pop doExprFinished
default term
#
@ -205,16 +201,18 @@ interval-close:
#
expr-cont-no-interval:
quoted term
rule_char term
'[' term
'(' term
'.' term
'|' n term doExprOrOperator
')' n pop doExprRParen
default pop doExprFinished
default term
#
# backslash # Backslash. Figure out which of the \thingies we have encountered.
# The low level next-char function will have preprocessed
# some of them already; those won't come here.
backslash:
'A' n term doBackslashA
default n string doStartString

View File

@ -26,7 +26,7 @@ static const uint32_t URX_STATE_SAVE = 6; // Value field is pattern po
static const uint32_t URX_NOP = 7;
static const uint32_t URX_START_CAPTURE = 8; // Value field is capture group number.
static const uint32_t URX_END_CAPTURE = 9; // Value field is capture group number
static const uint32_t URX_UNUSED10 = 10; // Value field is index in pattern to
static const uint32_t URX_BACKSLASH_A = 10; // \A, matches only at input position 0.
// Value field is not used (compiled as 0).
static const uint32_t URX_SETREF = 11; // Value field is index of set in array of sets.
static const uint32_t URX_DOTANY = 12;

View File

@ -11,6 +11,7 @@
#include "unicode/utypes.h"
#include "unicode/regex.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "uassert.h"
#include "uvector.h"
#include "regeximp.h"
@ -54,20 +55,126 @@ RegexMatcher::~RegexMatcher() {
static const UChar BACKSLASH = 0x5c;
static const UChar DOLLARSIGN = 0x24;
//--------------------------------------------------------------------------------
//
// appendReplacement
//
//--------------------------------------------------------------------------------
//
//  Appends to dest the input text lying between the end of the previous match
//  and the start of the current match, followed by the replacement text with
//  its $n capture group references and \ escapes expanded.
//
//     dest         String that receives the output; it is only appended to.
//     replacement  Replacement text.  "\x" copies x literally, "$n" inserts
//                  the text of capture group n, and a '$' that is not followed
//                  by a digit is copied as an ordinary character.
//     status       Does nothing if this is already a failure on entry.
//                  Set to U_REGEX_INVALID_STATE if there is no current match.
//                  An error reported by group() for a bad group number is
//                  passed back unchanged.
//
//  Returns this matcher.
//
RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
                                              const UnicodeString &replacement,
                                              UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }
    if (fMatch == FALSE) {
        status = U_REGEX_INVALID_STATE;
        return *this;
    }

    // Copy input string from the end of previous match to start of current match
    int32_t len = fMatchStart-fLastMatchEnd;
    if (len > 0) {
        dest.append(*fInput, fLastMatchEnd, len);
    }

    // scan the replacement text, looking for substitutions ($n) and \escapes.
    int32_t replLen = replacement.length();
    int32_t replIdx;
    for (replIdx = 0; replIdx<replLen; replIdx++) {
        UChar c = replacement.charAt(replIdx);
        if (c == BACKSLASH) {
            // Backslash Escape.  Copy the following char out without further checks.
            // A backslash that is the very last char of the replacement text has
            // nothing to escape; it is dropped and the scan ends.
            replIdx++;
            if (replIdx >= replLen) {
                break;
            }
            c = replacement.charAt(replIdx);
            dest.append(c);
            continue;
        }

        if (c != DOLLARSIGN) {
            // Normal char, not a $.  Copy it out without further checks.
            dest.append(c);
            continue;
        }

        // We've got a $.  Pick up a capture group number if one follows.
        // Consume at most the number of digits necessary for the largest capture
        // number that is valid for this pattern.
        if (++replIdx >= replLen) {
            // $ was at the end of the replacement string.  Dump it out and be done.
            dest.append(c);
            break;
        }

        int32_t numDigits = 0;
        int32_t groupNum  = 0;
        for (;;) {
            c = replacement.charAt(replIdx);
            if (u_isdigit(c) == FALSE) {
                break;
            }
            groupNum=groupNum*10 + u_charDigitValue(c);
            numDigits++;
            if (++replIdx >= replLen) {
                break;
            }
            if (numDigits >= fPattern->fMaxCaptureDigits) {
                break;
            }
        }

        // We've scanned one char ahead in the pattern.  Back up so the
        // next iteration of the loop picks the char again.
        --replIdx;

        if (numDigits == 0) {
            // The $ didn't introduce a group number at all.
            // Treat it as just part of the substitution text.
            dest.append(DOLLARSIGN);
            continue;
        }

        // Finally, append the capture group data to the destination.
        dest.append(group(groupNum, status));
        if (U_FAILURE(status)) {
            // Can fail if group number is out of range.
            return *this;
        }
    }
    return *this;
}
//--------------------------------------------------------------------------------
//
// appendTail Intended to be used in conjunction with appendReplacement()
// To the destination string, append everything following
// the last match position from the input string.
//
//--------------------------------------------------------------------------------
UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
    // The tail is whatever lies between the end of the most recent match
    // and the end of the input string.
    const int32_t tailLength = fInputLength - fMatchEnd;
    if (tailLength <= 0) {
        // Nothing follows the match; dest is handed back untouched.
        return dest;
    }
    dest.append(*fInput, fMatchEnd, tailLength);
    return dest;
}
//--------------------------------------------------------------------------------
//
// end
//
//--------------------------------------------------------------------------------
int32_t RegexMatcher::end(UErrorCode &err) const {
    // Convenience form: ask the per-group overload about group zero,
    // which it treats as the match as a whole.
    const int wholeMatch = 0;
    return end(wholeMatch, err);
}
@ -78,7 +185,7 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
if (U_FAILURE(err)) {
return 0;
}
if (fLastMatch == FALSE) {
if (fMatch == FALSE) {
err = U_REGEX_INVALID_STATE;
return 0;
}
@ -88,7 +195,7 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
}
int32_t e = 0;
if (group == 0) {
e = fLastMatchEnd;
e = fMatchEnd;
} else {
int32_t s = fCaptureEnds->elementAti(group);
// TODO: what to do if no match on this specific group?
@ -101,11 +208,16 @@ int32_t RegexMatcher::end(int group, UErrorCode &err) const {
//--------------------------------------------------------------------------------
//
// find()
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::find() {
    // Resume the search at the position where the most recent match ended.
    // (That position is zero if the matcher has been reset.)
    //
    // The status is local to this function, so this convenience form does
    // not report errors from the underlying find() to the caller.
    UErrorCode status = U_ZERO_ERROR;
    return find(fMatchEnd, status);
}
@ -128,16 +240,20 @@ UBool RegexMatcher::find(int32_t start, UErrorCode &status) {
if (U_FAILURE(status)) {
return FALSE;
}
if (fLastMatch) {
if (fMatch) {
return TRUE;
}
}
fLastMatchStart = fLastMatchEnd = fInputLength;
return FALSE;
}
//--------------------------------------------------------------------------------
//
// group()
//
//--------------------------------------------------------------------------------
UnicodeString RegexMatcher::group(UErrorCode &status) const {
    // Convenience form: forwards to the numbered overload with group zero,
    // the number that start() and end() use for the match as a whole.
    const int32_t wholeMatch = 0;
    return group(wholeMatch, status);
}
@ -181,7 +297,7 @@ UBool RegexMatcher::lookingAt(UErrorCode &status) {
}
reset();
MatchAt(0, status);
return fLastMatch;
return fMatch;
}
@ -192,7 +308,7 @@ UBool RegexMatcher::matches(UErrorCode &status) {
}
reset();
MatchAt(0, status);
UBool success = (fLastMatch && fLastMatchEnd==fInputLength);
UBool success = (fMatch && fMatchEnd==fInputLength);
return success;
}
@ -205,23 +321,58 @@ const RegexPattern &RegexMatcher::pattern() const {
UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &err) {
return UnicodeString();
//--------------------------------------------------------------------------------
//
// replaceAll
//
//--------------------------------------------------------------------------------
UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
    // With a failure already pending, do no work and hand back a copy of the input.
    if (U_FAILURE(status)) {
        return *fInput;
    }

    // Start over from the beginning of the input, then walk through the
    // matches one at a time.  Each pass emits the unmatched text that precedes
    // the match followed by the expanded replacement.
    UnicodeString result;
    reset();
    while (find()) {
        appendReplacement(result, replacement, status);
    }

    // Whatever follows the final match is copied through unchanged.
    appendTail(result);
    return result;
}
UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &err) {
return UnicodeString();
//--------------------------------------------------------------------------------
//
// replaceFirst
//
//--------------------------------------------------------------------------------
UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
    // With a failure already pending, do no work and hand back a copy of the input.
    if (U_FAILURE(status)) {
        return *fInput;
    }

    // Search from the beginning of the input for the first match only.
    reset();
    if (find()) {
        // Text before the match, the expanded replacement, then the rest
        // of the input.
        UnicodeString result;
        appendReplacement(result, replacement, status);
        appendTail(result);
        return result;
    }

    // No match anywhere: the result is a copy of the input.
    return *fInput;
}
//--------------------------------------------------------------------------------
//
// reset
//
//--------------------------------------------------------------------------------
RegexMatcher &RegexMatcher::reset() {
fLastMatchStart = 0;
fLastMatchEnd = 0;
fLastMatch = FALSE;
fMatchStart = 0;
fMatchEnd = 0;
fLastMatchEnd = 0;
fMatch = FALSE;
int i;
for (i=0; i<=fPattern->fNumCaptureGroups; i++) {
fCaptureStarts->setElementAt(i, -1);
@ -252,7 +403,7 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
if (U_FAILURE(err)) {
return 0;
}
if (fLastMatch == FALSE) {
if (fMatch == FALSE) {
err = U_REGEX_INVALID_STATE;
return 0;
}
@ -262,7 +413,7 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
}
int32_t s;
if (group == 0) {
s = fLastMatchStart;
s = fMatchStart;
} else {
s = fCaptureStarts->elementAti(group);
// TODO: what to do if no match on this specific group?
@ -272,6 +423,26 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
//--------------------------------------------------------------------------------
//
// getCaptureText We have encountered a '\' that might precede a
// capture group specification.
// If a valid capture group number follows the '\',
// return the indices to the start & end of the captured
// text, and update the patIdx to the position following the
// \n sequence.
//
// This function is used during find and replace operations when
// processing capture references in the replacement text.
//
//--------------------------------------------------------------------------------
// Placeholder implementation: no parameter is read or written, and the
// function always returns FALSE.
UBool RegexMatcher::getCaptureText(const UnicodeString &rep,
int32_t &repIdx,
int32_t &textStart,
int32_t &textEnd)
{
// Not implemented yet; unconditionally returns FALSE.
return FALSE;
}
//--------------------------------------------------------------------------------
//
@ -408,6 +579,12 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
fCaptureEnds->setElementAt(inputIdx, opValue);
break;
case URX_BACKSLASH_A:
if (inputIdx != 0) {
backTrack(inputIdx, patIdx);
}
break;
case URX_SETREF:
if (inputIdx < fInputLength) {
@ -449,7 +626,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
default:
// Trouble. The compiled pattern contains an entry with an
// unrecognized type tag.
U_ASSERT(false);
U_ASSERT(FALSE);
}
if (U_FAILURE(status)) {
@ -458,10 +635,11 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
}
breakFromLoop:
fLastMatch = isMatch;
fMatch = isMatch;
if (isMatch) {
fLastMatchStart = startIdx;
fLastMatchEnd = inputIdx;
fLastMatchEnd = fMatchEnd;
fMatchStart = startIdx;
fMatchEnd = inputIdx;
}
return;
}

View File

@ -65,6 +65,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
fLiteralText = other.fLiteralText;
fBadState = other.fBadState;
fNumCaptureGroups = other.fNumCaptureGroups;
fMaxCaptureDigits = other.fMaxCaptureDigits;
if (fBadState) {
return *this;
}
@ -108,6 +109,7 @@ void RegexPattern::init() {
fFlags = 0;
fBadState = FALSE;
fNumCaptureGroups = 0;
fMaxCaptureDigits = 1; // TODO: calculate for real.
fMatcher = NULL;
UErrorCode status=U_ZERO_ERROR;
@ -301,6 +303,8 @@ UnicodeString RegexPattern::pattern() const {
//---------------------------------------------------------------------
//
// split
// TODO: perl returns captured strings intermixed with the
// fields. Should we do this too?
//
//---------------------------------------------------------------------
int32_t RegexPattern::split(const UnicodeString &input,
@ -359,9 +363,9 @@ int32_t RegexPattern::split(const UnicodeString &input,
if (fMatcher->find()) {
// We found another delimiter. Move everything from where we started looking
// up until the start of the delimiter into the next output string.
int32_t fieldLen = fMatcher->fLastMatchStart - nextOutputStringStart;
int32_t fieldLen = fMatcher->fMatchStart - nextOutputStringStart;
dest[i].setTo(input, nextOutputStringStart, fieldLen);
nextOutputStringStart = fMatcher->fLastMatchEnd;
nextOutputStringStart = fMatcher->fMatchEnd;
if (nextOutputStringStart == inputLen) {
// The delimiter was at the end of the string. We're done.
break;
@ -407,7 +411,7 @@ static char *opNames[] = {
"NOP",
"START_CAPTURE",
"END_CAPTURE",
"?10",
"URX_BACKSLASH_A",
"SETREF",
"DOTANY",
"JMP",

View File

@ -178,6 +178,7 @@ private:
// make new ones on each call.
int32_t fNumCaptureGroups;
int32_t fMaxCaptureDigits;
friend class RegexCompile;
friend class RegexMatcher;
@ -226,13 +227,16 @@ public:
* The append position is set to the position of the first
* character following the match in the input string.
*
* For complete, prepackaged, non-incremental find-and-replace
* operations, see replaceFirst() or replaceAll().
*
* Returns: This Matcher
*
* error: Illegal state - no match yet attempted, or last match failed.
* IndexOutOfBounds - capture string number from replacement string.
*/
virtual RegexMatcher &appendReplacement(UnicodeString &dest,
const UnicodeString &replacement);
const UnicodeString &replacement, UErrorCode &status);
/*
@ -329,7 +333,8 @@ public:
/*
* Replaces every subsequence of the input sequence that matches the pattern
* with the given replacement string.
* with the given replacement string. This is a convenience function that
* provides a complete find-and-replace-all operation.
*
* This method first resets this matcher. It then scans the input sequence
* looking for matches of the pattern. Characters that are not part of any
@ -337,10 +342,7 @@ public:
* replacement string. The replacement string may contain references to
* captured subsequences as in the appendReplacement method.
*
* @return The target string. Depending on how the RegexMatcher was
* created, this may either be the original input string or a copy
*
* Error: Index out of bounds (replacement string capture group)
* @return A string containing the results of the find and replace.
*
*/
virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &err);
@ -348,16 +350,15 @@ public:
/*
* Replaces the first subsequence of the input sequence that matches
* the pattern with the given replacement string.
* the pattern with the given replacement string. This is a convenience
* function that provides a complete find-and-replace operation.
*
* This method first resets this matcher. It then scans the input sequence
* looking for a match of the pattern. Characters that are not part
* of the match are appended directly to the result string; the match is replaced
* in the result by the replacement string. The replacement string may contain
* references to captured subsequences as in the appendReplacement method.
*
* Error: Index out of bounds (replacement string capture group)
* Illegal state (no match)
* Note: Javadoc doesn't list exceptions, but they gotta be there for consistency
*/
virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &err);
@ -409,27 +410,33 @@ public:
private:
// Constructors and other object boilerplate are private.
// Creation by users is through factory method in RegexPattern
// Instances of RegexMatcher can not be assigned, copied, cloned, etc.
// Creation by users is only through the factory method in class RegexPattern
RegexMatcher(const RegexPattern *pat);
RegexMatcher(const RegexMatcher &other);
RegexMatcher &operator =(const RegexMatcher &rhs);
friend class RegexPattern;
inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
//
// MatchAt This is the internal interface to the match engine itself.
// Match status comes back in matcher member variables.
//
virtual void MatchAt(int32_t startIdx, UErrorCode &status);
void MatchAt(int32_t startIdx, UErrorCode &status);
inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
UBool getCaptureText(const UnicodeString &rep,
int32_t &repIdx,
int32_t &textStart,
int32_t &textEnd);
const RegexPattern *fPattern;
const UnicodeString *fInput;
int32_t fInputLength;
UBool fLastMatch; // True if the last match was successful.
int32_t fLastMatchStart;
int32_t fLastMatchEnd;
UBool fMatch; // True if the last match was successful.
int32_t fMatchStart; // Position of the start of the most recent match
int32_t fMatchEnd; // First position after the end of the most recent match
int32_t fLastMatchEnd; // First position after the end of the previous match.
UStack *fBackTrackStack;
UVector *fCaptureStarts;
UVector *fCaptureEnds;

View File

@ -31,12 +31,12 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
if (exec) logln("TestSuite RegexTest: ");
switch (index) {
case 0: name = "API_Match";
if (exec) API_Match();
break;
case 1: name = "Basic";
case 0: name = "Basic";
if (exec) Basic();
break;
case 1: name = "API_Match";
if (exec) API_Match();
break;
case 2: name = "API_Replace";
if (exec) API_Replace();
break;
@ -87,6 +87,7 @@ UBool RegexTest::doRegexLMTest(char *pat, char *text, UBool looking, UBool match
errln("RegexTest failure in RegexPattern::compile() at line %d. Status = %d\n", line, status);
return FALSE;
}
// REPattern->dump();
UnicodeString inputString(inputText);
UnicodeString unEscapedInput = inputString.unescape();
@ -295,6 +296,101 @@ void RegexTest::API_Match() {
delete matcher;
delete pat;
}
//
// Replace
//
{
int32_t flags=0;
UParseError pe;
UErrorCode status=U_ZERO_ERROR;
UnicodeString re("abc");
RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
REGEX_CHECK_STATUS;
UnicodeString data = ".abc..abc...abc..";
// 012345678901234567
RegexMatcher *matcher = pat->matcher(data, status);
//
// Plain vanilla matches.
//
UnicodeString dest;
dest = matcher->replaceFirst("yz", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == ".yz..abc...abc..");
dest = matcher->replaceAll("yz", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == ".yz..yz...yz..");
//
// Plain vanilla non-matches.
//
UnicodeString d2 = ".abx..abx...abx..";
matcher->reset(d2);
dest = matcher->replaceFirst("yz", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == ".abx..abx...abx..");
dest = matcher->replaceAll("yz", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == ".abx..abx...abx..");
//
// Empty source string
//
UnicodeString d3 = "";
matcher->reset(d3);
dest = matcher->replaceFirst("yz", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == "");
dest = matcher->replaceAll("yz", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == "");
//
// Empty substitution string
//
matcher->reset(data); // ".abc..abc...abc.."
dest = matcher->replaceFirst("", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == "...abc...abc..");
dest = matcher->replaceAll("", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == "........");
//
// match whole string
//
UnicodeString d4 = "abc";
matcher->reset(d4);
dest = matcher->replaceFirst("xyz", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == "xyz");
dest = matcher->replaceAll("xyz", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == "xyz");
//
// Capture Group, simple case
//
UnicodeString re2("a(..)");
RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
REGEX_CHECK_STATUS;
UnicodeString d5 = "abcdefg";
RegexMatcher *matcher2 = pat2->matcher(d5, status);
REGEX_CHECK_STATUS;
dest = matcher2->replaceFirst("$1$1", status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(dest == "bcbcdefg");
}
}
@ -314,6 +410,7 @@ void RegexTest::Basic() {
//
#if 0
{
REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
}
return;
#endif
@ -419,6 +516,26 @@ void RegexTest::Basic() {
REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
//
// Escape sequences that become single literal chars, handled internally
// by ICU's Unescape.
//
// REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
REGEX_TESTLM("\\b", "\\u0008", TRUE, TRUE); // BS
// REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L (or whatever) TODO: bug in Unescape
// REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape TODO: bug in Unescape
REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
};