ICU-2422 regexp, match flag setting options added.
X-SVN-Rev: 11032
This commit is contained in:
parent
5e8f53a387
commit
2397658197
@ -185,7 +185,7 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(
|
||||
fCharNum = 0;
|
||||
fQuoteMode = FALSE;
|
||||
fFreeForm = FALSE;
|
||||
fCaseI = (fRXPat->fFlags & UREGEX_CASE_INSENSITIVE) != 0;
|
||||
fModeFlags = fRXPat->fFlags;
|
||||
|
||||
fMatchOpenParen = -1;
|
||||
fMatchCloseParen = -1;
|
||||
@ -579,9 +579,10 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
// of the two NOPs. Depending on what follows in the pattern, the
|
||||
// NOPs may be changed to SAVE_STATE or JMP ops, with a target
|
||||
// address of the end of the parenthesized group.
|
||||
fParenStack.push(-2, *fStatus); // Begin a new frame.
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
|
||||
fParenStack.push(fModeFlags, *fStatus); // Match mode state
|
||||
fParenStack.push(capturing, *fStatus); // Frame type.
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
|
||||
|
||||
// Save the mapping from group number to stack frame variable position.
|
||||
fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
|
||||
@ -601,9 +602,10 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
|
||||
// On the Parentheses stack, start a new frame and add the positions
|
||||
// of the two NOPs.
|
||||
fParenStack.push(-1, *fStatus); // Begin a new frame.
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
|
||||
fParenStack.push(fModeFlags, *fStatus); // Match mode state
|
||||
fParenStack.push(plain, *fStatus); // Begin a new frame.
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
|
||||
}
|
||||
break;
|
||||
|
||||
@ -628,7 +630,8 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
// of the two NOPs. Depending on what follows in the pattern, the
|
||||
// NOPs may be changed to SAVE_STATE or JMP ops, with a target
|
||||
// address of the end of the parenthesized group.
|
||||
fParenStack.push(-3, *fStatus); // Begin a new frame.
|
||||
fParenStack.push(fModeFlags, *fStatus); // Match mode state
|
||||
fParenStack.push(atomic, *fStatus); // Frame type.
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
|
||||
}
|
||||
@ -659,9 +662,10 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
|
||||
// On the Parentheses stack, start a new frame and add the positions
|
||||
// of the NOPs.
|
||||
fParenStack.push(lookAhead, *fStatus); // Begin a new frame.
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
|
||||
fParenStack.push(fModeFlags, *fStatus); // Match mode state
|
||||
fParenStack.push(lookAhead, *fStatus); // Frame type.
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
|
||||
}
|
||||
break;
|
||||
|
||||
@ -690,9 +694,10 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
|
||||
// On the Parentheses stack, start a new frame and add the positions
|
||||
// of the StateSave and NOP.
|
||||
fParenStack.push( negLookAhead, *fStatus); // Begin a new frame.
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
|
||||
fParenStack.push(fModeFlags, *fStatus); // Match mode state
|
||||
fParenStack.push( negLookAhead, *fStatus); // Frame type
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
|
||||
|
||||
// Instructions #5 and #6 will be added when the ')' is encountered.
|
||||
}
|
||||
@ -957,16 +962,30 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
|
||||
case doDotAny:
|
||||
// scanned a ".", match any single character.
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOTANY, 0), *fStatus);
|
||||
{
|
||||
int32_t op;
|
||||
if (fModeFlags & UREGEX_DOTALL) {
|
||||
op = URX_BUILD(URX_DOTANY_ALL, 0);
|
||||
} else {
|
||||
op = URX_BUILD(URX_DOTANY, 0);
|
||||
}
|
||||
fRXPat->fCompiledPat->addElement(op, *fStatus);
|
||||
}
|
||||
break;
|
||||
|
||||
case doCaret: // TODO: multi-line mode flag.
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus);
|
||||
case doCaret:
|
||||
{
|
||||
int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_CARET_M : URX_CARET;
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case doDollar: // TODO: multi-line mode flag.
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
|
||||
case doDollar:
|
||||
{
|
||||
int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_DOLLAR_M : URX_DOLLAR;
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
|
||||
}
|
||||
break;
|
||||
|
||||
case doBackslashA:
|
||||
@ -1051,8 +1070,9 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
case doScanUnicodeSet:
|
||||
{
|
||||
UnicodeSet *theSet = scanSet();
|
||||
if (fCaseI && theSet != NULL) {
|
||||
if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && theSet != NULL) {
|
||||
caseClose(theSet); // TODO: replace with the real function.
|
||||
// theSet->closeOver(USET_CASE);
|
||||
}
|
||||
compileSet(theSet);
|
||||
}
|
||||
@ -1094,7 +1114,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
// of compilation, it will be changed to the variables location.
|
||||
U_ASSERT(groupNum > 0);
|
||||
int32_t op;
|
||||
if (fCaseI) {
|
||||
if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
|
||||
op = URX_BUILD(URX_BACKREF_I, groupNum);
|
||||
} else {
|
||||
op = URX_BUILD(URX_BACKREF, groupNum);
|
||||
@ -1217,11 +1237,70 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
break;
|
||||
|
||||
|
||||
case doMatchMode: // (?i) and similar
|
||||
// TODO: implement
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
case doBeginMatchMode:
|
||||
fNewModeFlags = fModeFlags;
|
||||
fSetModeFlag = TRUE;
|
||||
break;
|
||||
|
||||
case doMatchMode: // (?i) and similar
|
||||
{
|
||||
int32_t bit = 0;
|
||||
switch (fC.fChar) {
|
||||
case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break;
|
||||
case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break;
|
||||
case 0x73: /* 's' */ bit = UREGEX_DOTALL; break;
|
||||
case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break;
|
||||
case 0x2d: /* '-' */ fSetModeFlag = FALSE; break;
|
||||
default:
|
||||
U_ASSERT(FALSE); // Should never happen. Other chars are filtered out
|
||||
// by the scanner.
|
||||
}
|
||||
if (fSetModeFlag) {
|
||||
fNewModeFlags |= bit;
|
||||
} else {
|
||||
fNewModeFlags &= ~bit;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case doSetMatchMode:
|
||||
// We've got a (?i) or similar. The match mode is being changed, but
|
||||
// the change is not scoped to a parenthesized block.
|
||||
fModeFlags = fNewModeFlags;
|
||||
|
||||
// Prevent any string from spanning across the change of match mode.
|
||||
// Otherwise the pattern "abc(?i)def" would make a single string of "abcdef"
|
||||
fixLiterals();
|
||||
break;
|
||||
|
||||
|
||||
case doMatchModeParen:
|
||||
// We've got a (?i: or similar. Begin a parenthesized block, save old
|
||||
// mode flags so they can be restored at the close of the block.
|
||||
//
|
||||
// Compile to a
|
||||
// - NOP, which later may be replaced by a save-state if the
|
||||
// parenthesized group gets a * quantifier, followed by
|
||||
// - NOP, which may later be replaced by a save-state if there
|
||||
// is an '|' alternation within the parens.
|
||||
{
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
|
||||
|
||||
// On the Parentheses stack, start a new frame and add the positions
|
||||
// of the two NOPs (a normal non-capturing () frame, except for the
|
||||
// saving of the original mode flags.)
|
||||
fParenStack.push(fModeFlags, *fStatus);
|
||||
fParenStack.push(flags, *fStatus); // Frame Marker
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
|
||||
|
||||
// Set the current mode flags to the new values.
|
||||
fModeFlags = fNewModeFlags;
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
|
||||
default:
|
||||
error(U_REGEX_INTERNAL_ERROR);
|
||||
@ -1278,7 +1357,7 @@ void RegexCompile::literalChar() {
|
||||
opType = URX_TYPE(op);
|
||||
U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN);
|
||||
if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) {
|
||||
if (fCaseI) {
|
||||
if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
|
||||
op = URX_BUILD(URX_STRING_I, fStringOpStart);
|
||||
} else {
|
||||
op = URX_BUILD(URX_STRING, fStringOpStart);
|
||||
@ -1308,7 +1387,7 @@ void RegexCompile::literalChar() {
|
||||
//------------------------------------------------------------------------------
|
||||
void RegexCompile::emitONE_CHAR(UChar32 c) {
|
||||
int32_t op;
|
||||
if (fCaseI && (u_tolower(c) != u_toupper(c))) {
|
||||
if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && (u_tolower(c) != u_toupper(c))) {
|
||||
// We have a cased character, and are in case insensitive matching mode.
|
||||
// TODO: replace with a better test. See Alan L.'s mail of 2/6
|
||||
c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
|
||||
@ -1540,11 +1619,17 @@ void RegexCompile::handleCloseParen() {
|
||||
fMatchOpenParen = patIdx;
|
||||
}
|
||||
|
||||
// At the close of any parenthesized block, restore the match mode flags to
|
||||
// the value they had at the open paren. Saved value is
|
||||
// at the top of the paren stack.
|
||||
fModeFlags = fParenStack.popi();
|
||||
|
||||
// DO any additional fixups, depending on the specific kind of
|
||||
// parenthesized grouping this is.
|
||||
|
||||
switch (patIdx) {
|
||||
case plain:
|
||||
case flags:
|
||||
// No additional fixups required.
|
||||
// (Grouping-only parentheses)
|
||||
break;
|
||||
|
@ -74,7 +74,8 @@ public:
|
||||
capturing = -2,
|
||||
atomic = -3,
|
||||
lookAhead = -4,
|
||||
negLookAhead = -5
|
||||
negLookAhead = -5,
|
||||
flags = -6
|
||||
};
|
||||
|
||||
private:
|
||||
@ -142,7 +143,11 @@ private:
|
||||
//
|
||||
// Data associated with the generation of the pcode for the match engine
|
||||
//
|
||||
UBool fCaseI; // Case Insensitive Match Mode is on.
|
||||
int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.)
|
||||
int32_t fNewModeFlags; // New flags, while compiling (?i, holds state
|
||||
// until last flag is scanned.
|
||||
UBool fSetModeFlag; // true for (?ismx, false for (?-ismx
|
||||
|
||||
|
||||
int32_t fStringOpStart; // While a literal string is being scanned
|
||||
// holds the start index within RegexPattern.
|
||||
|
@ -24,6 +24,7 @@ U_NAMESPACE_BEGIN
|
||||
enum Regex_PatternParseAction {
|
||||
doCloseParen,
|
||||
doProperty,
|
||||
doBeginMatchMode,
|
||||
doOrOperator,
|
||||
doOpenCaptureParen,
|
||||
doBadOpenParenType,
|
||||
@ -53,11 +54,13 @@ enum Regex_PatternParseAction {
|
||||
doBackslashA,
|
||||
doBackslashB,
|
||||
doNGPlus,
|
||||
doSetMatchMode,
|
||||
doPatFinish,
|
||||
doBackslashD,
|
||||
doPossesiveOpt,
|
||||
doEscapeError,
|
||||
doBackslashG,
|
||||
doMatchModeParen,
|
||||
doOpt,
|
||||
doInterval,
|
||||
doLiteralChar,
|
||||
@ -136,11 +139,11 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
||||
, {doOpenLookAheadNeg, 33 /* ! */, 2, 20, TRUE} // 32
|
||||
, {doNOP, 60 /* < */, 43,0, TRUE} // 33
|
||||
, {doNOP, 35 /* # */, 46, 2, TRUE} // 34
|
||||
, {doMatchMode, 105 /* i */, 49,0, TRUE} // 35
|
||||
, {doMatchMode, 120 /* x */, 49,0, TRUE} // 36
|
||||
, {doMatchMode, 115 /* s */, 49,0, TRUE} // 37
|
||||
, {doMatchMode, 109 /* m */, 49,0, TRUE} // 38
|
||||
, {doMatchMode, 45 /* - */, 49,0, TRUE} // 39
|
||||
, {doBeginMatchMode, 105 /* i */, 49,0, FALSE} // 35
|
||||
, {doBeginMatchMode, 109 /* m */, 49,0, FALSE} // 36
|
||||
, {doBeginMatchMode, 115 /* s */, 49,0, FALSE} // 37
|
||||
, {doBeginMatchMode, 120 /* x */, 49,0, FALSE} // 38
|
||||
, {doBeginMatchMode, 45 /* - */, 49,0, FALSE} // 39
|
||||
, {doConditionalExpr, 40 /* ( */, 101,0, TRUE} // 40
|
||||
, {doPerlInline, 123 /* { */, 101,0, TRUE} // 41
|
||||
, {doBadOpenParenType, 255, 101,0, FALSE} // 42
|
||||
@ -151,12 +154,12 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
||||
, {doMismatchedParenErr, 253, 101,0, FALSE} // 47
|
||||
, {doNOP, 255, 46,0, TRUE} // 48
|
||||
, {doMatchMode, 105 /* i */, 49,0, TRUE} // 49 paren-flag
|
||||
, {doMatchMode, 115 /* s */, 49,0, TRUE} // 50
|
||||
, {doMatchMode, 109 /* m */, 49,0, TRUE} // 51
|
||||
, {doMatchMode, 109 /* m */, 49,0, TRUE} // 50
|
||||
, {doMatchMode, 115 /* s */, 49,0, TRUE} // 51
|
||||
, {doMatchMode, 120 /* x */, 49,0, TRUE} // 52
|
||||
, {doMatchMode, 45 /* - */, 49,0, TRUE} // 53
|
||||
, {doNOP, 41 /* ) */, 2,0, TRUE} // 54
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 2, 14, TRUE} // 55
|
||||
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 54
|
||||
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 55
|
||||
, {doNOP, 255, 101,0, FALSE} // 56
|
||||
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 57 quant-star
|
||||
, {doPossesiveStar, 43 /* + */, 20,0, TRUE} // 58
|
||||
|
@ -132,11 +132,11 @@ open-paren-extended:
|
||||
'!' n term ^expr-cont doOpenLookAheadNeg # (?!
|
||||
'<' n open-paren-lookbehind
|
||||
'#' n paren-comment ^term
|
||||
'i' n paren-flag doMatchMode
|
||||
'x' n paren-flag doMatchMode
|
||||
's' n paren-flag doMatchMode
|
||||
'm' n paren-flag doMatchMode
|
||||
'-' n paren-flag doMatchMode
|
||||
'i' paren-flag doBeginMatchMode
|
||||
'm' paren-flag doBeginMatchMode
|
||||
's' paren-flag doBeginMatchMode
|
||||
'x' paren-flag doBeginMatchMode
|
||||
'-' paren-flag doBeginMatchMode
|
||||
'(' n errorDeath doConditionalExpr
|
||||
'{' n errorDeath doPerlInline
|
||||
default errorDeath doBadOpenParenType
|
||||
@ -157,16 +157,16 @@ paren-comment:
|
||||
default n paren-comment
|
||||
|
||||
#
|
||||
# paren-flag Scanned a (?ismx-ismx flag setting thing
|
||||
# TODO: this is not fully implemented yet.
|
||||
# paren-flag Scanned a (?ismx-ismx flag setting
|
||||
#
|
||||
paren-flag:
|
||||
'i' n paren-flag doMatchMode
|
||||
's' n paren-flag doMatchMode
|
||||
'm' n paren-flag doMatchMode
|
||||
's' n paren-flag doMatchMode
|
||||
'x' n paren-flag doMatchMode
|
||||
'-' n paren-flag doMatchMode
|
||||
')' n term
|
||||
':' n term ^expr-quant doOpenNonCaptureParen
|
||||
')' n term doSetMatchMode
|
||||
':' n term ^expr-quant doMatchModeParen
|
||||
default errorDeath
|
||||
|
||||
|
||||
|
@ -117,9 +117,11 @@ enum {
|
||||
// First Operand: Index of start of string in string literals
|
||||
// Second Operand (next word in compiled code):
|
||||
// the length of the string.
|
||||
URX_BACKREF_I = 41 // Case insensitive back reference.
|
||||
URX_BACKREF_I = 41, // Case insensitive back reference.
|
||||
// Parameter is the index of the
|
||||
// capture group variables in the state stack frame.
|
||||
URX_DOLLAR_M = 42, // $ in multi-line mode.
|
||||
URX_CARET_M = 43 // ^ in multi-line mode.
|
||||
};
|
||||
|
||||
// Keep this list of opcode names in sync with the above enum
|
||||
@ -166,7 +168,9 @@ enum {
|
||||
"LA_END", \
|
||||
"ONECHAR_I", \
|
||||
"STRING_I", \
|
||||
"BACKREF_I"
|
||||
"BACKREF_I", \
|
||||
"DOLLAR_M", \
|
||||
"CARET_M"
|
||||
|
||||
//
|
||||
// Convenience macros for assembling and disassembling a compiled operation.
|
||||
|
@ -810,17 +810,52 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
|
||||
// TODO: support for multi-line mode.
|
||||
break;
|
||||
|
||||
|
||||
case URX_CARET: // ^, test for start of line
|
||||
case URX_DOLLAR_M: // $, test for End of line in multi-line mode
|
||||
{
|
||||
if (fp->fInputIdx >= inputLen) {
|
||||
// We really are at the end of input. Success.
|
||||
break;
|
||||
}
|
||||
// If we are positioned just before a new-line , succeed.
|
||||
// It makes no difference where the new-line is within the input.
|
||||
UChar32 c = inputBuf[fp->fInputIdx];
|
||||
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
|
||||
break; // At new-line at end of input. Success
|
||||
}
|
||||
// not at a new line. Fail.
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_CARET: // ^, test for start of line
|
||||
if (fp->fInputIdx != 0) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
} // TODO: support for multi-line mode.
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_CARET_M: // ^, test for start of line in mulit-line mode
|
||||
{
|
||||
if (fp->fInputIdx == 0) {
|
||||
// We are at the start of input. Success.
|
||||
break;
|
||||
}
|
||||
// Check the character just before the current pos.
|
||||
UChar c = inputBuf[fp->fInputIdx - 1];
|
||||
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
|
||||
// It's a new-line. ^ is true. Success.
|
||||
break;
|
||||
}
|
||||
// Not at the start of a line. Fail.
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_A: // Test for start of input
|
||||
if (fp->fInputIdx != 0) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
@ -966,10 +1001,10 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
|
||||
case URX_DOTANY:
|
||||
{
|
||||
// . matches anything
|
||||
// . matches anything, but stops at end-of-line.
|
||||
if (fp->fInputIdx >= inputLen) {
|
||||
// At end of input. Match failed. Backtrack out.
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
break;
|
||||
}
|
||||
// There is input left. Advance over one char, unless we've hit end-of-line
|
||||
@ -988,20 +1023,20 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
case URX_DOTANY_ALL:
|
||||
{
|
||||
// ., in dot-matches-all (including new lines) mode
|
||||
// . matches anything
|
||||
if (fp->fInputIdx >= inputLen) {
|
||||
// At end of input. Match failed. Backtrack out.
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
break;
|
||||
}
|
||||
// There is input left. Advance over one char, unless we've hit end-of-line
|
||||
UChar32 c = fInput->char32At(fp->fInputIdx);
|
||||
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
|
||||
if (c == 0x0a || c==0x0d || c==0x0c || c==0x85 ||c==0x2028 || c==0x2029) {
|
||||
// There is input left. Advance over one char, except if we are
|
||||
// at a cr/lf, advance over both of them.
|
||||
UChar32 c;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
|
||||
if (c==0x0d) {
|
||||
// In the case of a CR/LF, we need to advance over both.
|
||||
UChar32 nextc = fInput->char32At(fp->fInputIdx);
|
||||
if (c == 0x0d && nextc == 0x0a) {
|
||||
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
|
||||
UChar nextc = inputBuf[fp->fInputIdx];
|
||||
if (nextc == 0x0a) {
|
||||
fp->fInputIdx++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -452,10 +452,14 @@ void RegexPattern::dumpOp(int32_t index) const {
|
||||
case URX_NOP:
|
||||
case URX_DOTANY:
|
||||
case URX_FAIL:
|
||||
case URX_CARET:
|
||||
case URX_DOLLAR:
|
||||
case URX_BACKSLASH_A:
|
||||
case URX_BACKSLASH_G:
|
||||
case URX_BACKSLASH_X:
|
||||
case URX_END:
|
||||
case URX_DOLLAR_M:
|
||||
case URX_CARET_M:
|
||||
// Types with no operand field of interest.
|
||||
break;
|
||||
|
||||
@ -468,8 +472,6 @@ void RegexPattern::dumpOp(int32_t index) const {
|
||||
case URX_BACKSLASH_D:
|
||||
case URX_BACKSLASH_W:
|
||||
case URX_BACKSLASH_Z:
|
||||
case URX_CARET:
|
||||
case URX_DOLLAR:
|
||||
case URX_STRING_LEN:
|
||||
case URX_CTR_INIT:
|
||||
case URX_CTR_INIT_NG:
|
||||
@ -485,6 +487,7 @@ void RegexPattern::dumpOp(int32_t index) const {
|
||||
case URX_JMPX:
|
||||
case URX_LA_START:
|
||||
case URX_LA_END:
|
||||
case URX_BACKREF_I:
|
||||
// types with an integer operand field.
|
||||
REGEX_DUMP_DEBUG_PRINTF("%d", val);
|
||||
break;
|
||||
|
@ -64,13 +64,17 @@ struct REStackFrame;
|
||||
enum {
|
||||
/** Forces normalization of pattern and strings. @draft ICU 2.4 */
|
||||
UREGEX_CANON_EQ = 128,
|
||||
|
||||
/** Enable case insensitive matching. @draft ICU 2.4 */
|
||||
UREGEX_CASE_INSENSITIVE = 2,
|
||||
|
||||
/** Allow white space and comments within patterns @draft ICU 2.4 */
|
||||
UREGEX_COMMENTS = 4,
|
||||
|
||||
/** If set, '.' matches line terminators, otherwise '.' matching stops at line end.
|
||||
* @draft ICU 2.4 */
|
||||
UREGEX_DOTALL = 32,
|
||||
|
||||
/** Control behavior of "$" and "^"
|
||||
* If set, recognize line terminators within string,
|
||||
* otherwise, match only at start and end of input string.
|
||||
|
@ -1228,10 +1228,6 @@ void RegexTest::Errors() {
|
||||
REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
|
||||
REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
|
||||
|
||||
// Flag settings not yet implemented
|
||||
REGEX_ERR("(?i:stuff*)", 1, 3, U_REGEX_UNIMPLEMENTED);
|
||||
REGEX_ERR("(?-si) stuff", 1, 3, U_REGEX_UNIMPLEMENTED);
|
||||
|
||||
// Look-ahead, Look-behind
|
||||
REGEX_ERR("abc(?<=xyz).*", 1, 7, U_REGEX_UNIMPLEMENTED); // look-behind
|
||||
REGEX_ERR("abc(?<!xyz).*", 1, 7, U_REGEX_UNIMPLEMENTED); // negated look-behind
|
||||
@ -1666,10 +1662,16 @@ void RegexTest::PerlTests() {
|
||||
}
|
||||
|
||||
else if (perlExpr.startsWith("\\")) { // \Escape. Take following char as a literal.
|
||||
// or as an escaped sequence (e.g. \n)
|
||||
if (perlExpr.length() > 1) {
|
||||
perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
|
||||
}
|
||||
resultString.append(perlExpr.charAt(0));
|
||||
UChar c = perlExpr.charAt(0);
|
||||
switch (c) {
|
||||
case 'n': c = '\n'; break;
|
||||
// add any other escape sequences that show up in the test expected results.
|
||||
}
|
||||
resultString.append(c);
|
||||
perlExpr.remove(0, 1);
|
||||
}
|
||||
|
||||
@ -1693,6 +1695,8 @@ void RegexTest::PerlTests() {
|
||||
UnicodeString expectedS(fields[4]);
|
||||
expectedS.findAndReplace(nulnulSrc, nulnul);
|
||||
expectedS.findAndReplace(ffffSrc, ffff);
|
||||
expectedS.findAndReplace("\\n", "\n");
|
||||
|
||||
|
||||
if (expectedS.compare(resultString) != 0) {
|
||||
errln("Line %d: Incorrect perl expression results. Expected \"%s\"; got \"%s\"",
|
||||
|
8
icu4c/source/test/testdata/regextst.txt
vendored
8
icu4c/source/test/testdata/regextst.txt
vendored
@ -210,5 +210,9 @@
|
||||
|
||||
# Case Insensitive
|
||||
"aBc" i "<0>ABC</0>"
|
||||
#"a[^bc]d" i "ABD" # TODO: case closure bug
|
||||
'((((((((((a))))))))))\10' i "<0><1><2><3><4><5><6><7><8><9><10>A</10></9></8></7></6></5></4></3></2></1>A</0>"
|
||||
#"a[^bc]d" i "ABD" # TODO: case closure bug
|
||||
'((((((((((a))))))))))\10' i "<0><1><2><3><4><5><6><7><8><9><10>A</10></9></8></7></6></5></4></3></2></1>A</0>"
|
||||
|
||||
"(?:(?i)a)b" "<0>Ab</0>"
|
||||
"ab(?i)cd" "<0>abCd</0>"
|
||||
"ab$cd" "abcd"
|
||||
|
Loading…
Reference in New Issue
Block a user