ICU-105 Regular Expressions, changes from code review
X-SVN-Rev: 10294
This commit is contained in:
parent
bf1f6b1213
commit
24bf088281
@ -1839,7 +1839,6 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
|
|||||||
"U_REGEX_PROPERTY_SYNTAX",
|
"U_REGEX_PROPERTY_SYNTAX",
|
||||||
"U_REGEX_UNIMPLEMENTED",
|
"U_REGEX_UNIMPLEMENTED",
|
||||||
"U_REGEX_MISMATCHED_PAREN",
|
"U_REGEX_MISMATCHED_PAREN",
|
||||||
"U_REGEX_MATCH_MODE_ERROR"
|
|
||||||
};
|
};
|
||||||
|
|
||||||
U_CAPI const char * U_EXPORT2
|
U_CAPI const char * U_EXPORT2
|
||||||
|
@ -500,18 +500,17 @@ typedef enum UErrorCode {
|
|||||||
/*
|
/*
|
||||||
* The error codes in the range 0x10300-0x103ff are reserved for regular expression related errors
|
* The error codes in the range 0x10300-0x103ff are reserved for regular expression related errors
|
||||||
*/
|
*/
|
||||||
U_REGEX_ERROR_START=0x10300,
|
U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */
|
||||||
U_REGEX_INTERNAL_ERROR,
|
U_REGEX_INTERNAL_ERROR, /**< An internal error (bug) was detected. */
|
||||||
U_REGEX_RULE_SYNTAX,
|
U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */
|
||||||
U_REGEX_INVALID_STATE,
|
U_REGEX_INVALID_STATE, /**< RegexMatcher in invalid state for requested operation */
|
||||||
U_REGEX_BAD_ESCAPE_SEQUENCE,
|
U_REGEX_BAD_ESCAPE_SEQUENCE, /**< Unrecognized backslash escape sequence in pattern */
|
||||||
U_REGEX_PROPERTY_SYNTAX,
|
U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
|
||||||
U_REGEX_UNIMPLEMENTED,
|
U_REGEX_UNIMPLEMENTED, /**< Use of regexp feature that is not yet implemented. */
|
||||||
U_REGEX_MISMATCHED_PAREN,
|
U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */
|
||||||
U_REGEX_MATCH_MODE_ERROR,
|
U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
|
||||||
U_REGEX_ERROR_LIMIT,
|
|
||||||
|
|
||||||
U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
|
U_ERROR_LIMIT=U_REGEX_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
|
||||||
} UErrorCode;
|
} UErrorCode;
|
||||||
|
|
||||||
/* Use the following to determine if an UErrorCode represents */
|
/* Use the following to determine if an UErrorCode represents */
|
||||||
|
@ -28,8 +28,6 @@
|
|||||||
#include "ucln_in.h"
|
#include "ucln_in.h"
|
||||||
#include "mutex.h"
|
#include "mutex.h"
|
||||||
|
|
||||||
#include "stdio.h" // TODO: Get rid of this
|
|
||||||
|
|
||||||
#include "regeximp.h"
|
#include "regeximp.h"
|
||||||
#include "regexcst.h" // Contains state table for the regex pattern parser.
|
#include "regexcst.h" // Contains state table for the regex pattern parser.
|
||||||
// generated by a Perl script.
|
// generated by a Perl script.
|
||||||
@ -40,7 +38,6 @@
|
|||||||
U_NAMESPACE_BEGIN
|
U_NAMESPACE_BEGIN
|
||||||
|
|
||||||
const char RegexCompile::fgClassID=0;
|
const char RegexCompile::fgClassID=0;
|
||||||
static const int RESCAN_DEBUG = 0;
|
|
||||||
|
|
||||||
//----------------------------------------------------------------------------------------
|
//----------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
@ -173,6 +170,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
|
|||||||
|
|
||||||
//
|
//
|
||||||
// Set up the constant (static) Unicode Sets.
|
// Set up the constant (static) Unicode Sets.
|
||||||
|
// TODO: something cleaner for that -128 constant.
|
||||||
//
|
//
|
||||||
ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_rule_char-128], gRuleSet_rule_char_pattern, status);
|
ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_rule_char-128], gRuleSet_rule_char_pattern, status);
|
||||||
ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_white_space-128], gRuleWhiteSpacePattern, status);
|
ThreadSafeUnicodeSetInit(&gRuleSets[kRuleSet_white_space-128], gRuleWhiteSpacePattern, status);
|
||||||
@ -282,14 +280,12 @@ void RegexCompile::compile(
|
|||||||
// the search will stop there, if not before.
|
// the search will stop there, if not before.
|
||||||
//
|
//
|
||||||
tableEl = &gRuleParseStateTable[state];
|
tableEl = &gRuleParseStateTable[state];
|
||||||
if (RESCAN_DEBUG) {
|
REGEX_SCAN_DEBUG_PRINTF( "char, line, col = (\'%c\', %d, %d) state=%s ",
|
||||||
printf( "char, line, col = (\'%c\', %d, %d) state=%s ",
|
fC.fChar, fLineNum, fCharNum, RegexStateNames[state]);
|
||||||
fC.fChar, fLineNum, fCharNum, RegexStateNames[state]);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (;;) { // loop through table rows belonging to this state, looking for one
|
for (;;) { // loop through table rows belonging to this state, looking for one
|
||||||
// that matches the current input char.
|
// that matches the current input char.
|
||||||
if (RESCAN_DEBUG) { printf( ".");}
|
REGEX_SCAN_DEBUG_PRINTF( ".");
|
||||||
if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
|
if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
|
||||||
// Table row specified an individual character, not a set, and
|
// Table row specified an individual character, not a set, and
|
||||||
// the input character is not quoted, and
|
// the input character is not quoted, and
|
||||||
@ -323,7 +319,7 @@ void RegexCompile::compile(
|
|||||||
// No match on this row, advance to the next row for this state,
|
// No match on this row, advance to the next row for this state,
|
||||||
tableEl++;
|
tableEl++;
|
||||||
}
|
}
|
||||||
if (RESCAN_DEBUG) { printf( "\n");}
|
REGEX_SCAN_DEBUG_PRINTF("\n");
|
||||||
|
|
||||||
//
|
//
|
||||||
// We've found the row of the state table that matches the current input
|
// We've found the row of the state table that matches the current input
|
||||||
@ -340,7 +336,7 @@ void RegexCompile::compile(
|
|||||||
fStackPtr++;
|
fStackPtr++;
|
||||||
if (fStackPtr >= kStackSize) {
|
if (fStackPtr >= kStackSize) {
|
||||||
error(U_REGEX_INTERNAL_ERROR);
|
error(U_REGEX_INTERNAL_ERROR);
|
||||||
// printf( "RegexCompile::parse() - state stack overflow.\n");
|
REGEX_SCAN_DEBUG_PRINTF( "RegexCompile::parse() - state stack overflow.\n");
|
||||||
fStackPtr--;
|
fStackPtr--;
|
||||||
}
|
}
|
||||||
fStack[fStackPtr] = tableEl->fPushState;
|
fStack[fStackPtr] = tableEl->fPushState;
|
||||||
@ -369,6 +365,36 @@ void RegexCompile::compile(
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// The pattern has now been read and processed, and the compiled code generated.
|
||||||
|
//
|
||||||
|
|
||||||
|
//
|
||||||
|
// Compute the number of digits required for the largest capture group number.
|
||||||
|
//
|
||||||
|
fRXPat->fMaxCaptureDigits = 1;
|
||||||
|
int32_t n = 10;
|
||||||
|
for (;;) {
|
||||||
|
if (n > fRXPat->fNumCaptureGroups) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
fRXPat->fMaxCaptureDigits++;
|
||||||
|
n *= 10;
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// A stupid bit of non-sense to prevent code coverage testing from complaining
|
||||||
|
// about the pattern.dump() debug function. Go through the motions of dumping,
|
||||||
|
// even though, without the #define set, it will do nothing.
|
||||||
|
//
|
||||||
|
#ifndef REGEX_DUMP_DEBUG
|
||||||
|
static UBool phonyDumpDone = FALSE;
|
||||||
|
if (phonyDumpDone==FALSE) {
|
||||||
|
fRXPat->dump();
|
||||||
|
phonyDumpDone = TRUE;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -1094,27 +1120,39 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
|
|||||||
if (theSet == NULL) {
|
if (theSet == NULL) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (theSet->size() > 1) {
|
int32_t setSize = theSet->size();
|
||||||
// The set contains two or more chars.
|
UChar32 firstSetChar = theSet->charAt(0);
|
||||||
// Put it into the compiled pattern as a set.
|
if (firstSetChar == -1) {
|
||||||
int32_t setNumber = fRXPat->fSets->size();
|
// Sets that contain only strings, but no individual chars,
|
||||||
fRXPat->fSets->addElement(theSet, *fStatus);
|
// will end up here. TODO: figure out what to do with sets containing strings.
|
||||||
int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
|
setSize = 0;
|
||||||
fRXPat->fCompiledPat->addElement(setOp, *fStatus);
|
|
||||||
}
|
}
|
||||||
else
|
|
||||||
{
|
switch (setSize) {
|
||||||
// The set contains only a single code point. Put it into
|
case 0: // Set of no elements. Always fails to match.
|
||||||
// the compiled pattern as a single char operation rather
|
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus);
|
||||||
// than a set, and discard the set itself.
|
break;
|
||||||
UChar32 c = theSet->charAt(0);
|
|
||||||
if (c == -1) {
|
case 1:
|
||||||
// Set contained no chars. Stuff an invalid char that can't match.
|
{
|
||||||
c = 0x1fffff;
|
// The set contains only a single code point. Put it into
|
||||||
|
// the compiled pattern as a single char operation rather
|
||||||
|
// than a set, and discard the set itself.
|
||||||
|
int32_t charToken = URX_BUILD(URX_ONECHAR, firstSetChar);
|
||||||
|
fRXPat->fCompiledPat->addElement(charToken, *fStatus);
|
||||||
|
delete theSet;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
{
|
||||||
|
// The set contains two or more chars. (the normal case)
|
||||||
|
// Put it into the compiled pattern as a set.
|
||||||
|
int32_t setNumber = fRXPat->fSets->size();
|
||||||
|
fRXPat->fSets->addElement(theSet, *fStatus);
|
||||||
|
int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
|
||||||
|
fRXPat->fCompiledPat->addElement(setOp, *fStatus);
|
||||||
}
|
}
|
||||||
int32_t charToken = URX_BUILD(URX_ONECHAR, c);
|
|
||||||
fRXPat->fCompiledPat->addElement(charToken, *fStatus);
|
|
||||||
delete theSet;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1321,7 +1359,7 @@ UnicodeSet *RegexCompile::scanSet() {
|
|||||||
if (U_FAILURE(localStatus)) {
|
if (U_FAILURE(localStatus)) {
|
||||||
// TODO: Get more accurate position of the error from UnicodeSet's return info.
|
// TODO: Get more accurate position of the error from UnicodeSet's return info.
|
||||||
// UnicodeSet appears to not be reporting correctly at this time.
|
// UnicodeSet appears to not be reporting correctly at this time.
|
||||||
printf( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
|
REGEX_SCAN_DEBUG_PRINTF( "UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex());
|
||||||
error(localStatus);
|
error(localStatus);
|
||||||
delete uset;
|
delete uset;
|
||||||
return NULL;
|
return NULL;
|
||||||
|
@ -28,8 +28,6 @@
|
|||||||
U_NAMESPACE_BEGIN
|
U_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
|
||||||
static const UBool REGEX_DEBUG = TRUE;
|
|
||||||
|
|
||||||
//--------------------------------------------------------------------------------
|
//--------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// class RegexCompile Contains the regular expression compiler.
|
// class RegexCompile Contains the regular expression compiler.
|
||||||
|
@ -13,13 +13,45 @@
|
|||||||
#define _REGEXIMP_H
|
#define _REGEXIMP_H
|
||||||
|
|
||||||
|
|
||||||
|
//
//  Debugging support.  Enable one or more of the #defines immediately following.
//
//    REGEX_SCAN_DEBUG   routes REGEX_SCAN_DEBUG_PRINTF to printf.
//    REGEX_DUMP_DEBUG   routes REGEX_DUMP_DEBUG_PRINTF to printf.
//    REGEX_RUN_DEBUG    routes REGEX_RUN_DEBUG_PRINTF to printf, and also forces
//                       REGEX_DUMP_DEBUG_PRINTF to printf.
//
//  A disabled macro expands to nothing, so a call such as
//      REGEX_SCAN_DEBUG_PRINTF("%d", x);
//  is left behind as the parenthesized comma expression ("%d", x); .
//  Nothing is printed, but the arguments are still evaluated, so do not pass
//  arguments whose evaluation has side effects.
//
//#define REGEX_SCAN_DEBUG
#define REGEX_DUMP_DEBUG
//#define REGEX_RUN_DEBUG
//  End of #defines intended to be directly set.

#ifdef REGEX_SCAN_DEBUG
#define REGEX_SCAN_DEBUG_PRINTF printf
#else
#define REGEX_SCAN_DEBUG_PRINTF
#endif

#ifdef REGEX_DUMP_DEBUG
#define REGEX_DUMP_DEBUG_PRINTF printf
#else
#define REGEX_DUMP_DEBUG_PRINTF
#endif

#ifdef REGEX_RUN_DEBUG
#define REGEX_RUN_DEBUG_PRINTF printf
// Run debugging also needs the dump output.  REGEX_DUMP_DEBUG_PRINTF has already
// been defined above (possibly as empty), so drop that definition first; redefining
// a macro with a different replacement list is otherwise ill-formed.
#undef  REGEX_DUMP_DEBUG_PRINTF
#define REGEX_DUMP_DEBUG_PRINTF printf
#else
#define REGEX_RUN_DEBUG_PRINTF
#endif

// printf is only referenced when at least one of the debug switches is on.
#if defined(REGEX_SCAN_DEBUG) || defined(REGEX_RUN_DEBUG) || defined(REGEX_DUMP_DEBUG)
#include <stdio.h>
#endif
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Opcode types In the compiled form of the regexp, these are the type, or opcodes,
|
// Opcode types In the compiled form of the regexp, these are the type, or opcodes,
|
||||||
// of the entries.
|
// of the entries.
|
||||||
//
|
//
|
||||||
enum {
|
enum {
|
||||||
URX_RESERVED_OP = 0,
|
URX_RESERVED_OP = 0,
|
||||||
URX_UNUSED1 = 1,
|
URX_BACKTRACK = 1,
|
||||||
URX_END = 2,
|
URX_END = 2,
|
||||||
URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match
|
URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match
|
||||||
URX_STRING = 4, // Value field is index of string start
|
URX_STRING = 4, // Value field is index of string start
|
||||||
@ -52,7 +84,7 @@ enum {
|
|||||||
// Used for debug printing only.
|
// Used for debug printing only.
|
||||||
#define URX_OPCODE_NAMES \
|
#define URX_OPCODE_NAMES \
|
||||||
"URX_RESERVED_OP", \
|
"URX_RESERVED_OP", \
|
||||||
"URX_UNUSED1", \
|
"URX_BACKTRACK", \
|
||||||
"END", \
|
"END", \
|
||||||
"ONECHAR", \
|
"ONECHAR", \
|
||||||
"STRING", \
|
"STRING", \
|
||||||
|
@ -280,9 +280,9 @@ UnicodeString RegexMatcher::group(UErrorCode &status) const {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
UnicodeString RegexMatcher::group(int32_t group, UErrorCode &status) const {
|
UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
|
||||||
int32_t s = start(group, status);
|
int32_t s = start(groupNum, status);
|
||||||
int32_t e = end(group, status);
|
int32_t e = end(groupNum, status);
|
||||||
|
|
||||||
// Note: calling start() and end() above will do all necessary checking that
|
// Note: calling start() and end() above will do all necessary checking that
|
||||||
// the group number is OK and that a match exists. status will be set.
|
// the group number is OK and that a match exists. status will be set.
|
||||||
@ -539,6 +539,28 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||||||
int32_t opType; // the opcode
|
int32_t opType; // the opcode
|
||||||
int32_t opValue; // and the operand value.
|
int32_t opValue; // and the operand value.
|
||||||
|
|
||||||
|
#ifdef REGEX_RUN_DEBUG
|
||||||
|
{
|
||||||
|
printf("MatchAt(startIdx=%d)\n", startIdx);
|
||||||
|
printf("Original Pattern: ");
|
||||||
|
int i;
|
||||||
|
for (i=0; i<fPattern->fPattern.length(); i++) {
|
||||||
|
printf("%c", fPattern->fPattern.charAt(i));
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
printf("Input String: ");
|
||||||
|
for (i=0; i<fInput->length(); i++) {
|
||||||
|
UChar c = fInput->charAt(i);
|
||||||
|
if (c<32 || c>256) {
|
||||||
|
c = '.';
|
||||||
|
}
|
||||||
|
printf("%c", c);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
printf("\n");
|
||||||
|
printf("PatLoc inputIdx char\n");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
if (U_FAILURE(status)) {
|
if (U_FAILURE(status)) {
|
||||||
return;
|
return;
|
||||||
@ -569,7 +591,10 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||||||
op = pat->elementAti(patIdx);
|
op = pat->elementAti(patIdx);
|
||||||
opType = URX_TYPE(op);
|
opType = URX_TYPE(op);
|
||||||
opValue = URX_VAL(op);
|
opValue = URX_VAL(op);
|
||||||
// printf("%d %d \"%c\"\n", patIdx, inputIdx, fInput->char32At(inputIdx));
|
#ifdef REGEX_RUN_DEBUG
|
||||||
|
printf("inputIdx=%d inputChar=%c ", inputIdx, fInput->char32At(inputIdx));
|
||||||
|
fPattern->dumpOp(patIdx);
|
||||||
|
#endif
|
||||||
patIdx++;
|
patIdx++;
|
||||||
|
|
||||||
switch (opType) {
|
switch (opType) {
|
||||||
@ -579,6 +604,14 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
|
|
||||||
|
case URX_BACKTRACK:
|
||||||
|
// Force a backtrack. In some circumstances, the pattern compiler
|
||||||
|
// will notice that the pattern can't possibly match anything, and will
|
||||||
|
// emit one of these at that point.
|
||||||
|
backTrack(inputIdx, patIdx);
|
||||||
|
break;
|
||||||
|
|
||||||
|
|
||||||
case URX_ONECHAR:
|
case URX_ONECHAR:
|
||||||
{
|
{
|
||||||
UChar32 inputChar = fInput->char32At(inputIdx);
|
UChar32 inputChar = fInput->char32At(inputIdx);
|
||||||
@ -909,7 +942,12 @@ breakFromLoop:
|
|||||||
fLastMatchEnd = fMatchEnd;
|
fLastMatchEnd = fMatchEnd;
|
||||||
fMatchStart = startIdx;
|
fMatchStart = startIdx;
|
||||||
fMatchEnd = inputIdx;
|
fMatchEnd = inputIdx;
|
||||||
|
REGEX_RUN_DEBUG_PRINTF("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd);
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
REGEX_RUN_DEBUG_PRINTF("No match\n\n");
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18,8 +18,6 @@
|
|||||||
#include "regexcmp.h"
|
#include "regexcmp.h"
|
||||||
#include "regeximp.h"
|
#include "regeximp.h"
|
||||||
|
|
||||||
#include "stdio.h" // TODO: get rid of this...
|
|
||||||
|
|
||||||
U_NAMESPACE_BEGIN
|
U_NAMESPACE_BEGIN
|
||||||
|
|
||||||
//--------------------------------------------------------------------------
|
//--------------------------------------------------------------------------
|
||||||
@ -197,7 +195,7 @@ UBool RegexPattern::operator ==(const RegexPattern &other) const {
|
|||||||
//---------------------------------------------------------------------
|
//---------------------------------------------------------------------
|
||||||
RegexPattern *RegexPattern::compile(
|
RegexPattern *RegexPattern::compile(
|
||||||
const UnicodeString ®ex,
|
const UnicodeString ®ex,
|
||||||
int32_t flags,
|
uint32_t flags,
|
||||||
UParseError &pe,
|
UParseError &pe,
|
||||||
UErrorCode &status) {
|
UErrorCode &status) {
|
||||||
|
|
||||||
@ -243,7 +241,7 @@ RegexPattern *RegexPattern::compile( const UnicodeString ®ex,
|
|||||||
// flags
|
// flags
|
||||||
//
|
//
|
||||||
//---------------------------------------------------------------------
|
//---------------------------------------------------------------------
|
||||||
int32_t RegexPattern::flags() const {
|
uint32_t RegexPattern::flags() const {
|
||||||
return fFlags;
|
return fFlags;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -320,8 +318,6 @@ UnicodeString RegexPattern::pattern() const {
|
|||||||
//---------------------------------------------------------------------
|
//---------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// split
|
// split
|
||||||
// TODO: perl returns captured strings intermixed with the
|
|
||||||
// fields. Should we do this too?
|
|
||||||
//
|
//
|
||||||
//---------------------------------------------------------------------
|
//---------------------------------------------------------------------
|
||||||
int32_t RegexPattern::split(const UnicodeString &input,
|
int32_t RegexPattern::split(const UnicodeString &input,
|
||||||
@ -383,10 +379,28 @@ int32_t RegexPattern::split(const UnicodeString &input,
|
|||||||
int32_t fieldLen = fMatcher->fMatchStart - nextOutputStringStart;
|
int32_t fieldLen = fMatcher->fMatchStart - nextOutputStringStart;
|
||||||
dest[i].setTo(input, nextOutputStringStart, fieldLen);
|
dest[i].setTo(input, nextOutputStringStart, fieldLen);
|
||||||
nextOutputStringStart = fMatcher->fMatchEnd;
|
nextOutputStringStart = fMatcher->fMatchEnd;
|
||||||
|
|
||||||
|
// If the delimiter pattern has capturing parentheses, the captured
|
||||||
|
// text goes out into the next n destination strings.
|
||||||
|
int32_t groupNum;
|
||||||
|
for (groupNum=1; groupNum<=this->fNumCaptureGroups; groupNum++) {
|
||||||
|
if (i==destCapacity-1) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
i++;
|
||||||
|
dest[i] = fMatcher->group(groupNum, status);
|
||||||
|
}
|
||||||
|
|
||||||
if (nextOutputStringStart == inputLen) {
|
if (nextOutputStringStart == inputLen) {
|
||||||
// The delimiter was at the end of the string. We're done.
|
// The delimiter was at the end of the string. We're done.
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (i==destCapacity-1) {
|
||||||
|
// We've filled up the last output string with capture group data.
|
||||||
|
// Give back the last string, to be used for the remainder of the input.
|
||||||
|
i--;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -410,88 +424,102 @@ int32_t RegexPattern::split(const UnicodeString &input,
|
|||||||
//---------------------------------------------------------------------
|
//---------------------------------------------------------------------
|
||||||
static const char *opNames[] = {URX_OPCODE_NAMES};
|
static const char *opNames[] = {URX_OPCODE_NAMES};
|
||||||
|
|
||||||
void RegexPattern::dump() {
|
//
//   dumpOp    Debug function.  Displays one operation of the compiled pattern
//             as a single line:  index, raw op word in hex, opcode name, and the
//             operand formatted according to the opcode type.
//
//             All output goes through REGEX_DUMP_DEBUG_PRINTF.
//
//             index   position of the operation within fCompiledPat.
//
void RegexPattern::dumpOp(int32_t index) const {
    int32_t op   = fCompiledPat->elementAti(index);
    int32_t val  = URX_VAL(op);
    int32_t type = URX_TYPE(op);

    // Opcode types with no entry in opNames[] are displayed with the name of
    // entry 0, so that the table is never indexed out of bounds.
    int32_t pinnedType = type;
    if (pinnedType < 0 || pinnedType >= (int32_t)(sizeof(opNames)/sizeof(opNames[0]))) {
        pinnedType = 0;
    }

    REGEX_DUMP_DEBUG_PRINTF("%4d %08x %-15s ", index, op, opNames[pinnedType]);
    switch (type) {
    case URX_NOP:
    case URX_DOTANY:
    case URX_FAIL:
    case URX_BACKTRACK:
    case URX_BACKSLASH_A:
    case URX_BACKSLASH_G:
    case URX_BACKSLASH_X:
    case URX_END:
        // Types with no operand field of interest.
        break;

    case URX_START_CAPTURE:
    case URX_END_CAPTURE:
    case URX_STATIC_SETREF:
    case URX_STATE_SAVE:
    case URX_JMP:
    case URX_BACKSLASH_B:
    case URX_BACKSLASH_D:
    case URX_BACKSLASH_W:
    case URX_BACKSLASH_Z:
    case URX_CARET:
    case URX_DOLLAR:
    case URX_STRING_LEN:
        // Types with an integer operand field.
        REGEX_DUMP_DEBUG_PRINTF("%d", val);
        break;

    case URX_ONECHAR:
        // Operand is the character to match; values of 256 and up are shown as '?'.
        REGEX_DUMP_DEBUG_PRINTF("%c", val<256?val:'?');
        break;

    case URX_STRING:
        {
            // Operand is the start index of the string within fLiteralText.
            // The length comes from the operand of the following op, which
            // is asserted to be a URX_STRING_LEN.
            int32_t lengthOp = fCompiledPat->elementAti(index+1);
            U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
            int32_t length = URX_VAL(lengthOp);
            int32_t i;
            for (i=val; i<val+length; i++) {
                UChar c = fLiteralText[i];
                if (c < 32 || c >= 256) {c = '.';}
                REGEX_DUMP_DEBUG_PRINTF("%c", c);
            }
        }
        break;

    case URX_SETREF:
        {
            // Operand is an index into fSets.  Show the index, followed by
            // the set rendered as a pattern string.
            REGEX_DUMP_DEBUG_PRINTF("%d ", val);
            UnicodeString s;
            UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
            set->toPattern(s, TRUE);
            for (int32_t i=0; i<s.length(); i++) {
                REGEX_DUMP_DEBUG_PRINTF("%c", s.charAt(i));
            }
        }
        // Without this break the case fell through into default, appending
        // "??????" to the output for every set reference.
        break;

    default:
        REGEX_DUMP_DEBUG_PRINTF("??????");
        break;
    }
    REGEX_DUMP_DEBUG_PRINTF("\n");
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void RegexPattern::dump() const {
|
||||||
int index;
|
int index;
|
||||||
int i;
|
int i;
|
||||||
UChar c;
|
|
||||||
int32_t op;
|
|
||||||
int32_t pinnedType;
|
|
||||||
int32_t type;
|
|
||||||
int32_t val;
|
|
||||||
int32_t stringStart;
|
|
||||||
|
|
||||||
|
REGEX_DUMP_DEBUG_PRINTF("Original Pattern: ");
|
||||||
printf("Original Pattern: ");
|
|
||||||
for (i=0; i<fPattern.length(); i++) {
|
for (i=0; i<fPattern.length(); i++) {
|
||||||
printf("%c", fPattern.charAt(i));
|
REGEX_DUMP_DEBUG_PRINTF("%c", fPattern.charAt(i));
|
||||||
}
|
}
|
||||||
printf("\n");
|
REGEX_DUMP_DEBUG_PRINTF("\n");
|
||||||
printf("Pattern Valid?: %s\n", fBadState? "no" : "yes");
|
REGEX_DUMP_DEBUG_PRINTF("Pattern Valid?: %s\n", fBadState? "no" : "yes");
|
||||||
printf("\nIndex Binary Type Operand\n"
|
REGEX_DUMP_DEBUG_PRINTF("\nIndex Binary Type Operand\n"
|
||||||
"-------------------------------------------\n");
|
"-------------------------------------------\n");
|
||||||
for (index = 0; ; index++) {
|
for (index = 0; index<fCompiledPat->size(); index++) {
|
||||||
op = fCompiledPat->elementAti(index);
|
dumpOp(index);
|
||||||
val = URX_VAL(op);
|
|
||||||
type = URX_TYPE(op);
|
|
||||||
pinnedType = type;
|
|
||||||
if (pinnedType >= sizeof(opNames)/sizeof(char *)) {
|
|
||||||
pinnedType = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
printf("%4d %08x %-15s ", index, op, opNames[pinnedType]);
|
|
||||||
switch (type) {
|
|
||||||
case URX_NOP:
|
|
||||||
case URX_DOTANY:
|
|
||||||
case URX_FAIL:
|
|
||||||
case URX_BACKSLASH_A:
|
|
||||||
case URX_BACKSLASH_G:
|
|
||||||
case URX_BACKSLASH_X:
|
|
||||||
// Types with no operand field of interest.
|
|
||||||
break;
|
|
||||||
|
|
||||||
case URX_START_CAPTURE:
|
|
||||||
case URX_END_CAPTURE:
|
|
||||||
case URX_SETREF:
|
|
||||||
case URX_STATIC_SETREF:
|
|
||||||
case URX_STATE_SAVE:
|
|
||||||
case URX_JMP:
|
|
||||||
case URX_BACKSLASH_B:
|
|
||||||
case URX_BACKSLASH_D:
|
|
||||||
case URX_BACKSLASH_W:
|
|
||||||
case URX_BACKSLASH_Z:
|
|
||||||
case URX_CARET:
|
|
||||||
case URX_DOLLAR:
|
|
||||||
// types with an integer operand field.
|
|
||||||
printf("%d", val);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case URX_ONECHAR:
|
|
||||||
printf("%c", val<256?val:'?');
|
|
||||||
break;
|
|
||||||
|
|
||||||
case URX_STRING:
|
|
||||||
stringStart = val;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case URX_STRING_LEN:
|
|
||||||
for (i=stringStart; i<stringStart+val; i++) {
|
|
||||||
c = fLiteralText[i];
|
|
||||||
if (c >= 256) {c = '?';};
|
|
||||||
printf("%c", c);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
case URX_END:
|
|
||||||
goto breakFromLoop;
|
|
||||||
|
|
||||||
default:
|
|
||||||
printf("??????");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
printf("\n");
|
|
||||||
}
|
}
|
||||||
breakFromLoop:
|
REGEX_DUMP_DEBUG_PRINTF("\n\n");
|
||||||
printf("\n\n");
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const char RegexPattern::fgClassID = 0;
|
const char RegexPattern::fgClassID = 0;
|
||||||
|
@ -81,6 +81,8 @@ enum {
|
|||||||
* to be applied to input text, and a few convenience methods for simple common
|
* to be applied to input text, and a few convenience methods for simple common
|
||||||
* uses of regular expressions.
|
* uses of regular expressions.
|
||||||
*
|
*
|
||||||
|
* <p>Class RegexPattern is not intended to be subclassed.</p>
|
||||||
|
*
|
||||||
* @draft ICU 2.4
|
* @draft ICU 2.4
|
||||||
*/
|
*/
|
||||||
class U_I18N_API RegexPattern: public UObject {
|
class U_I18N_API RegexPattern: public UObject {
|
||||||
@ -192,7 +194,7 @@ public:
|
|||||||
* @draft ICU 2.4
|
* @draft ICU 2.4
|
||||||
*/
|
*/
|
||||||
static RegexPattern *compile( const UnicodeString ®ex,
|
static RegexPattern *compile( const UnicodeString ®ex,
|
||||||
int32_t flags,
|
uint32_t flags,
|
||||||
UParseError &pe,
|
UParseError &pe,
|
||||||
UErrorCode &status);
|
UErrorCode &status);
|
||||||
|
|
||||||
@ -202,7 +204,7 @@ public:
|
|||||||
* @return the match mode flags
|
* @return the match mode flags
|
||||||
* @draft ICU 2.4
|
* @draft ICU 2.4
|
||||||
*/
|
*/
|
||||||
virtual int32_t flags() const;
|
virtual uint32_t flags() const;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Creates a RegexMatcher that will match the given input against this pattern. The
|
* Creates a RegexMatcher that will match the given input against this pattern. The
|
||||||
@ -275,7 +277,7 @@ public:
|
|||||||
//
|
//
|
||||||
// dump Debug function, displays the compiled form of a pattern.
|
// dump Debug function, displays the compiled form of a pattern.
|
||||||
//
|
//
|
||||||
void dump();
|
void dump() const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
||||||
@ -291,14 +293,12 @@ public:
|
|||||||
*/
|
*/
|
||||||
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
|
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
|
||||||
|
|
||||||
static const char fgClassID;
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
//
|
//
|
||||||
// Implementation Data
|
// Implementation Data
|
||||||
//
|
//
|
||||||
UnicodeString fPattern; // The original pattern string.
|
UnicodeString fPattern; // The original pattern string.
|
||||||
int32_t fFlags; // The flags used when compiling the pattern.
|
uint32_t fFlags; // The flags used when compiling the pattern.
|
||||||
//
|
//
|
||||||
UVector *fCompiledPat; // The compiled pattern.
|
UVector *fCompiledPat; // The compiled pattern.
|
||||||
UnicodeString fLiteralText; // Any literal string data from the pattern,
|
UnicodeString fLiteralText; // Any literal string data from the pattern,
|
||||||
@ -317,6 +317,12 @@ private:
|
|||||||
UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
|
UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
|
||||||
// regex character classes, e.g. Word.
|
// regex character classes, e.g. Word.
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The address of this static class variable serves as this class's ID
|
||||||
|
* for ICU "poor man's RTTI".
|
||||||
|
*/
|
||||||
|
static const char fgClassID;
|
||||||
|
|
||||||
friend class RegexCompile;
|
friend class RegexCompile;
|
||||||
friend class RegexMatcher;
|
friend class RegexMatcher;
|
||||||
|
|
||||||
@ -325,6 +331,7 @@ private:
|
|||||||
//
|
//
|
||||||
void init(); // Common initialization, for use by constructors.
|
void init(); // Common initialization, for use by constructors.
|
||||||
void zap(); // Common cleanup
|
void zap(); // Common cleanup
|
||||||
|
void dumpOp(int32_t index) const;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -343,6 +350,8 @@ private:
|
|||||||
* input text to which the expression can be applied. It includes methods
|
* input text to which the expression can be applied. It includes methods
|
||||||
* for testing for matches, and for find and replace operations.
|
* for testing for matches, and for find and replace operations.
|
||||||
*
|
*
|
||||||
|
* <p>Class RegexMatcher is not intended to be subclassed.</p>
|
||||||
|
*
|
||||||
* @draft ICU 2.4
|
* @draft ICU 2.4
|
||||||
*/
|
*/
|
||||||
class U_I18N_API RegexMatcher: public UObject {
|
class U_I18N_API RegexMatcher: public UObject {
|
||||||
@ -355,6 +364,227 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual ~RegexMatcher();
|
virtual ~RegexMatcher();
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Attempts to match the entire input string against the pattern.
|
||||||
|
* @param status A reference to a UErrorCode to receive any errors.
|
||||||
|
* @return TRUE if there is a match
|
||||||
|
* @draft ICU 2.4
|
||||||
|
*/
|
||||||
|
virtual UBool matches(UErrorCode &status);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Attempts to match the input string, starting from the beginning, against the pattern.
|
||||||
|
* Like the matches() method, this function always starts at the beginning of the input string;
|
||||||
|
* unlike that function, it does not require that the entire input string be matched.
|
||||||
|
*
|
||||||
|
* <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
|
||||||
|
* <code>end()</code>, and <code>group()</code> functions.</p>
|
||||||
|
*
|
||||||
|
* @param status A reference to a UErrorCode to receive any errors.
|
||||||
|
* @return TRUE if there is a match at the start of the input string.
|
||||||
|
* @draft ICU 2.4
|
||||||
|
*/
|
||||||
|
virtual UBool lookingAt(UErrorCode &status);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find the next pattern match in the input string.
|
||||||
|
* The find begins searching the input at the location following the end of
|
||||||
|
* the previous match, or at the start of the string if there is no previous match.
|
||||||
|
* If a match is found, <code>start(), end()</code> and <code>group()</code>
|
||||||
|
* will provide more information regarding the match.
|
||||||
|
* <p>Note that if the input string is changed by the application,
|
||||||
|
* use find(startPos, status) instead of find(), because the saved starting
|
||||||
|
* position may not be valid with the altered input string.</p>
|
||||||
|
* @return TRUE if a match is found.
|
||||||
|
* @draft ICU 2.4
|
||||||
|
*/
|
||||||
|
virtual UBool find();
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resets this RegexMatcher and then attempts to find the next substring of the
|
||||||
|
* input string that matches the pattern, starting at the specified index.
|
||||||
|
*
|
||||||
|
* @param start the position in the input string to begin the search
|
||||||
|
* @param status A reference to a UErrorCode to receive any errors.
|
||||||
|
* @return TRUE if a match is found.
|
||||||
|
* @draft ICU 2.4
|
||||||
|
*/
|
||||||
|
virtual UBool find(int32_t start, UErrorCode &status);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a string containing the text matched by the previous match.
|
||||||
|
* If the pattern can match an empty string, an empty string may be returned.
|
||||||
|
* @param status A reference to a UErrorCode to receive any errors.
|
||||||
|
* Possible errors are U_REGEX_INVALID_STATE if no match
|
||||||
|
* has been attempted or the last match failed.
|
||||||
|
* @return a string containing the matched input text.
|
||||||
|
* @draft ICU 2.4
|
||||||
|
*/
|
||||||
|
virtual UnicodeString group(UErrorCode &status) const;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a string containing the text captured by the given group
|
||||||
|
* during the previous match operation. Group(0) is the entire match.
|
||||||
|
*
|
||||||
|
* @param groupNum the capture group number
|
||||||
|
* @param status A reference to a UErrorCode to receive any errors.
|
||||||
|
* Possible errors are U_REGEX_INVALID_STATE if no match
|
||||||
|
* has been attempted or the last match failed and
|
||||||
|
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
|
||||||
|
* @return the captured text
|
||||||
|
* @draft ICU 2.4
|
||||||
|
*/
|
||||||
|
virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of capturing groups in this matcher's pattern.
|
||||||
|
* @return the number of capture groups
|
||||||
|
* @draft ICU 2.4
|
||||||
|
*/
|
||||||
|
virtual int32_t groupCount() const;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the index in the input string of the start of the text matched
|
||||||
|
* during the previous match operation.
|
||||||
|
* @param status a reference to a UErrorCode to receive any errors.
|
||||||
|
* @return The position in the input string of the start of the last match.
|
||||||
|
* @draft ICU 2.4
|
||||||
|
*/
|
||||||
|
virtual int32_t start(UErrorCode &status) const;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the index in the input string of the start of the text matched by the
|
||||||
|
* specified capture group during the previous match operation. Return -1 if
|
||||||
|
* the capture group exists in the pattern, but was not part of the last match.
|
||||||
|
*
|
||||||
|
* @param group the capture group number
|
||||||
|
* @param status A reference to a UErrorCode to receive any errors. Possible
|
||||||
|
* errors are U_REGEX_INVALID_STATE if no match has been
|
||||||
|
* attempted or the last match failed, and
|
||||||
|
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
|
||||||
|
* @return the start position of substring matched by the specified group.
|
||||||
|
* @draft ICU 2.4
|
||||||
|
*/
|
||||||
|
virtual int32_t start(int group, UErrorCode &status) const;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the index in the input string of the character following the
|
||||||
|
* text matched during the previous match operation.
|
||||||
|
* @param status A reference to a UErrorCode to receive any errors. Possible
|
||||||
|
* errors are U_REGEX_INVALID_STATE if no match has been
|
||||||
|
* attempted or the last match failed.
|
||||||
|
* @return the index of the last character matched, plus one.
|
||||||
|
* @draft ICU 2.4
|
||||||
|
*/
|
||||||
|
virtual int32_t end(UErrorCode &status) const;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the index in the input string of the character following the
|
||||||
|
* text matched by the specified capture group during the previous match operation.
|
||||||
|
* @param group the capture group number
|
||||||
|
* @param status A reference to a UErrorCode to receive any errors. Possible
|
||||||
|
* errors are U_REGEX_INVALID_STATE if no match has been
|
||||||
|
* attempted or the last match failed and
|
||||||
|
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
|
||||||
|
* @return the index of the last character, plus one, of the text
|
||||||
|
* captured by the specified group during the previous match operation.
|
||||||
|
* Return -1 if the capture group was not part of the match.
|
||||||
|
* @draft ICU 2.4
|
||||||
|
*/
|
||||||
|
virtual int32_t end(int group, UErrorCode &status) const;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resets this matcher. The effect is to remove any memory of previous matches,
|
||||||
|
* and to cause subsequent find() operations to begin at the beginning of
|
||||||
|
* the input string.
|
||||||
|
*
|
||||||
|
* @return this RegexMatcher.
|
||||||
|
* @draft ICU 2.4
|
||||||
|
*/
|
||||||
|
virtual RegexMatcher &reset();
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resets this matcher with a new input string. This allows instances of RegexMatcher
|
||||||
|
* to be reused, which is more efficient than creating a new RegexMatcher for
|
||||||
|
* each input string to be processed.
|
||||||
|
* @return this RegexMatcher.
|
||||||
|
* @draft ICU 2.4
|
||||||
|
*/
|
||||||
|
virtual RegexMatcher &reset(const UnicodeString &input);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the input string being matched. The returned string is not a copy,
|
||||||
|
* but the live input string. It should not be altered or deleted.
|
||||||
|
* @return the input string
|
||||||
|
* @draft ICU 2.4
|
||||||
|
*/
|
||||||
|
virtual const UnicodeString &input() const;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the pattern that is interpreted by this matcher.
|
||||||
|
* @return the RegexPattern for this RegexMatcher
|
||||||
|
* @draft ICU 2.4
|
||||||
|
*/
|
||||||
|
virtual const RegexPattern &pattern() const;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Replaces every substring of the input that matches the pattern
|
||||||
|
* with the given replacement string. This is a convenience function that
|
||||||
|
* provides a complete find-and-replace-all operation.
|
||||||
|
*
|
||||||
|
* This method first resets this matcher. It then scans the input string
|
||||||
|
* looking for matches of the pattern. Input that is not part of any
|
||||||
|
* match is left unchanged; each match is replaced in the result by the
|
||||||
|
* replacement string. The replacement string may contain references to
|
||||||
|
* capture groups.
|
||||||
|
*
|
||||||
|
* @param replacement a string containing the replacement text.
|
||||||
|
* @param status a reference to a UErrorCode to receive any errors.
|
||||||
|
* @return a string containing the results of the find and replace.
|
||||||
|
* @draft ICU 2.4
|
||||||
|
*/
|
||||||
|
virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Replaces the first substring of the input that matches
|
||||||
|
* the pattern with the replacement string. This is a convenience
|
||||||
|
* function that provides a complete find-and-replace operation.
|
||||||
|
*
|
||||||
|
* <p>This function first resets this RegexMatcher. It then scans the input string
|
||||||
|
* looking for a match of the pattern. Input that is not part
|
||||||
|
* of the match is appended directly to the result string; the match is replaced
|
||||||
|
* in the result by the replacement string. The replacement string may contain
|
||||||
|
* references to captured groups.</p>
|
||||||
|
*
|
||||||
|
* <p>The state of the matcher (the position at which a subsequent find()
|
||||||
|
* would begin) after completing a replaceFirst() is not specified. The
|
||||||
|
* RegexMatcher should be reset before doing additional find() operations.</p>
|
||||||
|
*
|
||||||
|
* @param replacement a string containing the replacement text.
|
||||||
|
* @param status a reference to a UErrorCode to receive any errors.
|
||||||
|
* @return a string containing the results of the find and replace.
|
||||||
|
* @draft ICU 2.4
|
||||||
|
*/
|
||||||
|
virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Implements a replace operation intended to be used as part of an
|
* Implements a replace operation intended to be used as part of an
|
||||||
* incremental find-and-replace.
|
* incremental find-and-replace.
|
||||||
@ -398,219 +628,6 @@ public:
|
|||||||
*/
|
*/
|
||||||
virtual UnicodeString &appendTail(UnicodeString &dest);
|
virtual UnicodeString &appendTail(UnicodeString &dest);
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the index in the input string of the character following the
|
|
||||||
* text matched during the previous match operation.
|
|
||||||
* @param status A reference to a UErrorCode to receive any errors. Possible
|
|
||||||
* errors are U_REGEX_INVALID_STATE if no match has been
|
|
||||||
* attempted or the last match failed.
|
|
||||||
* @return the index of the last character matched, plus one.
|
|
||||||
* @draft ICU 2.4
|
|
||||||
*/
|
|
||||||
virtual int32_t end(UErrorCode &status) const;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the index in the input string of the character following the
|
|
||||||
* text matched by the specified capture group during the previous match operation.
|
|
||||||
* @param group the capture group number
|
|
||||||
* @param status A reference to a UErrorCode to receive any errors. Possible
|
|
||||||
* errors are U_REGEX_INVALID_STATE if no match has been
|
|
||||||
* attempted or the last match failed and
|
|
||||||
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
|
|
||||||
* @return the index of the last character, plus one, of the text
|
|
||||||
* captured by the specifed group during the previous match operation.
|
|
||||||
* Return -1 if the capture group was not part of the match.
|
|
||||||
* @draft ICU 2.4
|
|
||||||
*/
|
|
||||||
virtual int32_t end(int group, UErrorCode &status) const;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Find the next pattern match in the input string.
|
|
||||||
* The find begins searching the input at the location following the end of
|
|
||||||
* the previous match, or at the start of the string if there is no previous match.
|
|
||||||
* If a match is found, <code>start(), end()</code> and <code>group()</code>
|
|
||||||
* will provide more information regarding the match.
|
|
||||||
* @return TRUE if a match is found.
|
|
||||||
* @draft ICU 2.4
|
|
||||||
*/
|
|
||||||
virtual UBool find();
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Resets this RegexMatcher and then attempts to find the next substring of the
|
|
||||||
* input string that matches the pattern, starting at the specified index.
|
|
||||||
*
|
|
||||||
* @param status the position in the input string to begin the search
|
|
||||||
* @param status A reference to a UErrorCode to receive any errors.
|
|
||||||
* @return TRUE if a match is found.
|
|
||||||
* @draft ICU 2.4
|
|
||||||
*/
|
|
||||||
virtual UBool find(int32_t start, UErrorCode &status);
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Returns a string containing the text matched by the previous match.
|
|
||||||
* If the pattern can match an empty string, an empty string may be returned.
|
|
||||||
* @param status A reference to a UErrorCode to receive any errors.
|
|
||||||
* Possible errors are U_REGEX_INVALID_STATE if no match
|
|
||||||
* has been attempted or the last match failed.
|
|
||||||
* @return a string containing the matched input text.
|
|
||||||
* @draft ICU 2.4
|
|
||||||
*/
|
|
||||||
virtual UnicodeString group(UErrorCode &status) const;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns a string containing the text captured by the given group
|
|
||||||
* during the previous match operation. Group(0) is the entire match.
|
|
||||||
*
|
|
||||||
* @param group the capture group number
|
|
||||||
* @param status A reference to a UErrorCode to receive any errors.
|
|
||||||
* Possible errors are U_REGEX_INVALID_STATE if no match
|
|
||||||
* has been attempted or the last match failed and
|
|
||||||
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
|
|
||||||
* @return the captured text
|
|
||||||
* @draft ICU 2.4
|
|
||||||
*/
|
|
||||||
virtual UnicodeString group(int32_t group, UErrorCode &status) const;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the number of capturing groups in this matcher's pattern.
|
|
||||||
* @return the number of capture groups
|
|
||||||
* @draft ICU 2.4
|
|
||||||
*/
|
|
||||||
virtual int32_t groupCount() const;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the input string being matched. The returned string is not a copy,
|
|
||||||
* but the live input string. It should not be altered or deleted.
|
|
||||||
* @return the input string
|
|
||||||
* @draft ICU 2.4
|
|
||||||
*/
|
|
||||||
virtual const UnicodeString &input() const;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Attempts to match the input string, starting from the beginning, against the pattern.
|
|
||||||
* Like the matches() method, this function always starts at the beginning of the input string;
|
|
||||||
* unlike that function, it does not require that the entire input string be matched.
|
|
||||||
*
|
|
||||||
* <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
|
|
||||||
* <code>end()</code>, and <code>group()</code> functions.</p>
|
|
||||||
*
|
|
||||||
* @param status A reference to a UErrorCode to receive any errors.
|
|
||||||
* @return TRUE if there is a match at the start of the input string.
|
|
||||||
* @draft ICU 2.4
|
|
||||||
*/
|
|
||||||
virtual UBool lookingAt(UErrorCode &status);
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Attempts to match the entire input string against the pattern.
|
|
||||||
* @param status A reference to a UErrorCode to receive any errors.
|
|
||||||
* @return TRUE if there is a match
|
|
||||||
* @draft ICU 2.4
|
|
||||||
*/
|
|
||||||
virtual UBool matches(UErrorCode &status);
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the pattern that is interpreted by this matcher.
|
|
||||||
* @return the RegexPattern for this RegexMatcher
|
|
||||||
* @draft ICU 2.4
|
|
||||||
*/
|
|
||||||
virtual const RegexPattern &pattern() const;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Replaces every substring of the input that matches the pattern
|
|
||||||
* with the given replacement string. This is a convenience function that
|
|
||||||
* provides a complete find-and-replace-all operation.
|
|
||||||
*
|
|
||||||
* This method first resets this matcher. It then scans the input string
|
|
||||||
* looking for matches of the pattern. Input that is not part of any
|
|
||||||
* match is left unchanged; each match is replaced in the result by the
|
|
||||||
* replacement string. The replacement string may contain references to
|
|
||||||
* capture groups.
|
|
||||||
*
|
|
||||||
* @param replacement a string containing the replacement text.
|
|
||||||
* @param status a reference to a UErrorCode to receive any errors.
|
|
||||||
* @return a string containing the results of the find and replace.
|
|
||||||
* @draft ICU 2.4
|
|
||||||
*/
|
|
||||||
virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Replaces the first substring of the input that matches
|
|
||||||
* the pattern with the replacement string. This is a convenience
|
|
||||||
* function that provides a complete find-and-replace operation.
|
|
||||||
*
|
|
||||||
* This function first resets this RegexMatcher. It then scans the input string
|
|
||||||
* looking for a match of the pattern. Input that is not part
|
|
||||||
* of the match is appended directly to the result string; the match is replaced
|
|
||||||
* in the result by the replacement string. The replacement string may contain
|
|
||||||
* references to captured groups.
|
|
||||||
*
|
|
||||||
* @param replacement a string containing the replacement text.
|
|
||||||
* @param status a reference to a UErrorCode to receive any errors.
|
|
||||||
* @return a string containing the results of the find and replace.
|
|
||||||
* @draft ICU 2.4
|
|
||||||
*/
|
|
||||||
virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Resets this matcher. The effect is to remove any memory of previous matches,
|
|
||||||
* and to cause subsequent find() operations to begin at the beginning of
|
|
||||||
* the input string.
|
|
||||||
*
|
|
||||||
* @return this RegexMatcher.
|
|
||||||
* @draft ICU 2.4
|
|
||||||
*/
|
|
||||||
virtual RegexMatcher &reset();
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Resets this matcher with a new input string. This allows instances of RegexMatcher
|
|
||||||
* to be reused, which is more efficient than creating a new RegexMatcher for
|
|
||||||
* each input string to be processed.
|
|
||||||
* @return this RegexMatcher.
|
|
||||||
* @draft ICU 2.4
|
|
||||||
*/
|
|
||||||
virtual RegexMatcher &reset(const UnicodeString &input);
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the index in the input string of the start of the text matched
|
|
||||||
* during the previous match operation.
|
|
||||||
* @param status a reference to a UErrorCode to receive any errors.
|
|
||||||
* @return The position in the input string of the start of the last match.
|
|
||||||
* @draft ICU 2.4
|
|
||||||
*/
|
|
||||||
virtual int32_t start(UErrorCode &status) const;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the index in the input string of the start of the text matched by the
|
|
||||||
* specified capture group during the previous match operation. Return -1 if
|
|
||||||
* the capture group exists in the pattern, but was not part of the last match.
|
|
||||||
*
|
|
||||||
* @param group the capture group number
|
|
||||||
* @param status A reference to a UErrorCode to receive any errors. Possible
|
|
||||||
* errors are U_REGEX_INVALID_STATE if no match has been
|
|
||||||
* attempted or the last match failed, and
|
|
||||||
* U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
|
|
||||||
* @return the start position of substring matched by the specified group.
|
|
||||||
* @draft ICU 2.4
|
|
||||||
*/
|
|
||||||
virtual int32_t start(int group, UErrorCode &status) const;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
||||||
@ -626,8 +643,6 @@ public:
|
|||||||
*/
|
*/
|
||||||
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
|
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
|
||||||
|
|
||||||
static const char fgClassID;
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// Constructors and other object boilerplate are private.
|
// Constructors and other object boilerplate are private.
|
||||||
// Instances of RegexMatcher can not be assigned, copied, cloned, etc.
|
// Instances of RegexMatcher can not be assigned, copied, cloned, etc.
|
||||||
@ -658,6 +673,13 @@ private:
|
|||||||
UVector *fCaptureStarts;
|
UVector *fCaptureStarts;
|
||||||
UVector *fCaptureEnds;
|
UVector *fCaptureEnds;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The address of this static class variable serves as this class's ID
|
||||||
|
* for ICU "poor man's RTTI".
|
||||||
|
*/
|
||||||
|
static const char fgClassID;
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
U_NAMESPACE_END
|
U_NAMESPACE_END
|
||||||
|
@ -368,7 +368,7 @@ void RegexTest::Basic() {
|
|||||||
//
|
//
|
||||||
#if 0
|
#if 0
|
||||||
{
|
{
|
||||||
REGEX_FIND("(?:ABC)+", "<0>ABCABCABC</0>D");
|
REGEX_FIND("[{ab}]", "a");
|
||||||
}
|
}
|
||||||
exit(1);
|
exit(1);
|
||||||
#endif
|
#endif
|
||||||
@ -436,6 +436,9 @@ void RegexTest::Basic() {
|
|||||||
REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
|
REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
|
||||||
REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
|
REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
|
||||||
|
|
||||||
|
// Set contains only a string, no individual chars.
|
||||||
|
REGEX_TESTLM("[{ab}]", "a", FALSE, FALSE);
|
||||||
|
|
||||||
//
|
//
|
||||||
// OR operator in patterns
|
// OR operator in patterns
|
||||||
//
|
//
|
||||||
@ -975,6 +978,52 @@ void RegexTest::API_Pattern() {
|
|||||||
|
|
||||||
delete pat1;
|
delete pat1;
|
||||||
|
|
||||||
|
// split, with a pattern with (capture)
|
||||||
|
pat1 = RegexPattern::compile("<(\\w*)>", pe, status);
|
||||||
|
REGEX_CHECK_STATUS;
|
||||||
|
|
||||||
|
n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
|
||||||
|
REGEX_CHECK_STATUS;
|
||||||
|
REGEX_ASSERT(n==6);
|
||||||
|
REGEX_ASSERT(fields[0]=="");
|
||||||
|
REGEX_ASSERT(fields[1]=="a");
|
||||||
|
REGEX_ASSERT(fields[2]=="Now is ");
|
||||||
|
REGEX_ASSERT(fields[3]=="b");
|
||||||
|
REGEX_ASSERT(fields[4]=="the time");
|
||||||
|
REGEX_ASSERT(fields[5]=="c");
|
||||||
|
REGEX_ASSERT(fields[6]=="");
|
||||||
|
|
||||||
|
n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
|
||||||
|
REGEX_CHECK_STATUS;
|
||||||
|
REGEX_ASSERT(n==6);
|
||||||
|
REGEX_ASSERT(fields[0]==" ");
|
||||||
|
REGEX_ASSERT(fields[1]=="a");
|
||||||
|
REGEX_ASSERT(fields[2]=="Now is ");
|
||||||
|
REGEX_ASSERT(fields[3]=="b");
|
||||||
|
REGEX_ASSERT(fields[4]=="the time");
|
||||||
|
REGEX_ASSERT(fields[5]=="c");
|
||||||
|
REGEX_ASSERT(fields[6]=="");
|
||||||
|
|
||||||
|
n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
|
||||||
|
REGEX_CHECK_STATUS;
|
||||||
|
REGEX_ASSERT(n==4);
|
||||||
|
REGEX_ASSERT(fields[0]==" ");
|
||||||
|
REGEX_ASSERT(fields[1]=="a");
|
||||||
|
REGEX_ASSERT(fields[2]=="Now is ");
|
||||||
|
REGEX_ASSERT(fields[3]=="the time<c>");
|
||||||
|
delete pat1;
|
||||||
|
|
||||||
|
pat1 = RegexPattern::compile("([-,])", pe, status);
|
||||||
|
REGEX_CHECK_STATUS;
|
||||||
|
n = pat1->split("1-10,20", fields, 10, status);
|
||||||
|
REGEX_CHECK_STATUS;
|
||||||
|
REGEX_ASSERT(n==5);
|
||||||
|
REGEX_ASSERT(fields[0]=="1");
|
||||||
|
REGEX_ASSERT(fields[1]=="-");
|
||||||
|
REGEX_ASSERT(fields[2]=="10");
|
||||||
|
REGEX_ASSERT(fields[3]==",");
|
||||||
|
REGEX_ASSERT(fields[4]=="20");
|
||||||
|
delete pat1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user