ICU-2468 regexp word boundaries using RBBI
X-SVN-Rev: 13641
This commit is contained in:
parent
0317f272a3
commit
d4e4635656
@ -1089,11 +1089,17 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
break;
|
||||
|
||||
case doBackslashB:
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 1), *fStatus);
|
||||
{
|
||||
int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus);
|
||||
}
|
||||
break;
|
||||
|
||||
case doBackslashb:
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 0), *fStatus);
|
||||
{
|
||||
int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
|
||||
}
|
||||
break;
|
||||
|
||||
case doBackslashD:
|
||||
@ -1325,6 +1331,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break;
|
||||
case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break;
|
||||
case 0x73: /* 's' */ bit = UREGEX_DOTALL; break;
|
||||
case 0x77: /* 'w' */ bit = UREGEX_UWORD; break;
|
||||
case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break;
|
||||
case 0x2d: /* '-' */ fSetModeFlag = FALSE; break;
|
||||
default:
|
||||
@ -1376,6 +1383,10 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
||||
}
|
||||
break;
|
||||
|
||||
case doBadModeFlag:
|
||||
error(U_REGEX_INVALID_FLAG);
|
||||
break;
|
||||
|
||||
case doSuppressComments:
|
||||
// We have just scanned a '(?'. We now need to prevent the character scanner from
|
||||
// treating a '#' as a to-the-end-of-line comment.
|
||||
@ -2115,6 +2126,7 @@ void RegexCompile::matchStartType() {
|
||||
case URX_START_CAPTURE:
|
||||
case URX_END_CAPTURE:
|
||||
case URX_BACKSLASH_B:
|
||||
case URX_BACKSLASH_BU:
|
||||
case URX_BACKSLASH_G:
|
||||
case URX_BACKSLASH_Z:
|
||||
case URX_DOLLAR:
|
||||
@ -2585,6 +2597,7 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
|
||||
case URX_START_CAPTURE:
|
||||
case URX_END_CAPTURE:
|
||||
case URX_BACKSLASH_B:
|
||||
case URX_BACKSLASH_BU:
|
||||
case URX_BACKSLASH_G:
|
||||
case URX_BACKSLASH_Z:
|
||||
case URX_CARET:
|
||||
@ -2824,6 +2837,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
|
||||
case URX_START_CAPTURE:
|
||||
case URX_END_CAPTURE:
|
||||
case URX_BACKSLASH_B:
|
||||
case URX_BACKSLASH_BU:
|
||||
case URX_BACKSLASH_G:
|
||||
case URX_BACKSLASH_Z:
|
||||
case URX_CARET:
|
||||
@ -3077,6 +3091,7 @@ void RegexCompile::stripNOPs() {
|
||||
case URX_DOTANY:
|
||||
case URX_FAIL:
|
||||
case URX_BACKSLASH_B:
|
||||
case URX_BACKSLASH_BU:
|
||||
case URX_BACKSLASH_G:
|
||||
case URX_BACKSLASH_X:
|
||||
case URX_BACKSLASH_Z:
|
||||
|
@ -79,6 +79,7 @@ enum Regex_PatternParseAction {
|
||||
doExit,
|
||||
doNGInterval,
|
||||
doPatStart,
|
||||
doBadModeFlag,
|
||||
doBackslashb,
|
||||
doPossessiveStar,
|
||||
doBackslashd,
|
||||
@ -111,15 +112,15 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
||||
, {doDotAny, 46 /* . */, 14,0, TRUE} // 6
|
||||
, {doCaret, 94 /* ^ */, 2,0, TRUE} // 7
|
||||
, {doDollar, 36 /* $ */, 2,0, TRUE} // 8
|
||||
, {doNOP, 92 /* \ */, 79,0, TRUE} // 9
|
||||
, {doNOP, 92 /* \ */, 81,0, TRUE} // 9
|
||||
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 10
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11
|
||||
, {doPatFinish, 253, 2,0, FALSE} // 12
|
||||
, {doRuleError, 255, 99,0, FALSE} // 13
|
||||
, {doNOP, 42 /* * */, 57,0, TRUE} // 14 expr-quant
|
||||
, {doNOP, 43 /* + */, 60,0, TRUE} // 15
|
||||
, {doNOP, 63 /* ? */, 63,0, TRUE} // 16
|
||||
, {doIntervalInit, 123 /* { */, 66,0, TRUE} // 17
|
||||
, {doRuleError, 255, 101,0, FALSE} // 13
|
||||
, {doNOP, 42 /* * */, 59,0, TRUE} // 14 expr-quant
|
||||
, {doNOP, 43 /* + */, 62,0, TRUE} // 15
|
||||
, {doNOP, 63 /* ? */, 65,0, TRUE} // 16
|
||||
, {doIntervalInit, 123 /* { */, 68,0, TRUE} // 17
|
||||
, {doNOP, 40 /* ( */, 23,0, TRUE} // 18
|
||||
, {doNOP, 255, 20,0, FALSE} // 19
|
||||
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 20 expr-cont
|
||||
@ -127,7 +128,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
||||
, {doNOP, 255, 2,0, FALSE} // 22
|
||||
, {doSuppressComments, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant
|
||||
, {doNOP, 255, 27,0, FALSE} // 24
|
||||
, {doNOP, 35 /* # */, 46, 14, TRUE} // 25 open-paren-quant2
|
||||
, {doNOP, 35 /* # */, 47, 14, TRUE} // 25 open-paren-quant2
|
||||
, {doNOP, 255, 29,0, FALSE} // 26
|
||||
, {doSuppressComments, 63 /* ? */, 29,0, TRUE} // 27 open-paren
|
||||
, {doOpenCaptureParen, 255, 2, 14, FALSE} // 28
|
||||
@ -135,73 +136,75 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
||||
, {doOpenAtomicParen, 62 /* > */, 2, 14, TRUE} // 30
|
||||
, {doOpenLookAhead, 61 /* = */, 2, 20, TRUE} // 31
|
||||
, {doOpenLookAheadNeg, 33 /* ! */, 2, 20, TRUE} // 32
|
||||
, {doNOP, 60 /* < */, 43,0, TRUE} // 33
|
||||
, {doNOP, 35 /* # */, 46, 2, TRUE} // 34
|
||||
, {doBeginMatchMode, 105 /* i */, 49,0, FALSE} // 35
|
||||
, {doBeginMatchMode, 109 /* m */, 49,0, FALSE} // 36
|
||||
, {doBeginMatchMode, 115 /* s */, 49,0, FALSE} // 37
|
||||
, {doBeginMatchMode, 120 /* x */, 49,0, FALSE} // 38
|
||||
, {doBeginMatchMode, 45 /* - */, 49,0, FALSE} // 39
|
||||
, {doConditionalExpr, 40 /* ( */, 99,0, TRUE} // 40
|
||||
, {doPerlInline, 123 /* { */, 99,0, TRUE} // 41
|
||||
, {doBadOpenParenType, 255, 99,0, FALSE} // 42
|
||||
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 43 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 44
|
||||
, {doBadOpenParenType, 255, 99,0, FALSE} // 45
|
||||
, {doNOP, 41 /* ) */, 255,0, TRUE} // 46 paren-comment
|
||||
, {doMismatchedParenErr, 253, 99,0, FALSE} // 47
|
||||
, {doNOP, 255, 46,0, TRUE} // 48
|
||||
, {doMatchMode, 105 /* i */, 49,0, TRUE} // 49 paren-flag
|
||||
, {doMatchMode, 109 /* m */, 49,0, TRUE} // 50
|
||||
, {doMatchMode, 115 /* s */, 49,0, TRUE} // 51
|
||||
, {doMatchMode, 120 /* x */, 49,0, TRUE} // 52
|
||||
, {doMatchMode, 45 /* - */, 49,0, TRUE} // 53
|
||||
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 54
|
||||
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 55
|
||||
, {doNOP, 255, 99,0, FALSE} // 56
|
||||
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 57 quant-star
|
||||
, {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 58
|
||||
, {doStar, 255, 20,0, FALSE} // 59
|
||||
, {doNGPlus, 63 /* ? */, 20,0, TRUE} // 60 quant-plus
|
||||
, {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 61
|
||||
, {doPlus, 255, 20,0, FALSE} // 62
|
||||
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 63 quant-opt
|
||||
, {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 64
|
||||
, {doOpt, 255, 20,0, FALSE} // 65
|
||||
, {doNOP, 129, 66,0, TRUE} // 66 interval-open
|
||||
, {doNOP, 128, 69,0, FALSE} // 67
|
||||
, {doIntervalError, 255, 99,0, FALSE} // 68
|
||||
, {doIntevalLowerDigit, 128, 69,0, TRUE} // 69 interval-lower
|
||||
, {doNOP, 44 /* , */, 73,0, TRUE} // 70
|
||||
, {doIntervalSame, 125 /* } */, 76,0, TRUE} // 71
|
||||
, {doIntervalError, 255, 99,0, FALSE} // 72
|
||||
, {doIntervalUpperDigit, 128, 73,0, TRUE} // 73 interval-upper
|
||||
, {doNOP, 125 /* } */, 76,0, TRUE} // 74
|
||||
, {doIntervalError, 255, 99,0, FALSE} // 75
|
||||
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 76 interval-type
|
||||
, {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 77
|
||||
, {doInterval, 255, 20,0, FALSE} // 78
|
||||
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 79 backslash
|
||||
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 80
|
||||
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 81
|
||||
, {doBackslashd, 100 /* d */, 14,0, TRUE} // 82
|
||||
, {doBackslashD, 68 /* D */, 14,0, TRUE} // 83
|
||||
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 84
|
||||
, {doProperty, 78 /* N */, 14,0, FALSE} // 85
|
||||
, {doProperty, 112 /* p */, 14,0, FALSE} // 86
|
||||
, {doProperty, 80 /* P */, 14,0, FALSE} // 87
|
||||
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 88
|
||||
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 89
|
||||
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 90
|
||||
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 91
|
||||
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 92
|
||||
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 93
|
||||
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 94
|
||||
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 95
|
||||
, {doBackRef, 128, 14,0, TRUE} // 96
|
||||
, {doEscapeError, 253, 99,0, FALSE} // 97
|
||||
, {doLiteralChar, 255, 14,0, TRUE} // 98
|
||||
, {doExit, 255, 99,0, TRUE} // 99 errorDeath
|
||||
, {doNOP, 60 /* < */, 44,0, TRUE} // 33
|
||||
, {doNOP, 35 /* # */, 47, 2, TRUE} // 34
|
||||
, {doBeginMatchMode, 105 /* i */, 50,0, FALSE} // 35
|
||||
, {doBeginMatchMode, 109 /* m */, 50,0, FALSE} // 36
|
||||
, {doBeginMatchMode, 115 /* s */, 50,0, FALSE} // 37
|
||||
, {doBeginMatchMode, 119 /* w */, 50,0, FALSE} // 38
|
||||
, {doBeginMatchMode, 120 /* x */, 50,0, FALSE} // 39
|
||||
, {doBeginMatchMode, 45 /* - */, 50,0, FALSE} // 40
|
||||
, {doConditionalExpr, 40 /* ( */, 101,0, TRUE} // 41
|
||||
, {doPerlInline, 123 /* { */, 101,0, TRUE} // 42
|
||||
, {doBadOpenParenType, 255, 101,0, FALSE} // 43
|
||||
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 44 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 45
|
||||
, {doBadOpenParenType, 255, 101,0, FALSE} // 46
|
||||
, {doNOP, 41 /* ) */, 255,0, TRUE} // 47 paren-comment
|
||||
, {doMismatchedParenErr, 253, 101,0, FALSE} // 48
|
||||
, {doNOP, 255, 47,0, TRUE} // 49
|
||||
, {doMatchMode, 105 /* i */, 50,0, TRUE} // 50 paren-flag
|
||||
, {doMatchMode, 109 /* m */, 50,0, TRUE} // 51
|
||||
, {doMatchMode, 115 /* s */, 50,0, TRUE} // 52
|
||||
, {doMatchMode, 119 /* w */, 50,0, TRUE} // 53
|
||||
, {doMatchMode, 120 /* x */, 50,0, TRUE} // 54
|
||||
, {doMatchMode, 45 /* - */, 50,0, TRUE} // 55
|
||||
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 56
|
||||
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 57
|
||||
, {doBadModeFlag, 255, 101,0, FALSE} // 58
|
||||
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 59 quant-star
|
||||
, {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 60
|
||||
, {doStar, 255, 20,0, FALSE} // 61
|
||||
, {doNGPlus, 63 /* ? */, 20,0, TRUE} // 62 quant-plus
|
||||
, {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 63
|
||||
, {doPlus, 255, 20,0, FALSE} // 64
|
||||
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 65 quant-opt
|
||||
, {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 66
|
||||
, {doOpt, 255, 20,0, FALSE} // 67
|
||||
, {doNOP, 129, 68,0, TRUE} // 68 interval-open
|
||||
, {doNOP, 128, 71,0, FALSE} // 69
|
||||
, {doIntervalError, 255, 101,0, FALSE} // 70
|
||||
, {doIntevalLowerDigit, 128, 71,0, TRUE} // 71 interval-lower
|
||||
, {doNOP, 44 /* , */, 75,0, TRUE} // 72
|
||||
, {doIntervalSame, 125 /* } */, 78,0, TRUE} // 73
|
||||
, {doIntervalError, 255, 101,0, FALSE} // 74
|
||||
, {doIntervalUpperDigit, 128, 75,0, TRUE} // 75 interval-upper
|
||||
, {doNOP, 125 /* } */, 78,0, TRUE} // 76
|
||||
, {doIntervalError, 255, 101,0, FALSE} // 77
|
||||
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 78 interval-type
|
||||
, {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 79
|
||||
, {doInterval, 255, 20,0, FALSE} // 80
|
||||
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 81 backslash
|
||||
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 82
|
||||
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 83
|
||||
, {doBackslashd, 100 /* d */, 14,0, TRUE} // 84
|
||||
, {doBackslashD, 68 /* D */, 14,0, TRUE} // 85
|
||||
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 86
|
||||
, {doProperty, 78 /* N */, 14,0, FALSE} // 87
|
||||
, {doProperty, 112 /* p */, 14,0, FALSE} // 88
|
||||
, {doProperty, 80 /* P */, 14,0, FALSE} // 89
|
||||
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 90
|
||||
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 91
|
||||
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 92
|
||||
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 93
|
||||
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 94
|
||||
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 95
|
||||
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 96
|
||||
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 97
|
||||
, {doBackRef, 128, 14,0, TRUE} // 98
|
||||
, {doEscapeError, 253, 101,0, FALSE} // 99
|
||||
, {doLiteralChar, 255, 14,0, TRUE} // 100
|
||||
, {doExit, 255, 101,0, TRUE} // 101 errorDeath
|
||||
};
|
||||
static const char * const RegexStateNames[] = { 0,
|
||||
"start",
|
||||
@ -245,6 +248,7 @@ static const char * const RegexStateNames[] = { 0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"open-paren-lookbehind",
|
||||
0,
|
||||
@ -259,6 +263,7 @@ static const char * const RegexStateNames[] = { 0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"quant-star",
|
||||
0,
|
||||
|
@ -135,6 +135,7 @@ open-paren-extended:
|
||||
'i' paren-flag doBeginMatchMode
|
||||
'm' paren-flag doBeginMatchMode
|
||||
's' paren-flag doBeginMatchMode
|
||||
'w' paren-flag doBeginMatchMode
|
||||
'x' paren-flag doBeginMatchMode
|
||||
'-' paren-flag doBeginMatchMode
|
||||
'(' n errorDeath doConditionalExpr
|
||||
@ -163,11 +164,12 @@ paren-flag:
|
||||
'i' n paren-flag doMatchMode
|
||||
'm' n paren-flag doMatchMode
|
||||
's' n paren-flag doMatchMode
|
||||
'w' n paren-flag doMatchMode
|
||||
'x' n paren-flag doMatchMode
|
||||
'-' n paren-flag doMatchMode
|
||||
')' n term doSetMatchMode
|
||||
':' n term ^expr-quant doMatchModeParen
|
||||
default errorDeath
|
||||
default errorDeath doBadModeFlag
|
||||
|
||||
|
||||
#
|
||||
|
@ -21,7 +21,7 @@ U_NAMESPACE_BEGIN
|
||||
//
|
||||
#ifdef _DEBUG
|
||||
//#define REGEX_SCAN_DEBUG
|
||||
//#define REGEX_DUMP_DEBUG
|
||||
#define REGEX_DUMP_DEBUG
|
||||
//#define REGEX_RUN_DEBUG
|
||||
#endif
|
||||
// End of #defines intended to be directly set.
|
||||
@ -165,10 +165,12 @@ enum {
|
||||
URX_LOOP_C = 51, // Continue a [set]* or OneChar* loop.
|
||||
// Operand is a matcher static data location.
|
||||
// Must always immediately follow LOOP_x_I instruction.
|
||||
URX_LOOP_DOT_I = 52 // .*, initialization of the optimized loop.
|
||||
URX_LOOP_DOT_I = 52, // .*, initialization of the optimized loop.
|
||||
// Operand value:
|
||||
// 0: Normal (. doesn't match new-line) mode.
|
||||
// 1: . matches new-line mode.
|
||||
URX_BACKSLASH_BU = 53 // \b or \B in UREGEX_UWORD mode, using Unicode style
|
||||
// word boundaries.
|
||||
|
||||
};
|
||||
|
||||
@ -227,7 +229,8 @@ enum {
|
||||
"STAT_SETREF_N", \
|
||||
"LOOP_SR_I", \
|
||||
"LOOP_C", \
|
||||
"LOOP_DOT_I"
|
||||
"LOOP_DOT_I", \
|
||||
"BACKSLASH_BU"
|
||||
|
||||
|
||||
//
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/rbbi.h"
|
||||
#include "uassert.h"
|
||||
#include "cmemory.h"
|
||||
#include "uvector.h"
|
||||
@ -42,6 +43,7 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat) {
|
||||
fDeferredStatus = U_ZERO_ERROR;
|
||||
fStack = new UVector32(fDeferredStatus);
|
||||
fData = fSmallData;
|
||||
fWordBreakItr = NULL;
|
||||
if (pat==NULL) {
|
||||
fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return;
|
||||
@ -67,6 +69,7 @@ RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &inp
|
||||
fDeferredStatus = U_ZERO_ERROR;
|
||||
fStack = new UVector32(status);
|
||||
fData = fSmallData;
|
||||
fWordBreakItr = NULL;
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
@ -89,6 +92,7 @@ RegexMatcher::RegexMatcher(const UnicodeString ®exp,
|
||||
fData = fSmallData;
|
||||
fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
|
||||
fPattern = fPatternOwned;
|
||||
fWordBreakItr = NULL;
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
@ -115,6 +119,7 @@ RegexMatcher::~RegexMatcher() {
|
||||
fPatternOwned = NULL;
|
||||
fPattern = NULL;
|
||||
}
|
||||
delete fWordBreakItr;
|
||||
}
|
||||
|
||||
|
||||
@ -674,6 +679,9 @@ RegexMatcher &RegexMatcher::reset() {
|
||||
//
//   reset(input)
//
//      Switch this matcher to a new input string.  The pointer to the input is
//      stored first, then the no-argument reset() is invoked.  If a word break
//      iterator has already been created for this matcher, it is re-targeted
//      at the new text as well.
//
//          parameters:   input - the new string to match against.  Only its
//                                address is stored in fInput here.
//
//          returns:      a reference to this matcher.
//
RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
    fInput = &input;
    reset();

    // No break iterator has been created yet: nothing further to update.
    if (fWordBreakItr == NULL) {
        return *this;
    }

    // Keep the existing break iterator in step with the new input text.
    fWordBreakItr->setText(input);
    return *this;
}
|
||||
|
||||
@ -893,9 +901,6 @@ REStackFrame *RegexMatcher::resetStack() {
|
||||
// opposite in membership in \w set
|
||||
//
|
||||
// parameters: pos - the current position in the input buffer
|
||||
// start - the position where the match operation started.
|
||||
// don't backup before this position when looking back
|
||||
// for a preceding base char.
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
UBool RegexMatcher::isWordBoundary(int32_t pos) {
|
||||
@ -934,6 +939,46 @@ UBool RegexMatcher::isWordBoundary(int32_t pos) {
|
||||
return isBoundary;
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// isUWordBoundary
|
||||
//
|
||||
// Test for a word boundary using RBBI word break.
|
||||
//
|
||||
// parameters: pos - the current position in the input buffer
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
UBool RegexMatcher::isUWordBoundary(int32_t pos) {
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
|
||||
// If we haven't yet created a break iterator for this matcher, do it now.
|
||||
if (fWordBreakItr == NULL) {
|
||||
fWordBreakItr =
|
||||
(RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
|
||||
if (U_FAILURE(status)) {
|
||||
// TODO: reliable error reporting for BI failures.
|
||||
return FALSE;
|
||||
}
|
||||
fWordBreakItr->setText(*fInput);
|
||||
}
|
||||
|
||||
// If we are not positioned at an RBBI style boundary, \b isn't at a boundary either.
|
||||
if (fWordBreakItr->isBoundary(pos) == FALSE) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
// Discard RBBI boundaries where the "words" on both sides have the break
|
||||
// status of UBRK_WORD_NONE. Spaces and puncutation, for example.
|
||||
int32_t prevStatus = fWordBreakItr->getRuleStatus();
|
||||
if (prevStatus >= UBRK_WORD_NUMBER && prevStatus < UBRK_WORD_IDEO_LIMIT) {
|
||||
return TRUE;
|
||||
}
|
||||
fWordBreakItr->next();
|
||||
int32_t nextStatus = fWordBreakItr->getRuleStatus();
|
||||
UBool returnVal = (nextStatus >= UBRK_WORD_NUMBER && nextStatus < UBRK_WORD_IDEO_LIMIT);
|
||||
return returnVal;
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// StateSave
|
||||
@ -1244,6 +1289,17 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
|
||||
{
|
||||
UBool success = isUWordBoundary(fp->fInputIdx);
|
||||
success ^= (opValue != 0); // flip sense for \B
|
||||
if (!success) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_D: // Test for decimal digit
|
||||
{
|
||||
if (fp->fInputIdx >= inputLen) {
|
||||
|
@ -445,6 +445,7 @@ void RegexPattern::dumpOp(int32_t index) const {
|
||||
case URX_JMP_SAV:
|
||||
case URX_JMP_SAV_X:
|
||||
case URX_BACKSLASH_B:
|
||||
case URX_BACKSLASH_BU:
|
||||
case URX_BACKSLASH_D:
|
||||
case URX_BACKSLASH_Z:
|
||||
case URX_STRING_LEN:
|
||||
|
@ -55,6 +55,8 @@ class UVector32;
|
||||
class UnicodeSet;
|
||||
struct REStackFrame;
|
||||
struct Regex8BitSet;
|
||||
class RuleBasedBreakIterator;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
@ -864,7 +866,8 @@ private:
|
||||
//
|
||||
void MatchAt(int32_t startIdx, UErrorCode &status);
|
||||
inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
|
||||
UBool isWordBoundary(int32_t pos); // perform the \b test
|
||||
UBool isWordBoundary(int32_t pos); // perform Perl-like \b test
|
||||
UBool isUWordBoundary(int32_t pos); // perform RBBI based \b test
|
||||
REStackFrame *resetStack();
|
||||
inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx,
|
||||
int32_t frameSize, UErrorCode &status);
|
||||
@ -896,6 +899,8 @@ private:
|
||||
UBool fTouchedEnd; // Set true if match engine reaches eof on input
|
||||
// while attempting a match.
|
||||
|
||||
RuleBasedBreakIterator *fWordBreakItr;
|
||||
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
15
icu4c/source/test/testdata/regextst.txt
vendored
15
icu4c/source/test/testdata/regextst.txt
vendored
@ -66,6 +66,7 @@
|
||||
".*\Ahello" "stuff\nhello" # don't match after embedded new-line.
|
||||
|
||||
# \b \B
|
||||
#
|
||||
".*?\b(.).*" "<0> $%^&*( <1>h</1>ello123%^&*()gxx</0>"
|
||||
"\ba\b" "-<0>a</0>"
|
||||
"\by\b" "xy"
|
||||
@ -78,6 +79,19 @@
|
||||
|
||||
"(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?.*" "<0> \u0301 \u0301<1>A</1>\u0302BC\u0303\u0304<2> </2>\u0305 \u0306<3>X</3>\u0307Y\u0308</0>"
|
||||
|
||||
|
||||
#
|
||||
# Unicode word boundary mode
|
||||
#
|
||||
"(?w).*?\b" "<0></0>hello, world"
|
||||
"(?w).*?(\b.+?\b).*" "<0> <1>123.45</1> </0>"
|
||||
".*?(\b.+?\b).*" "<0> <1>123</1>.45 </0>"
|
||||
"(?w:.*?(\b.+?\b).*)" "<0> <1>123.45</1> </0>"
|
||||
"(?w:.*?(\b.+?\b).*)" "<0><1>don't</1> </0>"
|
||||
"(?w:.+?(\b.+?\b).*)" "<0> <1>don't</1> </0>"
|
||||
"(?w:.+?(\b.+?\b).*)" "<0> . ,,,:$$ <1>37,000.50</1> </0>"
|
||||
|
||||
|
||||
# . does not match new-lines
|
||||
"." "\u000a\u000d\u0085\u000c\u2028\u2029<0>X</0>\u000aY"
|
||||
"A." "A\u000a "# no match
|
||||
@ -349,6 +363,7 @@
|
||||
"\ud800\ud800\udc00" "<0>\ud800\U00010000</0>\U00010000\U00010000\U00010001"
|
||||
"(\ud800)(\udc00)" "\U00010000"
|
||||
|
||||
|
||||
#
|
||||
# Bug 3225
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user