ICU-2468 regexp word boundaries using RBBI

X-SVN-Rev: 13641
This commit is contained in:
Andy Heninger 2003-11-08 02:01:42 +00:00
parent 0317f272a3
commit d4e4635656
8 changed files with 186 additions and 84 deletions

View File

@ -1089,11 +1089,17 @@ UBool RegexCompile::doParseActions(EParseAction action)
break;
case doBackslashB:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 1), *fStatus);
{
int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus);
}
break;
case doBackslashb:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_B, 0), *fStatus);
{
int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
}
break;
case doBackslashD:
@ -1325,6 +1331,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break;
case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break;
case 0x73: /* 's' */ bit = UREGEX_DOTALL; break;
case 0x77: /* 'w' */ bit = UREGEX_UWORD; break;
case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break;
case 0x2d: /* '-' */ fSetModeFlag = FALSE; break;
default:
@ -1376,6 +1383,10 @@ UBool RegexCompile::doParseActions(EParseAction action)
}
break;
case doBadModeFlag:
error(U_REGEX_INVALID_FLAG);
break;
case doSuppressComments:
// We have just scanned a '(?'. We now need to prevent the character scanner from
// treating a '#' as a to-the-end-of-line comment.
@ -2115,6 +2126,7 @@ void RegexCompile::matchStartType() {
case URX_START_CAPTURE:
case URX_END_CAPTURE:
case URX_BACKSLASH_B:
case URX_BACKSLASH_BU:
case URX_BACKSLASH_G:
case URX_BACKSLASH_Z:
case URX_DOLLAR:
@ -2585,6 +2597,7 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
case URX_START_CAPTURE:
case URX_END_CAPTURE:
case URX_BACKSLASH_B:
case URX_BACKSLASH_BU:
case URX_BACKSLASH_G:
case URX_BACKSLASH_Z:
case URX_CARET:
@ -2824,6 +2837,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
case URX_START_CAPTURE:
case URX_END_CAPTURE:
case URX_BACKSLASH_B:
case URX_BACKSLASH_BU:
case URX_BACKSLASH_G:
case URX_BACKSLASH_Z:
case URX_CARET:
@ -3077,6 +3091,7 @@ void RegexCompile::stripNOPs() {
case URX_DOTANY:
case URX_FAIL:
case URX_BACKSLASH_B:
case URX_BACKSLASH_BU:
case URX_BACKSLASH_G:
case URX_BACKSLASH_X:
case URX_BACKSLASH_Z:

View File

@ -79,6 +79,7 @@ enum Regex_PatternParseAction {
doExit,
doNGInterval,
doPatStart,
doBadModeFlag,
doBackslashb,
doPossessiveStar,
doBackslashd,
@ -111,15 +112,15 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doDotAny, 46 /* . */, 14,0, TRUE} // 6
, {doCaret, 94 /* ^ */, 2,0, TRUE} // 7
, {doDollar, 36 /* $ */, 2,0, TRUE} // 8
, {doNOP, 92 /* \ */, 79,0, TRUE} // 9
, {doNOP, 92 /* \ */, 81,0, TRUE} // 9
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 10
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11
, {doPatFinish, 253, 2,0, FALSE} // 12
, {doRuleError, 255, 99,0, FALSE} // 13
, {doNOP, 42 /* * */, 57,0, TRUE} // 14 expr-quant
, {doNOP, 43 /* + */, 60,0, TRUE} // 15
, {doNOP, 63 /* ? */, 63,0, TRUE} // 16
, {doIntervalInit, 123 /* { */, 66,0, TRUE} // 17
, {doRuleError, 255, 101,0, FALSE} // 13
, {doNOP, 42 /* * */, 59,0, TRUE} // 14 expr-quant
, {doNOP, 43 /* + */, 62,0, TRUE} // 15
, {doNOP, 63 /* ? */, 65,0, TRUE} // 16
, {doIntervalInit, 123 /* { */, 68,0, TRUE} // 17
, {doNOP, 40 /* ( */, 23,0, TRUE} // 18
, {doNOP, 255, 20,0, FALSE} // 19
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 20 expr-cont
@ -127,7 +128,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doNOP, 255, 2,0, FALSE} // 22
, {doSuppressComments, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant
, {doNOP, 255, 27,0, FALSE} // 24
, {doNOP, 35 /* # */, 46, 14, TRUE} // 25 open-paren-quant2
, {doNOP, 35 /* # */, 47, 14, TRUE} // 25 open-paren-quant2
, {doNOP, 255, 29,0, FALSE} // 26
, {doSuppressComments, 63 /* ? */, 29,0, TRUE} // 27 open-paren
, {doOpenCaptureParen, 255, 2, 14, FALSE} // 28
@ -135,73 +136,75 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doOpenAtomicParen, 62 /* > */, 2, 14, TRUE} // 30
, {doOpenLookAhead, 61 /* = */, 2, 20, TRUE} // 31
, {doOpenLookAheadNeg, 33 /* ! */, 2, 20, TRUE} // 32
, {doNOP, 60 /* < */, 43,0, TRUE} // 33
, {doNOP, 35 /* # */, 46, 2, TRUE} // 34
, {doBeginMatchMode, 105 /* i */, 49,0, FALSE} // 35
, {doBeginMatchMode, 109 /* m */, 49,0, FALSE} // 36
, {doBeginMatchMode, 115 /* s */, 49,0, FALSE} // 37
, {doBeginMatchMode, 120 /* x */, 49,0, FALSE} // 38
, {doBeginMatchMode, 45 /* - */, 49,0, FALSE} // 39
, {doConditionalExpr, 40 /* ( */, 99,0, TRUE} // 40
, {doPerlInline, 123 /* { */, 99,0, TRUE} // 41
, {doBadOpenParenType, 255, 99,0, FALSE} // 42
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 43 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 44
, {doBadOpenParenType, 255, 99,0, FALSE} // 45
, {doNOP, 41 /* ) */, 255,0, TRUE} // 46 paren-comment
, {doMismatchedParenErr, 253, 99,0, FALSE} // 47
, {doNOP, 255, 46,0, TRUE} // 48
, {doMatchMode, 105 /* i */, 49,0, TRUE} // 49 paren-flag
, {doMatchMode, 109 /* m */, 49,0, TRUE} // 50
, {doMatchMode, 115 /* s */, 49,0, TRUE} // 51
, {doMatchMode, 120 /* x */, 49,0, TRUE} // 52
, {doMatchMode, 45 /* - */, 49,0, TRUE} // 53
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 54
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 55
, {doNOP, 255, 99,0, FALSE} // 56
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 57 quant-star
, {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 58
, {doStar, 255, 20,0, FALSE} // 59
, {doNGPlus, 63 /* ? */, 20,0, TRUE} // 60 quant-plus
, {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 61
, {doPlus, 255, 20,0, FALSE} // 62
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 63 quant-opt
, {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 64
, {doOpt, 255, 20,0, FALSE} // 65
, {doNOP, 129, 66,0, TRUE} // 66 interval-open
, {doNOP, 128, 69,0, FALSE} // 67
, {doIntervalError, 255, 99,0, FALSE} // 68
, {doIntevalLowerDigit, 128, 69,0, TRUE} // 69 interval-lower
, {doNOP, 44 /* , */, 73,0, TRUE} // 70
, {doIntervalSame, 125 /* } */, 76,0, TRUE} // 71
, {doIntervalError, 255, 99,0, FALSE} // 72
, {doIntervalUpperDigit, 128, 73,0, TRUE} // 73 interval-upper
, {doNOP, 125 /* } */, 76,0, TRUE} // 74
, {doIntervalError, 255, 99,0, FALSE} // 75
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 76 interval-type
, {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 77
, {doInterval, 255, 20,0, FALSE} // 78
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 79 backslash
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 80
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 81
, {doBackslashd, 100 /* d */, 14,0, TRUE} // 82
, {doBackslashD, 68 /* D */, 14,0, TRUE} // 83
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 84
, {doProperty, 78 /* N */, 14,0, FALSE} // 85
, {doProperty, 112 /* p */, 14,0, FALSE} // 86
, {doProperty, 80 /* P */, 14,0, FALSE} // 87
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 88
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 89
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 90
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 91
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 92
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 93
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 94
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 95
, {doBackRef, 128, 14,0, TRUE} // 96
, {doEscapeError, 253, 99,0, FALSE} // 97
, {doLiteralChar, 255, 14,0, TRUE} // 98
, {doExit, 255, 99,0, TRUE} // 99 errorDeath
, {doNOP, 60 /* < */, 44,0, TRUE} // 33
, {doNOP, 35 /* # */, 47, 2, TRUE} // 34
, {doBeginMatchMode, 105 /* i */, 50,0, FALSE} // 35
, {doBeginMatchMode, 109 /* m */, 50,0, FALSE} // 36
, {doBeginMatchMode, 115 /* s */, 50,0, FALSE} // 37
, {doBeginMatchMode, 119 /* w */, 50,0, FALSE} // 38
, {doBeginMatchMode, 120 /* x */, 50,0, FALSE} // 39
, {doBeginMatchMode, 45 /* - */, 50,0, FALSE} // 40
, {doConditionalExpr, 40 /* ( */, 101,0, TRUE} // 41
, {doPerlInline, 123 /* { */, 101,0, TRUE} // 42
, {doBadOpenParenType, 255, 101,0, FALSE} // 43
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 44 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 45
, {doBadOpenParenType, 255, 101,0, FALSE} // 46
, {doNOP, 41 /* ) */, 255,0, TRUE} // 47 paren-comment
, {doMismatchedParenErr, 253, 101,0, FALSE} // 48
, {doNOP, 255, 47,0, TRUE} // 49
, {doMatchMode, 105 /* i */, 50,0, TRUE} // 50 paren-flag
, {doMatchMode, 109 /* m */, 50,0, TRUE} // 51
, {doMatchMode, 115 /* s */, 50,0, TRUE} // 52
, {doMatchMode, 119 /* w */, 50,0, TRUE} // 53
, {doMatchMode, 120 /* x */, 50,0, TRUE} // 54
, {doMatchMode, 45 /* - */, 50,0, TRUE} // 55
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 56
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 57
, {doBadModeFlag, 255, 101,0, FALSE} // 58
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 59 quant-star
, {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 60
, {doStar, 255, 20,0, FALSE} // 61
, {doNGPlus, 63 /* ? */, 20,0, TRUE} // 62 quant-plus
, {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 63
, {doPlus, 255, 20,0, FALSE} // 64
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 65 quant-opt
, {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 66
, {doOpt, 255, 20,0, FALSE} // 67
, {doNOP, 129, 68,0, TRUE} // 68 interval-open
, {doNOP, 128, 71,0, FALSE} // 69
, {doIntervalError, 255, 101,0, FALSE} // 70
, {doIntevalLowerDigit, 128, 71,0, TRUE} // 71 interval-lower
, {doNOP, 44 /* , */, 75,0, TRUE} // 72
, {doIntervalSame, 125 /* } */, 78,0, TRUE} // 73
, {doIntervalError, 255, 101,0, FALSE} // 74
, {doIntervalUpperDigit, 128, 75,0, TRUE} // 75 interval-upper
, {doNOP, 125 /* } */, 78,0, TRUE} // 76
, {doIntervalError, 255, 101,0, FALSE} // 77
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 78 interval-type
, {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 79
, {doInterval, 255, 20,0, FALSE} // 80
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 81 backslash
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 82
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 83
, {doBackslashd, 100 /* d */, 14,0, TRUE} // 84
, {doBackslashD, 68 /* D */, 14,0, TRUE} // 85
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 86
, {doProperty, 78 /* N */, 14,0, FALSE} // 87
, {doProperty, 112 /* p */, 14,0, FALSE} // 88
, {doProperty, 80 /* P */, 14,0, FALSE} // 89
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 90
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 91
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 92
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 93
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 94
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 95
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 96
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 97
, {doBackRef, 128, 14,0, TRUE} // 98
, {doEscapeError, 253, 101,0, FALSE} // 99
, {doLiteralChar, 255, 14,0, TRUE} // 100
, {doExit, 255, 101,0, TRUE} // 101 errorDeath
};
static const char * const RegexStateNames[] = { 0,
"start",
@ -245,6 +248,7 @@ static const char * const RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
"open-paren-lookbehind",
0,
@ -259,6 +263,7 @@ static const char * const RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
"quant-star",
0,

View File

@ -135,6 +135,7 @@ open-paren-extended:
'i' paren-flag doBeginMatchMode
'm' paren-flag doBeginMatchMode
's' paren-flag doBeginMatchMode
'w' paren-flag doBeginMatchMode
'x' paren-flag doBeginMatchMode
'-' paren-flag doBeginMatchMode
'(' n errorDeath doConditionalExpr
@ -163,11 +164,12 @@ paren-flag:
'i' n paren-flag doMatchMode
'm' n paren-flag doMatchMode
's' n paren-flag doMatchMode
'w' n paren-flag doMatchMode
'x' n paren-flag doMatchMode
'-' n paren-flag doMatchMode
')' n term doSetMatchMode
':' n term ^expr-quant doMatchModeParen
default errorDeath
default errorDeath doBadModeFlag
#

View File

@ -21,7 +21,7 @@ U_NAMESPACE_BEGIN
//
#ifdef _DEBUG
//#define REGEX_SCAN_DEBUG
//#define REGEX_DUMP_DEBUG
#define REGEX_DUMP_DEBUG
//#define REGEX_RUN_DEBUG
#endif
// End of #defines intended to be directly set.
@ -165,10 +165,12 @@ enum {
URX_LOOP_C = 51, // Continue a [set]* or OneChar* loop.
// Operand is a matcher static data location.
// Must always immediately follow LOOP_x_I instruction.
URX_LOOP_DOT_I = 52 // .*, initialization of the optimized loop.
URX_LOOP_DOT_I = 52, // .*, initialization of the optimized loop.
// Operand value:
// 0: Normal (. doesn't match new-line) mode.
// 1: . matches new-line mode.
URX_BACKSLASH_BU = 53 // \b or \B in UREGEX_UWORD mode, using Unicode style
// word boundaries.
};
@ -227,7 +229,8 @@ enum {
"STAT_SETREF_N", \
"LOOP_SR_I", \
"LOOP_C", \
"LOOP_DOT_I"
"LOOP_DOT_I", \
"BACKSLASH_BU"
//

View File

@ -18,6 +18,7 @@
#include "unicode/uniset.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
#include "unicode/rbbi.h"
#include "uassert.h"
#include "cmemory.h"
#include "uvector.h"
@ -42,6 +43,7 @@ RegexMatcher::RegexMatcher(const RegexPattern *pat) {
fDeferredStatus = U_ZERO_ERROR;
fStack = new UVector32(fDeferredStatus);
fData = fSmallData;
fWordBreakItr = NULL;
if (pat==NULL) {
fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
return;
@ -67,6 +69,7 @@ RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &inp
fDeferredStatus = U_ZERO_ERROR;
fStack = new UVector32(status);
fData = fSmallData;
fWordBreakItr = NULL;
if (U_FAILURE(status)) {
return;
}
@ -89,6 +92,7 @@ RegexMatcher::RegexMatcher(const UnicodeString &regexp,
fData = fSmallData;
fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
fPattern = fPatternOwned;
fWordBreakItr = NULL;
if (U_FAILURE(status)) {
return;
}
@ -115,6 +119,7 @@ RegexMatcher::~RegexMatcher() {
fPatternOwned = NULL;
fPattern = NULL;
}
delete fWordBreakItr;
}
@ -674,6 +679,9 @@ RegexMatcher &RegexMatcher::reset() {
// Point this matcher at a new input string and reset its match state.
// fInput keeps a pointer to `input` rather than a copy of it.
RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
    fInput = &input;
    reset();

    // No word break iterator has been allocated for this matcher,
    // so there is nothing further to re-target.
    if (fWordBreakItr == NULL) {
        return *this;
    }

    // Keep the existing break iterator in step with the new input text.
    fWordBreakItr->setText(input);
    return *this;
}
@ -893,9 +901,6 @@ REStackFrame *RegexMatcher::resetStack() {
// opposite in membership in \w set
//
// parameters: pos - the current position in the input buffer
// start - the position where the match operation started.
// don't backup before this position when looking back
// for a preceding base char.
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::isWordBoundary(int32_t pos) {
@ -934,6 +939,46 @@ UBool RegexMatcher::isWordBoundary(int32_t pos) {
return isBoundary;
}
//--------------------------------------------------------------------------------
//
//   isUWordBoundary
//
//      Test for a word boundary using RBBI word break.
//
//      parameters:   pos   - the current position in the input buffer
//
//      returns:      TRUE  if pos is an RBBI word boundary and the "word" on at
//                          least one side of it has a rule status other than
//                          UBRK_WORD_NONE.
//                    FALSE otherwise, including when no word break iterator
//                          could be created for this matcher.
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::isUWordBoundary(int32_t pos) {
    // If we haven't yet created a break iterator for this matcher, do it now.
    if (fWordBreakItr == NULL) {
        UErrorCode status = U_ZERO_ERROR;
        // NOTE(review): the downcast assumes the word instance really is a
        //               RuleBasedBreakIterator -- confirm against the factory.
        RuleBasedBreakIterator *newItr =
            (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
        if (U_FAILURE(status) || newItr == NULL) {
            // TODO:  reliable error reporting for BI failures.
            //
            // Never publish an unusable iterator into fWordBreakItr: once the
            // member is non-NULL this creation branch is skipped, so a failed
            // or text-less iterator would be used by every later call.
            // (delete of a NULL pointer is a no-op.)
            delete newItr;
            return FALSE;
        }
        newItr->setText(*fInput);
        fWordBreakItr = newItr;
    }

    // If we are not positioned at an RBBI style boundary, \b isn't at a boundary either.
    if (fWordBreakItr->isBoundary(pos) == FALSE) {
        return FALSE;
    }

    // Discard RBBI boundaries where the "words" on both sides have the break
    // status of UBRK_WORD_NONE.  Spaces and punctuation, for example.

    // Rule status reported at the boundary the iterator is now positioned on.
    int32_t prevStatus = fWordBreakItr->getRuleStatus();
    if (prevStatus >= UBRK_WORD_NUMBER && prevStatus < UBRK_WORD_IDEO_LIMIT) {
        return TRUE;
    }

    // Advance to the following boundary and apply the same range test to its
    // rule status.
    fWordBreakItr->next();
    int32_t nextStatus = fWordBreakItr->getRuleStatus();
    return (nextStatus >= UBRK_WORD_NUMBER && nextStatus < UBRK_WORD_IDEO_LIMIT);
}
//--------------------------------------------------------------------------------
//
// StateSave
@ -1244,6 +1289,17 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
break;
case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
{
UBool success = isUWordBoundary(fp->fInputIdx);
success ^= (opValue != 0); // flip sense for \B
if (!success) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
}
}
break;
case URX_BACKSLASH_D: // Test for decimal digit
{
if (fp->fInputIdx >= inputLen) {

View File

@ -445,6 +445,7 @@ void RegexPattern::dumpOp(int32_t index) const {
case URX_JMP_SAV:
case URX_JMP_SAV_X:
case URX_BACKSLASH_B:
case URX_BACKSLASH_BU:
case URX_BACKSLASH_D:
case URX_BACKSLASH_Z:
case URX_STRING_LEN:

View File

@ -55,6 +55,8 @@ class UVector32;
class UnicodeSet;
struct REStackFrame;
struct Regex8BitSet;
class RuleBasedBreakIterator;
/**
@ -864,7 +866,8 @@ private:
//
void MatchAt(int32_t startIdx, UErrorCode &status);
inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
UBool isWordBoundary(int32_t pos); // perform the \b test
UBool isWordBoundary(int32_t pos); // perform Perl-like \b test
UBool isUWordBoundary(int32_t pos); // perform RBBI based \b test
REStackFrame *resetStack();
inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx,
int32_t frameSize, UErrorCode &status);
@ -896,6 +899,8 @@ private:
UBool fTouchedEnd; // Set true if match engine reaches eof on input
// while attempting a match.
RuleBasedBreakIterator *fWordBreakItr;
};
U_NAMESPACE_END

View File

@ -66,6 +66,7 @@
".*\Ahello" "stuff\nhello" # don't match after embedded new-line.
# \b \B
#
".*?\b(.).*" "<0> $%^&*( <1>h</1>ello123%^&*()gxx</0>"
"\ba\b" "-<0>a</0>"
"\by\b" "xy"
@ -78,6 +79,19 @@
"(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?(?:.*?\b(.))?.*" "<0> \u0301 \u0301<1>A</1>\u0302BC\u0303\u0304<2> </2>\u0305 \u0306<3>X</3>\u0307Y\u0308</0>"
#
# Unicode word boundary mode
#
"(?w).*?\b" "<0></0>hello, world"
"(?w).*?(\b.+?\b).*" "<0> <1>123.45</1> </0>"
".*?(\b.+?\b).*" "<0> <1>123</1>.45 </0>"
"(?w:.*?(\b.+?\b).*)" "<0> <1>123.45</1> </0>"
"(?w:.*?(\b.+?\b).*)" "<0><1>don't</1> </0>"
"(?w:.+?(\b.+?\b).*)" "<0> <1>don't</1> </0>"
"(?w:.+?(\b.+?\b).*)" "<0> . ,,,:$$ <1>37,000.50</1> </0>"
# . does not match new-lines
"." "\u000a\u000d\u0085\u000c\u2028\u2029<0>X</0>\u000aY"
"A." "A\u000a "# no match
@ -349,6 +363,7 @@
"\ud800\ud800\udc00" "<0>\ud800\U00010000</0>\U00010000\U00010000\U00010001"
"(\ud800)(\udc00)" "\U00010000"
#
# Bug 3225