ICU-2468 regexp \b word boundaries match Unicode TR definition.
X-SVN-Rev: 13978
This commit is contained in:
parent
734b35159a
commit
d7b1c9d7a1
@ -950,6 +950,7 @@ UBool RegexMatcher::isWordBoundary(int32_t pos) {
|
||||
//--------------------------------------------------------------------------------
|
||||
UBool RegexMatcher::isUWordBoundary(int32_t pos) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UBool returnVal = FALSE;
|
||||
|
||||
// If we haven't yet created a break iterator for this matcher, do it now.
|
||||
if (fWordBreakItr == NULL) {
|
||||
@ -962,20 +963,7 @@ UBool RegexMatcher::isUWordBoundary(int32_t pos) {
|
||||
fWordBreakItr->setText(*fInput);
|
||||
}
|
||||
|
||||
// If we are not positioned at an RBBI style boundary, \b isn't at a boundary either.
|
||||
if (fWordBreakItr->isBoundary(pos) == FALSE) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
// Discard RBBI boundaries where the "words" on both sides have the break
|
||||
// status of UBRK_WORD_NONE. Spaces and punctuation, for example.
|
||||
int32_t prevStatus = fWordBreakItr->getRuleStatus();
|
||||
if (prevStatus >= UBRK_WORD_NUMBER && prevStatus < UBRK_WORD_IDEO_LIMIT) {
|
||||
return TRUE;
|
||||
}
|
||||
fWordBreakItr->next();
|
||||
int32_t nextStatus = fWordBreakItr->getRuleStatus();
|
||||
UBool returnVal = (nextStatus >= UBRK_WORD_NUMBER && nextStatus < UBRK_WORD_IDEO_LIMIT);
|
||||
returnVal = fWordBreakItr->isBoundary(pos);
|
||||
return returnVal;
|
||||
}
|
||||
|
||||
|
9
icu4c/source/test/testdata/regextst.txt
vendored
9
icu4c/source/test/testdata/regextst.txt
vendored
@ -84,12 +84,13 @@
|
||||
# Unicode word boundary mode
|
||||
#
|
||||
"(?w).*?\b" "<0></0>hello, world"
|
||||
"(?w).*?(\b.+?\b).*" "<0> <1>123.45</1> </0>"
|
||||
"(?w).*?(\b.+?\b).*" "<0><1> </1> 123.45 </0>"
|
||||
"(?w).*?(\b\d.*?\b).*" "<0> <1>123.45</1> </0>"
|
||||
".*?(\b.+?\b).*" "<0> <1>123</1>.45 </0>"
|
||||
"(?w:.*?(\b.+?\b).*)" "<0> <1>123.45</1> </0>"
|
||||
"(?w:.*?(\b\d.*?\b).*)" "<0> <1>123.45</1> </0>"
|
||||
"(?w:.*?(\b.+?\b).*)" "<0><1>don't</1> </0>"
|
||||
"(?w:.+?(\b.+?\b).*)" "<0> <1>don't</1> </0>"
|
||||
"(?w:.+?(\b.+?\b).*)" "<0> . ,,,:$$ <1>37,000.50</1> </0>"
|
||||
"(?w:.+?(\b\S.+?\b).*)" "<0> <1>don't</1> </0>"
|
||||
"(?w:(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?).*)" "<0><1>.</1><2> </2><3>,</3><4>:</4><5>$</5><6>37,000.50</6><7> </7> </0>"
|
||||
|
||||
|
||||
# . does not match new-lines
|
||||
|
Loading…
Reference in New Issue
Block a user