ICU-2468 regexp \b word boundaries match Unicode TR definition.

X-SVN-Rev: 13978
This commit is contained in:
Andy Heninger 2003-12-03 22:21:14 +00:00
parent 734b35159a
commit d7b1c9d7a1
2 changed files with 8 additions and 19 deletions

View File

@ -950,6 +950,7 @@ UBool RegexMatcher::isWordBoundary(int32_t pos) {
//--------------------------------------------------------------------------------
UBool RegexMatcher::isUWordBoundary(int32_t pos) {
UErrorCode status = U_ZERO_ERROR;
UBool returnVal = FALSE;
// If we haven't yet created a break iterator for this matcher, do it now.
if (fWordBreakItr == NULL) {
@ -962,20 +963,7 @@ UBool RegexMatcher::isUWordBoundary(int32_t pos) {
fWordBreakItr->setText(*fInput);
}
// If we are not positioned at an RBBI style boundary, \b isn't at a boundary either.
if (fWordBreakItr->isBoundary(pos) == FALSE) {
return FALSE;
}
// Discard RBBI boundaries where the "words" on both sides have the break
// status of UBRK_WORD_NONE. Spaces and punctuation, for example.
int32_t prevStatus = fWordBreakItr->getRuleStatus();
if (prevStatus >= UBRK_WORD_NUMBER && prevStatus < UBRK_WORD_IDEO_LIMIT) {
return TRUE;
}
fWordBreakItr->next();
int32_t nextStatus = fWordBreakItr->getRuleStatus();
UBool returnVal = (nextStatus >= UBRK_WORD_NUMBER && nextStatus < UBRK_WORD_IDEO_LIMIT);
returnVal = fWordBreakItr->isBoundary(pos);
return returnVal;
}

View File

@ -84,12 +84,13 @@
# Unicode word boundary mode
#
"(?w).*?\b" "<0></0>hello, world"
"(?w).*?(\b.+?\b).*" "<0> <1>123.45</1> </0>"
"(?w).*?(\b.+?\b).*" "<0><1> </1> 123.45 </0>"
"(?w).*?(\b\d.*?\b).*" "<0> <1>123.45</1> </0>"
".*?(\b.+?\b).*" "<0> <1>123</1>.45 </0>"
"(?w:.*?(\b.+?\b).*)" "<0> <1>123.45</1> </0>"
"(?w:.*?(\b\d.*?\b).*)" "<0> <1>123.45</1> </0>"
"(?w:.*?(\b.+?\b).*)" "<0><1>don't</1> </0>"
"(?w:.+?(\b.+?\b).*)" "<0> <1>don't</1> </0>"
"(?w:.+?(\b.+?\b).*)" "<0> . ,,,:$$ <1>37,000.50</1> </0>"
"(?w:.+?(\b\S.+?\b).*)" "<0> <1>don't</1> </0>"
"(?w:(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?).*)" "<0><1>.</1><2> </2><3>,</3><4>:</4><5>$</5><6>37,000.50</6><7> </7> </0>"
# . does not match new-lines