ICU-2292 line break rules updated, 15 mins testmonkey passes
X-SVN-Rev: 13663
This commit is contained in:
parent
7eb4264ca5
commit
8feb899d7d
@ -730,8 +730,10 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
|
||||
lookaheadStatus = 0;
|
||||
} else if (result == initialPosition) {
|
||||
// Ran off end, no match found.
|
||||
// Treat as a break at the end of the input string.
|
||||
result = fText->endIndex();
|
||||
// move forward one
|
||||
fText->setIndex(initialPosition);
|
||||
fText->next32();
|
||||
fText->getIndex();
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -44,6 +44,7 @@ $SA = [:LineBreak = Complex_Context:];
|
||||
$SG = [:LineBreak = Surrogate:];
|
||||
$SP = [:LineBreak = Space:];
|
||||
$SY = [:LineBreak = Break_Symbols:];
|
||||
$WJ = [:LineBreak = Word_Joiner:];
|
||||
$XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
|
||||
@ -60,7 +61,6 @@ $LVT = [:Hangul_Syllable_Type = LVT:];
|
||||
|
||||
$HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+;
|
||||
|
||||
|
||||
#
|
||||
# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
|
||||
# SA (South East Asian: Thai, Lao, Khmer)
|
||||
@ -91,6 +91,7 @@ $PRcm = $PR $CM*;
|
||||
$QUcm = $QU $CM*;
|
||||
$SPcm = $SP $CM*;
|
||||
$SYcm = $SY $CM*;
|
||||
$WJcm = $WJ $CM*;
|
||||
|
||||
#
|
||||
# Each class of character can stand by itself as an unbroken token, with trailing combining stuff
|
||||
@ -114,6 +115,7 @@ $PR $CM+;
|
||||
$QU $CM+;
|
||||
$SP $CM+;
|
||||
$SY $CM+;
|
||||
$WJ $CM+;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
@ -131,12 +133,19 @@ $CR $LF {100};
|
||||
|
||||
# LB 4 x SP
|
||||
# x ZW
|
||||
$LB3NonBreaks [$SP $ZW];
|
||||
$ZW [$SP $ZW];
|
||||
$LB5NonBreaks $CM* [$SP $ZW];
|
||||
|
||||
# LB 5 Break after zero width space
|
||||
$LB5Breaks = [$LB3Breaks $ZW];
|
||||
|
||||
# LB 6
|
||||
#
|
||||
# Korean Syllable Definitions
|
||||
#
|
||||
|
||||
($HangulSyllable) $CM*;
|
||||
|
||||
# LB 7 Combining marks. TODO: get it right!
|
||||
# $SP $CM needs to behave like $ID.
|
||||
# X $CM needs to behave like X, where X is not $SP.
|
||||
@ -163,10 +172,8 @@ $CLcm $SP* $NScm;
|
||||
($B2cm)+;
|
||||
|
||||
# LB 11b
|
||||
$LB5NonBreaks $CM* $GLcm .?;
|
||||
$LB5NonBreaks $CM* $GLcm $LB5NonBreaks $CM*;
|
||||
$GLcm $LB3NonBreaks?;
|
||||
$GLcm $LB5NonBreaks $CM*;
|
||||
$LB5NonBreaks $CM* ($GLcm | $WJcm);
|
||||
($GLcm | $WJcm) .?;
|
||||
|
||||
# LB 12
|
||||
$LB12NonBreaks = [$LB5NonBreaks - $SP];
|
||||
@ -184,14 +191,12 @@ $QUcm $LB5NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
|
||||
$LB14NonBreaks = [$LB12NonBreaks - $CB];
|
||||
$LB14CanBreakAfter = $LB14NonBreaks $CM* | $SP $CM+;
|
||||
|
||||
|
||||
# LB 15
|
||||
$LB14CanBreakAfter ($BAcm | $HYcm | $NScm);
|
||||
$BBcm [^$CB];
|
||||
$BBcm [^$CB $CR $LF $BK $NL $ZW] $CM*;
|
||||
|
||||
# LB 16
|
||||
#($ALcm | $IDcm | $SP $CM+ | $INcm | $NUcm) $INcm;
|
||||
$ALcm $INcm;
|
||||
$CM+ $INcm; # by rule 7c, any otherwise unattached CM behaves as AL
|
||||
$IDcm $INcm;
|
||||
@ -206,11 +211,8 @@ $ALcm+ $NUcm; # includes $LB19
|
||||
$CM+ $NUcm; # Rule 7c
|
||||
$NUcm $ALcm+;
|
||||
|
||||
|
||||
|
||||
# LB 18
|
||||
$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm? $POcm?;
|
||||
#$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm?;
|
||||
|
||||
# LB 19
|
||||
$CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL
|
||||
@ -226,7 +228,6 @@ $CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL
|
||||
|
||||
!!reverse;
|
||||
|
||||
|
||||
$CM+ $ALPlus;
|
||||
$CM+ $BA;
|
||||
$CM+ $BB;
|
||||
@ -246,6 +247,7 @@ $CM+ $PR;
|
||||
$CM+ $QU;
|
||||
$CM+ $SP;
|
||||
$CM+ $SY;
|
||||
$CM+ $WJ;
|
||||
|
||||
# LB 3
|
||||
|
||||
@ -262,6 +264,9 @@ $LF $CR;
|
||||
|
||||
# LB 6 Jamo is treated like an alphabet
|
||||
|
||||
$BackHangulSyllable = $L+ | ($T* ($V+$LV? | $LV | $LVT) $L*) | $T+;
|
||||
$CM* $BackHangulSyllable;
|
||||
|
||||
# LB 7 Combining marks.
|
||||
# $SP $CM needs to behave like $ID.
|
||||
# X $CM needs to behave like X, where X is not $SP.
|
||||
@ -288,9 +293,9 @@ $CM* $NS $SP* $CM* $CL;
|
||||
($CM* $B2)+;
|
||||
|
||||
# LB 11b
|
||||
$CM* $GL $CM* $LB5NonBreaks;
|
||||
$CM* $LB5NonBreaks $CM* $GL;
|
||||
$LB3NonBreaks $CM* $GL;
|
||||
$CM* ($GL | $WJ) $CM* $LB5NonBreaks;
|
||||
$CM* $LB5NonBreaks $CM* ($GL | $WJ);
|
||||
. $CM* ($GL | $WJ);
|
||||
|
||||
# LB 12
|
||||
|
||||
@ -340,6 +345,9 @@ $CM* $ALPlus $CM+ / $LB5Breaks;
|
||||
|
||||
!!safe_reverse;
|
||||
|
||||
# LB 6
|
||||
$V+ $L;
|
||||
|
||||
# LB 7
|
||||
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
|
||||
$CM+ $SP / .;
|
||||
@ -354,13 +362,16 @@ $SP+ $CM* $QU;
|
||||
$SP+ $CM* $CL;
|
||||
|
||||
# LB 18
|
||||
$IS+ $CM* $NU;
|
||||
($CM* $IS)+ $CM* $NU;
|
||||
$CL $CM* ($NU | $IS);
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
||||
# LB 6
|
||||
$V+ $T;
|
||||
|
||||
# LB 7
|
||||
[^$BK $CR $LF $NL $ZW $SP] $CM+;
|
||||
$SP $CM+ / [^$CM];
|
||||
|
@ -595,11 +595,11 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
||||
if(exec) TestExtended(); break;
|
||||
case 17: name = "TestMonkey";
|
||||
if(exec) {
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
TestMonkey(params);
|
||||
#else
|
||||
#else
|
||||
logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
break;
|
||||
default: name = ""; break; //needed to end loop
|
||||
@ -2295,7 +2295,6 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
// Rule (5). ALetter x ALetter
|
||||
if (fALetterSet->contains(c1) &&
|
||||
fALetterSet->contains(c2)) {
|
||||
@ -2494,6 +2493,7 @@ RBBILineMonkey::RBBILineMonkey()
|
||||
fAL = new UnicodeSet("[\\p{Line_break=AL}]", status);
|
||||
fID = new UnicodeSet("[\\p{Line_break=ID}]", status);
|
||||
fSA = new UnicodeSet("[\\p{Line_break=SA}]", status);
|
||||
fWJ = new UnicodeSet("[\\p{Line_break=WJ}]", status);
|
||||
fXX = new UnicodeSet("[\\p{Line_break=XX}]", status);
|
||||
|
||||
fAL->addAll(*fXX); // Default behavior for XX is identical to AL
|
||||
@ -2530,6 +2530,7 @@ RBBILineMonkey::RBBILineMonkey()
|
||||
fSets->addElement(fAI, status);
|
||||
fSets->addElement(fAL, status);
|
||||
fSets->addElement(fID, status);
|
||||
fSets->addElement(fWJ, status);
|
||||
fSets->addElement(fSA, status);
|
||||
// fSets->addElement(fXX, status);
|
||||
|
||||
@ -2600,7 +2601,8 @@ void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPo
|
||||
// advance over any CM class chars. (Line Break CM class is different from
|
||||
// grapheme cluster CM, so we need to do this even for HangulSyllables.
|
||||
// Line Break may eat additional stuff as combining, beyond what graphem cluster did.
|
||||
if (!(fBK->contains(*posChar) || *posChar==0x0a || *posChar==0x0d || *posChar==0x85)) {
|
||||
if (!(fBK->contains(*posChar) || fZW->contains(*posChar) || *posChar==0x0a
|
||||
|| *posChar==0x0d || *posChar==0x85)) {
|
||||
for (;;) {
|
||||
*nextChar = fText->char32At(nPos);
|
||||
if (!fCM->contains(*nextChar)) {
|
||||
@ -2791,11 +2793,21 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
||||
// LB 9 Don't break after OP SP*
|
||||
/// UBool cmFlag = FALSE;
|
||||
for (tPos=prevPos; ; tPos=fCharBI->preceding(tPos)) {
|
||||
while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
|
||||
tPos=fText->moveIndex32(tPos, -1);
|
||||
}
|
||||
if (fOP->contains(fText->char32At(tPos))) {
|
||||
break;
|
||||
}
|
||||
if (fSP->contains(prevChar) == FALSE
|
||||
|| fSP->contains(fText->char32At(tPos)) == FALSE
|
||||
if (fSP->contains(fText->char32At(tPos)) == TRUE) {
|
||||
int32_t temp = fText->moveIndex32(tPos, 1);
|
||||
if (fCM->contains(fText->char32At(temp))) {
|
||||
// if we have $SP$CM+ which is an $ID
|
||||
goto fall_through_9;
|
||||
}
|
||||
}
|
||||
// fSP->contains(prevChar) == FALSE ||
|
||||
if (fSP->contains(fText->char32At(tPos)) == FALSE
|
||||
|| tPos == 0) {
|
||||
/// || cmFlag == TRUE) {
|
||||
// if we have $SP$CM+ which is an $ID
|
||||
@ -2834,6 +2846,9 @@ fall_through_9:
|
||||
if (fGL->contains(thisChar) || fGL->contains(prevChar)) {
|
||||
continue;
|
||||
}
|
||||
if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 12 break after space
|
||||
if (fSP->contains(prevChar)) {
|
||||
@ -2896,6 +2911,10 @@ fall_through_9:
|
||||
nextPos = numEndIdx;
|
||||
pos = fCharBI->preceding(numEndIdx);
|
||||
thisChar = fText->char32At(pos);
|
||||
while (fCM->contains(thisChar)) {
|
||||
pos = fCharBI->preceding(pos);
|
||||
thisChar = fText->char32At(pos);
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
@ -3010,7 +3029,7 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
|
||||
{
|
||||
int count = 0;
|
||||
int i = 0;
|
||||
int forward[20];
|
||||
int forward[50];
|
||||
bi->setText(ustr);
|
||||
for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
|
||||
forward[count] = i;
|
||||
@ -3078,9 +3097,14 @@ void RBBITest::TestWordBreaks(void)
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
|
||||
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
|
||||
UChar str[25];
|
||||
UChar str[300];
|
||||
char *strlist[] =
|
||||
{
|
||||
"\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
|
||||
"\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
|
||||
"\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u179c\\u0027\\U000e0061\\u003a",
|
||||
"\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
|
||||
"\\u90ca\\u3588\\u009c\\u0953\\u194b",
|
||||
"\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
|
||||
"\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
|
||||
"\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
|
||||
@ -3124,7 +3148,7 @@ void RBBITest::TestWordBreaks(void)
|
||||
// RBBICharMonkey monkey;
|
||||
RBBIWordMonkey monkey;
|
||||
|
||||
int expected[20];
|
||||
int expected[50];
|
||||
int expectedcount = 0;
|
||||
|
||||
monkey.setText(ustr);
|
||||
@ -3144,7 +3168,7 @@ void RBBITest::TestWordBoundary(void)
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
// BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
|
||||
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
|
||||
UChar str[20];
|
||||
UChar str[50];
|
||||
char *strlist[] =
|
||||
{
|
||||
"\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
|
||||
@ -3182,7 +3206,7 @@ void RBBITest::TestWordBoundary(void)
|
||||
// printf("looping %d\n", loop);
|
||||
u_unescape(strlist[loop], str, 20);
|
||||
UnicodeString ustr(str);
|
||||
int forward[20];
|
||||
int forward[50];
|
||||
int count = 0;
|
||||
|
||||
bi->setText(ustr);
|
||||
@ -3217,9 +3241,21 @@ void RBBITest::TestLineBreaks(void)
|
||||
Locale locale("en");
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
|
||||
UChar str[20];
|
||||
UChar str[50];
|
||||
char *strlist[] =
|
||||
{
|
||||
"\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
|
||||
"\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
|
||||
"\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
|
||||
"\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
|
||||
"\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
|
||||
"\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
|
||||
"\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
|
||||
"\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
|
||||
"\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
|
||||
"\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
|
||||
"\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
|
||||
"\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
|
||||
"\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
|
||||
"\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
|
||||
"\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
|
||||
@ -3235,7 +3271,6 @@ void RBBITest::TestLineBreaks(void)
|
||||
"\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
|
||||
"\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
|
||||
"\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
|
||||
"\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
|
||||
"\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
|
||||
"\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
|
||||
"\\u2014\\u0020\\u000a\\u17c5\\u24fc",
|
||||
@ -3253,7 +3288,7 @@ void RBBITest::TestLineBreaks(void)
|
||||
UnicodeString ustr(str);
|
||||
RBBILineMonkey monkey;
|
||||
|
||||
int expected[20];
|
||||
int expected[50];
|
||||
int expectedcount = 0;
|
||||
|
||||
monkey.setText(ustr);
|
||||
@ -3386,6 +3421,8 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
|
||||
UnicodeString testText;
|
||||
int32_t numCharClasses;
|
||||
UVector *chClasses;
|
||||
int expected[TESTSTRINGLEN*2 + 1];
|
||||
int expectedCount = 0;
|
||||
char expectedBreaks[TESTSTRINGLEN*2 + 1];
|
||||
char forwardBreaks[TESTSTRINGLEN*2 + 1];
|
||||
char reverseBreaks[TESTSTRINGLEN*2+1];
|
||||
@ -3443,6 +3480,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
|
||||
memset(expectedBreaks, 0, sizeof(expectedBreaks));
|
||||
expectedBreaks[0] = 1;
|
||||
int32_t breakPos = 0;
|
||||
expectedCount = 0;
|
||||
for (;;) {
|
||||
breakPos = mk.next(breakPos);
|
||||
if (breakPos == -1) {
|
||||
@ -3452,6 +3490,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
|
||||
errln("breakPos > testText.length()");
|
||||
}
|
||||
expectedBreaks[breakPos] = 1;
|
||||
expected[expectedCount ++] = breakPos;
|
||||
}
|
||||
|
||||
// Find the break positions using forward iteration
|
||||
@ -3528,20 +3567,13 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint
|
||||
|
||||
// Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
|
||||
UnicodeString errorText = "<data>";
|
||||
/*** if (strcmp(errorType, "next()") == 0) {
|
||||
/***if (strcmp(errorType, "next()") == 0) {
|
||||
startContext = 0;
|
||||
int j = i;
|
||||
while (true) {
|
||||
if (forwardBreaks[j ++] != 0) {
|
||||
printf("%d\n", j);
|
||||
break;
|
||||
}
|
||||
if (j % 100 == 0) {
|
||||
printf("continue %d\n", j);
|
||||
}
|
||||
}
|
||||
endContext = j + 1;
|
||||
endContext = testText.length();
|
||||
|
||||
printStringBreaks(testText, expected, expectedCount);
|
||||
}***/
|
||||
|
||||
for (ci=startContext; ci<endContext;) {
|
||||
UnicodeString hexChars("0123456789abcdef");
|
||||
UChar32 c;
|
||||
|
Loading…
Reference in New Issue
Block a user