ICU-3700 updated rules for Unicode 4.0.1.

X-SVN-Rev: 15286
This commit is contained in:
Eric Mader 2004-05-12 23:29:24 +00:00
parent 7f012c20cb
commit 0a217cf782
2 changed files with 28 additions and 7 deletions

View File

@ -212,10 +212,13 @@ $CM+ $NUcm; # Rule 7c
$NUcm $ALcm+;
# LB 18
$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm? $POcm?;
$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm | $SYcm)* $CLcm? $POcm?;
$PRcm $ALcm;
$PRcm $IDcm;
# LB 19
$CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL
$IScm $ALcm;
#
# Reverse Rules.
@ -326,11 +329,14 @@ $CM* $NU $CM+ / $LB5Breaks; # Rule 7c
$CM* $ALPlus $CM* $NU;
# LB 18
($CM* $PO)? ($CM* $CL)? ($CM* ($NU | $IS))* $CM* $NU ($CM* ($OP | $HY))? ($CM* $PR)?;
($CM* $PO)? ($CM* $CL)? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* $PR)?;
$CM* $ALPlus $CM* $PR;
$CM* ($ID | $BackHangulSyllable) $CM* $PR;
# LB 19
$CM* $ALPlus $CM* $ALPlus;
# The $CM* is from rule 7C, and unattached CM is treated as AL
$CM* $ALPlus $CM* $IS;
$CM* $ALPlus $CM+ / $LB5Breaks;
## problem state table can't handle lookahead when it is at the
@ -358,8 +364,8 @@ $SP+ $CM* $QU;
$SP+ $CM* $CL;
# LB 18
($CM* $IS)+ $CM* $NU;
$CL $CM* ($NU | $IS);
($CM* ($IS | $SY))+ $CM* $NU;
$CL $CM* ($NU | $IS | $SY);
## -------------------------------------------------
@ -384,3 +390,4 @@ $CL $CM* $SP+;
# LB 18
$HY $CM* $NU;
$IS $CM* $CL;
$SY $CM* $CL;

View File

@ -2538,7 +2538,7 @@ RBBILineMonkey::RBBILineMonkey()
"(\\p{Line_Break=PR}\\p{Line_Break=CM}*)?"
"((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
"\\p{Line_Break=NU}\\p{Line_Break=CM}*"
"((\\p{Line_Break=NU}|\\p{Line_Break=IS})\\p{Line_Break=CM}*)*"
"((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
"(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
"(\\p{Line_Break=PO}\\p{Line_Break=CM}*)?",
0, status);
@ -2799,7 +2799,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
fEX->contains(thisChar) ||
!fNU->contains(prevChar) && fIS->contains(thisChar) ||
fSY->contains(thisChar)) {
!fNU->contains(prevChar) && fSY->contains(thisChar)) {
continue;
}
@ -2910,6 +2910,14 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
}
}
if (fPR->contains(prevChar) && fAL->contains(thisChar)) {
continue;
}
if (fPR->contains(prevChar) && fID->contains(thisChar)) {
continue;
}
// LB 18b
if (fHY->contains(prevChar) || fBB->contains(thisChar)) {
break;
@ -2920,6 +2928,11 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
continue;
}
// LB 19b
if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
continue;
}
// LB 20 Break everywhere else
break;
@ -3238,6 +3251,7 @@ void RBBITest::TestLineBreaks(void)
UChar str[50];
static const char *strlist[] =
{
"\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
"\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
"\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
"\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
@ -3317,7 +3331,7 @@ void RBBITest::TestSentBreaks(void)
};
int loop;
int forward[100];
for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
u_unescape(strlist[loop], str, 100);
UnicodeString ustr(str);