ICU-2924 RBBI Line Break Rule Updates, work in progress.

X-SVN-Rev: 12701
This commit is contained in:
Andy Heninger 2003-07-28 06:40:25 +00:00
parent 468a10f112
commit 6bbbeb7637
3 changed files with 88 additions and 48 deletions

View File

@ -7,6 +7,11 @@
# Implement default line breaking as defined by Unicode TR 14.
#
# Known Deviations from TR14:
# LB 7a The Sequence SP CM+ is not treated as an ID.
# The SP in SP CM is not distinguished from any other SP.
# LB 14a, break before and after CB, is not implemented.
#
# Character Classes defined by TR 14.
@ -44,12 +49,6 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
#
# Character classes from TR 29. Needed for finding characters.
#
#
$Extend = [:Grapheme_Extend = TRUE:];
#
# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
@ -63,23 +62,24 @@ $ALPlus = $AL | $AI | $SA | $XX;
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
#
$ALcm = $ALPlus $CM*;
$BAcm = $BA $Extend*;
$BBcm = $BB $Extend*;
$B2cm = $B2 $Extend*;
$CLcm = $CL $Extend*;
$EXcm = $EX $Extend*;
$GLcm = $GL $Extend*;
$HYcm = $HY $Extend*;
$IDcm = ($ID $CM* | $SP $CM+);
$INcm = $IN $Extend*;
$IScm = $IS $Extend*;
$NScm = $NS $Extend*;
$NUcm = $NU $Extend*;
$OPcm = $OP $Extend*;
$POcm = $PO $Extend*;
$PRcm = $PR $Extend*;
$QUcm = $QU $Extend*;
$SYcm = $SY $Extend*;
$BAcm = $BA $CM*;
$BBcm = $BB $CM*;
$B2cm = $B2 $CM*;
$CLcm = $CL $CM*;
$EXcm = $EX $CM*;
$GLcm = $GL $CM*;
$HYcm = $HY $CM*;
$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$NScm = $NS $CM*;
$NUcm = $NU $CM*;
$OPcm = $OP $CM*;
$POcm = $PO $CM*;
$PRcm = $PR $CM*;
$QUcm = $QU $CM*;
$SPcm = $SP $CM*;
$SYcm = $SY $CM*;
# New Lines. Always break after, never break before.
@ -90,22 +90,22 @@ $SYcm = $SY $Extend*;
# appears at the end of line break rule.
#
$NLF = $BK | $CR | $LF | $NL | $CR $LF;
$EndingsSoft = $SP* $ZW*;
$EndingsHard = $SP* $ZW* $NLF;
$EndingsSoft = ($ZW* $SP)* $ZW*;
$EndingsHard = ($ZW* $SP)* $ZW* $NLF;
#
# Openings Sequences that can precede Words, and that should not be separated from them.
# Rules LB 9, 10
#
$Openings = (($QUcm $SP*)? $OPcm $SP*)*;
$Openings = (($QUcm ($ZW* $SP)*)? $OPcm ($ZW* $SP)*)*;
#
# Closings Sequences that follow words, and that should not be separated from them,
# Rules LB 8, 11, 15
$Closings = ($SP*( ($CL ($SP* $NScm)? | $EX | $IS | $SY) $Extend*) | $BAcm | $HYcm | $NScm)*;
$Closings = (($ZW* $SP)*( ($CLcm (($ZW* $SP)* $NScm)? | $EX | $IS | $SY) $CM*) | $BAcm | $HYcm | $NScm)*;
$WordClosings = ($CLcm | $EXcm | $IScm | $SYcm | $BAcm | $HYcm | $NScm)*;
$WordClosings = ($SP* $CLcm | $SP* $EXcm | $SP* $IScm | $SP* $SYcm | $BAcm | $HYcm | $NScm)*;
#
# Words. Includes mixed Alpha-numerics.
@ -115,12 +115,14 @@ $Number = $PRcm? ($OPcm | $HYcm)? $NU ($NU | $IS)* $CL? $POcm?; # Numbe
# Regex form, rather than rule 18
# Alpha-numeric. 16, 17
$Word = ($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?) |
$INcm |
$CB; # Deviation from TR for $CB
$Word = ($ALcm | $NUcm)+ $INcm* |
$IDcm ($POcm? | $INcm*) |
$CM+ ($POcm? | $INcm*) | # CM with no base is like ID (LB 7a)
$INcm+ |
$CB; # Deviation from Unicode spec for $CB
# We treat as a single char word
$Dashes = (($B2cm $SP*)*); # Dashes 11a
$Dashes = (($B2cm ($ZW* $SP)*)*); # Dashes 11a
@ -132,11 +134,11 @@ $HYMinus = $HYcm ($NUcm ($NUcm | $IS)* $CL? $POcm?)?; # For Rle LB15, Don'
$Word15 = $Openings? (
($BBcm* $Openings ($Word | $Number | $Dashes)? ($BAcm | $HYMinus | $NScm)*) | # Rule 15. Stuff sticks around words.
$BBcm* [^[:Cc:] $BK $CR $LF $ZW $SP $GL ] $Extend* | # Allow characters that don't meet the
$BBcm* [^$BK $CR $LF $ZW $SP $GL ] # more elaborate definitions for WORD
$BBcm* [^[:Cc:] $BK $CR $LF $NL $ZW ($ZW* $SP) $GL ] $CM* | # Allow characters that don't meet the
$BBcm* [^$BK $CR $LF $NL $ZW ($ZW* $SP) $GL ] # more elaborate definitions for WORD
) $WordClosings?; # to be glued.
$GluedWord = $Openings? ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together.
$GluedWord = $Openings? (($ZW* $SP)* $GLcm | $QUcm)? $Word15 ((($ZW* $SP)* $GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together.
# Rules 13, 14
#
@ -158,7 +160,8 @@ $Openings $GluedWord $Closings $EndingsHard{100};
# containing a space that may inhibit a break from occurring.
#
$SpaceGlue = ([$ZW $CL $IS $NS $OP] ($Extend* $SP)) | (($Extend* $SP)+ $OP);
$SpaceGlue = ([$ZW $CL $IS $NS $OP] ($CM* $SP)) | (($CM* $SP)+ $OP);
$ClumpingChars = [^$SP $BK $CR $LF];
!. . $ClumpingChars* ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);
#!. . $ClumpingChars* ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);
!.*;

View File

@ -2532,7 +2532,6 @@ RBBILineMonkey::RBBILineMonkey()
fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
if (U_FAILURE(status)) {
deferredStatus = status;
}
@ -2559,19 +2558,38 @@ int32_t RBBILineMonkey::next(int32_t prevPos) {
return -1;
}
// We need to figure out where the next character of interest starts
// Depends on the previous char, and whether it eats following CombiningMarks
// or not.
UChar32 c = fText->char32At(prevPos);
if (c == 0x0d || c == 0x0a || c == 0x85 || fBK->contains(c) || fSP->contains(c)) {
// char doesn't automatically combine with CM.
nextPos = fText->moveIndex32(prevPos, 1);
} else {
nextPos = fCharBI->following(prevPos);
for (;;) {
UChar32 c = fText->char32At(nextPos);
if (!fCM->contains(c)) {
break;
}
nextPos = fText->moveIndex32(nextPos, 1);
}
}
pos = prevPos;
// Loop runs once per position in the test text, until a break position
// is found.
nextPos = fCharBI->following(prevPos);
pos = prevPos;
for (;;) {
prevPos = pos;
pos = nextPos;
nextCPPos = fText->moveIndex32(pos, 1);
nextPos = fCharBI->following(pos); // Advance by grapheme cluster
UChar32 prevChar = fText->char32At(prevPos);
UChar32 thisChar = fText->char32At(pos);
nextCPPos = fText->moveIndex32(pos, 1);
nextPos = nextCPPos;
// Break at end of text.
if (pos >= fText->length()) {
break;
@ -2598,7 +2616,7 @@ int32_t RBBILineMonkey::next(int32_t prevPos) {
continue;
}
// LB 4 DOn't break before spaces or zero-width space.
// LB 4 Don't break before spaces or zero-width space.
if (fSP->contains(thisChar)) {
continue;
}
@ -2606,6 +2624,23 @@ int32_t RBBILineMonkey::next(int32_t prevPos) {
continue;
}
if (!fSP->contains(thisChar)) {
// nextPos advances over Hangul Syllables plus any chars
// of line break class CM.
// Advancing by a grapheme cluster with a character break iterator
// almost gets this, except Line Break CM includes some
// stuff that is not combining from the grapheme cluster definition.
nextPos = fCharBI->following(pos); // Advance by grapheme cluster
// now advance over any CM class chars that were missed
for (;;) {
UChar32 c = fText->char32At(nextPos);
if (!fCM->contains(c)) {
break;
}
nextPos = fText->moveIndex32(nextPos, 1);
}
}
// LB 5 Break after zero width space
if (fZW->contains(prevChar)) {
@ -2739,8 +2774,11 @@ fall_through_11:
}
// LB 17
// LB 17 ID x PO (Note: Leading CM behaves like ID)
// AL x NU
// NU x AL
if (fID->contains(prevChar) && fPO->contains(thisChar) ||
fCM->contains(prevChar) && fPO->contains(thisChar) ||
fAL->contains(prevChar) && fNU->contains(thisChar) ||
fNU->contains(prevChar) && fAL->contains(thisChar) ) {
continue;
@ -2773,7 +2811,6 @@ fall_through_11:
}
// We should never get here.
return pos;
}
@ -2916,7 +2953,7 @@ void RBBITest::TestMonkey(char *params) {
}
if (breakType == "line" || breakType == "all") {
#if 0
#if 1
// TODO: Enable test
RBBILineMonkey m;
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);

View File

@ -462,7 +462,7 @@ What is the proper use of the abbreviation pp.•? •Yes, I am definatelly 12"
# Surrogate line break tests.
#
<data>•\u4e01•\ud840\udc01•\u4e02•abc•\ue000•\udb80\udc01•</data>
<data>•\u4e01•\ud840\udc01•\u4e02•abc •\ue000 •\udb80\udc01•</data>
# Regression for bug 836
<data>•AAA•(AAA •</data>