ICU-2924 RBBI Line Break Rule Updates, work in progress.
X-SVN-Rev: 12701
This commit is contained in:
parent
468a10f112
commit
6bbbeb7637
@ -7,6 +7,11 @@
|
||||
# Implement default line breaking as defined by Unicode TR 14.
|
||||
#
|
||||
|
||||
# Known Deviations from TR14:
|
||||
# LB 7a The Sequence SP CM+ is not treated as an ID.
|
||||
# The SP in SP CM is not distinguished from any other SP.
|
||||
# LB 14a, break before and after CB, is not implemented.
|
||||
|
||||
|
||||
#
|
||||
# Character Classes defined by TR 14.
|
||||
@ -44,12 +49,6 @@ $XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
|
||||
|
||||
#
|
||||
# Character classes from TR 29. Needed for finding characters.
|
||||
#
|
||||
#
|
||||
$Extend = [:Grapheme_Extend = TRUE:];
|
||||
|
||||
|
||||
#
|
||||
# Rule LB1. By default, treat AI (characters with ambiguous East Asian width),
|
||||
@ -63,23 +62,24 @@ $ALPlus = $AL | $AI | $SA | $XX;
|
||||
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
|
||||
#
|
||||
$ALcm = $ALPlus $CM*;
|
||||
$BAcm = $BA $Extend*;
|
||||
$BBcm = $BB $Extend*;
|
||||
$B2cm = $B2 $Extend*;
|
||||
$CLcm = $CL $Extend*;
|
||||
$EXcm = $EX $Extend*;
|
||||
$GLcm = $GL $Extend*;
|
||||
$HYcm = $HY $Extend*;
|
||||
$IDcm = ($ID $CM* | $SP $CM+);
|
||||
$INcm = $IN $Extend*;
|
||||
$IScm = $IS $Extend*;
|
||||
$NScm = $NS $Extend*;
|
||||
$NUcm = $NU $Extend*;
|
||||
$OPcm = $OP $Extend*;
|
||||
$POcm = $PO $Extend*;
|
||||
$PRcm = $PR $Extend*;
|
||||
$QUcm = $QU $Extend*;
|
||||
$SYcm = $SY $Extend*;
|
||||
$BAcm = $BA $CM*;
|
||||
$BBcm = $BB $CM*;
|
||||
$B2cm = $B2 $CM*;
|
||||
$CLcm = $CL $CM*;
|
||||
$EXcm = $EX $CM*;
|
||||
$GLcm = $GL $CM*;
|
||||
$HYcm = $HY $CM*;
|
||||
$IDcm = $ID $CM*;
|
||||
$INcm = $IN $CM*;
|
||||
$IScm = $IS $CM*;
|
||||
$NScm = $NS $CM*;
|
||||
$NUcm = $NU $CM*;
|
||||
$OPcm = $OP $CM*;
|
||||
$POcm = $PO $CM*;
|
||||
$PRcm = $PR $CM*;
|
||||
$QUcm = $QU $CM*;
|
||||
$SPcm = $SP $CM*;
|
||||
$SYcm = $SY $CM*;
|
||||
|
||||
|
||||
# New Lines. Always break after, never break before.
|
||||
@ -90,22 +90,22 @@ $SYcm = $SY $Extend*;
|
||||
# appears at the end of line break rule.
|
||||
#
|
||||
$NLF = $BK | $CR | $LF | $NL | $CR $LF;
|
||||
$EndingsSoft = $SP* $ZW*;
|
||||
$EndingsHard = $SP* $ZW* $NLF;
|
||||
$EndingsSoft = ($ZW* $SP)* $ZW*;
|
||||
$EndingsHard = ($ZW* $SP)* $ZW* $NLF;
|
||||
|
||||
|
||||
#
|
||||
# Openings Sequences that can precede Words, and that should not be separated from them.
|
||||
# Rules LB 9, 10
|
||||
#
|
||||
$Openings = (($QUcm $SP*)? $OPcm $SP*)*;
|
||||
$Openings = (($QUcm ($ZW* $SP)*)? $OPcm ($ZW* $SP)*)*;
|
||||
|
||||
#
|
||||
# Closings Sequences that follow words, and that should not be separated from them,
|
||||
# Rule LB 8, 11, 15
|
||||
$Closings = ($SP*( ($CL ($SP* $NScm)? | $EX | $IS | $SY) $Extend*) | $BAcm | $HYcm | $NScm)*;
|
||||
$Closings = (($ZW* $SP)*( ($CLcm (($ZW* $SP)* $NScm)? | $EX | $IS | $SY) $CM*) | $BAcm | $HYcm | $NScm)*;
|
||||
|
||||
$WordClosings = ($CLcm | $EXcm | $IScm | $SYcm | $BAcm | $HYcm | $NScm)*;
|
||||
$WordClosings = ($SP* $CLcm | $SP* $EXcm | $SP* $IScm | $SP* $SYcm | $BAcm | $HYcm | $NScm)*;
|
||||
|
||||
#
|
||||
# Words. Includes mixed Alpha-numerics.
|
||||
@ -115,12 +115,14 @@ $Number = $PRcm? ($OPcm | $HYcm)? $NU ($NU | $IS)* $CL? $POcm?; # Numbe
|
||||
# Regex form, rather than rule 18
|
||||
|
||||
# Alpha-numeric. 16, 17
|
||||
$Word = ($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?) |
|
||||
$INcm |
|
||||
$CB; # Deviation from TR for $CB
|
||||
$Word = ($ALcm | $NUcm)+ $INcm* |
|
||||
$IDcm ($POcm? | $INcm*) |
|
||||
$CM+ ($POcm? | $INcm*) | # CM with no base is like ID (LB 7a)
|
||||
$INcm+ |
|
||||
$CB; # Deviation from Unicode spec for $CB
|
||||
# We treat as a single char word
|
||||
|
||||
$Dashes = (($B2cm $SP*)*); # Dashes 11a
|
||||
$Dashes = (($B2cm ($ZW* $SP)*)*); # Dashes 11a
|
||||
|
||||
|
||||
|
||||
@ -132,11 +134,11 @@ $HYMinus = $HYcm ($NUcm ($NUcm | $IS)* $CL? $POcm?)?; # For Rle LB15, Don'
|
||||
|
||||
$Word15 = $Openings? (
|
||||
($BBcm* $Openings ($Word | $Number | $Dashes)? ($BAcm | $HYMinus | $NScm)*) | # Rule 15. Stuff sticks around words.
|
||||
$BBcm* [^[:Cc:] $BK $CR $LF $ZW $SP $GL ] $Extend* | # Allow characters that don't meet the
|
||||
$BBcm* [^$BK $CR $LF $ZW $SP $GL ] # more elaborate definitions for WORD
|
||||
$BBcm* [^[:Cc:] $BK $CR $LF $NL $ZW ($ZW* $SP) $GL ] $CM* | # Allow characters that don't meet the
|
||||
$BBcm* [^$BK $CR $LF $NL $ZW ($ZW* $SP) $GL ] # more elaborate definitions for WORD
|
||||
) $WordClosings?; # to be glued.
|
||||
|
||||
$GluedWord = $Openings? ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together.
|
||||
$GluedWord = $Openings? (($ZW* $SP)* $GLcm | $QUcm)? $Word15 ((($ZW* $SP)* $GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together.
|
||||
# Rules 13, 14
|
||||
|
||||
#
|
||||
@ -158,7 +160,8 @@ $Openings $GluedWord $Closings $EndingsHard{100};
|
||||
# containing a space that may inhibit a break from occurring.
|
||||
#
|
||||
|
||||
$SpaceGlue = ([$ZW $CL $IS $NS $OP] ($Extend* $SP)) | (($Extend* $SP)+ $OP);
|
||||
$SpaceGlue = ([$ZW $CL $IS $NS $OP] ($CM* $SP)) | (($CM* $SP)+ $OP);
|
||||
$ClumpingChars = [^$SP $BK $CR $LF];
|
||||
|
||||
!. . $ClumpingChars* ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);
|
||||
#!. . $ClumpingChars* ($SpaceGlue $ClumpingChars*)* (. | $LF $CR);
|
||||
!.*;
|
@ -2532,7 +2532,6 @@ RBBILineMonkey::RBBILineMonkey()
|
||||
|
||||
fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
|
||||
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
deferredStatus = status;
|
||||
}
|
||||
@ -2559,19 +2558,38 @@ int32_t RBBILineMonkey::next(int32_t prevPos) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
// We need to figure out where the next character of interest starts
|
||||
// Depends on the previous char, and whether it eats following CombiningMarks
|
||||
// or not.
|
||||
UChar32 c = fText->char32At(prevPos);
|
||||
if (c == 0x0d || c == 0x0a || c == 0x85 || fBK->contains(c) || fSP->contains(c)) {
|
||||
// char doesn't automatically combine with CM.
|
||||
nextPos = fText->moveIndex32(prevPos, 1);
|
||||
} else {
|
||||
nextPos = fCharBI->following(prevPos);
|
||||
for (;;) {
|
||||
UChar32 c = fText->char32At(nextPos);
|
||||
if (!fCM->contains(c)) {
|
||||
break;
|
||||
}
|
||||
nextPos = fText->moveIndex32(nextPos, 1);
|
||||
}
|
||||
}
|
||||
pos = prevPos;
|
||||
|
||||
|
||||
// Loop runs once per position in the test text, until a break position
|
||||
// is found.
|
||||
nextPos = fCharBI->following(prevPos);
|
||||
pos = prevPos;
|
||||
for (;;) {
|
||||
prevPos = pos;
|
||||
pos = nextPos;
|
||||
nextCPPos = fText->moveIndex32(pos, 1);
|
||||
nextPos = fCharBI->following(pos); // Advance by grapheme cluster
|
||||
|
||||
UChar32 prevChar = fText->char32At(prevPos);
|
||||
UChar32 thisChar = fText->char32At(pos);
|
||||
|
||||
nextCPPos = fText->moveIndex32(pos, 1);
|
||||
nextPos = nextCPPos;
|
||||
|
||||
// Break at end of text.
|
||||
if (pos >= fText->length()) {
|
||||
break;
|
||||
@ -2598,7 +2616,7 @@ int32_t RBBILineMonkey::next(int32_t prevPos) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 4 DOn't break before spaces or zero-width space.
|
||||
// LB 4 Don't break before spaces or zero-width space.
|
||||
if (fSP->contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
@ -2606,6 +2624,23 @@ int32_t RBBILineMonkey::next(int32_t prevPos) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!fSP->contains(thisChar)) {
|
||||
// nextPos advances over Hangul Syllables plus any chars
|
||||
// of line break class CM.
|
||||
// Advancing by a grapheme cluster with a character break iterator
|
||||
// almost gets this, except Line Break CM includes some
|
||||
// stuff that is not combining from the grapheme cluster definition.
|
||||
nextPos = fCharBI->following(pos); // Advance by grapheme cluster
|
||||
// now advance over any CM class chars that were missed
|
||||
for (;;) {
|
||||
UChar32 c = fText->char32At(nextPos);
|
||||
if (!fCM->contains(c)) {
|
||||
break;
|
||||
}
|
||||
nextPos = fText->moveIndex32(nextPos, 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// LB 5 Break after zero width space
|
||||
if (fZW->contains(prevChar)) {
|
||||
@ -2739,8 +2774,11 @@ fall_through_11:
|
||||
}
|
||||
|
||||
|
||||
// LB 17
|
||||
// LB 17 ID x PO (Note: Leading CM behaves like ID)
|
||||
// AL x NU
|
||||
// NU x AL
|
||||
if (fID->contains(prevChar) && fPO->contains(thisChar) ||
|
||||
fCM->contains(prevChar) && fPO->contains(thisChar) ||
|
||||
fAL->contains(prevChar) && fNU->contains(thisChar) ||
|
||||
fNU->contains(prevChar) && fAL->contains(thisChar) ) {
|
||||
continue;
|
||||
@ -2773,7 +2811,6 @@ fall_through_11:
|
||||
|
||||
}
|
||||
|
||||
// We should never get here.
|
||||
return pos;
|
||||
}
|
||||
|
||||
@ -2916,7 +2953,7 @@ void RBBITest::TestMonkey(char *params) {
|
||||
}
|
||||
|
||||
if (breakType == "line" || breakType == "all") {
|
||||
#if 0
|
||||
#if 1
|
||||
// TODO: Enable test
|
||||
RBBILineMonkey m;
|
||||
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
|
||||
|
2
icu4c/source/test/testdata/rbbitst.txt
vendored
2
icu4c/source/test/testdata/rbbitst.txt
vendored
@ -462,7 +462,7 @@ What is the proper use of the abbreviation pp.•? •Yes, I am definatelly 12"
|
||||
|
||||
# Surrogate line break tests.
|
||||
#
|
||||
<data>•\u4e01•\ud840\udc01•\u4e02•abc•\ue000•\udb80\udc01•</data>
|
||||
<data>•\u4e01•\ud840\udc01•\u4e02•abc •\ue000 •\udb80\udc01•</data>
|
||||
|
||||
# Regression for bug 836
|
||||
<data>•AAA•(AAA •</data>
|
||||
|
Loading…
Reference in New Issue
Block a user