ICU-2093 Update word break rules to latest Unicode TR, work in progress

X-SVN-Rev: 11472
This commit is contained in:
Andy Heninger 2003-04-08 05:35:13 +00:00
parent f70487d239
commit 806b6d974f
3 changed files with 63 additions and 58 deletions

View File

@ -1,43 +1,49 @@
# #
# Copyright (C) 2002, International Business Machines Corporation and others. # Copyright (C) 2002, 2003, International Business Machines Corporation and others.
# All Rights Reserved. # All Rights Reserved.
# #
# file: word.txt # file: word.txt
# #
# ICU Word Break Rules # ICU Word Break Rules
# See Unicode Technical Report #29. # See Unicode Standard Annex #29.
# These rules are based on the proposed draft dated 2002-08-06 # These rules are based on the proposed draft dated 2003-03-31
# #
#################################################################################### ####################################################################################
# #
# Definitions imported from Line Break Rules. # Character class definitions from TR 29
# #
#################################################################################### ####################################################################################
$Numeric = [ \u0030-\u0039 \u0660-\u0669 \u06F0-\u06F9 \u0966-\u096F \u09E6-\u09EF $Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
\u0A66-\u0A6F \u0AE6-\u0AEF \u0B66-\u0B6F \u0BE7-\u0BEF \u0C66-\u0C6F [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
\u0CE6-\u0CEF \u0D66-\u0D6F \u0E50-\u0E59 \u0ED0-\u0ED9 \u0F20-\u0F29 [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
\u1040-\u1049 \u1369-\u1371 \u17E0-\u17E9 \u1810-\u1819 \U0001D7CE-\U0001D7FF]; [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
- [:Ideographic:]
- [:Katakana:]
- [:Script = Thai:]
- [:Script = Lao:]
- [:Script = Hiragana:]];
$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:]
[:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]];
$MidNumLet = [[:name = FULL STOP:] [:name = COLON:]];
$MidNum = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
$Numeric = [:LineBreak = Numeric:];
####################################################################################
#
# Definitions imported from Character Break Rules.
#
####################################################################################
# #
# Character Class Definitions. # Character Class Definitions.
# The names are those from TR29. # The names are those from TR29.
# #
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]]; $Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
$Extend = [[:Grapheme_Extend = TRUE:]];
# Note on $Extend: Earlier versions of TR29 included Mc characters.
# To avoid test breakage, Mc is still included for the time being.
# $Extend = [[:Mn:] [:Me:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_Grapheme_Extend
$Extend = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_Grapheme_Extend
@ -48,67 +54,63 @@ $Extend = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_G
# #
#################################################################################### ####################################################################################
$Katakana = [[:Kana:] \u30fc \uff70 \uff9e-\uff9f];
$Hiragana = [[:Hira:]];
$Letter = [[[:Alphabetic:] \u02b9-\u02ba \u02c2-\u02cf \u02d2-\u02df \u02e5-\u02ed \u05f3] -
[[:IDEOGRAPHIC:] [:THAI:] [:LAO:] $Hiragana $Katakana ]];
$Format = [[:Cf:]]; $Format = [[:Cf:]];
$MidLetter = [\u0027 \u00ad \u05f4 \u2019];
$MidNumLet = [\u002e \u003a];
# From Line Break, IS - Numeric Separator (Infix) # Rule 3: Treat a grapheme cluster as if it were a single character.
# $IS = [\u002c \u002e \u003a \u003b \u0589]; # Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
$MidNum = [\u002c \u003b \u0589]; # because we don't need to find the boundaries between adjacent syllables -
# they won't be word boundaries.
# #
# "Extended" definitions. Classes of characters including trailing combining chars and,
# for types of chars that can appear in the interior of a word only,
# trailing format characters.
#
$LetterEx = $Letter $Extend*;
$NumericEx = $Numeric $Extend*;
$MidNumExF = $MidNum $Extend* $Format*;
$MidNumLetExF = $MidNumLet $Extend* $Format*;
$MidLetterExF = $MidLetter $Extend* $Format*;
# #
# Numbers. Rules 6, 9, 10 from the TR. # "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char.
# #
$NumberSequence = $NumericEx ($Format* ($MidNumExF | $MidNumLetExF)? $NumericEx)*; $ALetterEx = $ALetter $Extend* $Format*;
$NumericEx = $Numeric $Extend* $Format*;
$MidNumEx = $MidNum $Extend* $Format*;
$MidNumLetEx = $MidNumLet $Extend* $Format*;
$MidLetterEx = $MidLetter $Extend* $Format*;
$KatakanaEx = $Katakana $Extend* $Format*;
#
# Numbers. Rules 8, 11, 12 from the TR.
#
$NumberSequence = $NumericEx (($MidNumEx | $MidNumLetEx)? $NumericEx)*;
$NumberSequence {100}; $NumberSequence {100};
# #
# Words. Alpha-numerics. Rule 3 - 10 # Words. Alpha-numerics. Rule 5, 6, 7, 9, 10
# - must include at least one letter. # - must include at least one letter.
# - may include both letters and numbers. # - may include both letters and numbers.
# - may include MidLetter, MidNumber punctuation. # - may include MidLetter, MidNumber punctuation.
# #
$LetterSequence = $LetterEx ($Format* ($MidLetterExF | $MidNumLetExF)? $LetterEx)*; $LetterSequence = $ALetterEx (($MidLetterEx | $MidNumLetEx)? $ALetterEx)*; # rules #6, #7
$NumberSequence? $LetterSequence ($NumberSequence | $LetterSequence)* {200}; $NumberSequence? $LetterSequence ($NumberSequence | $LetterSequence)* {200};
# #
# Hiragana and Katakana # Do not break between Katakana. Rule #13.
# #
$Hiragana $Extend* {300}; $KatakanaEx+ {300};
$Katakana $Extend* ($Format* $Katakana $Extend*)* {300}; [:Hiragana:] $Extend* {300};
# #
# Ideographic Characters. Stand by themselves as words. # Ideographic Characters. Stand by themselves as words.
# Separated from the "Everything Else" rule, below, only so that they
# can be tagged with a return value. TODO: is this what we want?
# #
[:IDEOGRAPHIC:] $Extend* {400}; [:IDEOGRAPHIC:] $Extend* {400};
# #
# Everything Else, with no tag. # Everything Else, with no tag.
# Non-Control chars combine with $Extend (combining) chars. # Non-Control chars combine with $Extend (combining) chars.
# Controls are returned by themselves. # Controls do not.
# #
[^$Control] $Extend*; [^$Control [:Ideographic:]] $Extend*;
\r\n; [\u000d][\u000a];
.;
# #
# Reverse Rules. Back up over any of the chars that can group together. # Reverse Rules. Back up over any of the chars that can group together.
@ -121,6 +123,7 @@ $Katakana $Extend* ($Format* $Katakana $Extend*)* {300};
# reaches something that can only be the start (and probably only) char in a "word". # reaches something that can only be the start (and probably only) char in a "word".
# A space or punctuation meets the test. # A space or punctuation meets the test.
# #
$NonStarters = [$Numeric $Letter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format \u000a]; $NonStarters = [$Numeric $ALetter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format];
! $NonStarters* .; !.*;
#! ($NonStarters* | \n \r) .;

View File

@ -352,7 +352,7 @@ void RBBIAPITest::TestFirstNextFollowing()
doTest(testString, p, q, 17, " here."); doTest(testString, p, q, 17, " here.");
// hindi starts here // hindi starts here
p=q; p=q;
q=charIter1->next(4); q=charIter1->next(5);
doTest(testString, p, q, 22, " \\u092d\\u093e\\u0930\\u0924"); doTest(testString, p, q, 22, " \\u092d\\u093e\\u0930\\u0924");
p=q; p=q;
q=charIter1->next(2); q=charIter1->next(2);
@ -515,7 +515,10 @@ void RBBIAPITest::TestLastPreviousPreceding()
doTest(testString, p, q, 31, "\\u0964"); doTest(testString, p, q, 31, "\\u0964");
p=q; p=q;
q=charIter1->previous(); q=charIter1->previous();
doTest(testString, p, q, 29, "\\u0939\\u094c"); doTest(testString, p, q, 30, "\\u094c");
p=q;
q=charIter1->previous();
doTest(testString, p, q, 29, "\\u0939");
q=charIter1->preceding(26); q=charIter1->preceding(26);
doTest(testString, 26, q, 23, "\\u0938\\u0941\\u0902"); doTest(testString, 26, q, 23, "\\u0938\\u0941\\u0902");
q=charIter1->preceding(16); q=charIter1->preceding(16);
@ -609,7 +612,7 @@ void RBBIAPITest::TestIsBoundary(){
errln("FAIL: in construction"); errln("FAIL: in construction");
else{ else{
charIter1->setText(testString1); charIter1->setText(testString1);
int32_t bounds1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 20, 21, 22, 23, 25, 26}; int32_t bounds1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 21, 22, 23, 24, 25, 26};
doBoundaryTest(*charIter1, testString1, bounds1); doBoundaryTest(*charIter1, testString1, bounds1);
} }

View File

@ -2178,8 +2178,7 @@ void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
} }
} }
if (!seen2) { if (!seen2) {
errln("No break between U+" + UCharToUnicodeString(c1) errln("No Break between \\U%04x and \\U%04x", c1, c2);
+ " and U+" + UCharToUnicodeString(c2));
errCount++; errCount++;
if (errCount >= 75) if (errCount >= 75)
return; return;
@ -2206,8 +2205,8 @@ void RBBITest::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)
tb.setText(work); tb.setText(work);
for (int32_t k = tb.first(); k != BreakIterator::DONE; k = tb.next()) for (int32_t k = tb.first(); k != BreakIterator::DONE; k = tb.next())
if (k == 2) { if (k == 2) {
errln("Break between CR and LF in string U+" + UCharToUnicodeString(work[0]) + errln("Break between CR and LF in string U\\%04x U\\%04x U\\%04x U\\%04x",
", U+d U+a U+" + UCharToUnicodeString(work[3])); work[0], work[1], work[2], work[3]);
errCount++; errCount++;
if (errCount >= 75) if (errCount >= 75)
return; return;