ICU-2093 Update word break rules to latest Unicode TR, work in progress
X-SVN-Rev: 11472
This commit is contained in:
parent
f70487d239
commit
806b6d974f
@ -1,43 +1,49 @@
|
|||||||
#
|
#
|
||||||
# Copyright (C) 2002, International Business Machines Corporation and others.
|
# Copyright (C) 2002, 2003, International Business Machines Corporation and others.
|
||||||
# All Rights Reserved.
|
# All Rights Reserved.
|
||||||
#
|
#
|
||||||
# file: word.txt
|
# file: word.txt
|
||||||
#
|
#
|
||||||
# ICU Word Break Rules
|
# ICU Word Break Rules
|
||||||
# See Unicode Technical Report #29.
|
# See Unicode Standard Annex #29.
|
||||||
# These rules are based on the proposed draft dated 2002-08-06
|
# These rules are based on the proposed draft dated 2003-03-31
|
||||||
#
|
#
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
####################################################################################
|
####################################################################################
|
||||||
#
|
#
|
||||||
# Definitions imported from Line Break Rules.
|
# Character class definitions from TR 29
|
||||||
#
|
#
|
||||||
####################################################################################
|
####################################################################################
|
||||||
$Numeric = [ \u0030-\u0039 \u0660-\u0669 \u06F0-\u06F9 \u0966-\u096F \u09E6-\u09EF
|
$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||||
\u0A66-\u0A6F \u0AE6-\u0AEF \u0B66-\u0B6F \u0BE7-\u0BEF \u0C66-\u0C6F
|
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||||
\u0CE6-\u0CEF \u0D66-\u0D6F \u0E50-\u0E59 \u0ED0-\u0ED9 \u0F20-\u0F29
|
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
|
||||||
\u1040-\u1049 \u1369-\u1371 \u17E0-\u17E9 \u1810-\u1819 \U0001D7CE-\U0001D7FF];
|
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
|
||||||
|
|
||||||
|
|
||||||
|
$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
|
||||||
|
- [:Ideographic:]
|
||||||
|
- [:Katakana:]
|
||||||
|
- [:Script = Thai:]
|
||||||
|
- [:Script = Lao:]
|
||||||
|
- [:Script = Hiragana:]];
|
||||||
|
|
||||||
|
$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:]
|
||||||
|
[:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]];
|
||||||
|
|
||||||
|
$MidNumLet = [[:name = FULL STOP:] [:name = COLON:]];
|
||||||
|
|
||||||
|
$MidNum = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
|
||||||
|
$Numeric = [:LineBreak = Numeric:];
|
||||||
|
|
||||||
|
|
||||||
####################################################################################
|
|
||||||
#
|
|
||||||
# Definitions imported from Character Break Rules.
|
|
||||||
#
|
|
||||||
####################################################################################
|
|
||||||
#
|
#
|
||||||
# Character Class Definitions.
|
# Character Class Definitions.
|
||||||
# The names are those from TR29.
|
# The names are those from TR29.
|
||||||
#
|
#
|
||||||
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
|
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
|
||||||
|
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||||
# Note on $Extend: Earlier versions of TR29 included Mc characters.
|
|
||||||
# To avoid test breakage, Mc is still included for the time being.
|
|
||||||
# $Extend = [[:Mn:] [:Me:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_Grapheme_Extend
|
|
||||||
$Extend = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_Grapheme_Extend
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -48,67 +54,63 @@ $Extend = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_G
|
|||||||
#
|
#
|
||||||
####################################################################################
|
####################################################################################
|
||||||
|
|
||||||
$Katakana = [[:Kana:] \u30fc \uff70 \uff9e-\uff9f];
|
|
||||||
$Hiragana = [[:Hira:]];
|
|
||||||
$Letter = [[[:Alphabetic:] \u02b9-\u02ba \u02c2-\u02cf \u02d2-\u02df \u02e5-\u02ed \u05f3] -
|
|
||||||
[[:IDEOGRAPHIC:] [:THAI:] [:LAO:] $Hiragana $Katakana ]];
|
|
||||||
$Format = [[:Cf:]];
|
$Format = [[:Cf:]];
|
||||||
|
|
||||||
$MidLetter = [\u0027 \u00ad \u05f4 \u2019];
|
|
||||||
|
|
||||||
$MidNumLet = [\u002e \u003a];
|
|
||||||
|
|
||||||
|
|
||||||
# From Line Break, IS - Numeric Separator (Infix)
|
# Rule 3: Treat a grapheme cluster as if it were a single character.
|
||||||
# $IS = [\u002c \u002e \u003a \u003b \u0589];
|
# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
|
||||||
$MidNum = [\u002c \u003b \u0589];
|
# because we don't need to find the boundaries between adjacent syllables -
|
||||||
|
# they won't be word boundaries.
|
||||||
#
|
#
|
||||||
# "Extended" definitions. Classes of characters including trailing combining chars and,
|
|
||||||
# for types of chars that can appear in the interior of a word only,
|
|
||||||
# trailing format characters.
|
|
||||||
#
|
|
||||||
$LetterEx = $Letter $Extend*;
|
|
||||||
$NumericEx = $Numeric $Extend*;
|
|
||||||
$MidNumExF = $MidNum $Extend* $Format*;
|
|
||||||
$MidNumLetExF = $MidNumLet $Extend* $Format*;
|
|
||||||
$MidLetterExF = $MidLetter $Extend* $Format*;
|
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Numbers. Rules 6, 9, 10 from the TR.
|
# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char.
|
||||||
#
|
#
|
||||||
$NumberSequence = $NumericEx ($Format* ($MidNumExF | $MidNumLetExF)? $NumericEx)*;
|
$ALetterEx = $ALetter $Extend* $Format*;
|
||||||
|
$NumericEx = $Numeric $Extend* $Format*;
|
||||||
|
$MidNumEx = $MidNum $Extend* $Format*;
|
||||||
|
$MidNumLetEx = $MidNumLet $Extend* $Format*;
|
||||||
|
$MidLetterEx = $MidLetter $Extend* $Format*;
|
||||||
|
$KatakanaEx = $Katakana $Extend* $Format*;
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# Numbers. Rules 8, 11, 12 from the TR.
|
||||||
|
#
|
||||||
|
$NumberSequence = $NumericEx (($MidNumEx | $MidNumLetEx)? $NumericEx)*;
|
||||||
$NumberSequence {100};
|
$NumberSequence {100};
|
||||||
|
|
||||||
#
|
#
|
||||||
# Words. Alpha-numerics. Rule 3 - 10
|
# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10
|
||||||
# - must include at least one letter.
|
# - must include at least one letter.
|
||||||
# - may include both letters and numbers.
|
# - may include both letters and numbers.
|
||||||
# - may include MidLetter, MidNum punctuation.
|
# - may include MidLetter, MidNum punctuation.
|
||||||
#
|
#
|
||||||
$LetterSequence = $LetterEx ($Format* ($MidLetterExF | $MidNumLetExF)? $LetterEx)*;
|
$LetterSequence = $ALetterEx (($MidLetterEx | $MidNumLetEx)? $ALetterEx)*; # rules #6, #7
|
||||||
$NumberSequence? $LetterSequence ($NumberSequence | $LetterSequence)* {200};
|
$NumberSequence? $LetterSequence ($NumberSequence | $LetterSequence)* {200};
|
||||||
|
|
||||||
#
|
#
|
||||||
# Hiragana and Katakana
|
# Do not break between Katakana. Rule #13.
|
||||||
#
|
#
|
||||||
$Hiragana $Extend* {300};
|
$KatakanaEx+ {300};
|
||||||
$Katakana $Extend* ($Format* $Katakana $Extend*)* {300};
|
[:Hiragana:] $Extend* {300};
|
||||||
|
|
||||||
#
|
#
|
||||||
# Ideographic Characters. Stand by themselves as words.
|
# Ideographic Characters. Stand by themselves as words.
|
||||||
|
# Separated from the "Everything Else" rule, below, only so that they
|
||||||
|
# can be tagged with a return value. TODO: is this what we want?
|
||||||
#
|
#
|
||||||
[:IDEOGRAPHIC:] $Extend* {400};
|
[:IDEOGRAPHIC:] $Extend* {400};
|
||||||
|
|
||||||
#
|
#
|
||||||
# Everything Else, with no tag.
|
# Everything Else, with no tag.
|
||||||
# Non-Control chars combine with $Extend (combining) chars.
|
# Non-Control chars combine with $Extend (combining) chars.
|
||||||
# Controls are returned by themselves.
|
# Controls do not.
|
||||||
#
|
#
|
||||||
[^$Control] $Extend*;
|
[^$Control [:Ideographic:]] $Extend*;
|
||||||
\r\n;
|
[\u000d][\u000a];
|
||||||
.;
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Reverse Rules. Back up over any of the chars that can group together.
|
# Reverse Rules. Back up over any of the chars that can group together.
|
||||||
@ -121,6 +123,7 @@ $Katakana $Extend* ($Format* $Katakana $Extend*)* {300};
|
|||||||
# reaches something that can only be the start (and probably only) char in a "word".
|
# reaches something that can only be the start (and probably only) char in a "word".
|
||||||
# A space or punctuation meets the test.
|
# A space or punctuation meets the test.
|
||||||
#
|
#
|
||||||
$NonStarters = [$Numeric $Letter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format \u000a];
|
$NonStarters = [$Numeric $ALetter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format];
|
||||||
|
|
||||||
! $NonStarters* .;
|
!.*;
|
||||||
|
#! ($NonStarters* | \n \r) .;
|
||||||
|
@ -352,7 +352,7 @@ void RBBIAPITest::TestFirstNextFollowing()
|
|||||||
doTest(testString, p, q, 17, " here.");
|
doTest(testString, p, q, 17, " here.");
|
||||||
// hindi starts here
|
// hindi starts here
|
||||||
p=q;
|
p=q;
|
||||||
q=charIter1->next(4);
|
q=charIter1->next(5);
|
||||||
doTest(testString, p, q, 22, " \\u092d\\u093e\\u0930\\u0924");
|
doTest(testString, p, q, 22, " \\u092d\\u093e\\u0930\\u0924");
|
||||||
p=q;
|
p=q;
|
||||||
q=charIter1->next(2);
|
q=charIter1->next(2);
|
||||||
@ -515,7 +515,10 @@ void RBBIAPITest::TestLastPreviousPreceding()
|
|||||||
doTest(testString, p, q, 31, "\\u0964");
|
doTest(testString, p, q, 31, "\\u0964");
|
||||||
p=q;
|
p=q;
|
||||||
q=charIter1->previous();
|
q=charIter1->previous();
|
||||||
doTest(testString, p, q, 29, "\\u0939\\u094c");
|
doTest(testString, p, q, 30, "\\u094c");
|
||||||
|
p=q;
|
||||||
|
q=charIter1->previous();
|
||||||
|
doTest(testString, p, q, 29, "\\u0939");
|
||||||
q=charIter1->preceding(26);
|
q=charIter1->preceding(26);
|
||||||
doTest(testString, 26, q, 23, "\\u0938\\u0941\\u0902");
|
doTest(testString, 26, q, 23, "\\u0938\\u0941\\u0902");
|
||||||
q=charIter1->preceding(16);
|
q=charIter1->preceding(16);
|
||||||
@ -609,7 +612,7 @@ void RBBIAPITest::TestIsBoundary(){
|
|||||||
errln("FAIL: in construction");
|
errln("FAIL: in construction");
|
||||||
else{
|
else{
|
||||||
charIter1->setText(testString1);
|
charIter1->setText(testString1);
|
||||||
int32_t bounds1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 20, 21, 22, 23, 25, 26};
|
int32_t bounds1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 21, 22, 23, 24, 25, 26};
|
||||||
doBoundaryTest(*charIter1, testString1, bounds1);
|
doBoundaryTest(*charIter1, testString1, bounds1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2178,8 +2178,7 @@ void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!seen2) {
|
if (!seen2) {
|
||||||
errln("No break between U+" + UCharToUnicodeString(c1)
|
errln("No Break between \\U%04x and \\U%04x", c1, c2);
|
||||||
+ " and U+" + UCharToUnicodeString(c2));
|
|
||||||
errCount++;
|
errCount++;
|
||||||
if (errCount >= 75)
|
if (errCount >= 75)
|
||||||
return;
|
return;
|
||||||
@ -2206,8 +2205,8 @@ void RBBITest::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)
|
|||||||
tb.setText(work);
|
tb.setText(work);
|
||||||
for (int32_t k = tb.first(); k != BreakIterator::DONE; k = tb.next())
|
for (int32_t k = tb.first(); k != BreakIterator::DONE; k = tb.next())
|
||||||
if (k == 2) {
|
if (k == 2) {
|
||||||
errln("Break between CR and LF in string U+" + UCharToUnicodeString(work[0]) +
|
errln("Break between CR and LF in string U\\%04x U\\%04x U\\%04x U\\%04x",
|
||||||
", U+d U+a U+" + UCharToUnicodeString(work[3]));
|
work[0], work[1], work[2], work[3]);
|
||||||
errCount++;
|
errCount++;
|
||||||
if (errCount >= 75)
|
if (errCount >= 75)
|
||||||
return;
|
return;
|
||||||
|
Loading…
Reference in New Issue
Block a user