ICU-2093 Update word break rules to latest Unicode TR, work in progress
X-SVN-Rev: 11472
This commit is contained in:
parent
f70487d239
commit
806b6d974f
@ -1,43 +1,49 @@
|
||||
#
|
||||
# Copyright (C) 2002, International Business Machines Corporation and others.
|
||||
# Copyright (C) 2002, 2003, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# file: word.txt
|
||||
#
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Technical Report #29.
|
||||
# These rules are based on the proposed draft dated 2002-08-06
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on the proposed draft dated 2003-03-31
|
||||
#
|
||||
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Definitions imported from Line Break Rules.
|
||||
# Character class definitions from TR 29
|
||||
#
|
||||
####################################################################################
|
||||
$Numeric = [ \u0030-\u0039 \u0660-\u0669 \u06F0-\u06F9 \u0966-\u096F \u09E6-\u09EF
|
||||
\u0A66-\u0A6F \u0AE6-\u0AEF \u0B66-\u0B6F \u0BE7-\u0BEF \u0C66-\u0C6F
|
||||
\u0CE6-\u0CEF \u0D66-\u0D6F \u0E50-\u0E59 \u0ED0-\u0ED9 \u0F20-\u0F29
|
||||
\u1040-\u1049 \u1369-\u1371 \u17E0-\u17E9 \u1810-\u1819 \U0001D7CE-\U0001D7FF];
|
||||
$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
|
||||
[:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
|
||||
|
||||
|
||||
$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
|
||||
- [:Ideographic:]
|
||||
- [:Katakana:]
|
||||
- [:Script = Thai:]
|
||||
- [:Script = Lao:]
|
||||
- [:Script = Hiragana:]];
|
||||
|
||||
$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:]
|
||||
[:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]];
|
||||
|
||||
$MidNumLet = [[:name = FULL STOP:] [:name = COLON:]];
|
||||
|
||||
$MidNum = [[:LineBreak = Infix_Numeric:] - $MidNumLet];
|
||||
$Numeric = [:LineBreak = Numeric:];
|
||||
|
||||
|
||||
####################################################################################
|
||||
#
|
||||
# Definitions imported from Character Break Rules.
|
||||
#
|
||||
####################################################################################
|
||||
#
|
||||
# Character Class Definitions.
|
||||
# The names are those from TR29.
|
||||
#
|
||||
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
|
||||
|
||||
# Note on $Extend: Earlier versions of TR29 included Mc characters.
|
||||
# To avoid test breakage, Mc is still included for the time being.
|
||||
# $Extend = [[:Mn:] [:Me:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_Grapheme_Extend
|
||||
$Extend = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_Grapheme_Extend
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
|
||||
|
||||
|
||||
@ -48,67 +54,63 @@ $Extend = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_G
|
||||
#
|
||||
####################################################################################
|
||||
|
||||
$Katakana = [[:Kana:] \u30fc \uff70 \uff9e-\uff9f];
|
||||
$Hiragana = [[:Hira:]];
|
||||
$Letter = [[[:Alphabetic:] \u02b9-\u02ba \u02c2-\u02cf \u02d2-\u02df \u02e5-\u02ed \u05f3] -
|
||||
[[:IDEOGRAPHIC:] [:THAI:] [:LAO:] $Hiragana $Katakana ]];
|
||||
$Format = [[:Cf:]];
|
||||
|
||||
$MidLetter = [\u0027 \u00ad \u05f4 \u2019];
|
||||
|
||||
$MidNumLet = [\u002e \u003a];
|
||||
|
||||
|
||||
# From Line Break, IS - Numeric Separator (Infix)
|
||||
# $IS = [\u002c \u002e \u003a \u003b \u0589];
|
||||
$MidNum = [\u002c \u003b \u0589];
|
||||
|
||||
# Rule 3: Treat a grapheme cluster as if it were a single character.
|
||||
# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
|
||||
# because we don't need to find the boundaries between adjacent syllables -
|
||||
# they won't be word boundaries.
|
||||
#
|
||||
# "Extended" definitions. Classes of characters including trailing combining chars and,
|
||||
# for types of chars that can appear in the interior of a word only,
|
||||
# trailing format characters.
|
||||
#
|
||||
$LetterEx = $Letter $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$MidNumExF = $MidNum $Extend* $Format*;
|
||||
$MidNumLetExF = $MidNumLet $Extend* $Format*;
|
||||
$MidLetterExF = $MidLetter $Extend* $Format*;
|
||||
|
||||
|
||||
#
|
||||
# Numbers. Rules 6, 9, 10 from the TR.
|
||||
# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char.
|
||||
#
|
||||
$NumberSequence = $NumericEx ($Format* ($MidNumExF | $MidNumLetExF)? $NumericEx)*;
|
||||
$ALetterEx = $ALetter $Extend* $Format*;
|
||||
$NumericEx = $Numeric $Extend* $Format*;
|
||||
$MidNumEx = $MidNum $Extend* $Format*;
|
||||
$MidNumLetEx = $MidNumLet $Extend* $Format*;
|
||||
$MidLetterEx = $MidLetter $Extend* $Format*;
|
||||
$KatakanaEx = $Katakana $Extend* $Format*;
|
||||
|
||||
|
||||
#
|
||||
# Numbers. Rules 8, 11, 12 from the TR.
|
||||
#
|
||||
$NumberSequence = $NumericEx (($MidNumEx | $MidNumLetEx)? $NumericEx)*;
|
||||
$NumberSequence {100};
|
||||
|
||||
#
|
||||
# Words. Alpha-numerics. Rule 3 - 10
|
||||
# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10
|
||||
# - must include at least one letter.
|
||||
# - may include both letters and numbers.
|
||||
# - may include MidLetter, MidNumLet punctuation.
|
||||
#
|
||||
$LetterSequence = $LetterEx ($Format* ($MidLetterExF | $MidNumLetExF)? $LetterEx)*;
|
||||
$LetterSequence = $ALetterEx (($MidLetterEx | $MidNumLetEx)? $ALetterEx)*; # rules #6, #7
|
||||
$NumberSequence? $LetterSequence ($NumberSequence | $LetterSequence)* {200};
|
||||
|
||||
#
|
||||
# Hiragana and Katakana
|
||||
# Do not break between Katakana. Rule #13.
|
||||
#
|
||||
$Hiragana $Extend* {300};
|
||||
$Katakana $Extend* ($Format* $Katakana $Extend*)* {300};
|
||||
$KatakanaEx+ {300};
|
||||
[:Hiragana:] $Extend* {300};
|
||||
|
||||
#
|
||||
# Ideographic Characters. Stand by themselves as words.
|
||||
# Separated from the "Everything Else" rule, below, only so that they
|
||||
# can be tagged with a return value. TODO: is this what we want?
|
||||
#
|
||||
[:IDEOGRAPHIC:] $Extend* {400};
|
||||
|
||||
#
|
||||
# Everything Else, with no tag.
|
||||
# Non-Control chars combine with $Extend (combining) chars.
|
||||
# Controls are returned by themselves.
|
||||
# Controls do not.
|
||||
#
|
||||
[^$Control] $Extend*;
|
||||
\r\n;
|
||||
.;
|
||||
[^$Control [:Ideographic:]] $Extend*;
|
||||
[\u000d][\u000a];
|
||||
|
||||
#
|
||||
# Reverse Rules. Back up over any of the chars that can group together.
|
||||
@ -121,6 +123,7 @@ $Katakana $Extend* ($Format* $Katakana $Extend*)* {300};
|
||||
# reaches something that can only be the start (and probably only) char in a "word".
|
||||
# A space or punctuation meets the test.
|
||||
#
|
||||
$NonStarters = [$Numeric $Letter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format \u000a];
|
||||
$NonStarters = [$Numeric $ALetter $Katakana $MidLetter $MidNum $MidNumLet $Extend $Format];
|
||||
|
||||
! $NonStarters* .;
|
||||
!.*;
|
||||
#! ($NonStarters* | \n \r) .;
|
||||
|
@ -352,7 +352,7 @@ void RBBIAPITest::TestFirstNextFollowing()
|
||||
doTest(testString, p, q, 17, " here.");
|
||||
// hindi starts here
|
||||
p=q;
|
||||
q=charIter1->next(4);
|
||||
q=charIter1->next(5);
|
||||
doTest(testString, p, q, 22, " \\u092d\\u093e\\u0930\\u0924");
|
||||
p=q;
|
||||
q=charIter1->next(2);
|
||||
@ -515,7 +515,10 @@ void RBBIAPITest::TestLastPreviousPreceding()
|
||||
doTest(testString, p, q, 31, "\\u0964");
|
||||
p=q;
|
||||
q=charIter1->previous();
|
||||
doTest(testString, p, q, 29, "\\u0939\\u094c");
|
||||
doTest(testString, p, q, 30, "\\u094c");
|
||||
p=q;
|
||||
q=charIter1->previous();
|
||||
doTest(testString, p, q, 29, "\\u0939");
|
||||
q=charIter1->preceding(26);
|
||||
doTest(testString, 26, q, 23, "\\u0938\\u0941\\u0902");
|
||||
q=charIter1->preceding(16);
|
||||
@ -609,7 +612,7 @@ void RBBIAPITest::TestIsBoundary(){
|
||||
errln("FAIL: in construction");
|
||||
else{
|
||||
charIter1->setText(testString1);
|
||||
int32_t bounds1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 20, 21, 22, 23, 25, 26};
|
||||
int32_t bounds1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 21, 22, 23, 24, 25, 26};
|
||||
doBoundaryTest(*charIter1, testString1, bounds1);
|
||||
}
|
||||
|
||||
|
@ -2178,8 +2178,7 @@ void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
|
||||
}
|
||||
}
|
||||
if (!seen2) {
|
||||
errln("No break between U+" + UCharToUnicodeString(c1)
|
||||
+ " and U+" + UCharToUnicodeString(c2));
|
||||
errln("No Break between \\U%04x and \\U%04x", c1, c2);
|
||||
errCount++;
|
||||
if (errCount >= 75)
|
||||
return;
|
||||
@ -2206,8 +2205,8 @@ void RBBITest::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)
|
||||
tb.setText(work);
|
||||
for (int32_t k = tb.first(); k != BreakIterator::DONE; k = tb.next())
|
||||
if (k == 2) {
|
||||
errln("Break between CR and LF in string U+" + UCharToUnicodeString(work[0]) +
|
||||
", U+d U+a U+" + UCharToUnicodeString(work[3]));
|
||||
errln("Break between CR and LF in string U\\%04x U\\%04x U\\%04x U\\%04x",
|
||||
work[0], work[1], work[2], work[3]);
|
||||
errCount++;
|
||||
if (errCount >= 75)
|
||||
return;
|
||||
|
Loading…
Reference in New Issue
Block a user