ICU-3170 More RBBI tweaks for Unicode 4.0.1 update

X-SVN-Rev: 14912
This commit is contained in:
Andy Heninger 2004-04-08 23:38:02 +00:00
parent a11bcac7f5
commit f1f3be34f8
5 changed files with 15 additions and 11 deletions

View File

@ -15,9 +15,9 @@
#
$CR = \r;
$LF = \n;
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - [:Grapheme_Extend = TRUE:]];
$Extend = [[:Grapheme_Extend = TRUE:] - [$Control]];
$Extend = [[:Grapheme_Extend = TRUE:]];
#
# Korean Syllable Definitions

View File

@ -14,7 +14,7 @@
# Character categories as defined in TR 29
#
$Sep = [\u000a \u000d \u0085 \u2028 \u2029];
$Format = [[:Format:]];
$Format = [[:Format:] - [:Grapheme_Extend:]];
$Sp = [[:Whitespace:] - $Sep];
$Lower = [[:Lowercase:]];
$Upper = [[:TitleCase_Letter:] [:Uppercase:]];

View File

@ -54,8 +54,8 @@ $Numeric = [:LineBreak = Numeric:];
$CR = \u000d;
$LF = \u000a;
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
$Extend = [[:Grapheme_Extend = TRUE:] - $Control];
$Format = [[:Cf:]];
$Extend = [[:Grapheme_Extend = TRUE:]];
$Format = [[:Cf:] - $Extend];
$Hiragana = [:Hiragana:];
$Ideographic = [:IDEOGRAPHIC:];

View File

@ -108,14 +108,18 @@ static const UChar gIsWordPattern[] = {
static const UChar gGC_ControlPattern[] = {
// [ [ : Z l : ] [ : Z p : ]
0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d,
// [ : C c : ] [ : C f : ] ]
0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x5d, 0};
// [ : C c : ] [ : C f : ] -
0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x2d,
// [ : G r a p h e m e _
0x5b, 0x3a, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f,
// E x t e n d : ] ]
0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x3a, 0x5d, 0x5d, 0};
static const UChar gGC_ExtendPattern[] = {
// [ \ p { G r a p h e m e _
0x5b, 0x5c, 0x70, 0x7b, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f,
// E x t e n d } - \ p { C f } ]
0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x2d, 0x5c, 0x70, 0x7b, 0x43, 0x66, 0x7d, 0x5d, 0};
// E x t e n d } ]
0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x5d, 0};
static const UChar gGC_LPattern[] = {
// [ \ p { H a n g u l _ S y l

View File

@ -2091,7 +2091,7 @@ RBBICharMonkey::RBBICharMonkey() {
fMatcher = new RegexMatcher("\\X", 0, status); // Pattern to match a grapheme cluster
fCRLFSet = new UnicodeSet("[\\r\\n]", status);
fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]]", status);
fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]-\\p{Grapheme_Extend}]", status);
fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]", status);
fHangulSet = new UnicodeSet(
"[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=T}"
@ -2200,7 +2200,7 @@ RBBIWordMonkey::RBBIWordMonkey() : fGCFMatcher(0),
fMidNumLetSet = new UnicodeSet("[\\u002e\\u003a]", status);
fMidNumSet = new UnicodeSet("[\\p{Line_Break=Infix_Numeric}]", status);
fNumericSet = new UnicodeSet("[\\p{Line_Break=Numeric}]", status);
fFormatSet = new UnicodeSet("[\\p{Format}]", status);
fFormatSet = new UnicodeSet("[\\p{Format}-\\p{Grapheme_Extend}]", status);
fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]", status);
fOtherSet = new UnicodeSet();
if(U_FAILURE(status)) {