ICU-3170 More RBBI tweaks for Unicode 4.01 update
X-SVN-Rev: 14912
This commit is contained in:
parent
a11bcac7f5
commit
f1f3be34f8
@ -15,9 +15,9 @@
|
||||
#
|
||||
$CR = \r;
|
||||
$LF = \n;
|
||||
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
|
||||
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - [:Grapheme_Extend = TRUE:]];
|
||||
|
||||
$Extend = [[:Grapheme_Extend = TRUE:] - [$Control]];
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
|
||||
#
|
||||
# Korean Syllable Definitions
|
||||
|
@ -14,7 +14,7 @@
|
||||
# Character categories as defined in TR 29
|
||||
#
|
||||
$Sep = [\u000a \u000d \u0085 \u2028 \u2029];
|
||||
$Format = [[:Format:]];
|
||||
$Format = [[:Format:] - [:Grapheme_Extend:]];
|
||||
$Sp = [[:Whitespace:] - $Sep];
|
||||
$Lower = [[:Lowercase:]];
|
||||
$Upper = [[:TitleCase_Letter:] [:Uppercase:]];
|
||||
|
@ -54,8 +54,8 @@ $Numeric = [:LineBreak = Numeric:];
|
||||
$CR = \u000d;
|
||||
$LF = \u000a;
|
||||
$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:]];
|
||||
$Extend = [[:Grapheme_Extend = TRUE:] - $Control];
|
||||
$Format = [[:Cf:]];
|
||||
$Extend = [[:Grapheme_Extend = TRUE:]];
|
||||
$Format = [[:Cf:] - $Extend];
|
||||
$Hiragana = [:Hiragana:];
|
||||
$Ideographic = [:IDEOGRAPHIC:];
|
||||
|
||||
|
@ -108,14 +108,18 @@ static const UChar gIsWordPattern[] = {
|
||||
static const UChar gGC_ControlPattern[] = {
|
||||
// [ [ : Z l : ] [ : Z p : ]
|
||||
0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d,
|
||||
// [ : C c : ] [ : C f : ] ]
|
||||
0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x5d, 0};
|
||||
// [ : C c : ] [ : C f : ] -
|
||||
0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x2d,
|
||||
// [ : G r a p h e m e _
|
||||
0x5b, 0x3a, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f,
|
||||
// E x t e n d : ] ]
|
||||
0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x3a, 0x5d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_ExtendPattern[] = {
|
||||
// [ \ p { G r a p h e m e _
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f,
|
||||
// E x t e n d } - \ p { C f } ]
|
||||
0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x2d, 0x5c, 0x70, 0x7b, 0x43, 0x66, 0x7d, 0x5d, 0};
|
||||
// E x t e n d } ]
|
||||
0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_LPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
|
@ -2091,7 +2091,7 @@ RBBICharMonkey::RBBICharMonkey() {
|
||||
fMatcher = new RegexMatcher("\\X", 0, status); // Pattern to match a grapheme cluster
|
||||
|
||||
fCRLFSet = new UnicodeSet("[\\r\\n]", status);
|
||||
fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]]", status);
|
||||
fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]-\\p{Grapheme_Extend}]", status);
|
||||
fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]", status);
|
||||
fHangulSet = new UnicodeSet(
|
||||
"[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=T}"
|
||||
@ -2200,7 +2200,7 @@ RBBIWordMonkey::RBBIWordMonkey() : fGCFMatcher(0),
|
||||
fMidNumLetSet = new UnicodeSet("[\\u002e\\u003a]", status);
|
||||
fMidNumSet = new UnicodeSet("[\\p{Line_Break=Infix_Numeric}]", status);
|
||||
fNumericSet = new UnicodeSet("[\\p{Line_Break=Numeric}]", status);
|
||||
fFormatSet = new UnicodeSet("[\\p{Format}]", status);
|
||||
fFormatSet = new UnicodeSet("[\\p{Format}-\\p{Grapheme_Extend}]", status);
|
||||
fExtendSet = new UnicodeSet("[\\p{Grapheme_Extend}]", status);
|
||||
fOtherSet = new UnicodeSet();
|
||||
if(U_FAILURE(status)) {
|
||||
|
Loading…
Reference in New Issue
Block a user