ICU-2917 bad grapheme cluster matching with some Hangul syllables.
X-SVN-Rev: 12113
This commit is contained in:
parent
82f6fab817
commit
16b5b797a3
@ -112,22 +112,40 @@ static const UChar gIsWordPattern[] = {
|
||||
0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_ExtendPattern[] = {
|
||||
// [ [ : M n : ] [ : M e : ]
|
||||
0x5b, 0x5b, 0x3a, 0x4d, 0x6e, 0x3a, 0x5d, 0x5b, 0x3a, 0x4d, 0x65, 0x3a, 0x5d,
|
||||
// \ u f f 9 e - \ u f f 9 f ]
|
||||
0x5c, 0x75, 0x66, 0x66, 0x39, 0x65, 0x2d, 0x5c, 0x75, 0x66, 0x66, 0x39, 0x66, 0x5d, 0};
|
||||
// [ \ p { G r a p h e m e _
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f,
|
||||
// E x t e n d } ]
|
||||
0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_LPattern[] = {
|
||||
// [ \ u 1 1 0 0 - \ u 1 1 5 f ]
|
||||
0x5b, 0x5c, 0x75, 0x31, 0x31, 0x30, 0x30, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x35, 0x66, 0x5d, 0};
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = L } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_VPattern[] = {
|
||||
// [ \ u 1 1 6 0 - \ u 1 1 a 2 ]
|
||||
0x5b, 0x5c, 0x75, 0x31, 0x31, 0x36, 0x30, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x61, 0x32, 0x5d, 0};
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = V } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_TPattern[] = {
|
||||
// [ \ u 1 1 a 8 - \ u 1 1 f 9 ]
|
||||
0x5b, 0x5c, 0x75, 0x31, 0x31, 0x61, 0x38, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x66, 0x39, 0x5d, 0};
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = T } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_LVPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = L V } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_LVTPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = L V T } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x54, 0x7d, 0x5d, 0};
|
||||
|
||||
|
||||
|
||||
@ -150,6 +168,8 @@ RegexStaticSets::RegexStaticSets(UErrorCode *status) {
|
||||
fPropSets[URX_GC_L] = new UnicodeSet(gGC_LPattern, *status);
|
||||
fPropSets[URX_GC_V] = new UnicodeSet(gGC_VPattern, *status);
|
||||
fPropSets[URX_GC_T] = new UnicodeSet(gGC_TPattern, *status);
|
||||
fPropSets[URX_GC_LV] = new UnicodeSet(gGC_LVPattern, *status);
|
||||
fPropSets[URX_GC_LVT] = new UnicodeSet(gGC_LVTPattern, *status);
|
||||
|
||||
|
||||
|
||||
@ -171,21 +191,6 @@ RegexStaticSets::RegexStaticSets(UErrorCode *status) {
|
||||
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_V]);
|
||||
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_T]);
|
||||
|
||||
|
||||
// The Precomposed Hangul syllables have the range of 0xac00 - 0xd7a3.
|
||||
// Categorize these as LV or LVT, using the decomposition algorithm from
|
||||
// the Unicode Standard 3.0, section 3.11
|
||||
fPropSets[URX_GC_LV] = new UnicodeSet;
|
||||
fPropSets[URX_GC_LVT] = new UnicodeSet;
|
||||
const int32_t TCount = 28;
|
||||
UChar c;
|
||||
for (c=0xac00; c<0xd7a4; c+=TCount) {
|
||||
fPropSets[URX_GC_LV]->add(c);
|
||||
}
|
||||
fPropSets[URX_GC_LVT]->add(0xac00, 0xd7a3);
|
||||
fPropSets[URX_GC_LVT]->removeAll(*fPropSets[URX_GC_LV]);
|
||||
|
||||
|
||||
// Initialize the 8-bit fast bit sets from the parallel full
|
||||
// UnicodeSets.
|
||||
for (i=0; i<URX_LAST_SET; i++) {
|
||||
|
@ -1202,6 +1202,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
if (sets[URX_GC_L]->contains(c)) goto GC_L;
|
||||
if (sets[URX_GC_LV]->contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_V]->contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_T]->contains(c)) goto GC_T;
|
||||
goto GC_Extend;
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user