ICU-2917 bad grapheme cluster matching with some Hangul syllables.

X-SVN-Rev: 12113
This commit is contained in:
Andy Heninger 2003-05-27 03:03:47 +00:00
parent 82f6fab817
commit 16b5b797a3
2 changed files with 32 additions and 25 deletions

View File

@ -112,22 +112,40 @@ static const UChar gIsWordPattern[] = {
0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x5d, 0};
static const UChar gGC_ExtendPattern[] = {
// [ [ : M n : ] [ : M e : ]
0x5b, 0x5b, 0x3a, 0x4d, 0x6e, 0x3a, 0x5d, 0x5b, 0x3a, 0x4d, 0x65, 0x3a, 0x5d,
// \ u f f 9 e - \ u f f 9 f ]
0x5c, 0x75, 0x66, 0x66, 0x39, 0x65, 0x2d, 0x5c, 0x75, 0x66, 0x66, 0x39, 0x66, 0x5d, 0};
// [ \ p { G r a p h e m e _
0x5b, 0x5c, 0x70, 0x7b, 0x47, 0x72, 0x61, 0x70, 0x68, 0x65, 0x6d, 0x65, 0x5f,
// E x t e n d } ]
0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x5d, 0};
static const UChar gGC_LPattern[] = {
// [ \ u 1 1 0 0 - \ u 1 1 5 f ]
0x5b, 0x5c, 0x75, 0x31, 0x31, 0x30, 0x30, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x35, 0x66, 0x5d, 0};
// [ \ p { H a n g u l _ S y l
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
// l a b l e _ T y p e = L } ]
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d, 0x5d, 0};
static const UChar gGC_VPattern[] = {
// [ \ u 1 1 6 0 - \ u 1 1 a 2 ]
0x5b, 0x5c, 0x75, 0x31, 0x31, 0x36, 0x30, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x61, 0x32, 0x5d, 0};
// [ \ p { H a n g u l _ S y l
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
// l a b l e _ T y p e = V } ]
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d, 0x5d, 0};
static const UChar gGC_TPattern[] = {
// [ \ u 1 1 a 8 - \ u 1 1 f 9 ]
0x5b, 0x5c, 0x75, 0x31, 0x31, 0x61, 0x38, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x66, 0x39, 0x5d, 0};
// [ \ p { H a n g u l _ S y l
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
// l a b l e _ T y p e = T } ]
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0};
// Set pattern text "[\p{Hangul_Syllable_Type=LV}]", written out one character
// code per array element and terminated by a 0 element. Each comment row
// inside the initializer shows the characters encoded by the row of values
// that follows it. This array is the pattern handed to the UnicodeSet
// constructor that populates fPropSets[URX_GC_LV].
static const UChar gGC_LVPattern[] = {
// [ \ p { H a n g u l _ S y l
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
// l a b l e _ T y p e = L V } ]
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0};
// Set pattern text "[\p{Hangul_Syllable_Type=LVT}]", written out one character
// code per array element and terminated by a 0 element. Each comment row
// inside the initializer shows the characters encoded by the row of values
// that follows it. This array is the pattern handed to the UnicodeSet
// constructor that populates fPropSets[URX_GC_LVT].
static const UChar gGC_LVTPattern[] = {
// [ \ p { H a n g u l _ S y l
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
// l a b l e _ T y p e = L V T } ]
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x54, 0x7d, 0x5d, 0};
@ -150,6 +168,8 @@ RegexStaticSets::RegexStaticSets(UErrorCode *status) {
fPropSets[URX_GC_L] = new UnicodeSet(gGC_LPattern, *status);
fPropSets[URX_GC_V] = new UnicodeSet(gGC_VPattern, *status);
fPropSets[URX_GC_T] = new UnicodeSet(gGC_TPattern, *status);
fPropSets[URX_GC_LV] = new UnicodeSet(gGC_LVPattern, *status);
fPropSets[URX_GC_LVT] = new UnicodeSet(gGC_LVTPattern, *status);
@ -171,21 +191,6 @@ RegexStaticSets::RegexStaticSets(UErrorCode *status) {
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_V]);
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_T]);
// The Precomposed Hangul syllables have the range of 0xac00 - 0xd7a3.
// Categorize these as LV or LVT, using the decomposition algorithm from
// the Unicode Standard 3.0, section 3.11
fPropSets[URX_GC_LV] = new UnicodeSet;
fPropSets[URX_GC_LVT] = new UnicodeSet;
const int32_t TCount = 28;
UChar c;
for (c=0xac00; c<0xd7a4; c+=TCount) {
fPropSets[URX_GC_LV]->add(c);
}
fPropSets[URX_GC_LVT]->add(0xac00, 0xd7a3);
fPropSets[URX_GC_LVT]->removeAll(*fPropSets[URX_GC_LV]);
// Initialize the 8-bit fast bit sets from the parallel full
// UnicodeSets.
for (i=0; i<URX_LAST_SET; i++) {

View File

@ -1202,6 +1202,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
if (sets[URX_GC_L]->contains(c)) goto GC_L;
if (sets[URX_GC_LV]->contains(c)) goto GC_V;
if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
if (sets[URX_GC_V]->contains(c)) goto GC_V;
if (sets[URX_GC_T]->contains(c)) goto GC_T;
goto GC_Extend;