ICU-2422 regexp, \X does grapheme clusters

X-SVN-Rev: 11071
This commit is contained in:
Andy Heninger 2003-02-16 07:24:55 +00:00
parent f1cd2be8d2
commit d31f8de161
4 changed files with 179 additions and 32 deletions

View File

@ -108,6 +108,35 @@ static const UChar gIsWordPattern[] = {
// [ \ t \ n \ f \ r \ p { Z } ]
0x5b, 0x5c, 0x74, 0x5c, 0x6e, 0x5c, 0x66, 0x5c, 0x72, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5d, 0};
//
// UnicodeSets used in implementation of Grapheme Cluster detection, \X
//
// Each pattern is a NUL-terminated UnicodeSet pattern string, spelled out as
// individual UTF-16 code units. The comment above each row of values shows
// the pattern characters that the row encodes.
//
// Control set: general categories Zl, Zp, Cc and Cf.
static const UChar gGC_ControlPattern[] = {
//    [     [     :     Z     l     :     ]     [     :     Z     p     :     ]
0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d,
//    [     :     C     c     :     ]     [     :     C     f     :     ]     ]
0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x5d, 0};
// Extend set: general categories Mn and Me, plus U+FF9E..U+FF9F.
static const UChar gGC_ExtendPattern[] = {
//    [     [     :     M     n     :     ]     [     :     M     e     :     ]
0x5b, 0x5b, 0x3a, 0x4d, 0x6e, 0x3a, 0x5d, 0x5b, 0x3a, 0x4d, 0x65, 0x3a, 0x5d,
//    \     u     f     f     9     e     -     \     u     f     f     9     f     ]
0x5c, 0x75, 0x66, 0x66, 0x39, 0x65, 0x2d, 0x5c, 0x75, 0x66, 0x66, 0x39, 0x66, 0x5d, 0};
// L set: U+1100..U+115F.
static const UChar gGC_LPattern[] = {
//    [     \     u     1     1     0     0     -     \     u     1     1     5     f     ]
0x5b, 0x5c, 0x75, 0x31, 0x31, 0x30, 0x30, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x35, 0x66, 0x5d, 0};
// V set: U+1160..U+11A2.
static const UChar gGC_VPattern[] = {
//    [     \     u     1     1     6     0     -     \     u     1     1     a     2     ]
0x5b, 0x5c, 0x75, 0x31, 0x31, 0x36, 0x30, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x61, 0x32, 0x5d, 0};
// T set: U+11A8..U+11F9.
static const UChar gGC_TPattern[] = {
//    [     \     u     1     1     a     8     -     \     u     1     1     f     9     ]
0x5b, 0x5c, 0x75, 0x31, 0x31, 0x61, 0x38, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x66, 0x39, 0x5d, 0};
// Table of the shared property sets, indexed by the URX_*_SET and URX_GC_*
// enum constants. Entries are filled in by the initialization code.
static UnicodeSet *gPropSets[URX_LAST_SET];
@ -137,6 +166,73 @@ static void ThreadSafeUnicodeSetInit(UnicodeSet **pSet, const UChar *pattern, UE
}
//----------------------------------------------------------------------------------------
//
// InitGraphemeClusterSets Initialize the constant UnicodeSets needed for the
// determination of Grapheme Cluster boundaries.
//
//----------------------------------------------------------------------------------------
static void InitGraphemeClusterSets() {
UErrorCode status = U_ZERO_ERROR; // TODO: some sort of error handling needed.
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_EXTEND], gGC_ExtendPattern, status);
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_CONTROL], gGC_ControlPattern, status);
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_L], gGC_LPattern, status);
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_V], gGC_VPattern, status);
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_T], gGC_TPattern, status);
if (gPropSets[URX_GC_NORMAL] == NULL) {
//
// These sets are dynamically constructed, because their
// intialization strings would be unreasonable.
//
UnicodeSet *LV = new UnicodeSet;;
UnicodeSet *LVT = new UnicodeSet;
UnicodeSet *Normal = new UnicodeSet;
// The Precomposed Hangul syllables have the range of 0xac00 - 0xd7a3.
// Categorize these as LV or LVT, using the decomposition algorithm from
// the Unicode Standard 3.0, section 3.11
const int32_t TCount = 28;
UChar c;
for (c=0xac00; c<0xd7a4; c+=TCount) {
LV->add(c);
}
LVT->add(0xac00, 0xd7a3);
LVT->removeAll(*LV);
//
// "Normal" is the set of characters that don't need special handling
// when finding grapheme cluster boundaries.
//
Normal->complement();
Normal->remove(0xac00, 0xd7a4);
Normal->removeAll(*gPropSets[URX_GC_CONTROL]);
Normal->removeAll(*gPropSets[URX_GC_L]);
Normal->removeAll(*gPropSets[URX_GC_V]);
Normal->removeAll(*gPropSets[URX_GC_T]);
//
// Thread Safe initialization of the global pointers to these sets.
//
Mutex lock;
if (gPropSets[URX_GC_NORMAL] == NULL) {
gPropSets[URX_GC_NORMAL] = Normal;
gPropSets[URX_GC_LV] = LV;
gPropSets[URX_GC_LVT] = LVT;
} else {
delete Normal;
delete LV;
delete LVT;
}
}
}
//----------------------------------------------------------------------------------------
//
@ -213,6 +309,8 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(
ThreadSafeUnicodeSetInit(&gUnescapeCharSet, gUnescapeCharPattern, status);
ThreadSafeUnicodeSetInit(&gPropSets[URX_ISWORD_SET], gIsWordPattern, status);
ThreadSafeUnicodeSetInit(&gPropSets[URX_ISSPACE_SET], gIsSpacePattern, status);
InitGraphemeClusterSets();
}

View File

@ -181,7 +181,7 @@ enum {
//
// Access to Unicode Sets for Perl-like composite character properties
// Access to Unicode Sets for composite character properties
// The sets are accessed by the match engine for things like \w (word boundary)
//
enum {
@ -189,7 +189,17 @@ enum {
URX_ISALNUM_SET = 2,
URX_ISALPHA_SET = 3,
URX_ISSPACE_SET = 4,
URX_LAST_SET = 5,
URX_GC_NORMAL, // Sets for finding grapheme cluster boundaries.
URX_GC_EXTEND,
URX_GC_CONTROL,
URX_GC_L,
URX_GC_LV,
URX_GC_LVT,
URX_GC_V,
URX_GC_T,
URX_LAST_SET,
URX_NEG_SET = 0x800000 // Flag bit to reverse sense of set
// membership test.

View File

@ -904,46 +904,82 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
break;
case URX_BACKSLASH_X: // Match combining character sequence
{ // Closer to Grapheme cluster than to Perl \X
case URX_BACKSLASH_X:
// Match a Grapheme, as defined by Unicode TR 29.
// Differs slightly from Perl, which consumes combining marks independently
// of context.
{
// Fail if at end of input
if (fp->fInputIdx >= inputLen) {
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
// Always consume one char
UChar32 c = fInput->char32At(fp->fInputIdx);
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
// Examine (and consume) the current char.
// Dispatch into a little state machine, based on the char.
UChar32 c;
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
UnicodeSet **sets = fPattern->fStaticSets;
if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
if (sets[URX_GC_L]->contains(c)) goto GC_L;
if (sets[URX_GC_LV]->contains(c)) goto GC_V;
if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
goto GC_Extend;
// Consume CR/LF as a pair
if (c == 0x0d) {
UChar32 c = fInput->char32At(fp->fInputIdx);
if (c == 0x0a) {
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
break;
GC_L:
if (fp->fInputIdx >= inputLen) goto GC_Done;
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
if (sets[URX_GC_L]->contains(c)) goto GC_L;
if (sets[URX_GC_LV]->contains(c)) goto GC_V;
if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
if (sets[URX_GC_V]->contains(c)) goto GC_V;
U16_PREV(inputBuf, 0, fp->fInputIdx, c);
goto GC_Extend;
GC_V:
if (fp->fInputIdx >= inputLen) goto GC_Done;
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
if (sets[URX_GC_V]->contains(c)) goto GC_V;
if (sets[URX_GC_T]->contains(c)) goto GC_T;
U16_PREV(inputBuf, 0, fp->fInputIdx, c);
goto GC_Extend;
GC_T:
if (fp->fInputIdx >= inputLen) goto GC_Done;
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
if (sets[URX_GC_T]->contains(c)) goto GC_T;
U16_PREV(inputBuf, 0, fp->fInputIdx, c);
goto GC_Extend;
GC_Extend:
// Combining characters are consumed here
for (;;) {
if (fp->fInputIdx >= inputLen) {
break;
}
U16_GET(inputBuf, 0, fp->fInputIdx, inputLen, c);
if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
break;
}
U16_FWD_1(inputBuf, fp->fInputIdx, inputLen);
}
goto GC_Done;
GC_Control:
// Most control chars stand alone (don't combine with combining chars),
// except that a CR/LF sequence is a single grapheme cluster.
if (c == 0x0d && fp->fInputIdx < inputLen && inputBuf[fp->fInputIdx] == 0x0a) {
fp->fInputIdx++;
}
// Consume any combining marks following a non-control char
int8_t ctype = u_charType(c);
if (ctype != U_CONTROL_CHAR) {
for(;;) {
c = fInput->char32At(fp->fInputIdx);
ctype = u_charType(c);
// TODO: make a set and add the "other grapheme extend" chars
// to the list of stuff to be skipped over.
if (!(ctype == U_NON_SPACING_MARK || ctype == U_ENCLOSING_MARK)) {
break;
}
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
if (fp->fInputIdx >= inputLen) {
break;
}
}
}
GC_Done:
break;
}
break;

View File

@ -92,9 +92,12 @@
"\s+" "not_space<0> \t \r \n \u3000 \u2004 \u2028 \u2029</0>xyz"
"(\S+).*?(\S+).*" "<0><1>Not-spaces</1> <2>more-non-spaces</2> </0>"
# \X consume one combining char sequence.
# \X consume one Grapheme Cluster.
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>A</1><2>B</2><3> </3><4>\r\n</4></0>"
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>A\u0301</1><2>\n</2><3>\u0305</3><4>a\u0302\u0303\u0304</4></0>"
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>\u1100\u1161\u11a8</1><2>\u115f\u11a2\u11f9</2></0>"
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>\u1100\uac01</1><2>\uac02</2><3>\uac03\u11b0</3></0>"
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>\u1100\u1101\uac02\u0301</1><2>\u1100</2></0>"
# ^ matches only at beginning of line
".*^(Hello)" "<0><1>Hello</1></0> Hello Hello Hello Goodbye"