ICU-2422 regexp, \X does grapheme clusters
X-SVN-Rev: 11071
This commit is contained in:
parent
f1cd2be8d2
commit
d31f8de161
@ -108,6 +108,35 @@ static const UChar gIsWordPattern[] = {
|
||||
// [ \ t \ n \ f \ r \ p { Z } ]
|
||||
0x5b, 0x5c, 0x74, 0x5c, 0x6e, 0x5c, 0x66, 0x5c, 0x72, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5d, 0};
|
||||
|
||||
|
||||
//
|
||||
// UnicodeSets used in implementation of Grapheme Cluster detection, \X
|
||||
//
|
||||
//   UnicodeSet pattern  [[:Zl:][:Zp:][:Cc:][:Cf:]]  spelled out as numeric code units.
//   Passed to ThreadSafeUnicodeSetInit() to build gPropSets[URX_GC_CONTROL].
static const UChar gGC_ControlPattern[] = {
    0x5b,                                   //  [
    0x5b, 0x3a, 0x5a, 0x6c, 0x3a, 0x5d,     //    [ : Z l : ]
    0x5b, 0x3a, 0x5a, 0x70, 0x3a, 0x5d,     //    [ : Z p : ]
    0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d,     //    [ : C c : ]
    0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d,     //    [ : C f : ]
    0x5d,                                   //  ]
    0                                       //  terminating zero
};
|
||||
|
||||
//   UnicodeSet pattern for the characters that extend a grapheme cluster:
//   the Mn and Me categories plus the range U+FF9E - U+FF9F (written in the
//   pattern as two backslash-u escapes).  Spelled out as numeric code units.
//   Passed to ThreadSafeUnicodeSetInit() to build gPropSets[URX_GC_EXTEND].
static const UChar gGC_ExtendPattern[] = {
    0x5b,                                   //  [
    0x5b, 0x3a, 0x4d, 0x6e, 0x3a, 0x5d,     //    [ : M n : ]
    0x5b, 0x3a, 0x4d, 0x65, 0x3a, 0x5d,     //    [ : M e : ]
    0x5c, 0x75, 0x66, 0x66, 0x39, 0x65,     //    \ u f f 9 e
    0x2d,                                   //    -
    0x5c, 0x75, 0x66, 0x66, 0x39, 0x66,     //    \ u f f 9 f
    0x5d,                                   //  ]
    0                                       //  terminating zero
};
|
||||
|
||||
//   UnicodeSet pattern for the range U+1100 - U+115F (the "L" set used by the
//   grapheme cluster code), spelled out as numeric code units.
//   Passed to ThreadSafeUnicodeSetInit() to build gPropSets[URX_GC_L].
static const UChar gGC_LPattern[] = {
    0x5b,                                   //  [
    0x5c, 0x75, 0x31, 0x31, 0x30, 0x30,     //    \ u 1 1 0 0
    0x2d,                                   //    -
    0x5c, 0x75, 0x31, 0x31, 0x35, 0x66,     //    \ u 1 1 5 f
    0x5d,                                   //  ]
    0                                       //  terminating zero
};
|
||||
|
||||
//   UnicodeSet pattern for the range U+1160 - U+11A2 (the "V" set used by the
//   grapheme cluster code), spelled out as numeric code units.
//   Passed to ThreadSafeUnicodeSetInit() to build gPropSets[URX_GC_V].
static const UChar gGC_VPattern[] = {
    0x5b,                                   //  [
    0x5c, 0x75, 0x31, 0x31, 0x36, 0x30,     //    \ u 1 1 6 0
    0x2d,                                   //    -
    0x5c, 0x75, 0x31, 0x31, 0x61, 0x32,     //    \ u 1 1 a 2
    0x5d,                                   //  ]
    0                                       //  terminating zero
};
|
||||
|
||||
//   UnicodeSet pattern for the range U+11A8 - U+11F9 (the "T" set used by the
//   grapheme cluster code), spelled out as numeric code units.
//   Passed to ThreadSafeUnicodeSetInit() to build gPropSets[URX_GC_T].
static const UChar gGC_TPattern[] = {
    0x5b,                                   //  [
    0x5c, 0x75, 0x31, 0x31, 0x61, 0x38,     //    \ u 1 1 a 8
    0x2d,                                   //    -
    0x5c, 0x75, 0x31, 0x31, 0x66, 0x39,     //    \ u 1 1 f 9
    0x5d,                                   //  ]
    0                                       //  terminating zero
};
|
||||
|
||||
|
||||
//   Table of shared UnicodeSets, indexed by the URX_..._SET and URX_GC_... constants
//   and sized by URX_LAST_SET.  As a static array its slots start out NULL; the
//   initialization code in this file fills them in and tests a slot against NULL
//   to tell whether it still needs to be built.
static UnicodeSet *gPropSets[URX_LAST_SET];
|
||||
|
||||
|
||||
@ -137,6 +166,73 @@ static void ThreadSafeUnicodeSetInit(UnicodeSet **pSet, const UChar *pattern, UE
|
||||
}
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// InitGraphemeClusterSets Initialize the constant UnicodeSets needed for the
|
||||
// determination of Grapheme Cluster boundaries.
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
static void InitGraphemeClusterSets() {
|
||||
UErrorCode status = U_ZERO_ERROR; // TODO: some sort of error handling needed.
|
||||
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_EXTEND], gGC_ExtendPattern, status);
|
||||
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_CONTROL], gGC_ControlPattern, status);
|
||||
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_L], gGC_LPattern, status);
|
||||
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_V], gGC_VPattern, status);
|
||||
ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_T], gGC_TPattern, status);
|
||||
|
||||
if (gPropSets[URX_GC_NORMAL] == NULL) {
|
||||
|
||||
//
|
||||
// These sets are dynamically constructed, because their
|
||||
// intialization strings would be unreasonable.
|
||||
//
|
||||
UnicodeSet *LV = new UnicodeSet;;
|
||||
UnicodeSet *LVT = new UnicodeSet;
|
||||
UnicodeSet *Normal = new UnicodeSet;
|
||||
|
||||
|
||||
// The Precomposed Hangul syllables have the range of 0xac00 - 0xd7a3.
|
||||
// Categorize these as LV or LVT, using the decomposition algorithm from
|
||||
// the Unicode Standard 3.0, section 3.11
|
||||
const int32_t TCount = 28;
|
||||
UChar c;
|
||||
for (c=0xac00; c<0xd7a4; c+=TCount) {
|
||||
LV->add(c);
|
||||
}
|
||||
LVT->add(0xac00, 0xd7a3);
|
||||
LVT->removeAll(*LV);
|
||||
|
||||
|
||||
//
|
||||
// "Normal" is the set of characters that don't need special handling
|
||||
// when finding grapheme cluster boundaries.
|
||||
//
|
||||
Normal->complement();
|
||||
Normal->remove(0xac00, 0xd7a4);
|
||||
Normal->removeAll(*gPropSets[URX_GC_CONTROL]);
|
||||
Normal->removeAll(*gPropSets[URX_GC_L]);
|
||||
Normal->removeAll(*gPropSets[URX_GC_V]);
|
||||
Normal->removeAll(*gPropSets[URX_GC_T]);
|
||||
|
||||
//
|
||||
// Thread Safe initialization of the global pointers to these sets.
|
||||
//
|
||||
Mutex lock;
|
||||
if (gPropSets[URX_GC_NORMAL] == NULL) {
|
||||
gPropSets[URX_GC_NORMAL] = Normal;
|
||||
gPropSets[URX_GC_LV] = LV;
|
||||
gPropSets[URX_GC_LVT] = LVT;
|
||||
} else {
|
||||
delete Normal;
|
||||
delete LV;
|
||||
delete LVT;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
@ -213,6 +309,8 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(
|
||||
ThreadSafeUnicodeSetInit(&gUnescapeCharSet, gUnescapeCharPattern, status);
|
||||
ThreadSafeUnicodeSetInit(&gPropSets[URX_ISWORD_SET], gIsWordPattern, status);
|
||||
ThreadSafeUnicodeSetInit(&gPropSets[URX_ISSPACE_SET], gIsSpacePattern, status);
|
||||
|
||||
InitGraphemeClusterSets();
|
||||
}
|
||||
|
||||
|
||||
|
@ -181,7 +181,7 @@ enum {
|
||||
|
||||
|
||||
//
|
||||
// Access to Unicode Sets for Perl-like composite character properties
|
||||
// Access to Unicode Sets composite character properties
|
||||
// The sets are accessed by the match engine for things like \w (word characters)
|
||||
//
|
||||
enum {
|
||||
@ -189,7 +189,17 @@ enum {
|
||||
URX_ISALNUM_SET = 2,
|
||||
URX_ISALPHA_SET = 3,
|
||||
URX_ISSPACE_SET = 4,
|
||||
URX_LAST_SET = 5,
|
||||
|
||||
URX_GC_NORMAL, // Sets for finding grapheme cluster boundaries.
|
||||
URX_GC_EXTEND,
|
||||
URX_GC_CONTROL,
|
||||
URX_GC_L,
|
||||
URX_GC_LV,
|
||||
URX_GC_LVT,
|
||||
URX_GC_V,
|
||||
URX_GC_T,
|
||||
|
||||
URX_LAST_SET,
|
||||
|
||||
URX_NEG_SET = 0x800000 // Flag bit to reverse sense of set
|
||||
// membership test.
|
||||
|
@ -904,46 +904,82 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
|
||||
break;
|
||||
|
||||
|
||||
case URX_BACKSLASH_X: // Match combining character sequence
|
||||
{ // Closer to Grapheme cluster than to Perl \X
|
||||
case URX_BACKSLASH_X:
|
||||
// Match a Grapheme, as defined by Unicode TR 29.
|
||||
// Differs slightly from Perl, which consumes combining marks independently
|
||||
// of context.
|
||||
{
|
||||
|
||||
// Fail if at end of input
|
||||
if (fp->fInputIdx >= inputLen) {
|
||||
fp = (REStackFrame *)fStack->popFrame(frameSize);
|
||||
break;
|
||||
}
|
||||
|
||||
// Always consume one char
|
||||
UChar32 c = fInput->char32At(fp->fInputIdx);
|
||||
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
|
||||
// Examine (and consume) the current char.
|
||||
// Dispatch into a little state machine, based on the char.
|
||||
UChar32 c;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
|
||||
UnicodeSet **sets = fPattern->fStaticSets;
|
||||
if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
|
||||
if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
|
||||
if (sets[URX_GC_L]->contains(c)) goto GC_L;
|
||||
if (sets[URX_GC_LV]->contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
|
||||
goto GC_Extend;
|
||||
|
||||
// Consume CR/LF as a pair
|
||||
if (c == 0x0d) {
|
||||
UChar32 c = fInput->char32At(fp->fInputIdx);
|
||||
if (c == 0x0a) {
|
||||
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
|
||||
break;
|
||||
|
||||
|
||||
GC_L:
|
||||
if (fp->fInputIdx >= inputLen) goto GC_Done;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
|
||||
if (sets[URX_GC_L]->contains(c)) goto GC_L;
|
||||
if (sets[URX_GC_LV]->contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
|
||||
if (sets[URX_GC_V]->contains(c)) goto GC_V;
|
||||
U16_PREV(inputBuf, 0, fp->fInputIdx, c);
|
||||
goto GC_Extend;
|
||||
|
||||
GC_V:
|
||||
if (fp->fInputIdx >= inputLen) goto GC_Done;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
|
||||
if (sets[URX_GC_V]->contains(c)) goto GC_V;
|
||||
if (sets[URX_GC_T]->contains(c)) goto GC_T;
|
||||
U16_PREV(inputBuf, 0, fp->fInputIdx, c);
|
||||
goto GC_Extend;
|
||||
|
||||
GC_T:
|
||||
if (fp->fInputIdx >= inputLen) goto GC_Done;
|
||||
U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
|
||||
if (sets[URX_GC_T]->contains(c)) goto GC_T;
|
||||
U16_PREV(inputBuf, 0, fp->fInputIdx, c);
|
||||
goto GC_Extend;
|
||||
|
||||
GC_Extend:
|
||||
// Combining characters are consumed here
|
||||
for (;;) {
|
||||
if (fp->fInputIdx >= inputLen) {
|
||||
break;
|
||||
}
|
||||
U16_GET(inputBuf, 0, fp->fInputIdx, inputLen, c);
|
||||
if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
|
||||
break;
|
||||
}
|
||||
U16_FWD_1(inputBuf, fp->fInputIdx, inputLen);
|
||||
}
|
||||
goto GC_Done;
|
||||
|
||||
GC_Control:
|
||||
// Most control chars stand alone (don't combine with combining chars),
|
||||
// except for that CR/LF sequence is a single grapheme cluster.
|
||||
if (c == 0x0d && fp->fInputIdx < inputLen && inputBuf[fp->fInputIdx] == 0x0a) {
|
||||
fp->fInputIdx++;
|
||||
}
|
||||
|
||||
// Consume any combining marks following a non-control char
|
||||
int8_t ctype = u_charType(c);
|
||||
if (ctype != U_CONTROL_CHAR) {
|
||||
for(;;) {
|
||||
c = fInput->char32At(fp->fInputIdx);
|
||||
ctype = u_charType(c);
|
||||
// TODO: make a set and add the "other grapheme extend" chars
|
||||
// to the list of stuff to be skipped over.
|
||||
if (!(ctype == U_NON_SPACING_MARK || ctype == U_ENCLOSING_MARK)) {
|
||||
break;
|
||||
}
|
||||
fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
|
||||
if (fp->fInputIdx >= inputLen) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
GC_Done:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
|
||||
|
||||
|
5
icu4c/source/test/testdata/regextst.txt
vendored
5
icu4c/source/test/testdata/regextst.txt
vendored
@ -92,9 +92,12 @@
|
||||
"\s+" "not_space<0> \t \r \n \u3000 \u2004 \u2028 \u2029</0>xyz"
|
||||
"(\S+).*?(\S+).*" "<0><1>Not-spaces</1> <2>more-non-spaces</2> </0>"
|
||||
|
||||
# \X consume one combining char sequence.
|
||||
# \X consume one Grapheme Cluster.
|
||||
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>A</1><2>B</2><3> </3><4>\r\n</4></0>"
|
||||
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>A\u0301</1><2>\n</2><3>\u0305</3><4>a\u0302\u0303\u0304</4></0>"
|
||||
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>\u1100\u1161\u11a8</1><2>\u115f\u11a2\u11f9</2></0>"
|
||||
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>\u1100\uac01</1><2>\uac02</2><3>\uac03\u11b0</3></0>"
|
||||
"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?" "<0><1>\u1100\u1101\uac02\u0301</1><2>\u1100</2></0>"
|
||||
|
||||
# ^ matches only at beginning of line
|
||||
".*^(Hello)" "<0><1>Hello</1></0> Hello Hello Hello Goodbye"
|
||||
|
Loading…
Reference in New Issue
Block a user