ICU-2422 regexp, \X does grapheme clusters

X-SVN-Rev: 11071
2003-02-16 07:24:55 +00:00 · 2003-02-16 07:24:55 +00:00 · d31f8de161
commit d31f8de161
parent f1cd2be8d2
4 changed files with 179 additions and 32 deletions
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -108,6 +108,35 @@ static const UChar gIsWordPattern[] = {
 //    [     \     t     \     n     \     f     \     r     \     p     {     Z     }     ]
    0x5b, 0x5c, 0x74, 0x5c, 0x6e, 0x5c, 0x66, 0x5c, 0x72, 0x5c, 0x70, 0x7b, 0x5a, 0x7d, 0x5d,  0};

+
+//
+//  UnicodeSets used in implementation of Grapheme Cluster detection, \X
+//
+//    Each array below is a UnicodeSet pattern string, written out as
+//    zero-terminated UTF-16 code units.  The comment line above each row of
+//    hex values shows the characters that the row encodes.
+//    InitGraphemeClusterSets() turns each pattern into the corresponding
+//    gPropSets[URX_GC_xxx] entry.
+//
+
+//  Control:  [[:Zl:][:Zp:][:Cc:][:Cf:]]
+//            General categories line separator, paragraph separator, control and format.
+    static const UChar gGC_ControlPattern[] = {
+//    [     [     :     Z     l     :     ]     [     :     Z     p     :     ]    
+    0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d, 
+//    [     :     C     c     :     ]     [     :     C     f     :     ]     ] 
+    0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x5d, 0};
+
+//  Extend:  [[:Mn:][:Me:]\uff9e-\uff9f]
+//           Nonspacing and enclosing marks, plus U+FF9E..U+FF9F (the halfwidth
+//           katakana voiced / semi-voiced sound marks).
+    static const UChar gGC_ExtendPattern[] = {
+//    [     [     :     M     n     :     ]     [     :     M     e     :     ]     
+    0x5b, 0x5b, 0x3a, 0x4d, 0x6e, 0x3a, 0x5d, 0x5b, 0x3a, 0x4d, 0x65, 0x3a, 0x5d,
+//    \     u     f     f     9     e     -     \     u     f     f     9     f     ]
+    0x5c, 0x75, 0x66, 0x66, 0x39, 0x65, 0x2d, 0x5c, 0x75, 0x66, 0x66, 0x39, 0x66, 0x5d, 0};
+
+//  L:  [\u1100-\u115f]   Hangul leading consonant jamo.
+    static const UChar gGC_LPattern[] = {
+//    [     \     u     1     1     0     0     -     \     u     1     1     5     f     ]      
+    0x5b, 0x5c, 0x75, 0x31, 0x31, 0x30, 0x30, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x35, 0x66, 0x5d, 0}; 
+
+//  V:  [\u1160-\u11a2]   Hangul vowel jamo.
+    static const UChar gGC_VPattern[] = {
+//    [     \     u     1     1     6     0     -     \     u     1     1     a     2     ]      
+    0x5b, 0x5c, 0x75, 0x31, 0x31, 0x36, 0x30, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x61, 0x32, 0x5d, 0}; 
+
+//  T:  [\u11a8-\u11f9]   Hangul trailing consonant jamo.
+    static const UChar gGC_TPattern[] = {
+//    [     \     u     1     1     a     8     -     \     u     1     1     f     9     ]      
+    0x5b, 0x5c, 0x75, 0x31, 0x31, 0x61, 0x38, 0x2d, 0x5c, 0x75, 0x31, 0x31, 0x66, 0x39, 0x5d, 0}; 
+
+
 static UnicodeSet *gPropSets[URX_LAST_SET];


@ -137,6 +166,73 @@ static void ThreadSafeUnicodeSetInit(UnicodeSet **pSet, const UChar *pattern, UE
 }


+//----------------------------------------------------------------------------------------
+//
+//   InitGraphemeClusterSets   Initialize the constant UnicodeSets needed for the
+//                             determination of Grapheme Cluster boundaries.
+//
+//       The EXTEND, CONTROL, L, V and T sets are created from their pattern
+//       strings by ThreadSafeUnicodeSetInit().  The LV, LVT and NORMAL sets are
+//       computed here, and only while gPropSets[URX_GC_NORMAL] is still NULL.
+//
+//       Failures are not reported to the caller (see the TODO below); on failure
+//       the computed sets are simply left unset.
+//
+//----------------------------------------------------------------------------------------
+static void InitGraphemeClusterSets() {
+    UErrorCode status = U_ZERO_ERROR;     // TODO:  some sort of error handling needed.
+    ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_EXTEND],       gGC_ExtendPattern,           status);
+    ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_CONTROL],      gGC_ControlPattern,          status);
+    ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_L],            gGC_LPattern,                status);
+    ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_V],            gGC_VPattern,                status);
+    ThreadSafeUnicodeSetInit(&gPropSets[URX_GC_T],            gGC_TPattern,                status);
+
+    // URX_GC_NORMAL doubles as the "computed sets are installed" flag.
+    //   This first test is made without holding the lock; it is repeated
+    //   under the lock before anything is stored.
+    if (gPropSets[URX_GC_NORMAL] != NULL) {
+        return;
+    }
+
+    // The pattern-based sets are dereferenced below.  'status' has nowhere to be
+    //   reported, so at least do not go through a NULL pointer if one of them
+    //   was not created.
+    if (gPropSets[URX_GC_CONTROL] == NULL || gPropSets[URX_GC_L] == NULL ||
+        gPropSets[URX_GC_V]       == NULL || gPropSets[URX_GC_T] == NULL) {
+        return;
+    }
+
+    //
+    // These sets are dynamically constructed, because their
+    //   initialization strings would be unreasonable.
+    //
+    UnicodeSet *LV     = new UnicodeSet;
+    UnicodeSet *LVT    = new UnicodeSet;
+    UnicodeSet *Normal = new UnicodeSet;
+    if (LV == NULL || LVT == NULL || Normal == NULL) {
+        // Only reachable if operator new signals failure by returning NULL.
+        delete LV;
+        delete LVT;
+        delete Normal;
+        return;
+    }
+
+    // The Precomposed Hangul syllables have the range of 0xac00 - 0xd7a3.
+    // Categorize these as LV or LVT, using the decomposition algorithm from
+    // the Unicode Standard 3.0, section 3.11
+    //   Every TCount-th syllable, starting with the first, goes into LV;
+    //   all of the remaining syllables go into LVT.
+    const UChar32 hangulFirst = 0xac00;
+    const UChar32 hangulLast  = 0xd7a3;      // inclusive
+    const int32_t TCount      = 28;
+    UChar32  c;
+    for (c=hangulFirst; c<=hangulLast; c+=TCount) {
+        LV->add(c);
+    }
+    LVT->add(hangulFirst, hangulLast);
+    LVT->removeAll(*LV);
+
+    //
+    //  "Normal" is the set of characters that don't need special handling
+    //            when finding grapheme cluster boundaries.
+    //   Built by complementing an empty set and then removing the syllables,
+    //   the control chars and the L, V and T jamo.  The end of the removed
+    //   range is inclusive, so it is the last syllable, 0xd7a3.
+    //
+    Normal->complement();
+    Normal->remove(hangulFirst, hangulLast);
+    Normal->removeAll(*gPropSets[URX_GC_CONTROL]);
+    Normal->removeAll(*gPropSets[URX_GC_L]);
+    Normal->removeAll(*gPropSets[URX_GC_V]);
+    Normal->removeAll(*gPropSets[URX_GC_T]);
+
+    //
+    //  Thread Safe initialization of the global pointers to these sets.
+    //    NORMAL is stored last because it is the pointer that the unlocked
+    //    test at the top of this function looks at.
+    //
+    Mutex  lock;
+    if (gPropSets[URX_GC_NORMAL] == NULL) {
+        gPropSets[URX_GC_LV]     = LV;
+        gPropSets[URX_GC_LVT]    = LVT;
+        gPropSets[URX_GC_NORMAL] = Normal;
+    } else {
+        // The sets were installed while ours were being built; discard ours.
+        delete Normal;
+        delete LV;
+        delete LVT;
+    }
+}
+
+
+
+

 //----------------------------------------------------------------------------------------
 //
@ -213,6 +309,8 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(
    ThreadSafeUnicodeSetInit(&gUnescapeCharSet,                    gUnescapeCharPattern,        status);
    ThreadSafeUnicodeSetInit(&gPropSets[URX_ISWORD_SET],           gIsWordPattern,              status);
    ThreadSafeUnicodeSetInit(&gPropSets[URX_ISSPACE_SET],          gIsSpacePattern,             status);    
+
+    InitGraphemeClusterSets();
 }


--- a/icu4c/source/i18n/regeximp.h
+++ b/icu4c/source/i18n/regeximp.h
@ -181,7 +181,7 @@ enum {

                
 //
-//  Access to Unicode Sets for Perl-like composite character properties
+//  Access to Unicode Sets for composite character properties
 //     The sets are accessed by the match engine for things like \w (word boundary)
 //     
 enum {
@ -189,7 +189,17 @@ enum {
     URX_ISALNUM_SET = 2,
     URX_ISALPHA_SET = 3,
     URX_ISSPACE_SET = 4,
-     URX_LAST_SET    = 5,
+
+     URX_GC_NORMAL,          // Sets for finding grapheme cluster boundaries.
+     URX_GC_EXTEND,
+     URX_GC_CONTROL,
+     URX_GC_L,
+     URX_GC_LV,
+     URX_GC_LVT,
+     URX_GC_V,
+     URX_GC_T,
+
+     URX_LAST_SET,

     URX_NEG_SET     = 0x800000          // Flag bit to reverse sense of set
                                         //   membership test.
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -904,46 +904,82 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
            break;


-        case URX_BACKSLASH_X:          // Match combining character sequence
-            {                          //  Closer to Grapheme cluster than to Perl \X
+        case URX_BACKSLASH_X:
+            //  Match a Grapheme, as defined by Unicode TR 29.
+            //  Differs slightly from Perl, which consumes combining marks independently
+            //    of context.
+            //
+            //  Implemented as a small state machine, one label per state:
+            //    GC_L        the char just consumed was a Hangul L
+            //    GC_V        the char just consumed was a Hangul LV or V
+            //    GC_T        the char just consumed was a Hangul LVT or T
+            //    GC_Extend   consume any following Grapheme Extend chars
+            //    GC_Control  a control char was consumed; only CR LF stays together
+            //    GC_Done     the cluster is complete
+            {
+
                 // Fail if at end of input
                 if (fp->fInputIdx >= inputLen) {
                     fp = (REStackFrame *)fStack->popFrame(frameSize);
                     break;
                 }

-                // Always consume one char
-                UChar32 c = fInput->char32At(fp->fInputIdx);   
-                fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
+                // Examine (and consume) the current char.
+                //   Dispatch into a little state machine, based on the char.
+                UChar32  c;
+                U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
+                UnicodeSet **sets = fPattern->fStaticSets;
+                if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
+                if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
+                if (sets[URX_GC_L]->contains(c))       goto GC_L;
+                if (sets[URX_GC_LV]->contains(c))      goto GC_V;
+                if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
+                // A cluster may also begin with a V or T jamo that has no L in front
+                //   of it.  It then continues exactly as it would after an LV or LVT,
+                //   so that V T and T T sequences are not split.
+                if (sets[URX_GC_V]->contains(c))       goto GC_V;
+                if (sets[URX_GC_T]->contains(c))       goto GC_T;
+                goto GC_Extend;

-                // Consume CR/LF as a pair
-                if (c == 0x0d)  { 
-                    UChar32 c = fInput->char32At(fp->fInputIdx);   
-                    if (c == 0x0a) {
-                         fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
-                         break;
+
+
+GC_L:
+                // After an L:  another L, an LV, an LVT or a V continues the syllable.
+                //   Anything else is un-read and left for GC_Extend to look at.
+                if (fp->fInputIdx >= inputLen)         goto GC_Done;
+                U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
+                if (sets[URX_GC_L]->contains(c))       goto GC_L;
+                if (sets[URX_GC_LV]->contains(c))      goto GC_V;
+                if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
+                if (sets[URX_GC_V]->contains(c))       goto GC_V;
+                U16_PREV(inputBuf, 0, fp->fInputIdx, c);
+                goto GC_Extend;
+
+GC_V:
+                // After an LV or V:  only a V or a T continues the syllable.
+                if (fp->fInputIdx >= inputLen)         goto GC_Done;
+                U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
+                if (sets[URX_GC_V]->contains(c))       goto GC_V;
+                if (sets[URX_GC_T]->contains(c))       goto GC_T;
+                U16_PREV(inputBuf, 0, fp->fInputIdx, c);
+                goto GC_Extend;
+
+GC_T:
+                // After an LVT or T:  only another T continues the syllable.
+                if (fp->fInputIdx >= inputLen)         goto GC_Done;
+                U16_NEXT(inputBuf, fp->fInputIdx, inputLen, c);
+                if (sets[URX_GC_T]->contains(c))       goto GC_T;
+                U16_PREV(inputBuf, 0, fp->fInputIdx, c);
+                goto GC_Extend;
+
+GC_Extend:
+                // Combining characters are consumed here
+                //   Each char is peeked at first, and the input index is only advanced
+                //   past it when it is in the Extend set.
+                for (;;) {
+                    if (fp->fInputIdx >= inputLen) {
+                        break;
                     }
+                    U16_GET(inputBuf, 0, fp->fInputIdx, inputLen, c);
+                    if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
+                        break;
+                    }
+                    U16_FWD_1(inputBuf, fp->fInputIdx, inputLen);
+                }
+                goto GC_Done;
+
+GC_Control:
+                // Most control chars stand alone (don't combine with combining chars),
+                //   except for that CR/LF sequence is a single grapheme cluster.
+                if (c == 0x0d && fp->fInputIdx < inputLen && inputBuf[fp->fInputIdx] == 0x0a) {
+                    fp->fInputIdx++;
                 }

-                // Consume any combining marks following a non-control char
-                int8_t ctype = u_charType(c);
-                if (ctype != U_CONTROL_CHAR) {
-                    for(;;) {   
-                        c = fInput->char32At(fp->fInputIdx);   
-                        ctype = u_charType(c);
-                        // TODO:  make a set and add the "other grapheme extend" chars
-                        //        to the list of stuff to be skipped over.
-                        if (!(ctype == U_NON_SPACING_MARK || ctype == U_ENCLOSING_MARK)) {
-                            break;
-                        }
-                        fp->fInputIdx = fInput->moveIndex32(fp->fInputIdx, 1);
-                        if (fp->fInputIdx >= inputLen) {
-                            break; 
-                        }
-                    }
-                }
+GC_Done:
+                break;
             }
-            break;
+            



--- a/icu4c/source/test/testdata/regextst.txt
+++ b/icu4c/source/test/testdata/regextst.txt
@ -92,9 +92,12 @@
 "\s+"                          "not_space<0> \t \r \n \u3000 \u2004 \u2028 \u2029</0>xyz"
 "(\S+).*?(\S+).*"              "<0><1>Not-spaces</1>   <2>more-non-spaces</2>  </0>"

-# \X  consume one combining char sequence.
+# \X  consume one Grapheme Cluster.
 "(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?"   "<0><1>A</1><2>B</2><3> </3><4>\r\n</4></0>"
 "(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?"   "<0><1>A\u0301</1><2>\n</2><3>\u0305</3><4>a\u0302\u0303\u0304</4></0>"
+"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?"   "<0><1>\u1100\u1161\u11a8</1><2>\u115f\u11a2\u11f9</2></0>"
+"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?"   "<0><1>\u1100\uac01</1><2>\uac02</2><3>\uac03\u11b0</3></0>"
+"(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?(\X)?"   "<0><1>\u1100\u1101\uac02\u0301</1><2>\u1100</2></0>"

 # ^ matches only at beginning of line
 ".*^(Hello)"                   "<0><1>Hello</1></0> Hello Hello Hello Goodbye"