ICU-2422 Regexp, optimizing find() operations, work in progress.

X-SVN-Rev: 11290
2003-03-13 01:56:01 +00:00 · 2003-03-13 01:56:01 +00:00 · fef34e930e
commit fef34e930e
parent 708317c997
6 changed files with 197 additions and 44 deletions
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -520,6 +520,12 @@ void    RegexCompile::compile(
    fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
    fRXPat->fMaxMatchLen = maxMatchLength(3, fRXPat->fCompiledPat->size()-1);

+    //
+    // Optimization pass:  Categorize how a match can start, for use by find()
+    //  
+    matchStartType();
+
+
    //
    // A stupid bit of non-sense to prevent code coverage testing from complaining
    //   about the pattern.dump() debug function.  Go through the motions of dumping,
@ -2073,24 +2079,27 @@ UBool  RegexCompile::possibleNullMatch(int32_t start, int32_t end) {
 //   matchStartType    Determine how a match can start.
 //                     Used to optimize find() operations.
 //
+//                     Operation is very similar to minMatchLength().  Walk the compiled
+//                     pattern, keeping an on-going minimum-match-length.  For any
+//                     op where the min match coming in is zero, add that op's possible
+//                     starting matches to the possible starts for the overall pattern.
+//
 //----------------------------------------------------------------------------------------
-int32_t   RegexCompile::matchStartType() {
+void   RegexCompile::matchStartType() {
    if (U_FAILURE(*fStatus)) {
-        return 0;
+        return;
    }


-    int32_t    loc;
-    int32_t    op;
-    int32_t    opType;
-    int32_t    currentLen = 0;
+    int32_t    loc;                    // Location in the pattern of the current op being processed.
+    int32_t    op;                     // The op being processed
+    int32_t    opType;                 // The opcode type of the op
+    int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
+    int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.

-    UnicodeSet   startingChars;
-    int32_t      startStringIndex;
-    int32_t      startStringLen;
-
-    UBool        atStart = TRUE;     // True if no part of the pattern yet encountered
-                                     //   could have advanced the position in a match.
+    UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
+                                       //   could have advanced the position in a match.
+                                       //   (Maximum match length so far == 0)

    // forwardedLength is a vector holding minimum-match-length values that
    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
@ -2155,19 +2164,19 @@ int32_t   RegexCompile::matchStartType() {
            if (currentLen == 0) {
                // This character could appear at the start of a match.
                //   Add it to the set of possible starting characters.
-                startingChars.add(URX_VAL(op));
+                fRXPat->fInitialChars->add(URX_VAL(op));
            }
            currentLen++;
            atStart = FALSE;
            break;
            

-        case URX_SETREF:               // TODO:  Sense of op, invert the set
+        case URX_SETREF:       
            if (currentLen == 0) {
                int32_t  sn = URX_VAL(op);
                U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
                const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
-                startingChars.addAll(*s);
+                fRXPat->fInitialChars->addAll(*s);
            }
            currentLen++;
            atStart = FALSE;
@ -2183,9 +2192,9 @@ int32_t   RegexCompile::matchStartType() {
                if (negated) {
                    UnicodeSet sc(*s);
                    sc.complement();
-                    startingChars.addAll(sc);
+                    fRXPat->fInitialChars->addAll(sc);
                } else {
-                    startingChars.addAll(*s);
+                    fRXPat->fInitialChars->addAll(*s);
                }
            }
            currentLen++;
@ -2197,9 +2206,12 @@ int32_t   RegexCompile::matchStartType() {
        case URX_BACKSLASH_D:
            // Digit Char
             if (currentLen == 0) {
-                 UnicodeSet s;   // TODO:  sense of op, invert the set.
+                 UnicodeSet s;   
                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
-                 startingChars.addAll(s);
+                 if (URX_VAL(op) != 0) {
+                     s.complement();
+                 }
+                 fRXPat->fInitialChars->addAll(s);
            }
            currentLen++;
            atStart = FALSE;
@ -2215,21 +2227,27 @@ int32_t   RegexCompile::matchStartType() {
                    //   to the set of possible starting match chars.
                    UnicodeSet s(c, c);
                    s.closeOver(USET_CASE);
-                    startingChars.addAll(s);
+                    fRXPat->fInitialChars->addAll(s);
                } else {
                    // Char has no case variants.  Just add it as-is to the
                    //   set of possible starting chars.
-                    startingChars.add(c);
+                    fRXPat->fInitialChars->add(c);
                }
            }
            currentLen++;
            atStart = FALSE;
            break;

-        case URX_BACKSLASH_W:
+
        case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
        case URX_DOTANY_ALL:    // . matches one or two.
        case URX_DOTANY:
+            if (currentLen == 0) {
+                // These constructs are all bad news when they appear at the start
+                //   of a match.  Any character can begin the match.
+                fRXPat->fInitialChars->clear();
+                fRXPat->fInitialChars->complement();
+            }
            currentLen++;
            atStart = FALSE;
            break;
@ -2252,12 +2270,14 @@ int32_t   RegexCompile::matchStartType() {
                    }
                }
            }
+            atStart = FALSE;
            break;

        case URX_FAIL:
            // Fails are kind of like a branch, except that the min length was
            //   propagated already, by the state save.
            currentLen = forwardedLength.elementAti(loc+1);
+            atStart = FALSE;
            break;


@ -2272,20 +2292,61 @@ int32_t   RegexCompile::matchStartType() {
                    }
                } 
            }
+            atStart = FALSE;
            break;
            



        case URX_STRING:
-        case URX_STRING_I:
            {
                loc++;
                int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
-                currentLen += URX_VAL(stringLenOp);
+                int32_t stringLen   = URX_VAL(stringLenOp);
+                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
+                U_ASSERT(stringLenOp >= 2);
+                if (currentLen == 0) {
+                    // Add the starting character of this string to the set of possible starting
+                    //   characters for this pattern.
+                    int32_t stringStartIdx = URX_VAL(op);
+                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
+                    fRXPat->fInitialChars->add(c);
+
+                    // Remember this string.  After the entire pattern has been checked,
+                    //  if nothing else is identified that can start a match, we'll use it.
+                    numInitialStrings++;
+                    fRXPat->fInitialStringIdx = stringStartIdx;
+                    fRXPat->fInitialStringLen = stringLen;
+                }
+                    
+                currentLen += stringLen;
+                atStart = FALSE;
            }
            break;

+        case URX_STRING_I:
+            {
+                // Case-insensitive string.  Unlike exact-match strings, we won't
+                //   attempt a string search for possible match positions.  But we
+                //   do update the set of possible starting characters.
+                loc++;
+                int32_t stringLenOp = fRXPat->fCompiledPat->elementAti(loc);
+                int32_t stringLen   = URX_VAL(stringLenOp);
+                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
+                U_ASSERT(stringLenOp >= 2);
+                if (currentLen == 0) {
+                    // Add the starting character of this string to the set of possible starting
+                    //   characters for this pattern.
+                    int32_t stringStartIdx = URX_VAL(op);
+                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
+                    UnicodeSet s(c, c);
+                    s.closeOver(USET_CASE);
+                    fRXPat->fInitialChars->addAll(s);
+                }
+                currentLen += stringLen;
+                atStart = FALSE;
+            }
+            break;

        case URX_CTR_INIT:
        case URX_CTR_INIT_NG:
@ -2295,6 +2356,7 @@ int32_t   RegexCompile::matchStartType() {
                //   so location must be updated accordingly.
                loc+=3;
            }
+            atStart = FALSE;
            break;


@ -2303,6 +2365,7 @@ int32_t   RegexCompile::matchStartType() {
        case URX_CTR_LOOP_P:
            // Loop ops. 
            //  The jump is conditional, backwards only.
+            atStart = FALSE;
            break;
            
            
@ -2312,8 +2375,6 @@ int32_t   RegexCompile::matchStartType() {
            {
                // Look-around.  Scan forward until the matching look-ahead end,
                //   without processing the look-around block.  This is overly pessimistic.
-                //   TODO:  Positive lookahead could recursively do the block, then continue
-                //          with the longer of the block or the value coming in.
                int32_t  depth = 0;
                for (;;) {
                    loc++;
@ -2337,8 +2398,9 @@ int32_t   RegexCompile::matchStartType() {
        case URX_LB_END:
        case URX_LBN_CONT:
        case URX_LBN_END:
-            // Only come here if the matching URX_LA_START or URX_LB_START was not in the
-            //   range being sized, which happens when measuring size of look-behind blocks.
+            U_ASSERT(FALSE);     // Shouldn't get here.  These ops should be 
+                                 //  consumed by the scan in URX_LA_START and LB_START
+
            break;
            
        default:
@ -2353,8 +2415,41 @@ int32_t   RegexCompile::matchStartType() {
    if (forwardedLength.elementAti(end+1) < currentLen) {
        currentLen = forwardedLength.elementAti(end+1);
    }
-            
-    return currentLen;
+
+
+    // Sort out what we should check for when looking for candidate match start positions.
+    // In order of preference,
+    //     1.   Start of input text buffer.
+    //     2.   A literal string.
+    //     3.   Start of line in multi-line mode.
+    //     4.   A single literal character.
+    //     5.   A character from a set of characters.
+    //
+    if (fRXPat->fStartType == START_START) {
+        // Match only at the start of an input text string.
+        //    start type is already set.  We're done.
+    } else if (numInitialStrings == 1 && fRXPat->fInitialChars->size() == 1) {
+        // Match beginning only with a literal string.
+        UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
+        U_ASSERT(fRXPat->fInitialChars->contains(c));
+        fRXPat->fStartType = START_STRING;
+    } else if (fRXPat->fStartType == START_LINE) {
+        // Match at start of line in Multi-Line mode.
+        // Nothing to do here; everything is already set.
+    } else if (fRXPat->fInitialChars->size() == 1) {
+        // All matches begin with the same char.
+        fRXPat->fStartType   = START_CHAR;
+        fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
+        U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
+    } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE) {
+        // Matches start with a set of characters smaller than the set of all chars.
+        fRXPat->fStartType = START_SET;
+    } else {
+        // Matches can start with anything
+        fRXPat->fStartType = START_NO_INFO;
+    }
+
+    return;
 }


@ -2444,7 +2539,6 @@ int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
        case URX_SETREF:
        case URX_BACKSLASH_D:
        case URX_ONECHAR_I:
-        case URX_BACKSLASH_W:
        case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
        case URX_DOTANY_ALL:    // . matches one or two.
        case URX_DOTANY:
@ -2662,7 +2756,6 @@ int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
        case URX_SETREF:
        case URX_BACKSLASH_D:
        case URX_ONECHAR_I:
-        case URX_BACKSLASH_W:
        case URX_DOTANY_ALL:  
        case URX_DOTANY:
            currentLen+=2;
--- a/icu4c/source/i18n/regexcmp.h
+++ b/icu4c/source/i18n/regexcmp.h
@ -111,7 +111,7 @@ private:
                               int32_t end);
    int32_t     maxMatchLength(int32_t start,
                               int32_t end);
-    int32_t     matchStartType();
+    void        matchStartType();


    UErrorCode                    *fStatus;
--- a/icu4c/source/i18n/regeximp.h
+++ b/icu4c/source/i18n/regeximp.h
@ -71,7 +71,7 @@ enum {
     URX_UNUSED        = 15,   
     URX_BACKSLASH_B   = 16,   // Value field:  0:  \b    1:  \B
     URX_BACKSLASH_G   = 17, 
-     URX_BACKSLASH_W   = 18,   // Value field:  0:  \w    1:  \W
+     URX_UNUSED_1      = 18,   // Unused.  Was URX_BACKSLASH_W (\w, \W).
     URX_BACKSLASH_X   = 19,
     URX_BACKSLASH_Z   = 20,   // \z   Unconditional end of line.

@ -169,7 +169,7 @@ enum {
        "URX_UNUSED",          \
        "URX_BACKSLASH_B",     \
        "URX_BACKSLASH_G",     \
-        "URX_BACKSLASH_W",     \
+        "URX_UNUSED_1",        \
        "URX_BACKSLASH_X",     \
        "URX_BACKSLASH_Z",     \
        "URX_DOTANY_ALL",      \
@ -258,6 +258,14 @@ enum StartOfMatch {
    START_LINE,                // Match starts with ^ in multi-line mode.
    START_STRING               // Match starts with a literal string.
 };
+
+#define START_OF_MATCH_STR(v) ((v)==START_NO_INFO? "START_NO_INFO" : \
+                               (v)==START_CHAR?    "START_CHAR"    : \
+                               (v)==START_SET?     "START_SET"     : \
+                               (v)==START_START?   "START_START"   : \
+                               (v)==START_LINE?    "START_LINE"    : \
+                               (v)==START_STRING?  "START_STRING"  : \
+                                                   "ILLEGAL")
    
 U_NAMESPACE_END
 #endif
--- a/icu4c/source/i18n/repattrn.cpp
+++ b/icu4c/source/i18n/repattrn.cpp
@ -69,9 +69,13 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
    fMinMatchLen      = other.fMinMatchLen;
    fMaxMatchLen      = other.fMaxMatchLen;
    fMaxCaptureDigits = other.fMaxCaptureDigits;
-    fStaticSets       = other.fStaticSets;    
+    fStaticSets       = other.fStaticSets; 
+    
    fStartType        = other.fStartType;
-    fStartInfo        = other.fStartInfo;
+    fInitialStringIdx = other.fInitialStringIdx;
+    fInitialStringLen = other.fInitialStringLen;
+    fInitialChars     = new UnicodeSet(*other.fInitialChars);
+    fInitialChar      = other.fInitialChar;
    if (fBadState) {
        return *this;
    }
@ -123,14 +127,18 @@ void RegexPattern::init() {
    fFrameSize        = 0;
    fDataSize         = 0;
    fStartType        = START_NO_INFO;
-    fStartInfo        = 0;
+    fInitialStringIdx = 0;
+    fInitialStringLen = 0;
+    fInitialChars     = NULL;
+    fInitialChar      = 0;
    
    UErrorCode status=U_ZERO_ERROR;
    // Init of a completely new RegexPattern.
-    fCompiledPat = new UVector32(status);
-    fGroupMap    = new UVector32(status);
-    fSets        = new UVector(status);
-    if (U_FAILURE(status) || fCompiledPat == NULL || fSets == NULL) {
+    fCompiledPat  = new UVector32(status);
+    fGroupMap     = new UVector32(status);
+    fSets         = new UVector(status);
+    fInitialChars = new UnicodeSet;
+    if (U_FAILURE(status) || fCompiledPat == NULL || fSets == NULL || fInitialChars == NULL) {
        fBadState = TRUE;
        return;
    }
@ -162,6 +170,8 @@ void RegexPattern::zap() {
    fSets = NULL;
    delete fGroupMap;
    fGroupMap = NULL;
+    delete fInitialChars;
+    fInitialChars = NULL;
 }


@ -478,7 +488,6 @@ void   RegexPattern::dumpOp(int32_t index) const {
    case URX_JMP:
    case URX_BACKSLASH_B:
    case URX_BACKSLASH_D:
-    case URX_BACKSLASH_W:
    case URX_BACKSLASH_Z:
    case URX_STRING_LEN:
    case URX_CTR_INIT:
@ -576,6 +585,41 @@ void   RegexPattern::dump() const {
    REGEX_DUMP_DEBUG_PRINTF("Pattern Valid?:     %s\n"  , fBadState? "no" : "yes");
    REGEX_DUMP_DEBUG_PRINTF("   Min Match Length:  %d\n", fMinMatchLen);
    REGEX_DUMP_DEBUG_PRINTF("   Max Match Length:  %d\n", fMaxMatchLen);
+    REGEX_DUMP_DEBUG_PRINTF("   Match Start Type:  %s\n", START_OF_MATCH_STR(fStartType));   
+    if (fStartType == START_STRING) {
+        REGEX_DUMP_DEBUG_PRINTF("    Initial match sting: \"");
+        for (i=fInitialStringIdx; i<fInitialStringIdx+fInitialStringLen; i++) {
+            REGEX_DUMP_DEBUG_PRINTF("%c", fLiteralText[i]);   // TODO:  non-printables, surrogates.
+        }
+
+    } else if (fStartType == START_SET) {
+        int32_t numSetChars = fInitialChars->size();
+        if (numSetChars > 20) {
+            numSetChars = 20;
+        }
+        REGEX_DUMP_DEBUG_PRINTF("     Match First Chars : ");
+        for (i=0; i<numSetChars; i++) {
+            UChar32 c = fInitialChars->charAt(i);
+            if (0x20<c && c <0x7e) { 
+                REGEX_DUMP_DEBUG_PRINTF("%c ", c);
+            } else {
+                REGEX_DUMP_DEBUG_PRINTF("%#x ", c);
+            }
+        }
+        if (numSetChars < fInitialChars->size()) {
+            REGEX_DUMP_DEBUG_PRINTF(" ...");
+        }
+        REGEX_DUMP_DEBUG_PRINTF("\n");
+
+    } else if (fStartType == START_CHAR) {
+        REGEX_DUMP_DEBUG_PRINTF("    First char of Match : ");
+        if (0x20 < fInitialChar && fInitialChar<0x7e) {
+                REGEX_DUMP_DEBUG_PRINTF("%c\n", fInitialChar);
+            } else {
+                REGEX_DUMP_DEBUG_PRINTF("%#x\n", fInitialChar);
+            }
+    }
+
    REGEX_DUMP_DEBUG_PRINTF("\nIndex   Binary     Type             Operand\n"
           "-------------------------------------------\n");
    for (index = 0; index<fCompiledPat->size(); index++) {
@ -584,6 +628,8 @@ void   RegexPattern::dump() const {
    REGEX_DUMP_DEBUG_PRINTF("\n\n");
 };

+
+
 const char RegexPattern::fgClassID = 0;

 //----------------------------------------------------------------------------------
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@ -351,7 +351,10 @@ private:
                                   //   regex character classes, e.g. Word.

    int32_t         fStartType;    // Info on how a match must start.
-    int32_t         fStartInfo;    //   Data, interpretation depends on start type.
+    int32_t         fInitialStringIdx;     //  
+    int32_t         fInitialStringLen;
+    UnicodeSet     *fInitialChars;  
+    UChar32         fInitialChar;

    /**
     * The address of this static class variable serves as this class's ID
--- a/icu4c/source/test/testdata/regextst.txt
+++ b/icu4c/source/test/testdata/regextst.txt
@ -279,6 +279,9 @@
 #"^(?:a?b?)*$"	        d         "a--"	
 "^(?:a?b?)*$"	                  "a--"

+"This is a string with (?:one |two |three )endings"   "<0>This is a string with two endings</0>"
+"((?:a|b|c)whoop-dee-do) | [jkl]|zed"             "x"
+"astring|another[bcd]|alpha|a|[a]"  d "x"

 #
 #  Regexps from http://www.regexlib.com