ICU-96 parsing update: use an implicit &[top] when the rules contain no reset, and handle parsing of quoted strings

X-SVN-Rev: 4054
2001-03-14 00:12:46 +00:00 · 2001-03-14 00:12:46 +00:00 · 2d87db9275
commit 2d87db9275
parent a072ac7f00
5 changed files with 72 additions and 18 deletions
--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@ -173,11 +173,14 @@ ucol_openRules(    const    UChar                  *rules,
    return 0;
  }

-  /* do we need to normalize the string beforehand? */
-
-  src.source = rules;
-  src.current = rules;
-  src.end = rules+rulesLength;
+  /*src.source = rules;*/
+  src.source = (UChar *)uprv_malloc((rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
+  uprv_memcpy(src.source, rules, rulesLength*sizeof(UChar));
+  src.current = src.source;
+  src.end = src.source+rulesLength;
+  src.sourceCurrent = src.source;
+  src.extraCurrent = src.end;
+  src.extraEnd = src.end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
  src.UCA = UCA;
  src.invUCA = ucol_initInverseUCA(status);
  src.resultLen = 0;
--- a/icu4c/source/i18n/ucol_bld.cpp
+++ b/icu4c/source/i18n/ucol_bld.cpp
@ -291,13 +291,13 @@ U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_
  } 

  if(strength == UCOL_SECONDARY) { /* similar as simple */
-    if(low > UCOL_COMMON_BOT2<<24 && low < UCOL_COMMON_TOP2<<24) {
+    if(low >= UCOL_COMMON_BOT2<<24 && low < UCOL_COMMON_TOP2<<24) {
      low = UCOL_COMMON_TOP2<<24;
    }
    if(high > UCOL_COMMON_BOT2<<24 && high < UCOL_COMMON_TOP2<<24) {
      high = UCOL_COMMON_TOP2<<24;
    } 
-    if(low <= UCOL_COMMON_BOT2<<24) {
+    if(low < UCOL_COMMON_BOT2<<24) {
      g->noOfRanges = ucol_allocWeights(UCOL_COMMON_TOP2<<24, high, count, g->ranges);
      g->current = UCOL_COMMON_BOT2;
      return g->current;
--- a/icu4c/source/i18n/ucol_tok.c
+++ b/icu4c/source/i18n/ucol_tok.c
@ -324,6 +324,7 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu

      UBool inChars = TRUE;
      UBool inQuote = FALSE;
+      UBool wasInQuote = FALSE;
      UChar *optionEnd = NULL;

      newStrength = UCOL_TOK_UNSET; 
@ -339,12 +340,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
          } else {
            if ((newCharsLen == 0) || inChars) {
              if(newCharsLen == 0) {
-                charsOffset = src->current - src->source;
+                charsOffset = src->extraCurrent - src->source;
              }
              newCharsLen++;
            } else {
              if(newExtensionsLen == 0) {
-                extensionOffset = src->current - src->source;
+                extensionOffset = src->extraCurrent - src->source;
              }
              newExtensionsLen++;
            }
@ -357,6 +358,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
                goto EndOfLoop;
              }

+              /* if we start with strength, we'll reset to top */
+              if(lastToken == NULL) {
+                top = TRUE;
+                newStrength = UCOL_TOK_RESET;
+                goto EndOfLoop;
+              }
              newStrength = UCOL_IDENTICAL;
              break;

@ -365,6 +372,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
                goto EndOfLoop;
              }

+              /* if we start with strength, we'll reset to top */
+              if(lastToken == NULL) {
+                top = TRUE;
+                newStrength = UCOL_TOK_RESET;
+                goto EndOfLoop;
+              }
              newStrength = UCOL_TERTIARY;
              break;

@ -373,6 +386,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
                goto EndOfLoop;
              }

+              /* if we start with strength, we'll reset to top */
+              if(lastToken == NULL) {
+                top = TRUE;
+                newStrength = UCOL_TOK_RESET;
+                goto EndOfLoop;
+              }
              newStrength = UCOL_SECONDARY;
              break;

@ -381,6 +400,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
                goto EndOfLoop;
              }

+              /* if we start with strength, we'll reset to top */
+              if(lastToken == NULL) {
+                top = TRUE;
+                newStrength = UCOL_TOK_RESET;
+                goto EndOfLoop;
+              }
              /* before this, do a scan to verify whether this is */
              /* another strength */
              if(*(src->current+1) == 0x003C) {
@ -436,22 +461,30 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
              inChars = FALSE;
              break;

+            /* found a quote: from here on, characters are copied into the extra buffer */
            case 0x0027/*'\''*/:
              inQuote = TRUE;
-              ch = *(++(src->current)); /*pattern[++index]; */
+              wasInQuote = TRUE;

              if (newCharsLen == 0) {
-                charsOffset = src->current - src->source;
+                charsOffset = src->extraCurrent - src->source;
                newCharsLen++;
-              } else if (inChars) {
-                if(newCharsLen == 0) {
-                  charsOffset = src->current - src->source;
+              } else if (inChars) { /* we're reading some chars */
+                charsOffset = src->extraCurrent - src->source;
+                if(newCharsLen != 0) {
+                  uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
+                  src->extraCurrent += newCharsLen;
                }
                newCharsLen++;
              } else {
+                if(newExtensionsLen != 0) {
+                  uprv_memcpy(src->extraCurrent, src->current - newExtensionsLen, newExtensionsLen*sizeof(UChar));
+                  src->extraCurrent += newExtensionsLen;
+                }
                newExtensionsLen++;
              }

+              ch = *(++(src->current)); /*pattern[++index]; */
              break;

            /* '@' is french only if the strength is not currently set */
@ -491,10 +524,20 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
            }
        }

+        if(wasInQuote) {
+          if(ch != 0x27) {
+            *src->extraCurrent++ = ch;
+          }
+          if(src->extraCurrent == src->extraEnd) {
+            /* TODO: reallocate -- not implemented yet; the extra buffer is not grown when it fills up */
+          }
+        }
+
          src->current++;
        }

     EndOfLoop:
+        wasInQuote = FALSE;
      if (newStrength == UCOL_TOK_UNSET) {
        return 0;
      }
@ -721,6 +764,7 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
          src->resultLen++;
          uhash_put(uchars2tokens, sourceToken, sourceToken, status);
        } else { /* reset to something already in rules */
+          top = FALSE;
        }
      }
      /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */  
@ -739,5 +783,6 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UErrorCode *status) {
 void ucol_tok_closeTokenList(UColTokenParser *src) {
  uhash_close(uchars2tokens);
  uprv_free(src->lh);
+  uprv_free(src->source);
 }

--- a/icu4c/source/i18n/ucol_tok.h
+++ b/icu4c/source/i18n/ucol_tok.h
@ -9,6 +9,9 @@
 #define UCOL_RESET_TOP_VALUE 0x9F000303
 #define UCOL_NEXT_TOP_VALUE  0xD0000303

+/* this is space for the extra strings that need to be unquoted */
+/* during the parsing of the rules */
+#define UCOL_TOK_EXTRA_RULE_SPACE_SIZE 1024
 typedef struct UColToken UColToken;

 typedef struct  {
@ -53,9 +56,12 @@ struct UColToken {
 };

 typedef struct {
-  const UChar *source;
-  const UChar *end;
-  const UChar *current;
+  UChar *source;
+  UChar *end;
+  UChar *current;
+  UChar *sourceCurrent;
+  UChar *extraCurrent;
+  UChar *extraEnd;
  const InverseTableHeader *invUCA;
  const UCollator *UCA;
  UCATableHeader *image;
--- a/icu4c/source/test/cintltst/cg7coll.c
+++ b/icu4c/source/test/cintltst/cg7coll.c
@ -116,7 +116,7 @@ const static int32_t results[TESTLOCALES][TOTALTESTSET] = {
    /* new table collation with rules "& Question-mark ; ? & Hash-mark ; # & Ampersand ; '&'  " loop to TOTALTESTSET */
    { 23, 24, 25, 22, 12, 13, 9, 0, 17, 16, 26, 28, 27, 15, 18, 21, 14, 1, 11, 2, 3, 4, 5, 19, 20, 6, 8, 10, 7, 29 },
    /* analogous to Japanese rules " & aa ; a- & ee ; e- & ii ; i- & oo ; o- & uu ; u- " */  /* loop to TOTALTESTSET */
-    { 19, 22, 21, 23, 24, 25, 12, 13, 9, 0, 17, 16, 26, 28, 27, 15, 18, 14, 1, 11, 2, 3, 4, 5, 20, 6, 8, 10, 7, 29 }
+    { 19, 22, 21, 24, 23, 25, 12, 13, 9, 0, 17, 16, 28, 26, 27, 15, 18, 14, 1, 11, 2, 3, 4, 5, 20, 6, 8, 10, 7, 29 }
 };

 static UChar*