ICU-1997 use uprv_isRuleWhiteSpace for parsing collation rules

X-SVN-Rev: 9354
2002-07-25 21:59:19 +00:00 · 2002-07-25 21:59:19 +00:00 · 6eea49d8e7
commit 6eea49d8e7
parent a151553d16
3 changed files with 296 additions and 300 deletions
--- a/icu4c/source/data/locales/ja.txt
+++ b/icu4c/source/data/locales/ja.txt
@ -434,7 +434,7 @@ ja {
            "&\u309A = \u309A"
            // Equaling normal and halfwidth/fullwidth characters
-            "&' '=\u3000" // IDEOGRAPHIC SPACE
+            "&' '='\u3000'" // IDEOGRAPHIC SPACE
            "&'\u0020' = '\uFFE3'" // SPACE
            "&'\u0021' = '\uFF01'" // EXCLAMATION MARK
            "&'\u0022' = '\uFF02'" // QUOTATION MARK
--- a/icu4c/source/data/unidata/UCARules.txt
+++ b/icu4c/source/data/unidata/UCARules.txt
@ -99,14 +99,14 @@
    =	 ֯
    =	 ֽ
    =	 ׄ
-    =	 ۝
+    =	 '۝'
-    =	 ۞
+    =	 '۞'
    =	 ۟
    =	 ۠
    =	 ۪
    =	 ۫
    =	 ۬
-    =	 ܏
+    =	 '܏'
    =	 ๎
    =	 ༘
    =	 ༙
@ -117,30 +117,30 @@
    =	 ྆
    =	 ྇
    =	 ࿆
-    =	 ᠋
+    =	 '᠋'
-    =	 ᠌
+    =	 '᠌'
-    =	 ᠍
+    =	 '᠍'
-    =	 ᠎
+    =	 '᠎'
-    =	 
+    =	 ''
-    =	 ‌
+    =	 '‌'
-    =	 ‍
+    =	 '‍'
-    =	 ‎
+    =	 '‎'
-    =	 ‏
+    =	 '‏'
-    =	 ‪
+    =	 '‪'
-    =	 ‫
+    =	 '‫'
-    =	 ‬
+    =	 '‬'
-    =	 ‭
+    =	 '‭'
-    =	 ‮
+    =	 '‮'
-    =	 ⁪
+    =	 '⁪'
-    =	 ⁫
+    =	 '⁫'
-    =	 ⁬
+    =	 '⁬'
-    =	 ⁭
+    =	 '⁭'
-    =	 ⁮
+    =	 '⁮'
-    =	 ⁯
+    =	 '⁯'
-    =	 
+    =	 ''
-    =	 ￹
+    =	 '￹'
-    =	 ￺
+    =	 '￺'
-    =	 ￻
+    =	 '￻'
    =	 𝅥
    =	 𝅦
    =	 𝅧
@ -555,23 +555,23 @@
 <	 '\u000C'
 <	 '\u000D'
 <	 '\u0085'
- <	  
+ <	 '\u2028'
- <	  
+ <	 ' '
 <	 '\u0020'
-   <<<	 　
+   <<<	 '　'
-   <<<	  
+   <<<	 ' '
-    =	  
+    =	 ' '
-    =	  
+    =	 ' '
-    =	  
+    =	 ' '
-    =	  
+    =	 ' '
-    =	  
+    =	 ' '
-    =	  
+    =	 ' '
-    =	  
+    =	 ' '
-   <<<	  
+   <<<	 ' '
-    =	  
+    =	 ' '
-    =	  
+    =	 ' '
- <	  
+ <	 ' '
- <	 ـ
+ <	 'ـ'
 <	 '`'
   <<<	 ｀
 <	 ´
--- a/icu4c/source/i18n/ucol_tok.cpp
+++ b/icu4c/source/i18n/ucol_tok.cpp
@ -23,6 +23,7 @@
 #include "ucol_tok.h"
 #include "cmemory.h"
 #include "uprops.h"
 U_CDECL_BEGIN
 static int32_t U_EXPORT2 U_CALLCONV
@ -712,274 +713,269 @@ ucol_tok_parseNextToken(UColTokenParser *src,
          }
      }
    }else {
-      /* Sets the strength for this entry */
+      if(!uprv_isRuleWhiteSpace(ch)) {
-      switch (ch) {
+        /* Sets the strength for this entry */
-        case 0x003D/*'='*/ : 
+        switch (ch) {
-          if (newStrength != UCOL_TOK_UNSET) {
+          case 0x003D/*'='*/ : 
-            goto EndOfLoop;
+            if (newStrength != UCOL_TOK_UNSET) {
-          }
+              goto EndOfLoop;
          /* if we start with strength, we'll reset to top */
          if(startOfRules == TRUE) {
            top = TRUE;
            newStrength = UCOL_TOK_RESET;
            goto EndOfLoop;
          }
          newStrength = UCOL_IDENTICAL;
          break;
        case 0x002C/*','*/:  
          if (newStrength != UCOL_TOK_UNSET) {
            goto EndOfLoop;
          }
          /* if we start with strength, we'll reset to top */
          if(startOfRules == TRUE) {
            top = TRUE;
            newStrength = UCOL_TOK_RESET;
            goto EndOfLoop;
          }
          newStrength = UCOL_TERTIARY;
          break;
        case  0x003B/*';'*/:
          if (newStrength != UCOL_TOK_UNSET) {
            goto EndOfLoop;
          }
          /* if we start with strength, we'll reset to top */
          if(startOfRules == TRUE) {
            top = TRUE;
            newStrength = UCOL_TOK_RESET;
            goto EndOfLoop;
          }
          newStrength = UCOL_SECONDARY;
          break;
        case 0x003C/*'<'*/:  
          if (newStrength != UCOL_TOK_UNSET) {
            goto EndOfLoop;
          }
          /* if we start with strength, we'll reset to top */
          if(startOfRules == TRUE) {
            top = TRUE;
            newStrength = UCOL_TOK_RESET;
            goto EndOfLoop;
          }
          /* before this, do a scan to verify whether this is */
          /* another strength */
          if(*(src->current+1) == 0x003C) {
            src->current++;
            if(*(src->current+1) == 0x003C) {
              src->current++; /* three in a row! */
              newStrength = UCOL_TERTIARY;
            } else { /* two in a row */
              newStrength = UCOL_SECONDARY;
            }
          } else { /* just one */
            newStrength = UCOL_PRIMARY;
          }
          break;
-        case 0x0026/*'&'*/:  
+            /* if we start with strength, we'll reset to top */
-          if (newStrength != UCOL_TOK_UNSET) {
+            if(startOfRules == TRUE) {
-            /**/
+              top = TRUE;
-            goto EndOfLoop;
+              newStrength = UCOL_TOK_RESET;
-          }
+              goto EndOfLoop;
            }
            newStrength = UCOL_IDENTICAL;
            break;
-          newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
+          case 0x002C/*','*/:  
-          break;
+            if (newStrength != UCOL_TOK_UNSET) {
              goto EndOfLoop;
            }
-        case 0x005b/*'['*/:
+            /* if we start with strength, we'll reset to top */
-          /* options - read an option, analyze it */
+            if(startOfRules == TRUE) {
-          if((optionEnd = u_strchr(src->current, 0x005d /*']'*/)) != NULL) {
+              top = TRUE;
-            uint8_t result = ucol_uprv_tok_readAndSetOption(src, optionEnd, status);
+              newStrength = UCOL_TOK_RESET;
-            src->current = optionEnd;
+              goto EndOfLoop;
-            if(U_SUCCESS(*status)) {
+            }
-              if(result & UCOL_TOK_TOP) {
+            newStrength = UCOL_TERTIARY;
-                if(newStrength == UCOL_TOK_RESET) { 
+            break;
-                  top = TRUE;
+
-                  charsOffset = (uint32_t)(src->extraCurrent - src->source);
+          case  0x003B/*';'*/:
-                  *src->extraCurrent++ = 0xFFFE;
+            if (newStrength != UCOL_TOK_UNSET) {
-                  *src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
+              goto EndOfLoop;
-                  *src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
+            }
-                  if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
+
-                    newCharsLen = 3;
+            /* if we start with strength, we'll reset to top */
            if(startOfRules == TRUE) {
              top = TRUE;
              newStrength = UCOL_TOK_RESET;
              goto EndOfLoop;
            }
            newStrength = UCOL_SECONDARY;
            break;
          case 0x003C/*'<'*/:  
            if (newStrength != UCOL_TOK_UNSET) {
              goto EndOfLoop;
            }
            /* if we start with strength, we'll reset to top */
            if(startOfRules == TRUE) {
              top = TRUE;
              newStrength = UCOL_TOK_RESET;
              goto EndOfLoop;
            }
            /* before this, do a scan to verify whether this is */
            /* another strength */
            if(*(src->current+1) == 0x003C) {
              src->current++;
              if(*(src->current+1) == 0x003C) {
                src->current++; /* three in a row! */
                newStrength = UCOL_TERTIARY;
              } else { /* two in a row */
                newStrength = UCOL_SECONDARY;
              }
            } else { /* just one */
              newStrength = UCOL_PRIMARY;
            }
            break;
          case 0x0026/*'&'*/:  
            if (newStrength != UCOL_TOK_UNSET) {
              /**/
              goto EndOfLoop;
            }
            newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
            break;
          case 0x005b/*'['*/:
            /* options - read an option, analyze it */
            if((optionEnd = u_strchr(src->current, 0x005d /*']'*/)) != NULL) {
              uint8_t result = ucol_uprv_tok_readAndSetOption(src, optionEnd, status);
              src->current = optionEnd;
              if(U_SUCCESS(*status)) {
                if(result & UCOL_TOK_TOP) {
                  if(newStrength == UCOL_TOK_RESET) { 
                    top = TRUE;
                    charsOffset = (uint32_t)(src->extraCurrent - src->source);
                    *src->extraCurrent++ = 0xFFFE;
                    *src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
                    *src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
                    if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
                      newCharsLen = 3;
                    } else {
                      *src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
                      *src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
                      newCharsLen = 5;
                    } 
                    if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
                      *src->extraCurrent++ = 0x002d;
                      *src->extraCurrent++ = before;
                      newCharsLen+=2;
                    }
                    src->current++;
                    goto EndOfLoop;
                  } else {
-                    *src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
+                    *status = U_INVALID_FORMAT_ERROR;
-                    *src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
+                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
                    newCharsLen = 5;
                  } 
                  if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
                    *src->extraCurrent++ = 0x002d;
                    *src->extraCurrent++ = before;
                    newCharsLen+=2;
                  }
                } else if(result & UCOL_TOK_VARIABLE_TOP) {
                  if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
                    variableTop = TRUE;
                    charsOffset = (uint32_t)(src->extraCurrent - src->source);
                    newCharsLen = 1;
                    *src->extraCurrent++ = 0xFFFF;
                    src->current++;
                    goto EndOfLoop;
                  } else {
                    *status = U_INVALID_FORMAT_ERROR;
                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
                  }
                } else if (result & UCOL_TOK_BEFORE){
                  if(newStrength == UCOL_TOK_RESET) {
                    before = result & UCOL_TOK_BEFORE;
                  } else {
                    *status = U_INVALID_FORMAT_ERROR;
                    syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
-                  src->current++;
+                  }
-                  goto EndOfLoop;
+                } 
-                } else {
+              } else {
-                  *status = U_INVALID_FORMAT_ERROR;
+                syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
-                  syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
+                return NULL;
-                }
+              }
-              } else if(result & UCOL_TOK_VARIABLE_TOP) {
+            }
-                if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
+            break;
-                  variableTop = TRUE;
+		  case 0x0021/*! skip java thai modifier reordering*/:
-                  charsOffset = (uint32_t)(src->extraCurrent - src->source);
+			  break; 
-                  newCharsLen = 1;
+          case 0x002F/*'/'*/:
-                  *src->extraCurrent++ = 0xFFFF;
+            wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
-                  src->current++;
+            inChars = FALSE; /* we're now processing expansion */
-                  goto EndOfLoop;
+            break;
-                } else {
+          case 0x005C /* back slash for escaped chars */:
-                  *status = U_INVALID_FORMAT_ERROR;
+              isEscaped = TRUE;
-                  syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
+              break;
-                }
+          /* found a quote, we're gonna start copying */
-              } else if (result & UCOL_TOK_BEFORE){
+          case 0x0027/*'\''*/:
-                if(newStrength == UCOL_TOK_RESET) {
+            if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
-                  before = result & UCOL_TOK_BEFORE;
+              /*
-                } else {
+			  enabling rules to start with a non-token character a < b
-                  *status = U_INVALID_FORMAT_ERROR;
+			  *status = U_INVALID_FORMAT_ERROR;
-                  syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
+              syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
              return NULL;
 			  */
              newStrength = UCOL_TOK_RESET;
            }
-                }
+            inQuote = TRUE;
-              } 
+
-            } else {
+            if(inChars) { /* we're doing characters */
              if(wasInQuote == FALSE) {
                charsOffset = (uint32_t)(src->extraCurrent - src->source);
              }
              if (newCharsLen != 0) {
                  uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
                  src->extraCurrent += newCharsLen;
              }
              newCharsLen++;
            } else { /* we're doing an expansion */
              if(wasInQuote == FALSE) {
                extensionOffset = (uint32_t)(src->extraCurrent - src->source);
              }
              if (newExtensionLen != 0) {
                uprv_memcpy(src->extraCurrent, src->current - newExtensionLen, newExtensionLen*sizeof(UChar));
                src->extraCurrent += newExtensionLen;
              }
              newExtensionLen++;
            }
            wasInQuote = TRUE;
            ch = *(++(src->current)); 
            if(ch == 0x0027) { /* copy the double quote */
              *src->extraCurrent++ = ch;
              inQuote = FALSE;
            }
            break;
          /* '@' is french only if the strength is not currently set */
          /* if it is, it's just a regular character in collation rules */
          case 0x0040/*'@'*/:
            if (newStrength == UCOL_TOK_UNSET) {
              src->opts->frenchCollation = UCOL_ON;
              break;
            }
          case 0x007C /*|*/: /* this means we have actually been reading prefix part */
            // we want to store read characters to the prefix part and continue reading
            // the characters (proper way would be to restart reading the chars, but in
            // that case we would have to complicate the token hasher, which I do not 
            // intend to play with. Instead, we will do prefixes when prefixes are due
            // (before adding the elements).
            src->parsedToken.prefixOffset = charsOffset;
            src->parsedToken.prefixLen = newCharsLen;
            if(inChars) { /* we're doing characters */
              if(wasInQuote == FALSE) {
                charsOffset = (uint32_t)(src->extraCurrent - src->source);
              }
              if (newCharsLen != 0) {
                  uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
                  src->extraCurrent += newCharsLen;
              }
              newCharsLen++;
            }
            wasInQuote = TRUE;
            ch = *(++(src->current)); 
            break;
            //charsOffset = 0;
            //newCharsLen = 0;
            //break; // We want to store the whole prefix/character sequence. If we break
                     // the '|' is going to get lost.
          default:
            if (newStrength == UCOL_TOK_UNSET) {
              /* enabling rules to start with non-tokens a < b
 			  *status = U_INVALID_FORMAT_ERROR;
              syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
              return NULL;
 			  */
 			  newStrength = UCOL_TOK_RESET;
            }
            if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
              *status = U_INVALID_FORMAT_ERROR;
              syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
              return NULL;
            }
-          }
+
-          break;
+            if(ch == 0x0000 && src->current+1 == src->end) {
-        /* Ignore the white spaces */
+              break;
-        case 0x0009/*'\t'*/:
+            }
-        case 0x000C/*'\f'*/:
+
-        case 0x000D/*'\r'*/:
+            if (inChars) {
-        case 0x000A/*'\n'*/:
+              if(newCharsLen == 0) {
-        case 0x0020/*' '*/:  
+                charsOffset = (uint32_t)(src->current - src->source);
-          break; /* skip whitespace TODO use Unicode */
+              }
-		case 0x0021/*! skip java thai modifier reordering*/:
+              newCharsLen++;
-			break; 
+            } else {
-        case 0x002F/*'/'*/:
+              if(newExtensionLen == 0) {
-          wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
+                extensionOffset = (uint32_t)(src->current - src->source);
-          inChars = FALSE; /* we're now processing expansion */
+              }
-          break;
+              newExtensionLen++;
-        case 0x005C /* back slash for escaped chars */:
+            }
-            isEscaped = TRUE;
+
            break;
-        /* found a quote, we're gonna start copying */
+          }         
-        case 0x0027/*'\''*/:
+       }
          if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
            /*
 			enabling rules to start with a non-token character a < b
 			*status = U_INVALID_FORMAT_ERROR;
            syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
            return NULL;
 			*/
            newStrength = UCOL_TOK_RESET;
          }
          inQuote = TRUE;
          if(inChars) { /* we're doing characters */
            if(wasInQuote == FALSE) {
              charsOffset = (uint32_t)(src->extraCurrent - src->source);
            }
            if (newCharsLen != 0) {
                uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
                src->extraCurrent += newCharsLen;
            }
            newCharsLen++;
          } else { /* we're doing an expansion */
            if(wasInQuote == FALSE) {
              extensionOffset = (uint32_t)(src->extraCurrent - src->source);
            }
            if (newExtensionLen != 0) {
              uprv_memcpy(src->extraCurrent, src->current - newExtensionLen, newExtensionLen*sizeof(UChar));
              src->extraCurrent += newExtensionLen;
            }
            newExtensionLen++;
          }
          wasInQuote = TRUE;
          ch = *(++(src->current)); 
          if(ch == 0x0027) { /* copy the double quote */
            *src->extraCurrent++ = ch;
            inQuote = FALSE;
          }
          break;
        /* '@' is french only if the strength is not currently set */
        /* if it is, it's just a regular character in collation rules */
        case 0x0040/*'@'*/:
          if (newStrength == UCOL_TOK_UNSET) {
            src->opts->frenchCollation = UCOL_ON;
            break;
          }
        case 0x007C /*|*/: /* this means we have actually been reading prefix part */
          // we want to store read characters to the prefix part and continue reading
          // the characters (proper way would be to restart reading the chars, but in
          // that case we would have to complicate the token hasher, which I do not 
          // intend to play with. Instead, we will do prefixes when prefixes are due
          // (before adding the elements).
          src->parsedToken.prefixOffset = charsOffset;
          src->parsedToken.prefixLen = newCharsLen;
          if(inChars) { /* we're doing characters */
            if(wasInQuote == FALSE) {
              charsOffset = (uint32_t)(src->extraCurrent - src->source);
            }
            if (newCharsLen != 0) {
                uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
                src->extraCurrent += newCharsLen;
            }
            newCharsLen++;
          }
          wasInQuote = TRUE;
          ch = *(++(src->current)); 
          break;
          //charsOffset = 0;
          //newCharsLen = 0;
          //break; // We want to store the whole prefix/character sequence. If we break
                   // the '|' is going to get lost.
        default:
          if (newStrength == UCOL_TOK_UNSET) {
            /* enabling rules to start with non-tokens a < b
 			*status = U_INVALID_FORMAT_ERROR;
            syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
            return NULL;
 			*/
 			newStrength = UCOL_TOK_RESET;
          }
          if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
            *status = U_INVALID_FORMAT_ERROR;
            syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
            return NULL;
          }
          if(ch == 0x0000 && src->current+1 == src->end) {
            break;
          }
          if (inChars) {
            if(newCharsLen == 0) {
              charsOffset = (uint32_t)(src->current - src->source);
            }
            newCharsLen++;
          } else {
            if(newExtensionLen == 0) {
              extensionOffset = (uint32_t)(src->current - src->source);
            }
            newExtensionLen++;
          }
          break;
        }
    }
    if(wasInQuote) {