ICU-96 Hangul tailoring fix, different case bit function, added comments to strcoll

X-SVN-Rev: 4761
2001-05-22 22:26:58 +00:00 · 2001-05-22 22:26:58 +00:00 · 152b11f484
commit 152b11f484
parent 94e1fd78a5
4 changed files with 179 additions and 57 deletions
--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@ -316,7 +316,7 @@ ucol_openRules(    const    UChar                  *rules,
        UCollationStrength      strength,
        UErrorCode              *status)
 {
-  uint32_t listLen = 0;
+  uint32_t listLen = 0, nSize = 0;
  UColTokenParser src;
  UColAttributeValue norm;

@ -342,9 +342,11 @@ ucol_openRules(    const    UChar                  *rules,

  /*src.source = rules;*/
  src.source = (UChar *)uprv_malloc((rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
-  uprv_memcpy(src.source, rules, rulesLength*sizeof(UChar));
+  nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src.source, rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
+  //uprv_memcpy(src.source, rules, rulesLength*sizeof(UChar));
  src.current = src.source;
-  src.end = src.source+rulesLength;
+  src.end = src.source+nSize;
+  //src.end = src.source+rulesLength;
  src.sourceCurrent = src.source;
  src.extraCurrent = src.end;
  src.extraEnd = src.end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
@ -4615,7 +4617,7 @@ ucol_strcoll( const UCollator    *coll,
    }


-
+    // setting up the collator parameters
    UColAttributeValue strength = coll->strength;
    UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);

@ -4628,63 +4630,69 @@ ucol_strcoll( const UCollator    *coll,
    UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
    UBool qShifted = shifted && checkQuad;

+    uint8_t caseSwitch = coll->caseSwitch;
+    uint8_t tertiaryMask = coll->tertiaryMask;
+
+    // This is the lowest primary value that will not be ignored if shifted
+    uint32_t LVT = (shifted)?((coll->variableMax1)<<24 | (coll->variableMax2)<<16):0;
+
    UCollationResult result = UCOL_EQUAL;
    UErrorCode status = U_ZERO_ERROR;

+    // Preparing the context objects for iterating over strings
    collIterate sColl, tColl;

-
    IInit_collIterate(coll, source, sourceLength, &sColl);
    IInit_collIterate(coll, target, targetLength, &tColl);

+    // Preparing the CE buffers. They will be filled during the primary phase
    ucol_CEBuf   sCEs;
    ucol_CEBuf   tCEs;
    UCOL_INIT_CEBUF(&sCEs);
    UCOL_INIT_CEBUF(&tCEs);

-    uint8_t caseSwitch = coll->caseSwitch;
-    uint8_t tertiaryMask = coll->tertiaryMask;
-
-    uint32_t LVT = (shifted)?((coll->variableMax1)<<24 | (coll->variableMax2)<<16):0;
-
    uint32_t secS = 0, secT = 0;
-
    uint32_t sOrder=0, tOrder=0;
+
+    // Non shifted primary processing is quite simple
    if(!shifted) {
      for(;;) {
-        /* Get the next collation element in each of the strings, unless */
-        /* we've been requested to skip it. */
-        while(sOrder == 0) {
-          sOrder = ucol_IGetNextCE(coll, &sColl, &status);
-          UCOL_CEBUF_PUT(&sCEs, sOrder, &sColl);
-          sOrder &= UCOL_PRIMARYMASK;
-        }

-        while(tOrder == 0) {
+        // We fetch CEs until we hit a non ignorable primary or end.
+        do {
+          // We get the next CE
+          sOrder = ucol_IGetNextCE(coll, &sColl, &status);
+          // Stuff it in the buffer
+          UCOL_CEBUF_PUT(&sCEs, sOrder, &sColl);
+          // And keep just the primary part.
+          sOrder &= UCOL_PRIMARYMASK;
+        } while(sOrder == 0);
+
+        // see the comments on the above block
+        do {
          tOrder = ucol_IGetNextCE(coll, &tColl, &status);
          UCOL_CEBUF_PUT(&tCEs, tOrder, &tColl);
          tOrder &= UCOL_PRIMARYMASK;
-        }
+        } while(tOrder == 0);

+        // if both primaries are the same
        if(sOrder == tOrder) {
+            // and there are no more CEs, we advance to the next level
            if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
-
              break;
-            } else {
-              sOrder = 0; tOrder = 0;
-              continue;
-            }
+            } 
        } else {
+            // if two primaries are different, we are done
            result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
            goto commonReturn;
        }
-      } /* no primary difference... do the rest from the buffers */
-    } else { /* shifted - do a slightly more complicated processing */
+      } // no primary difference... do the rest from the buffers
+    } else { // shifted - do a slightly more complicated processing :)
      for(;;) {
        UBool sInShifted = FALSE;
        UBool tInShifted = FALSE;
-
-/* This is where abridged version for shifted should go */
+        // This version of code can be refactored. However, it seems easier to understand this way.
+        // Source loop. Same as the target loop.
        for(;;) {
          sOrder = ucol_IGetNextCE(coll, &sColl, &status);
          if(sOrder == UCOL_NO_MORE_CES) {
--- a/icu4c/source/i18n/ucol_bld.cpp
+++ b/icu4c/source/i18n/ucol_bld.cpp
@ -635,22 +635,75 @@ U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UHash
  }
 }

-uint8_t ucol_uprv_getCaseBits(const UChar *s, uint32_t len, UErrorCode *status) {
+uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
  UChar n[128];
-  UChar nu[128];
+  //UChar nu[128];
+  uint32_t i = 0;

  uint32_t nLen = 0;
  uint32_t nuLen = 0;

-  nLen = unorm_normalize(s, len, UNORM_NFKD, 0, n, 128, status);
+  collIterate s;
+  uint32_t order = 0;

-  nuLen = u_strToUpper(nu, 128, n, nLen, "", status);
-  if(nuLen == nLen) {
-    if(u_strncmp(n, nu, nuLen) == 0) {
-      return UCOL_UPPER_CASE;
+  uint8_t caseBits;
+  UBool isMixed = FALSE;
+  
+  if(U_FAILURE(*status)) {
+    return UCOL_LOWER_CASE;
+  }
+
+  nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
+
+  init_collIterate(UCA, n, nLen, &s);
+
+  order = ucol_getNextCE(UCA, &s, status);
+  if(isContinuation(order)) {
+    *status = U_INTERNAL_PROGRAM_ERROR;
+    return UCOL_LOWER_CASE;
+  }
+
+  caseBits = order & UCOL_CASE_BIT_MASK;
+  for(;;) {
+    order = ucol_getNextCE(UCA, &s, status);
+    if(order == UCOL_NO_MORE_CES) {
+        break;
+    }
+    if(isContinuation(order)) { 
+      continue;
+    }
+    if(caseBits != (order & UCOL_CASE_BIT_MASK)) {
+      isMixed = TRUE;
+      break;
    }
  }

+  if(isMixed == TRUE) {
+    uint32_t noUpper = 0;
+    uint32_t noLower = 0;
+
+    // Let's analyze again, letter by letter
+    for(i = 0; i < nLen; i++) {
+      if(u_isupper(n[i]) == TRUE) {
+        noUpper++;
+      }
+      if(u_islower(n[i]) == TRUE) {
+        noLower++;
+      }
+      if(u_istitle(n[i]) == TRUE) {
+        return UCOL_MIXED_CASE;
+      }
+    }
+
+    if(noUpper > 0 && noLower > 0 && noUpper + noLower <= nLen) {
+      return UCOL_MIXED_CASE;
+    }
+  }
+
+  return caseBits;
+
+
+#if 0
  nuLen = u_strToLower(nu, 128, n, nLen, "", status);
  if(nuLen == nLen) {
    if(u_strncmp(n, nu, nuLen) == 0) {
@ -658,7 +711,14 @@ uint8_t ucol_uprv_getCaseBits(const UChar *s, uint32_t len, UErrorCode *status)
    }
  }

+  nuLen = u_strToUpper(nu, 128, n, nLen, "", status);
+  if(nuLen == nLen) {
+    if(u_strncmp(n, nu, nuLen) == 0) {
+      return UCOL_UPPER_CASE;
+    }
+  }
  return UCOL_MIXED_CASE;
+#endif

 }

@ -699,13 +759,14 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
          /* will have to get one from UCA */
          /* first, get the UChars from the rules */
          /* then pick CEs out until there is no more and stuff them into expansion */
-          UChar source[256],buff[256];
+          //UChar source[256],buff[256];
          collIterate s;
          uint32_t order = 0;
-          uint32_t normSize = 0;
-          uprv_memcpy(buff, expOffset + src->source, 1*sizeof(UChar));
-          normSize = unorm_normalize(buff, 1, UNORM_NFD, 0, source, 256, status);
-          init_collIterate(src->UCA, source, normSize, &s);
+          //uint32_t normSize = 0;
+          //uprv_memcpy(buff, expOffset + src->source, 1*sizeof(UChar));
+          //normSize = unorm_normalize(buff, 1, UNORM_NFD, 0, source, 256, status);
+          //init_collIterate(src->UCA, source, normSize, &s);
+          init_collIterate(src->UCA, expOffset + src->source, 1, &s);

          for(;;) {
            order = ucol_getNextCE(src->UCA, &s, status);
@ -735,11 +796,13 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL

    /* copy UChars */

-    UChar buff[128];
-    uint32_t decompSize;
-    uprv_memcpy(buff, (tok->source & 0x00FFFFFF) + src->source, (tok->source >> 24)*sizeof(UChar));
-    decompSize = unorm_normalize(buff, tok->source >> 24, UNORM_NFD, 0, el.uchars, 128, status);
-    el.cSize = decompSize; /*(tok->source >> 24); *//* + (tok->expansion >> 24);*/
+    //UChar buff[128];
+    //uint32_t decompSize;
+    //uprv_memcpy(buff, (tok->source & 0x00FFFFFF) + src->source, (tok->source >> 24)*sizeof(UChar));
+    //decompSize = unorm_normalize(buff, tok->source >> 24, UNORM_NFD, 0, el.uchars, 128, status);
+    //el.cSize = decompSize; /*(tok->source >> 24); *//* + (tok->expansion >> 24);*/
+    el.cSize = (tok->source >> 24); 
+    uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
    el.cPoints = el.uchars;

    if(UCOL_ISTHAIPREVOWEL(el.cPoints[0])) {
@ -760,7 +823,7 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
    el.CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
    if(el.cSize > 1) {
      // Do it manually
-      el.CEs[0] |= ucol_uprv_getCaseBits(el.cPoints, el.cSize, status);
+      el.CEs[0] |= ucol_uprv_getCaseBits(src->UCA, el.cPoints, el.cSize, status);
    } else {
      // Copy it from the UCA
      uint32_t caseCE = ucol_getFirstCE(src->UCA, el.cPoints[0], status);
--- a/icu4c/source/i18n/ucol_tok.cpp
+++ b/icu4c/source/i18n/ucol_tok.cpp
@ -734,6 +734,16 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
          src->varTop = sourceToken;
        }

+        /*
+          If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * 
+          d * ... into &x * c/y * d * ... 
+        */
+        if(expandNext != 0 && sourceToken->expansion == 0) {
+          sourceToken->expansion = expandNext;
+          sourceToken->debugExpansion = *(src->source + (expandNext & 0xFFFFFF));
+          //expandNext = 0;
+        }
+
        /*
        1.	Find the strongest strength in each list, and set strongestP and strongestN 
        accordingly in the headers. 
@ -769,16 +779,6 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
              lastToken->next = sourceToken;
            }
          }
-
-          /*
-            If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * 
-            d * ... into &x * c/y * d * ... 
-          */
-          if(expandNext != 0 && sourceToken->expansion == 0) {
-            sourceToken->expansion = expandNext;
-            sourceToken->debugExpansion = *(src->source + (expandNext & 0xFFFFFF));
-            expandNext = 0;
-          }
        } else {
        /* Otherwise (when LAST is not a reset) 
              if polarity (LAST) == polarity(relation), insert sourceToken after LAST, 
--- a/icu4c/source/test/cintltst/cmsccoll.c
+++ b/icu4c/source/test/cintltst/cmsccoll.c
@ -1954,7 +1954,56 @@ static void TestIncrementalNormalize() {
    uprv_free(strB);
 }

+#if 0
+/* Compares ucol_uprv_getCaseBits with hand-picked expectations (disabled by the surrounding #if 0); closes the collator it opens. */
+static void TestGetCaseBit() {
+  static const char *caseBitData[] = {
+    "a", "A", "ch", "Ch", "CH", "\\uFF9E", "\\u0009"
+  };

+  static const uint8_t results[] = {
+    UCOL_LOWER_CASE, UCOL_UPPER_CASE, UCOL_LOWER_CASE, UCOL_MIXED_CASE, UCOL_UPPER_CASE,
+      UCOL_UPPER_CASE, UCOL_LOWER_CASE
+  };
+  uint32_t i, blen = 0;
+  UChar b[256] = {0};
+  UErrorCode status = U_ZERO_ERROR;
+  UCollator *UCA = ucol_open("", &status);
+  uint8_t res = 0;
+  if(U_FAILURE(status)) { log_err("Unable to open the UCA collator\n"); return; } /* nothing to test or close */
+  for(i = 0; i<sizeof(results)/sizeof(results[0]); i++) {
+    blen = u_unescape(caseBitData[i], b, 256);
+    res = ucol_uprv_getCaseBits(UCA, b, blen, &status);
+    if(results[i] != res) {
+      log_err("Expected case = %02X, got %02X for %04X\n", results[i], res, b[0]);
+    }
+  }
+  ucol_close(UCA); /* release the collator opened above; the original leaked it */
+}
+#endif
+
+/* Passes a <<< tailoring of 44 Han ideographs after U+AC00, plus the matching 45 strings, to genericRulesStarter. */
+static void TestHangulTailoring() {
+  /* One escaped character per element, in rule order; the commas belong between the literals, not inside them. */
+  static const char *koreanData[] = {
+    "\\uac00", "\\u4f3d", "\\u4f73", "\\u5047", "\\u50f9", "\\u52a0", "\\u53ef", "\\u5475",
+    "\\u54e5", "\\u5609", "\\u5ac1", "\\u5bb6", "\\u6687", "\\u67b6", "\\u67b7", "\\u67ef",
+    "\\u6b4c", "\\u73c2", "\\u75c2", "\\u7a3c", "\\u82db", "\\u8304", "\\u8857", "\\u8888",
+    "\\u8a36", "\\u8cc8", "\\u8dcf", "\\u8efb", "\\u8fe6", "\\u99d5",
+    "\\u4EEE", "\\u50A2", "\\u5496", "\\u54FF", "\\u5777", "\\u5B8A", "\\u659D", "\\u698E",
+    "\\u6A9F", "\\u73C8", "\\u7B33", "\\u801E", "\\u8238", "\\u846D", "\\u8B0C"
+  };
+  /* The fragments below concatenate into one rule string; genericRulesStarter is defined elsewhere (presumably builds the collator and checks the order - confirm). */
+  const char *rules =
+    "&\\uac00 <<< \\u4f3d <<< \\u4f73 <<< \\u5047 <<< \\u50f9 <<< \\u52a0 <<< \\u53ef <<< \\u5475 "
+    "<<< \\u54e5 <<< \\u5609 <<< \\u5ac1 <<< \\u5bb6 <<< \\u6687 <<< \\u67b6 <<< \\u67b7 <<< \\u67ef "
+    "<<< \\u6b4c <<< \\u73c2 <<< \\u75c2 <<< \\u7a3c <<< \\u82db <<< \\u8304 <<< \\u8857 <<< \\u8888 "
+    "<<< \\u8a36 <<< \\u8cc8 <<< \\u8dcf <<< \\u8efb <<< \\u8fe6 <<< \\u99d5 "
+    "<<< \\u4EEE <<< \\u50A2 <<< \\u5496 <<< \\u54FF <<< \\u5777 <<< \\u5B8A <<< \\u659D <<< \\u698E " /* k1 */
+    "<<< \\u6A9F <<< \\u73C8 <<< \\u7B33 <<< \\u801E <<< \\u8238 <<< \\u846D <<< \\u8B0C";
+
+  genericRulesStarter(rules, koreanData, sizeof(koreanData)/sizeof(koreanData[0]));
+}

 void addMiscCollTest(TestNode** root)
 {
@ -1975,11 +2024,13 @@ void addMiscCollTest(TestNode** root)
    addTest(root, &TestJ831, "tscoll/cmsccoll/TestJ831");
    addTest(root, &TestBefore, "tscoll/cmsccoll/TestBefore");
    addTest(root, &TestRedundantRules, "tscoll/cmsccoll/TestRedundantRules");
+    addTest(root, &TestHangulTailoring, "tscoll/cmsccoll/TestHangulTailoring");
    /*addTest(root, &TestUCAZero, "tscoll/cmsccoll/TestUCAZero");*/
    /*addTest(root, &TestUnmappedSpaces, "tscoll/cmsccoll/TestUnmappedSpaces");*/
    /*addTest(root, &PrintMarkDavis, "tscoll/cmsccoll/PrintMarkDavis");*/
    /*addTest(root, &TestVariableTop, "tscoll/cmsccoll/TestVariableTop");*/
    addTest(root, &TestIncrementalNormalize, "tscoll/cmsccoll/TestIncrementalNormalize");
    addTest(root, &TestComposeDecompose, "tscoll/cmsccoll/TestComposeDecompose");
+    /*addTest(root, &TestGetCaseBit, "tscoll/cmsccoll/TestGetCaseBit");*/
 }