From 152b11f4841c9cebde27395329a57cd5cbceabc4 Mon Sep 17 00:00:00 2001
From: Vladimir Weinstein <icu@weivsara.com>
Date: Tue, 22 May 2001 22:26:58 +0000
Subject: [PATCH] ICU-96 Hangul tailoring fix, different case bit function,
 added comments to strcoll

X-SVN-Rev: 4761
---
 icu4c/source/i18n/ucol.cpp            | 66 ++++++++++--------
 icu4c/source/i18n/ucol_bld.cpp        | 99 ++++++++++++++++++++++-----
 icu4c/source/i18n/ucol_tok.cpp        | 20 +++---
 icu4c/source/test/cintltst/cmsccoll.c | 51 ++++++++++++++
 4 files changed, 179 insertions(+), 57 deletions(-)

diff --git a/icu4c/source/i18n/ucol.cpp b/icu4c/source/i18n/ucol.cpp
index 5ce5508aef..7aea9d4b59 100644
--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@@ -316,7 +316,7 @@ ucol_openRules(    const    UChar                  *rules,
         UCollationStrength      strength,
         UErrorCode              *status)
 {
-  uint32_t listLen = 0;
+  uint32_t listLen = 0, nSize = 0;
   UColTokenParser src;
   UColAttributeValue norm;
 
@@ -342,9 +342,11 @@ ucol_openRules(    const    UChar                  *rules,
 
   /*src.source = rules;*/
   src.source = (UChar *)uprv_malloc((rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
-  uprv_memcpy(src.source, rules, rulesLength*sizeof(UChar));
+  nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src.source, rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
+  //uprv_memcpy(src.source, rules, rulesLength*sizeof(UChar));
   src.current = src.source;
-  src.end = src.source+rulesLength;
+  src.end = src.source+nSize;
+  //src.end = src.source+rulesLength;
   src.sourceCurrent = src.source;
   src.extraCurrent = src.end;
   src.extraEnd = src.end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
@@ -4615,7 +4617,7 @@ ucol_strcoll( const UCollator    *coll,
     }
 
 
-
+    // setting up the collator parameters
     UColAttributeValue strength = coll->strength;
     UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
 
@@ -4628,63 +4630,69 @@ ucol_strcoll( const UCollator    *coll,
     UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
     UBool qShifted = shifted && checkQuad;
 
+    uint8_t caseSwitch = coll->caseSwitch;
+    uint8_t tertiaryMask = coll->tertiaryMask;
+
+    // This is the lowest primary value that will not be ignored if shifted
+    uint32_t LVT = (shifted)?((coll->variableMax1)<<24 | (coll->variableMax2)<<16):0;
+
     UCollationResult result = UCOL_EQUAL;
     UErrorCode status = U_ZERO_ERROR;
 
+    // Preparing the context objects for iterating over strings
     collIterate sColl, tColl;
 
-
     IInit_collIterate(coll, source, sourceLength, &sColl);
     IInit_collIterate(coll, target, targetLength, &tColl);
 
+    // Preparing the CE buffers. They will be filled during the primary phase
     ucol_CEBuf   sCEs;
     ucol_CEBuf   tCEs;
     UCOL_INIT_CEBUF(&sCEs);
     UCOL_INIT_CEBUF(&tCEs);
 
-    uint8_t caseSwitch = coll->caseSwitch;
-    uint8_t tertiaryMask = coll->tertiaryMask;
-
-    uint32_t LVT = (shifted)?((coll->variableMax1)<<24 | (coll->variableMax2)<<16):0;
-
     uint32_t secS = 0, secT = 0;
-
     uint32_t sOrder=0, tOrder=0;
+
+    // Non shifted primary processing is quite simple
     if(!shifted) {
       for(;;) {
-        /* Get the next collation element in each of the strings, unless */
-        /* we've been requested to skip it. */
-        while(sOrder == 0) {
-          sOrder = ucol_IGetNextCE(coll, &sColl, &status);
-          UCOL_CEBUF_PUT(&sCEs, sOrder, &sColl);
-          sOrder &= UCOL_PRIMARYMASK;
-        }
 
-        while(tOrder == 0) {
+        // We fetch CEs until we hit a non ignorable primary or end.
+        do {
+          // We get the next CE
+          sOrder = ucol_IGetNextCE(coll, &sColl, &status);
+          // Stuff it in the buffer
+          UCOL_CEBUF_PUT(&sCEs, sOrder, &sColl);
+          // And keep just the primary part.
+          sOrder &= UCOL_PRIMARYMASK;
+        } while(sOrder == 0);
+
+        // see the comments on the above block
+        do {
           tOrder = ucol_IGetNextCE(coll, &tColl, &status);
           UCOL_CEBUF_PUT(&tCEs, tOrder, &tColl);
           tOrder &= UCOL_PRIMARYMASK;
-        }
+        } while(tOrder == 0);
 
+        // if both primaries are the same
         if(sOrder == tOrder) {
+            // and there are no more CEs, we advance to the next level
             if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
-
               break;
-            } else {
-              sOrder = 0; tOrder = 0;
-              continue;
-            }
+            } 
         } else {
+            // if two primaries are different, we are done
             result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
             goto commonReturn;
         }
-      } /* no primary difference... do the rest from the buffers */
-    } else { /* shifted - do a slightly more complicated processing */
+      } // no primary difference... do the rest from the buffers
+    } else { // shifted - do a slightly more complicated processing :)
       for(;;) {
         UBool sInShifted = FALSE;
         UBool tInShifted = FALSE;
-
-/* This is where abridged version for shifted should go */
+        // This version of code can be refactored. However, it seems easier to understand this way.
+        // Source loop. Same as the target loop.
         for(;;) {
           sOrder = ucol_IGetNextCE(coll, &sColl, &status);
           if(sOrder == UCOL_NO_MORE_CES) {
diff --git a/icu4c/source/i18n/ucol_bld.cpp b/icu4c/source/i18n/ucol_bld.cpp
index 0afc8a494e..6abc7d0596 100644
--- a/icu4c/source/i18n/ucol_bld.cpp
+++ b/icu4c/source/i18n/ucol_bld.cpp
@@ -635,22 +635,75 @@ U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UHash
   }
 }
 
-uint8_t ucol_uprv_getCaseBits(const UChar *s, uint32_t len, UErrorCode *status) {
+uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
   UChar n[128];
-  UChar nu[128];
+  //UChar nu[128];
+  uint32_t i = 0;
 
   uint32_t nLen = 0;
   uint32_t nuLen = 0;
 
-  nLen = unorm_normalize(s, len, UNORM_NFKD, 0, n, 128, status);
+  collIterate s;
+  uint32_t order = 0;
 
-  nuLen = u_strToUpper(nu, 128, n, nLen, "", status);
-  if(nuLen == nLen) {
-    if(u_strncmp(n, nu, nuLen) == 0) {
-      return UCOL_UPPER_CASE;
+  uint8_t caseBits;
+  UBool isMixed = FALSE;
+  
+  if(U_FAILURE(*status)) {
+    return UCOL_LOWER_CASE;
+  }
+
+  nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
+
+  init_collIterate(UCA, n, nLen, &s);
+
+  order = ucol_getNextCE(UCA, &s, status);
+  if(isContinuation(order)) {
+    *status = U_INTERNAL_PROGRAM_ERROR;
+    return UCOL_LOWER_CASE;
+  }
+
+  caseBits = order & UCOL_CASE_BIT_MASK;
+  for(;;) {
+    order = ucol_getNextCE(UCA, &s, status);
+    if(order == UCOL_NO_MORE_CES) {
+        break;
+    }
+    if(isContinuation(order)) { 
+      continue;
+    }
+    if(caseBits != (order & UCOL_CASE_BIT_MASK)) {
+      isMixed = TRUE;
+      break;
     }
   }
 
+  if(isMixed == TRUE) {
+    uint32_t noUpper = 0;
+    uint32_t noLower = 0;
+
+    // Let's analyze again, letter by letter
+    for(i = 0; i < nLen; i++) {
+      if(u_isupper(n[i]) == TRUE) {
+        noUpper++;
+      }
+      if(u_islower(n[i]) == TRUE) {
+        noLower++;
+      }
+      if(u_istitle(n[i]) == TRUE) {
+        return UCOL_MIXED_CASE;
+      }
+    }
+
+    if(noUpper > 0 && noLower > 0 && noUpper + noLower <= nLen) {
+      return UCOL_MIXED_CASE;
+    }
+  }
+
+  return caseBits;
+
+
+#if 0
   nuLen = u_strToLower(nu, 128, n, nLen, "", status);
   if(nuLen == nLen) {
     if(u_strncmp(n, nu, nuLen) == 0) {
@@ -658,7 +711,14 @@ uint8_t ucol_uprv_getCaseBits(const UChar *s, uint32_t len, UErrorCode *status)
     }
   }
 
+  nuLen = u_strToUpper(nu, 128, n, nLen, "", status);
+  if(nuLen == nLen) {
+    if(u_strncmp(n, nu, nuLen) == 0) {
+      return UCOL_UPPER_CASE;
+    }
+  }
   return UCOL_MIXED_CASE;
+#endif
 
 }
 
@@ -699,13 +759,14 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
           /* will have to get one from UCA */
           /* first, get the UChars from the rules */
           /* then pick CEs out until there is no more and stuff them into expansion */
-          UChar source[256],buff[256];
+          //UChar source[256],buff[256];
           collIterate s;
           uint32_t order = 0;
-          uint32_t normSize = 0;
-          uprv_memcpy(buff, expOffset + src->source, 1*sizeof(UChar));
-          normSize = unorm_normalize(buff, 1, UNORM_NFD, 0, source, 256, status);
-          init_collIterate(src->UCA, source, normSize, &s);
+          //uint32_t normSize = 0;
+          //uprv_memcpy(buff, expOffset + src->source, 1*sizeof(UChar));
+          //normSize = unorm_normalize(buff, 1, UNORM_NFD, 0, source, 256, status);
+          //init_collIterate(src->UCA, source, normSize, &s);
+          init_collIterate(src->UCA, expOffset + src->source, 1, &s);
 
           for(;;) {
             order = ucol_getNextCE(src->UCA, &s, status);
@@ -735,11 +796,13 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
 
     /* copy UChars */
 
-    UChar buff[128];
-    uint32_t decompSize;
-    uprv_memcpy(buff, (tok->source & 0x00FFFFFF) + src->source, (tok->source >> 24)*sizeof(UChar));
-    decompSize = unorm_normalize(buff, tok->source >> 24, UNORM_NFD, 0, el.uchars, 128, status);
-    el.cSize = decompSize; /*(tok->source >> 24); *//* + (tok->expansion >> 24);*/
+    //UChar buff[128];
+    //uint32_t decompSize;
+    //uprv_memcpy(buff, (tok->source & 0x00FFFFFF) + src->source, (tok->source >> 24)*sizeof(UChar));
+    //decompSize = unorm_normalize(buff, tok->source >> 24, UNORM_NFD, 0, el.uchars, 128, status);
+    //el.cSize = decompSize; /*(tok->source >> 24); *//* + (tok->expansion >> 24);*/
+    el.cSize = (tok->source >> 24); 
+    uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
     el.cPoints = el.uchars;
 
     if(UCOL_ISTHAIPREVOWEL(el.cPoints[0])) {
@@ -760,7 +823,7 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
     el.CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
     if(el.cSize > 1) {
       // Do it manually
-      el.CEs[0] |= ucol_uprv_getCaseBits(el.cPoints, el.cSize, status);
+      el.CEs[0] |= ucol_uprv_getCaseBits(src->UCA, el.cPoints, el.cSize, status);
     } else {
       // Copy it from the UCA
       uint32_t caseCE = ucol_getFirstCE(src->UCA, el.cPoints[0], status);
diff --git a/icu4c/source/i18n/ucol_tok.cpp b/icu4c/source/i18n/ucol_tok.cpp
index a652f03ccc..e9389dcfae 100644
--- a/icu4c/source/i18n/ucol_tok.cpp
+++ b/icu4c/source/i18n/ucol_tok.cpp
@@ -734,6 +734,16 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
           src->varTop = sourceToken;
         }
 
+        /*
+          If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * 
+          d * ... into &x * c/y * d * ... 
+        */
+        if(expandNext != 0 && sourceToken->expansion == 0) {
+          sourceToken->expansion = expandNext;
+          sourceToken->debugExpansion = *(src->source + (expandNext & 0xFFFFFF));
+          //expandNext = 0;
+        }
+
         /*
         1.	Find the strongest strength in each list, and set strongestP and strongestN 
         accordingly in the headers. 
@@ -769,16 +779,6 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
               lastToken->next = sourceToken;
             }
           }
-
-          /*
-            If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c * 
-            d * ... into &x * c/y * d * ... 
-          */
-          if(expandNext != 0 && sourceToken->expansion == 0) {
-            sourceToken->expansion = expandNext;
-            sourceToken->debugExpansion = *(src->source + (expandNext & 0xFFFFFF));
-            expandNext = 0;
-          }
         } else {
         /* Otherwise (when LAST is not a reset) 
               if polarity (LAST) == polarity(relation), insert sourceToken after LAST, 
diff --git a/icu4c/source/test/cintltst/cmsccoll.c b/icu4c/source/test/cintltst/cmsccoll.c
index b875431ee9..9f25ae94b3 100644
--- a/icu4c/source/test/cintltst/cmsccoll.c
+++ b/icu4c/source/test/cintltst/cmsccoll.c
@@ -1954,7 +1954,56 @@ static void TestIncrementalNormalize() {
     uprv_free(strB);
 }
 
+#if 0
+static void TestGetCaseBit() {
+  static const char *caseBitData[] = {
+    "a", "A", "ch", "Ch", "CH",
+      "\\uFF9E", "\\u0009"
+  };
 
+  static const uint8_t results[] = {
+    UCOL_LOWER_CASE, UCOL_UPPER_CASE, UCOL_LOWER_CASE, UCOL_MIXED_CASE, UCOL_UPPER_CASE,
+      UCOL_UPPER_CASE, UCOL_LOWER_CASE
+  };
+
+  uint32_t i, blen = 0;
+  UChar b[256] = {0};
+  UErrorCode status = U_ZERO_ERROR;
+  UCollator *UCA = ucol_open("", &status);
+  uint8_t res = 0;
+  
+  for(i = 0; i<sizeof(results)/sizeof(results[0]); i++) {
+    blen = u_unescape(caseBitData[i], b, 256);
+    res = ucol_uprv_getCaseBits(UCA, b, blen, &status);
+    if(results[i] != res) {
+      log_err("Expected case = %02X, got %02X for %04X\n", results[i], res, b[0]);
+    }
+  }
+}
+#endif
+
+static void TestHangulTailoring() {
+  /* Chains a run of Han characters after U+AC00 with <<< and runs the rule through genericRulesStarter. */
+  static const char *koreanData[] = {
+    "\\uac00", "\\u4f3d", "\\u4f73", "\\u5047", "\\u50f9", "\\u52a0", "\\u53ef", "\\u5475",
+        "\\u54e5", "\\u5609", "\\u5ac1", "\\u5bb6", "\\u6687", "\\u67b6", "\\u67b7", "\\u67ef",
+        "\\u6b4c", "\\u73c2", "\\u75c2", "\\u7a3c", "\\u82db", "\\u8304", "\\u8857", "\\u8888",
+        "\\u8a36", "\\u8cc8", "\\u8dcf", "\\u8efb", "\\u8fe6", "\\u99d5",
+        "\\u4EEE", "\\u50A2", "\\u5496", "\\u54FF", "\\u5777", "\\u5B8A", "\\u659D", "\\u698E",
+        "\\u6A9F", "\\u73C8", "\\u7B33", "\\u801E", "\\u8238", "\\u846D", "\\u8B0C"
+  };
+
+  /* Lists the same characters, in the same order, as koreanData above. */
+  const char *rules =
+        "&\\uac00 <<< \\u4f3d <<< \\u4f73 <<< \\u5047 <<< \\u50f9 <<< \\u52a0 <<< \\u53ef <<< \\u5475 "
+        "<<< \\u54e5 <<< \\u5609 <<< \\u5ac1 <<< \\u5bb6 <<< \\u6687 <<< \\u67b6 <<< \\u67b7 <<< \\u67ef "
+        "<<< \\u6b4c <<< \\u73c2 <<< \\u75c2 <<< \\u7a3c <<< \\u82db <<< \\u8304 <<< \\u8857 <<< \\u8888 "
+        "<<< \\u8a36 <<< \\u8cc8 <<< \\u8dcf <<< \\u8efb <<< \\u8fe6 <<< \\u99d5 "
+        "<<< \\u4EEE <<< \\u50A2 <<< \\u5496 <<< \\u54FF <<< \\u5777 <<< \\u5B8A <<< \\u659D <<< \\u698E " /* k1 */
+        "<<< \\u6A9F <<< \\u73C8 <<< \\u7B33 <<< \\u801E <<< \\u8238 <<< \\u846D <<< \\u8B0C";
+
+  genericRulesStarter(rules, koreanData, sizeof(koreanData)/sizeof(koreanData[0]));
+}
 
 void addMiscCollTest(TestNode** root)
 {
@@ -1975,11 +2024,13 @@ void addMiscCollTest(TestNode** root)
     addTest(root, &TestJ831, "tscoll/cmsccoll/TestJ831");
     addTest(root, &TestBefore, "tscoll/cmsccoll/TestBefore");
     addTest(root, &TestRedundantRules, "tscoll/cmsccoll/TestRedundantRules");
+    addTest(root, &TestHangulTailoring, "tscoll/cmsccoll/TestHangulTailoring");
     /*addTest(root, &TestUCAZero, "tscoll/cmsccoll/TestUCAZero");*/
     /*addTest(root, &TestUnmappedSpaces, "tscoll/cmsccoll/TestUnmappedSpaces");*/
     /*addTest(root, &PrintMarkDavis, "tscoll/cmsccoll/PrintMarkDavis");*/
     /*addTest(root, &TestVariableTop, "tscoll/cmsccoll/TestVariableTop");*/
     addTest(root, &TestIncrementalNormalize, "tscoll/cmsccoll/TestIncrementalNormalize");
     addTest(root, &TestComposeDecompose, "tscoll/cmsccoll/TestComposeDecompose");
+    /*addTest(root, &TestGetCaseBit, "tscoll/cmsccoll/TestGetCaseBit");*/
 }