ICU-1245 use NFKC for prefix analysis in Japanese

X-SVN-Rev: 5982
2001-10-01 02:58:26 +00:00 · 2001-10-01 02:58:26 +00:00 · 0180438c87
commit 0180438c87
parent 8ac6cb4b40
2 changed files with 16 additions and 10 deletions
--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@@ -27,6 +27,7 @@
 #include "unicode/tblcoll.h"
 #include "unicode/coleitr.h"
 #include "unicode/unorm.h"
+#include "unicode/normlzr.h"
 #include "unicode/udata.h"

 #include "unormimp.h"
@@ -1826,25 +1827,27 @@ uint32_t getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate
        // Contraction tables are used - so the whole process is not unlike contraction.
        // prefix data is stored backwards in the table.
        const UChar *UCharOffset;
-        UChar tchar, *sourcePointer = source->pos;
+        UChar schar, tchar, *sourcePointer = source->pos;
+        Normalizer n(source->string, source->pos-source->string, UNORM_NFKC);
+        n.last();
        for(;;) {
        // This loop will run once per source string character, for as long as we
        //  are matching a potential contraction sequence                  

        // First we position ourselves at the begining of contraction sequence 
        const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
-        if(sourcePointer != source->string) {
-          --sourcePointer;
-        } else {
-          // Ran off the beggining of the source string.
+        schar = (UChar)n.previous();
+
+        if(schar==Normalizer::DONE) {
          CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
+          break;
        }

-        while(*(sourcePointer) > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
+        while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
          UCharOffset++;
        }

-        if (*(sourcePointer) == tchar) {
+        if (schar == tchar) {
            // Found the source string char in the table.
            //  Pick up the corresponding CE from the table.
            CE = *(coll->contractionCEs +
--- a/icu4c/source/i18n/ucol_bld.cpp
+++ b/icu4c/source/i18n/ucol_bld.cpp
@@ -765,10 +765,13 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
    el.prefix = el.prefixChars;
    el.cPoints = el.uchars;
    if(tok->prefix != 0) { // adjust the source if there is a prefix
-      el.prefixSize = (tok->prefix>>24);
-      for(i = 0; i < tok->prefix>>24; i++) { // prefixes are going to be looked up backwards
+      // need to normalize to NFKC first
+      UChar buffNFKC[256];
+      el.prefixSize = unorm_normalize(src->source+(tok->prefix&0x00FFFFFF), tok->prefix>>24, UNORM_NFKC, 0, buffNFKC, 256, status);
+      for(i = 0; i < el.prefixSize; i++) { // prefixes are going to be looked up backwards
        // therefore, we will promptly reverse the prefix buffer...
-        el.prefix[i] = *(src->source+(tok->prefix& 0x00FFFFFF)+(tok->prefix>>24)-i-1);
+        //el.prefix[i] = *(src->source+(tok->prefix& 0x00FFFFFF)+(tok->prefix>>24)-i-1);
+        el.prefix[i] = *(buffNFKC+el.prefixSize-i-1);
      }
      //uprv_memcpy(el.prefix, (tok->prefix & 0x00FFFFFF) + src->source, el.prefixSize*sizeof(UChar));