ICU-7182 remove support for Unicode Normalization behavior before corrigendum 5 (PRI #29), and small simplification of recompose()

X-SVN-Rev: 26954
2009-11-19 05:41:12 +00:00 · 2009-11-19 05:41:12 +00:00 · 16168ea78a
commit 16168ea78a
parent 6b848d4b3f
3 changed files with 63 additions and 131 deletions
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/NormalizerImpl.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/NormalizerImpl.java
@ -1536,125 +1536,83 @@ public final class NormalizerImpl {
            combineBackIndex=ncArg.combiningIndex;
            args.start = ncArg.start;
                        
-            if(((combineFlags&COMBINES_BACK)!=0) && starter!=-1) {
+            if(
+                // this character combines backward and
+                ((combineFlags&COMBINES_BACK)!=0) &&
+                // we have seen a starter that combines forward and
+                starter>=0 &&
+                // the backward-combining character is not blocked
+                (prevCC<ncArg.cc || prevCC==0)
+            ) {
                if((combineBackIndex&0x8000)!=0) {
-                    /* c is a Jamo V/T, see if we can compose it with the 
-                     * previous character 
-                     */
-                    /* for the PRI #29 fix, check that there is no intervening combining mark */
-                    if((options&BEFORE_PRI_29)!=0 || prevCC==0) {
-                        remove=-1; /* NULL while no Hangul composition */
-                        combineFlags=0;
-                        ncArg.c2=args.source[starter];
-                        if(combineBackIndex==0xfff2) {
-                            /* Jamo V, compose with previous Jamo L and following 
-                             * Jamo T 
-                             */
-                            ncArg.c2=(char)(ncArg.c2-JAMO_L_BASE);
-                            if(ncArg.c2<JAMO_L_COUNT) {
-                                remove=args.start-1;
-                                ncArg.c=(char)(HANGUL_BASE+(ncArg.c2*JAMO_V_COUNT+
-                                               (ncArg.c-JAMO_V_BASE))*JAMO_T_COUNT);
-                                if(args.start!=args.limit && 
-                                            (ncArg.c2=(char)(args.source[args.start]
-                                             -JAMO_T_BASE))<JAMO_T_COUNT) {
-                                    ++args.start;
-                                    ncArg.c+=ncArg.c2;
-                                 } else {
-                                     /* the result is an LV syllable, which is a starter (unlike LVT) */
-                                     combineFlags=COMBINES_FWD;
-                                }
-                                if(!nx_contains(nx, ncArg.c)) {
-                                    args.source[starter]=ncArg.c;
-                                   } else {
-                                    /* excluded */
-                                    if(!isHangulWithoutJamoT(ncArg.c)) {
-                                        --args.start; /* undo the ++args.start from reading the Jamo T */
-                                    }
-                                    /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
-                                    remove=args.start;
+                    /* c is a Jamo V/T, see if we can compose it with the previous character */
+                    remove=-1; /* -1 (no index) while no Hangul composition */
+                    combineFlags=0;
+                    ncArg.c2=args.source[starter];
+                    if(combineBackIndex==0xfff2) {
+                        /* Jamo V, compose with previous Jamo L and following Jamo T */
+                        ncArg.c2=(char)(ncArg.c2-JAMO_L_BASE);
+                        if(ncArg.c2<JAMO_L_COUNT) {
+                            remove=args.start-1;
+                            ncArg.c=(char)(HANGUL_BASE+(ncArg.c2*JAMO_V_COUNT+
+                                           (ncArg.c-JAMO_V_BASE))*JAMO_T_COUNT);
+                            if(args.start!=args.limit && 
+                                        (ncArg.c2=(char)(args.source[args.start]
+                                         -JAMO_T_BASE))<JAMO_T_COUNT) {
+                                ++args.start;
+                                ncArg.c+=ncArg.c2;
+                            }
+                            if(!nx_contains(nx, ncArg.c)) {
+                                args.source[starter]=ncArg.c;
+                            } else {
+                                /* excluded */
+                                if(!isHangulWithoutJamoT(ncArg.c)) {
+                                    --args.start; /* undo the ++args.start from reading the Jamo T */
                                }
+                                /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
+                                remove=-1;
                            }
-
-                        /*
-                         * Normally, the following can not occur:
-                         * Since the input is in NFD, there are no Hangul LV syllables that
-                         * a Jamo T could combine with.
-                         * All Jamo Ts are combined above when handling Jamo Vs.
-                         *
-                         * However, before the PRI #29 fix, this can occur due to
-                         * an intervening combining mark between the Hangul LV and the Jamo T.
-                         */
-                        } else {
-                            /* Jamo T, compose with previous Hangul that does not have a Jamo T */
-                            if(isHangulWithoutJamoT(ncArg.c2)) {
-                                ncArg.c2+=ncArg.c-JAMO_T_BASE;
-                                if(!nx_contains(nx, ncArg.c2)) {
-                                    remove=args.start-1;
-                                    args.source[starter]=ncArg.c2;
-                                }
-                            }
-                        }
-        
-                        if(remove!=-1) {
-                            /* remove the Jamo(s) */
-                            q=remove;
-                            r=args.start;
-                            while(r<args.limit) {
-                                args.source[q++]=args.source[r++];
-                            }
-                            args.start=remove;
-                            args.limit=q;
-                        }
-        
-                        ncArg.c2=0; /* c2 held *starter temporarily */
-
-                        if(combineFlags!=0) {
-                            /*
-                             * not starter=NULL because the composition is a Hangul LV syllable
-                             * and might combine once more (but only before the PRI #29 fix)
-                             */
-
-                            /* done? */
-                            if(args.start==args.limit) {
-                                return (char)prevCC;
-                            }
-
-                            /* the composition is a Hangul LV syllable which is a starter that combines forward */
-                            combineFwdIndex=0xfff0;
-
-                            /* we combined; continue with looking for compositions */
-                            continue;
                        }
                    }
-
                    /*
-                     * now: cc==0 and the combining index does not include 
-                     * "forward" -> the rest of the loop body will reset starter
-                     * to NULL; technically, a composed Hangul syllable is a 
-                     * starter, but it does not combine forward now that we have
-                     * consumed all eligible Jamos; for Jamo V/T, combineFlags 
-                     * does not contain _NORM_COMBINES_FWD
+                     * No "else" for Jamo T:
+                     * Since the input is in NFD, there are no Hangul LV syllables that
+                     * a Jamo T could combine with.
+                     * All Jamo Ts are combined above when handling Jamo Vs.
                     */
+
+                    if(remove>=0) {
+                        /* remove the Jamo(s) */
+                        q=remove;
+                        r=args.start;
+                        while(r<args.limit) {
+                            args.source[q++]=args.source[r++];
+                        }
+                        args.start=remove;
+                        args.limit=q;
+                    }
    
+                    ncArg.c2=0; /* c2 temporarily held args.source[starter] */
+
+                    /* done? */
+                    if(args.start==args.limit) {
+                        return (char)prevCC;
+                    }
+
+                    starter=-1;
+                    continue;
                } else if(
                    /* the starter is not a Hangul LV or Jamo V/T and */
                    !((combineFwdIndex&0x8000)!=0) &&
-                    /* the combining mark is not blocked and */
-                    ((options&BEFORE_PRI_29)!=0 ?
-                        (prevCC!=ncArg.cc || prevCC==0) :
-                        (prevCC<ncArg.cc || prevCC==0)) &&
                    /* the starter and the combining mark (c, c2) do combine */
                    0!=(result=combine(combiningTable,combineFwdIndex, 
                                       combineBackIndex, outValues)) &&
                    /* the composition result is not excluded */
-                    !nx_contains(nx, (char)value, (char)value2)
+                    !nx_contains(nx, (char)outValues[0], (char)outValues[1])
                ) {
                    value=outValues[0];
                    value2=outValues[1];
-                    /* replace the starter with the composition, remove the 
-                     * combining mark 
-                     */
+                    /* replace the starter with the composition, remove the combining mark */
                    remove= ncArg.c2==0 ? args.start-1 : args.start-2; /* index to the combining mark */
    
                    /* replace the starter with the composition */
@ -3687,17 +3645,6 @@ public final class NormalizerImpl {
     * Options bit 1, do not decompose CJK compatibility characters.
     */
    private static final int NX_CJK_COMPAT=2;
-    /**
-     * Options bit 8, use buggy recomposition described in
-     * Unicode Public Review Issue #29
-     * at http://www.unicode.org/review/resolved-pri.html#pri29
-     *
-     * Used in IDNA implementation according to strict interpretation
-     * of IDNA definition based on Unicode 3.2 which predates PRI #29.
-     *
-     * See ICU4C unormimp.h
-     */
-    public static final int BEFORE_PRI_29=0x100;

    /*
     * The following options are used only in some composition functions.
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/StringPrep.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/StringPrep.java
@ -473,22 +473,11 @@ public final class StringPrep {


    private StringBuffer normalize(StringBuffer src){
-        /*
-         * Option UNORM_BEFORE_PRI_29:
-         *
-         * IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
-         * requires strict adherence to Unicode 3.2 normalization,
-         * including buggy composition from before fixing Public Review Issue #29.
-         * Note that this results in some valid but nonsensical text to be
-         * either corrupted or rejected, depending on the text.
-         * See http://www.unicode.org/review/resolved-pri.html#pri29
-         * See unorm.cpp and cnormtst.c
-         */
        return new StringBuffer(
            Normalizer.normalize(
                src.toString(),
                Normalizer.NFKC,
-                Normalizer.UNICODE_3_2|NormalizerImpl.BEFORE_PRI_29));
+                Normalizer.UNICODE_3_2));
    }
    /*
    boolean isLabelSeparator(int ch){
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/BasicTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/BasicTest.java
@ -2936,19 +2936,15 @@ public class BasicTest extends TestFmwk {
        final TestCompositionCase cases[]=new TestCompositionCase[]{
            /*
             * special cases for UAX #15 bug
-             * see Unicode Public Review Issue #29
-             * at http://www.unicode.org/review/resolved-pri.html#pri29
+             * see Unicode Corrigendum #5: Normalization Idempotency
+             * at http://unicode.org/versions/corrigendum5.html
+             * (was Public Review Issue #29)
             */
            new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327",      "\u1100\u0300\u1161\u0327"),
            new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327\u11a8","\u1100\u0300\u1161\u0327\u11a8"),
            new TestCompositionCase(Normalizer.NFC, 0, "\uac00\u0300\u0327\u11a8",      "\uac00\u0327\u0300\u11a8"),
            new TestCompositionCase(Normalizer.NFC, 0, "\u0b47\u0300\u0b3e",            "\u0b47\u0300\u0b3e"),

-            new TestCompositionCase(Normalizer.NFC, NormalizerImpl.BEFORE_PRI_29, "\u1100\u0300\u1161\u0327",       "\uac00\u0300\u0327"),
-            new TestCompositionCase(Normalizer.NFC, NormalizerImpl.BEFORE_PRI_29, "\u1100\u0300\u1161\u0327\u11a8", "\uac01\u0300\u0327"),
-            new TestCompositionCase(Normalizer.NFC, NormalizerImpl.BEFORE_PRI_29, "\uac00\u0300\u0327\u11a8",       "\uac01\u0327\u0300"),
-            new TestCompositionCase(Normalizer.NFC, NormalizerImpl.BEFORE_PRI_29, "\u0b47\u0300\u0b3e",             "\u0b4b\u0300")
-
            /* TODO: add test cases for UNORM_FCC here (j2151) */
        };