ICU-1007 simplify esp. composition, add comments

X-SVN-Rev: 5680
2001-09-01 02:12:03 +00:00 · 2001-09-01 02:12:03 +00:00 · 35e3da494f
commit 35e3da494f
parent 18a73611e4
1 changed files with 234 additions and 444 deletions
--- a/icu4c/source/common/unorm.cpp
+++ b/icu4c/source/common/unorm.cpp
@ -463,139 +463,19 @@ _isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
 /* reorder UTF-16 in-place -------------------------------------------------- */
 /*
- * merge two UTF-16 string parts together
+ * simpler, single-character version of _mergeOrdered() -
- * to canonically order (order by combining classes) their concatenation
+ * bubble-insert one single code point into the preceding string
 * which is already canonically ordered
 * (c, c2) may or may not yet have been inserted at [current..p[
 *
- * the two strings may already be adjacent, so that the merging is done in-place
+ * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
 * if the two strings are not adjacent, then the buffer holding the first one
 * must be large enough
 * the second string may or may not be ordered in itself
 *
 * before: [start..current[ is already ordered, and
- *         [next..limit[    may be ordered in itself, but
+ *         [current..p[     may or may not hold (c, c2) but
- *                          is not in relation to [start..current[
+ *                          must be exactly the same length as (c, c2)
- * after: [start..current+(limit-next)[ is ordered
+ * after: [start..p[ is ordered
 *
 * the algorithm is a simple bubble-sort that takes the characters from *next++
 * and inserts them in correct combining class order into the preceding part
 * of the string
 *
 * returns the trailing combining class
 * ### TODO see how often this is used - if very rare then just iterate over [next..limit[ and call optimized fn
 */
 static uint8_t
 _mergeOrdered(UChar *start, UChar *current,
              const UChar *next, const UChar *limit, UBool isOrdered=TRUE) {
    const UChar *pBack, *pPreBack;
    UChar *q, *r;
    UChar c, c2;
    uint8_t cc, prevCC, trailCC=0;
    UBool adjacent;
    adjacent= current==next;
    if(start!=current || !isOrdered) {
        while(next<limit) {
            cc=_getNextCC(next, limit, c, c2);
            if(cc==0) {
                /* does not bubble back */
                trailCC=0;
                if(adjacent) {
                    current=(UChar *)next;
                } else {
                    *current++=c;
                    if(c2!=0) {
                        *current++=c2;
                    }
                }
                if(isOrdered) {
                    break;
                } else {
                    start=current;
                }
            } else {
                /* search for the insertion point where cc>=prevCC */
                pPreBack=pBack=current;
                prevCC=_getPrevCC(start, pPreBack);
                if(cc>=prevCC) {
                    /* does not bubble back */
                    trailCC=cc;
                    if(adjacent) {
                        current=(UChar *)next;
                    } else {
                        *current++=c;
                        if(c2!=0) {
                            *current++=c2;
                        }
                    }
                    if(isOrdered) {
                        break;
                    }
                } else {
                    /* this will be the last code point, so keep its cc */
                    trailCC=prevCC;
                    pBack=pPreBack;
                    while(start<pPreBack) {
                        prevCC=_getPrevCC(start, pPreBack);
                        if(cc>=prevCC) {
                            break;
                        }
                        pBack=pPreBack;
                    }
                    /*
                     * this is where we are right now with all these pointers:
                     * [start..pPreBack[ 0..? code points that we can ignore
                     * [pPreBack..pBack[ 0..1 code points with prevCC<=cc
                     * [pBack..current[  0..n code points with >cc, move up to insert (c, c2)
                     * [current..next[      1 code point (c, c2) with cc
                     * [next..limit[     0..? code points yet to be bubbled in
                     *
                     * note that current and next may be unrelated (if not adjacent)!
                     */
                    /* move the code units in between up (q moves left of r) */
                    q=current;
                    r=current= c2==0 ? current+1 : current+2;
                    do {
                        *--r=*--q;
                    } while(pBack!=q);
                    /* insert (c, c2) */
                    *q=c;
                    if(c2!=0) {
                        *(q+1)=c2;
                    }
                    if(isOrdered) {
                        /* we know that the new part is ordered in itself, so we can move start up */
                        start=r; /* set it to after where (c, c2) were inserted */
                    }
                }
            }
        }
    }
    if(next==limit) {
        /* we know the cc of the last code point */
        return trailCC;
    } else {
        if(!adjacent) {
            /* copy the second string part */
            do {
                *current++=*next++;
            } while(next!=limit);
            limit=current;
        }
        return _getPrevCC(start, limit);
    }
 }
 /*
 * simpler, more efficient version of _mergeOrdered() -
 * inserts only one code point into the preceding string
 * assume that (c, c2) has not yet been inserted at [current..p[
 * ### TODO doc that p=current+1 or +2 according to c2=?=0
 */
 static uint8_t
 _insertOrdered(const UChar *start, UChar *current, UChar *p,
@ -646,6 +526,82 @@ _insertOrdered(const UChar *start, UChar *current, UChar *p,
    return trailCC;
 }
 /*
 * merge two UTF-16 string parts together
 * to canonically order (order by combining classes) their concatenation
 *
 * the two strings may already be adjacent, so that the merging is done in-place
 * if the two strings are not adjacent, then the buffer holding the first one
 * must be large enough
 * the second string may or may not be ordered in itself
 *
 * before: [start..current[ is already ordered, and
 *         [next..limit[    may be ordered in itself, but
 *                          is not in relation to [start..current[
 * after: [start..current+(limit-next)[ is ordered
 *
 * the algorithm is a simple bubble-sort that takes the characters from *next++
 * and inserts them in correct combining class order into the preceding part
 * of the string
 *
 * since this function is called much less often than the single-code point
 * _insertOrdered(), it just uses that for easier maintenance
 * (see file version from before 2001aug31 for a more optimized version)
 *
 * returns the trailing combining class
 */
 static uint8_t
 _mergeOrdered(UChar *start, UChar *current,
              const UChar *next, const UChar *limit, UBool isOrdered=TRUE) {
    UChar *r;
    UChar c, c2;
    uint8_t cc, trailCC=0;
    UBool adjacent;
    adjacent= current==next;
    if(start!=current || !isOrdered) {
        while(next<limit) {
            cc=_getNextCC(next, limit, c, c2);
            if(cc==0) {
                /* does not bubble back */
                trailCC=0;
                if(adjacent) {
                    current=(UChar *)next;
                } else {
                    *current++=c;
                    if(c2!=0) {
                        *current++=c2;
                    }
                }
                if(isOrdered) {
                    break;
                } else {
                    start=current;
                }
            } else {
                r=current+(c2==0 ? 1 : 2);
                trailCC=_insertOrdered(start, current, r, c, c2, cc);
                current=r;
            }
        }
    }
    if(next==limit) {
        /* we know the cc of the last code point */
        return trailCC;
    } else {
        if(!adjacent) {
            /* copy the second string part */
            do {
                *current++=*next++;
            } while(next!=limit);
            limit=current;
        }
        return _getPrevCC(start, limit);
    }
 }
 /* quick check functions ---------------------------------------------------- */
 static UBool
@ -677,7 +633,13 @@ unorm_checkFCD(const UChar *src, int32_t srcLength) {
                    if(c==0) {
                        return TRUE;
                    }
-                    /* ### TODO comment this is safe because c<=0x300... */
+                    /*
                     * delay _getFCD16(c) for any character <_NORM_MIN_WITH_LEAD_CC
                     * because chances are good that the next one will have
                     * a leading cc of 0;
                     * _getFCD16(-prevCC) is later called when necessary -
                     * -c fits into int16_t because it is <_NORM_MIN_WITH_LEAD_CC==0x300
                     */
                    prevCC=-(int16_t)c;
                } else if((fcd16=_getFCD16(c))==0) {
                    prevCC=0;
@ -702,7 +664,7 @@ unorm_checkFCD(const UChar *src, int32_t srcLength) {
        /* check one above-minimum, relevant code unit */
        if(UTF_IS_FIRST_SURROGATE(c)) {
            /* c is a lead surrogate, get the real fcd16 */
-            if((limit==NULL || src!=limit) && UTF_IS_SECOND_SURROGATE(c2=*src)) {
+            if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
                ++src;
                fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
            } else {
@ -827,7 +789,7 @@ unorm_quickCheck(const UChar *src,
        /* check one above-minimum, relevant code unit */
        if(isNorm32LeadSurrogate(norm32)) {
            /* c is a lead surrogate, get the real norm32 */
-            if((limit==NULL || src!=limit) && UTF_IS_SECOND_SURROGATE(c2=*src)) {
+            if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
                ++src;
                norm32=_getNorm32FromSurrogatePair(norm32, c2);
            } else {
@ -856,11 +818,12 @@ unorm_quickCheck(const UChar *src,
 /* make NFD & NFKD ---------------------------------------------------------- */
-U_CFUNC int32_t
+static int32_t
-unorm_decompose(UChar *dest, int32_t destCapacity,
+_decompose(UChar *&dest, int32_t &destCapacity,
           const UChar *src, int32_t srcLength,
           UBool compat, UBool ignoreHangul,
           UGrowBuffer *growBuffer, void *context,
           uint8_t &outTrailCC,
           UErrorCode *pErrorCode) {
    UChar buffer[3];
    const UChar *limit, *prevSrc, *p;
@ -871,10 +834,6 @@ unorm_decompose(UChar *dest, int32_t destCapacity,
    uint8_t cc, prevCC, trailCC;
    UBool canGrow;
    if(!_haveData(*pErrorCode)) {
        return 0;
    }
    if(!compat) {
        minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
        qcMask=_NORM_QC_NFD;
@ -951,7 +910,14 @@ unorm_decompose(UChar *dest, int32_t destCapacity,
         * generally, set p and length to the decomposition string
         * in simple cases, p==NULL and (c, c2) will hold the length code units to append
         * in all cases, set cc to the lead and trailCC to the trail combining class
-         * ### TODO say that c, c2 is either (BMP, 0) or (lead surr, trail surr) - for optimized single-char bubble sort
+         *
         * the following merge-sort of the current character into the preceding,
         * canonically ordered result text will use the optimized _insertOrdered()
         * if there is only one single code point to process;
         * this is indicated with p==NULL, and (c, c2) is the character to insert
         * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
         * for a supplementary character)
         * otherwise, p[length] is merged in with _mergeOrdered()
         */
        if(isNorm32HangulOrJamo(norm32)) {
            if(ignoreHangul) {
@ -983,7 +949,7 @@ unorm_decompose(UChar *dest, int32_t destCapacity,
                length=1;
            } else {
                /* c is a lead surrogate, get the real norm32 */
-                if((limit==NULL || src!=limit) && UTF_IS_SECOND_SURROGATE(c2=*src)) {
+                if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
                    ++src;
                    length=2;
                    norm32=_getNorm32FromSurrogatePair(norm32, c2);
@ -1059,6 +1025,30 @@ unorm_decompose(UChar *dest, int32_t destCapacity,
        }
    }
    outTrailCC=prevCC;
    return destIndex;
 }
 U_CFUNC int32_t
 unorm_decompose(UChar *dest, int32_t destCapacity,
                const UChar *src, int32_t srcLength,
                UBool compat, UBool ignoreHangul,
                UGrowBuffer *growBuffer, void *context,
                UErrorCode *pErrorCode) {
    int32_t destIndex;
    uint8_t trailCC;
    if(!_haveData(*pErrorCode)) {
        return 0;
    }
    destIndex=_decompose(dest, destCapacity,
                         src, srcLength,
                         compat, ignoreHangul,
                         growBuffer, context,
                         trailCC,
                         pErrorCode);
 #if 1
    /* ### TODO: this passes the tests but seems weird */
    /* we may NUL-terminate if it fits as a convenience */
@ -1114,21 +1104,14 @@ _findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
        }
        /* get c=*src - stop at end of string */
        if(limit==NULL) {
            c=*src;
            if(c==0) {
                break;
            }
        } else {
        if(src==limit) {
            break;
        }
        c=*src;
        }
        /* stop if lead cc==0 for this character */
        if(c<_NORM_MIN_WITH_LEAD_CC || (fcd16=_getFCD16(c))==0) {
-            break;
+            break; /* catches terminating NUL, too */
        }
        if(!UTF_IS_FIRST_SURROGATE(c)) {
@ -1136,7 +1119,7 @@ _findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
                break;
            }
            ++src;
-        } else if((limit==NULL || (src+1)!=limit) && (c2=*(src+1), UTF_IS_SECOND_SURROGATE(c2))) {
+        } else if((src+1)!=limit && (c2=*(src+1), UTF_IS_SECOND_SURROGATE(c2))) {
            /* c is a lead surrogate, get the real fcd16 */
            fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
            if(fcd16<=0xff) {
@ -1391,7 +1374,7 @@ unorm_makeFCD(UChar *dest, int32_t destCapacity,
        /* check one above-minimum, relevant code unit */
        if(UTF_IS_FIRST_SURROGATE(c)) {
            /* c is a lead surrogate, get the real fcd16 */
-            if((limit==NULL || src!=limit) && UTF_IS_SECOND_SURROGATE(c2=*src)) {
+            if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
                ++src;
                fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
            } else {
@ -1770,170 +1753,67 @@ _recompose(UChar *p, UChar *&limit) {
    }
 }
-/*
+/* find the first true starter in [src..limit[ and return the pointer to it */
 * read and decompose the following character
 * return NULL if it is (or its decomposition starts with) a starter (cc==0)
 * that has NF*C "yes"
 * otherwise, return its decomposition (and set length, cc, and trailCC)
 */
 static const UChar *
-_decomposeBeforeNextStarter(const UChar *&src, const UChar *limit,
+_findNextStarter(const UChar *src, const UChar *limit,
-                            uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe,
+                 uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe) {
                            uint8_t &cc, uint8_t &trailCC,
                            int32_t &length) {
    const UChar *p;
-    uint32_t norm32;
+    uint32_t norm32, ccOrQCMask;
-    UChar c, c2;
+    int32_t length;
    /* end of string? get c */
    if(limit==NULL) {
        c=*src;
        if(c==0) {
            return NULL;
        }
    } else {
        if(src==limit) {
            return NULL;
        }
        c=*src;
    }
    /* anything to be done? */
    if(c<minNoMaybe) {
        return NULL;
    }
    norm32=_getNorm32(c);
    if((norm32&(_NORM_CC_MASK|qcMask|decompQCMask))==0) {
        return NULL;
    }
    if(isNorm32HangulOrJamo(norm32)) {
        if(isHangulJamoNorm32HangulOrJamoL(norm32)) {
            /* Hangul decomposes but is all starters, Jamo L are starters */
            return NULL;
        }
        /* Jamo V/T are not starters but cc==0 */
        cc=trailCC=0;
        length=1;
        return src++;
    }
    if(isNorm32Regular(norm32)) {
        c2=0;
        length=1;
    } else {
        /* c is a lead surrogate, get the real norm32 */
        if((limit==NULL || (src+1)!=limit) && UTF_IS_SECOND_SURROGATE(c2=*(src+1))) {
            length=2;
            norm32=_getNorm32FromSurrogatePair(norm32, c2);
        } else {
            return NULL;
        }
    }
    /* get the decomposition and the lead and trail cc's */
    if((norm32&decompQCMask)==0) {
        /* c does not decompose */
        cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
        p=src;
    } else {
        /* c decomposes, get everything from the variable-length extra data */
        p=_decompose(norm32, decompQCMask, length, cc, trailCC);
        if(cc==0) {
            /* get the first character's norm32 to check if it is a starter with qc "no" or "maybe" */
            norm32=_getNorm32(p, qcMask);
        }
    }
    if(cc==0 && !(norm32&qcMask)) {
        return NULL;
    } else {
        src+= c2==0 ? 1 : 2;
        return p;
    }
 }
 /*
 * decompose the previous code point (needs start<src)
 * set starterIndex>=0 to the last starter in the decomposition
 * that has NF*C "yes"
 * starterIndex==-1 if there is no starter
 */
 static const UChar *
 _decomposeBackFindStarter(const UChar *start, const UChar *&src,
                          uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe,
                          int32_t &starterIndex,
                          int32_t &length) {
    const UChar *p;
    uint32_t norm32;
    UChar c, c2;
    uint8_t cc, trailCC;
-    norm32=_getPrevNorm32(start, src, minNoMaybe, _NORM_CC_MASK|qcMask|decompQCMask, c, c2);
+    ccOrQCMask=_NORM_CC_MASK|qcMask;
    length= c2==0 ? 1 : 2;
    starterIndex=0; /* many characters are themselves starters */
-    if( (norm32&(_NORM_CC_MASK|qcMask|decompQCMask))==0 ||
+    for(;;) {
-        isNorm32HangulOrJamo(norm32)
+        if(src==limit) {
-    ) {
+            break; /* end of string */
-        /* found a true starter */
+        }
-        /*
+        c=*src;
-         * Hangul decomposes but is all starters, Jamo L are starters.
+        if(c<minNoMaybe) {
-         * We never get Jamo V/T here because
+            break; /* catches NUL terminater, too */
-         * we go back through quick check "yes" text
+        }
-         * and Jamo V/T have NFC_MAYBE.
+
-         */
+        norm32=_getNorm32(c);
-        return src;
+        if((norm32&ccOrQCMask)==0) {
            break; /* true starter */
        }
    /* get the decomposition and the lead and trail cc's */
        if((norm32&decompQCMask)==0) {
-        /* c does not decompose */
+            ++src; /* does not decompose, continue */
-        if((norm32&(_NORM_CC_MASK|qcMask))!=0) {
+            continue;
            starterIndex=-1;
        }
-        p=src;
+
        /* no Hangul/Jamo here because they are all true starters or don't decompose */
        if(isNorm32Regular(norm32)) {
            c2=0;
        } else {
-        /* c decomposes, get everything from the variable-length extra data */
+            /* c is a lead surrogate, get the real norm32 */
            if((src+1)==limit || UTF_IS_SECOND_SURROGATE(c2=*(src+1))) {
                break; /* unmatched first surrogate */
            }
            norm32=_getNorm32FromSurrogatePair(norm32, c2);
            if((norm32&ccOrQCMask)==0) {
                break; /* true starter */
            } else if((norm32&decompQCMask)==0) {
                src+=2; /* does not decompose, continue */
                continue;
            }
        }
        /* (c, c2) decomposes, get everything from the variable-length extra data */
        p=_decompose(norm32, decompQCMask, length, cc, trailCC);
-        /* find the starterIndex (the decomposition is canonically ordered!) */
+        /* get the first character's norm32 to check if it is a true starter */
-        /* assume that the decomposition contains complete code points */
+        if(cc==0 && (_getNorm32(p, qcMask)&qcMask)==0) {
-        if(UTF_IS_SECOND_SURROGATE(p[length-1])) {
+            break; /* true starter */
            starterIndex=length-2;
        } else {
            starterIndex=length-1;
        }
        if(trailCC!=0 || (_getNorm32(p+starterIndex, qcMask)&qcMask)) {
            /* search backwards */
            for(;;) {
                if(starterIndex==0) {
                    starterIndex=-1;
                    break;
                }
                c=p[--starterIndex];
                if(UTF_IS_SECOND_SURROGATE(c)) {
                    c2=p[--starterIndex];
                    norm32=_getNorm32(c2);
                    if((norm32&(_NORM_CC_MASK|qcMask))==0) {
                        /* all surrogate pairs with this lead surrogate have cc==0 */
                        break;
                    } else {
                        /* norm32 must be a surrogate special */
                        norm32=_getNorm32FromSurrogatePair(norm32, c);
                    }
                } else {
                    norm32=_getNorm32(c);
                }
                if((norm32&(_NORM_CC_MASK|qcMask))==0) {
                    break;
                }
            }
        }
        }
-    return p;
+        src+= c2==0 ? 1 : 2; /* not a true starter, continue */
    }
    return src;
 }
 /*
@ -1942,8 +1822,8 @@ _decomposeBackFindStarter(const UChar *start, const UChar *&src,
 * after some text (with quick check "yes") has been copied already
 *
 * decompose this character as well as parts of the source surrounding it,
- * find the previous and the next starter,
+ * bounded by the previous and the next true starter,
- * and then recompose between these two starters
+ * and then recompose this decomposition
 */
 static const UChar *
 _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
@ -1952,12 +1832,10 @@ _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_
             uint32_t qcMask, uint8_t &prevCC,
             int32_t &destIndex,
             UErrorCode *pErrorCode) {
-    const UChar *p, *starter;
+    UChar *recomposeLimit;
    UChar *reorderSplit, *recomposeLimit;
    uint32_t decompQCMask;
    int32_t startIndex, limitIndex, firstStarterIndex, starterIndex;
    UChar minNoMaybe;
-    uint8_t cc, trailCC;
+    uint8_t trailCC;
    decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
@ -1967,140 +1845,41 @@ _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_
        minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
    }
    /* get the decomposition and the lead and trail cc's */
    if((norm32&decompQCMask)==0) {
        /* c does not decompose */
        cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
        p=prevSrc;
    } else {
        /* c decomposes, get everything from the variable-length extra data */
        p=_decompose(norm32, decompQCMask, length, cc, trailCC);
        if(cc==0) {
            /* get the first character's norm32 to check if it is a starter with qc "no" or "maybe" */
            norm32=_getNorm32(p, qcMask);
        }
    }
    /* copy the decomposition into the buffer, assume that it fits */
    startIndex=limitIndex=bufferCapacity/2;
    do {
        buffer[limitIndex++]=*p++;
    } while(--length>0);
    /* find the last starter in [prevStarter..src[ including this new decomposition */
    if((cc==0 && !(norm32&qcMask)) || prevStarter==prevSrc) {
        prevCC=trailCC;
        starter=prevSrc;
        firstStarterIndex=startIndex;
    } else {
    /*
-         * ### TODO
+     * find the last true starter in [prevStarter..src[
-         * - verify that prevStarter is indeed at the _last_ starter before prevSrc
+     * it is either the decomposition of the current character (at prevSrc),
-         * - if that is so, then perform a normal decomposition on [prevStarter..src[
+     * or prevStarter
         *   instead of this special, incremental one
     */
-
+    if(_isTrueStarter(norm32, _NORM_CC_MASK|qcMask, decompQCMask)) {
-        /* decompose backwards and look for a starter */
+        prevStarter=prevSrc;
-        firstStarterIndex=0;
+    } else {
        starter=prevSrc;
        for(;;) {
            p=_decomposeBackFindStarter(prevStarter, starter,
                                        qcMask, decompQCMask, minNoMaybe,
                                        starterIndex, length);
            /* make sure there is enough space in the buffer */
            if(startIndex<length) {
                int32_t bufferLength;
                if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, limitIndex)) {
                    *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
                    return NULL;
                }
                /* move the current buffer contents up */
                bufferLength=limitIndex-startIndex;
                limitIndex=bufferCapacity-_STACK_BUFFER_CAPACITY/2;
                uprv_memmove(buffer+(limitIndex-bufferLength), buffer+startIndex, bufferLength*U_SIZEOF_UCHAR);
                startIndex=limitIndex-bufferLength;
            }
            /* prepend the decomposition */
            p+=length;
            do {
                buffer[--startIndex]=*--p;
            } while(--length>0);
            /* stop if we found a starter */
            if(starterIndex>=0) {
                firstStarterIndex=startIndex+starterIndex;
                break;
            }
            /* stop if we are at the beginning of the text */
            if(prevStarter>=starter) {
                firstStarterIndex=startIndex;
                break;
            }
        }
        /* reorder the backwards decomposition, set prevCC */
        reorderSplit=buffer+firstStarterIndex;
        prevCC=_mergeOrdered(reorderSplit, reorderSplit, reorderSplit, buffer+limitIndex, FALSE);
        /* adjust destIndex: back out what had been copied with qc "yes" */
-        destIndex-=(int32_t)(prevSrc-starter);
+        destIndex-=(int32_t)(prevSrc-prevStarter);
    }
-    /* find the next starter in [src..limit[ */
+    /* find the next true starter in [src..limit[ */
-    for(;;) {
+    src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
        p=_decomposeBeforeNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe, cc, trailCC, length);
        if(p==NULL) {
            break; /* reached a starter */
        }
-        /* make sure there is enough space in the buffer */
+    /* decompose [prevStarter..src[ */
-        if((limitIndex+length)>bufferCapacity) {
+    length=_decompose(buffer, bufferCapacity,
-            if(startIndex>=length) {
+                      prevStarter, src-prevStarter,
-                /* it fits if we move the buffer contents up */
+                      (decompQCMask&_NORM_QC_NFKD)!=0, FALSE,
-                uprv_memmove(buffer, buffer+startIndex, (limitIndex-startIndex)*U_SIZEOF_UCHAR);
+                      u_growBufferFromStatic, stackBuffer,
-                firstStarterIndex-=startIndex;
+                      trailCC,
-                limitIndex-=startIndex;
+                      pErrorCode);
                startIndex=0;
            } else if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, limitIndex)) {
                *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
                return NULL;
            }
        }
-        if(cc!=0 && cc<prevCC) {
+    /* set the next starter */
            /* the decomposition is out of order with respect to the preceding text */
            reorderSplit=buffer+limitIndex;
            limitIndex+=length;
            if(length==1) {
                prevCC=_insertOrdered(buffer+firstStarterIndex, reorderSplit, buffer+limitIndex, *p, 0, cc);
            } else {
                prevCC=_mergeOrdered(buffer+firstStarterIndex, reorderSplit, p, p+length);
            }
        } else {
            /* just append the decomposition */
            do {
                buffer[limitIndex++]=*p++;
            } while(--length>0);
            prevCC=trailCC;
        }
    }
    /* recompose between the two starters */
    recomposeLimit=buffer+limitIndex;
    if((limitIndex-firstStarterIndex)>=2) {
        prevCC=_recompose(buffer+firstStarterIndex, recomposeLimit);
    }
    /* set output parameters and return with a pointer to the recomposition */
    prevStarter=src;
-    p=buffer+startIndex;
+
-    length=recomposeLimit-p;
+    /* recompose the decomposition */
-    return p;
+    recomposeLimit=buffer+length;
    if(length>=2) {
        prevCC=_recompose(buffer, recomposeLimit);
    }
    /* return with a pointer to the recomposition and its length */
    length=recomposeLimit-buffer;
    return buffer;
 }
 U_CFUNC int32_t
@ -2267,7 +2046,7 @@ unorm_compose(UChar *dest, int32_t destCapacity,
                    c2=(UChar)(c2-JAMO_L_BASE);
                    if(c2<JAMO_L_COUNT) {
                        c=(UChar)(HANGUL_BASE+(c2*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
-                        if((limit==NULL || src!=limit) && (c2=(UChar)(*src-JAMO_T_BASE))<JAMO_T_COUNT) {
+                        if(src!=limit && (c2=(UChar)(*src-JAMO_T_BASE))<JAMO_T_COUNT) {
                            ++src;
                            c+=c2;
                        }
@ -2293,7 +2072,7 @@ unorm_compose(UChar *dest, int32_t destCapacity,
                length=1;
            } else {
                /* c is a lead surrogate, get the real norm32 */
-                if((limit==NULL || src!=limit) && UTF_IS_SECOND_SURROGATE(c2=*src)) {
+                if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
                    ++src;
                    length=2;
                    norm32=_getNorm32FromSurrogatePair(norm32, c2);
@ -2311,7 +2090,19 @@ unorm_compose(UChar *dest, int32_t destCapacity,
            } else {
                const UChar *p;
-                /* ### TODO use sidebuffer because intermediate result might not fit but end result might - also rework some of dest buffer */
+                /*
                 * find appropriate boundaries around this character,
                 * decompose the source text from between the boundaries,
                 * and recompose it
                 *
                 * this puts the intermediate text into the side buffer because
                 * it might be longer than the recomposition end result,
                 * or the destination buffer may be too short or missing
                 *
                 * note that destIndex may be adjusted backwards to account
                 * for source text that passed the quick check but needed to
                 * take part in the recomposition
                 */
                p=_composePart(stackBuffer, buffer, bufferCapacity, length,
                               prevStarter,     /* in/out, will be set to the following true starter */
                               prevSrc, src, limit,
@ -2510,7 +2301,6 @@ unorm_normalize(const UChar *src, int32_t srcLength,
    }
    /* check for overlapping src and destination */
    /* ### TODO: real API may provide a temp buffer */
    if( (src>=dest && src<(dest+destCapacity)) ||
        (srcLength>0 && dest>=src && dest<(src+srcLength))
    ) {