ICU-1007 simplify esp. composition, add comments

X-SVN-Rev: 5680
This commit is contained in:
Markus Scherer 2001-09-01 02:12:03 +00:00
parent 18a73611e4
commit 35e3da494f

View File

@ -463,139 +463,19 @@ _isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
/* reorder UTF-16 in-place -------------------------------------------------- */
/*
* merge two UTF-16 string parts together
* to canonically order (order by combining classes) their concatenation
* simpler, single-character version of _mergeOrdered() -
* bubble-insert one single code point into the preceding string
* which is already canonically ordered
* (c, c2) may or may not yet have been inserted at [current..p[
*
* the two strings may already be adjacent, so that the merging is done in-place
* if the two strings are not adjacent, then the buffer holding the first one
* must be large enough
* the second string may or may not be ordered in itself
* it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
*
* before: [start..current[ is already ordered, and
* [next..limit[ may be ordered in itself, but
* is not in relation to [start..current[
* after: [start..current+(limit-next)[ is ordered
*
* the algorithm is a simple bubble-sort that takes the characters from *next++
* and inserts them in correct combining class order into the preceding part
* of the string
* [current..p[ may or may not hold (c, c2) but
* must be exactly the same length as (c, c2)
* after: [start..p[ is ordered
*
* returns the trailing combining class
* ### TODO see how often this is used - if very rare then just iterate over [next..limit[ and call optimized fn
*/
/*
 * Hand-optimized merge: takes the code points of [next..limit[ one at a time
 * and bubble-inserts each into the canonically ordered text that ends at
 * current, moving code units up as needed.
 *
 * start     beginning of the already-ordered first part
 * current   end of the first part; the result grows from here
 * next      beginning of the second part (may be equal to current)
 * limit     end of the second part
 * isOrdered TRUE if the second part is known to be ordered in itself
 *
 * Returns the combining class of the last code point of the merged text.
 */
static uint8_t
_mergeOrdered(UChar *start, UChar *current,
const UChar *next, const UChar *limit, UBool isOrdered=TRUE) {
const UChar *pBack, *pPreBack;
UChar *q, *r;
UChar c, c2;
uint8_t cc, prevCC, trailCC=0;
UBool adjacent;
/* the merge is done in-place if the second part directly follows the first one */
adjacent= current==next;
/*
 * the loop is skipped only if the first part is empty and
 * the second part is ordered in itself - then there is nothing to reorder
 */
if(start!=current || !isOrdered) {
while(next<limit) {
/* NOTE(review): _getNextCC() presumably sets (c, c2) and advances next past the code point - confirm */
cc=_getNextCC(next, limit, c, c2);
if(cc==0) {
/* does not bubble back */
trailCC=0;
if(adjacent) {
/* in-place: (c, c2) is already where it belongs, just step over it */
current=(UChar *)next;
} else {
/* separate buffers: append (c, c2) to the first part */
*current++=c;
if(c2!=0) {
*current++=c2;
}
}
if(isOrdered) {
/* stop bubbling; the remainder of the second part is handled after the loop */
break;
} else {
/* restrict later insertions to the text after this cc==0 code point */
start=current;
}
} else {
/* search for the insertion point where cc>=prevCC */
pPreBack=pBack=current;
prevCC=_getPrevCC(start, pPreBack);
if(cc>=prevCC) {
/* does not bubble back */
trailCC=cc;
if(adjacent) {
/* in-place: (c, c2) is already where it belongs, just step over it */
current=(UChar *)next;
} else {
/* separate buffers: append (c, c2) to the first part */
*current++=c;
if(c2!=0) {
*current++=c2;
}
}
if(isOrdered) {
/* stop bubbling; the remainder of the second part is handled after the loop */
break;
}
} else {
/* this will be the last code point, so keep its cc */
trailCC=prevCC;
pBack=pPreBack;
/* keep stepping backwards until a code point with prevCC<=cc or the start is reached */
while(start<pPreBack) {
prevCC=_getPrevCC(start, pPreBack);
if(cc>=prevCC) {
break;
}
pBack=pPreBack;
}
/*
* this is where we are right now with all these pointers:
* [start..pPreBack[ 0..? code points that we can ignore
* [pPreBack..pBack[ 0..1 code points with prevCC<=cc
* [pBack..current[ 0..n code points with >cc, move up to insert (c, c2)
* [current..next[ 1 code point (c, c2) with cc
* [next..limit[ 0..? code points yet to be bubbled in
*
* note that current and next may be unrelated (if not adjacent)!
*/
/* move the code units in between up (q moves left of r) */
q=current;
/* current grows by the length of (c, c2); r starts at the new end of the text */
r=current= c2==0 ? current+1 : current+2;
do {
*--r=*--q;
} while(pBack!=q);
/* insert (c, c2) */
*q=c;
if(c2!=0) {
*(q+1)=c2;
}
if(isOrdered) {
/* we know that the new part is ordered in itself, so we can move start up */
start=r; /* set it to after where (c, c2) were inserted */
}
}
}
}
}
if(next==limit) {
/* we know the cc of the last code point */
return trailCC;
} else {
/* the loop stopped early: the rest of the second part is taken over unchanged */
if(!adjacent) {
/* copy the second string part */
do {
*current++=*next++;
} while(next!=limit);
limit=current;
}
/* look up the combining class of the last code point before limit */
return _getPrevCC(start, limit);
}
}
/*
* simpler, more efficient version of _mergeOrdered() -
* inserts only one code point into the preceding string
* assume that (c, c2) has not yet been inserted at [current..p[
* ### TODO doc that p=current+1 or +2 according to c2=?=0
*/
static uint8_t
_insertOrdered(const UChar *start, UChar *current, UChar *p,
@ -646,6 +526,82 @@ _insertOrdered(const UChar *start, UChar *current, UChar *p,
return trailCC;
}
/*
 * Canonically order (i.e., order by combining class) the concatenation of
 * two UTF-16 string parts.
 *
 * Input:  [start..current[ is already canonically ordered;
 *         [next..limit[ may be ordered in itself (see isOrdered), but
 *         not relative to [start..current[.
 * Output: [start..current+(limit-next)[ is canonically ordered.
 *
 * The second part may begin exactly at current, in which case the merge
 * happens in-place. Otherwise the buffer holding the first part
 * must be large enough to receive the second part as well.
 *
 * This is a simple bubble sort: each code point is read from *next++ and
 * inserted at its combining-class position in the preceding text.
 * Because this function is called much less often than the single-code point
 * _insertOrdered(), it delegates the insertion to that function
 * for easier maintenance
 * (see file version from before 2001aug31 for a more optimized version).
 *
 * Returns the trailing combining class of the result.
 */
static uint8_t
_mergeOrdered(UChar *start, UChar *current,
              const UChar *next, const UChar *limit, UBool isOrdered=TRUE) {
    uint8_t lastCC=0;
    UBool inPlace;

    /* in-place merge if the second part directly follows the first one */
    inPlace= current==next;

    if(start!=current || !isOrdered) {
        while(next<limit) {
            UChar lead, trail;
            uint8_t cc;

            cc=_getNextCC(next, limit, lead, trail);
            if(cc!=0) {
                /* let _insertOrdered() bubble (lead, trail) back as far as necessary */
                UChar *insertLimit=current+(trail==0 ? 1 : 2);

                lastCC=_insertOrdered(start, current, insertLimit, lead, trail, cc);
                current=insertLimit;
                continue;
            }

            /* cc==0: this code point stays where it is */
            lastCC=0;
            if(inPlace) {
                current=(UChar *)next;
            } else {
                *current++=lead;
                if(trail!=0) {
                    *current++=trail;
                }
            }

            if(isOrdered) {
                break;
            }
            start=current;
        }
    }

    if(next==limit) {
        /* the whole second part was consumed, the last cc is known */
        return lastCC;
    }

    if(!inPlace) {
        /* append the unprocessed rest of the second part */
        do {
            *current++=*next++;
        } while(next!=limit);
        limit=current;
    }
    return _getPrevCC(start, limit);
}
/* quick check functions ---------------------------------------------------- */
static UBool
@ -677,7 +633,13 @@ unorm_checkFCD(const UChar *src, int32_t srcLength) {
if(c==0) {
return TRUE;
}
/* ### TODO comment this is safe because c<=0x300... */
/*
* delay _getFCD16(c) for any character <_NORM_MIN_WITH_LEAD_CC
* because chances are good that the next one will have
* a leading cc of 0;
* _getFCD16(-prevCC) is later called when necessary -
* -c fits into int16_t because it is <_NORM_MIN_WITH_LEAD_CC==0x300
*/
prevCC=-(int16_t)c;
} else if((fcd16=_getFCD16(c))==0) {
prevCC=0;
@ -702,7 +664,7 @@ unorm_checkFCD(const UChar *src, int32_t srcLength) {
/* check one above-minimum, relevant code unit */
if(UTF_IS_FIRST_SURROGATE(c)) {
/* c is a lead surrogate, get the real fcd16 */
if((limit==NULL || src!=limit) && UTF_IS_SECOND_SURROGATE(c2=*src)) {
if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
++src;
fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
} else {
@ -827,7 +789,7 @@ unorm_quickCheck(const UChar *src,
/* check one above-minimum, relevant code unit */
if(isNorm32LeadSurrogate(norm32)) {
/* c is a lead surrogate, get the real norm32 */
if((limit==NULL || src!=limit) && UTF_IS_SECOND_SURROGATE(c2=*src)) {
if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
++src;
norm32=_getNorm32FromSurrogatePair(norm32, c2);
} else {
@ -856,11 +818,12 @@ unorm_quickCheck(const UChar *src,
/* make NFD & NFKD ---------------------------------------------------------- */
U_CFUNC int32_t
unorm_decompose(UChar *dest, int32_t destCapacity,
static int32_t
_decompose(UChar *&dest, int32_t &destCapacity,
const UChar *src, int32_t srcLength,
UBool compat, UBool ignoreHangul,
UGrowBuffer *growBuffer, void *context,
uint8_t &outTrailCC,
UErrorCode *pErrorCode) {
UChar buffer[3];
const UChar *limit, *prevSrc, *p;
@ -871,10 +834,6 @@ unorm_decompose(UChar *dest, int32_t destCapacity,
uint8_t cc, prevCC, trailCC;
UBool canGrow;
if(!_haveData(*pErrorCode)) {
return 0;
}
if(!compat) {
minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
qcMask=_NORM_QC_NFD;
@ -951,7 +910,14 @@ unorm_decompose(UChar *dest, int32_t destCapacity,
* generally, set p and length to the decomposition string
* in simple cases, p==NULL and (c, c2) will hold the length code units to append
* in all cases, set cc to the lead and trailCC to the trail combining class
* ### TODO say that c, c2 is either (BMP, 0) or (lead surr, trail surr) - for optimized single-char bubble sort
*
* the following merge-sort of the current character into the preceding,
* canonically ordered result text will use the optimized _insertOrdered()
* if there is only one single code point to process;
* this is indicated with p==NULL, and (c, c2) is the character to insert
* ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
* for a supplementary character)
* otherwise, p[length] is merged in with _mergeOrdered()
*/
if(isNorm32HangulOrJamo(norm32)) {
if(ignoreHangul) {
@ -983,7 +949,7 @@ unorm_decompose(UChar *dest, int32_t destCapacity,
length=1;
} else {
/* c is a lead surrogate, get the real norm32 */
if((limit==NULL || src!=limit) && UTF_IS_SECOND_SURROGATE(c2=*src)) {
if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
++src;
length=2;
norm32=_getNorm32FromSurrogatePair(norm32, c2);
@ -1059,6 +1025,30 @@ unorm_decompose(UChar *dest, int32_t destCapacity,
}
}
outTrailCC=prevCC;
return destIndex;
}
U_CFUNC int32_t
unorm_decompose(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UBool compat, UBool ignoreHangul,
UGrowBuffer *growBuffer, void *context,
UErrorCode *pErrorCode) {
int32_t destIndex;
uint8_t trailCC;
if(!_haveData(*pErrorCode)) {
return 0;
}
destIndex=_decompose(dest, destCapacity,
src, srcLength,
compat, ignoreHangul,
growBuffer, context,
trailCC,
pErrorCode);
#if 1
/* ### TODO: this passes the tests but seems weird */
/* we may NUL-terminate if it fits as a convenience */
@ -1114,21 +1104,14 @@ _findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
}
/* get c=*src - stop at end of string */
if(limit==NULL) {
c=*src;
if(c==0) {
break;
}
} else {
if(src==limit) {
break;
}
c=*src;
}
/* stop if lead cc==0 for this character */
if(c<_NORM_MIN_WITH_LEAD_CC || (fcd16=_getFCD16(c))==0) {
break;
break; /* catches terminating NUL, too */
}
if(!UTF_IS_FIRST_SURROGATE(c)) {
@ -1136,7 +1119,7 @@ _findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
break;
}
++src;
} else if((limit==NULL || (src+1)!=limit) && (c2=*(src+1), UTF_IS_SECOND_SURROGATE(c2))) {
} else if((src+1)!=limit && (c2=*(src+1), UTF_IS_SECOND_SURROGATE(c2))) {
/* c is a lead surrogate, get the real fcd16 */
fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
if(fcd16<=0xff) {
@ -1391,7 +1374,7 @@ unorm_makeFCD(UChar *dest, int32_t destCapacity,
/* check one above-minimum, relevant code unit */
if(UTF_IS_FIRST_SURROGATE(c)) {
/* c is a lead surrogate, get the real fcd16 */
if((limit==NULL || src!=limit) && UTF_IS_SECOND_SURROGATE(c2=*src)) {
if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
++src;
fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
} else {
@ -1770,170 +1753,67 @@ _recompose(UChar *p, UChar *&limit) {
}
}
/*
* read and decompose the following character
* return NULL if it is (or its decomposition starts with) a starter (cc==0)
* that has NF*C "yes"
* otherwise, return its decomposition (and set length, cc, and trailCC)
*/
/* find the first true starter in [src..limit[ and return the pointer to it */
static const UChar *
_decomposeBeforeNextStarter(const UChar *&src, const UChar *limit,
uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe,
uint8_t &cc, uint8_t &trailCC,
int32_t &length) {
_findNextStarter(const UChar *src, const UChar *limit,
uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe) {
const UChar *p;
uint32_t norm32;
UChar c, c2;
/* end of string? get c */
if(limit==NULL) {
c=*src;
if(c==0) {
return NULL;
}
} else {
if(src==limit) {
return NULL;
}
c=*src;
}
/* anything to be done? */
if(c<minNoMaybe) {
return NULL;
}
norm32=_getNorm32(c);
if((norm32&(_NORM_CC_MASK|qcMask|decompQCMask))==0) {
return NULL;
}
if(isNorm32HangulOrJamo(norm32)) {
if(isHangulJamoNorm32HangulOrJamoL(norm32)) {
/* Hangul decomposes but is all starters, Jamo L are starters */
return NULL;
}
/* Jamo V/T are not starters but cc==0 */
cc=trailCC=0;
length=1;
return src++;
}
if(isNorm32Regular(norm32)) {
c2=0;
length=1;
} else {
/* c is a lead surrogate, get the real norm32 */
if((limit==NULL || (src+1)!=limit) && UTF_IS_SECOND_SURROGATE(c2=*(src+1))) {
length=2;
norm32=_getNorm32FromSurrogatePair(norm32, c2);
} else {
return NULL;
}
}
/* get the decomposition and the lead and trail cc's */
if((norm32&decompQCMask)==0) {
/* c does not decompose */
cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
p=src;
} else {
/* c decomposes, get everything from the variable-length extra data */
p=_decompose(norm32, decompQCMask, length, cc, trailCC);
if(cc==0) {
/* get the first character's norm32 to check if it is a starter with qc "no" or "maybe" */
norm32=_getNorm32(p, qcMask);
}
}
if(cc==0 && !(norm32&qcMask)) {
return NULL;
} else {
src+= c2==0 ? 1 : 2;
return p;
}
}
/*
* decompose the previous code point (needs start<src)
* set starterIndex>=0 to the last starter in the decomposition
* that has NF*C "yes"
* starterIndex==-1 if there is no starter
*/
static const UChar *
_decomposeBackFindStarter(const UChar *start, const UChar *&src,
uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe,
int32_t &starterIndex,
int32_t &length) {
const UChar *p;
uint32_t norm32;
uint32_t norm32, ccOrQCMask;
int32_t length;
UChar c, c2;
uint8_t cc, trailCC;
norm32=_getPrevNorm32(start, src, minNoMaybe, _NORM_CC_MASK|qcMask|decompQCMask, c, c2);
length= c2==0 ? 1 : 2;
starterIndex=0; /* many characters are themselves starters */
ccOrQCMask=_NORM_CC_MASK|qcMask;
if( (norm32&(_NORM_CC_MASK|qcMask|decompQCMask))==0 ||
isNorm32HangulOrJamo(norm32)
) {
/* found a true starter */
/*
* Hangul decomposes but is all starters, Jamo L are starters.
* We never get Jamo V/T here because
* we go back through quick check "yes" text
* and Jamo V/T have NFC_MAYBE.
*/
return src;
for(;;) {
if(src==limit) {
break; /* end of string */
}
c=*src;
if(c<minNoMaybe) {
break; /* catches NUL terminator, too */
}
norm32=_getNorm32(c);
if((norm32&ccOrQCMask)==0) {
break; /* true starter */
}
/* get the decomposition and the lead and trail cc's */
if((norm32&decompQCMask)==0) {
/* c does not decompose */
if((norm32&(_NORM_CC_MASK|qcMask))!=0) {
starterIndex=-1;
++src; /* does not decompose, continue */
continue;
}
p=src;
/* no Hangul/Jamo here because they are all true starters or don't decompose */
if(isNorm32Regular(norm32)) {
c2=0;
} else {
/* c decomposes, get everything from the variable-length extra data */
/*
 * c is a lead surrogate, get the real norm32
 *
 * the pair is only complete if there is a following code unit and
 * it is a trail (second) surrogate; otherwise c is unpaired, and we stop
 * here so that src is returned pointing at it
 * (the previous condition lacked the negation, so it stopped on well-formed
 * pairs and passed a non-surrogate c2 to _getNorm32FromSurrogatePair())
 */
if((src+1)==limit || !UTF_IS_SECOND_SURROGATE(c2=*(src+1))) {
    break; /* unmatched first surrogate */
}
norm32=_getNorm32FromSurrogatePair(norm32, c2);
if((norm32&ccOrQCMask)==0) {
break; /* true starter */
} else if((norm32&decompQCMask)==0) {
src+=2; /* does not decompose, continue */
continue;
}
}
/* (c, c2) decomposes, get everything from the variable-length extra data */
p=_decompose(norm32, decompQCMask, length, cc, trailCC);
/* find the starterIndex (the decomposition is canonically ordered!) */
/* assume that the decomposition contains complete code points */
if(UTF_IS_SECOND_SURROGATE(p[length-1])) {
starterIndex=length-2;
} else {
starterIndex=length-1;
}
if(trailCC!=0 || (_getNorm32(p+starterIndex, qcMask)&qcMask)) {
/* search backwards */
for(;;) {
if(starterIndex==0) {
starterIndex=-1;
break;
}
c=p[--starterIndex];
if(UTF_IS_SECOND_SURROGATE(c)) {
c2=p[--starterIndex];
norm32=_getNorm32(c2);
if((norm32&(_NORM_CC_MASK|qcMask))==0) {
/* all surrogate pairs with this lead surrogate have cc==0 */
break;
} else {
/* norm32 must be a surrogate special */
norm32=_getNorm32FromSurrogatePair(norm32, c);
}
} else {
norm32=_getNorm32(c);
}
if((norm32&(_NORM_CC_MASK|qcMask))==0) {
break;
}
}
}
/* get the first character's norm32 to check if it is a true starter */
if(cc==0 && (_getNorm32(p, qcMask)&qcMask)==0) {
break; /* true starter */
}
return p;
src+= c2==0 ? 1 : 2; /* not a true starter, continue */
}
return src;
}
/*
@ -1942,8 +1822,8 @@ _decomposeBackFindStarter(const UChar *start, const UChar *&src,
* after some text (with quick check "yes") has been copied already
*
* decompose this character as well as parts of the source surrounding it,
* find the previous and the next starter,
* and then recompose between these two starters
* bounded by the previous and the next true starter,
* and then recompose this decomposition
*/
static const UChar *
_composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
@ -1952,12 +1832,10 @@ _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_
uint32_t qcMask, uint8_t &prevCC,
int32_t &destIndex,
UErrorCode *pErrorCode) {
const UChar *p, *starter;
UChar *reorderSplit, *recomposeLimit;
UChar *recomposeLimit;
uint32_t decompQCMask;
int32_t startIndex, limitIndex, firstStarterIndex, starterIndex;
UChar minNoMaybe;
uint8_t cc, trailCC;
uint8_t trailCC;
decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
@ -1967,140 +1845,41 @@ _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_
minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
}
/* get the decomposition and the lead and trail cc's */
if((norm32&decompQCMask)==0) {
/* c does not decompose */
cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
p=prevSrc;
} else {
/* c decomposes, get everything from the variable-length extra data */
p=_decompose(norm32, decompQCMask, length, cc, trailCC);
if(cc==0) {
/* get the first character's norm32 to check if it is a starter with qc "no" or "maybe" */
norm32=_getNorm32(p, qcMask);
}
}
/* copy the decomposition into the buffer, assume that it fits */
startIndex=limitIndex=bufferCapacity/2;
do {
buffer[limitIndex++]=*p++;
} while(--length>0);
/* find the last starter in [prevStarter..src[ including this new decomposition */
if((cc==0 && !(norm32&qcMask)) || prevStarter==prevSrc) {
prevCC=trailCC;
starter=prevSrc;
firstStarterIndex=startIndex;
} else {
/*
* ### TODO
* - verify that prevStarter is indeed at the _last_ starter before prevSrc
* - if that is so, then perform a normal decomposition on [prevStarter..src[
* instead of this special, incremental one
* find the last true starter in [prevStarter..src[
* it is either the decomposition of the current character (at prevSrc),
* or prevStarter
*/
/* decompose backwards and look for a starter */
firstStarterIndex=0;
starter=prevSrc;
for(;;) {
p=_decomposeBackFindStarter(prevStarter, starter,
qcMask, decompQCMask, minNoMaybe,
starterIndex, length);
/* make sure there is enough space in the buffer */
if(startIndex<length) {
int32_t bufferLength;
if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, limitIndex)) {
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
/* move the current buffer contents up */
bufferLength=limitIndex-startIndex;
limitIndex=bufferCapacity-_STACK_BUFFER_CAPACITY/2;
uprv_memmove(buffer+(limitIndex-bufferLength), buffer+startIndex, bufferLength*U_SIZEOF_UCHAR);
startIndex=limitIndex-bufferLength;
}
/* prepend the decomposition */
p+=length;
do {
buffer[--startIndex]=*--p;
} while(--length>0);
/* stop if we found a starter */
if(starterIndex>=0) {
firstStarterIndex=startIndex+starterIndex;
break;
}
/* stop if we are at the beginning of the text */
if(prevStarter>=starter) {
firstStarterIndex=startIndex;
break;
}
}
/* reorder the backwards decomposition, set prevCC */
reorderSplit=buffer+firstStarterIndex;
prevCC=_mergeOrdered(reorderSplit, reorderSplit, reorderSplit, buffer+limitIndex, FALSE);
if(_isTrueStarter(norm32, _NORM_CC_MASK|qcMask, decompQCMask)) {
prevStarter=prevSrc;
} else {
/* adjust destIndex: back out what had been copied with qc "yes" */
destIndex-=(int32_t)(prevSrc-starter);
destIndex-=(int32_t)(prevSrc-prevStarter);
}
/* find the next starter in [src..limit[ */
for(;;) {
p=_decomposeBeforeNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe, cc, trailCC, length);
if(p==NULL) {
break; /* reached a starter */
}
/* find the next true starter in [src..limit[ */
src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
/* make sure there is enough space in the buffer */
if((limitIndex+length)>bufferCapacity) {
if(startIndex>=length) {
/* it fits if we move the buffer contents up */
uprv_memmove(buffer, buffer+startIndex, (limitIndex-startIndex)*U_SIZEOF_UCHAR);
firstStarterIndex-=startIndex;
limitIndex-=startIndex;
startIndex=0;
} else if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, limitIndex)) {
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
}
/* decompose [prevStarter..src[ */
length=_decompose(buffer, bufferCapacity,
prevStarter, src-prevStarter,
(decompQCMask&_NORM_QC_NFKD)!=0, FALSE,
u_growBufferFromStatic, stackBuffer,
trailCC,
pErrorCode);
if(cc!=0 && cc<prevCC) {
/* the decomposition is out of order with respect to the preceding text */
reorderSplit=buffer+limitIndex;
limitIndex+=length;
if(length==1) {
prevCC=_insertOrdered(buffer+firstStarterIndex, reorderSplit, buffer+limitIndex, *p, 0, cc);
} else {
prevCC=_mergeOrdered(buffer+firstStarterIndex, reorderSplit, p, p+length);
}
} else {
/* just append the decomposition */
do {
buffer[limitIndex++]=*p++;
} while(--length>0);
prevCC=trailCC;
}
}
/* recompose between the two starters */
recomposeLimit=buffer+limitIndex;
if((limitIndex-firstStarterIndex)>=2) {
prevCC=_recompose(buffer+firstStarterIndex, recomposeLimit);
}
/* set output parameters and return with a pointer to the recomposition */
/* set the next starter */
prevStarter=src;
p=buffer+startIndex;
length=recomposeLimit-p;
return p;
/* recompose the decomposition */
recomposeLimit=buffer+length;
if(length>=2) {
prevCC=_recompose(buffer, recomposeLimit);
}
/* return with a pointer to the recomposition and its length */
length=recomposeLimit-buffer;
return buffer;
}
U_CFUNC int32_t
@ -2267,7 +2046,7 @@ unorm_compose(UChar *dest, int32_t destCapacity,
c2=(UChar)(c2-JAMO_L_BASE);
if(c2<JAMO_L_COUNT) {
c=(UChar)(HANGUL_BASE+(c2*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
if((limit==NULL || src!=limit) && (c2=(UChar)(*src-JAMO_T_BASE))<JAMO_T_COUNT) {
if(src!=limit && (c2=(UChar)(*src-JAMO_T_BASE))<JAMO_T_COUNT) {
++src;
c+=c2;
}
@ -2293,7 +2072,7 @@ unorm_compose(UChar *dest, int32_t destCapacity,
length=1;
} else {
/* c is a lead surrogate, get the real norm32 */
if((limit==NULL || src!=limit) && UTF_IS_SECOND_SURROGATE(c2=*src)) {
if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
++src;
length=2;
norm32=_getNorm32FromSurrogatePair(norm32, c2);
@ -2311,7 +2090,19 @@ unorm_compose(UChar *dest, int32_t destCapacity,
} else {
const UChar *p;
/* ### TODO use sidebuffer because intermediate result might not fit but end result might - also rework some of dest buffer */
/*
* find appropriate boundaries around this character,
* decompose the source text from between the boundaries,
* and recompose it
*
* this puts the intermediate text into the side buffer because
* it might be longer than the recomposition end result,
* or the destination buffer may be too short or missing
*
* note that destIndex may be adjusted backwards to account
* for source text that passed the quick check but needed to
* take part in the recomposition
*/
p=_composePart(stackBuffer, buffer, bufferCapacity, length,
prevStarter, /* in/out, will be set to the following true starter */
prevSrc, src, limit,
@ -2510,7 +2301,6 @@ unorm_normalize(const UChar *src, int32_t srcLength,
}
/* check for overlapping src and destination */
/* ### TODO: real API may provide a temp buffer */
if( (src>=dest && src<(dest+destCapacity)) ||
(srcLength>0 && dest>=src && dest<(src+srcLength))
) {