ICU-3497 provide correct NFC/NFKC normalization for both before and after the PRI #29 fix

X-SVN-Rev: 16134
This commit is contained in:
Markus Scherer 2004-08-09 14:47:53 +00:00
parent ded716bf2c
commit 27f1dd1bda
5 changed files with 249 additions and 77 deletions

View File

@ -102,7 +102,19 @@ enum {
_NORM_OPTIONS_UNICODE_MASK=0xe0,
_NORM_OPTIONS_SETS_MASK=0xff,
_NORM_OPTIONS_UNICODE_SHIFT=5
_NORM_OPTIONS_UNICODE_SHIFT=5,
/*
* The following options are used only in some composition functions.
* They use bits 12 and up to preserve lower bits for the available options
* space in unorm_compare() -
* see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
*/
/** Options bit 12, for compatibility vs. canonical decomposition. */
_NORM_OPTIONS_COMPAT=0x1000,
/** Options bit 13, no discontiguous composition (FCC vs. NFC). */
_NORM_OPTIONS_COMPOSE_CONTIGUOUS=0x2000
};
static inline UBool
@ -1934,7 +1946,7 @@ _composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UC
* while the combining mark that is removed has at least one code unit
*/
static uint8_t
_recompose(UChar *p, UChar *&limit, const UnicodeSet *nx) {
_recompose(UChar *p, UChar *&limit, int32_t options, const UnicodeSet *nx) {
UChar *starter, *pRemove, *q, *r;
uint32_t combineFlags;
UChar c, c2;
@ -1955,57 +1967,87 @@ _recompose(UChar *p, UChar *&limit, const UnicodeSet *nx) {
if((combineFlags&_NORM_COMBINES_BACK) && starter!=NULL) {
if(combineBackIndex&0x8000) {
/* c is a Jamo V/T, see if we can compose it with the previous character */
pRemove=NULL; /* NULL while no Hangul composition */
c2=*starter;
if(combineBackIndex==0xfff2) {
/* Jamo V, compose with previous Jamo L and following Jamo T */
c2=(UChar)(c2-JAMO_L_BASE);
if(c2<JAMO_L_COUNT) {
pRemove=p-1;
c=(UChar)(HANGUL_BASE+(c2*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
if(p!=limit && (c2=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
++p;
c+=c2;
}
if(!nx_contains(nx, c)) {
*starter=c;
} else {
/* excluded */
if(!isHangulWithoutJamoT(c)) {
--p; /* undo the ++p from reading the Jamo T */
/* for the PRI #29 fix, check that there is no intervening combining mark */
if((options&UNORM_BEFORE_PRI_29) || prevCC==0) {
pRemove=NULL; /* NULL while no Hangul composition */
combineFlags=0;
c2=*starter;
if(combineBackIndex==0xfff2) {
/* Jamo V, compose with previous Jamo L and following Jamo T */
c2=(UChar)(c2-JAMO_L_BASE);
if(c2<JAMO_L_COUNT) {
pRemove=p-1;
c=(UChar)(HANGUL_BASE+(c2*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
if(p!=limit && (c2=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
++p;
c+=c2;
} else {
/* the result is an LV syllable, which is a starter (unlike LVT) */
combineFlags=_NORM_COMBINES_FWD;
}
if(!nx_contains(nx, c)) {
*starter=c;
} else {
/* excluded */
if(!isHangulWithoutJamoT(c)) {
--p; /* undo the ++p from reading the Jamo T */
}
/* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
pRemove=NULL;
}
}
/*
* Normally, the following can not occur:
* Since the input is in NFD, there are no Hangul LV syllables that
* a Jamo T could combine with.
* All Jamo Ts are combined above when handling Jamo Vs.
*
* However, before the PRI #29 fix, this can occur due to
* an intervening combining mark between the Hangul LV and the Jamo T.
*/
} else {
/* Jamo T, compose with previous Hangul that does not have a Jamo T */
if(isHangulWithoutJamoT(c2)) {
c2+=(UChar)(c-JAMO_T_BASE);
if(!nx_contains(nx, c2)) {
pRemove=p-1;
*starter=c2;
}
/* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
pRemove=NULL;
}
}
#if 0
/*
* The following is disabled with #if 0 because it can not occur:
* Since the input is in NFD, there are no Hangul LV syllables that
* a Jamo T could combine with.
* All Jamo Ts are combined above when handling Jamo Vs.
*/
} else {
/* Jamo T, compose with previous Hangul that does not have a Jamo T */
if(isHangulWithoutJamoT(c2)) {
pRemove=p-1;
*starter=(UChar)(c2+(c-JAMO_T_BASE));
}
#endif
}
if(pRemove!=NULL) {
/* remove the Jamo(s) */
q=pRemove;
r=p;
while(r<limit) {
*q++=*r++;
if(pRemove!=NULL) {
/* remove the Jamo(s) */
q=pRemove;
r=p;
while(r<limit) {
*q++=*r++;
}
p=pRemove;
limit=q;
}
p=pRemove;
limit=q;
}
c2=0; /* c2 held *starter temporarily */
c2=0; /* c2 held *starter temporarily */
if(combineFlags!=0) {
/*
* not starter=NULL because the composition is a Hangul LV syllable
* and might combine once more (but only before the PRI #29 fix)
*/
/* done? */
if(p==limit) {
return prevCC;
}
/* the composition is a Hangul LV syllable which is a starter that combines forward */
combineFwdIndex=0xfff0;
/* we combined; continue with looking for compositions */
continue;
}
}
/*
* now: cc==0 and the combining index does not include "forward" ->
@ -2016,10 +2058,12 @@ _recompose(UChar *p, UChar *&limit, const UnicodeSet *nx) {
*/
} else if(
/* the starter is not a Jamo V/T and */
/* the starter is not a Hangul LV or Jamo V/T and */
!(combineFwdIndex&0x8000) &&
/* the combining mark is not blocked and */
(prevCC<cc || prevCC==0) &&
((options&UNORM_BEFORE_PRI_29) ?
(prevCC!=cc || prevCC==0) :
(prevCC<cc || prevCC==0)) &&
/* the starter and the combining mark (c, c2) do combine and */
0!=(result=_combine(combiningTable+combineFwdIndex, combineBackIndex, value, value2)) &&
/* the composition result is not excluded */
@ -2083,7 +2127,7 @@ _recompose(UChar *p, UChar *&limit, const UnicodeSet *nx) {
starter=NULL;
}
/* we combined and set prevCC, continue with looking for compositions */
/* we combined; continue with looking for compositions */
continue;
}
}
@ -2111,6 +2155,9 @@ _recompose(UChar *p, UChar *&limit, const UnicodeSet *nx) {
/* it will not combine with anything */
starter=NULL;
}
} else if(options&_NORM_OPTIONS_COMPOSE_CONTIGUOUS) {
/* FCC: no discontiguous compositions; any intervening character blocks */
starter=NULL;
}
}
}
@ -2119,14 +2166,14 @@ _recompose(UChar *p, UChar *&limit, const UnicodeSet *nx) {
static const UChar *
_composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
const UChar *prevStarter, const UChar *src,
uint32_t qcMask, uint8_t &prevCC,
const UnicodeSet *nx,
uint8_t &prevCC,
int32_t options, const UnicodeSet *nx,
UErrorCode *pErrorCode) {
UChar *recomposeLimit;
uint8_t trailCC;
UBool compat;
compat=(UBool)((qcMask&_NORM_QC_NFKC)!=0);
compat=(UBool)((options&_NORM_OPTIONS_COMPAT)!=0);
/* decompose [prevStarter..src[ */
length=_decompose(buffer, bufferCapacity,
@ -2147,7 +2194,7 @@ _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_
/* recompose the decomposition */
recomposeLimit=buffer+length;
if(length>=2) {
prevCC=_recompose(buffer, recomposeLimit, nx);
prevCC=_recompose(buffer, recomposeLimit, options, nx);
}
/* return with a pointer to the recomposition and its length */
@ -2158,7 +2205,7 @@ _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_
static int32_t
_compose(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UBool compat, const UnicodeSet *nx,
int32_t options, const UnicodeSet *nx,
UErrorCode *pErrorCode) {
UChar stackBuffer[_STACK_BUFFER_CAPACITY];
UChar *buffer;
@ -2170,12 +2217,12 @@ _compose(UChar *dest, int32_t destCapacity,
UChar c, c2, minNoMaybe;
uint8_t cc, prevCC;
if(!compat) {
minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
qcMask=_NORM_QC_NFC;
} else {
if(options&_NORM_OPTIONS_COMPAT) {
minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
qcMask=_NORM_QC_NFKC;
} else {
minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
qcMask=_NORM_QC_NFC;
}
/* initialize */
@ -2306,7 +2353,7 @@ _compose(UChar *dest, int32_t destCapacity,
if(
destIndex>0 &&
_composeHangul(
*(prevSrc-1), c, norm32, src, limit, compat,
*(prevSrc-1), c, norm32, src, limit, (UBool)((options&_NORM_OPTIONS_COMPAT)!=0),
destIndex<=destCapacity ? dest+(destIndex-1) : 0,
nx)
) {
@ -2380,9 +2427,8 @@ _compose(UChar *dest, int32_t destCapacity,
p=_composePart(stackBuffer, buffer, bufferCapacity,
length, /* output */
prevStarter, src,
qcMask,
prevCC, /* output */
nx,
options, nx,
pErrorCode);
if(p==NULL) {
@ -2457,9 +2503,16 @@ unorm_compose(UChar *dest, int32_t destCapacity,
return 0;
}
/* reset options bits that should only be set here or inside _compose() */
options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
if(compat) {
options|=_NORM_OPTIONS_COMPAT;
}
destIndex=_compose(dest, destCapacity,
src, srcLength,
compat, nx,
options, nx,
pErrorCode);
return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
@ -2935,6 +2988,7 @@ _quickCheck(const UChar *src,
const UChar *start, *limit;
uint32_t norm32, qcNorm32, ccOrQCMask, qcMask;
int32_t options;
UChar c, c2, minNoMaybe;
uint8_t cc, prevCC;
UNormalizationCheckResult result;
@ -2958,18 +3012,22 @@ _quickCheck(const UChar *src,
case UNORM_NFC:
minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
qcMask=_NORM_QC_NFC;
options=0;
break;
case UNORM_NFKC:
minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
qcMask=_NORM_QC_NFKC;
options=_NORM_OPTIONS_COMPAT;
break;
case UNORM_NFD:
minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
qcMask=_NORM_QC_NFD;
options=0;
break;
case UNORM_NFKD:
minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
qcMask=_NORM_QC_NFKD;
options=_NORM_OPTIONS_COMPAT;
break;
case UNORM_FCD:
return unorm_checkFCD(src, srcLength, nx) ? UNORM_YES : UNORM_NO;
@ -3081,8 +3139,8 @@ _quickCheck(const UChar *src,
length,
prevStarter,
src,
qcMask,
prevCC, nx, pErrorCode);
prevCC,
options, nx, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
result=UNORM_MAYBE; /* error (out of memory) */
break;
@ -3147,7 +3205,7 @@ unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
static int32_t
unorm_internalNormalize(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UNormalizationMode mode, const UnicodeSet *nx,
UNormalizationMode mode, int32_t options, const UnicodeSet *nx,
UErrorCode *pErrorCode) {
int32_t destLength;
uint8_t trailCC;
@ -3166,18 +3224,25 @@ unorm_internalNormalize(UChar *dest, int32_t destCapacity,
case UNORM_NFC:
destLength=_compose(dest, destCapacity,
src, srcLength,
FALSE, nx, pErrorCode);
options, nx, pErrorCode);
break;
case UNORM_NFKC:
destLength=_compose(dest, destCapacity,
src, srcLength,
TRUE, nx, pErrorCode);
options|_NORM_OPTIONS_COMPAT, nx, pErrorCode);
break;
case UNORM_FCD:
return unorm_makeFCD(dest, destCapacity,
src, srcLength,
nx,
pErrorCode);
#if 0
case UNORM_FCC:
destLength=_compose(dest, destCapacity,
src, srcLength,
options|_NORM_OPTIONS_COMPOSE_CONTIGUOUS, nx, pErrorCode);
break;
#endif
case UNORM_NONE:
/* just copy the string */
if(srcLength==-1) {
@ -3217,9 +3282,12 @@ unorm_internalNormalize(UChar *dest, int32_t destCapacity,
return 0;
}
/* reset options bits that should only be set inside unorm_internalNormalize() */
options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
return unorm_internalNormalize(dest, destCapacity,
src, srcLength,
mode, nx,
mode, options, nx,
pErrorCode);
}
@ -4396,6 +4464,7 @@ unorm_compare(const UChar *s1, int32_t length1,
UChar *d1, *d2;
const UnicodeSet *nx;
UNormalizationMode mode;
int32_t normOptions;
int32_t result;
/* argument checking */
@ -4414,7 +4483,8 @@ unorm_compare(const UChar *s1, int32_t length1,
return 0;
}
nx=getNX((int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT), *pErrorCode);
normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT);
nx=getNX(normOptions, *pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return 0;
}
@ -4474,7 +4544,7 @@ unorm_compare(const UChar *s1, int32_t length1,
if(!isFCD1) {
_len1=unorm_internalNormalize(fcd1, LENGTHOF(fcd1),
s1, length1,
mode, nx,
mode, normOptions, nx,
pErrorCode);
if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
s1=fcd1;
@ -4488,7 +4558,7 @@ unorm_compare(const UChar *s1, int32_t length1,
*pErrorCode=U_ZERO_ERROR;
_len1=unorm_internalNormalize(d1, _len1,
s1, length1,
mode, nx,
mode, normOptions, nx,
pErrorCode);
if(U_FAILURE(*pErrorCode)) {
goto cleanup;
@ -4502,7 +4572,7 @@ unorm_compare(const UChar *s1, int32_t length1,
if(!isFCD2) {
_len2=unorm_internalNormalize(fcd2, LENGTHOF(fcd2),
s2, length2,
mode, nx,
mode, normOptions, nx,
pErrorCode);
if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
s2=fcd2;
@ -4516,7 +4586,7 @@ unorm_compare(const UChar *s1, int32_t length1,
*pErrorCode=U_ZERO_ERROR;
_len2=unorm_internalNormalize(d2, _len2,
s2, length2,
mode, nx,
mode, normOptions, nx,
pErrorCode);
if(U_FAILURE(*pErrorCode)) {
goto cleanup;

View File

@ -175,7 +175,16 @@ enum {
/** Options bit 0, do not decompose Hangul syllables. @draft ICU 2.6 */
UNORM_NX_HANGUL=1,
/** Options bit 1, do not decompose CJK compatibility characters. @draft ICU 2.6 */
UNORM_NX_CJK_COMPAT=2
UNORM_NX_CJK_COMPAT=2,
/**
* Options bit 8, use buggy recomposition described in
* Unicode Public Review Issue #29
* at http://www.unicode.org/review/resolved-pri.html#pri29
*
* Used in IDNA implementation according to strict interpretation
* of IDNA definition based on Unicode 3.2 which predates PRI #29.
*/
UNORM_BEFORE_PRI_29=0x100
};
/**

View File

@ -587,8 +587,22 @@ static int32_t
usprep_normalize( const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
UErrorCode* status ){
return unorm_normalize(src,srcLength,UNORM_NFKC,UNORM_UNICODE_3_2,dest,destCapacity,status);
/*
* Option UNORM_BEFORE_PRI_29:
*
* IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
* requires strict adherence to Unicode 3.2 normalization,
* including buggy composition from before fixing Public Review Issue #29.
* Note that this results in some valid but nonsensical text to be
* either corrupted or rejected, depending on the text.
* See http://www.unicode.org/review/resolved-pri.html#pri29
* See unorm.cpp and cnormtst.c
*/
return unorm_normalize(
src, srcLength,
UNORM_NFKC, UNORM_UNICODE_3_2|UNORM_BEFORE_PRI_29,
dest, destCapacity,
status);
}

View File

@ -15,6 +15,8 @@
*********************************************************************************/
/*tests for u_normalization*/
#include "unicode/utypes.h"
#include "unicode/unorm.h"
#include "unormimp.h"
#include "cintltst.h"
#if UCONFIG_NO_NORMALIZATION
@ -54,6 +56,9 @@ TestFCNFKCClosure(void);
static void
TestQuickCheckPerCP(void);
static void
TestComposition(void);
const static char* canonTests[][3] = {
/* Input*/ /*Decomposed*/ /*Composed*/
{ "cat", "cat", "cat" },
@ -131,6 +136,7 @@ void addNormTest(TestNode** root)
addTest(root, &TestConcatenate, "tscoll/cnormtst/TestConcatenate");
addTest(root, &TestNextPrevious, "tscoll/cnormtst/TestNextPrevious");
addTest(root, &TestFCNFKCClosure, "tscoll/cnormtst/TestFCNFKCClosure");
addTest(root, &TestComposition, "tscoll/cnormtst/TestComposition");
}
void TestDecomp()
@ -1461,4 +1467,50 @@ TestQuickCheckPerCP() {
}
}
static void
TestComposition(void) {
static const struct {
UNormalizationMode mode;
uint32_t options;
UChar input[12];
UChar expect[12];
} cases[]={
/*
* special cases for UAX #15 bug
* see Unicode Public Review Issue #29
* at http://www.unicode.org/review/resolved-pri.html#pri29
*/
{ UNORM_NFC, 0, { 0x1100, 0x0300, 0x1161, 0x0327 }, { 0x1100, 0x0300, 0x1161, 0x0327 } },
{ UNORM_NFC, 0, { 0x1100, 0x0300, 0x1161, 0x0327, 0x11a8 }, { 0x1100, 0x0300, 0x1161, 0x0327, 0x11a8 } },
{ UNORM_NFC, 0, { 0xac00, 0x0300, 0x0327, 0x11a8 }, { 0xac00, 0x0327, 0x0300, 0x11a8 } },
{ UNORM_NFC, 0, { 0x0b47, 0x0300, 0x0b3e }, { 0x0b47, 0x0300, 0x0b3e } },
{ UNORM_NFC, UNORM_BEFORE_PRI_29, { 0x1100, 0x0300, 0x1161, 0x0327 }, { 0xac00, 0x0300, 0x0327 } },
{ UNORM_NFC, UNORM_BEFORE_PRI_29, { 0x1100, 0x0300, 0x1161, 0x0327, 0x11a8 }, { 0xac01, 0x0300, 0x0327 } },
{ UNORM_NFC, UNORM_BEFORE_PRI_29, { 0xac00, 0x0300, 0x0327, 0x11a8 }, { 0xac01, 0x0327, 0x0300 } },
{ UNORM_NFC, UNORM_BEFORE_PRI_29, { 0x0b47, 0x0300, 0x0b3e }, { 0x0b4b, 0x0300 } }
/* TODO: add test cases for UNORM_FCC here (j2151) */
};
UChar output[16];
UErrorCode errorCode;
int32_t i, length;
for(i=0; i<LENGTHOF(cases); ++i) {
errorCode=U_ZERO_ERROR;
length=unorm_normalize(
cases[i].input, -1,
cases[i].mode, cases[i].options,
output, LENGTHOF(output),
&errorCode);
if( U_FAILURE(errorCode) ||
length!=u_strlen(cases[i].expect) ||
0!=u_memcmp(output, cases[i].expect, length)
) {
log_err("unexpected result for case %d\n", i);
}
}
}
#endif /* #if !UCONFIG_NO_NORMALIZATION */

View File

@ -33,6 +33,7 @@ static void TestToASCII(void);
static void TestIDNToUnicode(void);
static void TestIDNToASCII(void);
static void TestCompare(void);
static void TestUnicode32Norm(void);
void addIDNATest(TestNode** root);
@ -57,6 +58,7 @@ addIDNATest(TestNode** root)
addTest(root, &TestIDNToUnicode, "idna/TestIDNToUnicode");
addTest(root, &TestIDNToASCII, "idna/TestIDNToASCII");
addTest(root, &TestCompare, "idna/TestCompare");
addTest(root, &TestUnicode32Norm,"idna/TestUnicode32Norm");
}
static void
@ -627,6 +629,31 @@ TestCompare(){
}
}
static void TestUnicode32Norm() {
/*
* test Unicode 3.2 normalization, before Public Review Issue #29
* see cnormtst.c TestComposition()
*/
static const UChar strings[][8]={
{ 0x1100, 0x0300, 0x1161, 0x0327 },
{ 0x0b47, 0x0300, 0x0b3e, 0x0327 }
};
UChar ascii[20], unicode[20];
int32_t i, length;
UErrorCode errorCode;
for(i=0; i<LENGTHOF(strings); ++i) {
errorCode=U_ZERO_ERROR;
length=uidna_toASCII(strings[i], -1, ascii, LENGTHOF(ascii), 0, NULL, &errorCode);
length=uidna_toUnicode(ascii, length, unicode, LENGTHOF(unicode), 0, NULL, &errorCode);
if(errorCode!=U_IDNA_VERIFICATION_ERROR) {
log_err("string %d yields %s instead of U_IDNA_VERIFICATION_ERROR\n",
i, u_errorName(errorCode));
}
}
}
#endif
/*