ICU-7182 remove support for Unicode Normalization behavior before corrigendum 5 (PRI #29), and small simplification of recompose()

X-SVN-Rev: 26954
This commit is contained in:
Markus Scherer 2009-11-19 05:41:12 +00:00
parent 6b848d4b3f
commit 16168ea78a
3 changed files with 63 additions and 131 deletions

View File

@ -1536,125 +1536,83 @@ public final class NormalizerImpl {
combineBackIndex=ncArg.combiningIndex;
args.start = ncArg.start;
if(((combineFlags&COMBINES_BACK)!=0) && starter!=-1) {
if(
// this character combines backward and
((combineFlags&COMBINES_BACK)!=0) &&
// we have seen a starter that combines forward and
starter>=0 &&
// the backward-combining character is not blocked
(prevCC<ncArg.cc || prevCC==0)
) {
if((combineBackIndex&0x8000)!=0) {
/* c is a Jamo V/T, see if we can compose it with the
* previous character
*/
/* for the PRI #29 fix, check that there is no intervening combining mark */
if((options&BEFORE_PRI_29)!=0 || prevCC==0) {
remove=-1; /* NULL while no Hangul composition */
combineFlags=0;
ncArg.c2=args.source[starter];
if(combineBackIndex==0xfff2) {
/* Jamo V, compose with previous Jamo L and following
* Jamo T
*/
ncArg.c2=(char)(ncArg.c2-JAMO_L_BASE);
if(ncArg.c2<JAMO_L_COUNT) {
remove=args.start-1;
ncArg.c=(char)(HANGUL_BASE+(ncArg.c2*JAMO_V_COUNT+
(ncArg.c-JAMO_V_BASE))*JAMO_T_COUNT);
if(args.start!=args.limit &&
(ncArg.c2=(char)(args.source[args.start]
-JAMO_T_BASE))<JAMO_T_COUNT) {
++args.start;
ncArg.c+=ncArg.c2;
} else {
/* the result is an LV syllable, which is a starter (unlike LVT) */
combineFlags=COMBINES_FWD;
}
if(!nx_contains(nx, ncArg.c)) {
args.source[starter]=ncArg.c;
} else {
/* excluded */
if(!isHangulWithoutJamoT(ncArg.c)) {
--args.start; /* undo the ++args.start from reading the Jamo T */
}
/* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
remove=args.start;
/* c is a Jamo V/T, see if we can compose it with the previous character */
remove=-1; /* NULL while no Hangul composition */
combineFlags=0;
ncArg.c2=args.source[starter];
if(combineBackIndex==0xfff2) {
/* Jamo V, compose with previous Jamo L and following Jamo T */
ncArg.c2=(char)(ncArg.c2-JAMO_L_BASE);
if(ncArg.c2<JAMO_L_COUNT) {
remove=args.start-1;
ncArg.c=(char)(HANGUL_BASE+(ncArg.c2*JAMO_V_COUNT+
(ncArg.c-JAMO_V_BASE))*JAMO_T_COUNT);
if(args.start!=args.limit &&
(ncArg.c2=(char)(args.source[args.start]
-JAMO_T_BASE))<JAMO_T_COUNT) {
++args.start;
ncArg.c+=ncArg.c2;
}
if(!nx_contains(nx, ncArg.c)) {
args.source[starter]=ncArg.c;
} else {
/* excluded */
if(!isHangulWithoutJamoT(ncArg.c)) {
--args.start; /* undo the ++args.start from reading the Jamo T */
}
/* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
remove=-1;
}
/*
* Normally, the following cannot occur:
* Since the input is in NFD, there are no Hangul LV syllables that
* a Jamo T could combine with.
* All Jamo Ts are combined above when handling Jamo Vs.
*
* However, before the PRI #29 fix, this can occur due to
* an intervening combining mark between the Hangul LV and the Jamo T.
*/
} else {
/* Jamo T, compose with previous Hangul that does not have a Jamo T */
if(isHangulWithoutJamoT(ncArg.c2)) {
ncArg.c2+=ncArg.c-JAMO_T_BASE;
if(!nx_contains(nx, ncArg.c2)) {
remove=args.start-1;
args.source[starter]=ncArg.c2;
}
}
}
if(remove!=-1) {
/* remove the Jamo(s) */
q=remove;
r=args.start;
while(r<args.limit) {
args.source[q++]=args.source[r++];
}
args.start=remove;
args.limit=q;
}
ncArg.c2=0; /* c2 held *starter temporarily */
if(combineFlags!=0) {
/*
* not starter=NULL because the composition is a Hangul LV syllable
* and might combine once more (but only before the PRI #29 fix)
*/
/* done? */
if(args.start==args.limit) {
return (char)prevCC;
}
/* the composition is a Hangul LV syllable which is a starter that combines forward */
combineFwdIndex=0xfff0;
/* we combined; continue with looking for compositions */
continue;
}
}
/*
* now: cc==0 and the combining index does not include
* "forward" -> the rest of the loop body will reset starter
* to NULL; technically, a composed Hangul syllable is a
* starter, but it does not combine forward now that we have
* consumed all eligible Jamos; for Jamo V/T, combineFlags
* does not contain _NORM_COMBINES_FWD
* No "else" for Jamo T:
* Since the input is in NFD, there are no Hangul LV syllables that
* a Jamo T could combine with.
* All Jamo Ts are combined above when handling Jamo Vs.
*/
if(remove>=0) {
/* remove the Jamo(s) */
q=remove;
r=args.start;
while(r<args.limit) {
args.source[q++]=args.source[r++];
}
args.start=remove;
args.limit=q;
}
ncArg.c2=0; /* c2 held *starter temporarily */
/* done? */
if(args.start==args.limit) {
return (char)prevCC;
}
starter=-1;
continue;
} else if(
/* the starter is not a Hangul LV or Jamo V/T and */
!((combineFwdIndex&0x8000)!=0) &&
/* the combining mark is not blocked and */
((options&BEFORE_PRI_29)!=0 ?
(prevCC!=ncArg.cc || prevCC==0) :
(prevCC<ncArg.cc || prevCC==0)) &&
/* the starter and the combining mark (c, c2) do combine */
0!=(result=combine(combiningTable,combineFwdIndex,
combineBackIndex, outValues)) &&
/* the composition result is not excluded */
!nx_contains(nx, (char)value, (char)value2)
!nx_contains(nx, (char)outValues[0], (char)outValues[1])
) {
value=outValues[0];
value2=outValues[1];
/* replace the starter with the composition, remove the
* combining mark
*/
/* replace the starter with the composition, remove the combining mark */
remove= ncArg.c2==0 ? args.start-1 : args.start-2; /* index to the combining mark */
/* replace the starter with the composition */
@ -3687,17 +3645,6 @@ public final class NormalizerImpl {
* Options bit 1, do not decompose CJK compatibility characters.
*/
private static final int NX_CJK_COMPAT=2;
/**
* Options bit 8, use buggy recomposition described in
* Unicode Public Review Issue #29
* at http://www.unicode.org/review/resolved-pri.html#pri29
*
* Used in IDNA implementation according to strict interpretation
* of IDNA definition based on Unicode 3.2 which predates PRI #29.
*
* See ICU4C unormimp.h
*/
public static final int BEFORE_PRI_29=0x100;
/*
* The following options are used only in some composition functions.

View File

@ -473,22 +473,11 @@ public final class StringPrep {
private StringBuffer normalize(StringBuffer src){
/*
* Option UNORM_BEFORE_PRI_29:
*
* IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
* requires strict adherence to Unicode 3.2 normalization,
* including buggy composition from before fixing Public Review Issue #29.
* Note that this results in some valid but nonsensical text to be
* either corrupted or rejected, depending on the text.
* See http://www.unicode.org/review/resolved-pri.html#pri29
* See unorm.cpp and cnormtst.c
*/
return new StringBuffer(
Normalizer.normalize(
src.toString(),
Normalizer.NFKC,
Normalizer.UNICODE_3_2|NormalizerImpl.BEFORE_PRI_29));
Normalizer.UNICODE_3_2));
}
/*
boolean isLabelSeparator(int ch){

View File

@ -2936,19 +2936,15 @@ public class BasicTest extends TestFmwk {
final TestCompositionCase cases[]=new TestCompositionCase[]{
/*
* special cases for UAX #15 bug
* see Unicode Public Review Issue #29
* at http://www.unicode.org/review/resolved-pri.html#pri29
* see Unicode Corrigendum #5: Normalization Idempotency
* at http://unicode.org/versions/corrigendum5.html
* (was Public Review Issue #29)
*/
new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327", "\u1100\u0300\u1161\u0327"),
new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327\u11a8","\u1100\u0300\u1161\u0327\u11a8"),
new TestCompositionCase(Normalizer.NFC, 0, "\uac00\u0300\u0327\u11a8", "\uac00\u0327\u0300\u11a8"),
new TestCompositionCase(Normalizer.NFC, 0, "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"),
new TestCompositionCase(Normalizer.NFC, NormalizerImpl.BEFORE_PRI_29, "\u1100\u0300\u1161\u0327", "\uac00\u0300\u0327"),
new TestCompositionCase(Normalizer.NFC, NormalizerImpl.BEFORE_PRI_29, "\u1100\u0300\u1161\u0327\u11a8", "\uac01\u0300\u0327"),
new TestCompositionCase(Normalizer.NFC, NormalizerImpl.BEFORE_PRI_29, "\uac00\u0300\u0327\u11a8", "\uac01\u0327\u0300"),
new TestCompositionCase(Normalizer.NFC, NormalizerImpl.BEFORE_PRI_29, "\u0b47\u0300\u0b3e", "\u0b4b\u0300")
/* TODO: add test cases for UNORM_FCC here (j2151) */
};