ICU-8804 Normalizer2.composePair(a, b) with separation of minYesNo extraData into combines-forward vs. not

X-SVN-Rev: 30983
This commit is contained in:
Markus Scherer 2011-11-27 20:34:42 +00:00
parent bed105857f
commit 19735dc2a3
8 changed files with 129 additions and 10 deletions

View File

@ -133,6 +133,10 @@ public final class Norm2AllModes {
public String getRawDecomposition(int c) { public String getRawDecomposition(int c) {
return impl.getRawDecomposition(c); return impl.getRawDecomposition(c);
} }
/** Forwards pairwise composition of a and b to the wrapped implementation object. */
@Override
public int composePair(int a, int b) {
    final int composite=impl.composePair(a, b);
    return composite;
}
@Override @Override
public int getCombiningClass(int c) { public int getCombiningClass(int c) {

View File

@ -436,6 +436,7 @@ public final class Normalizer2Impl {
minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
minYesNo=inIndexes[IX_MIN_YES_NO]; minYesNo=inIndexes[IX_MIN_YES_NO];
minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
minNoNo=inIndexes[IX_MIN_NO_NO]; minNoNo=inIndexes[IX_MIN_NO_NO];
limitNoNo=inIndexes[IX_LIMIT_NO_NO]; limitNoNo=inIndexes[IX_LIMIT_NO_NO];
minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
@ -926,11 +927,15 @@ public final class Normalizer2Impl {
public static final int IX_MIN_COMP_NO_MAYBE_CP=9; public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
// Norm16 value thresholds for quick check combinations and types of extra data. // Norm16 value thresholds for quick check combinations and types of extra data.
// Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
public static final int IX_MIN_YES_NO=10; public static final int IX_MIN_YES_NO=10;
public static final int IX_MIN_NO_NO=11; public static final int IX_MIN_NO_NO=11;
public static final int IX_LIMIT_NO_NO=12; public static final int IX_LIMIT_NO_NO=12;
public static final int IX_MIN_MAYBE_YES=13; public static final int IX_MIN_MAYBE_YES=13;
// Mappings only in [minYesNoMappingsOnly..minNoNo[.
public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
public static final int IX_COUNT=16; public static final int IX_COUNT=16;
public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
@ -1658,7 +1663,7 @@ public final class Normalizer2Impl {
private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
private static boolean isInert(int norm16) { return norm16==0; } private static boolean isInert(int norm16) { return norm16==0; }
// static UBool isJamoL(uint16_t norm16) const { return norm16==1; } private static boolean isJamoL(int norm16) { return norm16==1; }
private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
private boolean isHangul(int norm16) { return norm16==minYesNo; } private boolean isHangul(int norm16) { return norm16==minYesNo; }
private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
@ -1804,26 +1809,28 @@ public final class Normalizer2Impl {
} }
} }
/* /**
* Finds the recomposition result for * Finds the recomposition result for
* a forward-combining "lead" character, * a forward-combining "lead" character,
* specified with a pointer to its compositions list, * specified with a pointer to its compositions list,
* and a backward-combining "trail" character. * and a backward-combining "trail" character.
* *
* If the lead and trail characters combine, then this function returns * <p>If the lead and trail characters combine, then this function returns
* the following "compositeAndFwd" value: * the following "compositeAndFwd" value:
* <pre>
* Bits 21..1 composite character * Bits 21..1 composite character
* Bit 0 set if the composite is a forward-combining starter * Bit 0 set if the composite is a forward-combining starter
* </pre>
* otherwise it returns -1. * otherwise it returns -1.
* *
* The compositions list has (trail, compositeAndFwd) pair entries, * <p>The compositions list has (trail, compositeAndFwd) pair entries,
* encoded as either pairs or triples of 16-bit units. * encoded as either pairs or triples of 16-bit units.
* The last entry has the high bit of its first unit set. * The last entry has the high bit of its first unit set.
* *
* The list is sorted by ascending trail characters (there are no duplicates). * <p>The list is sorted by ascending trail characters (there are no duplicates).
* A linear search is used. * A linear search is used.
* *
* See normalizer2impl.h for a more detailed description * <p>See normalizer2impl.h for a more detailed description
* of the compositions list format. * of the compositions list format.
*/ */
private static int combine(String compositions, int list, int trail) { private static int combine(String compositions, int list, int trail) {
@ -2049,6 +2056,51 @@ public final class Normalizer2Impl {
buffer.flush(); buffer.flush();
} }
/**
 * Performs pairwise composition of a and b.
 *
 * <p>Depending on the norm16 value of 'a', the result is computed either
 * arithmetically (the two Hangul cases) or by looking up 'b' in the
 * compositions list that belongs to 'a'.
 *
 * @param a the first (lead) code point; an out-of-range value yields -1
 * @param b the second (trail) code point; an out-of-range value yields -1
 * @return the composite code point, or -1 if a and b do not compose
 */
public int composePair(int a, int b) {
    final int leadNorm16=getNorm16(a);  // an out-of-range 'a' maps to inert norm16=0
    if(isInert(leadNorm16)) {
        return -1;
    }

    final int compositions;  // offset into maybeYesCompositions
    if(leadNorm16>=minYesNoMappingsOnly) {
        // Only values in [minMaybeYes..MIN_NORMAL_MAYBE_YES[ carry a compositions list here.
        if(leadNorm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=leadNorm16) {
            return -1;
        }
        compositions=leadNorm16-minMaybeYes;
    } else if(isJamoL(leadNorm16)) {
        // Jamo L + Jamo V composes arithmetically into a Hangul syllable.
        final int vIndex=b-Hangul.JAMO_V_BASE;
        if(vIndex<0 || Hangul.JAMO_V_COUNT<=vIndex) {
            return -1;
        }
        final int lIndex=a-Hangul.JAMO_L_BASE;
        return Hangul.HANGUL_BASE+(lIndex*Hangul.JAMO_V_COUNT+vIndex)*Hangul.JAMO_T_COUNT;
    } else if(isHangul(leadNorm16)) {
        // Hangul syllable without a Jamo T + Jamo T composes arithmetically.
        final int tIndex=b-Hangul.JAMO_T_BASE;
        // tIndex must be strictly positive: index 0 does not compose.
        final boolean composes=
            Hangul.isHangulWithoutJamoT((char)a) && 0<tIndex && tIndex<Hangul.JAMO_T_COUNT;
        return composes ? a+tIndex : -1;
    } else {
        // 'a' has a compositions list in extraData.
        int extraDataOffset=leadNorm16;
        if(leadNorm16>minYesNo) {
            // Composite 'a' has both a mapping and a compositions list:
            // skip the first unit (which holds the mapping length) and the mapping itself.
            final int mappingLength=extraData.charAt(extraDataOffset)&MAPPING_LENGTH_MASK;
            extraDataOffset+=1+mappingLength;
        }
        // Turn the offset-into-extraData into an offset-into-maybeYesCompositions.
        compositions=extraDataOffset+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
    }

    // combine() requires a valid code point b.
    if(b<0 || 0x10ffff<b) {
        return -1;
    }
    // combine() returns compositeAndFwd (composite in bits 21..1) or -1;
    // the shift drops the forward-combining flag and keeps -1 as -1.
    return combine(maybeYesCompositions, compositions, b)>>1;
}
/** /**
* Does c have a composition boundary before it? * Does c have a composition boundary before it?
* True if its decomposition begins with a character that has * True if its decomposition begins with a character that has
@ -2157,6 +2209,7 @@ public final class Normalizer2Impl {
// Norm16 value thresholds for quick check combinations and types of extra data. // Norm16 value thresholds for quick check combinations and types of extra data.
private int minYesNo; private int minYesNo;
private int minYesNoMappingsOnly;
private int minNoNo; private int minNoNo;
private int limitNoNo; private int limitNoNo;
private int minMaybeYes; private int minMaybeYes;

View File

@ -94,10 +94,21 @@ public class FilteredNormalizer2 extends Normalizer2 {
* @draft ICU 49 * @draft ICU 49
* @provisional This API might change or be removed in a future release. * @provisional This API might change or be removed in a future release.
*/ */
@Override
public String getRawDecomposition(int c) { public String getRawDecomposition(int c) {
return set.contains(c) ? norm2.getRawDecomposition(c) : null; return set.contains(c) ? norm2.getRawDecomposition(c) : null;
} }
/**
 * {@inheritDoc}
 * <p>This filtered version composes only when both a and b are contained
 * in the filter set; otherwise it returns -1 without consulting the
 * wrapped normalizer.
 * @draft ICU 49
 * @provisional This API might change or be removed in a future release.
 */
@Override
public int composePair(int a, int b) {
    if(!set.contains(a) || !set.contains(b)) {
        return -1;
    }
    return norm2.composePair(a, b);
}
/** /**
* {@inheritDoc} * {@inheritDoc}
* @draft ICU 49 * @draft ICU 49

View File

@ -233,6 +233,24 @@ public abstract class Normalizer2 {
*/ */
public String getRawDecomposition(int c) { return null; } public String getRawDecomposition(int c) { return null; }
/**
 * Performs pairwise composition of a &amp; b and returns the composite if there is one.
 *
 * <p>A composite code point c is returned only if c has a two-way mapping to a+b.
 * In standard Unicode normalization, this means that
 * c has a canonical decomposition to a+b
 * and c does not have the Full_Composition_Exclusion property.
 *
 * <p>This function is independent of the mode of the Normalizer2.
 * The default implementation always reports "no composite" by returning a negative value.
 * @param a A (normalization starter) code point.
 * @param b Another code point.
 * @return The non-negative composite code point if there is one; otherwise a negative value.
 * @draft ICU 49
 * @provisional This API might change or be removed in a future release.
 */
public int composePair(int a, int b) {
    return -1;
}
/** /**
* Gets the combining class of c. * Gets the combining class of c.
* The default implementation returns 0 * The default implementation returns 0

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1 version https://git-lfs.github.com/spec/v1
oid sha256:5382d3c0ed27a17053f03c7efdb5ecd5df221a83cd325c0aad1cfbc4f55cff21 oid sha256:22211189938e570840f93181da77abc17c8e1b96e7dee1ade36bbfbcb3bd843c
size 7912034 size 7912007

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1 version https://git-lfs.github.com/spec/v1
oid sha256:440668d950fe840c740a3b40c3e1619863e7d21c8dfe84da9ad582c8032d7ee9 oid sha256:9f33b9cad8d56d14732880345a90bdd55dae0181e293ca44b9767d7fc0e0c247
size 719075 size 719077

View File

@ -815,6 +815,23 @@ public final class UCharacterTest extends TestFmwk
assertEquals( assertEquals(
String.format("error: nfc.getRawDecomposition(U+%04lx) is wrong", ch), String.format("error: nfc.getRawDecomposition(U+%04lx) is wrong", ch),
dm, mapping); dm, mapping);
/* recompose */
if(dt==UCharacter.DecompositionType.CANONICAL
        && !UCharacter.hasBinaryProperty(ch, UProperty.FULL_COMPOSITION_EXCLUSION)) {
    // Round-trip check: the first and last code points of the raw decomposition
    // must compose back to ch.
    int a=dm.codePointAt(0);
    int b=dm.codePointBefore(dm.length());
    int composite=nfc.composePair(a, b);
    // Use %04X, not the C-style %04lX: java.util.Formatter has no 'l' conversion
    // and would throw UnknownFormatConversionException while building the message.
    assertEquals(
            String.format(
                    "error: nfc U+%04X decomposes to U+%04X+U+%04X "+
                    "but does not compose back (instead U+%04X)",
                    ch, a, b, composite),
            ch, composite);
    /*
     * Note: NFKC has fewer round-trip mappings than NFC,
     * so we can't just test nfkc.composePair(a, b) here without further data.
     */
}
// testing iso comment // testing iso comment
try{ try{

View File

@ -2069,6 +2069,15 @@ public class BasicTest extends TestFmwk {
errln("getRawDecomposition() returns TRUE for characters which do not have decompositions"); errln("getRawDecomposition() returns TRUE for characters which do not have decompositions");
} }
// test composePair() for some pairs of characters that do not compose
if( nfcNorm2.composePair(0x20, 0x301)>=0 ||
nfcNorm2.composePair(0x61, 0x305)>=0 ||
nfcNorm2.composePair(0x1100, 0x1160)>=0 ||
nfcNorm2.composePair(0xac00, 0x11a7)>=0
) {
errln("NFC.composePair() incorrectly composes some pairs of characters");
}
// test FilteredNormalizer2.getDecomposition() // test FilteredNormalizer2.getDecomposition()
UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff]"); UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff]");
FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter); FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter);
@ -2080,6 +2089,13 @@ public class BasicTest extends TestFmwk {
if(fn2.getRawDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getRawDecomposition(0x100))) { if(fn2.getRawDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getRawDecomposition(0x100))) {
errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed"); errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed");
} }
// test FilteredNormalizer2::composePair()
if( 0x100!=fn2.composePair(0x41, 0x304) ||
fn2.composePair(0xc7, 0x301)>=0 // unfiltered result: U+1E08
) {
errln("FilteredNormalizer2(NFC, ^A0-FF).composePair() failed");
}
} }
// verify that case-folding does not un-FCD strings // verify that case-folding does not un-FCD strings