ICU-8804 Normalizer2.composePair(a, b) with separation of minYesNo extraData into combines-forward vs. not

X-SVN-Rev: 30983
This commit is contained in:
Markus Scherer 2011-11-27 20:34:42 +00:00
parent bed105857f
commit 19735dc2a3
8 changed files with 129 additions and 10 deletions

View File

@ -133,6 +133,10 @@ public final class Norm2AllModes {
public String getRawDecomposition(int c) {
return impl.getRawDecomposition(c);
}
// Pairwise composition is forwarded unchanged to impl.composePair(a, b);
// no mode-specific processing is applied here.
@Override
public int composePair(int a, int b) {
return impl.composePair(a, b);
}
@Override
public int getCombiningClass(int c) {

View File

@ -436,6 +436,7 @@ public final class Normalizer2Impl {
minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
minYesNo=inIndexes[IX_MIN_YES_NO];
minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
minNoNo=inIndexes[IX_MIN_NO_NO];
limitNoNo=inIndexes[IX_LIMIT_NO_NO];
minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
@ -926,11 +927,15 @@ public final class Normalizer2Impl {
// The IX_ constants are slot numbers used to read values out of the inIndexes array at load time.
public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
// Norm16 value thresholds for quick check combinations and types of extra data.
// Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
// (The "[a..b[" notation denotes a range that includes a and excludes b.)
public static final int IX_MIN_YES_NO=10;
public static final int IX_MIN_NO_NO=11;
public static final int IX_LIMIT_NO_NO=12;
public static final int IX_MIN_MAYBE_YES=13;
// Mappings only in [minYesNoMappingsOnly..minNoNo[.
// Note: this slot is numbered 14 even though, per the ranges above, its threshold value
// lies between the minYesNo and minNoNo thresholds stored in slots 10 and 11.
public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
// NOTE(review): presumably the total number of index slots; slot 15 has no named constant
// in this excerpt -- confirm.
public static final int IX_COUNT=16;
// NOTE(review): not a slot number like the IX_ constants above; looks like a bit flag
// tested on a mapping's first unit -- confirm against the data format description.
public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
@ -1658,7 +1663,7 @@ public final class Normalizer2Impl {
// True when norm16 lies in the closed range [minMaybeYes, JAMO_VT].
private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
// True when norm16 is at or above minMaybeYes; unlike isMaybe() there is no upper bound.
private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
// The inert norm16 value is 0.
private static boolean isInert(int norm16) { return norm16==0; }
// static UBool isJamoL(uint16_t norm16) const { return norm16==1; }
// A Jamo L is identified by the fixed norm16 value 1.
private static boolean isJamoL(int norm16) { return norm16==1; }
// True when norm16 equals the JAMO_VT constant.
private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
// True when norm16 is exactly minYesNo. Not static because minYesNo is an instance field.
private boolean isHangul(int norm16) { return norm16==minYesNo; }
// True when norm16 is below the minNoNo threshold.
private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
@ -1804,26 +1809,28 @@ public final class Normalizer2Impl {
}
}
/*
/**
* Finds the recomposition result for
* a forward-combining "lead" character,
* specified with a pointer to its compositions list,
* and a backward-combining "trail" character.
*
* If the lead and trail characters combine, then this function returns
* <p>If the lead and trail characters combine, then this function returns
* the following "compositeAndFwd" value:
* <pre>
* Bits 21..1 composite character
* Bit 0 set if the composite is a forward-combining starter
* </pre>
* otherwise it returns -1.
*
* The compositions list has (trail, compositeAndFwd) pair entries,
* <p>The compositions list has (trail, compositeAndFwd) pair entries,
* encoded as either pairs or triples of 16-bit units.
* The last entry has the high bit of its first unit set.
*
* The list is sorted by ascending trail characters (there are no duplicates).
* <p>The list is sorted by ascending trail characters (there are no duplicates).
* A linear search is used.
*
* See normalizer2impl.h for a more detailed description
* <p>See normalizer2impl.h for a more detailed description
* of the compositions list format.
*/
private static int combine(String compositions, int list, int trail) {
@ -2049,6 +2056,51 @@ public final class Normalizer2Impl {
buffer.flush();
}
/**
 * Performs pairwise composition of the code points a and b.
 *
 * <p>Three cases can yield a composite:
 * <ul>
 * <li>a is a Jamo L and b is a Jamo V: the result is computed arithmetically.
 * <li>a is a Hangul syllable without a Jamo T and b is a Jamo T (other than the
 *     filler at offset 0): the result is a plus the T offset.
 * <li>a has a compositions list: the list is searched for b via combine().
 * </ul>
 *
 * @param a the first code point; an out-of-range value is treated as inert
 * @param b the second code point; an out-of-range value never composes
 * @return the composite code point, or -1 if a and b do not compose
 */
public int composePair(int a, int b) {
    final int norm16OfA=getNorm16(a);  // an out-of-range 'a' yields the inert norm16=0
    if(isInert(norm16OfA)) {
        return -1;
    }

    final int listOffset;  // offset of a's compositions list in maybeYesCompositions
    if(norm16OfA>=minYesNoMappingsOnly) {
        // Only values in [minMaybeYes..MIN_NORMAL_MAYBE_YES[ lead to a list lookup here.
        if(norm16OfA<minMaybeYes || norm16OfA>=MIN_NORMAL_MAYBE_YES) {
            return -1;
        }
        listOffset=norm16OfA-minMaybeYes;
    } else if(isJamoL(norm16OfA)) {
        // Jamo L + Jamo V: computed from the two offsets.
        final int vOffset=b-Hangul.JAMO_V_BASE;
        if(vOffset<0 || vOffset>=Hangul.JAMO_V_COUNT) {
            return -1;
        }
        final int lOffset=a-Hangul.JAMO_L_BASE;
        return Hangul.HANGUL_BASE+
            (lOffset*Hangul.JAMO_V_COUNT+vOffset)*Hangul.JAMO_T_COUNT;
    } else if(isHangul(norm16OfA)) {
        // Hangul syllable without Jamo T + Jamo T. The offset must be strictly
        // positive: offset 0 is deliberately rejected.
        final int tOffset=b-Hangul.JAMO_T_BASE;
        final boolean composes=
            Hangul.isHangulWithoutJamoT((char)a) && 0<tOffset && tOffset<Hangul.JAMO_T_COUNT;
        return composes ? a+tOffset : -1;
    } else {
        // 'a' has a compositions list in extraData.
        int extraDataOffset=norm16OfA;
        if(norm16OfA>minYesNo) {
            // A composite 'a' stores its mapping in front of the compositions list:
            // step over the first unit (which holds the mapping length) and the mapping itself.
            final int mappingLength=extraData.charAt(extraDataOffset)&MAPPING_LENGTH_MASK;
            extraDataOffset+=1+mappingLength;
        }
        // Convert the offset-into-extraData into an offset-into-maybeYesCompositions.
        listOffset=extraDataOffset+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
    }

    // combine() requires a valid code point as the trail character.
    if(b<0 || b>0x10ffff) {
        return -1;
    }
    // Drop bit 0 (the "combines forward" flag) of the compositeAndFwd value;
    // a -1 from combine() stays -1 after the shift.
    final int compositeAndFwd=combine(maybeYesCompositions, listOffset, b);
    return compositeAndFwd>>1;
}
/**
* Does c have a composition boundary before it?
* True if its decomposition begins with a character that has
@ -2157,6 +2209,7 @@ public final class Normalizer2Impl {
// Norm16 value thresholds for quick check combinations and types of extra data.
private int minYesNo;
private int minYesNoMappingsOnly;
private int minNoNo;
private int limitNoNo;
private int minMaybeYes;

View File

@ -94,10 +94,21 @@ public class FilteredNormalizer2 extends Normalizer2 {
* @draft ICU 49
* @provisional This API might change or be removed in a future release.
*/
@Override
public String getRawDecomposition(int c) {
    // Code points outside the set are reported as having no raw decomposition.
    if(!set.contains(c)) {
        return null;
    }
    return norm2.getRawDecomposition(c);
}
/**
 * {@inheritDoc}
 * <p>The wrapped normalizer is consulted only when both a and b are contained
 * in the set; otherwise -1 is returned.
 * @draft ICU 49
 * @provisional This API might change or be removed in a future release.
 */
@Override
public int composePair(int a, int b) {
    // Check a first, then b, and stop at the first one that is not in the set.
    if(!set.contains(a) || !set.contains(b)) {
        return -1;
    }
    return norm2.composePair(a, b);
}
/**
* {@inheritDoc}
* @draft ICU 49

View File

@ -233,6 +233,24 @@ public abstract class Normalizer2 {
*/
public String getRawDecomposition(int c) { return null; }
/**
* Performs pairwise composition of a &amp; b and returns the composite if there is one.
*
* <p>Returns a composite code point c only if c has a two-way mapping to a+b.
* In standard Unicode normalization, this means that
* c has a canonical decomposition to a+b
* and c does not have the Full_Composition_Exclusion property.
*
* <p>This function is independent of the mode of the Normalizer2.
* The default implementation returns a negative value.
* @param a A (normalization starter) code point.
* @param b Another code point.
* @return The non-negative composite code point if there is one; otherwise a negative value.
* @draft ICU 49
* @provisional This API might change or be removed in a future release.
*/
public int composePair(int a, int b) { return -1; }
/**
* Gets the combining class of c.
* The default implementation returns 0

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5382d3c0ed27a17053f03c7efdb5ecd5df221a83cd325c0aad1cfbc4f55cff21
size 7912034
oid sha256:22211189938e570840f93181da77abc17c8e1b96e7dee1ade36bbfbcb3bd843c
size 7912007

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:440668d950fe840c740a3b40c3e1619863e7d21c8dfe84da9ad582c8032d7ee9
size 719075
oid sha256:9f33b9cad8d56d14732880345a90bdd55dae0181e293ca44b9767d7fc0e0c247
size 719077

View File

@ -815,6 +815,23 @@ public final class UCharacterTest extends TestFmwk
// Note on the format strings below: java.util.Formatter has no C-style 'l' length
// modifier. "%04lx" makes String.format() throw UnknownFormatConversionException
// while building the message, so plain %04x / %04X is used for the int code points.
assertEquals(
        String.format("error: nfc.getRawDecomposition(U+%04x) is wrong", ch),
        dm, mapping);
/* recompose */
// Only canonical decompositions that are not composition exclusions are expected
// to round-trip through composePair().
if(dt==UCharacter.DecompositionType.CANONICAL
        && !UCharacter.hasBinaryProperty(ch, UProperty.FULL_COMPOSITION_EXCLUSION)) {
    // The raw decomposition is a pair: take its first and its last code point.
    int a=dm.codePointAt(0);
    int b=dm.codePointBefore(dm.length());
    int composite=nfc.composePair(a, b);
    assertEquals(
            String.format(
                    "error: nfc U+%04X decomposes to U+%04X+U+%04X "+
                    "but does not compose back (instead U+%04X)",
                    ch, a, b, composite),
            ch, composite);
    /*
     * Note: NFKC has fewer round-trip mappings than NFC,
     * so we can't just test nfkc.composePair(a, b) here without further data.
     */
}
// testing iso comment
try{

View File

@ -2069,6 +2069,15 @@ public class BasicTest extends TestFmwk {
errln("getRawDecomposition() returns TRUE for characters which do not have decompositions");
}
// test composePair() for some pairs of characters that do not compose
if( nfcNorm2.composePair(0x20, 0x301)>=0 ||
nfcNorm2.composePair(0x61, 0x305)>=0 ||
nfcNorm2.composePair(0x1100, 0x1160)>=0 ||
nfcNorm2.composePair(0xac00, 0x11a7)>=0
) {
errln("NFC.composePair() incorrectly composes some pairs of characters");
}
// test FilteredNormalizer2.getDecomposition()
UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff]");
FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter);
@ -2080,6 +2089,13 @@ public class BasicTest extends TestFmwk {
if(fn2.getRawDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getRawDecomposition(0x100))) {
errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed");
}
// test FilteredNormalizer2.composePair()
if( 0x100!=fn2.composePair(0x41, 0x304) ||
fn2.composePair(0xc7, 0x301)>=0 // unfiltered result: U+1E08
) {
errln("FilteredNormalizer2(NFC, ^A0-FF).composePair() failed");
}
}
// verify that case-folding does not un-FCD strings