ICU-8804 Normalizer2.composePair(a, b) with separation of minYesNo extraData into combines-forward vs. not
X-SVN-Rev: 30983
This commit is contained in:
parent
bed105857f
commit
19735dc2a3
@ -133,6 +133,10 @@ public final class Norm2AllModes {
|
|||||||
public String getRawDecomposition(int c) {
|
public String getRawDecomposition(int c) {
|
||||||
return impl.getRawDecomposition(c);
|
return impl.getRawDecomposition(c);
|
||||||
}
|
}
|
||||||
|
@Override
|
||||||
|
public int composePair(int a, int b) {
|
||||||
|
return impl.composePair(a, b);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int getCombiningClass(int c) {
|
public int getCombiningClass(int c) {
|
||||||
|
@ -436,6 +436,7 @@ public final class Normalizer2Impl {
|
|||||||
minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
|
minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
|
||||||
|
|
||||||
minYesNo=inIndexes[IX_MIN_YES_NO];
|
minYesNo=inIndexes[IX_MIN_YES_NO];
|
||||||
|
minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
|
||||||
minNoNo=inIndexes[IX_MIN_NO_NO];
|
minNoNo=inIndexes[IX_MIN_NO_NO];
|
||||||
limitNoNo=inIndexes[IX_LIMIT_NO_NO];
|
limitNoNo=inIndexes[IX_LIMIT_NO_NO];
|
||||||
minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
|
minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
|
||||||
@ -926,11 +927,15 @@ public final class Normalizer2Impl {
|
|||||||
public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
|
public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
|
||||||
|
|
||||||
// Norm16 value thresholds for quick check combinations and types of extra data.
|
// Norm16 value thresholds for quick check combinations and types of extra data.
|
||||||
|
// Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
|
||||||
public static final int IX_MIN_YES_NO=10;
|
public static final int IX_MIN_YES_NO=10;
|
||||||
public static final int IX_MIN_NO_NO=11;
|
public static final int IX_MIN_NO_NO=11;
|
||||||
public static final int IX_LIMIT_NO_NO=12;
|
public static final int IX_LIMIT_NO_NO=12;
|
||||||
public static final int IX_MIN_MAYBE_YES=13;
|
public static final int IX_MIN_MAYBE_YES=13;
|
||||||
|
|
||||||
|
// Mappings only in [minYesNoMappingsOnly..minNoNo[.
|
||||||
|
public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
|
||||||
|
|
||||||
public static final int IX_COUNT=16;
|
public static final int IX_COUNT=16;
|
||||||
|
|
||||||
public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
|
public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
|
||||||
@ -1658,7 +1663,7 @@ public final class Normalizer2Impl {
|
|||||||
private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
|
private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
|
||||||
private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
|
private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
|
||||||
private static boolean isInert(int norm16) { return norm16==0; }
|
private static boolean isInert(int norm16) { return norm16==0; }
|
||||||
// static UBool isJamoL(uint16_t norm16) const { return norm16==1; }
|
private static boolean isJamoL(int norm16) { return norm16==1; }
|
||||||
private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
|
private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
|
||||||
private boolean isHangul(int norm16) { return norm16==minYesNo; }
|
private boolean isHangul(int norm16) { return norm16==minYesNo; }
|
||||||
private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
|
private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
|
||||||
@ -1804,26 +1809,28 @@ public final class Normalizer2Impl {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/**
|
||||||
* Finds the recomposition result for
|
* Finds the recomposition result for
|
||||||
* a forward-combining "lead" character,
|
* a forward-combining "lead" character,
|
||||||
* specified with a pointer to its compositions list,
|
* specified with a pointer to its compositions list,
|
||||||
* and a backward-combining "trail" character.
|
* and a backward-combining "trail" character.
|
||||||
*
|
*
|
||||||
* If the lead and trail characters combine, then this function returns
|
* <p>If the lead and trail characters combine, then this function returns
|
||||||
* the following "compositeAndFwd" value:
|
* the following "compositeAndFwd" value:
|
||||||
|
* <pre>
|
||||||
* Bits 21..1 composite character
|
* Bits 21..1 composite character
|
||||||
* Bit 0 set if the composite is a forward-combining starter
|
* Bit 0 set if the composite is a forward-combining starter
|
||||||
|
* </pre>
|
||||||
* otherwise it returns -1.
|
* otherwise it returns -1.
|
||||||
*
|
*
|
||||||
* The compositions list has (trail, compositeAndFwd) pair entries,
|
* <p>The compositions list has (trail, compositeAndFwd) pair entries,
|
||||||
* encoded as either pairs or triples of 16-bit units.
|
* encoded as either pairs or triples of 16-bit units.
|
||||||
* The last entry has the high bit of its first unit set.
|
* The last entry has the high bit of its first unit set.
|
||||||
*
|
*
|
||||||
* The list is sorted by ascending trail characters (there are no duplicates).
|
* <p>The list is sorted by ascending trail characters (there are no duplicates).
|
||||||
* A linear search is used.
|
* A linear search is used.
|
||||||
*
|
*
|
||||||
* See normalizer2impl.h for a more detailed description
|
* <p>See normalizer2impl.h for a more detailed description
|
||||||
* of the compositions list format.
|
* of the compositions list format.
|
||||||
*/
|
*/
|
||||||
private static int combine(String compositions, int list, int trail) {
|
private static int combine(String compositions, int list, int trail) {
|
||||||
@ -2049,6 +2056,51 @@ public final class Normalizer2Impl {
|
|||||||
buffer.flush();
|
buffer.flush();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public int composePair(int a, int b) {
|
||||||
|
int norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0
|
||||||
|
int list;
|
||||||
|
if(isInert(norm16)) {
|
||||||
|
return -1;
|
||||||
|
} else if(norm16<minYesNoMappingsOnly) {
|
||||||
|
if(isJamoL(norm16)) {
|
||||||
|
b-=Hangul.JAMO_V_BASE;
|
||||||
|
if(0<=b && b<Hangul.JAMO_V_COUNT) {
|
||||||
|
return
|
||||||
|
(Hangul.HANGUL_BASE+
|
||||||
|
((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)*
|
||||||
|
Hangul.JAMO_T_COUNT);
|
||||||
|
} else {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
} else if(isHangul(norm16)) {
|
||||||
|
b-=Hangul.JAMO_T_BASE;
|
||||||
|
if(Hangul.isHangulWithoutJamoT((char)a) && 0<b && b<Hangul.JAMO_T_COUNT) { // not b==0!
|
||||||
|
return a+b;
|
||||||
|
} else {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// 'a' has a compositions list in extraData
|
||||||
|
list=norm16;
|
||||||
|
if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list
|
||||||
|
list+= // mapping pointer
|
||||||
|
1+ // +1 to skip the first unit with the mapping length
|
||||||
|
(extraData.charAt(list)&MAPPING_LENGTH_MASK); // + mapping length
|
||||||
|
}
|
||||||
|
// Turn the offset-into-extraData into an offset-into-maybeYesCompositions.
|
||||||
|
list+=MIN_NORMAL_MAYBE_YES-minMaybeYes;
|
||||||
|
}
|
||||||
|
} else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
|
||||||
|
return -1;
|
||||||
|
} else {
|
||||||
|
list=norm16-minMaybeYes; // offset into maybeYesCompositions
|
||||||
|
}
|
||||||
|
if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
return combine(maybeYesCompositions, list, b)>>1;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Does c have a composition boundary before it?
|
* Does c have a composition boundary before it?
|
||||||
* True if its decomposition begins with a character that has
|
* True if its decomposition begins with a character that has
|
||||||
@ -2157,6 +2209,7 @@ public final class Normalizer2Impl {
|
|||||||
|
|
||||||
// Norm16 value thresholds for quick check combinations and types of extra data.
|
// Norm16 value thresholds for quick check combinations and types of extra data.
|
||||||
private int minYesNo;
|
private int minYesNo;
|
||||||
|
private int minYesNoMappingsOnly;
|
||||||
private int minNoNo;
|
private int minNoNo;
|
||||||
private int limitNoNo;
|
private int limitNoNo;
|
||||||
private int minMaybeYes;
|
private int minMaybeYes;
|
||||||
|
@ -94,10 +94,21 @@ public class FilteredNormalizer2 extends Normalizer2 {
|
|||||||
* @draft ICU 49
|
* @draft ICU 49
|
||||||
* @provisional This API might change or be removed in a future release.
|
* @provisional This API might change or be removed in a future release.
|
||||||
*/
|
*/
|
||||||
|
@Override
|
||||||
public String getRawDecomposition(int c) {
|
public String getRawDecomposition(int c) {
|
||||||
return set.contains(c) ? norm2.getRawDecomposition(c) : null;
|
return set.contains(c) ? norm2.getRawDecomposition(c) : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@inheritDoc}
|
||||||
|
* @draft ICU 49
|
||||||
|
* @provisional This API might change or be removed in a future release.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public int composePair(int a, int b) {
|
||||||
|
return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : -1;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* {@inheritDoc}
|
* {@inheritDoc}
|
||||||
* @draft ICU 49
|
* @draft ICU 49
|
||||||
|
@ -233,6 +233,24 @@ public abstract class Normalizer2 {
|
|||||||
*/
|
*/
|
||||||
public String getRawDecomposition(int c) { return null; }
|
public String getRawDecomposition(int c) { return null; }
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs pairwise composition of a &amp; b and returns the composite if there is one.
|
||||||
|
*
|
||||||
|
* <p>Returns a composite code point c only if c has a two-way mapping to a+b.
|
||||||
|
* In standard Unicode normalization, this means that
|
||||||
|
* c has a canonical decomposition to a+b
|
||||||
|
* and c does not have the Full_Composition_Exclusion property.
|
||||||
|
*
|
||||||
|
* <p>This function is independent of the mode of the Normalizer2.
|
||||||
|
* The default implementation returns a negative value.
|
||||||
|
* @param a A (normalization starter) code point.
|
||||||
|
* @param b Another code point.
|
||||||
|
* @return The non-negative composite code point if there is one; otherwise a negative value.
|
||||||
|
* @draft ICU 49
|
||||||
|
* @provisional This API might change or be removed in a future release.
|
||||||
|
*/
|
||||||
|
public int composePair(int a, int b) { return -1; }
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the combining class of c.
|
* Gets the combining class of c.
|
||||||
* The default implementation returns 0
|
* The default implementation returns 0
|
||||||
|
@ -1,3 +1,3 @@
|
|||||||
version https://git-lfs.github.com/spec/v1
|
version https://git-lfs.github.com/spec/v1
|
||||||
oid sha256:5382d3c0ed27a17053f03c7efdb5ecd5df221a83cd325c0aad1cfbc4f55cff21
|
oid sha256:22211189938e570840f93181da77abc17c8e1b96e7dee1ade36bbfbcb3bd843c
|
||||||
size 7912034
|
size 7912007
|
||||||
|
@ -1,3 +1,3 @@
|
|||||||
version https://git-lfs.github.com/spec/v1
|
version https://git-lfs.github.com/spec/v1
|
||||||
oid sha256:440668d950fe840c740a3b40c3e1619863e7d21c8dfe84da9ad582c8032d7ee9
|
oid sha256:9f33b9cad8d56d14732880345a90bdd55dae0181e293ca44b9767d7fc0e0c247
|
||||||
size 719075
|
size 719077
|
||||||
|
@ -815,6 +815,23 @@ public final class UCharacterTest extends TestFmwk
|
|||||||
assertEquals(
|
assertEquals(
|
||||||
String.format("error: nfc.getRawDecomposition(U+%04lx) is wrong", ch),
|
String.format("error: nfc.getRawDecomposition(U+%04lx) is wrong", ch),
|
||||||
dm, mapping);
|
dm, mapping);
|
||||||
|
/* recompose */
|
||||||
|
if(dt==UCharacter.DecompositionType.CANONICAL
|
||||||
|
&& !UCharacter.hasBinaryProperty(ch, UProperty.FULL_COMPOSITION_EXCLUSION)) {
|
||||||
|
int a=dm.codePointAt(0);
|
||||||
|
int b=dm.codePointBefore(dm.length());
|
||||||
|
int composite=nfc.composePair(a, b);
|
||||||
|
assertEquals(
|
||||||
|
String.format(
|
||||||
|
"error: nfc U+%04lX decomposes to U+%04lX+U+%04lX "+
|
||||||
|
"but does not compose back (instead U+%04lX)",
|
||||||
|
ch, a, b, composite),
|
||||||
|
ch, composite);
|
||||||
|
/*
|
||||||
|
* Note: NFKC has fewer round-trip mappings than NFC,
|
||||||
|
* so we can't just test nfkc.composePair(a, b) here without further data.
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
|
||||||
// testing iso comment
|
// testing iso comment
|
||||||
try{
|
try{
|
||||||
|
@ -2069,6 +2069,15 @@ public class BasicTest extends TestFmwk {
|
|||||||
errln("getRawDecomposition() returns TRUE for characters which do not have decompositions");
|
errln("getRawDecomposition() returns TRUE for characters which do not have decompositions");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// test composePair() for some pairs of characters that do not compose
|
||||||
|
if( nfcNorm2.composePair(0x20, 0x301)>=0 ||
|
||||||
|
nfcNorm2.composePair(0x61, 0x305)>=0 ||
|
||||||
|
nfcNorm2.composePair(0x1100, 0x1160)>=0 ||
|
||||||
|
nfcNorm2.composePair(0xac00, 0x11a7)>=0
|
||||||
|
) {
|
||||||
|
errln("NFC.composePair() incorrectly composes some pairs of characters");
|
||||||
|
}
|
||||||
|
|
||||||
// test FilteredNormalizer2.getDecomposition()
|
// test FilteredNormalizer2.getDecomposition()
|
||||||
UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff]");
|
UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff]");
|
||||||
FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter);
|
FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter);
|
||||||
@ -2080,6 +2089,13 @@ public class BasicTest extends TestFmwk {
|
|||||||
if(fn2.getRawDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getRawDecomposition(0x100))) {
|
if(fn2.getRawDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getRawDecomposition(0x100))) {
|
||||||
errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed");
|
errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// test FilteredNormalizer2.composePair()
|
||||||
|
if( 0x100!=fn2.composePair(0x41, 0x304) ||
|
||||||
|
fn2.composePair(0xc7, 0x301)>=0 // unfiltered result: U+1E08
|
||||||
|
) {
|
||||||
|
errln("FilteredNormalizer2(NFC, ^A0-FF).composePair() failed");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// verify that case-folding does not un-FCD strings
|
// verify that case-folding does not un-FCD strings
|
||||||
|
Loading…
Reference in New Issue
Block a user