Use modified cascade, added more tests of implicit weights

X-SVN-Rev: 8787
This commit is contained in:
Mark Davis 2002-06-04 23:56:29 +00:00
parent 32f40e73bf
commit 4ae38aed3b

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
* $Date: 2002/06/04 19:01:51 $
* $Revision: 1.16 $
* $Date: 2002/06/04 23:56:29 $
* $Revision: 1.17 $
*
*******************************************************************************
*/
@ -368,7 +368,7 @@ public class WriteCollationData implements UCD_Types {
if (!shortPrint) {
log.print(Utility.hex(source));
log.print(
";\t#" + ucd.getName(clipped)+ "\t" + UCA.toString(key));
";\t#" + ucd.getName(clipped) + "\t" + UCA.toString(key));
} else {
log.print(source + "\t" + Utility.hex(clipped));
}
@ -2278,7 +2278,7 @@ F900..FAFF; CJK Compatibility Ideographs
summary.println("[variable top = " + Utility.hex(primaryDelta[firstPrimary]) + "] # END OF VARIABLE SECTION!!!");
summary.println();
}
summary.println("First: " + Utility.hex(np & 0xFFFFFFFFL) + " " + ucd.getName(UTF16.charAt(chr,0)));
summary.println("First: " + Utility.hex(np & 0xFFFFFFFFL) + ", " + ucd.getCodeAndName(UTF16.charAt(chr,0)));
}
lastNp = np;
isFirst = false;
@ -2296,7 +2296,7 @@ F900..FAFF; CJK Compatibility Ideographs
log.println();
lastChr = chr;
}
summary.println("Last: " + Utility.hex(lastNp) + " " + ucd.getName(UTF16.charAt(lastChr, 0)));
summary.println("Last: " + Utility.hex(lastNp) + ", " + ucd.getCodeAndName(UTF16.charAt(lastChr, 0)));
/*
String sample = "\u3400\u3401\u4DB4\u4DB5\u4E00\u4E01\u9FA4\u9FA5\uAC00\uAC01\uD7A2\uD7A3";
@ -2312,7 +2312,7 @@ F900..FAFF; CJK Compatibility Ideographs
summary.println("# First CJK: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x4E00)));
summary.println("# Last CJK: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0xFA2F)));
summary.println("# First CJK_A: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x3400)));
summary.println("# Last CJK: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x4DBF)));
summary.println("# Last CJK_A: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x4DBF)));
boolean lastOne = false;
for (int i = 0; i < 0x10FFFF; ++i) {
@ -2405,37 +2405,59 @@ F900..FAFF; CJK Compatibility Ideographs
}
*/
/**
* Function used to collapse the two different Han blocks from UCA into one.
* It does this by reversing the order of the two groups A and B in the BMP below.
* NOTE: MUST NOT BE CALLED EXCEPT FOR CJK CHARACTERS
* A:
* 4E00..9FFF; CJK Unified Ideographs
* F900..FAFF; CJK Compatibility Ideographs
* B:
* 3400..4DBF; CJK Unified Ideographs Extension A
* As long as
* no new B characters are allocated between 4E00 and FAFF, and
* no new A characters are outside of this range,
* (very high probability) this simple code will work.
* Block1 is CJK
* Block2 is CJK_COMPAT_USED
* Block3 is CJK_A
*/
static int swapCJK(int i) {
if (i >= UCA.CJK_COMPAT_USED_BASE) return i - UCA.CJK_COMPAT_USED_BASE + (UCA.CJK_LIMIT - UCA.CJK_BASE);
if (i >= UCA.CJK_BASE) return i - UCA.CJK_BASE;
// remainder must be CJK_A
return i - UCA.CJK_A_BASE + (UCA.CJK_LIMIT - UCA.CJK_BASE) + (UCA.CJK_COMPAT_USED_LIMIT - UCA.CJK_COMPAT_USED_BASE);
}
/**
* Function used to:
* a) collapse the 2 different Han ranges from UCA into one (in the right order), and
* b) bump any non-CJK characters by 10FFFF.
* The relevant blocks are:
* A: 4E00..9FFF; CJK Unified Ideographs
* F900..FAFF; CJK Compatibility Ideographs
* B: 3400..4DBF; CJK Unified Ideographs Extension A
* 20000..XX; CJK Unified Ideographs Extension B (and others later on)
* As long as
* no new B characters are allocated between 4E00 and FAFF, and
* no new A characters are outside of this range,
* (very high probability) this simple code will work.
* The reordered blocks are:
* Block1 is CJK
* Block2 is CJK_COMPAT_USED
* Block3 is CJK_A
* Any other CJK gets its normal code point
* Any non-CJK gets +10FFFF
* When we reorder Block1, we make sure that it is at the very start,
* so that it will use a 3-byte form.
*/
static int swapCJK(int i) {
if (i >= UCA.CJK_BASE) {
if (i < UCA.CJK_LIMIT) return i - UCA.CJK_BASE;
if (i < UCA.CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
if (i < UCA.CJK_COMPAT_USED_LIMIT) return i - UCA.CJK_COMPAT_USED_BASE
+ (UCA.CJK_LIMIT - UCA.CJK_BASE);
if (i < UCA.CJK_B_BASE) return i + NON_CJK_OFFSET;
if (i < UCA.CJK_B_LIMIT) return i; // non-BMP-CJK
return i + NON_CJK_OFFSET; // non-CJK
}
if (i < UCA.CJK_A_BASE) return i + NON_CJK_OFFSET;
if (i < UCA.CJK_A_LIMIT) return i - UCA.CJK_A_BASE
+ (UCA.CJK_LIMIT - UCA.CJK_BASE)
+ (UCA.CJK_COMPAT_USED_LIMIT - UCA.CJK_COMPAT_USED_BASE);
return i + NON_CJK_OFFSET; // non-CJK
}
// CONSTANTS
static final int
static final int
NON_CJK_OFFSET = 0x110000,
BYTES_TO_AVOID = 3,
OTHER_COUNT = 256 - BYTES_TO_AVOID,
LAST_COUNT = OTHER_COUNT / 2,
LAST_COUNT2 = OTHER_COUNT / 20, // room for intervening, without expanding to 5 bytes
LAST_COUNT2 = OTHER_COUNT / 21, // room for intervening, without expanding to 5 bytes
IMPLICIT_3BYTE_COUNT = 1,
IMPLICIT_BASE_BYTE = 0xE0,
@ -2452,6 +2474,20 @@ F900..FAFF; CJK Compatibility Ideographs
// Return value is left justified primary key
static int getImplicitPrimary(int cp) {
if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
cp = swapCJK(cp);
if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
// we now have a range of numbers from 0 to 21FFFF.
return getImplicitPrimaryFromSwapped(cp);
}
static int getImplicitPrimaryFromSwapped(int cp) {
// we must skip all 00, 01, 02 bytes, so most bytes have 253 values
// we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
// we shift so that HAN all has the same first primary, for compression.
@ -2459,15 +2495,6 @@ F900..FAFF; CJK Compatibility Ideographs
// Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
// Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)
if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
if (!UCA.isCJK(cp) && !UCA.isCJK_AB(cp)) cp += 0x10FFFF; // space everything else after CJK
else cp = swapCJK(cp);
if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
// we now have a range of numbers from 0 to 21FFFF.
int last0 = cp - IMPLICIT_4BYTE_BOUNDARY;
if (last0 < 0) {
int last1 = cp / LAST_COUNT;
@ -2506,22 +2533,45 @@ F900..FAFF; CJK Compatibility Ideographs
static void showImplicit(String title, int cp) {
if (DEBUG) {
System.out.println(title + "-1: " + Utility.hex(cp-1) + " => "
+ Utility.hex(0xFFFFFFFFL & getImplicitPrimary(cp-1)));
System.out.println(title + ": " + Utility.hex(cp) + " => "
+ Utility.hex(0xFFFFFFFFL & getImplicitPrimary(cp)));
System.out.println(title + "+1: " + Utility.hex(cp+1) + " => "
+ Utility.hex(0xFFFFFFFFL & getImplicitPrimary(cp+1)));
}
if (DEBUG) showImplicit2(title + "-1", cp-1);
showImplicit2(title + "00", cp);
if (DEBUG) showImplicit2(title + "+1", cp+1);
}
static void showImplicit2(String title, int cp) {
System.out.println(title + ":\t" + Utility.hex(cp)
+ " => " + Utility.hex(swapCJK(cp))
+ " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(cp)));
}
static void showImplicit3(String title, int cp) {
System.out.println("*" + title + ":\t" + Utility.hex(cp)
+ " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimaryFromSwapped(cp)));
}
// TEST PROGRAM
static void checkImplicit() {
long oldPrimary = 0;
System.out.println("Starting Implicit Check");
long oldPrimary = 0;
int oldChar = -1;
int oldSwap = -1;
// test monotonically increasing
for (int i = 0; i < 0x21FFFF; ++i) {
long newPrimary = 0xFFFFFFFFL & getImplicitPrimaryFromSwapped(i);
if (newPrimary < oldPrimary) {
throw new IllegalArgumentException(Utility.hex(i) + ": overlap: "
+ Utility.hex(oldChar) + " (" + Utility.hex(oldPrimary) + ")"
+ " > " + Utility.hex(i) + "(" + Utility.hex(newPrimary) + ")");
}
oldPrimary = newPrimary;
}
showImplicit("# First CJK", UCA.CJK_BASE);
showImplicit("# Last CJK", UCA.CJK_LIMIT-1);
showImplicit("# First CJK-compat", UCA.CJK_COMPAT_USED_BASE);
@ -2532,30 +2582,47 @@ F900..FAFF; CJK Compatibility Ideographs
showImplicit("# Last CJK_B", UCA.CJK_B_LIMIT-1);
showImplicit("# First Other Implicit", 0);
showImplicit("# Last Other Implicit", 0x10FFFF);
showImplicit("# Boundary", IMPLICIT_4BYTE_BOUNDARY);
showImplicit3("# FIRST", 0);
showImplicit3("# Boundary-1", IMPLICIT_4BYTE_BOUNDARY-1);
showImplicit3("# Boundary00", IMPLICIT_4BYTE_BOUNDARY);
showImplicit3("# Boundary+1", IMPLICIT_4BYTE_BOUNDARY+1);
showImplicit3("# LAST", 0x21FFFF);
oldPrimary = 0;
oldChar = -1;
int oldChar = -1;
for (int batch = 0; batch < 3; ++batch) {
for (int i = 0; i <= 0x10FFFF; ++i) {
// separate the three groups
if (UCA.isCJK(i)) {
if (UCA.isCJK(i) || UCA.CJK_COMPAT_USED_BASE <= i && i < UCA.CJK_COMPAT_USED_LIMIT) {
if (batch != 0) continue;
} else if (UCA.isCJK_AB(i)) {
if (batch != 1) continue;
} else if (batch != 2) continue;
// test swapping
int currSwap = swapCJK(i);
if (currSwap < oldSwap) {
throw new IllegalArgumentException(Utility.hex(i) + ": overlap: "
+ Utility.hex(oldChar) + " (" + Utility.hex(oldSwap) + ")"
+ " > " + Utility.hex(i) + "(" + Utility.hex(currSwap) + ")");
}
long newPrimary = 0xFFFFFFFFL & getImplicitPrimary(i);
// test correct values
if ((newPrimary) < (oldPrimary)) {
if (newPrimary < oldPrimary) {
throw new IllegalArgumentException(Utility.hex(i) + ": overlap: "
+ Utility.hex(oldChar) + ", " + Utility.hex(oldPrimary)
+ Utility.hex(i) + ", " + " > " + Utility.hex(newPrimary));
+ Utility.hex(oldChar) + " (" + Utility.hex(oldPrimary) + ")"
+ " > " + Utility.hex(i) + "(" + Utility.hex(newPrimary) + ")");
}