From 4ae38aed3bd6cba87f6a8c39d461c3e50dadea51 Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Tue, 4 Jun 2002 23:56:29 +0000 Subject: [PATCH] Use modified cascade, added more tests of implicit weights X-SVN-Rev: 8787 --- .../com/ibm/text/UCA/WriteCollationData.java | 177 ++++++++++++------ 1 file changed, 122 insertions(+), 55 deletions(-) diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java index 1e950225e8..98d909851c 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java +++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ -* $Date: 2002/06/04 19:01:51 $ -* $Revision: 1.16 $ +* $Date: 2002/06/04 23:56:29 $ +* $Revision: 1.17 $ * ******************************************************************************* */ @@ -368,7 +368,7 @@ public class WriteCollationData implements UCD_Types { if (!shortPrint) { log.print(Utility.hex(source)); log.print( - ";\t#" + ucd.getName(clipped)+ "\t" + UCA.toString(key)); + ";\t#" + ucd.getName(clipped) + "\t" + UCA.toString(key)); } else { log.print(source + "\t" + Utility.hex(clipped)); } @@ -2278,7 +2278,7 @@ F900..FAFF; CJK Compatibility Ideographs summary.println("[variable top = " + Utility.hex(primaryDelta[firstPrimary]) + "] # END OF VARIABLE SECTION!!!"); summary.println(); } - summary.println("First: " + Utility.hex(np & 0xFFFFFFFFL) + " " + ucd.getName(UTF16.charAt(chr,0))); + summary.println("First: " + Utility.hex(np & 0xFFFFFFFFL) + ", " + ucd.getCodeAndName(UTF16.charAt(chr,0))); } lastNp = np; isFirst = false; @@ -2296,7 +2296,7 @@ F900..FAFF; CJK Compatibility Ideographs log.println(); lastChr = chr; } - summary.println("Last: " + Utility.hex(lastNp) + " " + ucd.getName(UTF16.charAt(lastChr, 0))); + summary.println("Last: " 
+ Utility.hex(lastNp) + ", " + ucd.getCodeAndName(UTF16.charAt(lastChr, 0))); /* String sample = "\u3400\u3401\u4DB4\u4DB5\u4E00\u4E01\u9FA4\u9FA5\uAC00\uAC01\uD7A2\uD7A3"; @@ -2312,7 +2312,7 @@ F900..FAFF; CJK Compatibility Ideographs summary.println("# First CJK: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x4E00))); summary.println("# Last CJK: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0xFA2F))); summary.println("# First CJK_A: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x3400))); - summary.println("# Last CJK: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x4DBF))); + summary.println("# Last CJK_A: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x4DBF))); boolean lastOne = false; for (int i = 0; i < 0x10FFFF; ++i) { @@ -2405,37 +2405,59 @@ F900..FAFF; CJK Compatibility Ideographs } */ - /** - * Function used to collapse the two different Han blocks from UCA into one. - * It does this by reversing the order of the two groups A and B in the BMP below. - * NOTE: MUST NOT BE CALLED EXCEPT FOR CJK CHARACTERS - * A: - * 4E00..9FFF; CJK Unified Ideographs - * F900..FAFF; CJK Compatibility Ideographs - * B: - * 3400..4DBF; CJK Unified Ideographs Extension A - * As long as - * no new B characters are allocated between 4E00 and FAFF, and - * no new A characters are outside of this range, - * (very high probability) this simple code will work. - * Block1 is CJK - * Block2 is CJK_COMPAT_USED - * Block3 is CJK_A - */ - static int swapCJK(int i) { - if (i >= UCA.CJK_COMPAT_USED_BASE) return i - UCA.CJK_COMPAT_USED_BASE + (UCA.CJK_LIMIT - UCA.CJK_BASE); - if (i >= UCA.CJK_BASE) return i - UCA.CJK_BASE; - // remainder must be CJK_A - return i - UCA.CJK_A_BASE + (UCA.CJK_LIMIT - UCA.CJK_BASE) + (UCA.CJK_COMPAT_USED_LIMIT - UCA.CJK_COMPAT_USED_BASE); - } +/** + * Function used to: + * a) collapse the 2 different Han ranges from UCA into one (in the right order), and + * b) bump any non-CJK characters by 110000. 
+ * The relevant blocks are: + * A: 4E00..9FFF; CJK Unified Ideographs + * F900..FAFF; CJK Compatibility Ideographs + * B: 3400..4DBF; CJK Unified Ideographs Extension A + * 20000..XX; CJK Unified Ideographs Extension B (and others later on) + * As long as + * no new B characters are allocated between 4E00 and FAFF, and + * no new A characters are outside of this range, + * (very high probability) this simple code will work. + * The reordered blocks are: + * Block1 is CJK + * Block2 is CJK_COMPAT_USED + * Block3 is CJK_A + * Any other CJK gets its normal code point + * Any non-CJK gets +110000 + * When we reorder Block1, we make sure that it is at the very start, + * so that it will use a 3-byte form. + */ +static int swapCJK(int i) { + + if (i >= UCA.CJK_BASE) { + if (i < UCA.CJK_LIMIT) return i - UCA.CJK_BASE; + + if (i < UCA.CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET; + + if (i < UCA.CJK_COMPAT_USED_LIMIT) return i - UCA.CJK_COMPAT_USED_BASE + + (UCA.CJK_LIMIT - UCA.CJK_BASE); + if (i < UCA.CJK_B_BASE) return i + NON_CJK_OFFSET; + + if (i < UCA.CJK_B_LIMIT) return i; // non-BMP-CJK + + return i + NON_CJK_OFFSET; // non-CJK + } + if (i < UCA.CJK_A_BASE) return i + NON_CJK_OFFSET; + + if (i < UCA.CJK_A_LIMIT) return i - UCA.CJK_A_BASE + + (UCA.CJK_LIMIT - UCA.CJK_BASE) + + (UCA.CJK_COMPAT_USED_LIMIT - UCA.CJK_COMPAT_USED_BASE); + return i + NON_CJK_OFFSET; // non-CJK +} // CONSTANTS - static final int + static final int + NON_CJK_OFFSET = 0x110000, BYTES_TO_AVOID = 3, OTHER_COUNT = 256 - BYTES_TO_AVOID, LAST_COUNT = OTHER_COUNT / 2, - LAST_COUNT2 = OTHER_COUNT / 20, // room for intervening, without expanding to 5 bytes + LAST_COUNT2 = OTHER_COUNT / 21, // room for intervening, without expanding to 5 bytes IMPLICIT_3BYTE_COUNT = 1, IMPLICIT_BASE_BYTE = 0xE0, @@ -2452,6 +2474,20 @@ F900..FAFF; CJK Compatibility Ideographs // Return value is left justified primary key static int getImplicitPrimary(int cp) { + + if (DEBUG) System.out.println("Incoming: " + 
Utility.hex(cp)); + + cp = swapCJK(cp); + + if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); + + // we now have a range of numbers from 0 to 21FFFF. + + return getImplicitPrimaryFromSwapped(cp); + } + + static int getImplicitPrimaryFromSwapped(int cp) { + // we must skip all 00, 01, 02 bytes, so most bytes have 253 values // we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case) // we shift so that HAN all has the same first primary, for compression. @@ -2459,15 +2495,6 @@ F900..FAFF; CJK Compatibility Ideographs // Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1) // Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14) - if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); - - if (!UCA.isCJK(cp) && !UCA.isCJK_AB(cp)) cp += 0x10FFFF; // space everything else after CJK - else cp = swapCJK(cp); - - if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); - - // we now have a range of numbers from 0 to 21FFFF. 
- int last0 = cp - IMPLICIT_4BYTE_BOUNDARY; if (last0 < 0) { int last1 = cp / LAST_COUNT; @@ -2506,22 +2533,45 @@ F900..FAFF; CJK Compatibility Ideographs static void showImplicit(String title, int cp) { - if (DEBUG) { - System.out.println(title + "-1: " + Utility.hex(cp-1) + " => " - + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(cp-1))); - System.out.println(title + ": " + Utility.hex(cp) + " => " - + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(cp))); - System.out.println(title + "+1: " + Utility.hex(cp+1) + " => " - + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(cp+1))); - } + if (DEBUG) showImplicit2(title + "-1", cp-1); + + showImplicit2(title + "00", cp); + + if (DEBUG) showImplicit2(title + "+1", cp+1); + } + + static void showImplicit2(String title, int cp) { + System.out.println(title + ":\t" + Utility.hex(cp) + + " => " + Utility.hex(swapCJK(cp)) + + " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(cp))); + } + + static void showImplicit3(String title, int cp) { + System.out.println("*" + title + ":\t" + Utility.hex(cp) + + " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimaryFromSwapped(cp))); } // TEST PROGRAM static void checkImplicit() { - long oldPrimary = 0; System.out.println("Starting Implicit Check"); + long oldPrimary = 0; + int oldChar = -1; + int oldSwap = -1; + + // test monotonically increasing + + for (int i = 0; i < 0x21FFFF; ++i) { + long newPrimary = 0xFFFFFFFFL & getImplicitPrimaryFromSwapped(i); + if (newPrimary < oldPrimary) { + throw new IllegalArgumentException(Utility.hex(i) + ": overlap: " + + Utility.hex(oldChar) + " (" + Utility.hex(oldPrimary) + ")" + + " > " + Utility.hex(i) + "(" + Utility.hex(newPrimary) + ")"); + } + oldPrimary = newPrimary; + } + showImplicit("# First CJK", UCA.CJK_BASE); showImplicit("# Last CJK", UCA.CJK_LIMIT-1); showImplicit("# First CJK-compat", UCA.CJK_COMPAT_USED_BASE); @@ -2532,30 +2582,47 @@ F900..FAFF; CJK Compatibility Ideographs showImplicit("# Last CJK_B", UCA.CJK_B_LIMIT-1); 
showImplicit("# First Other Implicit", 0); showImplicit("# Last Other Implicit", 0x10FFFF); - showImplicit("# Boundary", IMPLICIT_4BYTE_BOUNDARY); + showImplicit3("# FIRST", 0); + showImplicit3("# Boundary-1", IMPLICIT_4BYTE_BOUNDARY-1); + showImplicit3("# Boundary00", IMPLICIT_4BYTE_BOUNDARY); + showImplicit3("# Boundary+1", IMPLICIT_4BYTE_BOUNDARY+1); + showImplicit3("# LAST", 0x21FFFF); + + oldPrimary = 0; + oldChar = -1; - int oldChar = -1; for (int batch = 0; batch < 3; ++batch) { for (int i = 0; i <= 0x10FFFF; ++i) { // separate the three groups - if (UCA.isCJK(i)) { + if (UCA.isCJK(i) || UCA.CJK_COMPAT_USED_BASE <= i && i < UCA.CJK_COMPAT_USED_LIMIT) { if (batch != 0) continue; } else if (UCA.isCJK_AB(i)) { if (batch != 1) continue; } else if (batch != 2) continue; + + // test swapping + + int currSwap = swapCJK(i); + if (currSwap < oldSwap) { + throw new IllegalArgumentException(Utility.hex(i) + ": overlap: " + + Utility.hex(oldChar) + " (" + Utility.hex(oldSwap) + ")" + + " > " + Utility.hex(i) + "(" + Utility.hex(currSwap) + ")"); + } + + long newPrimary = 0xFFFFFFFFL & getImplicitPrimary(i); // test correct values - if ((newPrimary) < (oldPrimary)) { + if (newPrimary < oldPrimary) { throw new IllegalArgumentException(Utility.hex(i) + ": overlap: " - + Utility.hex(oldChar) + ", " + Utility.hex(oldPrimary) - + Utility.hex(i) + ", " + " > " + Utility.hex(newPrimary)); + + Utility.hex(oldChar) + " (" + Utility.hex(oldPrimary) + ")" + + " > " + Utility.hex(i) + "(" + Utility.hex(newPrimary) + ")"); }