package com.ibm.text.UCA; import com.ibm.text.UCD.UCD_Types; import com.ibm.text.utility.Utility; /** * For generation of Implicit CEs * @author Davis * * Cleaned up so that changes can be made more easily. * Old values: # First Implicit: E26A792D # Last Implicit: E3DC70C0 # First CJK: E0030300 # Last CJK: E0A9DD00 # First CJK_A: E0A9DF00 # Last CJK_A: E0DE3100 */ public class Implicit implements UCD_Types { /** * constants */ static final boolean DEBUG = false; static final long topByte = 0xFF000000L; static final long bottomByte = 0xFFL; static final long fourBytes = 0xFFFFFFFFL; static final int MAX_INPUT = 0x220001; // 2 * Unicode range + 2 /** * Testing function * @param args ignored */ public static void main(String[] args) { System.out.println("Start"); try { Implicit foo = new Implicit(0xE0, 0xE4); //int x = foo.getRawImplicit(0xF810); foo.getRawFromImplicit(0xE20303E7); int gap4 = foo.getGap4(); System.out.println("Gap4: " + gap4); int gap3 = foo.getGap3(); int minTrail = foo.getMinTrail(); int maxTrail = foo.getMaxTrail(); long last = 0; long current; for (int i = 0; i <= MAX_INPUT; ++i) { current = foo.getImplicitFromRaw(i) & fourBytes; // check that it round-trips AND that all intervening ones are illegal int roundtrip = foo.getRawFromImplicit((int)current); if (roundtrip != i) { foo.throwError("No roundtrip", i); } if (last != 0) { for (long j = last + 1; j < current; ++j) { roundtrip = foo.getRawFromImplicit((int)j); // raise an error if it *doesn't* find an error if (roundtrip != -1) { foo.throwError("Fails to recognize illegal", j); } } } // now do other consistency checks long lastBottom = last & bottomByte; long currentBottom = current & bottomByte; long lastTop = last & topByte; long currentTop = current & topByte; // do some consistency checks /* long gap = current - last; if (currentBottom != 0) { // if we are a 4-byte // gap has to be at least gap4 // and gap from minTrail, maxTrail has to be at least gap4 if (gap <= gap4) foo.throwError("Failed gap4 between", i); if (currentBottom < minTrail + gap4) foo.throwError("Failed gap4 before", i); if (currentBottom > maxTrail - gap4) foo.throwError("Failed gap4 after", i); } else { // we are a three-byte gap = gap >> 8; // move gap down for comparison. long current3Bottom = (current >> 8) & bottomByte; if (gap <= gap3) foo.throwError("Failed gap3 between ", i); if (current3Bottom < minTrail + gap3) foo.throwError("Failed gap3 before", i); if (current3Bottom > maxTrail - gap3) foo.throwError("Failed gap3 after", i); } */ // print out some values for spot-checking if (lastTop != currentTop || i == 0x10000 || i == 0x110000) { foo.show(i-3); foo.show(i-2); foo.show(i-1); if (i == 0) { // do nothing } else if (lastBottom == 0 && currentBottom != 0) { System.out.println("+ primary boundary, 4-byte CE's below"); } else if (lastTop != currentTop) { System.out.println("+ primary boundary"); } foo.show(i); foo.show(i+1); foo.show(i+2); System.out.println("..."); } last = current; } foo.show(MAX_INPUT-2); foo.show(MAX_INPUT-1); foo.show(MAX_INPUT); } catch (Exception e) { e.printStackTrace(); } finally { System.out.println("End"); } } private void throwError(String title, int cp) { throw new IllegalArgumentException(title + "\t" + Utility.hex(cp) + "\t" + Utility.hex(getImplicitFromRaw(cp) & fourBytes)); } private void throwError(String title, long ce) { throw new IllegalArgumentException(title + "\t" + Utility.hex(ce & fourBytes)); } private void show(int i) { if (i >= 0 && i <= MAX_INPUT) { System.out.println(Utility.hex(i) + "\t" + Utility.hex(getImplicitFromRaw(i) & fourBytes)); } } /** * Precomputed by constructor */ int final3Multiplier; int final4Multiplier; int final3Count; int final4Count; int medialCount; int min3Primary; int min4Primary; int max4Primary; int minTrail; int maxTrail; int max3Trail; int max4Trail; int min4Boundary; public int getGap4() { return final4Multiplier - 1; } public int getGap3() { return final3Multiplier - 1; } // old comment // we must skip all 00, 01, 02, FF bytes, so most bytes have 252 values // we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case) // we shift so that HAN all has the same first primary, for compression. // for the 4 byte case, we make the gap as large as we can fit. /** * Supply parameters for generating implicit CEs */ public Implicit(int minPrimary, int maxPrimary) { // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms. this(minPrimary, maxPrimary, 0x03, 0xFE, 1, 1); } /** * Set up to generate implicits. * @param minPrimary * @param maxPrimary * @param minTrail final byte * @param maxTrail final byte * @param gap3 the gap we leave for tailoring for 3-byte forms * @param primaries3count number of 3-byte primarys we can use (normally 1) */ public Implicit(int minPrimary, int maxPrimary, int minTrail, int maxTrail, int gap3, int primaries3count) { // some simple parameter checks if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) throw new IllegalArgumentException("bad lead bytes"); if (minTrail < 0 || minTrail >= maxTrail || maxTrail > 0xFF) throw new IllegalArgumentException("bad trail bytes"); if (primaries3count < 1) throw new IllegalArgumentException("bad three-byte primaries"); this.minTrail = minTrail; this.maxTrail = maxTrail; min3Primary = minPrimary; max4Primary = maxPrimary; // compute constants for use later. // number of values we can use in trailing bytes // leave room for empty values between AND above, e.g. if gap = 2 // range 3..7 => +3 -4 -5 -6 -7: so 1 value // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values final3Multiplier = gap3 + 1; final3Count = (maxTrail - minTrail + 1) / final3Multiplier; max3Trail = minTrail + (final3Count - 1) * final3Multiplier; // medials can use full range medialCount = (maxTrail - minTrail + 1); // find out how many values fit in each form int threeByteCount = medialCount * final3Count; // now determine where the 3/4 boundary is. // we use 3 bytes below the boundary, and 4 above int primariesAvailable = maxPrimary - minPrimary + 1; int primaries4count = primariesAvailable - primaries3count; int min3ByteCoverage = primaries3count * threeByteCount; min4Primary = minPrimary + primaries3count; min4Boundary = min3ByteCoverage; // Now expand out the multiplier for the 4 bytes, and redo. int totalNeeded = MAX_INPUT - min4Boundary; int neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count); if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte); int neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount); if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte); int gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; if (DEBUG) System.out.println("expandedGap: " + gap4); if (gap4 < 1) throw new IllegalArgumentException("must have larger gap4s"); final4Multiplier = gap4 + 1; final4Count = neededPerFinalByte; max4Trail = minTrail + (final4Count - 1) * final4Multiplier; if (primaries4count * medialCount * medialCount * final4Count < MAX_INPUT) { throw new IllegalArgumentException("internal error"); } if (DEBUG) { System.out.println("final4Count: " + final4Count); for (int counter = 0; counter < final4Count; ++counter) { int value = minTrail + (1 + counter)*final4Multiplier; System.out.println(counter + "\t" + value + "\t" + Utility.hex(value)); } } } static public int divideAndRoundUp(int a, int b) { return 1 + (a-1)/b; } /** * Converts implicit CE into raw integer * @param implicit * @return -1 if illegal format */ public int getRawFromImplicit(int implicit) { int result; int b3 = implicit & 0xFF; implicit >>= 8; int b2 = implicit & 0xFF; implicit >>= 8; int b1 = implicit & 0xFF; implicit >>= 8; int b0 = implicit & 0xFF; // simple parameter checks if (b0 < min3Primary || b0 > max4Primary || b1 < minTrail || b1 > maxTrail) return -1; // normal offsets b1 -= minTrail; // take care of the final values, and compose if (b0 < min4Primary) { if (b2 < minTrail || b2 > max3Trail || b3 != 0) return -1; b2 -= minTrail; int remainder = b2 % final3Multiplier; if (remainder != 0) return -1; b0 -= min3Primary; b2 /= final3Multiplier; result = ((b0 * medialCount) + b1) * final3Count + b2; } else { if (b2 < minTrail || b2 > maxTrail || b3 < minTrail || b3 > max4Trail) return -1; b2 -= minTrail; b3 -= minTrail; int remainder = b3 % final4Multiplier; if (remainder != 0) return -1; b3 /= final4Multiplier; b0 -= min4Primary; result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary; } // final check if (result < 0 || result > MAX_INPUT) return -1; return result; } /** * Generate the implicit CE, from raw integer. * Left shifted to put the first byte at the top of an int. * @param cp code point * @return */ public int getImplicitFromRaw(int cp) { if (cp < 0 || cp > MAX_INPUT) { throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp)); } int last0 = cp - min4Boundary; if (last0 < 0) { int last1 = cp / final3Count; last0 = cp % final3Count; int last2 = last1 / medialCount; last1 %= medialCount; last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start last1 = minTrail + last1; // offset last2 = min3Primary + last2; // offset if (last2 >= min4Primary) { throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2)); } return (last2 << 24) + (last1 << 16) + (last0 << 8); } else { int last1 = last0 / final4Count; last0 %= final4Count; int last2 = last1 / medialCount; last1 %= medialCount; int last3 = last2 / medialCount; last2 %= medialCount; last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start last1 = minTrail + last1; // offset last2 = minTrail + last2; // offset last3 = min4Primary + last3; // offset if (last3 > max4Primary) { throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3)); } return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0; } } /** * Gets an Implicit from a code point. Internally, * swaps (which produces a raw value 0..220000, * then converts raw to implicit. * @param cp * @return */ public int getSwappedImplicit(int cp) { if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp)); // Produce Raw value // note, we add 1 so that the first value is always empty!! cp = Implicit.swapCJK(cp) + 1; // we now have a range of numbers from 0 to 220000. if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); return getImplicitFromRaw(cp); } /** * Function used to: * a) collapse the 2 different Han ranges from UCA into one (in the right order), and * b) bump any non-CJK characters by 10FFFF. * The relevant blocks are: * A: 4E00..9FFF; CJK Unified Ideographs * F900..FAFF; CJK Compatibility Ideographs * B: 3400..4DBF; CJK Unified Ideographs Extension A * 20000..XX; CJK Unified Ideographs Extension B (and others later on) * As long as * no new B characters are allocated between 4E00 and FAFF, and * no new A characters are outside of this range, * (very high probability) this simple code will work. * The reordered blocks are: * Block1 is CJK * Block2 is CJK_COMPAT_USED * Block3 is CJK_A * (all contiguous) * Any other CJK gets its normal code point * Any non-CJK gets +10FFFF * When we reorder Block1, we make sure that it is at the very start, * so that it will use a 3-byte form. * Warning: the we only pick up the compatibility characters that are * NOT decomposed, so that block is smaller! */ static int NON_CJK_OFFSET = 0x110000; static int swapCJK(int i) { if (i >= CJK_BASE) { if (i < CJK_LIMIT) return i - CJK_BASE; if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET; if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE + (CJK_LIMIT - CJK_BASE); if (i < CJK_B_BASE) return i + NON_CJK_OFFSET; if (i < CJK_B_LIMIT) return i; // non-BMP-CJK return i + NON_CJK_OFFSET; // non-CJK } if (i < CJK_A_BASE) return i + NON_CJK_OFFSET; if (i < CJK_A_LIMIT) return i - CJK_A_BASE + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); return i + NON_CJK_OFFSET; // non-CJK } /** * @return */ public int getMinTrail() { return minTrail; } /** * @return */ public int getMaxTrail() { return maxTrail; } }