5a36195666
X-SVN-Rev: 14341
430 lines
16 KiB
Java
430 lines
16 KiB
Java
package com.ibm.text.UCA;
|
|
|
|
import com.ibm.text.UCD.UCD_Types;
|
|
import com.ibm.text.utility.Utility;
|
|
|
|
/**
|
|
* For generation of Implicit CEs
|
|
* @author Davis
|
|
*
|
|
* Cleaned up so that changes can be made more easily.
|
|
* Old values:
|
|
# First Implicit: E26A792D
|
|
# Last Implicit: E3DC70C0
|
|
# First CJK: E0030300
|
|
# Last CJK: E0A9DD00
|
|
# First CJK_A: E0A9DF00
|
|
# Last CJK_A: E0DE3100
|
|
|
|
*/
|
|
public class Implicit implements UCD_Types {
|
|
|
|
/**
|
|
* constants
|
|
*/
|
|
static final boolean DEBUG = false;
|
|
|
|
static final long topByte = 0xFF000000L;
|
|
static final long bottomByte = 0xFFL;
|
|
static final long fourBytes = 0xFFFFFFFFL;
|
|
|
|
static final int MAX_INPUT = 0x220001; // 2 * Unicode range + 2
|
|
|
|
/**
|
|
* Testing function
|
|
* @param args ignored
|
|
*/
|
|
public static void main(String[] args) {
|
|
System.out.println("Start");
|
|
try {
|
|
Implicit foo = new Implicit(0xE0, 0xE4);
|
|
|
|
//int x = foo.getRawImplicit(0xF810);
|
|
foo.getRawFromImplicit(0xE20303E7);
|
|
|
|
int gap4 = foo.getGap4();
|
|
System.out.println("Gap4: " + gap4);
|
|
int gap3 = foo.getGap3();
|
|
int minTrail = foo.getMinTrail();
|
|
int maxTrail = foo.getMaxTrail();
|
|
long last = 0;
|
|
long current;
|
|
for (int i = 0; i <= MAX_INPUT; ++i) {
|
|
current = foo.getImplicitFromRaw(i) & fourBytes;
|
|
|
|
// check that it round-trips AND that all intervening ones are illegal
|
|
int roundtrip = foo.getRawFromImplicit((int)current);
|
|
if (roundtrip != i) {
|
|
foo.throwError("No roundtrip", i);
|
|
}
|
|
if (last != 0) {
|
|
for (long j = last + 1; j < current; ++j) {
|
|
roundtrip = foo.getRawFromImplicit((int)j);
|
|
// raise an error if it *doesn't* find an error
|
|
if (roundtrip != -1) {
|
|
foo.throwError("Fails to recognize illegal", j);
|
|
}
|
|
}
|
|
}
|
|
// now do other consistency checks
|
|
long lastBottom = last & bottomByte;
|
|
long currentBottom = current & bottomByte;
|
|
long lastTop = last & topByte;
|
|
long currentTop = current & topByte;
|
|
|
|
// do some consistency checks
|
|
/*
|
|
long gap = current - last;
|
|
if (currentBottom != 0) { // if we are a 4-byte
|
|
// gap has to be at least gap4
|
|
// and gap from minTrail, maxTrail has to be at least gap4
|
|
if (gap <= gap4) foo.throwError("Failed gap4 between", i);
|
|
if (currentBottom < minTrail + gap4) foo.throwError("Failed gap4 before", i);
|
|
if (currentBottom > maxTrail - gap4) foo.throwError("Failed gap4 after", i);
|
|
} else { // we are a three-byte
|
|
gap = gap >> 8; // move gap down for comparison.
|
|
long current3Bottom = (current >> 8) & bottomByte;
|
|
if (gap <= gap3) foo.throwError("Failed gap3 between ", i);
|
|
if (current3Bottom < minTrail + gap3) foo.throwError("Failed gap3 before", i);
|
|
if (current3Bottom > maxTrail - gap3) foo.throwError("Failed gap3 after", i);
|
|
}
|
|
*/
|
|
// print out some values for spot-checking
|
|
if (lastTop != currentTop || i == 0x10000 || i == 0x110000) {
|
|
foo.show(i-3);
|
|
foo.show(i-2);
|
|
foo.show(i-1);
|
|
if (i == 0) {
|
|
// do nothing
|
|
} else if (lastBottom == 0 && currentBottom != 0) {
|
|
System.out.println("+ primary boundary, 4-byte CE's below");
|
|
} else if (lastTop != currentTop) {
|
|
System.out.println("+ primary boundary");
|
|
}
|
|
foo.show(i);
|
|
foo.show(i+1);
|
|
foo.show(i+2);
|
|
System.out.println("...");
|
|
}
|
|
last = current;
|
|
}
|
|
foo.show(MAX_INPUT-2);
|
|
foo.show(MAX_INPUT-1);
|
|
foo.show(MAX_INPUT);
|
|
} catch (Exception e) {
|
|
e.printStackTrace();
|
|
} finally {
|
|
System.out.println("End");
|
|
}
|
|
}
|
|
|
|
private void throwError(String title, int cp) {
|
|
throw new IllegalArgumentException(title + "\t" + Utility.hex(cp) + "\t" + Utility.hex(getImplicitFromRaw(cp) & fourBytes));
|
|
}
|
|
|
|
private void throwError(String title, long ce) {
|
|
throw new IllegalArgumentException(title + "\t" + Utility.hex(ce & fourBytes));
|
|
}
|
|
|
|
private void show(int i) {
|
|
if (i >= 0 && i <= MAX_INPUT) {
|
|
System.out.println(Utility.hex(i) + "\t" + Utility.hex(getImplicitFromRaw(i) & fourBytes));
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Precomputed by constructor
|
|
*/
|
|
int final3Multiplier;
|
|
int final4Multiplier;
|
|
int final3Count;
|
|
int final4Count;
|
|
int medialCount;
|
|
int min3Primary;
|
|
int min4Primary;
|
|
int max4Primary;
|
|
int minTrail;
|
|
int maxTrail;
|
|
int max3Trail;
|
|
int max4Trail;
|
|
int min4Boundary;
|
|
|
|
public int getGap4() {
|
|
return final4Multiplier - 1;
|
|
}
|
|
|
|
public int getGap3() {
|
|
return final3Multiplier - 1;
|
|
}
|
|
|
|
// old comment
|
|
// we must skip all 00, 01, 02, FF bytes, so most bytes have 252 values
|
|
// we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
|
|
// we shift so that HAN all has the same first primary, for compression.
|
|
// for the 4 byte case, we make the gap as large as we can fit.
|
|
|
|
/**
|
|
* Supply parameters for generating implicit CEs
|
|
*/
|
|
public Implicit(int minPrimary, int maxPrimary) {
|
|
// 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
|
|
this(minPrimary, maxPrimary, 0x03, 0xFE, 1, 1);
|
|
}
|
|
|
|
/**
|
|
* Set up to generate implicits.
|
|
* @param minPrimary
|
|
* @param maxPrimary
|
|
* @param minTrail final byte
|
|
* @param maxTrail final byte
|
|
* @param gap3 the gap we leave for tailoring for 3-byte forms
|
|
* @param primaries3count number of 3-byte primarys we can use (normally 1)
|
|
*/
|
|
public Implicit(int minPrimary, int maxPrimary, int minTrail, int maxTrail, int gap3, int primaries3count) {
|
|
// some simple parameter checks
|
|
if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) throw new IllegalArgumentException("bad lead bytes");
|
|
if (minTrail < 0 || minTrail >= maxTrail || maxTrail > 0xFF) throw new IllegalArgumentException("bad trail bytes");
|
|
if (primaries3count < 1) throw new IllegalArgumentException("bad three-byte primaries");
|
|
|
|
this.minTrail = minTrail;
|
|
this.maxTrail = maxTrail;
|
|
|
|
min3Primary = minPrimary;
|
|
max4Primary = maxPrimary;
|
|
// compute constants for use later.
|
|
// number of values we can use in trailing bytes
|
|
// leave room for empty values between AND above, e.g. if gap = 2
|
|
// range 3..7 => +3 -4 -5 -6 -7: so 1 value
|
|
// range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
|
|
// range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
|
|
final3Multiplier = gap3 + 1;
|
|
final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
|
|
max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
|
|
|
|
// medials can use full range
|
|
medialCount = (maxTrail - minTrail + 1);
|
|
// find out how many values fit in each form
|
|
int threeByteCount = medialCount * final3Count;
|
|
// now determine where the 3/4 boundary is.
|
|
// we use 3 bytes below the boundary, and 4 above
|
|
int primariesAvailable = maxPrimary - minPrimary + 1;
|
|
int primaries4count = primariesAvailable - primaries3count;
|
|
|
|
int min3ByteCoverage = primaries3count * threeByteCount;
|
|
min4Primary = minPrimary + primaries3count;
|
|
min4Boundary = min3ByteCoverage;
|
|
// Now expand out the multiplier for the 4 bytes, and redo.
|
|
|
|
int totalNeeded = MAX_INPUT - min4Boundary;
|
|
int neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
|
|
if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
|
|
|
|
int neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
|
|
if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte);
|
|
|
|
int gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
|
|
if (DEBUG) System.out.println("expandedGap: " + gap4);
|
|
if (gap4 < 1) throw new IllegalArgumentException("must have larger gap4s");
|
|
|
|
final4Multiplier = gap4 + 1;
|
|
final4Count = neededPerFinalByte;
|
|
max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
|
|
|
|
if (primaries4count * medialCount * medialCount * final4Count < MAX_INPUT) {
|
|
throw new IllegalArgumentException("internal error");
|
|
}
|
|
if (DEBUG) {
|
|
System.out.println("final4Count: " + final4Count);
|
|
for (int counter = 0; counter < final4Count; ++counter) {
|
|
int value = minTrail + (1 + counter)*final4Multiplier;
|
|
System.out.println(counter + "\t" + value + "\t" + Utility.hex(value));
|
|
}
|
|
}
|
|
}
|
|
|
|
static public int divideAndRoundUp(int a, int b) {
|
|
return 1 + (a-1)/b;
|
|
}
|
|
/**
|
|
* Converts implicit CE into raw integer
|
|
* @param implicit
|
|
* @return -1 if illegal format
|
|
*/
|
|
public int getRawFromImplicit(int implicit) {
|
|
int result;
|
|
int b3 = implicit & 0xFF;
|
|
implicit >>= 8;
|
|
int b2 = implicit & 0xFF;
|
|
implicit >>= 8;
|
|
int b1 = implicit & 0xFF;
|
|
implicit >>= 8;
|
|
int b0 = implicit & 0xFF;
|
|
|
|
// simple parameter checks
|
|
if (b0 < min3Primary || b0 > max4Primary
|
|
|| b1 < minTrail || b1 > maxTrail) return -1;
|
|
// normal offsets
|
|
b1 -= minTrail;
|
|
|
|
// take care of the final values, and compose
|
|
if (b0 < min4Primary) {
|
|
if (b2 < minTrail || b2 > max3Trail || b3 != 0) return -1;
|
|
b2 -= minTrail;
|
|
int remainder = b2 % final3Multiplier;
|
|
if (remainder != 0) return -1;
|
|
b0 -= min3Primary;
|
|
b2 /= final3Multiplier;
|
|
result = ((b0 * medialCount) + b1) * final3Count + b2;
|
|
} else {
|
|
if (b2 < minTrail || b2 > maxTrail
|
|
|| b3 < minTrail || b3 > max4Trail) return -1;
|
|
b2 -= minTrail;
|
|
b3 -= minTrail;
|
|
int remainder = b3 % final4Multiplier;
|
|
if (remainder != 0) return -1;
|
|
b3 /= final4Multiplier;
|
|
b0 -= min4Primary;
|
|
result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
|
|
}
|
|
// final check
|
|
if (result < 0 || result > MAX_INPUT) return -1;
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
* Generate the implicit CE, from raw integer.
|
|
* Left shifted to put the first byte at the top of an int.
|
|
* @param cp code point
|
|
* @return
|
|
*/
|
|
public int getImplicitFromRaw(int cp) {
|
|
if (cp < 0 || cp > MAX_INPUT) {
|
|
throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
|
|
}
|
|
int last0 = cp - min4Boundary;
|
|
if (last0 < 0) {
|
|
int last1 = cp / final3Count;
|
|
last0 = cp % final3Count;
|
|
|
|
int last2 = last1 / medialCount;
|
|
last1 %= medialCount;
|
|
|
|
last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
|
|
last1 = minTrail + last1; // offset
|
|
last2 = min3Primary + last2; // offset
|
|
|
|
if (last2 >= min4Primary) {
|
|
throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
|
|
}
|
|
|
|
return (last2 << 24) + (last1 << 16) + (last0 << 8);
|
|
} else {
|
|
int last1 = last0 / final4Count;
|
|
last0 %= final4Count;
|
|
|
|
int last2 = last1 / medialCount;
|
|
last1 %= medialCount;
|
|
|
|
int last3 = last2 / medialCount;
|
|
last2 %= medialCount;
|
|
|
|
last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
|
|
last1 = minTrail + last1; // offset
|
|
last2 = minTrail + last2; // offset
|
|
last3 = min4Primary + last3; // offset
|
|
|
|
if (last3 > max4Primary) {
|
|
throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
|
|
}
|
|
|
|
return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
|
|
}
|
|
}
|
|
/**
|
|
* Gets an Implicit from a code point. Internally,
|
|
* swaps (which produces a raw value 0..220000,
|
|
* then converts raw to implicit.
|
|
* @param cp
|
|
* @return
|
|
*/
|
|
public int getSwappedImplicit(int cp) {
|
|
if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
|
|
|
|
// Produce Raw value
|
|
// note, we add 1 so that the first value is always empty!!
|
|
cp = Implicit.swapCJK(cp) + 1;
|
|
// we now have a range of numbers from 0 to 220000.
|
|
|
|
if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
|
|
|
|
return getImplicitFromRaw(cp);
|
|
}
|
|
|
|
|
|
/**
|
|
* Function used to:
|
|
* a) collapse the 2 different Han ranges from UCA into one (in the right order), and
|
|
* b) bump any non-CJK characters by 10FFFF.
|
|
* The relevant blocks are:
|
|
* A: 4E00..9FFF; CJK Unified Ideographs
|
|
* F900..FAFF; CJK Compatibility Ideographs
|
|
* B: 3400..4DBF; CJK Unified Ideographs Extension A
|
|
* 20000..XX; CJK Unified Ideographs Extension B (and others later on)
|
|
* As long as
|
|
* no new B characters are allocated between 4E00 and FAFF, and
|
|
* no new A characters are outside of this range,
|
|
* (very high probability) this simple code will work.
|
|
* The reordered blocks are:
|
|
* Block1 is CJK
|
|
* Block2 is CJK_COMPAT_USED
|
|
* Block3 is CJK_A
|
|
* (all contiguous)
|
|
* Any other CJK gets its normal code point
|
|
* Any non-CJK gets +10FFFF
|
|
* When we reorder Block1, we make sure that it is at the very start,
|
|
* so that it will use a 3-byte form.
|
|
* Warning: the we only pick up the compatibility characters that are
|
|
* NOT decomposed, so that block is smaller!
|
|
*/
|
|
|
|
static int NON_CJK_OFFSET = 0x110000;
|
|
|
|
static int swapCJK(int i) {
|
|
|
|
if (i >= CJK_BASE) {
|
|
if (i < CJK_LIMIT) return i - CJK_BASE;
|
|
|
|
if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
|
|
|
|
if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE
|
|
+ (CJK_LIMIT - CJK_BASE);
|
|
if (i < CJK_B_BASE) return i + NON_CJK_OFFSET;
|
|
|
|
if (i < CJK_B_LIMIT) return i; // non-BMP-CJK
|
|
|
|
return i + NON_CJK_OFFSET; // non-CJK
|
|
}
|
|
if (i < CJK_A_BASE) return i + NON_CJK_OFFSET;
|
|
|
|
if (i < CJK_A_LIMIT) return i - CJK_A_BASE
|
|
+ (CJK_LIMIT - CJK_BASE)
|
|
+ (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
|
|
return i + NON_CJK_OFFSET; // non-CJK
|
|
}
|
|
|
|
|
|
/**
|
|
* @return
|
|
*/
|
|
public int getMinTrail() {
|
|
return minTrail;
|
|
}
|
|
|
|
/**
|
|
* @return
|
|
*/
|
|
public int getMaxTrail() {
|
|
return maxTrail;
|
|
}
|
|
|
|
} |