fix problem with implicits

X-SVN-Rev: 14310
This commit is contained in:
Mark Davis 2004-01-13 18:32:12 +00:00
parent fb11ca2159
commit 9e3dba41a9
4 changed files with 319 additions and 68 deletions

View File

@ -0,0 +1,278 @@
package com.ibm.text.UCA;
import com.ibm.text.utility.Utility;
/**
 * Generates implicit collation elements (CEs): maps every code point in
 * {@code 0..MAX_INPUT} to a 3-byte or 4-byte primary weight packed into an int.
 * <p>
 * The lead byte of each weight lies between the minimum and maximum primary
 * bytes passed to the constructor, and trailing bytes lie between the minimum
 * and maximum trail bytes. Code points below an internally computed boundary
 * get 3-byte weights; the rest get 4-byte weights. The final byte of each form
 * is spread out so that a gap is left for tailoring.
 * <p>
 * Cleaned up so that changes can be made more easily.
 * Old values:
 * <pre>
 * # First Implicit: E26A792D
 * # Last Implicit:  E3DC70C0
 * # First CJK:      E0030300
 * # Last CJK:       E0A9DD00
 * # First CJK_A:    E0A9DF00
 * # Last CJK_A:     E0DE3100
 * </pre>
 *
 * @author Davis
 */
public class Implicit {

    /*
     * Constants
     */

    /** When true, the constructor prints the intermediate values it computes. */
    static final boolean DEBUG = false;
    /** Mask selecting the most significant byte of a 32-bit weight. */
    static final long topByte = 0xFF000000L;
    /** Mask selecting the least significant byte of a 32-bit weight. */
    static final long bottomByte = 0xFFL;
    /** Mask used to view an int weight as an unsigned 32-bit value in a long. */
    static final long fourBytes = 0xFFFFFFFFL;
    /** Largest input accepted by {@link #getImplicit(int)}. */
    static final int MAX_INPUT = 0x21FFFF;

    /**
     * Testing function: walks every input from 0 to MAX_INPUT, checks that
     * consecutive weights and the trail-byte limits respect the configured
     * gaps, and prints sample values around lead-byte changes for spot-checking.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("Start");
        try {
            Implicit foo = new Implicit(0xE0, 0xE4);

            int gap4 = foo.getGap4();
            int gap3 = foo.getGap3();
            int minTrail = foo.getMinTrail();
            int maxTrail = foo.getMaxTrail();

            long last = 0;
            long current;
            for (int i = 0; i <= MAX_INPUT; ++i) {
                // mask so that weights with the high bit set compare as unsigned
                current = foo.getImplicit(i) & fourBytes;
                long lastBottom = last & bottomByte;
                long currentBottom = current & bottomByte;
                long lastTop = last & topByte;
                long currentTop = current & topByte;

                // do some consistency checks
                long gap = current - last;
                if (currentBottom != 0) { // if we are a 4-byte
                    // gap has to be at least gap4
                    // and gap from minTrail, maxTrail has to be at least gap4
                    if (gap <= gap4) {
                        foo.throwError("Failed gap4 between", i);
                    }
                    if (currentBottom < minTrail + gap4) {
                        foo.throwError("Failed gap4 before", i);
                    }
                    if (currentBottom > maxTrail - gap4) {
                        foo.throwError("Failed gap4 after", i);
                    }
                } else { // we are a three-byte
                    gap = gap >> 8; // move gap down for comparison.
                    long current3Bottom = (current >> 8) & bottomByte;
                    if (gap <= gap3) {
                        foo.throwError("Failed gap3 between ", i);
                    }
                    if (current3Bottom < minTrail + gap3) {
                        foo.throwError("Failed gap3 before", i);
                    }
                    if (current3Bottom > maxTrail - gap3) {
                        foo.throwError("Failed gap3 after", i);
                    }
                }

                // print out some values for spot-checking
                if (lastTop != currentTop || i == 0x10000 || i == 0x110000) {
                    foo.show(i-3);
                    foo.show(i-2);
                    foo.show(i-1);
                    if (i == 0) {
                        // do nothing
                    } else if (lastBottom == 0 && currentBottom != 0) {
                        System.out.println("+ primary boundary, 4-byte CE's below");
                    } else if (lastTop != currentTop) {
                        System.out.println("+ primary boundary");
                    }
                    foo.show(i);
                    foo.show(i+1);
                    foo.show(i+2);
                    System.out.println("...");
                }
                last = current;
            }
            foo.show(MAX_INPUT-2);
            foo.show(MAX_INPUT-1);
            foo.show(MAX_INPUT);
        } catch (Exception e) {
            // test driver only: report the failure and fall through to "End"
            e.printStackTrace();
        } finally {
            System.out.println("End");
        }
    }

    /**
     * Throws an IllegalArgumentException whose message holds the title, the
     * input in hex, and the implicit weight generated for that input in hex.
     */
    private void throwError(String title, int i) {
        throw new IllegalArgumentException(title + "\t" + Utility.hex(i) + "\t" + Utility.hex(getImplicit(i) & fourBytes));
    }

    /**
     * Prints the input and its implicit weight in hex; inputs outside
     * 0..MAX_INPUT are silently skipped.
     */
    private void show(int i) {
        if (i >= 0 && i <= MAX_INPUT) {
            System.out.println(Utility.hex(i) + "\t" + Utility.hex(getImplicit(i) & fourBytes));
        }
    }

    /*
     * Precomputed by constructor
     */

    /** Step between consecutive final-byte values in 3-byte forms (gap3 + 1). */
    int final3Multiplier;
    /** Step between consecutive final-byte values in 4-byte forms (expanded gap + 1). */
    int final4Multiplier;
    /** Number of distinct final-byte values used in 3-byte forms. */
    int final3Count;
    /** Number of distinct final-byte values used in 4-byte forms. */
    int final4Count;
    /** Number of values available to each middle byte (full trail range). */
    int medialCount;
    /** Lowest lead byte used for 3-byte forms. */
    int min3Primary;
    /** Lowest lead byte used for 4-byte forms. */
    int min4Primary;
    /** Highest lead byte used for 4-byte forms. */
    int max4Primary;
    /** Lowest value allowed in a trailing byte. */
    int minTrail;
    /** Highest value allowed in a trailing byte. */
    int maxTrail;
    /** First input that is given a 4-byte form; smaller inputs get 3-byte forms. */
    int min4Boundary;

    /**
     * @return the gap left between consecutive 4-byte forms, i.e. the final-byte
     *         step minus one (the expanded gap computed by the constructor)
     */
    public int getGap4() {
        return final4Multiplier - 1;
    }

    /**
     * @return the gap left between consecutive 3-byte forms, i.e. the final-byte
     *         step minus one
     */
    public int getGap3() {
        return final3Multiplier - 1;
    }

    // Historical comment, kept for reference. It describes the previous
    // hard-coded scheme; the byte values and LAST2_MULTIPLIER named below are
    // not used by this class.
    //   we must skip all 00, 01, 02 bytes, so most bytes have 253 values
    //   we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
    //   we shift so that HAN all has the same first primary, for compression.
    //   for the 4 byte case, we make the gap as large as we can fit.
    //   Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
    //   Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)

    /**
     * Supply parameters for generating implicit CEs, using trail bytes
     * 0x04..0xFE, a 3-byte gap of 1 and a minimum 4-byte gap of 15.
     *
     * @param minPrimary lowest lead byte to use
     * @param maxPrimary highest lead byte to use
     * @throws IllegalArgumentException if the parameters cannot be satisfied
     */
    public Implicit(int minPrimary, int maxPrimary) {
        // 15 is the minimum 4-byte gap requested here; the full constructor
        // widens it to the largest gap that still fits, and rejects the
        // request if even this minimum cannot be met.
        this(minPrimary, maxPrimary, 0x04, 0xFE, 1, 15);
    }

    /**
     * Set up to generate implicits.
     *
     * @param minPrimary lowest lead byte; 3-byte forms start here
     * @param maxPrimary highest lead byte; 4-byte forms end here
     * @param minTrail lowest value for a trailing byte (medial or final)
     * @param maxTrail highest value for a trailing byte (medial or final)
     * @param gap3 the gap we leave for tailoring for 3-byte forms
     * @param gap4 the minimum gap we leave for tailoring for 4-byte forms;
     *        the gap actually used is expanded as far as will fit
     * @throws IllegalArgumentException if a byte range is empty or outside
     *         0..0xFF, if a gap is less than 1, if no lead byte is left for
     *         3-byte forms, or if gap4 cannot be honored
     */
    public Implicit(int minPrimary, int maxPrimary, int minTrail, int maxTrail, int gap3, int gap4) {
        // some simple parameter checks
        if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) {
            throw new IllegalArgumentException("bad lead bytes");
        }
        if (minTrail < 0 || minTrail >= maxTrail || maxTrail > 0xFF) {
            throw new IllegalArgumentException("bad trail bytes");
        }
        if (gap3 < 1 || gap4 < 1) {
            throw new IllegalArgumentException("must have larger gaps");
        }

        this.minTrail = minTrail;
        this.maxTrail = maxTrail;

        final3Multiplier = gap3 + 1;
        final4Multiplier = gap4 + 1; // provisional; replaced by the expanded value below

        min3Primary = minPrimary;
        max4Primary = maxPrimary;

        // compute constants for use later.
        // number of values we can use in trailing bytes
        // leave room for empty values below, between, AND above, so
        // gap = 2:
        // range 3..7 => (3,4) 5 (6,7): so 1 value
        // range 3..8 => (3,4) 5 (6,7,8): so 1 value
        // range 3..9 => (3,4) 5 (6,7,8,9): so 1 value
        // range 3..10 => (3,4) 5 (6,7) 8 (9, 10): so 2 values
        final3Count = 1 + (maxTrail - minTrail - 1) / final3Multiplier;
        final4Count = 1 + (maxTrail - minTrail - 1) / final4Multiplier; // provisional

        // medials can use full range
        medialCount = (maxTrail - minTrail + 1);

        // find out how many values fit in each form
        int fourByteCount = medialCount * medialCount * final4Count;
        int threeByteCount = medialCount * final3Count;

        // now determine where the 3/4 boundary is.
        // we use 3 bytes below the boundary, and 4 above
        int primariesAvailable = maxPrimary - minPrimary + 1;
        int min4BytesNeeded = divideAndRoundUp(MAX_INPUT, fourByteCount);
        int min3BytesNeeded = primariesAvailable - min4BytesNeeded;
        if (min3BytesNeeded < 1) {
            throw new IllegalArgumentException("Too few 3-byte implicits available.");
        }
        int min3ByteCoverage = min3BytesNeeded * threeByteCount;
        min4Primary = minPrimary + min3BytesNeeded;
        min4Boundary = min3ByteCoverage;

        // Now expand out the multiplier for the 4 bytes, and redo.
        int totalNeeded = MAX_INPUT - min4Boundary;
        int neededPerPrimaryByte = divideAndRoundUp(totalNeeded, min4BytesNeeded);
        if (DEBUG) {
            System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
        }
        int neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
        if (DEBUG) {
            System.out.println("neededPerFinalByte: " + neededPerFinalByte);
        }
        int expandedGap = (maxTrail - minTrail - 1) / (neededPerFinalByte + 1) - 1;
        if (DEBUG) {
            System.out.println("expandedGap: " + expandedGap);
        }
        if (expandedGap < gap4) {
            // the requested gap is too LARGE for the space available, so say so
            throw new IllegalArgumentException("gap4 too large: requested " + gap4
                + " but only a gap of " + expandedGap + " fits");
        }
        final4Multiplier = expandedGap + 1;
        final4Count = neededPerFinalByte;
        if (DEBUG) {
            System.out.println("final4Count: " + final4Count);
            for (int counter = 0; counter <= final4Count; ++counter) {
                int value = minTrail + (1 + counter)*final4Multiplier;
                System.out.println(counter + "\t" + value + "\t" + Utility.hex(value));
            }
        }
    }

    /**
     * Integer division rounding up, computed as {@code 1 + (a-1)/b}.
     * Intended for positive {@code a} and {@code b}; note that for
     * {@code a == 0} the formula yields 1, not 0.
     *
     * @param a dividend
     * @param b divisor, must not be zero
     * @return the quotient rounded up
     */
    public static int divideAndRoundUp(int a, int b) {
        return 1 + (a-1)/b;
    }

    /**
     * Generate the implicit CE, left shifted to put the first byte at the top of an int.
     * Inputs below the 3/4 boundary produce a 3-byte form (lowest byte zero);
     * all other inputs produce a 4-byte form.
     *
     * @param cp code point, in the range 0..MAX_INPUT
     * @return the weight bytes packed into an int, lead byte in the top byte
     * @throws IllegalArgumentException if cp is out of range, or if the
     *         computed lead byte falls outside the range reserved for its form
     */
    public int getImplicit(int cp) {
        if (cp < 0 || cp > MAX_INPUT) {
            throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
        }
        int last0 = cp - min4Boundary;
        if (last0 < 0) {
            // 3-byte form: split cp into final, medial and lead digits
            int last1 = cp / final3Count;
            last0 = cp % final3Count;

            int last2 = last1 / medialCount;
            last1 %= medialCount;

            last0 = minTrail + (last0 + 1)*final3Multiplier - 1; // spread out, leaving gap at start
            last1 = minTrail + last1; // offset
            last2 = min3Primary + last2; // offset

            if (last2 >= min4Primary) {
                throw new IllegalArgumentException("3-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
            }

            return (last2 << 24) + (last1 << 16) + (last0 << 8);
        } else {
            // 4-byte form: split the offset past the boundary into four digits
            int last1 = last0 / final4Count;
            last0 %= final4Count;

            int last2 = last1 / medialCount;
            last1 %= medialCount;

            int last3 = last2 / medialCount;
            last2 %= medialCount;

            last0 = minTrail + (last0 + 1)*final4Multiplier - 1; // spread out, leaving gap at start
            last1 = minTrail + last1; // offset
            last2 = minTrail + last2; // offset
            last3 = min4Primary + last3; // offset

            if (last3 > max4Primary) {
                throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
            }

            return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
        }
    }

    /**
     * @return the lowest value allowed in a trailing byte
     */
    public int getMinTrail() {
        return minTrail;
    }

    /**
     * @return the highest value allowed in a trailing byte
     */
    public int getMaxTrail() {
        return maxTrail;
    }
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $
* $Date: 2003/08/22 16:51:21 $
* $Revision: 1.16 $
* $Date: 2004/01/13 18:32:12 $
* $Revision: 1.17 $
*
*******************************************************************************
*/
@ -17,7 +17,7 @@ import com.ibm.text.utility.*;
public class Main {
static final String UCDVersion = "";
static final String UCDVersion = "4.0.0";
static final String[] ICU_FILES = {"writeCollationValidityLog", "writeFractionalUCA",
"WriteRules", "WriteRulesXML", "writeconformance", "writeconformanceshifted",
"short",
@ -89,7 +89,7 @@ public class Main {
else if (arg.equalsIgnoreCase("noCE")) noCE = !noCE;
else if (arg.equalsIgnoreCase("writeAllocation")) WriteCharts.writeAllocation();
else if (arg.equalsIgnoreCase("probe")) Probe.test();
// else if (arg.equalsIgnoreCase("probe")) Probe.test();
else {

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA_Types.java,v $
* $Date: 2003/08/22 16:51:21 $
* $Revision: 1.5 $
* $Date: 2004/01/13 18:32:11 $
* $Revision: 1.6 $
*
*******************************************************************************
*/
@ -20,7 +20,7 @@ public interface UCA_Types {
* Version of the UCA tables to use
*/
//private static final String VERSION = "-3.0.1d3"; // ""; // "-2.1.9d7";
public static final String UCA_BASE = "4.0.0d5"; // "3.1.1"; // ; // ""; // "-2.1.9d7";
public static final String UCA_BASE = "4.0.0"; // "3.1.1"; // ; // ""; // "-2.1.9d7";
public static final String VERSION = "-" + UCA_BASE; // + "d6" ""; // "-2.1.9d7";
public static final String ALLFILES = "allkeys"; // null if not there

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
* $Date: 2003/08/22 16:51:21 $
* $Revision: 1.35 $
* $Date: 2004/01/13 18:32:11 $
* $Revision: 1.36 $
*
*******************************************************************************
*/
@ -319,10 +319,10 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
}
}
PrintWriter log = Utility.openPrintWriter(UCA_GEN_DIR, filename + (shortPrint ? "_SHORT" : "") + ".txt", Utility.UTF8_WINDOWS);
String fullFileName = filename + (shortPrint ? "_SHORT" : "") + ".txt";
PrintWriter log = Utility.openPrintWriter(UCA_GEN_DIR, fullFileName, Utility.UTF8_WINDOWS);
//if (!shortPrint) log.write('\uFEFF');
log.println("# UCA Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion());
log.println("# Generated: " + getNormalDate());
writeVersionAndDate(log, fullFileName);
System.out.println("Sorting");
int counter = 0;
@ -448,6 +448,14 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
System.out.println("Done");
}
/**
 * Writes the standard header to a generated file: the file name, the UCA
 * data version, the UCD version, the generation date, and a blank line.
 * Each header line starts with "# ".
 *
 * @param log writer for the file being generated
 * @param filename name written on the "# File:" line
 */
private static void writeVersionAndDate(PrintWriter log, String filename) {
    log.println("# File: " + filename);
    log.println("# UCA Version: " + collator.getDataVersion());
    // the UCD line must report the UCD version, not repeat the UCA data version
    log.println("# UCD Version: " + collator.getUCDVersion());
    log.println("# Generated: " + getNormalDate());
    log.println();
}
/**
 * Convenience overload: converts x with UTF32.valueOf32 and passes the
 * result, together with option, to the other addStringX overload.
 * NOTE(review): x is presumably a code point -- confirm against callers.
 */
static void addStringX(int x, byte option) {
addStringX(UTF32.valueOf32(x), option);
}
@ -703,7 +711,8 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
}*/
static void testCompatibilityCharacters() throws IOException {
log = Utility.openPrintWriter(UCA_GEN_DIR, "UCA_CompatComparison.txt", Utility.UTF8_WINDOWS);
String fullFileName = "UCA_CompatComparison.txt";
log = Utility.openPrintWriter(UCA_GEN_DIR, fullFileName, Utility.UTF8_WINDOWS);
int[] kenCes = new int[50];
int[] markCes = new int[50];
@ -750,8 +759,9 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
Iterator it = forLater.keySet().iterator();
byte oldType = (byte)0xFF; // anything unique
int caseCount = 0;
log.println("# UCA Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion());
log.println("Generated: " + getNormalDate());
writeVersionAndDate(log, fullFileName);
//log.println("# UCA Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion());
//log.println("Generated: " + getNormalDate());
while (it.hasNext()) {
String key = (String) it.next();
byte type = (byte)key.charAt(0);
@ -1197,7 +1207,8 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
"UTF8"),
32*1024));
*/
PrintWriter diLog = Utility.openPrintWriter(UCA_GEN_DIR, "UCA_Contractions.txt", Utility.UTF8_WINDOWS);
String fullFileName = "UCA_Contractions.txt";
PrintWriter diLog = Utility.openPrintWriter(UCA_GEN_DIR, fullFileName, Utility.UTF8_WINDOWS);
diLog.write('\uFEFF');
@ -1209,8 +1220,9 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
int[] lenArray = new int[1];
diLog.println("# Contractions");
diLog.println("# Generated " + getNormalDate());
diLog.println("# UCA Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion());
writeVersionAndDate(diLog, fullFileName);
//diLog.println("# Generated " + getNormalDate());
//diLog.println("# UCA Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion());
while (true) {
String s = cc.next(ces, lenArray);
if (s == null) break;
@ -3195,8 +3207,8 @@ F900..FAFF; CJK Compatibility Ideographs
if (firstTrailing.isUnset()) {
System.out.println("No first/last trailing: resetting");
firstTrailing.setValue(IMPLICIT_LIMIT_BYTE+1, COMMON, COMMON, "");
lastTrailing.setValue(IMPLICIT_LIMIT_BYTE+1, COMMON, COMMON, "");
firstTrailing.setValue(IMPLICIT_MAX_BYTE+1, COMMON, COMMON, "");
lastTrailing.setValue(IMPLICIT_MAX_BYTE+1, COMMON, COMMON, "");
System.out.println(firstTrailing.formatFCE());
}
@ -3208,8 +3220,8 @@ F900..FAFF; CJK Compatibility Ideographs
log.println("# superceded! [top " + lastNonIgnorable.formatFCE() + "]");
log.println("[fixed first implicit byte " + Utility.hex(IMPLICIT_BASE_BYTE,2) + "]");
log.println("[fixed last implicit byte " + Utility.hex(IMPLICIT_LIMIT_BYTE,2) + "]");
log.println("[fixed first trail byte " + Utility.hex(IMPLICIT_LIMIT_BYTE+1,2) + "]");
log.println("[fixed last implicit byte " + Utility.hex(IMPLICIT_MAX_BYTE,2) + "]");
log.println("[fixed first trail byte " + Utility.hex(IMPLICIT_MAX_BYTE+1,2) + "]");
log.println("[fixed last trail byte " + Utility.hex(SPECIAL_BASE-1,2) + "]");
log.println("[fixed first special byte " + Utility.hex(SPECIAL_BASE,2) + "]");
log.println("[fixed last special byte " + Utility.hex(0xFF,2) + "]");
@ -3509,7 +3521,7 @@ static int swapCJK(int i) {
IMPLICIT_3BYTE_COUNT = 1,
IMPLICIT_BASE_BYTE = 0xE0,
IMPLICIT_LIMIT_BYTE = IMPLICIT_BASE_BYTE + 4, // leave room for 1 3-byte and 2 4-byte forms
IMPLICIT_MAX_BYTE = IMPLICIT_BASE_BYTE + 4, // leave room for 1 3-byte and 2 4-byte forms
IMPLICIT_4BYTE_BOUNDARY = IMPLICIT_3BYTE_COUNT * OTHER_COUNT * LAST_COUNT,
LAST_MULTIPLIER = OTHER_COUNT / LAST_COUNT,
@ -3534,49 +3546,10 @@ static int swapCJK(int i) {
return getImplicitPrimaryFromSwapped(cp);
}
static Implicit implicit = new Implicit(IMPLICIT_BASE_BYTE, IMPLICIT_MAX_BYTE);
static int getImplicitPrimaryFromSwapped(int cp) {
// we must skip all 00, 01, 02 bytes, so most bytes have 253 values
// we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
// we shift so that HAN all has the same first primary, for compression.
// for the 4 byte case, we make the gap as large as we can fit.
// Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
// Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)
int last0 = cp - IMPLICIT_4BYTE_BOUNDARY;
if (last0 < 0) {
int last1 = cp / LAST_COUNT;
last0 = cp % LAST_COUNT;
int last2 = last1 / OTHER_COUNT;
last1 %= OTHER_COUNT;
if (DEBUG || last2 > 0xFF-BYTES_TO_AVOID) System.out.println("3B: " + Utility.hex(cp) + " => "
+ Utility.hex(last2) + ", "
+ Utility.hex(last1) + ", "
+ Utility.hex(last0) + ", "
);
return IMPLICIT_BASE_3BYTE + (last2 << 24) + (last1 << 16) + ((last0*LAST_MULTIPLIER) << 8);
} else {
int last1 = last0 / LAST_COUNT2;
last0 %= LAST_COUNT2;
int last2 = last1 / OTHER_COUNT;
last1 %= OTHER_COUNT;
int last3 = last2 / OTHER_COUNT;
last2 %= OTHER_COUNT;
if (DEBUG || last3 > 0xFF-BYTES_TO_AVOID) System.out.println("4B: " + Utility.hex(cp) + " => "
+ Utility.hex(last3) + ", "
+ Utility.hex(last2) + ", "
+ Utility.hex(last1) + ", "
+ Utility.hex(last0 * LAST2_MULTIPLIER) + ", "
);
return IMPLICIT_BASE_4BYTE + (last3 << 24) + (last2 << 16) + (last1 << 8) + (last0 * LAST2_MULTIPLIER);
}
return implicit.getImplicit(cp);
}
@ -3679,7 +3652,7 @@ static int swapCJK(int i) {
long b2 = (newPrimary >> 8) & 0xFF;
long b3 = newPrimary & 0xFF;
if (b0 < IMPLICIT_BASE_BYTE || b0 >= IMPLICIT_LIMIT_BYTE || b1 < 3 || b2 < 3 || b3 == 1 || b3 == 2) {
if (b0 < IMPLICIT_BASE_BYTE || b0 > IMPLICIT_MAX_BYTE || b1 < 3 || b2 < 3 || b3 == 1 || b3 == 2) {
throw new IllegalArgumentException(Utility.hex(i) + ": illegal byte value: " + Utility.hex(newPrimary)
+ ", " + Utility.hex(b1) + ", " + Utility.hex(b2) + ", " + Utility.hex(b3));
}