shifted the implicit range.
X-SVN-Rev: 8767
This commit is contained in:
parent
0078f2f53e
commit
012100e9dd
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $
|
||||
* $Date: 2002/05/31 01:41:03 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2002/06/02 05:07:08 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -18,7 +18,8 @@ import com.ibm.text.utility.*;
|
||||
|
||||
public class Main {
|
||||
static final String UCDVersion = "";
|
||||
static final String[] ICU_FILES = {"FractionalUCA", "writeconformance", "writeconformanceshifted", "WriteRules"};
|
||||
static final String[] ICU_FILES = {"FractionalUCA", "writeconformance", "writeconformanceshifted",
|
||||
"WriteRules", "WriteRulesWithNames", "WriteRulesXML"};
|
||||
|
||||
public static void main(String args[]) throws Exception {
|
||||
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
|
||||
* $Date: 2002/05/31 01:41:03 $
|
||||
* $Revision: 1.11 $
|
||||
* $Date: 2002/06/02 05:07:08 $
|
||||
* $Revision: 1.12 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -639,9 +639,72 @@ final public class UCA implements Comparator {
|
||||
}
|
||||
|
||||
|
||||
static boolean isImplicitCE(int ce) {
|
||||
int primary = getPrimary(ce);
|
||||
return primary >= UNSUPPORTED_BASE && primary <= UNSUPPORTED_TOP;
|
||||
static boolean isImplicitLeadCE(int ce) {
|
||||
return isImplicitLeadPrimary(getPrimary(ce));
|
||||
}
|
||||
|
||||
static boolean isImplicitLeadPrimary(int primary) {
|
||||
return primary >= UNSUPPORTED_BASE && primary < UNSUPPORTED_LIMIT;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
The formula from the UCA:
|
||||
|
||||
BASE:
|
||||
|
||||
FB40 CJK Ideograph
|
||||
FB80 CJK Ideograph Extension A/B
|
||||
FBC0 Any other code point
|
||||
|
||||
AAAA = BASE + (CP >> 15);
|
||||
BBBB = (CP & 0x7FFF) | 0x8000;

The mapping given to CP is then given by:
|
||||
|
||||
CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
||||
*/
|
||||
|
||||
/**
|
||||
* Returns implicit value as pair, first part in high word; second part in low word
|
||||
* So to get first part use (x >>> 16) -- remember the >>>!
|
||||
* and to get the second part use (x & 0xFFFF)
|
||||
*/
|
||||
|
||||
static void CodepointToImplicit(int cp, int[] output) {
|
||||
int base = UNSUPPORTED_OTHER_BASE;
|
||||
if (isCJK(cp)) base = UNSUPPORTED_CJK_BASE;
|
||||
else if (isCJK_AB(cp)) base = UNSUPPORTED_CJK_AB_BASE;
|
||||
output[0] = base + (cp >>> 15);
|
||||
output[1] = (cp & 0x7FFF) | 0x8000;
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes implicit value as pair, first part in high word; second part in low word
|
||||
* So to get first part use (x >>> 16) -- remember the >>>!
|
||||
* and to get the second part use (x & 0xFFFF)
|
||||
*/
|
||||
|
||||
static int ImplicitToCodePoint(int leadImplicit, int trailImplicit) {
|
||||
// could probably optimize all this, but it is not worth it.
|
||||
if (leadImplicit < UNSUPPORTED_BASE || leadImplicit >= UNSUPPORTED_LIMIT) {
|
||||
throw new IllegalArgumentException("Lead implicit out of bounds: " + Utility.hex(leadImplicit));
|
||||
}
|
||||
if ((trailImplicit & 0x8000) == 0) {
|
||||
throw new IllegalArgumentException("Trail implicit out of bounds: " + Utility.hex(trailImplicit));
|
||||
}
|
||||
int base;
|
||||
if (leadImplicit >= UNSUPPORTED_OTHER_BASE) base = UNSUPPORTED_OTHER_BASE;
|
||||
else if (leadImplicit >= UNSUPPORTED_CJK_AB_BASE) base = UNSUPPORTED_CJK_AB_BASE;
|
||||
else base = UNSUPPORTED_CJK_BASE;
|
||||
|
||||
int result = ((leadImplicit - base) << 15) | (trailImplicit & 0x7FFF);
|
||||
|
||||
if (result > 0x10FFFF) {
|
||||
throw new IllegalArgumentException("Resulting character out of bounds: "
|
||||
+ Utility.hex(leadImplicit) + ", " + Utility.hex(trailImplicit)
|
||||
+ " => " + result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -784,10 +847,17 @@ final public class UCA implements Comparator {
|
||||
// Collation Element Memory Data Table Formats
|
||||
// =============================================================
|
||||
|
||||
/**
|
||||
* Used to compose Hangul and Han characters
|
||||
*/
|
||||
|
||||
static final int NEUTRAL_SECONDARY = 0x20;
|
||||
static final int NEUTRAL_TERTIARY = 0x02;
|
||||
|
||||
/**
|
||||
* Temporary buffer used in getSortKey for the decomposed string
|
||||
*/
|
||||
StringBuffer decompositionBuffer = new StringBuffer();
|
||||
private StringBuffer decompositionBuffer = new StringBuffer();
|
||||
|
||||
/**
|
||||
* The collation element data is stored in a couple of different structures.
|
||||
@ -798,20 +868,14 @@ final public class UCA implements Comparator {
|
||||
* table of simple collation elements, indexed by char.<br>
|
||||
* Exceptional cases: expanding, contracting, unsupported are handled as described below.
|
||||
*/
|
||||
int[] collationElements = new int[65536];
|
||||
private int[] collationElements = new int[65536];
|
||||
|
||||
/**
|
||||
* A special bit combination in a CE is used to reserve exception cases. This has the effect
|
||||
* of removing a small number of the primary key values out of the 65536 possible.
|
||||
*/
|
||||
static final int EXCEPTION_CE_MASK = 0xF8000000;
|
||||
private static final int EXCEPTION_CE_MASK = 0xF8000000;
|
||||
|
||||
/**
|
||||
* Used to compose Hangul and Han characters
|
||||
*/
|
||||
|
||||
static final int NEUTRAL_SECONDARY = 0x20;
|
||||
static final int NEUTRAL_TERTIARY = 0x02;
|
||||
|
||||
/**
|
||||
* Any unsupported characters (those not in the UCA data tables)
|
||||
@ -820,14 +884,14 @@ final public class UCA implements Comparator {
|
||||
* There are at least 34 values, so that we can use a range for surrogates
|
||||
* However, we do add to the first weight if we have surrogate pairs!
|
||||
*/
|
||||
public static final int UNSUPPORTED_CJK_BASE = 0xFF40;
|
||||
public static final int UNSUPPORTED_CJK_AB_BASE = 0xFF80;
|
||||
public static final int UNSUPPORTED_OTHER_BASE = 0xFFC0;
|
||||
private static final int UNSUPPORTED_CJK_BASE = 0xFF40;
|
||||
private static final int UNSUPPORTED_CJK_AB_BASE = 0xFF80;
|
||||
private static final int UNSUPPORTED_OTHER_BASE = 0xFFC0;
|
||||
|
||||
public static final int UNSUPPORTED_BASE = UNSUPPORTED_CJK_BASE;
|
||||
public static final int UNSUPPORTED_TOP = UNSUPPORTED_OTHER_BASE + 0x40;
|
||||
private static final int UNSUPPORTED_BASE = UNSUPPORTED_CJK_BASE;
|
||||
private static final int UNSUPPORTED_LIMIT = UNSUPPORTED_OTHER_BASE + 0x40;
|
||||
|
||||
static final int UNSUPPORTED = makeKey(UNSUPPORTED_BASE, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
|
||||
private static final int UNSUPPORTED = makeKey(UNSUPPORTED_BASE, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
|
||||
|
||||
// was 0xFFC20101;
|
||||
|
||||
@ -838,7 +902,7 @@ final public class UCA implements Comparator {
|
||||
* to be looked up (with following characters) in the contractingTable.<br>
|
||||
* This isn't a MASK since there is exactly one value.
|
||||
*/
|
||||
static final int CONTRACTING = 0xFA310000;
|
||||
private static final int CONTRACTING = 0xFA310000;
|
||||
|
||||
/**
|
||||
* Expanding characters are marked with an exception bit combination
|
||||
@ -846,7 +910,7 @@ final public class UCA implements Comparator {
|
||||
* This means that they map to more than one CE, which is looked up in
|
||||
* the expansionTable by index. See EXCEPTION_INDEX_MASK
|
||||
*/
|
||||
static final int EXPANDING_MASK = 0xFA300000; // marks expanding range start
|
||||
private static final int EXPANDING_MASK = 0xFA300000; // marks expanding range start
|
||||
|
||||
/**
|
||||
* This mask is used to get the index from an EXPANDING exception.
|
||||
@ -860,12 +924,12 @@ final public class UCA implements Comparator {
|
||||
* as the table is built from the UCA data, they are narrowed in.
|
||||
* The first three values are used in building; the last two in testing.
|
||||
*/
|
||||
int variableLow = '\uFFFF';
|
||||
int nonVariableLow = '\uFFFF'; // HACK '\u089A';
|
||||
int variableHigh = '\u0000';
|
||||
private int variableLow = '\uFFFF';
|
||||
private int nonVariableLow = '\uFFFF'; // HACK '\u089A';
|
||||
private int variableHigh = '\u0000';
|
||||
|
||||
int variableLowCE; // used for testing against
|
||||
int variableHighCE; // used for testing against
|
||||
private int variableLowCE; // used for testing against
|
||||
private int variableHighCE; // used for testing against
|
||||
|
||||
/**
|
||||
* Although a single character can expand into multiple CEs, we don't want to burden
|
||||
@ -875,19 +939,19 @@ final public class UCA implements Comparator {
|
||||
* will be used for the expansion. The implementation is as a stack; this just makes it
|
||||
* easy to generate.
|
||||
*/
|
||||
IntStack expandingTable = new IntStack(3600); // initial number is from compKeys
|
||||
private IntStack expandingTable = new IntStack(3600); // initial number is from compKeys
|
||||
|
||||
/**
|
||||
* For now, this is just a simple mapping of strings to collation elements.
|
||||
* The implementation depends on the contracting characters being "completed",
|
||||
* so that it can be efficiently determined when to stop looking.
|
||||
*/
|
||||
Hashtable contractingTable = new Hashtable();
|
||||
private Hashtable contractingTable = new Hashtable();
|
||||
|
||||
/**
|
||||
* Special char value that means failed or terminated
|
||||
*/
|
||||
static final char NOT_A_CHAR = '\uFFFF';
|
||||
private static final char NOT_A_CHAR = '\uFFFF';
|
||||
|
||||
/**
|
||||
* Marks whether we are using the full data set, or an abbreviated version for
|
||||
@ -1006,37 +1070,19 @@ final public class UCA implements Comparator {
|
||||
index++; // skip next char
|
||||
bigChar = 0x10000 + ((ch - 0xD800) << 10) + (ch2 - 0xDC00); // extract value
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
The formula from the UCA:
|
||||
|
||||
BASE:
|
||||
|
||||
FB40 CJK Ideograph
|
||||
FB80 CJK Ideograph Extension A/B
|
||||
FBC0 Any other code point
|
||||
|
||||
AAAA = BASE + (CP >> 15);
|
||||
BBBB = (CP & 0x7FFF) | 0x8000;

The mapping given to CP is then given by:
|
||||
|
||||
CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
||||
*/
|
||||
// divide the three cases
|
||||
// find the implicit values; returned in 0 and 1
|
||||
int[] implicit = new int[2];
|
||||
CodepointToImplicit(bigChar, implicit);
|
||||
|
||||
int base = UNSUPPORTED_OTHER_BASE;
|
||||
if (isCJK(bigChar)) base = UNSUPPORTED_CJK_BASE;
|
||||
else if (isCJK_AB(bigChar)) base = UNSUPPORTED_CJK_AB_BASE;
|
||||
|
||||
// Now compose the two keys
|
||||
// first push BBBB
|
||||
// first push BBBB, which is #1
|
||||
|
||||
// HACK: expandingStack.push(makeKey((bigChar & 0x7FFF) | 0x8000, 0, 0));
|
||||
expandingStack.push(makeKey((bigChar & 0x7FFF) | 0x8000, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY));
|
||||
expandingStack.push(makeKey(implicit[1], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY));
|
||||
|
||||
// now return AAAA
|
||||
// now return AAAA, which is #0
|
||||
|
||||
return makeKey(base + (bigChar >>> 15), NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
|
||||
return makeKey(implicit[0], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
|
||||
|
||||
}
|
||||
if (ce == CONTRACTING) {
|
||||
@ -1113,17 +1159,64 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
||||
return expandingStack.pop(); // pop last (guaranteed to exist!)
|
||||
}
|
||||
|
||||
public final boolean isCJK(int bigChar) {
|
||||
return (0x4E00 <= bigChar && bigChar <= 0x9FFF);
|
||||
// Neither Mapped nor Composite CJK: [\u3400-\u4DB5\u4E00-\u9FA5\U00020000-\U0002A6D6]
|
||||
|
||||
public static boolean isCJK(int cp) {
|
||||
return (CJK_BASE <= cp && cp < CJK_LIMIT
|
||||
|| cp == 0xFA0E // compat characters that don't decompose.
|
||||
|| cp == 0xFA0F
|
||||
|| cp == 0xFA11
|
||||
|| cp == 0xFA13
|
||||
|| cp == 0xFA14
|
||||
|| cp == 0xFA1F
|
||||
|| cp == 0xFA21
|
||||
|| cp == 0xFA23
|
||||
|| cp == 0xFA24
|
||||
|| cp == 0xFA27
|
||||
|| cp == 0xFA28
|
||||
|| cp == 0xFA29
|
||||
|| cp == 0xFA2E
|
||||
|| cp == 0xFA2F
|
||||
);
|
||||
}
|
||||
public final boolean isCJK_AB(int bigChar) {
|
||||
return (0x3400 <= bigChar && bigChar <= 0x4DBF
|
||||
|| 0x20000 <= bigChar && bigChar <= 0x2A6DF);
|
||||
|
||||
public static final int
|
||||
CJK_BASE = 0x4E00,
|
||||
CJK_LIMIT = 0x9FFF+1,
|
||||
CJK_BASE_COMPAT_USED = 0xFA0E,
|
||||
CJK_LIMIT_COMPAT_USED = 0xFA2F+1,
|
||||
CJK_A_BASE = 0x3400,
|
||||
CJK_A_LIMIT = 0x4DBF+1,
|
||||
CJK_B_BASE = 0x20000,
|
||||
CJK_B_LIMIT = 0x2A6DF+1;
|
||||
|
||||
public static final boolean isCJK_AB(int bigChar) {
|
||||
return (CJK_A_BASE <= bigChar && bigChar < CJK_A_LIMIT
|
||||
|| CJK_B_BASE <= bigChar && bigChar < CJK_B_LIMIT);
|
||||
}
|
||||
/*
|
||||
2E80..2EFF; CJK Radicals Supplement
|
||||
2F00..2FDF; Kangxi Radicals
|
||||
|
||||
3400..4DBF; CJK Unified Ideographs Extension A
|
||||
4E00..9FFF; CJK Unified Ideographs
|
||||
F900..FAFF; CJK Compatibility Ideographs
|
||||
|
||||
20000..2A6DF; CJK Unified Ideographs Extension B
|
||||
2F800..2FA1F; CJK Compatibility Ideographs Supplement
|
||||
|
||||
Compat:
|
||||
# F900..FA0D [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D
|
||||
# FA10 CJK COMPATIBILITY IDEOGRAPH-FA10
|
||||
# FA12 CJK COMPATIBILITY IDEOGRAPH-FA12
|
||||
# FA15..FA1E [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E
|
||||
# FA20 CJK COMPATIBILITY IDEOGRAPH-FA20
|
||||
# FA22 CJK COMPATIBILITY IDEOGRAPH-FA22
|
||||
# FA25..FA26 [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26
|
||||
# FA2A..FA2D [4] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA2D
|
||||
# FA30..FA6A [59] CJK COMPATIBILITY IDEOGRAPH-FA30..CJK COMPATIBILITY IDEOGRAPH-FA6A
|
||||
# 2F800..2FA1D [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
|
||||
|
||||
*/
|
||||
|
||||
private final boolean isHangul(int bigChar) {
|
||||
@ -1348,6 +1441,8 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
||||
IntStack tempStack = new IntStack(100); // used for reversal
|
||||
StringBuffer multiChars = new StringBuffer(); // used for contracting chars
|
||||
String inputLine = "";
|
||||
boolean[] wasImplicitLeadPrimary = new boolean[1];
|
||||
|
||||
while (true) try {
|
||||
inputLine = in.readLine();
|
||||
if (inputLine == null) break; // means file is done
|
||||
@ -1413,8 +1508,10 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
||||
System.out.println("debug");
|
||||
}
|
||||
|
||||
int ce = getCEFromLine(value, line, position, record);
|
||||
int ce2 = getCEFromLine(value, line, position, record);
|
||||
wasImplicitLeadPrimary[0] = false;
|
||||
|
||||
int ce = getCEFromLine(value, line, position, record, wasImplicitLeadPrimary);
|
||||
int ce2 = getCEFromLine(value, line, position, record, wasImplicitLeadPrimary);
|
||||
if (CHECK_UNIQUE && (ce2 == TERMINATOR || CHECK_UNIQUE_EXPANSIONS)) {
|
||||
if (!CHECK_UNIQUE_VARIABLES) {
|
||||
checkUnique(value, ce, 0, inputLine); // only need to check first value
|
||||
@ -1433,7 +1530,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
||||
// set collationElement to exception value, plus index
|
||||
ce = EXPANDING_MASK | expandingTable.getTop();
|
||||
while (true) {
|
||||
ce2 = getCEFromLine(value, line, position, record);
|
||||
ce2 = getCEFromLine(value, line, position, record, wasImplicitLeadPrimary);
|
||||
if (ce2 == TERMINATOR) break;
|
||||
tempStack.push(ce2);
|
||||
}
|
||||
@ -1700,7 +1797,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
||||
boolean haveVariableWarning = false;
|
||||
boolean haveZeroVariableWarning = false;
|
||||
|
||||
private int getCEFromLine(char value, String line, int[] position, boolean record) {
|
||||
private int getCEFromLine(char value, String line, int[] position, boolean record, boolean[] lastWasImplicitLead) {
|
||||
int start = line.indexOf('[', position[0]);
|
||||
if (start == -1) return TERMINATOR;
|
||||
boolean variable = line.charAt(start+1) == '*';
|
||||
@ -1711,7 +1808,13 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
||||
int key2 = Integer.parseInt(line.substring(start+7,start+11),16);
|
||||
int key3 = Integer.parseInt(line.substring(start+12,start+16),16);
|
||||
if (record) {
|
||||
primarySet.set(key1);
|
||||
if (lastWasImplicitLead[0]) {
|
||||
lastWasImplicitLead[0] = false;
|
||||
} else if (isImplicitLeadPrimary(key1)) {
|
||||
lastWasImplicitLead[0] = true;
|
||||
} else {
|
||||
primarySet.set(key1);
|
||||
}
|
||||
secondarySet.set(key2);
|
||||
secondaryCount[key2]++;
|
||||
tertiarySet.set(key3);
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $
|
||||
* $Date: 2002/05/31 01:41:03 $
|
||||
* $Revision: 1.9 $
|
||||
* $Date: 2002/06/02 05:07:08 $
|
||||
* $Revision: 1.10 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -113,7 +113,7 @@ public class WriteCharts implements UCD_Types {
|
||||
else if (primary == 0) script = IGNORABLE_ORDER;
|
||||
else if (primary < variable) script = VARIABLE_ORDER;
|
||||
else if (primary < high) script = COMMON_SCRIPT;
|
||||
else if (primary >= UCA.UNSUPPORTED_BASE && primary <= UCA.UNSUPPORTED_TOP) script = UNSUPPORTED;
|
||||
else if (UCA.isImplicitLeadPrimary(primary)) script = UNSUPPORTED;
|
||||
|
||||
if (script == KATAKANA_SCRIPT) script = HIRAGANA_SCRIPT;
|
||||
else if ((script == INHERITED_SCRIPT || script == COMMON_SCRIPT) && oldScript >= 0) script = oldScript;
|
||||
@ -147,7 +147,7 @@ public class WriteCharts implements UCD_Types {
|
||||
for (int i = 0; i < sortKey.length(); ++i) {
|
||||
char w = sortKey.charAt(i);
|
||||
if (w == 0) break;
|
||||
if (w >= UCA.UNSUPPORTED_BASE && w <= UCA.UNSUPPORTED_TOP) {
|
||||
if (UCA.isImplicitLeadPrimary(w)) {
|
||||
++i; // skip next
|
||||
}
|
||||
++ primaryCount;
|
||||
@ -571,7 +571,7 @@ public class WriteCharts implements UCD_Types {
|
||||
|
||||
static int getFirstPrimary(String sortKey) {
|
||||
int result = sortKey.charAt(0);
|
||||
if (result >= UCA.UNSUPPORTED_BASE && result <= UCA.UNSUPPORTED_TOP) {
|
||||
if (UCA.isImplicitLeadPrimary(result)) {
|
||||
return (result << 16) | sortKey.charAt(1);
|
||||
}
|
||||
return (result << 16);
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
|
||||
* $Date: 2002/05/31 01:41:03 $
|
||||
* $Revision: 1.13 $
|
||||
* $Date: 2002/06/02 05:07:08 $
|
||||
* $Revision: 1.14 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -15,6 +15,7 @@ package com.ibm.text.UCA;
|
||||
|
||||
import java.util.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
import java.io.*;
|
||||
//import java.text.*;
|
||||
@ -30,6 +31,9 @@ import com.ibm.text.utility.*;
|
||||
import com.ibm.text.UCD.Normalizer;
|
||||
|
||||
public class WriteCollationData implements UCD_Types {
|
||||
|
||||
static final boolean DEBUG = false;
|
||||
|
||||
public static final String copyright =
|
||||
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
|
||||
|
||||
@ -283,7 +287,7 @@ public class WriteCollationData implements UCD_Types {
|
||||
static void writeConformance(String filename, byte option, boolean shortPrint) throws IOException {
|
||||
UCD ucd30 = UCD.make("3.0.0");
|
||||
|
||||
PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt");
|
||||
PrintWriter log = Utility.openPrintWriter(filename + (shortPrint ? "_SHORT" : "") + ".txt", true, false);
|
||||
if (!shortPrint) log.write('\uFEFF');
|
||||
|
||||
System.out.println("Sorting");
|
||||
@ -1149,6 +1153,7 @@ public class WriteCollationData implements UCD_Types {
|
||||
}
|
||||
|
||||
static Normalizer nfdNew = new Normalizer(Normalizer.NFD, "");
|
||||
static Normalizer NFC = new Normalizer(Normalizer.NFC, "");
|
||||
static Normalizer nfkdNew = new Normalizer(Normalizer.NFKD, "");
|
||||
|
||||
static void writeRules (byte option) throws IOException {
|
||||
@ -1207,6 +1212,56 @@ public class WriteCollationData implements UCD_Types {
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("Checking CJK");
|
||||
|
||||
// Check for characters that ARE explicitly mapped in the CJK ranges
|
||||
UnicodeSet CJK = new UnicodeSet(0x2E80, 0x2EFF);
|
||||
CJK.add(0x2F00, 0x2EFF);
|
||||
CJK.add(0x2F00, 0x2FDF);
|
||||
CJK.add(0x3400, 0x9FFF);
|
||||
CJK.add(0xF900, 0xFAFF);
|
||||
CJK.add(0x20000, 0x2A6DF);
|
||||
CJK.add(0x2F800, 0x2FA1F);
|
||||
CJK.removeAll(new UnicodeSet("[:Cn:]")); // remove unassigned
|
||||
|
||||
// make set with canonical decomposibles
|
||||
UnicodeSet composites = new UnicodeSet();
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if (!ucd.isAllocated(i)) continue;
|
||||
if (nfd.isNormalized(i)) continue;
|
||||
composites.add(i);
|
||||
}
|
||||
UnicodeSet CJKcomposites = new UnicodeSet(CJK).retainAll(composites);
|
||||
System.out.println("CJK composites " + CJKcomposites.toPattern(true));
|
||||
System.out.println("CJK NONcomposites " + new UnicodeSet(CJK).removeAll(composites).toPattern(true));
|
||||
|
||||
UnicodeSet mapped = new UnicodeSet();
|
||||
Iterator it = alreadyDone.iterator();
|
||||
while (it.hasNext()) {
|
||||
String member = (String) it.next();
|
||||
mapped.add(member);
|
||||
}
|
||||
UnicodeSet CJKmapped = new UnicodeSet(CJK).retainAll(mapped);
|
||||
System.out.println("Mapped CJK: " + CJKmapped.toPattern(true));
|
||||
System.out.println("UNMapped CJK: " + new UnicodeSet(CJK).removeAll(mapped).toPattern(true));
|
||||
System.out.println("Neither Mapped nor Composite CJK: "
|
||||
+ new UnicodeSet(CJK).removeAll(CJKcomposites).removeAll(CJKmapped).toPattern(true));
|
||||
|
||||
|
||||
|
||||
/*
|
||||
2E80..2EFF; CJK Radicals Supplement
|
||||
2F00..2FDF; Kangxi Radicals
|
||||
|
||||
3400..4DBF; CJK Unified Ideographs Extension A
|
||||
4E00..9FFF; CJK Unified Ideographs
|
||||
F900..FAFF; CJK Compatibility Ideographs
|
||||
|
||||
20000..2A6DF; CJK Unified Ideographs Extension B
|
||||
2F800..2FA1F; CJK Compatibility Ideographs Supplement
|
||||
*/
|
||||
|
||||
|
||||
System.out.println("Adding Kanji");
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if (!ucd.isAllocated(i)) continue;
|
||||
@ -1236,10 +1291,29 @@ public class WriteCollationData implements UCD_Types {
|
||||
else if (option == IN_XML) filename = "UCA_Rules.xml";
|
||||
log = Utility.openPrintWriter(filename, false, false);
|
||||
|
||||
if (option == IN_XML) log.println("<uca>");
|
||||
else log.write('\uFEFF'); // BOM
|
||||
String[] commentText = {
|
||||
"NOTE: Since UCA handles canonical equivalents, no composites are necessary",
|
||||
"(except in extensions).",
|
||||
"For syntax description, see: http://oss.software.ibm.com/icu/userguide/Collate_Intro.html"
|
||||
};
|
||||
|
||||
Iterator it = ordered.keySet().iterator();
|
||||
if (option == IN_XML) {
|
||||
log.println("<uca>");
|
||||
log.println("<!--");
|
||||
for (int i = 0; i < commentText.length; ++i) {
|
||||
log.println(commentText[i]);
|
||||
}
|
||||
log.println("-->");
|
||||
log.println("<version UCA='" + collator.getDataVersion() + "' UCD='" + collator.getUCDVersion() + "'/>");
|
||||
} else {
|
||||
log.write('\uFEFF'); // BOM
|
||||
for (int i = 0; i < commentText.length; ++i) {
|
||||
log.println("#\t" + commentText[i]);
|
||||
}
|
||||
log.println("# VERSION: UCA=" + collator.getDataVersion() + ", UCD=" + collator.getUCDVersion());
|
||||
}
|
||||
|
||||
it = ordered.keySet().iterator();
|
||||
int oldFirstPrimary = UCA.getPrimary(UCA.TERMINATOR);
|
||||
boolean wasVariable = false;
|
||||
|
||||
@ -1347,46 +1421,6 @@ public class WriteCollationData implements UCD_Types {
|
||||
|
||||
if (len == -1) continue;
|
||||
|
||||
// RESETs: do special case for relations to fixed items
|
||||
|
||||
String reset = "";
|
||||
int xmlReset = 0;
|
||||
|
||||
if (firstTime
|
||||
|| collator.getPrimary(lastCE) == 0 && collator.getPrimary(ce) != 0
|
||||
|| collator.getSecondary(lastCE) == 0 && collator.getSecondary(ce) != 0
|
||||
|| collator.getTertiary(lastCE) == 0 && collator.getTertiary(ce) != 0) {
|
||||
firstTime = false;
|
||||
if (collator.getPrimary(ce) != 0) {
|
||||
reset = "& [top]";
|
||||
} else {
|
||||
reset = "& " + quoteOperand(chr);
|
||||
}
|
||||
} else if (variableTop != 0 && (ce & 0xFFFF0000L) > variableTop) {
|
||||
reset = "= [variable\\u0020top]";
|
||||
xmlReset = 1;
|
||||
variableTop = 0;
|
||||
} else {
|
||||
char primary = collator.getPrimary(ce);
|
||||
if (isFixedIdeograph(remapUCA_CompatibilityIdeographToCp(primary))) {
|
||||
if (primary != lastCJKPrimary) {
|
||||
reset = "& " + quoteOperand(String.valueOf(primary));
|
||||
lastCE = UCA.makeKey(primary, UCA.NEUTRAL_SECONDARY, UCA.NEUTRAL_TERTIARY);
|
||||
xmlReset = 2;
|
||||
}
|
||||
}
|
||||
lastCJKPrimary = primary;
|
||||
}
|
||||
|
||||
/*
|
||||
if (primary >= 0x3400) {
|
||||
if (primary == 0x9FA6) {
|
||||
primary = '\u9FA5';
|
||||
}
|
||||
if (primary < 0x9FA6) {
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
// get relation
|
||||
|
||||
@ -1398,6 +1432,50 @@ public class WriteCollationData implements UCD_Types {
|
||||
|
||||
int relation = getStrengthDifference(ces, len, lastCes, lastLen);
|
||||
|
||||
// RESETs: do special case for relations to fixed items
|
||||
|
||||
String reset = "";
|
||||
String resetComment = "";
|
||||
int xmlReset = 0;
|
||||
|
||||
if (firstTime
|
||||
|| collator.getPrimary(lastCE) == 0 && collator.getPrimary(ce) != 0
|
||||
|| collator.getSecondary(lastCE) == 0 && collator.getSecondary(ce) != 0
|
||||
|| collator.getTertiary(lastCE) == 0 && collator.getTertiary(ce) != 0) {
|
||||
firstTime = false;
|
||||
if (collator.getPrimary(ce) != 0) {
|
||||
reset = "[top]";
|
||||
} else {
|
||||
reset = quoteOperand(chr);
|
||||
}
|
||||
} else if (variableTop != 0 && (ce & 0xFFFF0000L) > variableTop) {
|
||||
reset = "[variable\\u0020top]";
|
||||
xmlReset = 1;
|
||||
variableTop = 0;
|
||||
} else {
|
||||
int primary = collator.getPrimary(ce);
|
||||
if (UCA.isImplicitLeadPrimary(primary)) {
|
||||
if (relation == PRIMARY_DIFF) {
|
||||
int resetCp = UCA.ImplicitToCodePoint(primary, UCA.getPrimary(ces[1]));
|
||||
reset = quoteOperand(UTF16.valueOf(resetCp));
|
||||
resetComment = ucd.getCodeAndName(resetCp);
|
||||
// lastCE = UCA.makeKey(primary, UCA.NEUTRAL_SECONDARY, UCA.NEUTRAL_TERTIARY);
|
||||
xmlReset = 2;
|
||||
}
|
||||
// lastCJKPrimary = primary;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
if (primary >= 0x3400) {
|
||||
if (primary == 0x9FA6) {
|
||||
primary = '\u9FA5';
|
||||
}
|
||||
if (primary < 0x9FA6) {
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
if (chr.equals("\u2F00")) {
|
||||
System.out.println(UCA.ceToString(ces, len));
|
||||
}
|
||||
@ -1405,7 +1483,7 @@ public class WriteCollationData implements UCD_Types {
|
||||
// There are double-CEs, so we have to know what the length of the first bit is.
|
||||
|
||||
int expansionStart = 1;
|
||||
if (UCA.isImplicitCE(ces[0])) {
|
||||
if (UCA.isImplicitLeadCE(ces[0])) {
|
||||
expansionStart = 2; // move up if first is double-ce
|
||||
}
|
||||
|
||||
@ -1432,16 +1510,17 @@ public class WriteCollationData implements UCD_Types {
|
||||
*/
|
||||
|
||||
if (xmlReset == 2) {
|
||||
log.print("<reset anchor=\"" + Utility.quoteXML(String.valueOf(collator.getPrimary(ce))) + "\"/>");
|
||||
log.print("<reset>" + Utility.quoteXML(reset) + "</reset>");
|
||||
}
|
||||
log.print(" <" + XML_RELATION_NAMES[relation]);
|
||||
log.print(" s=\"" + Utility.quoteXML(chr) + "\"");
|
||||
if (len > 1) {
|
||||
log.print(" expansion=\"" + Utility.quoteXML(expansion) + "\"");
|
||||
log.print(" <" + XML_RELATION_NAMES[relation] + ">");
|
||||
if (expansion.length() > 0) {
|
||||
log.print("<x>" + Utility.quoteXML(expansion) + "</x>");
|
||||
}
|
||||
log.println("/>");
|
||||
log.print(Utility.quoteXML(chr));
|
||||
log.print("</" + XML_RELATION_NAMES[relation] + ">");
|
||||
} else {
|
||||
if (reset.length() != 0) log.println(reset);
|
||||
if (reset.length() != 0) log.println("& " + reset
|
||||
+ (resetComment.length() != 0 ? "\t\t# " + resetComment : ""));
|
||||
log.print(RELATION_NAMES[relation] + " " + quoteOperand(chr));
|
||||
if (expansion.length() > 0) log.print(" / " + quoteOperand(expansion));
|
||||
if (option == WITH_NAMES) {
|
||||
@ -1461,7 +1540,7 @@ public class WriteCollationData implements UCD_Types {
|
||||
}
|
||||
|
||||
static long getPrimary(int[] ces) {
|
||||
if (UCA.isImplicitCE(ces[0])) {
|
||||
if (UCA.isImplicitLeadCE(ces[0])) {
|
||||
return (UCA.getPrimary(ces[0]) << 16) + UCA.getPrimary(ces[1]);
|
||||
} else {
|
||||
return UCA.getPrimary(ces[0]);
|
||||
@ -1469,7 +1548,7 @@ public class WriteCollationData implements UCD_Types {
|
||||
}
|
||||
|
||||
static long getSecondary(int[] ces) {
|
||||
if (UCA.isImplicitCE(ces[0])) {
|
||||
if (UCA.isImplicitLeadCE(ces[0])) {
|
||||
return (UCA.getSecondary(ces[0]) << 16) + UCA.getSecondary(ces[1]);
|
||||
} else {
|
||||
return UCA.getSecondary(ces[0]);
|
||||
@ -1477,36 +1556,42 @@ public class WriteCollationData implements UCD_Types {
|
||||
}
|
||||
|
||||
static long getTertiary(int[] ces) {
|
||||
if (UCA.isImplicitCE(ces[0])) {
|
||||
if (UCA.isImplicitLeadCE(ces[0])) {
|
||||
return (UCA.getTertiary(ces[0]) << 16) + UCA.getTertiary(ces[1]);
|
||||
} else {
|
||||
return UCA.getTertiary(ces[0]);
|
||||
}
|
||||
}
|
||||
|
||||
static final int
|
||||
PRIMARY_DIFF = 0,
|
||||
SECONDARY_DIFF = 1,
|
||||
TERTIARY_DIFF = 2,
|
||||
QUARTERNARY_DIFF = 3;
|
||||
|
||||
static int getStrengthDifference(int[] ces, int len, int[] lastCes, int lastLen) {
|
||||
|
||||
int relation = 3;
|
||||
int relation = QUARTERNARY_DIFF;
|
||||
if (getPrimary(ces) != getPrimary(lastCes)) {
|
||||
relation = 0;
|
||||
relation = PRIMARY_DIFF;
|
||||
} else if (getSecondary(ces) != getSecondary(lastCes)) {
|
||||
relation = 1;
|
||||
relation = SECONDARY_DIFF;
|
||||
} else if (getTertiary(ces) != getTertiary(lastCes)) {
|
||||
relation = 2;
|
||||
relation = TERTIARY_DIFF;
|
||||
} else if (len > lastLen) {
|
||||
relation = 2; // HACK
|
||||
relation = TERTIARY_DIFF; // HACK
|
||||
} else {
|
||||
int minLen = len < lastLen ? len : lastLen;
|
||||
int start = UCA.isImplicitCE(ces[0]) ? 2 : 1;
|
||||
int start = UCA.isImplicitLeadCE(ces[0]) ? 2 : 1;
|
||||
for (int kk = start; kk < minLen; ++kk) {
|
||||
int lc = lastCes[kk];
|
||||
int c = ces[kk];
|
||||
if (collator.getPrimary(c) != collator.getPrimary(lc)
|
||||
|| collator.getSecondary(c) != collator.getSecondary(lc)) {
|
||||
relation = 3; // reset relation on FIRST char, since differ anyway
|
||||
relation = QUARTERNARY_DIFF; // reset relation on FIRST char, since differ anyway
|
||||
break;
|
||||
} else if (collator.getTertiary(c) > collator.getTertiary(lc)) {
|
||||
relation = 2; // reset to tertiary (but later ce's might override!)
|
||||
relation = TERTIARY_DIFF; // reset to tertiary (but later ce's might override!)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1760,6 +1845,7 @@ public class WriteCollationData implements UCD_Types {
|
||||
static StringBuffer quoteOperandBuffer = new StringBuffer(); // faster
|
||||
|
||||
static final String quoteOperand(String s) {
|
||||
s = NFC.normalize(s);
|
||||
quoteOperandBuffer.setLength(0);
|
||||
boolean noQuotes = true;
|
||||
boolean inQuote = false;
|
||||
@ -1910,14 +1996,14 @@ public class WriteCollationData implements UCD_Types {
|
||||
// special handling for Jamo 3-byte forms
|
||||
|
||||
if (isOldJamo(primary)) {
|
||||
System.out.print("JAMO: " + Utility.hex(lastValue));
|
||||
if (DEBUG) System.out.print("JAMO: " + Utility.hex(lastValue));
|
||||
if ((lastValue & 0xFF0000) == 0) { // lastValue was 2-byte form
|
||||
subtotal += primaryDelta[primary]; // we convert from relative to absolute
|
||||
lastValue = primaryDelta[primary] = (subtotal << 8) + 0x10; // make 3 byte, leave gap
|
||||
} else { // lastValue was 3-byte form
|
||||
lastValue = primaryDelta[primary] = lastValue + 3;
|
||||
}
|
||||
System.out.println(" => " + Utility.hex(lastValue));
|
||||
if (DEBUG) System.out.println(" => " + Utility.hex(lastValue));
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -1945,12 +2031,17 @@ public class WriteCollationData implements UCD_Types {
|
||||
lastValue = primaryDelta[primary] = subtotal;
|
||||
}
|
||||
// fixup for Kanji
|
||||
/*
|
||||
|
||||
// WE DROP THIS: we are skipping all CJK values above, and will fix them separately
|
||||
|
||||
int fixedCompat = remapUCA_CompatibilityIdeographToCp(primary);
|
||||
if (isFixedIdeograph(fixedCompat)) {
|
||||
int CE = getImplicitPrimary(fixedCompat);
|
||||
|
||||
lastValue = primaryDelta[primary] = CE >>> 8;
|
||||
}
|
||||
*/
|
||||
//if ((primary & 0xFF) == 0) System.out.println(Utility.hex(primary) + " => " + hexBytes(primaryDelta[primary]));
|
||||
}
|
||||
|
||||
@ -1959,7 +2050,7 @@ public class WriteCollationData implements UCD_Types {
|
||||
|
||||
System.out.println("Sorting");
|
||||
Map ordered = new TreeMap();
|
||||
UCA.UCAContents ucac = collator.getContents(UCA.FIXED_CE, null);
|
||||
UCA.UCAContents ucac = collator.getContents(UCA.FIXED_CE, NFD);
|
||||
int ccounter = 0;
|
||||
while (true) {
|
||||
Utility.dot(ccounter++);
|
||||
@ -2043,6 +2134,11 @@ public class WriteCollationData implements UCD_Types {
|
||||
log.println("# - Differs from previous version in that MAX value was introduced at 1F.");
|
||||
log.println("# All tertiary values are shifted down by 1, filling the gap at 7!");
|
||||
|
||||
int firstImplicit = getImplicitPrimary(UCA.CJK_BASE) >>> 24;
|
||||
int lastImplicit = getImplicitPrimary(0x10FFFF) >>> 24;
|
||||
log.println("[FIRST_IMPLICIT= " + Utility.hex(firstImplicit) + "]");
|
||||
log.println("[LAST_IMPLICIT= " + Utility.hex(lastImplicit) + "]");
|
||||
|
||||
String lastChr = "";
|
||||
int lastNp = 0;
|
||||
boolean doVariable = false;
|
||||
@ -2091,27 +2187,37 @@ public class WriteCollationData implements UCD_Types {
|
||||
|
||||
oldStr.append(UCA.ceToString(ces[q]));// + "," + Integer.toString(ces[q],16);
|
||||
|
||||
// special hack for unsupported!
|
||||
// special treatment for unsupported!
|
||||
|
||||
if (pri >= UCA.UNSUPPORTED_BASE) {
|
||||
if (UCA.isImplicitLeadPrimary(pri)) {
|
||||
++q;
|
||||
oldStr.append(UCA.ceToString(ces[q]));// + "," + Integer.toString(ces[q],16);
|
||||
|
||||
int pri2 = UCA.getPrimary(ces[q]);
|
||||
// get old code point
|
||||
// pri = UNSUPPORTED_BASE + (bigChar >>> 15)
|
||||
// pri2 = (bigChar & 0x7FFF) | 0x8000
|
||||
pri -= UCA.UNSUPPORTED_BASE;
|
||||
pri <<= 15;
|
||||
pri2 &= 0x7FFF;
|
||||
pri += pri2;
|
||||
System.out.println("Unsupported: "
|
||||
+ Utility.hex(UCA.getPrimary(ces[q-1]))
|
||||
+ ", " + Utility.hex(UCA.getPrimary(ces[q]))
|
||||
+ ", " + Utility.hex(pri)
|
||||
+ ", " + Utility.hex(fixPrimary(pri) & 0xFFFFFFFFL)
|
||||
|
||||
int cp = UCA.ImplicitToCodePoint(pri, pri2);
|
||||
|
||||
// double check results!
|
||||
|
||||
int[] testImplicit = new int[2];
|
||||
UCA.CodepointToImplicit(cp, testImplicit);
|
||||
boolean gotError = pri != testImplicit[0] || pri2 != testImplicit[1];
|
||||
if (gotError) {
|
||||
System.out.println("ERROR");
|
||||
}
|
||||
if (DEBUG || gotError) {
|
||||
System.out.println("Computing Unsupported CP as: "
|
||||
+ Utility.hex(pri)
|
||||
+ ", " + Utility.hex(pri2)
|
||||
+ " => " + Utility.hex(cp)
|
||||
+ " => " + Utility.hex(testImplicit[0])
|
||||
+ ", " + Utility.hex(testImplicit[1])
|
||||
// + ", " + Utility.hex(fixPrimary(pri) & 0xFFFFFFFFL)
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
pri = cp | MARK_CODE_POINT;
|
||||
}
|
||||
|
||||
if (sec != 0x20) {
|
||||
@ -2173,10 +2279,14 @@ public class WriteCollationData implements UCD_Types {
|
||||
summary.println();
|
||||
summary.println("# First Implicit: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0)));
|
||||
summary.println("# Last Implicit: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x10FFFF)));
|
||||
summary.println("# First CJK: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x4E00)));
|
||||
summary.println("# Last CJK: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0xFA2F)));
|
||||
summary.println("# First CJK_A: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x3400)));
|
||||
summary.println("# Last CJK: " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(0x4DBF)));
|
||||
|
||||
boolean lastOne = false;
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
boolean thisOne = isFixedIdeograph(i);
|
||||
boolean thisOne = UCA.isCJK(i) || UCA.isCJK_AB(i);
|
||||
if (thisOne != lastOne) {
|
||||
summary.println("# Implicit Cusp: CJK=" + lastOne + ": " + Utility.hex(i-1) + " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(i-1)));
|
||||
summary.println("# Implicit Cusp: CJK=" + thisOne + ": " + Utility.hex(i) + " => " + Utility.hex(0xFFFFFFFFL & getImplicitPrimary(i)));
|
||||
@ -2223,21 +2333,7 @@ public class WriteCollationData implements UCD_Types {
|
||||
summary.close();
|
||||
}
|
||||
|
||||
// CONSTANTS
|
||||
|
||||
static final int
|
||||
HAN_START = 0x3400,
|
||||
HAN_LIMIT = 0xA000,
|
||||
SUPPLEMENTARY_COUNT = 0x100000,
|
||||
BYTES_TO_AVOID = 3,
|
||||
OTHER_COUNT = 256 - BYTES_TO_AVOID,
|
||||
LAST_COUNT = OTHER_COUNT / 2,
|
||||
LAST_COUNT2 = (SUPPLEMENTARY_COUNT - 1) / (OTHER_COUNT * OTHER_COUNT) + 1, // last byte
|
||||
HAN_SHIFT = LAST_COUNT * OTHER_COUNT - HAN_START,
|
||||
IMPLICIT_BOUNDARY = 2 * OTHER_COUNT * LAST_COUNT + HAN_START,
|
||||
LAST2_MULTIPLIER = OTHER_COUNT / LAST_COUNT2;
|
||||
|
||||
|
||||
/*
|
||||
static boolean isFixedIdeograph(int cp) {
|
||||
return (0x3400 <= cp && cp <= 0x4DB5
|
||||
|| 0x4E00 <= cp && cp <= 0x9FA5
|
||||
@ -2246,6 +2342,7 @@ public class WriteCollationData implements UCD_Types {
|
||||
|| 0x2F800 <= cp && cp <= 0x2FA1D // compat: most of these decompose anyway
|
||||
);
|
||||
}
|
||||
*/
|
||||
/*
|
||||
3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
|
||||
4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
|
||||
@ -2258,6 +2355,7 @@ public class WriteCollationData implements UCD_Types {
|
||||
2FA1D;CJK COMPATIBILITY IDEOGRAPH-2FA1D;Lo;0;L;2A600;;;;N;;;;;
|
||||
*/
|
||||
|
||||
/*
|
||||
static int remapUCA_CompatibilityIdeographToCp(int cp) {
|
||||
switch (cp) {
|
||||
case 0x9FA6: return 0xFA0E; // FA0E ; [.9FA6.0020.0002.FA0E] # CJK COMPATIBILITY IDEOGRAPH-FA0E
|
||||
@ -2275,6 +2373,45 @@ public class WriteCollationData implements UCD_Types {
|
||||
}
|
||||
return cp;
|
||||
}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Function used to collapse the two different Han blocks from UCA into one.
|
||||
* It does this by reversing the order of the two groups A and B below.
|
||||
* A:
|
||||
* 4E00..9FFF; CJK Unified Ideographs
|
||||
* F900..FAFF; CJK Compatibility Ideographs
|
||||
* B:
|
||||
* 3400..4DBF; CJK Unified Ideographs Extension A
|
||||
* As long as
|
||||
* no new B characters are allocated between 4E00 and FAFF, and
|
||||
* no new A characters are outside of this range,
|
||||
* (very high probability) this simple code will work.
|
||||
*/
|
||||
static int swapCJK(int i) {
|
||||
if (i >= UCA.CJK_LIMIT_COMPAT_USED) return i;
|
||||
if (i >= UCA.CJK_BASE) return i - UCA.CJK_BASE;
|
||||
return i + (UCA.CJK_LIMIT_COMPAT_USED - UCA.CJK_BASE);
|
||||
}
|
||||
|
||||
// CONSTANTS
|
||||
|
||||
static final int
|
||||
BYTES_TO_AVOID = 3,
|
||||
OTHER_COUNT = 256 - BYTES_TO_AVOID,
|
||||
LAST_COUNT = OTHER_COUNT / 2,
|
||||
LAST_COUNT2 = OTHER_COUNT / 16, // room for intervening, without expanding to 5 bytes
|
||||
IMPLICIT_3BYTE_COUNT = 1,
|
||||
IMPLICIT_BASE_BYTE = 0xE0,
|
||||
|
||||
IMPLICIT_LIMIT_BYTE = IMPLICIT_BASE_BYTE + 4, // leave room for 1 3-byte and 2 4-byte forms
|
||||
|
||||
IMPLICIT_4BYTE_BOUNDARY = IMPLICIT_3BYTE_COUNT * OTHER_COUNT * LAST_COUNT,
|
||||
LAST_MULTIPLIER = OTHER_COUNT / LAST_COUNT,
|
||||
LAST2_MULTIPLIER = OTHER_COUNT / LAST_COUNT2,
|
||||
IMPLICIT_BASE_3BYTE = (IMPLICIT_BASE_BYTE << 24) + 0x030300,
|
||||
IMPLICIT_BASE_4BYTE = ((IMPLICIT_BASE_BYTE + IMPLICIT_3BYTE_COUNT) << 24) + 0x030303
|
||||
;
|
||||
|
||||
// GET IMPLICIT PRIMARY WEIGHTS
|
||||
// Return value is left justified primary key
|
||||
@ -2287,22 +2424,62 @@ public class WriteCollationData implements UCD_Types {
|
||||
// Three byte forms use lead byte IMPLICIT_BASE_BYTE (E0 xx xx), with a final-byte step of LAST_MULTIPLIER == 2 (a gap of 1)
// Four byte forms start at lead byte IMPLICIT_BASE_BYTE + IMPLICIT_3BYTE_COUNT (E1 xx xx xx), with a final-byte step of LAST2_MULTIPLIER == 16
|
||||
|
||||
int last0 = cp - IMPLICIT_BOUNDARY;
|
||||
int hanFixup = 0;
|
||||
if (isFixedIdeograph(cp)) hanFixup = 0x04000000;
|
||||
if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
|
||||
|
||||
if (!UCA.isCJK(cp) && !UCA.isCJK_AB(cp)) cp += 0x10FFFF; // space everything else after CJK
|
||||
|
||||
if (DEBUG) System.out.println("Remapped: " + Utility.hex(cp));
|
||||
|
||||
cp = swapCJK(cp);
|
||||
|
||||
if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
|
||||
// we now have a range of numbers from 0 to 21FFFF.
|
||||
|
||||
int last0 = cp - IMPLICIT_4BYTE_BOUNDARY;
|
||||
if (last0 < 0) {
|
||||
cp += HAN_SHIFT; // shift so HAN shares single block
|
||||
int last1 = cp / LAST_COUNT;
|
||||
last0 = cp % LAST_COUNT;
|
||||
|
||||
int last2 = last1 / OTHER_COUNT;
|
||||
last1 %= OTHER_COUNT;
|
||||
return 0xEC030300 - hanFixup + (last2 << 24) + (last1 << 16) + (last0 << 9);
|
||||
|
||||
if (DEBUG || last2 > 0xFF-BYTES_TO_AVOID) System.out.println("3B: " + Utility.hex(cp) + " => "
|
||||
+ Utility.hex(last2) + ", "
|
||||
+ Utility.hex(last1) + ", "
|
||||
+ Utility.hex(last0) + ", "
|
||||
);
|
||||
|
||||
return IMPLICIT_BASE_3BYTE + (last2 << 24) + (last1 << 16) + ((last0*LAST_MULTIPLIER) << 8);
|
||||
} else {
|
||||
int last1 = last0 / LAST_COUNT2;
|
||||
last0 %= LAST_COUNT2;
|
||||
|
||||
int last2 = last1 / OTHER_COUNT;
|
||||
last1 %= OTHER_COUNT;
|
||||
return 0xEF030303 - hanFixup + (last2 << 16) + (last1 << 8) + (last0 * LAST2_MULTIPLIER);
|
||||
|
||||
int last3 = last2 / OTHER_COUNT;
|
||||
last2 %= OTHER_COUNT;
|
||||
|
||||
if (DEBUG || last3 > 0xFF-BYTES_TO_AVOID) System.out.println("4B: " + Utility.hex(cp) + " => "
|
||||
+ Utility.hex(last3) + ", "
|
||||
+ Utility.hex(last2) + ", "
|
||||
+ Utility.hex(last1) + ", "
|
||||
+ Utility.hex(last0 * LAST2_MULTIPLIER) + ", "
|
||||
);
|
||||
|
||||
return IMPLICIT_BASE_4BYTE + (last3 << 24) + (last2 << 16) + (last1 << 8) + (last0 * LAST2_MULTIPLIER);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void showImplicit(String title, int cp) {
|
||||
if (DEBUG) {
|
||||
System.out.println(title + "-1: " + Utility.hex(cp-1) + " => "
|
||||
+ Utility.hex(0xFFFFFFFFL & getImplicitPrimary(cp-1)));
|
||||
System.out.println(title + ": " + Utility.hex(cp) + " => "
|
||||
+ Utility.hex(0xFFFFFFFFL & getImplicitPrimary(cp)));
|
||||
System.out.println(title + "+1: " + Utility.hex(cp+1) + " => "
|
||||
+ Utility.hex(0xFFFFFFFFL & getImplicitPrimary(cp+1)));
|
||||
}
|
||||
}
|
||||
|
||||
@ -2311,35 +2488,65 @@ public class WriteCollationData implements UCD_Types {
|
||||
static void checkImplicit() {
|
||||
long oldPrimary = 0;
|
||||
System.out.println("Starting Implicit Check");
|
||||
int mask = ~0x04000000;
|
||||
for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
long newPrimary = 0xFFFFFFFFL & getImplicitPrimary(i);
|
||||
|
||||
// test correct values
|
||||
|
||||
if ((newPrimary & mask) < (oldPrimary & mask)) {
|
||||
throw new IllegalArgumentException(Utility.hex(i) + ": overlap: " + Utility.hex(oldPrimary) + " > " + Utility.hex(newPrimary));
|
||||
}
|
||||
|
||||
long b0 = (newPrimary >> 24) & 0xFF;
|
||||
long b1 = (newPrimary >> 16) & 0xFF;
|
||||
long b2 = (newPrimary >> 8) & 0xFF;
|
||||
long b3 = newPrimary & 0xFF;
|
||||
|
||||
if (b0 < 0xE8 || b0 > 0xEF || b1 < 3 || b2 < 3 || b3 == 1 || b3 == 2) {
|
||||
throw new IllegalArgumentException(Utility.hex(i) + ": illegal byte value: " + Utility.hex(newPrimary)
|
||||
+ ", " + Utility.hex(b1) + ", " + Utility.hex(b2) + ", " + Utility.hex(b3));
|
||||
}
|
||||
|
||||
// print range to look at
|
||||
|
||||
if (false) {
|
||||
int b = i & 0xFF;
|
||||
if (b == 255 || b == 0 || b == 1) {
|
||||
System.out.println(Utility.hex(i) + " => " + Utility.hex(newPrimary));
|
||||
}
|
||||
}
|
||||
oldPrimary = newPrimary;
|
||||
|
||||
showImplicit("# First CJK", UCA.CJK_BASE);
|
||||
showImplicit("# Last CJK", UCA.CJK_LIMIT-1);
|
||||
showImplicit("# First CJK-compat", UCA.CJK_BASE_COMPAT_USED);
|
||||
showImplicit("# Last CJK-compat", UCA.CJK_LIMIT_COMPAT_USED-1);
|
||||
showImplicit("# First CJK_A", UCA.CJK_A_BASE);
|
||||
showImplicit("# Last CJK_A", UCA.CJK_A_LIMIT-1);
|
||||
showImplicit("# First CJK_B", UCA.CJK_B_BASE);
|
||||
showImplicit("# Last CJK_B", UCA.CJK_B_LIMIT-1);
|
||||
showImplicit("# First Other Implicit", 0);
|
||||
showImplicit("# Last Other Implicit", 0x10FFFF);
|
||||
showImplicit("# Boundary", IMPLICIT_4BYTE_BOUNDARY);
|
||||
|
||||
|
||||
int oldChar = -1;
|
||||
for (int batch = 0; batch < 3; ++batch) {
|
||||
for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
|
||||
// separate the three groups
|
||||
|
||||
if (UCA.isCJK(i)) {
|
||||
if (batch != 0) continue;
|
||||
} else if (UCA.isCJK_AB(i)) {
|
||||
if (batch != 1) continue;
|
||||
} else if (batch != 2) continue;
|
||||
|
||||
long newPrimary = 0xFFFFFFFFL & getImplicitPrimary(i);
|
||||
|
||||
// test correct values
|
||||
|
||||
|
||||
if ((newPrimary) < (oldPrimary)) {
|
||||
throw new IllegalArgumentException(Utility.hex(i) + ": overlap: "
|
||||
+ Utility.hex(oldChar) + ", " + Utility.hex(oldPrimary)
|
||||
+ Utility.hex(i) + ", " + " > " + Utility.hex(newPrimary));
|
||||
}
|
||||
|
||||
|
||||
long b0 = (newPrimary >> 24) & 0xFF;
|
||||
long b1 = (newPrimary >> 16) & 0xFF;
|
||||
long b2 = (newPrimary >> 8) & 0xFF;
|
||||
long b3 = newPrimary & 0xFF;
|
||||
|
||||
if (b0 < IMPLICIT_BASE_BYTE || b0 >= IMPLICIT_LIMIT_BYTE || b1 < 3 || b2 < 3 || b3 == 1 || b3 == 2) {
|
||||
throw new IllegalArgumentException(Utility.hex(i) + ": illegal byte value: " + Utility.hex(newPrimary)
|
||||
+ ", " + Utility.hex(b1) + ", " + Utility.hex(b2) + ", " + Utility.hex(b3));
|
||||
}
|
||||
|
||||
// print range to look at
|
||||
|
||||
if (false) {
|
||||
int b = i & 0xFF;
|
||||
if (b == 255 || b == 0 || b == 1) {
|
||||
System.out.println(Utility.hex(i) + " => " + Utility.hex(newPrimary));
|
||||
}
|
||||
}
|
||||
oldPrimary = newPrimary;
|
||||
oldChar = i;
|
||||
}
|
||||
}
|
||||
System.out.println("Successful Implicit Check!!");
|
||||
}
|
||||
@ -2448,16 +2655,12 @@ public class WriteCollationData implements UCD_Types {
|
||||
|
||||
|
||||
static final int secondaryDoubleStart = 0xD0;
|
||||
static final int MARK_CODE_POINT = 0x40000000;
|
||||
|
||||
static int fixPrimary(int x) {
|
||||
int result = 0;
|
||||
if (x <= 0xFFFF) result = primaryDelta[x];
|
||||
else result = getImplicitPrimary(x);
|
||||
|
||||
/*if (x > 0x3400) {
|
||||
System.out.println(Utility.hex(x) + " => " + Utility.hex(result));
|
||||
}
|
||||
*/
|
||||
if ((x & MARK_CODE_POINT) != 0) result = getImplicitPrimary(x & ~MARK_CODE_POINT);
|
||||
else result = primaryDelta[x];
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -2898,7 +3101,7 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;;
|
||||
0xE0000, 0xEFFFD, 0xEFFFE, 0xEFFFF,
|
||||
0xF0000, 0xFFFFD, 0xFFFFE, 0xFFFFF,
|
||||
0x100000, 0x10FFFD, 0x10FFFE, 0x10FFFF,
|
||||
IMPLICIT_BOUNDARY, IMPLICIT_BOUNDARY-1, IMPLICIT_BOUNDARY+1,
|
||||
IMPLICIT_4BYTE_BOUNDARY, IMPLICIT_4BYTE_BOUNDARY-1, IMPLICIT_4BYTE_BOUNDARY+1,
|
||||
};
|
||||
|
||||
static final int MARK = 1;
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
|
||||
* $Date: 2002/05/31 01:41:04 $
|
||||
* $Revision: 1.16 $
|
||||
* $Date: 2002/06/02 05:07:08 $
|
||||
* $Revision: 1.17 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -535,6 +535,9 @@ public final class Utility { // COMMON UTILITIES
|
||||
return openPrintWriter(filename, true, true);
|
||||
}
|
||||
|
||||
// Normally use false, false.
|
||||
// But for UCD files use true, true
|
||||
// Or if they are UTF8, use true, false
|
||||
public static PrintWriter openPrintWriter(String filename, boolean removeCR, boolean latin1) throws IOException {
|
||||
File file = new File(getOutputName(filename));
|
||||
System.out.println("Creating File: " + file);
|
||||
|
Loading…
Reference in New Issue
Block a user