/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
* $Date: 2006/06/08 18:16:40 $
* $Revision: 1.26 $
*
*******************************************************************************
*/
package com.ibm.text.UCA;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.BufferedReader;
import java.io.Reader;
import java.io.PrintWriter;
import java.io.FileReader;
import java.text.MessageFormat;
import java.io.IOException;
import com.ibm.text.UCD.Normalizer;
import com.ibm.text.UCD.UCD;
import com.ibm.text.utility.*;
import com.ibm.text.UCD.UnifiedBinaryProperty;
import com.ibm.text.UCD.UCDProperty;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
//import com.ibm.text.CollationData.*;
/**
* Collator is a working version of UTR#10 Unicode Collation Algorithm,
* as described on http://www.unicode.org/unicode/reports/tr10/
* @author Mark Davis
*
* It is not optimized, although it does use some techniques that are required for a real
* optimization, such as squeezing all the weights into 32 bits.
*
* Invariants relied upon by the algorithm:
*
* UCA Data:
*   1. While it contains secondaries greater than 0xFF,
*      these can be folded down by subtracting 0xC0--without collision--to be less than 0xFF
*   2. Tertiary values are less than 0x80
*   3. Contracting characters must be "completed": if "abcd" is a contracting character,
*      then "abc" is also.
*   4. Variables (marked with *) have a distinct, closed range of primaries.
*      That is, there are no variable CEs X, Z and non-ignorable CE Y such that X[1] <= Y[1] <= Z[1]
*   5. It needs to be fixed when reading: only non-zero weights (levels 1-3) are really variable!
*
* #4 saves a bit in each CE.
*
* Limits:
*   1. There is a limit on the number of expanding characters. If N is the number of expanding
*      characters, then their total lengths must be less than 65536-N. This should never pose a
*      problem in practice.
*   2. If any of the weight limits are reached (FFFF for primary, FF for secondary, tertiary),
*      expanding characters can be used to achieve the right results, as discussed in UTR#10.
*
* Remarks:
*   Neither the old 14651 nor the old UCA algorithms for backwards really worked.
*   This is because of shared characters between scripts with different directions,
*   like French with Arabic or Greek.
*/
final public class UCA implements Comparator, UCA_Types {
public static final String copyright =
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
public int compare(Object a, Object b) {
return getSortKey((String) a).compareTo(getSortKey((String) b));
}
/**
* Records the codeversion
*/
private static final String codeVersion = "7";
// base directory will change depending on the installation
public static final String BASE_DIR = "c:\\DATA\\";
// =============================================================
// Test Settings
// =============================================================
static final boolean DEBUG = false;
static final boolean DEBUG_SHOW_LINE = false;
static final boolean SHOW_STATS = true;
static final boolean SHOW_CE = false;
static final boolean CHECK_UNIQUE = false;
static final boolean CHECK_UNIQUE_EXPANSIONS = false; // only effective if CHECK_UNIQUE
static final boolean CHECK_UNIQUE_VARIABLES = false; // only effective if CHECK_UNIQUE
static final boolean TEST_BACKWARDS = false;
static final boolean RECORDING_DATA = false;
static final boolean RECORDING_CHARS = true;
private UCD ucd;
private UCA_Data ucaData;
// =============================================================
// Main Methods
// =============================================================
private String fileVersion = "??";
/**
* Initializes the collation from a stream of rules in the normal format.
* If the source is null, uses the normal Unicode data files, which
* need to be in BASE_DIR.
*/
public UCA(String sourceFile, String unicodeVersion) throws java.io.IOException {
fullData = sourceFile == null;
fileVersion = sourceFile;
// load the normalizer
if (toD == null) {
toD = new Normalizer(Normalizer.NFD, unicodeVersion);
}
ucd = UCD.make(unicodeVersion);
ucdVersion = ucd.getVersion();
ucaData = new UCA_Data(toD, ucd);
// either get the full sources, or just a demo set
/* if (fullData) {
for (int i = 0; i < KEYS.length; ++i) {
BufferedReader in = new BufferedReader(
new FileReader(KEYS[i]), BUFFER_SIZE);
addCollationElements(in);
in.close();
}
} else */
{
BufferedReader in = new BufferedReader(
new FileReader(sourceFile), BUFFER_SIZE);
addCollationElements(in);
in.close();
}
cleanup();
}
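// Illustrative usage (a sketch, not part of the original code; the file name and
// version string below are assumptions -- any allkeys-format UCA data file will do):
//
//     UCA collator = new UCA(BASE_DIR + "UCA\\allkeys.txt", "4.0.0");
//     String key = collator.getSortKey("abc");
//     System.out.println(UCA.toString(key));   // prints the key with levels separated by '|'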
/**
* Constructs a sort key for a string of input Unicode characters. Uses
* default values for alternate and decomposition.
* @param sourceString string to make a sort key for.
* @return Result is a String, not really of Unicodes, but of weights.
* String is just a handy way of returning them in Java, since there are no
* unsigned shorts.
*/
public String getSortKey(String sourceString) {
return getSortKey(sourceString, defaultAlternate, defaultDecomposition);
}
/**
* Constructs a sort key for a string of input Unicode characters. Uses
* default value decomposition.
* @param sourceString string to make a sort key for.
* @param alternate choice of different 4th level weight construction
* @return Result is a String, not really of Unicodes, but of weights.
* String is just a handy way of returning them in Java, since there are no
* unsigned shorts.
*/
public String getSortKey(String sourceString, byte alternate) {
return getSortKey(sourceString, alternate, defaultDecomposition);
}
/**
* Constructs a sort key for a string of input Unicode characters.
* @param sourceString string to make a sort key for.
* @param alternate choice of different 4th level weight construction
* @param decomposition true for UCA, false where the text is guaranteed to be
* normalization form C with no combining marks of class 0.
* @return Result is a String, not really of Unicodes, but of weights.
* String is just a handy way of returning them in Java, since there are no
* unsigned shorts.
*/
public String getSortKey(String sourceString, byte alternate, boolean decomposition) {
decompositionBuffer.setLength(0);
if (decomposition) {
toD.normalize(sourceString, decompositionBuffer);
} else {
decompositionBuffer.append(sourceString);
}
storedDecomposition = decomposition; // record the setting for other methods
index = 0; // position in source string
// Weight strings - not chars, weights.
primaries.setLength(0); // clear out
secondaries.setLength(0); // clear out
tertiaries.setLength(0); // clear out
quaternaries.setLength(0); // clear out
if (SHOW_CE) debugList.setLength(0); // clear out
rearrangeBuffer = EMPTY; // clear the rearrange buffer (thai)
hangulBufferPosition = 0; // clear hangul buffer
hangulBuffer.setLength(0); // clear hangul buffer
char weight4 = '\u0000'; // DEFAULT FOR NON_IGNORABLE
boolean lastWasVariable = false;
// process CEs, building weight strings
while (true) {
//fixQuaternatiesPosition = quaternaries.length();
int ce = getCE();
if (ce == TERMINATOR) break;
if (ce == 0) continue;
switch (alternate) {
case ZEROED:
if (isVariable(ce)) {
ce = 0;
}
break;
case SHIFTED_TRIMMED:
case SHIFTED:
if (ce == 0) {
weight4 = 0;
} else if (isVariable(ce)) { // variables
weight4 = getPrimary(ce);
lastWasVariable = true;
ce = 0;
} else if (lastWasVariable && getPrimary(ce) == 0) { // zap trailing ignorables
ce = 0;
weight4 = 0;
} else { // above variables
lastWasVariable = false;
weight4 = '\uFFFF';
}
break;
// case NON_IGNORABLE: // doesn't ever change!
}
if (SHOW_CE) {
if (debugList.length() != 0) debugList.append("/");
debugList.append(CEList.toString(ce));
}
// add weights
char w = getPrimary(ce);
if (DEBUG) System.out.println("\tCE: " + Utility.hex(ce));
if (w != 0) {
primaries.append(w);
}
w = getSecondary(ce);
if (w != 0) {
if (!useBackwards) {
secondaries.append(w);
} else {
secondaries.insert(0, w);
}
}
w = getTertiary(ce);
if (w != 0) {
tertiaries.append(w);
}
if (weight4 != 0) {
quaternaries.append(weight4);
}
}
// Produce weight strings
// For simplicity, we use the strength setting here.
// To optimize, we wouldn't actually generate the weights in the first place.
StringBuffer result = primaries;
if (strength >= 2) {
result.append(LEVEL_SEPARATOR); // separator
result.append(secondaries);
if (strength >= 3) {
result.append(LEVEL_SEPARATOR); // separator
result.append(tertiaries);
if (strength >= 4) {
result.append(LEVEL_SEPARATOR); // separator
if (alternate == SHIFTED_TRIMMED) {
int q;
for (q = quaternaries.length()-1; q >= 0; --q) {
if (quaternaries.charAt(q) != '\uFFFF') {
break;
}
}
quaternaries.setLength(q+1);
}
result.append(quaternaries);
//appendInCodePointOrder(decompositionBuffer, result);
}
}
}
return result.toString();
}
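// Note on the result (descriptive comment, added for clarity): the sort key is the
// primaries, then LEVEL_SEPARATOR (0000), then the secondaries, and so on for as many
// levels as the current strength allows. Because the separator is lower than any
// appended weight (zero weights are skipped above), a plain binary compare of two keys
// examines all primary weights before any secondary weight, giving the multi-level order.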
// 0 ==
// 2, -2 quaternary
// 3, -3 tertiary
// 4, -4 secondary
// 5, -5 primary
public static int strengthDifference(String sortKey1, String sortKey2) {
int len1 = sortKey1.length();
int len2 = sortKey2.length();
int minLen = len1 < len2 ? len1 : len2;
int strength = 5;
for (int i = 0; i < minLen; ++i) {
char c1 = sortKey1.charAt(i);
char c2 = sortKey2.charAt(i);
if (c1 < c2) return -strength;
if (c1 > c2) return strength;
if (c1 == LEVEL_SEPARATOR) --strength; // Separator!
}
if (len1 < len2) return -strength;
if (len1 > len2) return strength;
return 0;
}
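// Example (illustrative): if the first differing position comes before any
// LEVEL_SEPARATOR, the keys differ at the primary level and the result is +/-5;
// after one separator it is +/-4 (secondary), then +/-3 (tertiary), +/-2 (quaternary),
// and 0 means the keys are identical.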
/**
* Turns backwards (e.g. for French) on globally for all secondaries
*/
public void setBackwards(boolean backwards) {
useBackwards = backwards;
}
/**
* Retrieves value applied by set.
*/
public boolean isBackwards() {
return useBackwards;
}
/**
* Sets the default decomposition setting: true for UCA, false where the text is guaranteed
* to be normalization form C with no combining marks of class 0.
*/
public void setDecompositionState(boolean state) {
defaultDecomposition = state;
}
/**
* Retrieves value applied by set.
*/
public boolean isDecomposed() {
return defaultDecomposition;
}
/**
* Sets the default alternate handling for variables (those with *): NON_IGNORABLE, ZEROED,
* SHIFTED, or SHIFTED_TRIMMED.
*/
public void setAlternate(byte status) {
defaultAlternate = status;
}
/**
* Retrieves value applied by set.
*/
public byte getAlternate() {
return defaultAlternate;
}
/**
* Sets the maximum strength level to be included in the string.
* E.g. with 3, only weights of 1, 2, and 3 are included: level 4 weights are discarded.
*/
public void setStrength(int inStrength) {
strength = inStrength;
}
/**
* Retrieves value applied by set.
*/
public int getStrength() {
return strength;
}
/**
* Retrieves version
*/
public String getCodeVersion() {
return codeVersion;
}
/**
* Retrieves versions
*/
public String getDataVersion() {
return dataVersion;
}
/**
* Retrieves versions
*/
public String getUCDVersion() {
return ucdVersion;
}
public static String codePointOrder(String s) {
return appendInCodePointOrder(s, new StringBuffer()).toString();
}
/**
* Appends UTF-16 string
* with the values swapped around so that they compare in
* code-point order. Replace 0000 and 0001 by 0001 0001/2
* @param source Normal UTF-16 (Java) string
* @return sort key (as string)
* @author Markus Scherer (cast into Java by MD)
* NOTE: changed to be longer, but handle isolated surrogates
*/
public static StringBuffer appendInCodePointOrder(String source, StringBuffer target) {
int cp;
for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(source, i);
target.append((char)((cp >> 15) | 0x8000));
target.append((char)(cp | 0x8000));
/*
if (ch <= 1) { // hack to avoid nulls
target.append('\u0001');
target.append((char)(ch+1));
}
target.append((char)(ch + utf16CodePointOrder[ch>>11]));
*/
}
return target;
}
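// Worked examples of the transform above (illustrative):
//     U+0041   -> (0x0041 >> 15) | 0x8000 = 0x8000,   (char)(0x0041 | 0x8000)   = 0x8041
//     U+10FFFF -> (0x10FFFF >> 15) | 0x8000 = 0x8021, (char)(0x10FFFF | 0x8000) = 0xFFFF
// Every output unit has its high bit set (so none is 0000), and the pairs of units
// compare in code point order.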
/**
* Returns the list of CEs for a string.
* @param sourceString string to get CEs for.
* @param decomposition true for UCA, false where the text is guaranteed to be
* normalization form C with no combining marks of class 0.
* @param output stack onto which the CEs are pushed.
*/
public void getCEs(String sourceString, boolean decomposition, IntStack output) {
decompositionBuffer.setLength(0);
if (decomposition) {
toD.normalize(sourceString, decompositionBuffer);
} else {
decompositionBuffer.append(sourceString);
}
rearrangeBuffer = EMPTY; // clear the rearrange buffer (thai)
index = 0;
// process CEs, building weight strings
while (true) {
//fixQuaternatiesPosition = quaternaries.length();
int ce = getCE();
if (ce == 0) continue;
if (ce == TERMINATOR) break;
output.push(ce);
}
}
/**
* Returns the list of CEs for a string.
* @param sourceString string to get CEs for.
* @param decomposition true for UCA, false where the text is guaranteed to be
* normalization form C with no combining marks of class 0.
* @param output array for output. Must be large enough on entry.
* @return count of CEs
*/
public int getCEs(String sourceString, boolean decomposition, int[] output) {
decompositionBuffer.setLength(0);
if (decomposition) {
toD.normalize(sourceString, decompositionBuffer);
} else {
decompositionBuffer.append(sourceString);
}
rearrangeBuffer = EMPTY; // clear the rearrange buffer (thai)
index = 0;
int outpos = 0;
output[0] = 0; // just in case!!
// process CEs, building weight strings
while (true) {
//fixQuaternatiesPosition = quaternaries.length();
int ce = getCE();
if (ce == 0) continue;
if (ce == TERMINATOR) break;
output[outpos++] = ce;
}
return outpos;
}
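// Illustrative usage (a sketch, not part of the original code):
//     int[] ces = new int[32];                         // must be large enough
//     int len = collator.getCEs("a\u0300", true, ces);
//     for (int i = 0; i < len; ++i) {
//         System.out.println(CEList.toString(ces[i])); // same helper used for SHOW_CE above
//     }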
/**
* Returns a CEList for a string.
* @param sourceString string to get CEs for.
* @param decomposition true for UCA, false where the text is guaranteed to be
* normalization form C with no combining marks of class 0.
* @return the CEList
*/
public CEList getCEList(String sourceString, boolean decomposition) {
int len;
while (true) {
try {
len = getCEs(sourceString, decomposition, ceListBuffer);
break;
} catch (ArrayIndexOutOfBoundsException e) {
ceListBuffer = new int[ceListBuffer.length * 2];
}
}
return new CEList(ceListBuffer, 0, len);
}
int[] ceListBuffer = new int[30]; // temporary storage, to avoid multiple creation
/**
* Get Usage
*/
public BitSet getWeightUsage(int strength) {
return strength == 1 ? primarySet : strength == 2 ? secondarySet : tertiarySet;
}
/**
* Returns the char associated with a FIXED value
*/
/*public char charFromFixed(int ce) {
return getPrimary(ce);
}
*/
/**
* Return the type of the CE
*/
public byte getCEType(int ch) {
return ucaData.getCEType(ch);
}
/**
* Utility, used to get the primary weight from a 32-bit CE
* The primary is 16 bits, stored in b31..b16
*/
public static char getPrimary(int ce) {
return (char)(ce >>> 16);
}
/**
* Utility, used to get the secondary weight from a 32-bit CE
* The secondary is 9 bits, stored in b15..b7
*/
public static char getSecondary(int ce) {
return (char)((ce >>> 7) & 0x1FF);
}
/**
* Utility, used to get the tertiary weight from a 32-bit CE
* The tertiary is 7 bits, stored in b6..b0
*/
public static char getTertiary(int ce) {
return (char)(ce & 0x7F);
}
/**
* Utility, used to determine whether a CE is variable or not.
*/
public boolean isVariable(int ce) {
return (variableLowCE <= ce && ce <= variableHighCE);
}
/**
* Utility, returns the low end of the variable CE range.
*/
public int getVariableLow() {
return variableLowCE;
}
/**
* Utility, returns the high end of the variable CE range.
*/
public int getVariableHigh() {
return variableHighCE;
}
/**
* Utility, used to make a CE from the pieces. They must already
* be in the right range of values.
*/
public static int makeKey(int primary, int secondary, int tertiary) {
return (primary << 16) | (secondary << 7) | tertiary;
}
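// CE bit layout implied by makeKey and the getters above (descriptive comment, added for clarity):
//     b31..b16  primary   (16 bits)
//     b15..b7   secondary ( 9 bits)
//     b6..b0    tertiary  ( 7 bits)
// For example (illustrative values), makeKey(0x1234, 0x20, 0x02) == 0x12341002, and
// getPrimary, getSecondary, and getTertiary recover 0x1234, 0x20, and 0x02 from it.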
// =============================================================
// Utility methods
// =============================================================
/**
* Produces a human-readable string for a sort key.
* The 0000 separator is replaced by a '|'
*/
static public String toString(String sortKey) {
StringBuffer result = new StringBuffer();
boolean needSep = false;
result.append("[");
for (int i = 0; i < sortKey.length(); ++i) {
char ch = sortKey.charAt(i);
if (needSep) result.append(" ");
if (ch == 0) {
result.append("|");
needSep = true;
} else {
result.append(Utility.hex(ch));
needSep = true;
}
}
result.append("]");
return result.toString();
}
/**
* Produces a human-readable string for a collation element.
* value is terminated by -1!
*/
/*
static public String ceToString(int[] ces, int len) {
StringBuffer result = new StringBuffer();
for (int i = 0; i < len; ++i) {
result.append(ceToString(ces[i]));
}
return result.toString();
}
*/
/**
* Produces a human-readable string for a collation element.
* value is terminated by -1!
*/
/*
static public String ceToString(int[] ces) {
StringBuffer result = new StringBuffer();
for (int i = 0; ; ++i) {
if (ces[i] == TERMINATOR) break;
result.append(ceToString(ces[i]));
}
return result.toString();
}
*/
static boolean isImplicitLeadCE(int ce) {
return isImplicitLeadPrimary(getPrimary(ce));
}
static boolean isImplicitLeadPrimary(int primary) {
return primary >= UNSUPPORTED_BASE && primary < UNSUPPORTED_LIMIT;
}
/*
The formula from the UCA:
BASE:
FB40 CJK Ideograph
FB80 CJK Ideograph Extension A/B
FBC0 Any other code point
AAAA = BASE + (CP >> 15);
BBBB = (CP & 0x7FFF) | 0x8000;
The mapping given to CP is then given by:
CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
*/
/**
* Computes the implicit weights for a code point: output[0] = AAAA (lead), output[1] = BBBB (trail).
*/
void CodepointToImplicit(int cp, int[] output) {
int base = UNSUPPORTED_OTHER_BASE;
if (ucd.isCJK_BASE(cp)) base = UNSUPPORTED_CJK_BASE;
else if (ucd.isCJK_AB(cp)) base = UNSUPPORTED_CJK_AB_BASE;
output[0] = base + (cp >>> 15);
output[1] = (cp & 0x7FFF) | 0x8000;
}
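// Worked example of the formula above (illustrative): for the CJK ideograph U+4E00,
//     AAAA = UNSUPPORTED_CJK_BASE + (0x4E00 >>> 15) = UNSUPPORTED_CJK_BASE + 0
//     BBBB = (0x4E00 & 0x7FFF) | 0x8000             = 0xCE00
// handleUnsupported then pushes [.BBBB.0000.0000.] and returns AAAA with the neutral
// secondary and tertiary weights, matching the CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
// mapping quoted above.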
/**
* Converts a pair of implicit weights (lead, trail) back into the original code point.
*/
static int ImplicitToCodePoint(int leadImplicit, int trailImplicit) {
// could probably optimize all this, but it is not worth it.
if (leadImplicit < UNSUPPORTED_BASE || leadImplicit >= UNSUPPORTED_LIMIT) {
throw new IllegalArgumentException("Lead implicit out of bounds: " + Utility.hex(leadImplicit));
}
if ((trailImplicit & 0x8000) == 0) {
throw new IllegalArgumentException("Trail implicit out of bounds: " + Utility.hex(trailImplicit));
}
int base;
if (leadImplicit >= UNSUPPORTED_OTHER_BASE) base = UNSUPPORTED_OTHER_BASE;
else if (leadImplicit >= UNSUPPORTED_CJK_AB_BASE) base = UNSUPPORTED_CJK_AB_BASE;
else base = UNSUPPORTED_CJK_BASE;
int result = ((leadImplicit - base) << 15) | (trailImplicit & 0x7FFF);
if (result > 0x10FFFF) {
throw new IllegalArgumentException("Resulting character out of bounds: "
+ Utility.hex(leadImplicit) + ", " + Utility.hex(trailImplicit)
+ " => " + result);
}
return result;
}
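// Round-trip example (illustrative), continuing the U+4E00 case above:
//     ImplicitToCodePoint(UNSUPPORTED_CJK_BASE, 0xCE00) == (0 << 15) | (0xCE00 & 0x7FFF) == 0x4E00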
/**
* Supplies a zero-padded hex representation of an integer (without 0x)
*/
/*
static public String hex(int i) {
String result = Long.toString(i & 0xFFFFFFFFL, 16).toUpperCase();
return "00000000".substring(result.length(),8) + result;
}
*/
/**
* Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
*/
/*
static public String hex(char i) {
String result = Integer.toString(i, 16).toUpperCase();
return "0000".substring(result.length(),4) + result;
}
*/
/**
* Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u)
*/
/*
static public String hex(byte b) {
int i = b & 0xFF;
String result = Integer.toString(i, 16).toUpperCase();
return "00".substring(result.length(),2) + result;
}
*/
/**
* Supplies a zero-padded hex representation of a Unicode String (without 0x, \\u)
*@param sep can be used to give a sequence, e.g. hex("ab", ",") gives "0061,0062"
*/
/*
static public String hex(String s, String sep) {
StringBuffer result = new StringBuffer();
for (int i = 0; i < s.length(); ++i) {
if (i != 0) result.append(sep);
result.append(hex(s.charAt(i)));
}
return result.toString();
}
*/
/**
* Supplies a zero-padded hex representation of a Unicode String (without 0x, \\u)
*@param sep can be used to give a sequence, e.g. hex("ab", ",") gives "0061,0062"
*/
/*
static public String hex(StringBuffer s, String sep) {
StringBuffer result = new StringBuffer();
for (int i = 0; i < s.length(); ++i) {
if (i != 0) result.append(sep);
result.append(hex(s.charAt(i)));
}
return result.toString();
}
*/
// =============================================================
// Privates
// =============================================================
IntStack expandingStack = new IntStack(10);
/**
* Array used to reorder surrogates to top of 16-bit range, and others down.
* Adds 2000 to D800..DFFF, making them F800..FFFF
* Subtracts 800 from E000..FFFF, making them D800..F7FF
*/
private static final int[] utf16CodePointOrder = {
0, 0, 0, 0, // 00, 08, 10, 18
0, 0, 0, 0, // 20, 28, 30, 38
0, 0, 0, 0, // 40, 48, 50, 58
0, 0, 0, 0, // 60, 68, 70, 78
0, 0, 0, 0, // 80, 88, 90, 98
0, 0, 0, 0, // A0, A8, B0, B8
0, 0, 0, 0x2000, // C0, C8, D0, D8
-0x800, -0x800, -0x800, -0x800 // E0, E8, F0, F8
};
/**
* NFD required
*/
private static Normalizer toD;
/**
* Records the dataversion
*/
public static final String BADVERSION = "Missing @version in data!!";
private String dataVersion = BADVERSION;
/**
* Records the UCD version
*/
private String ucdVersion = "?";
/**
* Turns backwards (e.g. for French) on globally for all secondaries
*/
private boolean useBackwards = false;
/**
* Choice of how to handle variables (those with *)
*/
private byte defaultAlternate = SHIFTED;
/**
* For testing
*/
private boolean defaultDecomposition = true;
/**
* Sets the maximum strength level to be included in the string.
* E.g. with 3, only weights of 1, 2, and 3 are included: level 4 weights are discarded.
*/
private int strength = 4;
/**
* Position in decompositionBuffer used when constructing sort key
*/
private int index;
/**
* List of files to use for constructing the CE data, used by build()
*/
/* private static final String[] KEYS = {
//"D:\\UnicodeData\\testkeys.txt",
BASE_DIR + "UCA\\allkeys" + VERSION + ".txt",
BASE_DIR + "UnicodeData\\Collation\\basekeys" + VERSION + ".txt",
BASE_DIR + "UnicodeData\\Collation\\compkeys" + VERSION + ".txt",
BASE_DIR + "UnicodeData\\Collation\\ctrckeys" + VERSION + ".txt",
};
*/
/**
* File buffer size, used to make reads faster.
*/
private static final int BUFFER_SIZE = 64*1024;
// =============================================================
// Collation Element Memory Data Table Formats
// =============================================================
/**
* Temporary buffer used in getSortKey for the decomposed string
*/
private StringBuffer decompositionBuffer = new StringBuffer();
// was 0xFFC20101;
/**
* We take advantage of the variables being in a closed range to save a bit per CE.
* The low and high values are initially set to be at the opposite ends of the range,
* as the table is built from the UCA data, they are narrowed in.
* The first three values are used in building; the last two in testing.
*/
private int variableLow = '\uFFFF';
private int nonVariableLow = '\uFFFF'; // HACK '\u089A';
private int variableHigh = '\u0000';
private int variableLowCE; // used for testing against
private int variableHighCE; // used for testing against
/*
private void fixSurrogateContraction(char ch) {
//if (DEBUGCHAR) System.out.println(Utility.hex(ch) + ": " + line.substring(0, position[0]) + "|" + line.substring(position[0]));
if (ch == NOT_A_CHAR || !UTF16.isLeadSurrogate(ch)) return;
String chs = String.valueOf(ch);
Object probe = contractingTable.get(chs);
if (probe != null) return;
contractingTable.put(chs, new Integer(UNSUPPORTED));
}
*/
/**
* Marks whether we are using the full data set, or an abbreviated version for
* an applet.
*/
private boolean fullData;
// =============================================================
// Temporaries used in getCE.
// Made part of the object to avoid reallocating each time.
// =============================================================
/**
* Temporary buffers used in getSortKey to store weights
* these are NOT strings of Unicode characters--they are
* lists of weights. But this is a convenient way to store them,
* since Java doesn't have unsigned shorts.
*/
private StringBuffer primaries = new StringBuffer(100);
private StringBuffer secondaries = new StringBuffer(100);
private StringBuffer tertiaries = new StringBuffer(100);
private StringBuffer quaternaries = new StringBuffer(100);
/**
* Temporary buffer used to collect progress data for debugging
*/
StringBuffer debugList = new StringBuffer(100);
/**
* Temporary with requested decomposition
*/
boolean storedDecomposition;
/**
* Used for supporting Thai rearrangement
*/
static final char EMPTY = '\uFFFF';
char rearrangeBuffer = EMPTY;
UnicodeSet rearrangeList = new UnicodeSet();
int hangulBufferPosition = 0;
StringBuffer hangulBuffer = new StringBuffer();
// =============================================================
// getCE: Get the next Collation Element
// Main Routine
// =============================================================
/**
* Gets the next Collation Element from the decomposition buffer.
* May take one or more characters.
* Resets index to point at the next position to get characters from.
* @return the next CE, or TERMINATOR when the decomposition buffer is exhausted.
*/
private int getCE() {
if (!expandingStack.isEmpty()) return expandingStack.popFront();
char ch;
// Fetch next character. Handle rearrangement for Thai, etc.
if (rearrangeBuffer != EMPTY) {
ch = rearrangeBuffer;
rearrangeBuffer = EMPTY;
} else if (hangulBufferPosition < hangulBuffer.length()) {
ch = hangulBuffer.charAt(hangulBufferPosition++);
if (hangulBufferPosition == hangulBuffer.length()) {
hangulBuffer.setLength(0);
hangulBufferPosition = 0;
}
} else {
if (index >= decompositionBuffer.length()) return TERMINATOR;
ch = decompositionBuffer.charAt(index++); // get next
if (rearrangeList.contains(ch) && index < decompositionBuffer.length()) {// if in list
rearrangeBuffer = ch; // store for later
ch = decompositionBuffer.charAt(index++); // never rearrange twice!!
}
}
index = ucaData.get(ch, decompositionBuffer, index, expandingStack);
int ce = expandingStack.popFront(); // pop first (guaranteed to exist!)
if (ce == UNSUPPORTED_FLAG) {
return handleUnsupported(ch);
}
return ce;
}
private int handleUnsupported(char ch) {
int bigChar = ch;
// Special check for Hangul
if (ucd.isHangulSyllable(bigChar)) {
// MUST DECOMPOSE!!
hangulBuffer = new StringBuffer();
decomposeHangul(bigChar, hangulBuffer);
return getCE();
// RECURSIVE!!!
}
// special check and fix for unsupported surrogate pair, 20 1/8 bits
if (0xD800 <= bigChar && bigChar <= 0xDFFF) {
// ignore unmatched surrogates (e.g. return zero)
if (bigChar >= 0xDC00 || index >= decompositionBuffer.length()) return 0; // unmatched
int ch2 = decompositionBuffer.charAt(index);
if (ch2 < 0xDC00 || 0xDFFF < ch2) return 0; // unmatched
index++; // skip next char
bigChar = 0x10000 + ((ch - 0xD800) << 10) + (ch2 - 0xDC00); // extract value
}
if (ucd.isNoncharacter(bigChar)) { // illegal code value, ignore!!
return 0;
}
// find the implicit values; returned in 0 and 1
int[] implicit = new int[2];
CodepointToImplicit(bigChar, implicit);
// Now compose the two keys
// push BBBB
expandingStack.push(makeKey(implicit[1], 0, 0));
// return AAAA
return makeKey(implicit[0], NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
}
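// Surrogate-pair example for the assembly above (illustrative): for the pair D800 DC00,
//     bigChar = 0x10000 + ((0xD800 - 0xD800) << 10) + (0xDC00 - 0xDC00) = 0x10000
// i.e. the first supplementary code point, which is then given implicit weights.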
/**
* Constants for Hangul
*/
static final int // constants
SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
LCount = 19, VCount = 21, TCount = 28,
NCount = VCount * TCount, // 588
SCount = LCount * NCount, // 11172
LastInitial = LBase + LCount-1, // last initial jamo
LastPrimary = SBase + (LCount-1) * VCount * TCount; // last corresponding primary
public static StringBuffer decomposeHangul(int s, StringBuffer result) {
int SIndex = s - SBase;
if (0 > SIndex || SIndex >= SCount) {
throw new IllegalArgumentException("Non-Hangul Syllable");
}
int L = LBase + SIndex / NCount;
int V = VBase + (SIndex % NCount) / TCount;
int T = TBase + SIndex % TCount;
result.append((char)L);
result.append((char)V);
if (T != TBase) result.append((char)T);
return result;
}
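// Worked example (illustrative): for the Hangul syllable U+D4DB,
//     SIndex = 0xD4DB - 0xAC00 = 10459
//     L = 0x1100 + 10459 / 588        = 0x1111
//     V = 0x1161 + (10459 % 588) / 28 = 0x1171
//     T = 0x11A7 + 10459 % 28         = 0x11B6   (appended, since T != TBase)
// so decomposeHangul(0xD4DB, sb) appends the jamo sequence 1111 1171 11B6.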
/**
* Fix for Hangul, since the tables are not set up right.
* The fix for Hangul is to give different values to the combining initial
* Jamo to put them up into the AC00 range, as follows. Each one is put
* after the first syllable it begins.
*
private int fixJamo(char ch, int jamoCe) {
int result = jamoCe - hangulHackBottom + 0xAC000000; // put into right range
if (DEBUG) System.out.println("\tChanging " + hex(ch) + " " + hex(jamoCe) + " => " + hex(result));
return result;
/*
int newPrimary;
int LIndex = jamo - LBase;
if (LIndex < LCount) {
newPrimary = SBase + (LIndex + 1) * VCount * TCount; // multiply to match syllables
} else {
newPrimary = LastPrimary + (jamo - LastInitial); // just shift up
}
return makeKey(newPrimary, 0x21, 0x2); // make secondary difference!
* /
}
*/
// =============================================================
// Building Collation Element Tables
// =============================================================
/**
* Value for returning int as well as function return,
* since Java doesn't have output parameters
*/
private int[] position = new int[1];
/**
* For recording statistics
*/
private int count1 = 0, count2 = 0, count3 = 0, max2 = 0, max3 = 0;
private int oldKey1 = -1, oldKey2 = -1, oldKey3 = -1;
UnicodeSet found = new UnicodeSet();
/*public Hashtable getContracting() {
return new Hashtable(multiTable);
}
*/
public UCAContents getContents(byte ceLimit, Normalizer skipDecomps) {
return new UCAContents(ceLimit, skipDecomps, ucdVersion);
}
static boolean haveUnspecified = false;
static UnicodeSet unspecified = new UnicodeSet();
UnicodeSet variantSecondaries = new UnicodeSet(0x0153,0x0154);
UnicodeSet digitSecondaries = new UnicodeSet(0x155,0x017F);
UnicodeSet homelessSecondaries;
static final UnicodeSet moreSamples = new UnicodeSet();
static {
moreSamples.add("\u09C7\u09BE");
moreSamples.add("\u09C7\u09D7");
moreSamples.add("\u1025\u102E");
moreSamples.add("\u0DD9\u0DCF");
moreSamples.add("\u0DD9\u0DDF");
moreSamples.add("\u1100\u1161");
moreSamples.add("\u1100\u1175");
moreSamples.add("\u1112\u1161");
moreSamples.add("\u1112\u1175");
moreSamples.add("\uAC00\u1161");
moreSamples.add("\uAC00\u1175");
moreSamples.add("\uD788\u1161");
moreSamples.add("\uD788\u1175");
}
// static UnicodeSet homelessSecondaries = new UnicodeSet(0x0176, 0x0198);
// 0x0153..0x017F
public class UCAContents {
int current = -1;
Normalizer skipDecomps;
Normalizer nfd;
Normalizer nfkd;
Iterator enum1 = null;
byte ceLimit;
int currentRange = SAMPLE_RANGES.length; // set to ZERO to enable
int startOfRange = SAMPLE_RANGES[0][0];
int endOfRange = startOfRange;
int itemInRange = startOfRange;
int skip = 1;
boolean doSamples = false;
AbbreviatedUnicodeSetIterator usi = new AbbreviatedUnicodeSetIterator();
UnicodeSetIterator moreSampleIterator = new UnicodeSetIterator(moreSamples);
/**
* use FIXED_CE as the limit
*/
UCAContents(byte ceLimit, Normalizer skipDecomps, String unicodeVersion) {
this.ceLimit = ceLimit;
this.nfd = new Normalizer(Normalizer.NFD, unicodeVersion);
this.nfkd = new Normalizer(Normalizer.NFKD, unicodeVersion);
this.skipDecomps = skipDecomps;
currentRange = 0;
usi.reset(unspecified, true);
//usi.setAbbreviated(true);
// FIX SAMPLES
if (SAMPLE_RANGES[0][0] == 0) {
for (int i = 0; ; ++i) { // add first unallocated character
if (!ucd.isAssigned(i)) {
SAMPLE_RANGES[0][0] = i;
break;
}
}
}
}
/**
* Enables or disables the extra samples (unspecified code points, moreSamples,
* and SAMPLE_RANGES) in next().
*/
public void setDoEnableSamples(boolean newValue) {
doSamples = newValue;
}
/**
* Returns the next string in the iteration: single code points (filtered by ceLimit and
* skipDecomps), then contractions, then, if samples are enabled, unspecified code points
* and extra samples; returns null when done.
*/
public String next() {
String result = null; // null if done
// normal case
while (current++ < 0x10FFFF) {
if (DEBUG && current == 0xdbff) {
System.out.println("DEBUG");
}
//char ch = (char)current;
byte type = getCEType(current);
if (type >= ceLimit || type == CONTRACTING_CE) continue;
//if (nfd.isNormalized(current) || type == HANGUL_CE) {
//}
if (skipDecomps != null && !skipDecomps.isNormalized(current)) continue; // CHECK THIS
result = UTF16.valueOf(current);
if (!haveUnspecified) unspecified.add(current);
return result;
}
// contractions
if (enum1 == null) enum1 = ucaData.getContractions();
while (enum1.hasNext()) {
result = (String)enum1.next();
if (result.length() == 1 && UTF16.isLeadSurrogate(result.charAt(0))) {
//System.out.println("Skipping " + ucd.getCodeAndName(result));
continue; // try again
}
if (!haveUnspecified) {
if (UTF16.countCodePoint(result) == 1) {
unspecified.add(result);
}
}
return result;
}
if (!haveUnspecified) {
if (DEBUG) System.out.println("Specified = " + unspecified.toPattern(true));
UnicodeSet temp = new UnicodeSet();
for (int i = 0; i <= 0x10ffff; ++i) {
if (!ucd.isAllocated(i)) continue;
if (!unspecified.contains(i)) {
temp.add(i);
}
// add the following so that if a CJK is in a decomposition, we add it
if (!nfkd.isNormalized(i)) {
String decomp = nfkd.normalize(i);
int cp2;
for (int j = 0; j < decomp.length(); j += UTF16.getCharCount(cp2)) {
cp2 = UTF16.charAt(decomp, j);
if (!unspecified.contains(cp2)) {
temp.add(cp2);
}
}
}
}
unspecified = temp;
usi.reset(unspecified, true);
//usi.setAbbreviated(true);
if (DEBUG) System.out.println("Unspecified = " + unspecified.toPattern(true));
haveUnspecified = true;
}
if (!doSamples) return null;
if (usi.next()) {
if (usi.codepoint == usi.IS_STRING) result = usi.string;
else result = UTF16.valueOf(usi.codepoint);
if (DEBUG) System.out.println("Unspecified: " + ucd.getCodeAndName(result));
return result;
}
if (moreSampleIterator.next()) {
result = moreSampleIterator.getString();
if (DEBUG) System.out.println("More Samples: " + ucd.getCodeAndName(result));
return result;
}
// extra samples
if (currentRange < SAMPLE_RANGES.length) {
try {
result = UTF16.valueOf(itemInRange);
} catch (RuntimeException e) {
System.out.println(Utility.hex(itemInRange));
throw e;
}
++itemInRange;
if (itemInRange > endOfRange) {
++currentRange;
if (currentRange < SAMPLE_RANGES.length) {
startOfRange = itemInRange = SAMPLE_RANGES[currentRange][0];
endOfRange = SAMPLE_RANGES[currentRange].length > 1
? SAMPLE_RANGES[currentRange][1]
: startOfRange;
//skip = ((endOfRange - startOfRange) / 3);
}
} else if (itemInRange > startOfRange + 5 && itemInRange < endOfRange - 5 /* - skip*/) {
//itemInRange += skip;
itemInRange = endOfRange - 5;
}
}
return result;
}
/**
* returns a string and its ces
*/
public String next(int[] ces, int[] len) {
String result = next(); // null if done
if (result != null) {
len[0] = getCEs(result, true, ces);
}
return result;
}
int[] lengthBuffer = new int[1];
/**
* returns a string and its ces
*/
public boolean next(Pair result) {
String s = next(ceListBuffer, lengthBuffer);
if (s == null) return false;
result.first = new CEList(ceListBuffer, 0, lengthBuffer[0]);
result.second = s;
return true;
}
/**
* @return Returns the doSamples.
*/
public boolean isDoSamples() {
return doSamples;
}
}
static final int[][] SAMPLE_RANGES = {
{0}, // LEAVE EMPTY--Turns into first unassigned character
{0xFFF0},
{0xD800},
{0xDFFF},
{0xFFFE},
{0xFFFF},
{0x10000},
{0xC0000},
{0xD0000},
{0x10FFFF},
{0x10FFFE},
{0x10FFFF},
{0x3400, 0x4DB5},
{0x4E00, 0x9FA5},
{0xAC00, 0xD7A3},
{0xA000, 0xA48C},
{0xE000, 0xF8FF},
{0x20000, 0x2A6D6},
{0xE0000, 0xE007E},
{0xF0000, 0xF00FD},
{0xFFF00, 0xFFFFD},
{0x100000, 0x1000FD},
{0x10FF00, 0x10FFFD},
};
/**
* Adds the collation elements from a file (or other stream) in the UCA format.
* Values will override any previous mappings.
*/
private void addCollationElements(BufferedReader in) throws java.io.IOException {
IntStack tempStack = new IntStack(100);
StringBuffer multiChars = new StringBuffer(); // used for contracting chars
String inputLine = "";
boolean[] wasImplicitLeadPrimary = new boolean[1];
// In UAX 3.1, the rearrange list is moved to UCD.
if (ucaData.lessThan410) {
rearrangeList = UnifiedBinaryProperty.make(UCD.BINARY_PROPERTIES + UCD.Logical_Order_Exception, ucd)
.getSet();
}
while (true) try {
inputLine = in.readLine();
if (inputLine == null) break; // means file is done
// HACK
if (inputLine.startsWith("# Variant secondaries:")) {
variantSecondaries = extractSet(inputLine);
} else if (inputLine.startsWith("# Digit secondaries:")) {
digitSecondaries = extractSet(inputLine);
}
String line = cleanLine(inputLine); // remove comments, extra whitespace
if (line.length() == 0) continue; // skip empty lines
if (DEBUG_SHOW_LINE) {
System.out.println("Processing: " + inputLine);
}
position[0] = 0; // start at front of line
if (line.startsWith("@")) {
if (line.startsWith("@version")) {
dataVersion = line.substring("@version".length()+1).trim();
continue;
}
if (line.startsWith("@rearrange")) {
line = line.substring("@rearrange".length()+1).trim();
String[] list = Utility.split(line, ',');
for (int i = 0; i < list.length; ++i) {
rearrangeList.add(Integer.parseInt(list[i].trim(), 16));
}
continue;
}
throw new IllegalArgumentException("Illegal @ command: " + line);
}
// collect characters
multiChars.setLength(0); // clear buffer
char value = getChar(line, position);
multiChars.append(value);
//fixSurrogateContraction(value);
char value2 = getChar(line, position);
// append until we get terminator
while (value2 != NOT_A_CHAR) {
multiChars.append(value2);
value2 = getChar(line, position);
}
if (RECORDING_CHARS) {
found.addAll(multiChars.toString());
}
if (!fullData && RECORDING_DATA) {
if (value == 0 || value == '\t' || value == '\n' || value == '\r'
|| (0x20 <= value && value <= 0x7F)
|| (0x80 <= value && value <= 0xFF)
|| (0x300 <= value && value <= 0x3FF)
) {
System.out.println(" + \"" + inputLine + "\\n\"");
}
}
// for recording information
boolean record = true;
/* if (multiChars.length() > 0) record = false;
else */
if (!toD.isNormalized(value)) record = false;
// collect CEs
if (false && value == 0x2F00) {
System.out.println("debug");
}
wasImplicitLeadPrimary[0] = false;
int ce = getCEFromLine(value, line, position, record, wasImplicitLeadPrimary);
int ce2 = getCEFromLine(value, line, position, record, wasImplicitLeadPrimary);
if (CHECK_UNIQUE && (ce2 == TERMINATOR || CHECK_UNIQUE_EXPANSIONS)) {
if (!CHECK_UNIQUE_VARIABLES) {
checkUnique(value, ce, 0, inputLine); // only need to check first value
} else {
int key1 = ce >>> 16;
if (isVariable(ce)) {
checkUnique(value, 0, key1, inputLine); // only need to check first value
}
}
}
tempStack.clear();
tempStack.push(ce);
while (ce2 != TERMINATOR) {
tempStack.push(ce2);
ce2 = getCEFromLine(value, line, position, record, wasImplicitLeadPrimary);
if (ce2 == TERMINATOR) break;
}
ucaData.add(multiChars, tempStack);
} catch (RuntimeException e) {
System.out.println("Error on line: " + inputLine);
throw e;
}
}
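// Schematic of the input handled by addCollationElements above (descriptive comment;
// exact weights depend on the UCA data version):
//     XXXX [XXXX ...] ; [.pppp.ssss.tttt.] [.pppp.ssss.tttt.] ... # comment
// with '*' instead of the leading '.' marking a variable CE, '@version'/'@rearrange'
// lines supplying metadata, and '#' or '%' starting comments.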
/**
* Extracts a UnicodeSet from a special comment line of the form
* "# Variant secondaries: XXXX..YYYY" or "# Digit secondaries: XXXX..YYYY".
*/
private UnicodeSet extractSet(String inputLine) {
//# Variant secondaries: 0177..017B (5)
//# Digit secondaries: 017C..0198 (29)
Matcher m = Pattern.compile(".*:\\s*([0-9A-Fa-f]+)\\.\\.([0-9A-Fa-f]+).*").matcher("");
if (!m.reset(inputLine).matches()) throw new IllegalArgumentException("Failed to recognize special Ken lines: " + inputLine);
return new UnicodeSet(Integer.parseInt(m.group(1),16), Integer.parseInt(m.group(2),16));
}
/*
private void concat(int[] ces1, int[] ces2) {
}
*/
/**
* Checks and finishes the internal tables: verifies the rearrangement list, closes the
* contraction table, computes the variable CE range, and prints statistics.
*/
private void cleanup() {
UCDProperty ubp = UnifiedBinaryProperty.make(
UCD.BINARY_PROPERTIES + UCD.Logical_Order_Exception, ucd);
UnicodeSet desiredSet = ubp.getSet();
if (ucaData.lessThan410 && !rearrangeList.equals(desiredSet)) {
throw new IllegalArgumentException("Rearrangement should be " + desiredSet.toPattern(true)
+ ", but is " + rearrangeList.toPattern(true));
}
ucaData.checkConsistency();
Map missingStrings = new HashMap();
Map tempMap = new HashMap();
Iterator enum1 = ucaData.getContractions();
while (enum1.hasNext()) {
String sequence = (String)enum1.next();
//System.out.println("Contraction: " + Utility.hex(sequence));
for (int i = sequence.length()-1; i > 0; --i) {
String shorter = sequence.substring(0,i);
if (!ucaData.contractionTableContains(shorter)) {
IntStack tempStack = new IntStack(1);
getCEs(shorter, true, tempStack);
if (false) System.out.println("WARNING: CLOSING: " + ucd.getCodeAndName(shorter)
+ " => " + CEList.toString(tempStack));
tempMap.put(shorter, tempStack);
// missingStrings.put(shorter,"");
// collationElements[sequence.charAt(0)] = UNSUPPORTED; // nuke all bad values
}
}
}
// now add them. We couldn't before because we were iterating over it.
enum1 = tempMap.keySet().iterator();
while (enum1.hasNext()) {
String shorter = (String) enum1.next();
IntStack tempStack = (IntStack) tempMap.get(shorter);
ucaData.add(shorter, tempStack);
}
enum1 = missingStrings.keySet().iterator();
if (missingStrings.size() != 0) {
/**
while (enum1.hasMoreElements()) {
String sequence = (String)enum1.nextElement();
getCE(sequence);
FIX LATER;
}
*/
String errorMessage = "";
while (enum1.hasNext()) {
String missing = (String)enum1.next();
if (errorMessage.length() != 0) errorMessage += ", ";
errorMessage += "\"" + missing + "\"";
}
throw new IllegalArgumentException("Contracting table not closed! Missing " + errorMessage);
}
//fixlater;
variableLowCE = variableLow << 16;
variableHighCE = (variableHigh << 16) | 0xFFFF; // turn on bottom bits
//int hangulHackBottom;
//int hangulHackTop;
//hangulHackBottom = collationElements[0x1100] & 0xFFFF0000; // remove secondaries & tertiaries
//hangulHackTop = collationElements[0x11F9] | 0xFFFF; // bump up secondaries and tertiaries
//if (SHOW_STATS) System.out.println("\tHangul Hack: " + Utility.hex(hangulHackBottom) + ", " + Utility.hex(hangulHackTop));
// show some statistics
if (SHOW_STATS) System.out.println("\tcount1: " + count1);
if (SHOW_STATS) System.out.println("\tcount2: " + max2);
if (SHOW_STATS) System.out.println("\tcount3: " + max3);
if (SHOW_STATS) System.out.println("\tcontractions: " + ucaData.getContractionCount());
if (SHOW_STATS) System.out.println("\tMIN1/MAX1: " + Utility.hex(MIN1) + "/" + Utility.hex(MAX1));
if (SHOW_STATS) System.out.println("\tMIN2/MAX2: " + Utility.hex(MIN2) + "/" + Utility.hex(MAX2));
if (SHOW_STATS) System.out.println("\tMIN3/MAX3: " + Utility.hex(MIN3) + "/" + Utility.hex(MAX3));
if (SHOW_STATS) System.out.println("\tVar Min/Max: " + Utility.hex(variableLow) + "/" + Utility.hex(variableHigh));
if (SHOW_STATS) System.out.println("\tNon-Var Min: " + Utility.hex(nonVariableLow));
if (SHOW_STATS) System.out.println("\trenumberedVariable: " + renumberedVariable);
}
/**
* Remove comments, extra whitespace
*/
private String cleanLine(String line) {
int commentPosition = line.indexOf('#');
if (commentPosition >= 0) line = line.substring(0,commentPosition);
commentPosition = line.indexOf('%');
if (commentPosition >= 0) line = line.substring(0,commentPosition);
return line.trim();
}
/**
* Get a char from a line, of form: (