Completed most of the upgrade to UCA 3.1.1, which involved double CEs for Han, fixing back-mappings, etc.; also did a bit of code cleanup.
Remaining to do: back-map from UCA double CEs to the original character codes, for constructing the Fractional UCA.
X-SVN-Rev: 8754
This commit is contained in:
parent
ce883f6d81
commit
693b0c9b91
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/CEList.java,v $
|
||||
* $Date: 2001/09/19 23:32:21 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2002/05/31 01:41:03 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -156,6 +156,15 @@ public final class CEList implements java.lang.Comparable, UCD_Types {
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public static String toString(int[] ces, int len) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = 0; i < len; ++i) {
|
||||
if (i != 0) result.append(' ');
|
||||
result.append(toString(ces[i]));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public static String toString(int ce) {
|
||||
return "[" + Utility.hex(UCA.getPrimary(ce)) + "."
|
||||
+ Utility.hex(UCA.getSecondary(ce)) + "."
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $
|
||||
* $Date: 2002/04/23 01:59:14 $
|
||||
* $Revision: 1.8 $
|
||||
* $Date: 2002/05/31 01:41:03 $
|
||||
* $Revision: 1.9 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -375,7 +375,7 @@ public class GenOverlap implements UCD_Types {
|
||||
System.out.println("debug");
|
||||
}
|
||||
boolean mashLast = false;
|
||||
if (nfkd.normalizationDiffers(cp)) {
|
||||
if (!nfkd.isNormalized(cp)) {
|
||||
String decomp = nfkd.normalize(cp);
|
||||
String canon = nfd.normalize(cp);
|
||||
len = collator.getCEs(decomp, true, ces);
|
||||
@ -578,7 +578,7 @@ public class GenOverlap implements UCD_Types {
|
||||
|
||||
if (UTF16.countCodePoint(s) != 1) continue; // skip ligatures
|
||||
int cp = UTF16.charAt(s, 0);
|
||||
if (nfkd.normalizationDiffers(cp)) continue;
|
||||
if (!nfkd.isNormalized(cp)) continue;
|
||||
|
||||
int script = ucd.getScript(cp);
|
||||
int len = lenArray[0];
|
||||
@ -607,7 +607,7 @@ public class GenOverlap implements UCD_Types {
|
||||
|
||||
Utility.dot(counter++);
|
||||
if (!ucd.isAllocated(cp)) continue;
|
||||
if (nfkd.normalizationDiffers(cp)) continue;
|
||||
if (!nfkd.isNormalized(cp)) continue;
|
||||
if (ucd.getCategory(cp) == Lu) continue; // don't count case
|
||||
|
||||
String scp = UTF16.valueOf(cp);
|
||||
|
@ -5,18 +5,20 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $
|
||||
* $Date: 2002/05/29 23:18:15 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2002/05/31 01:41:03 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCA;
|
||||
import com.ibm.text.UCD.*;
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
|
||||
public class Main {
|
||||
static final String UCDVersion = "";
|
||||
static final String[] ICU_FILES = {"FractionalUCA", "writeconformance", "writeconformanceshifted", "WriteRules"};
|
||||
|
||||
public static void main(String args[]) throws Exception {
|
||||
|
||||
@ -36,7 +38,10 @@ public class Main {
|
||||
|
||||
for (int i = 0; i < args.length; ++i) {
|
||||
String arg = args[i];
|
||||
if (arg.equalsIgnoreCase("WriteRulesWithNames")) WriteCollationData.writeRules(WriteCollationData.WITH_NAMES);
|
||||
System.out.println("OPTION: " + arg);
|
||||
|
||||
if (arg.equalsIgnoreCase("ICU")) args = Utility.append(args, ICU_FILES);
|
||||
else if (arg.equalsIgnoreCase("WriteRulesWithNames")) WriteCollationData.writeRules(WriteCollationData.WITH_NAMES);
|
||||
else if (arg.equalsIgnoreCase("GenOverlap")) GenOverlap.test(WriteCollationData.collator);
|
||||
else if (arg.equalsIgnoreCase("validateUCA")) GenOverlap.validateUCA(WriteCollationData.collator);
|
||||
else if (arg.equalsIgnoreCase("writeNonspacingDifference")) WriteCollationData.writeNonspacingDifference();
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
|
||||
* $Date: 2002/04/23 01:59:14 $
|
||||
* $Revision: 1.10 $
|
||||
* $Date: 2002/05/31 01:41:03 $
|
||||
* $Revision: 1.11 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -108,6 +108,8 @@ final public class UCA implements Comparator {
|
||||
static final boolean RECORDING_DATA = false;
|
||||
static final boolean RECORDING_CHARS = true;
|
||||
|
||||
private UCD ucd;
|
||||
|
||||
// =============================================================
|
||||
// Main Methods
|
||||
// =============================================================
|
||||
@ -129,7 +131,8 @@ final public class UCA implements Comparator {
|
||||
toD = new Normalizer(Normalizer.NFD, unicodeVersion);
|
||||
}
|
||||
|
||||
ucdVersion = UCD.make(unicodeVersion).getVersion();
|
||||
ucd = UCD.make(unicodeVersion);
|
||||
ucdVersion = ucd.getVersion();
|
||||
|
||||
// either get the full sources, or just a demo set
|
||||
if (fullData) {
|
||||
@ -478,7 +481,9 @@ final public class UCA implements Comparator {
|
||||
* CE Type
|
||||
*/
|
||||
static final byte NORMAL_CE = 0, CONTRACTING_CE = 1, EXPANDING_CE = 2,
|
||||
FIXED_CE = 3, HANGUL_CE = 5, SURROGATE_CE = 6, UNSUPPORTED_CE = 7;
|
||||
CJK_CE = 3, CJK_AB_CE = 4, HANGUL_CE = 5, UNSUPPORTED_CE = 7,
|
||||
FIXED_CE = 3;
|
||||
// SURROGATE_CE = 6,
|
||||
|
||||
/**
|
||||
* Returns the char associated with a FIXED value
|
||||
@ -502,12 +507,13 @@ final public class UCA implements Comparator {
|
||||
// Special check for Han, Hangul
|
||||
if (isHangul(ch)) return HANGUL_CE;
|
||||
|
||||
if (isFixed(ch)) return FIXED_CE;
|
||||
if (isCJK(ch)) return CJK_CE;
|
||||
if (isCJK_AB(ch)) return CJK_AB_CE;
|
||||
|
||||
// special check for unsupported surrogate pair, 20 1/8 bits
|
||||
if (0xD800 <= ch && ch <= 0xDFFF) {
|
||||
return SURROGATE_CE;
|
||||
}
|
||||
//if (0xD800 <= ch && ch <= 0xDFFF) {
|
||||
// return SURROGATE_CE;
|
||||
//}
|
||||
return UNSUPPORTED_CE;
|
||||
}
|
||||
|
||||
@ -632,6 +638,12 @@ final public class UCA implements Comparator {
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
|
||||
static boolean isImplicitCE(int ce) {
|
||||
int primary = getPrimary(ce);
|
||||
return primary >= UNSUPPORTED_BASE && primary <= UNSUPPORTED_TOP;
|
||||
}
|
||||
|
||||
/**
|
||||
* Supplies a zero-padded hex representation of an integer (without 0x)
|
||||
*/
|
||||
@ -790,9 +802,9 @@ final public class UCA implements Comparator {
|
||||
|
||||
/**
|
||||
* A special bit combination in a CE is used to reserve exception cases. This has the effect
|
||||
* of removing 32 primary key values out of the 65536 possible.
|
||||
* of removing a small number of the primary key values out of the 65536 possible.
|
||||
*/
|
||||
static final int EXCEPTION_CE_MASK = 0xFF000000;
|
||||
static final int EXCEPTION_CE_MASK = 0xF8000000;
|
||||
|
||||
/**
|
||||
* Used to composed Hangul and Han characters
|
||||
@ -808,8 +820,13 @@ final public class UCA implements Comparator {
|
||||
* There are at least 34 values, so that we can use a range for surrogates
|
||||
* However, we do add to the first weight if we have surrogate pairs!
|
||||
*/
|
||||
public static final int UNSUPPORTED_BASE = 0xFF40;
|
||||
public static final int UNSUPPORTED_TOP = 0xFFFF;
|
||||
public static final int UNSUPPORTED_CJK_BASE = 0xFF40;
|
||||
public static final int UNSUPPORTED_CJK_AB_BASE = 0xFF80;
|
||||
public static final int UNSUPPORTED_OTHER_BASE = 0xFFC0;
|
||||
|
||||
public static final int UNSUPPORTED_BASE = UNSUPPORTED_CJK_BASE;
|
||||
public static final int UNSUPPORTED_TOP = UNSUPPORTED_OTHER_BASE + 0x40;
|
||||
|
||||
static final int UNSUPPORTED = makeKey(UNSUPPORTED_BASE, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
|
||||
|
||||
// was 0xFFC20101;
|
||||
@ -821,7 +838,7 @@ final public class UCA implements Comparator {
|
||||
* to be looked up (with following characters) in the contractingTable.<br>
|
||||
* This isn't a MASK since there is exactly one value.
|
||||
*/
|
||||
static final int CONTRACTING = 0xFF310000;
|
||||
static final int CONTRACTING = 0xFA310000;
|
||||
|
||||
/**
|
||||
* Expanding characters are marked with a exception bit combination
|
||||
@ -829,7 +846,7 @@ final public class UCA implements Comparator {
|
||||
* This means that they map to more than one CE, which is looked up in
|
||||
* the expansionTable by index. See EXCEPTION_INDEX_MASK
|
||||
*/
|
||||
static final int EXPANDING_MASK = 0xFF300000; // marks expanding range start
|
||||
static final int EXPANDING_MASK = 0xFA300000; // marks expanding range start
|
||||
|
||||
/**
|
||||
* This mask is used to get the index from an EXPANDING exception.
|
||||
@ -976,12 +993,11 @@ final public class UCA implements Comparator {
|
||||
// RECURSIVE!!!
|
||||
}
|
||||
|
||||
// Special check for Han, YI
|
||||
if (isFixed(bigChar)) {
|
||||
return makeKey(bigChar, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
|
||||
if (ucd.isNoncharacter(bigChar)) { // illegal code value, ignore!!
|
||||
return 0;
|
||||
}
|
||||
|
||||
// special check for unsupported surrogate pair, 20 1/8 bits
|
||||
|
||||
// special check and fix for unsupported surrogate pair, 20 1/8 bits
|
||||
if (0xD800 <= bigChar && bigChar <= 0xDFFF) {
|
||||
// ignore unmatched surrogates (e.g. return zero)
|
||||
if (bigChar >= 0xDC00 || index >= decompositionBuffer.length()) return 0; // unmatched
|
||||
@ -990,25 +1006,38 @@ final public class UCA implements Comparator {
|
||||
index++; // skip next char
|
||||
bigChar = 0x10000 + ((ch - 0xD800) << 10) + (ch2 - 0xDC00); // extract value
|
||||
}
|
||||
|
||||
if ((bigChar & 0xFFFE) == 0xFFFE) { // illegal code value, ignore!!
|
||||
return 0;
|
||||
}
|
||||
|
||||
// The result is 2 CEs. One is UNSUPPORTED + top bits, and the other
|
||||
// is a primary that is the next fifteen bits
|
||||
// This has the effect of putting all unsupported characters at the end,
|
||||
// in code order.
|
||||
// add bottom 5 bits to UNSUPPORTED, and push rest
|
||||
//return UNSUPPORTED + (bigChar & 0xFFFF0000); // top bits added
|
||||
expandingStack.push(makeKey((bigChar & 0x7FFF) | 0x8000, 0, 0)); // primary = bottom 15 bits plus turn bottom bit on.
|
||||
// secondary and tertiary are both zero
|
||||
return makeKey(UNSUPPORTED_BASE + (bigChar >>> 15), NEUTRAL_SECONDARY, NEUTRAL_TERTIARY); // top 34 values plus UNSUPPORTED
|
||||
/*
|
||||
expandingStack.push(((bigChar & 0x7FFF) << 16) | 0x10000000); // primary = bottom 15 bits plus turn bottom bit on.
|
||||
// secondary and tertiary are both zero
|
||||
return UNSUPPORTED + ((bigChar << 1) & 0xFFFF0000); // top 34 values plus UNSUPPORTED
|
||||
*/
|
||||
|
||||
/*
|
||||
The formula from the UCA:
|
||||
|
||||
BASE:
|
||||
|
||||
FB40 CJK Ideograph
|
||||
FB80 CJK Ideograph Extension A/B
|
||||
FBC0 Any other code point
|
||||
|
||||
AAAA = BASE + (CP >> 15);
|
||||
BBBB = (CP & 0x7FFF) | 0x8000; the mapping for CP is then given by:
|
||||
|
||||
CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
||||
*/
|
||||
// divide the three cases
|
||||
|
||||
int base = UNSUPPORTED_OTHER_BASE;
|
||||
if (isCJK(bigChar)) base = UNSUPPORTED_CJK_BASE;
|
||||
else if (isCJK_AB(bigChar)) base = UNSUPPORTED_CJK_AB_BASE;
|
||||
|
||||
// Now compose the two keys
|
||||
// first push BBBB
|
||||
|
||||
// HACK: expandingStack.push(makeKey((bigChar & 0x7FFF) | 0x8000, 0, 0));
|
||||
expandingStack.push(makeKey((bigChar & 0x7FFF) | 0x8000, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY));
|
||||
|
||||
// now return AAAA
|
||||
|
||||
return makeKey(base + (bigChar >>> 15), NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
|
||||
|
||||
}
|
||||
if (ce == CONTRACTING) {
|
||||
// Contracting is probably the most interesting (read "tricky") part
|
||||
@ -1084,12 +1113,18 @@ final public class UCA implements Comparator {
|
||||
return expandingStack.pop(); // pop last (guaranteed to exist!)
|
||||
}
|
||||
|
||||
public final boolean isFixed(int bigChar) {
|
||||
return (0x3400 <= bigChar && bigChar <= 0x4DB5
|
||||
|| 0x4E00 <= bigChar && bigChar <= 0x9FA5
|
||||
// || 0xA000 <= bigChar && bigChar <= 0xA48F
|
||||
);
|
||||
public final boolean isCJK(int bigChar) {
|
||||
return (0x4E00 <= bigChar && bigChar <= 0x9FFF);
|
||||
}
|
||||
public final boolean isCJK_AB(int bigChar) {
|
||||
return (0x3400 <= bigChar && bigChar <= 0x4DBF
|
||||
|| 0x20000 <= bigChar && bigChar <= 0x2A6DF);
|
||||
}
|
||||
/*
|
||||
3400..4DBF; CJK Unified Ideographs Extension A
|
||||
4E00..9FFF; CJK Unified Ideographs
|
||||
20000..2A6DF; CJK Unified Ideographs Extension B
|
||||
*/
|
||||
|
||||
private final boolean isHangul(int bigChar) {
|
||||
return (0xAC00 <= bigChar && bigChar <= 0xD7A3);
|
||||
@ -1176,7 +1211,7 @@ final public class UCA implements Comparator {
|
||||
Normalizer nfd = skipDecomps;
|
||||
Iterator enum = null;
|
||||
byte ceLimit;
|
||||
int currentRange = Integer.MAX_VALUE; // set to ZERO to enable
|
||||
int currentRange = SAMPLE_RANGES.length; // set to ZERO to enable
|
||||
int startOfRange = SAMPLE_RANGES[0][0];
|
||||
int endOfRange = startOfRange;
|
||||
int itemInRange = startOfRange;
|
||||
@ -1206,13 +1241,16 @@ final public class UCA implements Comparator {
|
||||
|
||||
// normal case
|
||||
while (current++ < 0x10FFFF) {
|
||||
|
||||
//char ch = (char)current;
|
||||
byte type = getCEType(current);
|
||||
if (type >= ceLimit || type == CONTRACTING_CE) continue;
|
||||
|
||||
//if (nfd.isNormalized(current) || type == HANGUL_CE) {
|
||||
//}
|
||||
|
||||
if (skipDecomps != null && !skipDecomps.isNormalized(current)) continue; // CHECK THIS
|
||||
|
||||
if (!nfd.normalizationDiffers(current) || type == HANGUL_CE) {
|
||||
if (type >= ceLimit) continue;
|
||||
if (skipDecomps != null && skipDecomps.normalizationDiffers(current)) continue;
|
||||
}
|
||||
result = UTF16.valueOf(current);
|
||||
return result;
|
||||
}
|
||||
@ -1226,6 +1264,7 @@ final public class UCA implements Comparator {
|
||||
|
||||
// extra samples
|
||||
if (currentRange < SAMPLE_RANGES.length) {
|
||||
System.out.println("*");
|
||||
try {
|
||||
result = UTF16.valueOf(itemInRange);
|
||||
} catch (RuntimeException e) {
|
||||
@ -1274,6 +1313,7 @@ final public class UCA implements Comparator {
|
||||
result.second = s;
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static final int[][] SAMPLE_RANGES = {
|
||||
@ -1299,7 +1339,7 @@ final public class UCA implements Comparator {
|
||||
{0x100000, 0x1000FD},
|
||||
{0x10FF00, 0x10FFFD},
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Adds the collation elements from a file (or other stream) in the UCA format.
|
||||
* Values will override any previous mappings.
|
||||
@ -1366,7 +1406,7 @@ final public class UCA implements Comparator {
|
||||
boolean record = true;
|
||||
/* if (multiChars.length() > 0) record = false;
|
||||
else */
|
||||
if (toD.normalizationDiffers(value)) record = false;
|
||||
if (!toD.isNormalized(value)) record = false;
|
||||
|
||||
// collect CEs
|
||||
if (value == 0x2F00) {
|
||||
@ -1402,6 +1442,8 @@ final public class UCA implements Comparator {
|
||||
expandingTable.push(TERMINATOR);
|
||||
}
|
||||
|
||||
//if (value == 0xd801) System.out.print("DEBUG: " + line);
|
||||
|
||||
// assign CE(s) to char(s)
|
||||
if (multiChars.length() > 0) {
|
||||
contractingTable.put(multiChars.toString(), new Integer(ce));
|
||||
@ -1455,8 +1497,9 @@ final public class UCA implements Comparator {
|
||||
}
|
||||
|
||||
// assign CE(s) to char(s)
|
||||
|
||||
int value = source.charAt(0);
|
||||
//if (value == 0x10000) System.out.print("DEBUG2: " + source);
|
||||
|
||||
if (source.length() > 0) {
|
||||
contractingTable.put(source.toString(), new Integer(ce));
|
||||
if (collationElements[value] == UNSUPPORTED) {
|
||||
@ -1772,7 +1815,7 @@ final public class UCA implements Comparator {
|
||||
* Used for checking data file integrity
|
||||
*/
|
||||
private void checkUnique(char value, int result, int fourth, String line) {
|
||||
if (toD.normalizationDiffers(value)) return; // don't check decomposables.
|
||||
if (!toD.isNormalized(value)) return; // don't check decomposables.
|
||||
Object ceObj = new Long(((long)result << 16) | fourth);
|
||||
Object probe = uniqueTable.get(ceObj);
|
||||
if (probe != null) {
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $
|
||||
* $Date: 2002/05/29 02:01:00 $
|
||||
* $Revision: 1.8 $
|
||||
* $Date: 2002/05/31 01:41:03 $
|
||||
* $Revision: 1.9 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -29,7 +29,7 @@ public class WriteCharts implements UCD_Types {
|
||||
Default.setUCD();
|
||||
for (int i = 0xE000; i < 0x10000; ++i) {
|
||||
if (!Default.ucd.isRepresented(i)) continue;
|
||||
if (Default.nfkc.normalizationDiffers(i)) continue;
|
||||
if (!Default.nfkc.isNormalized(i)) continue;
|
||||
System.out.println(Default.ucd.getCodeAndName(i));
|
||||
}
|
||||
}
|
||||
@ -205,7 +205,7 @@ public class WriteCharts implements UCD_Types {
|
||||
byte cat = Default.ucd.getCategory(i);
|
||||
if (cat == Cs || cat == Co) continue;
|
||||
|
||||
if (!Default.nfkd.normalizationDiffers(i)) continue;
|
||||
if (Default.nfkd.isNormalized(i)) continue;
|
||||
String decomp = Default.nfkd.normalize(i);
|
||||
|
||||
byte script = getBestScript(decomp);
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
|
||||
* $Date: 2002/05/29 23:18:15 $
|
||||
* $Revision: 1.12 $
|
||||
* $Date: 2002/05/31 01:41:03 $
|
||||
* $Revision: 1.13 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -190,7 +190,7 @@ public class WriteCollationData implements UCD_Types {
|
||||
for (char c = 0; c < 0xFFFF; ++c) {
|
||||
if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
|
||||
if (0xAC00 <= c && c <= 0xD7A3) continue;
|
||||
if (normKD.normalizationDiffers(c)) {
|
||||
if (!normKD.isNormalized(c)) {
|
||||
++count;
|
||||
String decomp = normKD.normalize(c);
|
||||
datasize += decomp.length();
|
||||
@ -218,7 +218,7 @@ public class WriteCollationData implements UCD_Types {
|
||||
for (char c = 0; c < 0xFFFF; ++c) {
|
||||
if ((c & 0xFFF) == 0) System.err.println(Utility.hex(c));
|
||||
if (0xAC00 <= c && c <= 0xD7A3) continue;
|
||||
if (normD.normalizationDiffers(c)) {
|
||||
if (!normD.isNormalized(c)) {
|
||||
++count;
|
||||
String decomp = normD.normalize(c);
|
||||
datasize += decomp.length();
|
||||
@ -408,7 +408,7 @@ public class WriteCollationData implements UCD_Types {
|
||||
}
|
||||
log.println("<tr><th>Code</td><th>Sort Key</th><th>Decomposed Sort Key</th><th>Name</th></tr>");
|
||||
for (char ch = 0; ch < 0xFFFF; ++ch) {
|
||||
if (!nfkd.normalizationDiffers(ch)) continue;
|
||||
if (nfkd.isNormalized(ch)) continue;
|
||||
if (ch > 0xAC00 && ch < 0xD7A3) continue; // skip most of Hangul
|
||||
String sortKey = collator.getSortKey(String.valueOf(ch), UCA.NON_IGNORABLE, decomposition);
|
||||
String decompSortKey = collator.getSortKey(nfkd.normalize(ch), UCA.NON_IGNORABLE, decomposition);
|
||||
@ -1148,6 +1148,9 @@ public class WriteCollationData implements UCD_Types {
|
||||
}
|
||||
}
|
||||
|
||||
static Normalizer nfdNew = new Normalizer(Normalizer.NFD, "");
|
||||
static Normalizer nfkdNew = new Normalizer(Normalizer.NFKD, "");
|
||||
|
||||
static void writeRules (byte option) throws IOException {
|
||||
|
||||
//testTransitivity();
|
||||
@ -1155,6 +1158,7 @@ public class WriteCollationData implements UCD_Types {
|
||||
|
||||
int[] ces = new int[50];
|
||||
Normalizer nfd = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
|
||||
Normalizer nfkd = new Normalizer(Normalizer.NFKD, UNICODE_VERSION);
|
||||
|
||||
if (false) {
|
||||
int len2 = collator.getCEs("\u2474", true, ces);
|
||||
@ -1173,29 +1177,64 @@ public class WriteCollationData implements UCD_Types {
|
||||
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE,
|
||||
SKIP_CANONICAL_DECOMPOSIBLES ? nfd : null);
|
||||
int[] lenArray = new int[1];
|
||||
|
||||
Set alreadyDone = new HashSet();
|
||||
PrintWriter log2 = Utility.openPrintWriter("UCARules-log.txt", false, false);
|
||||
|
||||
while (true) {
|
||||
String s = cc.next(ces, lenArray);
|
||||
if (s == null) break;
|
||||
int len = lenArray[0];
|
||||
|
||||
if (s.equals("\uD800")) {
|
||||
System.out.println("Check: " + CEList.toString(ces, len));
|
||||
}
|
||||
|
||||
log2.println(s + "\t" + CEList.toString(ces, len) + "\t" + ucd.getCodeAndName(s));
|
||||
|
||||
addToBackMap(backMap, ces, len, s, false);
|
||||
|
||||
if (len == 1) backMap.put(new Integer(ces[0]), s);
|
||||
String key = String.valueOf((char)(ces[0]>>>16))
|
||||
+ String.valueOf((char)(ces[0] & 0xFFFF))
|
||||
+ collator.getSortKey(s, UCA.NON_IGNORABLE) + '\u0000' + UCA.codePointOrder(s);
|
||||
|
||||
ordered.put(key, s);
|
||||
alreadyDone.add(s);
|
||||
|
||||
Object result = ordered.get(key);
|
||||
if (result == null) {
|
||||
System.out.println("BAD SORT: " + Utility.hex(key) + ", " + Utility.hex(s));
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("Adding Kanji");
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if (!ucd.isAllocated(i)) continue;
|
||||
if (nfkd.isNormalized(i)) continue;
|
||||
Utility.dot(i);
|
||||
String decomp = nfkd.normalize(i);
|
||||
int cp;
|
||||
for (int j = 0; j < decomp.length(); j += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(decomp, j);
|
||||
String s = UTF16.valueOf(cp);
|
||||
if (alreadyDone.contains(s)) continue;
|
||||
|
||||
alreadyDone.add(s);
|
||||
int len = collator.getCEs(s, true, ces);
|
||||
|
||||
log2.println(s+ "\t" + CEList.toString(ces, len)
|
||||
+ "\t" + ucd.getCodeAndName(s) + " from " + ucd.getCodeAndName(i));
|
||||
|
||||
addToBackMap(backMap, ces, len, s, false);
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println("Writing");
|
||||
|
||||
String filename = "UCA_Rules.txt";
|
||||
if (option == WITH_NAMES) filename = "UCA_Rules_With_Names.txt";
|
||||
else if (option == IN_XML) filename = "UCA_Rules.xml";
|
||||
log = Utility.openPrintWriter(filename);
|
||||
log = Utility.openPrintWriter(filename, false, false);
|
||||
|
||||
if (option == IN_XML) log.println("<uca>");
|
||||
else log.write('\uFEFF'); // BOM
|
||||
@ -1351,60 +1390,35 @@ public class WriteCollationData implements UCD_Types {
|
||||
|
||||
// get relation
|
||||
|
||||
int relation = 3;
|
||||
|
||||
/*if (chr.charAt(0) == 0xFFFB) {
|
||||
System.out.println("DEBUG");
|
||||
}*/
|
||||
|
||||
if (collator.getPrimary(ce) != collator.getPrimary(lastCE)) {
|
||||
relation = 0;
|
||||
} else if (collator.getSecondary(ce) != collator.getSecondary(lastCE)) {
|
||||
relation = 1;
|
||||
} else if (collator.getTertiary(ce) != collator.getTertiary(lastCE)) {
|
||||
relation = 2;
|
||||
} else if (len > lastLen) {
|
||||
relation = 2; // HACK
|
||||
} else {
|
||||
int minLen = len < lastLen ? len : lastLen;
|
||||
for (int kk = 1; kk < minLen; ++kk) {
|
||||
int lc = lastCes[kk];
|
||||
int c = ces[kk];
|
||||
if (collator.getPrimary(c) != collator.getPrimary(lc)
|
||||
|| collator.getSecondary(c) != collator.getSecondary(lc)) {
|
||||
relation = 3; // reset relation on FIRST char, since differ anyway
|
||||
break;
|
||||
} else if (collator.getTertiary(c) > collator.getTertiary(lc)) {
|
||||
relation = 2; // reset to tertiary (but later ce's might override!)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int relation = getStrengthDifference(ces, len, lastCes, lastLen);
|
||||
|
||||
/*if (chr.equals("\u2474")) {
|
||||
if (chr.equals("\u2F00")) {
|
||||
System.out.println(UCA.ceToString(ces, len));
|
||||
}*/
|
||||
}
|
||||
|
||||
// There are double-CEs, so we have to know what the length of the first bit is.
|
||||
|
||||
int expansionStart = 1;
|
||||
if (UCA.isImplicitCE(ces[0])) {
|
||||
expansionStart = 2; // move up if first is double-ce
|
||||
}
|
||||
|
||||
// check expansions
|
||||
|
||||
String expansion = "";
|
||||
if (len > 1) {
|
||||
int tert0 = ces[0] & 0xFF;
|
||||
boolean isCompat = tert0 != 2 && tert0 != 8;
|
||||
for (int i = 1; i < len; ++i) {
|
||||
int probe = ces[i];
|
||||
String s = getFromBackMap(backMap, probe);
|
||||
if (s == null) {
|
||||
int meHack = UCA.makeKey(0x1795,0x0020,0x0004);
|
||||
if (probe == meHack) {
|
||||
s = "\u3081";
|
||||
} else {
|
||||
System.out.println("No back map for " + collator.ceToString(ces[i])
|
||||
+ ": " + ucd.getCodeAndName(chr));
|
||||
s = "[" + Utility.hex(ces[i]) + "]";
|
||||
}
|
||||
}
|
||||
expansion += s;
|
||||
}
|
||||
if (len > expansionStart) {
|
||||
//int tert0 = ces[0] & 0xFF;
|
||||
//boolean isCompat = tert0 != 2 && tert0 != 8;
|
||||
log2.println("Exp: " + ucd.getCodeAndName(chr) + ", " + CEList.toString(ces, len) + ", start: " + expansionStart);
|
||||
int[] rel = {relation};
|
||||
expansion = getFromBackMap(backMap, ces, expansionStart, len, chr, rel);
|
||||
relation = rel[0];
|
||||
}
|
||||
|
||||
// print results
|
||||
@ -1429,28 +1443,268 @@ public class WriteCollationData implements UCD_Types {
|
||||
} else {
|
||||
if (reset.length() != 0) log.println(reset);
|
||||
log.print(RELATION_NAMES[relation] + " " + quoteOperand(chr));
|
||||
if (len > 1) log.print(" / " + quoteOperand(expansion));
|
||||
if (expansion.length() > 0) log.print(" / " + quoteOperand(expansion));
|
||||
if (option == WITH_NAMES) {
|
||||
log.print("\t# "
|
||||
+ collator.ceToString(ces, len) + " "
|
||||
+ ucd.getCodeAndName(chr));
|
||||
if (len > 1) log.print(" / " + Utility.hex(expansion));
|
||||
if (expansion.length() > 0) log.print(" / " + Utility.hex(expansion));
|
||||
}
|
||||
log.println();
|
||||
}
|
||||
}
|
||||
// log.println("& [top]"); // RESET
|
||||
if (option == IN_XML) log.println("</uca>");
|
||||
log2.close();
|
||||
log.close();
|
||||
Utility.fixDot();
|
||||
}
|
||||
|
||||
static long getPrimary(int[] ces) {
|
||||
if (UCA.isImplicitCE(ces[0])) {
|
||||
return (UCA.getPrimary(ces[0]) << 16) + UCA.getPrimary(ces[1]);
|
||||
} else {
|
||||
return UCA.getPrimary(ces[0]);
|
||||
}
|
||||
}
|
||||
|
||||
static long getSecondary(int[] ces) {
|
||||
if (UCA.isImplicitCE(ces[0])) {
|
||||
return (UCA.getSecondary(ces[0]) << 16) + UCA.getSecondary(ces[1]);
|
||||
} else {
|
||||
return UCA.getSecondary(ces[0]);
|
||||
}
|
||||
}
|
||||
|
||||
static long getTertiary(int[] ces) {
|
||||
if (UCA.isImplicitCE(ces[0])) {
|
||||
return (UCA.getTertiary(ces[0]) << 16) + UCA.getTertiary(ces[1]);
|
||||
} else {
|
||||
return UCA.getTertiary(ces[0]);
|
||||
}
|
||||
}
|
||||
|
||||
static int getStrengthDifference(int[] ces, int len, int[] lastCes, int lastLen) {
|
||||
|
||||
int relation = 3;
|
||||
if (getPrimary(ces) != getPrimary(lastCes)) {
|
||||
relation = 0;
|
||||
} else if (getSecondary(ces) != getSecondary(lastCes)) {
|
||||
relation = 1;
|
||||
} else if (getTertiary(ces) != getTertiary(lastCes)) {
|
||||
relation = 2;
|
||||
} else if (len > lastLen) {
|
||||
relation = 2; // HACK
|
||||
} else {
|
||||
int minLen = len < lastLen ? len : lastLen;
|
||||
int start = UCA.isImplicitCE(ces[0]) ? 2 : 1;
|
||||
for (int kk = start; kk < minLen; ++kk) {
|
||||
int lc = lastCes[kk];
|
||||
int c = ces[kk];
|
||||
if (collator.getPrimary(c) != collator.getPrimary(lc)
|
||||
|| collator.getSecondary(c) != collator.getSecondary(lc)) {
|
||||
relation = 3; // reset relation on FIRST char, since differ anyway
|
||||
break;
|
||||
} else if (collator.getTertiary(c) > collator.getTertiary(lc)) {
|
||||
relation = 2; // reset to tertiary (but later ce's might override!)
|
||||
}
|
||||
}
|
||||
}
|
||||
return relation;
|
||||
}
|
||||
|
||||
|
||||
// static final String[] RELATION_NAMES = {" <", " <<", " <<<", " ="};
|
||||
static final String[] RELATION_NAMES = {" <\t", " <<\t", " <<<\t", " =\t"};
|
||||
static final String[] XML_RELATION_NAMES = {"o1", "o2", "o3", "o4"};
|
||||
|
||||
static final String getFromBackMap(Map backMap, int probe) {
|
||||
String s = (String)backMap.get(new Integer(probe));
|
||||
static class ArrayWrapper {
|
||||
int[] array;
|
||||
int start;
|
||||
int limit;
|
||||
|
||||
/*public ArrayWrapper(int[] contents) {
|
||||
set(contents, 0, contents.length);
|
||||
}
|
||||
*/
|
||||
|
||||
public ArrayWrapper(int[] contents, int start, int limit) {
|
||||
set(contents, start, limit);
|
||||
}
|
||||
|
||||
private void set(int[] contents, int start, int limit) {
|
||||
array = contents;
|
||||
this.start = start;
|
||||
this.limit = limit;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
ArrayWrapper that = (ArrayWrapper) other;
|
||||
if (that.limit - that.start != limit - start) return false;
|
||||
for (int i = start; i < limit; ++i) {
|
||||
if (array[i] != that.array[i - start + that.start]) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
int result = limit - start;
|
||||
for (int i = start; i < limit; ++i) {
|
||||
result = result * 37 + array[i];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
static int testCase[] = {
|
||||
//collator.makeKey(0xFF40, 0x0020, 0x0002),
|
||||
collator.makeKey(0x0255, 0x0020, 0x000E),
|
||||
};
|
||||
|
||||
static String testString = "\u33C2\u002E";
|
||||
|
||||
static boolean contains(int[] array, int start, int limit, int key) {
|
||||
for (int i = start; i < limit; ++i) {
|
||||
if (array[i] == key) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static final void addToBackMap(Map backMap, int[] ces, int len, String s, boolean show) {
|
||||
if (show || contains(testCase, 0, testCase.length, ces[0]) || testString.indexOf(s) > 0) {
|
||||
System.out.println("Test case: " + Utility.hex(s) + ", " + CEList.toString(ces, len));
|
||||
}
|
||||
backMap.put(new ArrayWrapper((int[])(ces.clone()), 0, len), s);
|
||||
}
|
||||
|
||||
static int[] ignorableList = {
|
||||
UCA.makeKey(0x0000, 0x0153, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x0154, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x0155, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x0156, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x0157, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x0158, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x0159, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x015A, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x015B, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x015C, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x015D, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x015E, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x015F, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x0160, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x0161, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x0162, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x0163, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x0164, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x0165, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x0166, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x0167, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x0168, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x0169, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x016A, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x016B, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x016C, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x016D, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x016E, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x016F, 0x0002),
|
||||
UCA.makeKey(0x0000, 0x0170, 0x0002),
|
||||
};
|
||||
|
||||
static final String getFromBackMap(Map backMap, int[] originalces, int expansionStart, int len, String chr, int[] rel) {
|
||||
int[] ces = (int[])(originalces.clone());
|
||||
|
||||
String expansion = "";
|
||||
|
||||
// process ces to neutralize tertiary
|
||||
|
||||
for (int i = expansionStart; i < len; ++i) {
|
||||
int probe = ces[i];
|
||||
char primary = collator.getPrimary(probe);
|
||||
char secondary = collator.getSecondary(probe);
|
||||
char tertiary = collator.getTertiary(probe);
|
||||
|
||||
int tert = tertiary;
|
||||
switch (tert) {
|
||||
case 8: case 9: case 0xA: case 0xB: case 0xC: case 0x1D:
|
||||
tert = 8;
|
||||
break;
|
||||
case 0xD: case 0x10: case 0x11: case 0x12: case 0x13: case 0x1C:
|
||||
tert = 0xE;
|
||||
break;
|
||||
default:
|
||||
tert = 2;
|
||||
break;
|
||||
}
|
||||
ces[i] = collator.makeKey(primary, secondary, tert);
|
||||
}
|
||||
|
||||
for (int i = expansionStart; i < len;) {
|
||||
int limit;
|
||||
String s = null;
|
||||
for (limit = len; limit > i; --limit) {
|
||||
ArrayWrapper wrapper = new ArrayWrapper(ces, i, limit);
|
||||
s = (String)backMap.get(wrapper);
|
||||
if (s != null) break;
|
||||
}
|
||||
if (s == null) {
|
||||
do {
|
||||
if (contains(ignorableList, 0, ignorableList.length, ces[i])) {
|
||||
s = "";
|
||||
if (rel[0] > 1) rel[0] = 1; // HACK
|
||||
break;
|
||||
}
|
||||
|
||||
// Try stomping the value to different tertiaries
|
||||
|
||||
int probe = ces[i];
|
||||
char primary = collator.getPrimary(probe);
|
||||
char secondary = collator.getSecondary(probe);
|
||||
|
||||
ces[i] = collator.makeKey(primary, secondary, 2);
|
||||
ArrayWrapper wrapper = new ArrayWrapper(ces, i, i+1);
|
||||
s = (String)backMap.get(wrapper);
|
||||
if (s != null) break;
|
||||
|
||||
ces[i] = collator.makeKey(primary, secondary,0xE);
|
||||
wrapper = new ArrayWrapper(ces, i, i+1);
|
||||
s = (String)backMap.get(wrapper);
|
||||
if (s != null) break;
|
||||
|
||||
/*
|
||||
int meHack = UCA.makeKey(0x1795,0x0020,0x0004);
|
||||
if (ces[i] == meHack) {
|
||||
s = "\u3081";
|
||||
break;
|
||||
}
|
||||
*/
|
||||
|
||||
// we failed completely. Print error message, and bail
|
||||
|
||||
System.out.println("No back map for " + collator.ceToString(ces[i])
|
||||
+ " from " + CEList.toString(ces, len));
|
||||
System.out.println("\t" + ucd.getCodeAndName(chr)
|
||||
+ " => " + ucd.getCodeAndName(nfkdNew.normalize(chr))
|
||||
);
|
||||
s = "[" + Utility.hex(ces[i]) + "]";
|
||||
} while (false); // exactly one time, just for breaking
|
||||
limit = i + 1;
|
||||
}
|
||||
expansion += s;
|
||||
i = limit;
|
||||
}
|
||||
return expansion;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
static final String getFromBackMap(Map backMap, int[] ces, int index, int limit) {
|
||||
ArrayWrapper wrapper = new ArrayWrapper(ces, index, limit);
|
||||
|
||||
int probe = ces[index];
|
||||
wrapperContents[0] = probe;
|
||||
String s = (String)backMap.get(wrapper);
|
||||
|
||||
outputLen[0] = 1;
|
||||
if (s != null) return s;
|
||||
|
||||
char primary = collator.getPrimary(probe);
|
||||
@ -1473,25 +1727,31 @@ public class WriteCollationData implements UCD_Types {
|
||||
break;
|
||||
}
|
||||
probe = collator.makeKey(primary, secondary, tert);
|
||||
s = (String)backMap.get(new Integer(probe));
|
||||
wrapperContents[0] = probe;
|
||||
s = (String)backMap.get(wrapper);
|
||||
if (s != null) return s;
|
||||
|
||||
probe = collator.makeKey(primary, secondary, collator.NEUTRAL_TERTIARY);
|
||||
s = (String)backMap.get(new Integer(probe));
|
||||
wrapperContents[0] = probe;
|
||||
s = (String)backMap.get(wrapper);
|
||||
}
|
||||
if (s != null) return s;
|
||||
|
||||
if (primary != 0 && secondary != collator.NEUTRAL_SECONDARY) {
|
||||
String first = getFromBackMap(backMap,
|
||||
collator.makeKey(primary, collator.NEUTRAL_SECONDARY, tertiary));
|
||||
String second = getFromBackMap(backMap,
|
||||
collator.makeKey(0, secondary, collator.NEUTRAL_TERTIARY));
|
||||
int[] dummyArray = new int[1];
|
||||
dummyArray[0] = collator.makeKey(primary, collator.NEUTRAL_SECONDARY, tertiary);
|
||||
String first = getFromBackMap(backMap, dummyArray, 0, outputLen);
|
||||
|
||||
dummyArray[0] = collator.makeKey(0, secondary, collator.NEUTRAL_TERTIARY);
|
||||
String second = getFromBackMap(backMap, dummyArray, 0, outputLen);
|
||||
|
||||
if (first != null && second != null) {
|
||||
s = first + second;
|
||||
}
|
||||
}
|
||||
return s;
|
||||
}
|
||||
*/
|
||||
|
||||
static final String[] RELATION = {
|
||||
"<", " << ", " <<< ", " = ", " = ", " = ", " >>> ", " >> ", ">"
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteHTMLCollation.java,v $
|
||||
* $Date: 2002/04/23 01:59:16 $
|
||||
* $Revision: 1.6 $
|
||||
* $Date: 2002/05/31 01:41:03 $
|
||||
* $Revision: 1.7 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -283,7 +283,7 @@ public class WriteHTMLCollation implements UCD_Types {
|
||||
}
|
||||
log.println("<tr><th>Code</td><th>Sort Key</th><th>Decomposed Sort Key</th><th>Name</th></tr>");
|
||||
for (char ch = 0; ch < 0xFFFF; ++ch) {
|
||||
if (!nfkd.normalizationDiffers(ch)) continue;
|
||||
if (nfkd.isNormalized(ch)) continue;
|
||||
if (ch > 0xAC00 && ch < 0xD7A3) continue; // skip most of Hangul
|
||||
String sortKey = collator.getSortKey(String.valueOf(ch), UCA.NON_IGNORABLE, decomposition);
|
||||
String decompSortKey = collator.getSortKey(nfkd.normalize(ch), UCA.NON_IGNORABLE, decomposition);
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
|
||||
* $Date: 2002/04/23 01:59:13 $
|
||||
* $Revision: 1.13 $
|
||||
* $Date: 2002/05/31 01:41:04 $
|
||||
* $Revision: 1.14 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -281,7 +281,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
}
|
||||
|
||||
public String getValue(int cp, byte style) {
|
||||
if (nfx.normalizationDiffers(cp)) return NO;
|
||||
if (!nfx.isNormalized(cp)) return NO;
|
||||
else if (nfx.isTrailing(cp)) return MAYBE;
|
||||
else return "";
|
||||
}
|
||||
@ -598,7 +598,7 @@ of characters, the first of which has a non-zero combining class.
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
if (hasSoftDot(cp)) return true;
|
||||
if (!Default.nfkd.normalizationDiffers(cp)) return false;
|
||||
if (Default.nfkd.isNormalized(cp)) return false;
|
||||
String decomp = Default.nfd.normalize(cp);
|
||||
boolean ok = false;
|
||||
for (int i = decomp.length()-1; i >= 0; --i) {
|
||||
@ -700,7 +700,7 @@ of characters, the first of which has a non-zero combining class.
|
||||
|
||||
// if (true) throw new IllegalArgumentException("FIX Default.nf[2]");
|
||||
|
||||
if (!Default.nf[NFKD].normalizationDiffers(cp)) return Lo;
|
||||
if (Default.nf[NFKD].isNormalized(cp)) return Lo;
|
||||
|
||||
String norm = Default.nf[NFKD].normalize(cp);
|
||||
int cp2;
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
|
||||
* $Date: 2002/04/23 01:59:14 $
|
||||
* $Revision: 1.9 $
|
||||
* $Date: 2002/05/31 01:41:04 $
|
||||
* $Revision: 1.10 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -416,7 +416,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
|
||||
static boolean specialNormalizationDiffers(int ch) {
|
||||
if (ch == 0x00DF) return true; // es-zed
|
||||
return Default.nfkd.normalizationDiffers(ch);
|
||||
return !Default.nfkd.isNormalized(ch);
|
||||
}
|
||||
|
||||
static String specialNormalization(String s) {
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
|
||||
* $Date: 2002/05/29 02:01:00 $
|
||||
* $Revision: 1.18 $
|
||||
* $Date: 2002/05/31 01:41:04 $
|
||||
* $Revision: 1.19 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -1232,7 +1232,7 @@ public class GenerateData implements UCD_Types {
|
||||
Utility.dot(i);
|
||||
if (!Default.ucd.isRepresented(i)) continue;
|
||||
|
||||
if (!Default.nfd.normalizationDiffers(i)) {
|
||||
if (Default.nfd.isNormalized(i)) {
|
||||
if (Default.ucd.getScript(i) == LATIN_SCRIPT) {
|
||||
int cp = i;
|
||||
String hex = "u" + Utility.hex(cp, 4);
|
||||
@ -1358,7 +1358,7 @@ public class GenerateData implements UCD_Types {
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if ((i & 0xFFF) == 0) System.out.println("# " + i);
|
||||
if (!Default.ucd.isAssigned(i)) continue;
|
||||
if (!Default.nfd.normalizationDiffers(i)) continue;
|
||||
if (Default.nfd.isNormalized(i)) continue;
|
||||
String decomp = Default.nfd.normalize(i);
|
||||
int cp;
|
||||
for (int j = 0; j < decomp.length(); j += UTF16.getCharCount(cp)) {
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
|
||||
* $Date: 2002/05/29 02:01:00 $
|
||||
* $Revision: 1.12 $
|
||||
* $Date: 2002/05/31 01:41:04 $
|
||||
* $Revision: 1.13 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -119,10 +119,12 @@ public final class Main implements UCD_Types {
|
||||
|
||||
if (arg.equalsIgnoreCase("All")) {
|
||||
// Append all args at end
|
||||
/*
|
||||
String[] temp = new String[args.length + ALL_FILES.length];
|
||||
System.arraycopy(args, 0, temp, 0, args.length);
|
||||
System.arraycopy(ALL_FILES, 0, temp, args.length, ALL_FILES.length);
|
||||
args = temp;
|
||||
*/
|
||||
args = Utility.append(args, ALL_FILES);
|
||||
expanding = true;
|
||||
|
||||
// EXTRACTED PROPERTIES
|
||||
|
@ -67,7 +67,7 @@ public final class NFSkippable extends UnicodeProperty {
|
||||
if (!ucd.isAssigned(cp)) return true;
|
||||
|
||||
if (DEBUG) cause = "\t\tnf differs";
|
||||
if (nf.normalizationDiffers(cp)) return false;
|
||||
if (!nf.isNormalized(cp)) return false;
|
||||
|
||||
if (DEBUG) cause = "\t\tnon-zero cc";
|
||||
if (ucd.getCombiningClass(cp) != 0) return false;
|
||||
@ -87,7 +87,7 @@ public final class NFSkippable extends UnicodeProperty {
|
||||
// "displaced", so we don't have to test further
|
||||
|
||||
if (DEBUG) cause = "\t\tno decomp";
|
||||
if (!nfd.normalizationDiffers(cp)) return true;
|
||||
if (nfd.isNormalized(cp)) return true;
|
||||
|
||||
// OPTIMIZATION -- careful
|
||||
// Hangul syllables are skippable IFF they are isLeadingJamoComposition
|
||||
@ -265,7 +265,7 @@ public final class NFSkippable extends UnicodeProperty {
|
||||
byte cat = skipper.ucd.getCategory(cp);
|
||||
if (cat == PRIVATE_USE || cat == SURROGATE) continue;
|
||||
if (skipper.ucd.getCombiningClass(cp) != 0) continue;
|
||||
if (skipper.nf.normalizationDiffers(cp)) continue;
|
||||
if (!skipper.nf.isNormalized(cp)) continue;
|
||||
if ((cp < 0xAC00 || cp > 0xAE00)
|
||||
&& cp != skipper.ucd.mapToRepresentative(cp, false)) continue;
|
||||
}
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
|
||||
* $Date: 2002/03/20 00:21:42 $
|
||||
* $Revision: 1.8 $
|
||||
* $Date: 2002/05/31 01:41:03 $
|
||||
* $Revision: 1.9 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -205,8 +205,8 @@ public final class Normalizer implements UCD_Types {
|
||||
* normalizer.
|
||||
* @param ch the source character
|
||||
*/
|
||||
public boolean normalizationDiffers(int ch) {
|
||||
return data.normalizationDiffers(ch, composition, compatibility);
|
||||
public boolean isNormalized(int ch) {
|
||||
return !data.normalizationDiffers(ch, composition, compatibility);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
|
||||
* $Date: 2002/05/29 02:01:00 $
|
||||
* $Revision: 1.13 $
|
||||
* $Date: 2002/05/31 01:41:03 $
|
||||
* $Revision: 1.14 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -273,7 +273,7 @@ public class VerifyUCD implements UCD_Types {
|
||||
}
|
||||
|
||||
public static boolean checkNormalizer(Normalizer x, int cp) {
|
||||
boolean result = x.normalizationDiffers(cp);
|
||||
boolean result = !x.isNormalized(cp);
|
||||
if (false) {
|
||||
String s = x.normalize(cp);
|
||||
boolean sResult = !s.equals(UTF16.valueOf(cp));
|
||||
@ -291,7 +291,7 @@ public class VerifyUCD implements UCD_Types {
|
||||
Utility.dot(cp);
|
||||
if (!Default.ucd.isAllocated(cp)) continue;
|
||||
|
||||
if (!Default.nfd.normalizationDiffers(cp)) continue;
|
||||
if (Default.nfd.isNormalized(cp)) continue;
|
||||
|
||||
String decomp = Default.nfd.normalize(cp);
|
||||
String comp = Default.nfc.normalize(cp);
|
||||
@ -979,12 +979,12 @@ can help you narrow these down.
|
||||
if (cp == 0x3131) {
|
||||
System.out.println("Debug: " + idnProhibited
|
||||
+ ", " + idnUnassigned
|
||||
+ ", " + Default.nfkd.normalizationDiffers(cp)
|
||||
+ ", " + !Default.nfkd.isNormalized(cp)
|
||||
+ ", " + Default.ucd.getCodeAndName(Default.nfkc.normalize(cp))
|
||||
+ ", " + Default.ucd.getCodeAndName(Default.nfc.normalize(cp)));
|
||||
}
|
||||
|
||||
if (!idnProhibited && ! idnUnassigned && Default.nfkd.normalizationDiffers(cp)) {
|
||||
if (!idnProhibited && ! idnUnassigned && !Default.nfkd.isNormalized(cp)) {
|
||||
String kc = Default.nfkc.normalize(cp);
|
||||
String c = Default.nfc.normalize(cp);
|
||||
if (kc.equals(c)) continue;
|
||||
@ -1415,7 +1415,7 @@ E0020-E007F; [TAGGING CHARACTERS]
|
||||
Utility.dot(cp);
|
||||
if (!Default.ucd.isAssigned(cp)) continue;
|
||||
if (Default.ucd.isPUA(cp)) continue;
|
||||
if (!normalizationDiffers(cp, j)) continue;
|
||||
if (isNormalized(cp, j)) continue;
|
||||
|
||||
if (cp == 0xFDFB || cp == 0x0140) {
|
||||
System.out.println("debug point");
|
||||
@ -1478,9 +1478,9 @@ E0020-E007F; [TAGGING CHARACTERS]
|
||||
return Default.ucd.getCase(s, FULL, FOLD);
|
||||
}
|
||||
|
||||
static boolean normalizationDiffers(int cp, int j) {
|
||||
if (j < 4) return Default.nf[j].normalizationDiffers(cp);
|
||||
return true;
|
||||
static boolean isNormalized(int cp, int j) {
|
||||
if (j < 4) return !Default.nf[j].isNormalized(cp);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Display labels for the forms tested, indexed like Default.nf with "Fold" as the fifth entry.
// The first label is "NFD" to match its siblings; "Default.nfd" looked like a search-and-replace slip.
private static final String[] NAMES = {"NFD", "NFC", "NFKD", "NFKC", "Fold"};
|
||||
@ -1489,7 +1489,7 @@ E0020-E007F; [TAGGING CHARACTERS]
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
Normalizer nfx = Default.nf[j];
|
||||
System.out.println();
|
||||
System.out.println("Testing normalizationDiffers for " + NAMES[j]);
|
||||
System.out.println("Testing isNormalized for " + NAMES[j]);
|
||||
System.out.println();
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
Utility.dot(i);
|
||||
@ -1497,7 +1497,7 @@ E0020-E007F; [TAGGING CHARACTERS]
|
||||
if (Default.ucd.isPUA(i)) continue;
|
||||
String s = nfx.normalize(i);
|
||||
boolean differs = !s.equals(UTF32.valueOf32(i));
|
||||
boolean call = nfx.normalizationDiffers(i);
|
||||
boolean call = !nfx.isNormalized(i);
|
||||
if (differs != call) {
|
||||
Utility.fixDot();
|
||||
System.out.println("Problem: differs: " + differs
|
||||
@ -1597,7 +1597,7 @@ E0020-E007F; [TAGGING CHARACTERS]
|
||||
|
||||
static public void verifyNormalizationStability2(String version) {
|
||||
|
||||
Default.nfd.normalizationDiffers(0x10300);
|
||||
// Default.nfd.normalizationDiffers(0x10300);
|
||||
|
||||
UCD older = UCD.make(version); // Default.ucd.getPreviousVersion();
|
||||
|
||||
@ -1640,7 +1640,7 @@ E0020-E007F; [TAGGING CHARACTERS]
|
||||
} else {
|
||||
// not in older version.
|
||||
// (1) If there is a decomp, and it is composed of all OLD characters, then it must NOT compose
|
||||
if (Default.nfd.normalizationDiffers(i)) {
|
||||
if (!Default.nfd.isNormalized(i)) {
|
||||
String decomp = Default.nfd.normalize(i);
|
||||
if (noneHaveCategory(decomp, Cn, older)) {
|
||||
String recomp = Default.nfc.normalize(decomp);
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
|
||||
* $Date: 2002/04/24 02:38:52 $
|
||||
* $Revision: 1.15 $
|
||||
* $Date: 2002/05/31 01:41:04 $
|
||||
* $Revision: 1.16 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -22,6 +22,13 @@ import com.ibm.text.UCD.*;
|
||||
public final class Utility { // COMMON UTILITIES
|
||||
|
||||
static final boolean UTF8 = true; // TODO -- make argument
|
||||
|
||||
public static String[] append(String[] array1, String[] array2) {
|
||||
String[] temp = new String[array1.length + array2.length];
|
||||
System.arraycopy(array1, 0, temp, 0, array1.length);
|
||||
System.arraycopy(array2, 0, temp, array1.length, array2.length);
|
||||
return temp;
|
||||
}
|
||||
|
||||
public static String getName(int i, String[] names) {
|
||||
try {
|
||||
|
Loading…
Reference in New Issue
Block a user