fixes for 3.2
X-SVN-Rev: 8130
This commit is contained in:
parent
79d29d4e37
commit
1660406201
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $
|
||||
* $Date: 2002/03/15 00:34:46 $
|
||||
* $Revision: 1.5 $
|
||||
* $Date: 2002/03/20 00:21:43 $
|
||||
* $Revision: 1.6 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -25,7 +25,7 @@ import java.io.*;
|
||||
*/
|
||||
|
||||
public final class ConvertUCD implements UCD_Types {
|
||||
public static final boolean SHOW = true;
|
||||
public static final boolean SHOW = false;
|
||||
public static final boolean DEBUG = false;
|
||||
|
||||
public static int major;
|
||||
@ -201,7 +201,7 @@ public final class ConvertUCD implements UCD_Types {
|
||||
// MAIN!!
|
||||
|
||||
public static void main (String[] args) throws Exception {
|
||||
System.out.println("ConvertUCD");
|
||||
System.out.println("Building binary version of UCD");
|
||||
|
||||
log = new PrintWriter(new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
@ -260,8 +260,17 @@ public final class ConvertUCD implements UCD_Types {
|
||||
UData value = (UData) charData.get(key);
|
||||
value.compact();
|
||||
}
|
||||
UData ud = getEntry(0x2A6D6);
|
||||
|
||||
UData ud;
|
||||
ud = getEntry(0x5e);
|
||||
System.out.println("SPOT-CHECK: 5e: " + ud);
|
||||
|
||||
ud = getEntry(0x130);
|
||||
System.out.println("SPOT-CHECK: 130: " + ud);
|
||||
|
||||
ud = getEntry(0x2A6D6);
|
||||
System.out.println("SPOT-CHECK: 2A6D6: " + ud);
|
||||
|
||||
ud = getEntry(0xFFFF);
|
||||
System.out.println("SPOT-CHECK: FFFF: " + ud);
|
||||
|
||||
@ -493,7 +502,16 @@ public final class ConvertUCD implements UCD_Types {
|
||||
if (type.equals("I")) {
|
||||
data.simpleCaseFolding = val;
|
||||
setBinaryProperty(cps, CaseFoldTurkishI);
|
||||
System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
|
||||
System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting "
|
||||
+ Utility.hex(cps) + ": " + Utility.hex(val));
|
||||
}
|
||||
} else if (labels[0].equals("SpecialCasing") // special handling for special casing
|
||||
&& labels[4].equals("sc")
|
||||
&& parts[4].trim().length() > 0) {
|
||||
if (i < 4) {
|
||||
if (DEBUG) System.out.println("Got special: " + Utility.hex(cps) + ", "
|
||||
+ Utility.hex(key) + ":" + Utility.hex(val));
|
||||
addCharData(cps, "sc", parts[4].trim() + ":" + key + ":" + val);
|
||||
}
|
||||
} else {
|
||||
/*if (key.equals("sn")) { // SKIP UNDEFINED!!
|
||||
@ -782,12 +800,16 @@ public final class ConvertUCD implements UCD_Types {
|
||||
} else if (fieldName.equals("su")) {
|
||||
uData.fullUppercase = fieldValue;
|
||||
} else if (fieldName.equals("sl")) {
|
||||
if (DEBUG) System.out.println("Setting full lowercase to " + Utility.hex(fieldValue) + uData);
|
||||
uData.fullLowercase = fieldValue;
|
||||
} else if (fieldName.equals("st")) {
|
||||
uData.fullTitlecase = fieldValue;
|
||||
|
||||
} else if (fieldName.equals("sc")) {
|
||||
uData.specialCasing = fieldValue;
|
||||
if (uData.specialCasing.length() > 0) {
|
||||
uData.specialCasing += ";";
|
||||
}
|
||||
uData.specialCasing += fieldValue;
|
||||
|
||||
} else if (fieldName.equals("xp")) {
|
||||
uData.binaryProperties |= 1 << Utility.lookup(fieldValue, UCD_Names.BP, true);
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
|
||||
* $Date: 2002/03/15 01:57:01 $
|
||||
* $Revision: 1.11 $
|
||||
* $Date: 2002/03/20 00:21:43 $
|
||||
* $Revision: 1.12 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -285,6 +285,11 @@ public final class DerivedProperty implements UCD_Types {
|
||||
else if (nfx.isTrailing(cp)) return MAYBE;
|
||||
else return "";
|
||||
}
|
||||
|
||||
public String getListingValue(int cp) {
|
||||
return getValue(cp, LONG);
|
||||
}
|
||||
|
||||
boolean hasValue(int cp) { return getValue(cp).length() != 0; }
|
||||
};
|
||||
|
||||
@ -460,6 +465,12 @@ of characters, the first of which has a non-zero combining class.
|
||||
if (isCompEx(cp)) return true;
|
||||
return false;
|
||||
}
|
||||
/*
|
||||
public String getListingValue(int cp) {
|
||||
if (getValueType() != BINARY) return getValue(cp, SHORT);
|
||||
return getProperty(SHORT);
|
||||
}
|
||||
*/
|
||||
};
|
||||
|
||||
dprops[FullCompInclusion] = new UnicodeProperty() {
|
||||
@ -537,37 +548,15 @@ of characters, the first of which has a non-zero combining class.
|
||||
hasUnassigned = true;
|
||||
shortName = "DI";
|
||||
header = header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs - White_Space";
|
||||
+ "\r\n# Generated from <2060..206F, FFF0..FFFB, E0000..E0FFF>"
|
||||
+ "\r\n# + Other_Default_Ignorable_Code_Point + (Cf + Cc + Cs - White_Space)";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
if (0x2060 <= cp && cp <= 0x206F || 0xFFF0 <= cp && cp <= 0xFFFB || 0xE0000 <= cp && cp <= 0xE0FFF) return true;
|
||||
if (ucdData.getBinaryProperty(cp,Other_Default_Ignorable_Code_Point)) return true;
|
||||
if (ucdData.getBinaryProperty(cp, White_space)) return false;
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Cf || cat == Cs || cat == Cc
|
||||
|| ucdData.getBinaryProperty(cp,Reserved_Cf_Code_Point)) return true;
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
GraphemeExtend = 27,
|
||||
GraphemeBase = 28,
|
||||
# GraphemeExtend := Me + Mn + Mc + Other_GraphemeExtend - GraphemeLink
|
||||
# GraphemeBase :=
|
||||
|
||||
*/
|
||||
dprops[GraphemeExtend] = new UnicodeProperty() {
|
||||
{
|
||||
type = DERIVED_CORE;
|
||||
name = "Grapheme_Extend";
|
||||
shortName = "GrExt";
|
||||
header = header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: Me + Mn + Mc + Other_Grapheme_Extend - Grapheme_Link";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
if (ucdData.getBinaryProperty(cp, GraphemeExtend)) return false;
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Me || cat == Mn || cat == Mc
|
||||
|| ucdData.getBinaryProperty(cp,Other_GraphemeExtend)) return true;
|
||||
if (cat == Cf || cat == Cs || cat == Cc) return true;
|
||||
return false;
|
||||
}
|
||||
};
|
||||
@ -576,6 +565,7 @@ of characters, the first of which has a non-zero combining class.
|
||||
{
|
||||
name = "Other_Case_Ignorable";
|
||||
shortName = "OCI";
|
||||
isStandard = false;
|
||||
|
||||
header = header = "# Binary Property";
|
||||
}
|
||||
@ -608,7 +598,7 @@ of characters, the first of which has a non-zero combining class.
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
if (hasSoftDot(cp)) return true;
|
||||
if (!Main.nfkd.hasDecomposition(cp)) return false;
|
||||
if (!Main.nfkd.normalizationDiffers(cp)) return false;
|
||||
String decomp = Main.nfd.normalize(cp);
|
||||
boolean ok = false;
|
||||
for (int i = decomp.length()-1; i >= 0; --i) {
|
||||
@ -630,6 +620,7 @@ of characters, the first of which has a non-zero combining class.
|
||||
dprops[Case_Ignorable] = new UnicodeProperty() {
|
||||
{
|
||||
name = "Case_Ignorable";
|
||||
isStandard = false;
|
||||
shortName = "CI";
|
||||
header = header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: Other_Case_Ignorable + Lm + Mn + Me + Cf";
|
||||
@ -642,6 +633,33 @@ of characters, the first of which has a non-zero combining class.
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
GraphemeExtend = 27,
|
||||
GraphemeBase = 28,
|
||||
# GraphemeExtend := Me + Mn + Mc + Other_GraphemeExtend - GraphemeLink
|
||||
# GraphemeBase :=
|
||||
|
||||
*/
|
||||
dprops[GraphemeExtend] = new UnicodeProperty() {
|
||||
{
|
||||
type = DERIVED_CORE;
|
||||
name = "Grapheme_Extend";
|
||||
shortName = "GrExt";
|
||||
header = header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: Me + Mn + Mc + Other_Grapheme_Extend - Grapheme_Link - CGJ"
|
||||
+ "\r\n# (CGJ = U+034F)";
|
||||
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
if (cp == 0x034F) return false;
|
||||
if (ucdData.getBinaryProperty(cp, GraphemeLink)) return false;
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Me || cat == Mn || cat == Mc
|
||||
|| ucdData.getBinaryProperty(cp,Other_GraphemeExtend)) return true;
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
dprops[GraphemeBase] = new UnicodeProperty() {
|
||||
{
|
||||
type = DERIVED_CORE;
|
||||
@ -649,9 +667,11 @@ of characters, the first of which has a non-zero combining class.
|
||||
shortName = "GrBase";
|
||||
|
||||
header = header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Link - Grapheme_Extend";
|
||||
+ "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp"
|
||||
+ "\r\n# - Grapheme_Extend - Grapheme_Link - CGJ";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
if (cp == 0x034F) return false;
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn || cat == Zl || cat == Zp
|
||||
|| ucdData.getBinaryProperty(cp,GraphemeLink)) return false;
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java,v $
|
||||
* $Date: 2002/03/15 00:34:46 $
|
||||
* $Revision: 1.9 $
|
||||
* $Date: 2002/03/20 00:21:43 $
|
||||
* $Revision: 1.10 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -56,8 +56,7 @@ final class DerivedPropertyLister extends PropertyLister {
|
||||
}
|
||||
|
||||
public String valueName(int cp) {
|
||||
if (uprop.getValueType() != BINARY) return uprop.getValue(cp, LONG);
|
||||
return uprop.getProperty(LONG);
|
||||
return uprop.getListingValue(cp);
|
||||
}
|
||||
|
||||
//public String optionalComment(int cp) {
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
|
||||
* $Date: 2002/03/15 01:57:01 $
|
||||
* $Revision: 1.6 $
|
||||
* $Date: 2002/03/20 00:21:43 $
|
||||
* $Revision: 1.7 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -24,6 +24,8 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
public static boolean COMMENT_DIFFS = false; // ON if we want a comment on mappings != lowercase
|
||||
public static boolean PICK_SHORT = false; // picks short value for SIMPLE if in FULL, changes weighting
|
||||
public static boolean NF_CLOSURE = false; // picks short value for SIMPLE if in FULL, changes weighting
|
||||
static final int CHECK_CHAR = 0x130; // for debugging, change to actual character, otherwise -1
|
||||
|
||||
// PICK_SHORT & NF_CLOSURE = false for old style
|
||||
|
||||
|
||||
@ -83,8 +85,14 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
if (rFull != null && rFull.equals(rSimple)
|
||||
|| (PICK_SHORT && UTF16.countCodePoint(rFull) == 1)) {
|
||||
String type = "C";
|
||||
if (ch == 0x130 || ch == 0x131) type = "I";
|
||||
drawLine(out, ch, type, rFull);
|
||||
if (ch == 0x130) {
|
||||
drawLine(out, ch, "F", "i\u0307");
|
||||
drawLine(out, ch, "I", "\u0130");
|
||||
} else if (ch == 0x131) {
|
||||
drawLine(out, ch, "I", "i");
|
||||
} else {
|
||||
drawLine(out, ch, type, rFull);
|
||||
}
|
||||
} else {
|
||||
if (rFull != null) {
|
||||
drawLine(out, ch, "F", rFull);
|
||||
@ -404,7 +412,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
}
|
||||
|
||||
static boolean isExcluded(int ch) {
|
||||
if (ch == 0x130) return true; // skip LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
// if (ch == 0x130) return true; // skip LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
if (ch == 0x0132 || ch == 0x0133) return true; // skip IJ, ij
|
||||
if (ch == 0x037A) return true; // skip GREEK YPOGEGRAMMENI
|
||||
if (0x249C <= ch && ch <= 0x24B5) return true; // skip PARENTHESIZED LATIN SMALL LETTER A..
|
||||
@ -456,7 +464,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
btitle = Main.nfc.normalize(btitle);
|
||||
}
|
||||
|
||||
if (ch == -1) {// for debugging, change to actual character
|
||||
if (ch == CHECK_CHAR) {
|
||||
System.out.println("Code: " + Main.ucd.getCodeAndName(ch));
|
||||
System.out.println("Decomp: " + Main.ucd.getCodeAndName(decomp));
|
||||
System.out.println("Base: " + Main.ucd.getCodeAndName(base));
|
||||
@ -474,11 +482,17 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
// presumably if there is a single code point, it would already be in the simple mappings
|
||||
|
||||
if (UTF16.countCodePoint(flower) == 1 && UTF16.countCodePoint(fupper) == 1
|
||||
&& UTF16.countCodePoint(title) == 1) continue;
|
||||
&& UTF16.countCodePoint(title) == 1) {
|
||||
if (ch == CHECK_CHAR) System.out.println("Skipping single code point: " + Main.ucd.getCodeAndName(ch));
|
||||
continue;
|
||||
}
|
||||
|
||||
// if there is no change from the base, skip
|
||||
|
||||
if (flower.equals(base) && fupper.equals(base) && ftitle.equals(base)) continue;
|
||||
if (flower.equals(base) && fupper.equals(base) && ftitle.equals(base)) {
|
||||
if (ch == CHECK_CHAR) System.out.println("Skipping equals base: " + Main.ucd.getCodeAndName(ch));
|
||||
continue;
|
||||
}
|
||||
|
||||
// fix special cases
|
||||
// if (flower.equals(blower) && fupper.equals(bupper) && ftitle.equals(btitle)) continue;
|
||||
@ -488,20 +502,26 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
|
||||
// if there are no changes from the original, or the expanded original, skip
|
||||
|
||||
if (flower.equals(lower) && fupper.equals(upper) && ftitle.equals(title)) continue;
|
||||
if (flower.equals(lower) && fupper.equals(upper) && ftitle.equals(title)) {
|
||||
if (ch == CHECK_CHAR) System.out.println("Skipping unchanged: " + Main.ucd.getCodeAndName(ch));
|
||||
continue;
|
||||
}
|
||||
|
||||
String name = Main.ucd.getName(ch);
|
||||
|
||||
int order = name.equals("LATIN SMALL LETTER SHARP S") ? 1
|
||||
: name.indexOf("ARMENIAN SMALL LIGATURE") >= 0 ? 3
|
||||
: name.indexOf("LIGATURE") >= 0 ? 2
|
||||
: name.indexOf("GEGRAMMENI") < 0 ? 4
|
||||
: UTF16.countCodePoint(ftitle) == 1 ? 5
|
||||
: UTF16.countCodePoint(fupper) == 2 ? 6
|
||||
: 7;
|
||||
: ch == 0x130 ? 2
|
||||
: name.indexOf("ARMENIAN SMALL LIGATURE") >= 0 ? 4
|
||||
: name.indexOf("LIGATURE") >= 0 ? 3
|
||||
: name.indexOf("GEGRAMMENI") < 0 ? 5
|
||||
: UTF16.countCodePoint(ftitle) == 1 ? 6
|
||||
: UTF16.countCodePoint(fupper) == 2 ? 7
|
||||
: 8;
|
||||
|
||||
if (ch == CHECK_CHAR) System.out.println("Order: " + order + " for " + Main.ucd.getCodeAndName(ch));
|
||||
|
||||
// HACK
|
||||
boolean denormalize = !normalize && order != 5 && order != 6;
|
||||
boolean denormalize = !normalize && order != 6 && order != 7;
|
||||
|
||||
String mapping = Utility.hex(ch)
|
||||
+ "; " + Utility.hex(flower.equals(base) ? chstr : denormalize ? Main.nfd.normalize(flower) : flower)
|
||||
@ -544,12 +564,15 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
out.println("# The German es-zed is special--the normal mapping is to SS.");
|
||||
out.println("# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>))");
|
||||
break;
|
||||
case 2: out.println("# Ligatures"); break;
|
||||
case 3: skipLine = true; break;
|
||||
case 4: out.println("# No corresponding uppercase precomposed character"); break;
|
||||
case 5: Utility.appendFile("SpecialCasingIota.txt", true, out); break;
|
||||
case 6: out.println("# Some characters with YPOGEGRAMMENI are also have no corresponding titlecases"); break;
|
||||
case 7: skipLine = true; break;
|
||||
case 2:
|
||||
out.println("# Preserve canonical equivalence for I with dot. Turkic is handled below.");
|
||||
break;
|
||||
case 3: out.println("# Ligatures"); break;
|
||||
case 4: skipLine = true; break;
|
||||
case 5: out.println("# No corresponding uppercase precomposed character"); break;
|
||||
case 6: Utility.appendFile("SpecialCasingIota.txt", true, out); break;
|
||||
case 7: out.println("# Some characters with YPOGEGRAMMENI are also have no corresponding titlecases"); break;
|
||||
case 8: skipLine = true; break;
|
||||
}
|
||||
if (!skipLine) out.println();
|
||||
}
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
|
||||
* $Date: 2002/03/15 01:57:01 $
|
||||
* $Revision: 1.15 $
|
||||
* $Date: 2002/03/20 00:21:42 $
|
||||
* $Revision: 1.16 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -1183,7 +1183,7 @@ public class GenerateData implements UCD_Types {
|
||||
Utility.dot(i);
|
||||
if (!Main.ucd.isRepresented(i)) continue;
|
||||
|
||||
if (!Main.nfd.hasDecomposition(i)) {
|
||||
if (!Main.nfd.normalizationDiffers(i)) {
|
||||
if (Main.ucd.getScript(i) == LATIN_SCRIPT) {
|
||||
int cp = i;
|
||||
String hex = "u" + Utility.hex(cp, 4);
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
|
||||
* $Date: 2002/03/15 00:34:46 $
|
||||
* $Revision: 1.9 $
|
||||
* $Date: 2002/03/20 00:21:42 $
|
||||
* $Revision: 1.10 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -77,8 +77,11 @@ public final class Main implements UCD_Types {
|
||||
} else if (arg.equalsIgnoreCase("build")) ConvertUCD.main(new String[]{ucdVersion});
|
||||
else if (arg.equalsIgnoreCase("version")) ucdVersion = args[++i];
|
||||
else if (arg.equalsIgnoreCase("testskippable")) NFSkippable.main(null);
|
||||
else if (arg.equalsIgnoreCase("diffIgnorable")) VerifyUCD.diffIgnorable();
|
||||
else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML();
|
||||
else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
|
||||
else if (arg.equalsIgnoreCase("verifyNormalizationStability")) VerifyUCD.verifyNormalizationStability();
|
||||
|
||||
else if (arg.equalsIgnoreCase("generateHanTransliterator")) GenerateHanTransliterator.main();
|
||||
else if (arg.equalsIgnoreCase("compareBlueberry")) VerifyUCD.compareBlueberry();
|
||||
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyPropertyLister.java,v $
|
||||
* $Date: 2001/12/13 23:35:57 $
|
||||
* $Revision: 1.7 $
|
||||
* $Date: 2002/03/20 00:21:42 $
|
||||
* $Revision: 1.8 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -85,7 +85,7 @@ final class MyPropertyLister extends PropertyLister {
|
||||
|
||||
if (cat == Cn
|
||||
&& propMask != (BINARY_PROPERTIES | Noncharacter_Code_Point)
|
||||
&& propMask != (BINARY_PROPERTIES | Reserved_Cf_Code_Point)
|
||||
&& propMask != (BINARY_PROPERTIES | Other_Default_Ignorable_Code_Point)
|
||||
&& propMask != (CATEGORY | Cn)) {
|
||||
if (BRIDGE) return CONTINUE;
|
||||
else return EXCLUDE;
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
|
||||
* $Date: 2002/03/15 01:57:01 $
|
||||
* $Revision: 1.7 $
|
||||
* $Date: 2002/03/20 00:21:42 $
|
||||
* $Revision: 1.8 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -67,6 +67,13 @@ public final class Normalizer implements UCD_Types {
|
||||
return getName(form);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return string name
|
||||
*/
|
||||
public String getUCDVersion() {
|
||||
return data.getUCDVersion();
|
||||
}
|
||||
|
||||
/**
|
||||
* Does compose?
|
||||
*/
|
||||
@ -120,7 +127,6 @@ public final class Normalizer implements UCD_Types {
|
||||
}
|
||||
|
||||
/**
|
||||
*/
|
||||
private StringBuffer hasDecompositionBuffer = new StringBuffer();
|
||||
|
||||
public boolean hasDecomposition(int cp) {
|
||||
@ -129,6 +135,7 @@ public final class Normalizer implements UCD_Types {
|
||||
if (hasDecompositionBuffer.length() != 1) return true;
|
||||
return cp != hasDecompositionBuffer.charAt(0);
|
||||
}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Does a quick check to see if the string is in the current form. Checks canonical order and
|
||||
@ -427,6 +434,11 @@ public final class Normalizer implements UCD_Types {
|
||||
if (ucd.
|
||||
*/
|
||||
}
|
||||
|
||||
String getUCDVersion() {
|
||||
return ucd.getVersion();
|
||||
}
|
||||
|
||||
/*
|
||||
Problem: differs: true, call: false U+0385 GREEK DIALYTIKA TONOS
|
||||
Problem: differs: true, call: false U+03D3 GREEK UPSILON WITH ACUTE AND HOOK SYMBOL
|
||||
|
@ -48,10 +48,14 @@
|
||||
# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
|
||||
# The following rules handle those cases.
|
||||
|
||||
0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
|
||||
# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
|
||||
# This matches the behavior of the canonically equivalent I-dot_above
|
||||
|
||||
0307; ; 0307; 0307; After_Soft_Dotted; # COMBINING DOT ABOVE
|
||||
0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
|
||||
0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
|
||||
|
||||
# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
|
||||
|
||||
@ -63,7 +67,6 @@
|
||||
0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
|
||||
0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
|
||||
|
||||
# Note: the following cases are already in the UnicodeData file.
|
||||
# Note: the following case is already in the UnicodeData file.
|
||||
|
||||
# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
|
||||
# 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
|
||||
* $Date: 2001/12/13 23:35:57 $
|
||||
* $Revision: 1.9 $
|
||||
* $Date: 2002/03/20 00:21:42 $
|
||||
* $Revision: 1.10 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -1027,6 +1027,19 @@ to guarantee identifier closure.
|
||||
}
|
||||
|
||||
private void fillFromFile(String version) {
|
||||
try {
|
||||
fillFromFile2(version);
|
||||
} catch (ChainException e) {
|
||||
try {
|
||||
ConvertUCD.main(new String[]{version});
|
||||
} catch (Exception e2) {
|
||||
throw new ChainException("Can't build data file for {0}", new Object[]{version}, e2);
|
||||
}
|
||||
fillFromFile2(version);
|
||||
}
|
||||
}
|
||||
|
||||
private void fillFromFile2(String version) {
|
||||
DataInputStream dataIn = null;
|
||||
String fileName = BIN_DIR + "UCD_Data" + version + ".bin";
|
||||
int uDataFileCount = 0;
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
|
||||
* $Date: 2002/03/15 00:34:46 $
|
||||
* $Revision: 1.12 $
|
||||
* $Date: 2002/03/20 00:21:42 $
|
||||
* $Revision: 1.13 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -636,6 +636,7 @@ final class UCD_Names implements UCD_Types {
|
||||
"TEH_MARBUTA",
|
||||
"TETH",
|
||||
"WAW",
|
||||
"SYRIAC WAW",
|
||||
"YEH",
|
||||
"YEH_BARREE",
|
||||
"YEH_WITH_TAIL",
|
||||
@ -652,21 +653,21 @@ final class UCD_Names implements UCD_Types {
|
||||
"BEH",
|
||||
"BETH",
|
||||
"DAL",
|
||||
"DALATH RISH",
|
||||
"DALATH_RISH",
|
||||
"E",
|
||||
"FEH",
|
||||
"FINAL SEMKATH",
|
||||
"FINAL_SEMKATH",
|
||||
"GAF",
|
||||
"GAMAL",
|
||||
"HAH",
|
||||
"HAMZA ON HEH GOAL",
|
||||
"HAMZA_ON_HEH_GOAL",
|
||||
"HE",
|
||||
"HEH",
|
||||
"HEH GOAL",
|
||||
"HEH_GOAL",
|
||||
"HETH",
|
||||
"KAF",
|
||||
"KAPH",
|
||||
"KNOTTED HEH",
|
||||
"KNOTTED_HEH",
|
||||
"LAM",
|
||||
"LAMADH",
|
||||
"MEEM",
|
||||
@ -677,23 +678,24 @@ final class UCD_Names implements UCD_Types {
|
||||
"QAF",
|
||||
"QAPH",
|
||||
"REH",
|
||||
"REVERSED PE",
|
||||
"REVERSED_PE",
|
||||
"SAD",
|
||||
"SADHE",
|
||||
"SEEN",
|
||||
"SEMKATH",
|
||||
"SHIN",
|
||||
"SWASH KAF",
|
||||
"SWASH_KAF",
|
||||
"TAH",
|
||||
"TAW",
|
||||
"TEH MARBUTA",
|
||||
"TEH_MARBUTA",
|
||||
"TETH",
|
||||
"WAW",
|
||||
"SYRIAC WAW",
|
||||
"YEH",
|
||||
"YEH BARREE",
|
||||
"YEH WITH TAIL",
|
||||
"YEH_BARREE",
|
||||
"YEH_WITH_TAIL",
|
||||
"YUDH",
|
||||
"YUDH HE",
|
||||
"YUDH_HE",
|
||||
"ZAIN",
|
||||
};
|
||||
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
|
||||
* $Date: 2002/03/15 00:34:46 $
|
||||
* $Revision: 1.9 $
|
||||
* $Date: 2002/03/20 00:21:42 $
|
||||
* $Revision: 1.10 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -15,7 +15,7 @@ package com.ibm.text.UCD;
|
||||
|
||||
public interface UCD_Types {
|
||||
|
||||
public static final int dVersion = 7; // change to fix the generated file D version. If less than zero, no "d"
|
||||
public static final int dVersion = 8; // change to fix the generated file D version. If less than zero, no "d"
|
||||
|
||||
public static final String BASE_DIR = "C:\\DATA\\";
|
||||
public static final String UCD_DIR = BASE_DIR + "UCD\\";
|
||||
@ -23,7 +23,7 @@ public interface UCD_Types {
|
||||
public static final String GEN_DIR = BASE_DIR + "GEN\\";
|
||||
|
||||
|
||||
static final byte BINARY_FORMAT = 5; // bumped if binary format of UCD changes
|
||||
static final byte BINARY_FORMAT = 6; // bumped if binary format of UCD changes
|
||||
|
||||
// Unicode Property Types
|
||||
static final byte
|
||||
@ -188,7 +188,7 @@ public interface UCD_Types {
|
||||
IDS_TrinaryOperator = 24,
|
||||
Radical = 25,
|
||||
UnifiedIdeograph = 26,
|
||||
Reserved_Cf_Code_Point = 27,
|
||||
Other_Default_Ignorable_Code_Point = 27,
|
||||
Deprecated = 28,
|
||||
Soft_Dotted = 29,
|
||||
Logical_Order_Exception = 30,
|
||||
@ -407,13 +407,14 @@ public static byte
|
||||
TEH_MARBUTA = 41,
|
||||
TETH = 42,
|
||||
WAW = 43,
|
||||
YEH = 44,
|
||||
YEH_BARREE = 45,
|
||||
YEH_WITH_TAIL = 46,
|
||||
YUDH = 47,
|
||||
YUDH_HE = 48,
|
||||
ZAIN = 49,
|
||||
LIMIT_JOINING_GROUP = 50;
|
||||
SYRIAC_WAW = 44,
|
||||
YEH = 45,
|
||||
YEH_BARREE = 46,
|
||||
YEH_WITH_TAIL = 47,
|
||||
YUDH = 48,
|
||||
YUDH_HE = 49,
|
||||
ZAIN = 50,
|
||||
LIMIT_JOINING_GROUP = 51;
|
||||
|
||||
static final byte NFD = 0, NFC = 1, NFKD = 2, NFKC = 3;
|
||||
public static final int
|
||||
|
@ -137,6 +137,14 @@ public abstract class UnicodeProperty implements UCD_Types {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* special hack for NFD/NFKD
|
||||
*/
|
||||
public String getListingValue(int cp) {
|
||||
if (getValueType() != BINARY) return getValue(cp, LONG);
|
||||
return getProperty(LONG);
|
||||
}
|
||||
|
||||
/**
|
||||
* Does it have the propertyValue?
|
||||
*/
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
|
||||
* $Date: 2002/03/15 01:57:01 $
|
||||
* $Revision: 1.10 $
|
||||
* $Date: 2002/03/20 00:21:42 $
|
||||
* $Revision: 1.11 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -674,12 +674,12 @@ can help you narrow these down.
|
||||
if (cp == 0x3131) {
|
||||
System.out.println("Debug: " + idnProhibited
|
||||
+ ", " + idnUnassigned
|
||||
+ ", " + Main.nfkc.hasDecomposition(cp)
|
||||
+ ", " + Main.nfkd.normalizationDiffers(cp)
|
||||
+ ", " + Main.ucd.getCodeAndName(Main.nfkc.normalize(cp))
|
||||
+ ", " + Main.ucd.getCodeAndName(Main.nfc.normalize(cp)));
|
||||
}
|
||||
|
||||
if (!idnProhibited && ! idnUnassigned && Main.nfkc.hasDecomposition(cp)) {
|
||||
if (!idnProhibited && ! idnUnassigned && Main.nfkd.normalizationDiffers(cp)) {
|
||||
String kc = Main.nfkc.normalize(cp);
|
||||
String c = Main.nfc.normalize(cp);
|
||||
if (kc.equals(c)) continue;
|
||||
@ -1045,6 +1045,47 @@ E0020-E007F; [TAGGING CHARACTERS]
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
+ "\r\n# Generated from <2060..206F, FFF0..FFFB, E0000..E0FFF>"
|
||||
+ "\r\n# + Other_Default_Ignorable_Code_Point + (Cf + Cc + Cs - White_Space)";
|
||||
*/
|
||||
|
||||
public static void diffIgnorable () {
|
||||
Main.setUCD();
|
||||
|
||||
UnicodeSet control = UnifiedBinaryProperty.make(CATEGORY + Cf, Main.ucd).getSet();
|
||||
|
||||
System.out.println("Cf");
|
||||
Utility.showSetNames("", control, false, Main.ucd);
|
||||
|
||||
control.addAll(UnifiedBinaryProperty.make(CATEGORY + Cc, Main.ucd).getSet());
|
||||
|
||||
System.out.println("Cf + Cc");
|
||||
Utility.showSetNames("", control, false, Main.ucd);
|
||||
|
||||
control.addAll(UnifiedBinaryProperty.make(CATEGORY + Cs, Main.ucd).getSet());
|
||||
|
||||
System.out.println("Cf + Cc + Cs");
|
||||
Utility.showSetNames("", control, false, Main.ucd);
|
||||
|
||||
control.removeAll(UnifiedBinaryProperty.make(BINARY_PROPERTIES + White_space, Main.ucd).getSet());
|
||||
|
||||
System.out.println("Cf + Cc + Cs - WhiteSpace");
|
||||
Utility.showSetNames("", control, false, Main.ucd);
|
||||
|
||||
control.add(0x2060,0x206f).add(0xFFF0,0xFFFB).add(0xE0000,0xE0FFF);
|
||||
|
||||
System.out.println("(Cf + Cc + Cs - WhiteSpace) + ranges");
|
||||
Utility.showSetNames("", control, false, Main.ucd);
|
||||
|
||||
UnicodeSet odicp = UnifiedBinaryProperty.make(BINARY_PROPERTIES + Other_Default_Ignorable_Code_Point, Main.ucd).getSet();
|
||||
|
||||
odicp.removeAll(control);
|
||||
|
||||
System.out.println("Minimal Default Ignorable Code Points");
|
||||
Utility.showSetNames("", odicp, true, Main.ucd);
|
||||
}
|
||||
|
||||
|
||||
public static void IdentifierTest() {
|
||||
@ -1241,6 +1282,95 @@ E0020-E007F; [TAGGING CHARACTERS]
|
||||
if (cat == Lu || cat == Lt || cat == Ll) return "LC";
|
||||
return Main.ucd.getCategoryID(cp);
|
||||
}
|
||||
|
||||
static public void verifyNormalizationStability() {
|
||||
Main.setUCD();
|
||||
verifyNormalizationStability2("3.1.0");
|
||||
verifyNormalizationStability2("3.0.0");
|
||||
}
|
||||
|
||||
static public void verifyNormalizationStability2(String version) {
|
||||
|
||||
Main.nfd.normalizationDiffers(0x10300);
|
||||
|
||||
UCD older = UCD.make(version); // Main.ucd.getPreviousVersion();
|
||||
|
||||
Normalizer oldNFC = new Normalizer(Normalizer.NFC, older.getVersion());
|
||||
Normalizer oldNFD = new Normalizer(Normalizer.NFD, older.getVersion());
|
||||
Normalizer oldNFKC = new Normalizer(Normalizer.NFKC, older.getVersion());
|
||||
Normalizer oldNFKD = new Normalizer(Normalizer.NFKD, older.getVersion());
|
||||
|
||||
System.out.println("Testing " + Main.nfd.getUCDVersion() + " against " + oldNFD.getUCDVersion());
|
||||
|
||||
for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
Utility.dot(i);
|
||||
if (!Main.ucd.isAssigned(i)) continue;
|
||||
byte cat = Main.ucd.getCategory(i);
|
||||
if (cat == Cs || cat == PRIVATE_USE) continue;
|
||||
|
||||
if (i == 0x5e) {
|
||||
System.out.println("debug");
|
||||
String test1 = Main.nfkd.normalize(i);
|
||||
String test2 = oldNFKD.normalize(i);
|
||||
System.out.println("Testing (new/old)" + Main.ucd.getCodeAndName(i));
|
||||
System.out.println("\t" + Main.ucd.getCodeAndName(test1));
|
||||
System.out.println("\t" + Main.ucd.getCodeAndName(test2));
|
||||
}
|
||||
|
||||
if (older.isAssigned(i)) {
|
||||
|
||||
int newCan = Main.ucd.getCombiningClass(i);
|
||||
int oldCan = older.getCombiningClass(i);
|
||||
if (newCan != oldCan) {
|
||||
System.out.println("FAILS CCC STABILITY: " + newCan + " != " + oldCan
|
||||
+ "; " + Main.ucd.getCodeAndName(i));
|
||||
}
|
||||
|
||||
verifyEquals(i, "NFD STABILITY (new/old)", Main.nfd.normalize(i), oldNFD.normalize(i));
|
||||
verifyEquals(i, "NFC STABILITY (new/old)", Main.nfc.normalize(i), oldNFC.normalize(i));
|
||||
verifyEquals(i, "NFKD STABILITY (new/old)", Main.nfkd.normalize(i), oldNFKD.normalize(i));
|
||||
verifyEquals(i, "NFKC STABILITY (new/old)", Main.nfkc.normalize(i), oldNFKC.normalize(i));
|
||||
|
||||
} else {
|
||||
// not in older version.
|
||||
// (1) If there is a decomp, and it is composed of all OLD characters, then it must NOT compose
|
||||
if (Main.nfd.normalizationDiffers(i)) {
|
||||
String decomp = Main.nfd.normalize(i);
|
||||
if (noneHaveCategory(decomp, Cn, older)) {
|
||||
String recomp = Main.nfc.normalize(decomp);
|
||||
if (recomp.equals(UTF16.valueOf(i))) {
|
||||
Utility.fixDot();
|
||||
System.out.println("FAILS COMP STABILITY: " + Main.ucd.getCodeAndName(i));
|
||||
System.out.println("\t" + Main.ucd.getCodeAndName(decomp));
|
||||
System.out.println("\t" + Main.ucd.getCodeAndName(recomp));
|
||||
System.out.println();
|
||||
throw new IllegalArgumentException("Comp stability");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static boolean noneHaveCategory(String s, byte cat, UCD ucd) {
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(s, i);
|
||||
byte cat2 = ucd.getCategory(i);
|
||||
if (cat == cat2) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public static void verifyEquals(int cp, String message, String a, String b) {
|
||||
if (!a.equals(b)) {
|
||||
Utility.fixDot();
|
||||
System.out.println("FAILS " + message + ": " + Main.ucd.getCodeAndName(cp));
|
||||
System.out.println("\t" + Main.ucd.getCodeAndName(a));
|
||||
System.out.println("\t" + Main.ucd.getCodeAndName(b));
|
||||
System.out.println();
|
||||
}
|
||||
}
|
||||
|
||||
public static void checkAgainstUInfo() {
|
||||
/*
|
||||
|
Loading…
Reference in New Issue
Block a user