fixes for 3.2

X-SVN-Rev: 8130
This commit is contained in:
Mark Davis 2002-03-20 00:21:43 +00:00
parent 79d29d4e37
commit 1660406201
14 changed files with 345 additions and 109 deletions

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $
* $Date: 2002/03/15 00:34:46 $
* $Revision: 1.5 $
* $Date: 2002/03/20 00:21:43 $
* $Revision: 1.6 $
*
*******************************************************************************
*/
@ -25,7 +25,7 @@ import java.io.*;
*/
public final class ConvertUCD implements UCD_Types {
public static final boolean SHOW = true;
public static final boolean SHOW = false;
public static final boolean DEBUG = false;
public static int major;
@ -201,7 +201,7 @@ public final class ConvertUCD implements UCD_Types {
// MAIN!!
public static void main (String[] args) throws Exception {
System.out.println("ConvertUCD");
System.out.println("Building binary version of UCD");
log = new PrintWriter(new BufferedWriter(
new OutputStreamWriter(
@ -260,8 +260,17 @@ public final class ConvertUCD implements UCD_Types {
UData value = (UData) charData.get(key);
value.compact();
}
UData ud = getEntry(0x2A6D6);
UData ud;
ud = getEntry(0x5e);
System.out.println("SPOT-CHECK: 5e: " + ud);
ud = getEntry(0x130);
System.out.println("SPOT-CHECK: 130: " + ud);
ud = getEntry(0x2A6D6);
System.out.println("SPOT-CHECK: 2A6D6: " + ud);
ud = getEntry(0xFFFF);
System.out.println("SPOT-CHECK: FFFF: " + ud);
@ -493,7 +502,16 @@ public final class ConvertUCD implements UCD_Types {
if (type.equals("I")) {
data.simpleCaseFolding = val;
setBinaryProperty(cps, CaseFoldTurkishI);
System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val));
System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting "
+ Utility.hex(cps) + ": " + Utility.hex(val));
}
} else if (labels[0].equals("SpecialCasing") // special handling for special casing
&& labels[4].equals("sc")
&& parts[4].trim().length() > 0) {
if (i < 4) {
if (DEBUG) System.out.println("Got special: " + Utility.hex(cps) + ", "
+ Utility.hex(key) + ":" + Utility.hex(val));
addCharData(cps, "sc", parts[4].trim() + ":" + key + ":" + val);
}
} else {
/*if (key.equals("sn")) { // SKIP UNDEFINED!!
@ -782,12 +800,16 @@ public final class ConvertUCD implements UCD_Types {
} else if (fieldName.equals("su")) {
uData.fullUppercase = fieldValue;
} else if (fieldName.equals("sl")) {
if (DEBUG) System.out.println("Setting full lowercase to " + Utility.hex(fieldValue) + uData);
uData.fullLowercase = fieldValue;
} else if (fieldName.equals("st")) {
uData.fullTitlecase = fieldValue;
} else if (fieldName.equals("sc")) {
uData.specialCasing = fieldValue;
if (uData.specialCasing.length() > 0) {
uData.specialCasing += ";";
}
uData.specialCasing += fieldValue;
} else if (fieldName.equals("xp")) {
uData.binaryProperties |= 1 << Utility.lookup(fieldValue, UCD_Names.BP, true);

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
* $Date: 2002/03/15 01:57:01 $
* $Revision: 1.11 $
* $Date: 2002/03/20 00:21:43 $
* $Revision: 1.12 $
*
*******************************************************************************
*/
@ -285,6 +285,11 @@ public final class DerivedProperty implements UCD_Types {
else if (nfx.isTrailing(cp)) return MAYBE;
else return "";
}
// Value used when this property is listed for a code point: always the LONG form
// returned by getValue(cp, LONG).
// NOTE(review): presumably overrides UnicodeProperty.getListingValue -- confirm the
// enclosing class actually extends UnicodeProperty.
public String getListingValue(int cp) {
return getValue(cp, LONG);
}
// A code point "has" this property exactly when getValue(cp) yields a non-empty string.
boolean hasValue(int cp) { return getValue(cp).length() != 0; }
};
@ -460,6 +465,12 @@ of characters, the first of which has a non-zero combining class.
if (isCompEx(cp)) return true;
return false;
}
/*
public String getListingValue(int cp) {
if (getValueType() != BINARY) return getValue(cp, SHORT);
return getProperty(SHORT);
}
*/
};
dprops[FullCompInclusion] = new UnicodeProperty() {
@ -537,37 +548,15 @@ of characters, the first of which has a non-zero combining class.
hasUnassigned = true;
shortName = "DI";
header = header = "# Derived Property: " + name
+ "\r\n# Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs - White_Space";
+ "\r\n# Generated from <2060..206F, FFF0..FFFB, E0000..E0FFF>"
+ "\r\n# + Other_Default_Ignorable_Code_Point + (Cf + Cc + Cs - White_Space)";
}
boolean hasValue(int cp) {
if (0x2060 <= cp && cp <= 0x206F || 0xFFF0 <= cp && cp <= 0xFFFB || 0xE0000 <= cp && cp <= 0xE0FFF) return true;
if (ucdData.getBinaryProperty(cp,Other_Default_Ignorable_Code_Point)) return true;
if (ucdData.getBinaryProperty(cp, White_space)) return false;
byte cat = ucdData.getCategory(cp);
if (cat == Cf || cat == Cs || cat == Cc
|| ucdData.getBinaryProperty(cp,Reserved_Cf_Code_Point)) return true;
return false;
}
};
/*
GraphemeExtend = 27,
GraphemeBase = 28,
# GraphemeExtend := Me + Mn + Mc + Other_GraphemeExtend - GraphemeLink
# GraphemeBase :=
*/
dprops[GraphemeExtend] = new UnicodeProperty() {
{
type = DERIVED_CORE;
name = "Grapheme_Extend";
shortName = "GrExt";
header = header = "# Derived Property: " + name
+ "\r\n# Generated from: Me + Mn + Mc + Other_Grapheme_Extend - Grapheme_Link";
}
boolean hasValue(int cp) {
if (ucdData.getBinaryProperty(cp, GraphemeExtend)) return false;
byte cat = ucdData.getCategory(cp);
if (cat == Me || cat == Mn || cat == Mc
|| ucdData.getBinaryProperty(cp,Other_GraphemeExtend)) return true;
if (cat == Cf || cat == Cs || cat == Cc) return true;
return false;
}
};
@ -576,6 +565,7 @@ of characters, the first of which has a non-zero combining class.
{
name = "Other_Case_Ignorable";
shortName = "OCI";
isStandard = false;
header = header = "# Binary Property";
}
@ -608,7 +598,7 @@ of characters, the first of which has a non-zero combining class.
}
boolean hasValue(int cp) {
if (hasSoftDot(cp)) return true;
if (!Main.nfkd.hasDecomposition(cp)) return false;
if (!Main.nfkd.normalizationDiffers(cp)) return false;
String decomp = Main.nfd.normalize(cp);
boolean ok = false;
for (int i = decomp.length()-1; i >= 0; --i) {
@ -630,6 +620,7 @@ of characters, the first of which has a non-zero combining class.
dprops[Case_Ignorable] = new UnicodeProperty() {
{
name = "Case_Ignorable";
isStandard = false;
shortName = "CI";
header = header = "# Derived Property: " + name
+ "\r\n# Generated from: Other_Case_Ignorable + Lm + Mn + Me + Cf";
@ -642,6 +633,33 @@ of characters, the first of which has a non-zero combining class.
}
};
/*
GraphemeExtend = 27,
GraphemeBase = 28,
# GraphemeExtend := Me + Mn + Mc + Other_GraphemeExtend - GraphemeLink
# GraphemeBase :=
*/
dprops[GraphemeExtend] = new UnicodeProperty() {
    // Instance initializer: metadata for the derived Grapheme_Extend property.
    // 'name' must be assigned before 'header', which embeds it.
    {
        type = DERIVED_CORE;
        name = "Grapheme_Extend";
        shortName = "GrExt";
        header = "# Derived Property: " + name
            + "\r\n# Generated from: Me + Mn + Mc + Other_Grapheme_Extend - Grapheme_Link - CGJ"
            + "\r\n# (CGJ = U+034F)";
    }

    /**
     * Membership test matching the formula in the header:
     * Me + Mn + Mc + Other_Grapheme_Extend - Grapheme_Link - CGJ.
     */
    boolean hasValue(int cp) {
        // Exclusions are checked first: U+034F (CGJ) and anything flagged GraphemeLink.
        if (cp == 0x034F || ucdData.getBinaryProperty(cp, GraphemeLink)) {
            return false;
        }
        final byte category = ucdData.getCategory(cp);
        final boolean isMark = category == Me || category == Mn || category == Mc;
        // The binary property is only consulted when the category test fails.
        return isMark || ucdData.getBinaryProperty(cp, Other_GraphemeExtend);
    }
};
dprops[GraphemeBase] = new UnicodeProperty() {
{
type = DERIVED_CORE;
@ -649,9 +667,11 @@ of characters, the first of which has a non-zero combining class.
shortName = "GrBase";
header = header = "# Derived Property: " + name
+ "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Link - Grapheme_Extend";
+ "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp"
+ "\r\n# - Grapheme_Extend - Grapheme_Link - CGJ";
}
boolean hasValue(int cp) {
if (cp == 0x034F) return false;
byte cat = ucdData.getCategory(cp);
if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn || cat == Zl || cat == Zp
|| ucdData.getBinaryProperty(cp,GraphemeLink)) return false;

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java,v $
* $Date: 2002/03/15 00:34:46 $
* $Revision: 1.9 $
* $Date: 2002/03/20 00:21:43 $
* $Revision: 1.10 $
*
*******************************************************************************
*/
@ -56,8 +56,7 @@ final class DerivedPropertyLister extends PropertyLister {
}
public String valueName(int cp) {
if (uprop.getValueType() != BINARY) return uprop.getValue(cp, LONG);
return uprop.getProperty(LONG);
return uprop.getListingValue(cp);
}
//public String optionalComment(int cp) {

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
* $Date: 2002/03/15 01:57:01 $
* $Revision: 1.6 $
* $Date: 2002/03/20 00:21:43 $
* $Revision: 1.7 $
*
*******************************************************************************
*/
@ -24,6 +24,8 @@ public class GenerateCaseFolding implements UCD_Types {
public static boolean COMMENT_DIFFS = false; // ON if we want a comment on mappings != lowercase
public static boolean PICK_SHORT = false; // picks short value for SIMPLE if in FULL, changes weighting
public static boolean NF_CLOSURE = false; // picks short value for SIMPLE if in FULL, changes weighting
static final int CHECK_CHAR = 0x130; // for debugging, change to actual character, otherwise -1
// PICK_SHORT & NF_CLOSURE = false for old style
@ -83,8 +85,14 @@ public class GenerateCaseFolding implements UCD_Types {
if (rFull != null && rFull.equals(rSimple)
|| (PICK_SHORT && UTF16.countCodePoint(rFull) == 1)) {
String type = "C";
if (ch == 0x130 || ch == 0x131) type = "I";
drawLine(out, ch, type, rFull);
if (ch == 0x130) {
drawLine(out, ch, "F", "i\u0307");
drawLine(out, ch, "I", "\u0130");
} else if (ch == 0x131) {
drawLine(out, ch, "I", "i");
} else {
drawLine(out, ch, type, rFull);
}
} else {
if (rFull != null) {
drawLine(out, ch, "F", rFull);
@ -404,7 +412,7 @@ public class GenerateCaseFolding implements UCD_Types {
}
static boolean isExcluded(int ch) {
if (ch == 0x130) return true; // skip LATIN CAPITAL LETTER I WITH DOT ABOVE
// if (ch == 0x130) return true; // skip LATIN CAPITAL LETTER I WITH DOT ABOVE
if (ch == 0x0132 || ch == 0x0133) return true; // skip IJ, ij
if (ch == 0x037A) return true; // skip GREEK YPOGEGRAMMENI
if (0x249C <= ch && ch <= 0x24B5) return true; // skip PARENTHESIZED LATIN SMALL LETTER A..
@ -456,7 +464,7 @@ public class GenerateCaseFolding implements UCD_Types {
btitle = Main.nfc.normalize(btitle);
}
if (ch == -1) {// for debugging, change to actual character
if (ch == CHECK_CHAR) {
System.out.println("Code: " + Main.ucd.getCodeAndName(ch));
System.out.println("Decomp: " + Main.ucd.getCodeAndName(decomp));
System.out.println("Base: " + Main.ucd.getCodeAndName(base));
@ -474,11 +482,17 @@ public class GenerateCaseFolding implements UCD_Types {
// presumably if there is a single code point, it would already be in the simple mappings
if (UTF16.countCodePoint(flower) == 1 && UTF16.countCodePoint(fupper) == 1
&& UTF16.countCodePoint(title) == 1) continue;
&& UTF16.countCodePoint(title) == 1) {
if (ch == CHECK_CHAR) System.out.println("Skipping single code point: " + Main.ucd.getCodeAndName(ch));
continue;
}
// if there is no change from the base, skip
if (flower.equals(base) && fupper.equals(base) && ftitle.equals(base)) continue;
if (flower.equals(base) && fupper.equals(base) && ftitle.equals(base)) {
if (ch == CHECK_CHAR) System.out.println("Skipping equals base: " + Main.ucd.getCodeAndName(ch));
continue;
}
// fix special cases
// if (flower.equals(blower) && fupper.equals(bupper) && ftitle.equals(btitle)) continue;
@ -488,20 +502,26 @@ public class GenerateCaseFolding implements UCD_Types {
// if there are no changes from the original, or the expanded original, skip
if (flower.equals(lower) && fupper.equals(upper) && ftitle.equals(title)) continue;
if (flower.equals(lower) && fupper.equals(upper) && ftitle.equals(title)) {
if (ch == CHECK_CHAR) System.out.println("Skipping unchanged: " + Main.ucd.getCodeAndName(ch));
continue;
}
String name = Main.ucd.getName(ch);
int order = name.equals("LATIN SMALL LETTER SHARP S") ? 1
: name.indexOf("ARMENIAN SMALL LIGATURE") >= 0 ? 3
: name.indexOf("LIGATURE") >= 0 ? 2
: name.indexOf("GEGRAMMENI") < 0 ? 4
: UTF16.countCodePoint(ftitle) == 1 ? 5
: UTF16.countCodePoint(fupper) == 2 ? 6
: 7;
: ch == 0x130 ? 2
: name.indexOf("ARMENIAN SMALL LIGATURE") >= 0 ? 4
: name.indexOf("LIGATURE") >= 0 ? 3
: name.indexOf("GEGRAMMENI") < 0 ? 5
: UTF16.countCodePoint(ftitle) == 1 ? 6
: UTF16.countCodePoint(fupper) == 2 ? 7
: 8;
if (ch == CHECK_CHAR) System.out.println("Order: " + order + " for " + Main.ucd.getCodeAndName(ch));
// HACK
boolean denormalize = !normalize && order != 5 && order != 6;
boolean denormalize = !normalize && order != 6 && order != 7;
String mapping = Utility.hex(ch)
+ "; " + Utility.hex(flower.equals(base) ? chstr : denormalize ? Main.nfd.normalize(flower) : flower)
@ -544,12 +564,15 @@ public class GenerateCaseFolding implements UCD_Types {
out.println("# The German es-zed is special--the normal mapping is to SS.");
out.println("# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>))");
break;
case 2: out.println("# Ligatures"); break;
case 3: skipLine = true; break;
case 4: out.println("# No corresponding uppercase precomposed character"); break;
case 5: Utility.appendFile("SpecialCasingIota.txt", true, out); break;
case 6: out.println("# Some characters with YPOGEGRAMMENI are also have no corresponding titlecases"); break;
case 7: skipLine = true; break;
case 2:
out.println("# Preserve canonical equivalence for I with dot. Turkic is handled below.");
break;
case 3: out.println("# Ligatures"); break;
case 4: skipLine = true; break;
case 5: out.println("# No corresponding uppercase precomposed character"); break;
case 6: Utility.appendFile("SpecialCasingIota.txt", true, out); break;
case 7: out.println("# Some characters with YPOGEGRAMMENI are also have no corresponding titlecases"); break;
case 8: skipLine = true; break;
}
if (!skipLine) out.println();
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
* $Date: 2002/03/15 01:57:01 $
* $Revision: 1.15 $
* $Date: 2002/03/20 00:21:42 $
* $Revision: 1.16 $
*
*******************************************************************************
*/
@ -1183,7 +1183,7 @@ public class GenerateData implements UCD_Types {
Utility.dot(i);
if (!Main.ucd.isRepresented(i)) continue;
if (!Main.nfd.hasDecomposition(i)) {
if (!Main.nfd.normalizationDiffers(i)) {
if (Main.ucd.getScript(i) == LATIN_SCRIPT) {
int cp = i;
String hex = "u" + Utility.hex(cp, 4);

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
* $Date: 2002/03/15 00:34:46 $
* $Revision: 1.9 $
* $Date: 2002/03/20 00:21:42 $
* $Revision: 1.10 $
*
*******************************************************************************
*/
@ -77,8 +77,11 @@ public final class Main implements UCD_Types {
} else if (arg.equalsIgnoreCase("build")) ConvertUCD.main(new String[]{ucdVersion});
else if (arg.equalsIgnoreCase("version")) ucdVersion = args[++i];
else if (arg.equalsIgnoreCase("testskippable")) NFSkippable.main(null);
else if (arg.equalsIgnoreCase("diffIgnorable")) VerifyUCD.diffIgnorable();
else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML();
else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
else if (arg.equalsIgnoreCase("verifyNormalizationStability")) VerifyUCD.verifyNormalizationStability();
else if (arg.equalsIgnoreCase("generateHanTransliterator")) GenerateHanTransliterator.main();
else if (arg.equalsIgnoreCase("compareBlueberry")) VerifyUCD.compareBlueberry();

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyPropertyLister.java,v $
* $Date: 2001/12/13 23:35:57 $
* $Revision: 1.7 $
* $Date: 2002/03/20 00:21:42 $
* $Revision: 1.8 $
*
*******************************************************************************
*/
@ -85,7 +85,7 @@ final class MyPropertyLister extends PropertyLister {
if (cat == Cn
&& propMask != (BINARY_PROPERTIES | Noncharacter_Code_Point)
&& propMask != (BINARY_PROPERTIES | Reserved_Cf_Code_Point)
&& propMask != (BINARY_PROPERTIES | Other_Default_Ignorable_Code_Point)
&& propMask != (CATEGORY | Cn)) {
if (BRIDGE) return CONTINUE;
else return EXCLUDE;

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
* $Date: 2002/03/15 01:57:01 $
* $Revision: 1.7 $
* $Date: 2002/03/20 00:21:42 $
* $Revision: 1.8 $
*
*******************************************************************************
*/
@ -67,6 +67,13 @@ public final class Normalizer implements UCD_Types {
return getName(form);
}
/**
* Returns the UCD version string reported by this normalizer's underlying data
* (delegates to data.getUCDVersion()).
*/
public String getUCDVersion() {
return data.getUCDVersion();
}
/**
* Does compose?
*/
@ -120,7 +127,6 @@ public final class Normalizer implements UCD_Types {
}
/**
*/
private StringBuffer hasDecompositionBuffer = new StringBuffer();
public boolean hasDecomposition(int cp) {
@ -129,6 +135,7 @@ public final class Normalizer implements UCD_Types {
if (hasDecompositionBuffer.length() != 1) return true;
return cp != hasDecompositionBuffer.charAt(0);
}
*/
/**
* Does a quick check to see if the string is in the current form. Checks canonical order and
@ -427,6 +434,11 @@ public final class Normalizer implements UCD_Types {
if (ucd.
*/
}
// Reports the version of the UCD instance this object reads from
// (delegates to ucd.getVersion()).
String getUCDVersion() {
return ucd.getVersion();
}
/*
Problem: differs: true, call: false U+0385 GREEK DIALYTIKA TONOS
Problem: differs: true, call: false U+03D3 GREEK UPSILON WITH ACUTE AND HOOK SYMBOL

View File

@ -48,10 +48,14 @@
# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
# The following rules handle those cases.
0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
# This matches the behavior of the canonically equivalent I-dot_above
0307; ; 0307; 0307; After_Soft_Dotted; # COMBINING DOT ABOVE
0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
@ -63,7 +67,6 @@
0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
# Note: the following cases are already in the UnicodeData file.
# Note: the following case is already in the UnicodeData file.
# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
# 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
* $Date: 2001/12/13 23:35:57 $
* $Revision: 1.9 $
* $Date: 2002/03/20 00:21:42 $
* $Revision: 1.10 $
*
*******************************************************************************
*/
@ -1027,6 +1027,19 @@ to guarantee identifier closure.
}
/**
 * Loads the data for the given version via fillFromFile2. If that first attempt
 * throws a ChainException, runs ConvertUCD.main for the same version and then
 * tries the load once more.
 *
 * @param version version string passed both to the loader and to ConvertUCD.main
 * @throws ChainException if ConvertUCD.main fails (the cause is chained), or if the
 *         second load attempt fails
 */
private void fillFromFile(String version) {
    try {
        fillFromFile2(version);
        return; // loaded on the first attempt; nothing else to do
    } catch (ChainException firstAttemptFailed) {
        // Intentionally not rethrown: the data is rebuilt below and the load is retried.
    }

    try {
        ConvertUCD.main(new String[]{version});
    } catch (Exception buildFailure) {
        throw new ChainException("Can't build data file for {0}", new Object[]{version}, buildFailure);
    }

    // Second attempt; a failure here propagates to the caller.
    fillFromFile2(version);
}
private void fillFromFile2(String version) {
DataInputStream dataIn = null;
String fileName = BIN_DIR + "UCD_Data" + version + ".bin";
int uDataFileCount = 0;

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
* $Date: 2002/03/15 00:34:46 $
* $Revision: 1.12 $
* $Date: 2002/03/20 00:21:42 $
* $Revision: 1.13 $
*
*******************************************************************************
*/
@ -636,6 +636,7 @@ final class UCD_Names implements UCD_Types {
"TEH_MARBUTA",
"TETH",
"WAW",
"SYRIAC WAW",
"YEH",
"YEH_BARREE",
"YEH_WITH_TAIL",
@ -652,21 +653,21 @@ final class UCD_Names implements UCD_Types {
"BEH",
"BETH",
"DAL",
"DALATH RISH",
"DALATH_RISH",
"E",
"FEH",
"FINAL SEMKATH",
"FINAL_SEMKATH",
"GAF",
"GAMAL",
"HAH",
"HAMZA ON HEH GOAL",
"HAMZA_ON_HEH_GOAL",
"HE",
"HEH",
"HEH GOAL",
"HEH_GOAL",
"HETH",
"KAF",
"KAPH",
"KNOTTED HEH",
"KNOTTED_HEH",
"LAM",
"LAMADH",
"MEEM",
@ -677,23 +678,24 @@ final class UCD_Names implements UCD_Types {
"QAF",
"QAPH",
"REH",
"REVERSED PE",
"REVERSED_PE",
"SAD",
"SADHE",
"SEEN",
"SEMKATH",
"SHIN",
"SWASH KAF",
"SWASH_KAF",
"TAH",
"TAW",
"TEH MARBUTA",
"TEH_MARBUTA",
"TETH",
"WAW",
"SYRIAC WAW",
"YEH",
"YEH BARREE",
"YEH WITH TAIL",
"YEH_BARREE",
"YEH_WITH_TAIL",
"YUDH",
"YUDH HE",
"YUDH_HE",
"ZAIN",
};

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
* $Date: 2002/03/15 00:34:46 $
* $Revision: 1.9 $
* $Date: 2002/03/20 00:21:42 $
* $Revision: 1.10 $
*
*******************************************************************************
*/
@ -15,7 +15,7 @@ package com.ibm.text.UCD;
public interface UCD_Types {
public static final int dVersion = 7; // change to fix the generated file D version. If less than zero, no "d"
public static final int dVersion = 8; // change to fix the generated file D version. If less than zero, no "d"
public static final String BASE_DIR = "C:\\DATA\\";
public static final String UCD_DIR = BASE_DIR + "UCD\\";
@ -23,7 +23,7 @@ public interface UCD_Types {
public static final String GEN_DIR = BASE_DIR + "GEN\\";
static final byte BINARY_FORMAT = 5; // bumped if binary format of UCD changes
static final byte BINARY_FORMAT = 6; // bumped if binary format of UCD changes
// Unicode Property Types
static final byte
@ -188,7 +188,7 @@ public interface UCD_Types {
IDS_TrinaryOperator = 24,
Radical = 25,
UnifiedIdeograph = 26,
Reserved_Cf_Code_Point = 27,
Other_Default_Ignorable_Code_Point = 27,
Deprecated = 28,
Soft_Dotted = 29,
Logical_Order_Exception = 30,
@ -407,13 +407,14 @@ public static byte
TEH_MARBUTA = 41,
TETH = 42,
WAW = 43,
YEH = 44,
YEH_BARREE = 45,
YEH_WITH_TAIL = 46,
YUDH = 47,
YUDH_HE = 48,
ZAIN = 49,
LIMIT_JOINING_GROUP = 50;
SYRIAC_WAW = 44,
YEH = 45,
YEH_BARREE = 46,
YEH_WITH_TAIL = 47,
YUDH = 48,
YUDH_HE = 49,
ZAIN = 50,
LIMIT_JOINING_GROUP = 51;
static final byte NFD = 0, NFC = 1, NFKD = 2, NFKC = 3;
public static final int

View File

@ -137,6 +137,14 @@ public abstract class UnicodeProperty implements UCD_Types {
}
}
/**
 * Value to print for a code point in a property listing.
 * For a BINARY value type this is the property's own LONG name; for every other
 * value type it is the LONG form of the code point's value. Subclasses can override
 * this (the original note calls it a special hack for NFD/NFKD).
 */
public String getListingValue(int cp) {
    return getValueType() == BINARY
        ? getProperty(LONG)
        : getValue(cp, LONG);
}
/**
* Does it have the propertyValue?
*/

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
* $Date: 2002/03/15 01:57:01 $
* $Revision: 1.10 $
* $Date: 2002/03/20 00:21:42 $
* $Revision: 1.11 $
*
*******************************************************************************
*/
@ -674,12 +674,12 @@ can help you narrow these down.
if (cp == 0x3131) {
System.out.println("Debug: " + idnProhibited
+ ", " + idnUnassigned
+ ", " + Main.nfkc.hasDecomposition(cp)
+ ", " + Main.nfkd.normalizationDiffers(cp)
+ ", " + Main.ucd.getCodeAndName(Main.nfkc.normalize(cp))
+ ", " + Main.ucd.getCodeAndName(Main.nfc.normalize(cp)));
}
if (!idnProhibited && ! idnUnassigned && Main.nfkc.hasDecomposition(cp)) {
if (!idnProhibited && ! idnUnassigned && Main.nfkd.normalizationDiffers(cp)) {
String kc = Main.nfkc.normalize(cp);
String c = Main.nfc.normalize(cp);
if (kc.equals(c)) continue;
@ -1045,6 +1045,47 @@ E0020-E007F; [TAGGING CHARACTERS]
}
return result;
}
/*
+ "\r\n# Generated from <2060..206F, FFF0..FFFB, E0000..E0FFF>"
+ "\r\n# + Other_Default_Ignorable_Code_Point + (Cf + Cc + Cs - White_Space)";
*/
/**
 * Diagnostic dump: builds (Cf + Cc + Cs - White_space) plus three fixed ranges step
 * by step, printing the running set after each step, and finally prints the
 * Other_Default_Ignorable_Code_Point entries that are not already in that set.
 */
public static void diffIgnorable () {
    Main.setUCD();

    // Categories folded in one at a time, with the label printed after each step.
    int[] categories = {Cf, Cc, Cs};
    String[] labels = {"Cf", "Cf + Cc", "Cf + Cc + Cs"};

    UnicodeSet running = null;
    for (int step = 0; step < categories.length; ++step) {
        UnicodeSet categorySet = UnifiedBinaryProperty.make(CATEGORY + categories[step], Main.ucd).getSet();
        if (running == null) {
            // The first category's set itself becomes the accumulator.
            running = categorySet;
        } else {
            running.addAll(categorySet);
        }
        System.out.println(labels[step]);
        Utility.showSetNames("", running, false, Main.ucd);
    }

    UnicodeSet whiteSpace = UnifiedBinaryProperty.make(BINARY_PROPERTIES + White_space, Main.ucd).getSet();
    running.removeAll(whiteSpace);
    System.out.println("Cf + Cc + Cs - WhiteSpace");
    Utility.showSetNames("", running, false, Main.ucd);

    running.add(0x2060,0x206f).add(0xFFF0,0xFFFB).add(0xE0000,0xE0FFF);
    System.out.println("(Cf + Cc + Cs - WhiteSpace) + ranges");
    Utility.showSetNames("", running, false, Main.ucd);

    // Whatever remains of Other_Default_Ignorable_Code_Point is not covered by the set above.
    UnicodeSet remainder = UnifiedBinaryProperty.make(BINARY_PROPERTIES + Other_Default_Ignorable_Code_Point, Main.ucd).getSet();
    remainder.removeAll(running);
    System.out.println("Minimal Default Ignorable Code Points");
    Utility.showSetNames("", remainder, true, Main.ucd);
}
public static void IdentifierTest() {
@ -1241,6 +1282,95 @@ E0020-E007F; [TAGGING CHARACTERS]
if (cat == Lu || cat == Lt || cat == Ll) return "LC";
return Main.ucd.getCategoryID(cp);
}
/**
 * Runs the normalization stability check of the current UCD against each of the
 * older versions listed below, in order.
 */
static public void verifyNormalizationStability() {
    Main.setUCD();
    String[] olderVersions = {"3.1.0", "3.0.0"};
    for (int k = 0; k < olderVersions.length; ++k) {
        verifyNormalizationStability2(olderVersions[k]);
    }
}
/**
 * Compares normalization results of the current UCD (Main.ucd / Main.nf*) with those
 * of an older UCD version, over every code point 0..10FFFF that is assigned in the
 * current version and is neither Cs nor PRIVATE_USE.
 * <p>
 * For code points also assigned in the older version, the combining class and the
 * NFD/NFC/NFKD/NFKC results are compared and differences are printed. For code points
 * new in the current version, a decomposition made only of characters that are not Cn
 * in the older version must not recompose to the new code point.
 *
 * @param version the older UCD version to compare against
 * @throws IllegalArgumentException if a new code point violates composition stability
 */
static public void verifyNormalizationStability2(String version) {
    Main.nfd.normalizationDiffers(0x10300); // result unused, as in the original
    UCD older = UCD.make(version); // Main.ucd.getPreviousVersion();

    Normalizer oldNFC = new Normalizer(Normalizer.NFC, older.getVersion());
    Normalizer oldNFD = new Normalizer(Normalizer.NFD, older.getVersion());
    Normalizer oldNFKC = new Normalizer(Normalizer.NFKC, older.getVersion());
    Normalizer oldNFKD = new Normalizer(Normalizer.NFKD, older.getVersion());

    System.out.println("Testing " + Main.nfd.getUCDVersion() + " against " + oldNFD.getUCDVersion());

    for (int codePoint = 0; codePoint <= 0x10FFFF; ++codePoint) {
        Utility.dot(codePoint);

        if (!Main.ucd.isAssigned(codePoint)) continue;
        byte category = Main.ucd.getCategory(codePoint);
        if (category == Cs || category == PRIVATE_USE) continue;

        // Hard-wired debugging spot check for U+005E.
        if (codePoint == 0x5e) {
            System.out.println("debug");
            String newForm = Main.nfkd.normalize(codePoint);
            String oldForm = oldNFKD.normalize(codePoint);
            System.out.println("Testing (new/old)" + Main.ucd.getCodeAndName(codePoint));
            System.out.println("\t" + Main.ucd.getCodeAndName(newForm));
            System.out.println("\t" + Main.ucd.getCodeAndName(oldForm));
        }

        if (older.isAssigned(codePoint)) {
            // Present in both versions: nothing about its normalization may change.
            int newCan = Main.ucd.getCombiningClass(codePoint);
            int oldCan = older.getCombiningClass(codePoint);
            if (newCan != oldCan) {
                System.out.println("FAILS CCC STABILITY: " + newCan + " != " + oldCan
                    + "; " + Main.ucd.getCodeAndName(codePoint));
            }
            verifyEquals(codePoint, "NFD STABILITY (new/old)", Main.nfd.normalize(codePoint), oldNFD.normalize(codePoint));
            verifyEquals(codePoint, "NFC STABILITY (new/old)", Main.nfc.normalize(codePoint), oldNFC.normalize(codePoint));
            verifyEquals(codePoint, "NFKD STABILITY (new/old)", Main.nfkd.normalize(codePoint), oldNFKD.normalize(codePoint));
            verifyEquals(codePoint, "NFKC STABILITY (new/old)", Main.nfkc.normalize(codePoint), oldNFKC.normalize(codePoint));
            continue;
        }

        // Not in the older version.
        // (1) If there is a decomp, and it is composed of all OLD characters, then it must NOT compose
        if (!Main.nfd.normalizationDiffers(codePoint)) continue;

        String decomp = Main.nfd.normalize(codePoint);
        if (!noneHaveCategory(decomp, Cn, older)) continue;

        String recomp = Main.nfc.normalize(decomp);
        if (!recomp.equals(UTF16.valueOf(codePoint))) continue;

        Utility.fixDot();
        System.out.println("FAILS COMP STABILITY: " + Main.ucd.getCodeAndName(codePoint));
        System.out.println("\t" + Main.ucd.getCodeAndName(decomp));
        System.out.println("\t" + Main.ucd.getCodeAndName(recomp));
        System.out.println();
        throw new IllegalArgumentException("Comp stability");
    }
}
/**
 * Tests whether no code point in a string has the given general category.
 * <p>
 * Fix: the category is now looked up for the decoded code point {@code cp}. The
 * previous code passed the string index {@code i} to {@code getCategory}, so it
 * tested U+0000, U+0001, ... instead of the characters actually in {@code s}.
 *
 * @param s   string to scan, walked code point by code point
 * @param cat general category value to look for
 * @param ucd UCD instance used for the category lookup
 * @return {@code true} if no code point of {@code s} has category {@code cat}
 *         (including when {@code s} is empty); {@code false} otherwise
 */
public static boolean noneHaveCategory(String s, byte cat, UCD ucd) {
    int cp;
    // Advance by the number of chars the current code point occupies.
    for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
        cp = UTF16.charAt(s, i);
        byte cat2 = ucd.getCategory(cp);
        if (cat == cat2) return false;
    }
    return true;
}
/**
 * Reports a stability failure when two strings differ; does nothing when they are equal.
 *
 * @param cp      code point the strings were derived from, printed with its name
 * @param message label inserted after "FAILS " in the report
 * @param a       first string (listed first in the report)
 * @param b       second string (listed second in the report)
 */
public static void verifyEquals(int cp, String message, String a, String b) {
    if (a.equals(b)) {
        return;
    }
    Utility.fixDot();
    System.out.println("FAILS " + message + ": " + Main.ucd.getCodeAndName(cp));
    System.out.println("\t" + Main.ucd.getCodeAndName(a));
    System.out.println("\t" + Main.ucd.getCodeAndName(b));
    System.out.println();
}
public static void checkAgainstUInfo() {
/*