updated for 4.0

X-SVN-Rev: 11161
This commit is contained in:
Mark Davis 2003-02-25 23:38:23 +00:00
parent c31688a777
commit 07a8be151c
24 changed files with 1610 additions and 367 deletions

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $
* $Date: 2002/10/05 01:28:58 $
* $Revision: 1.9 $
* $Date: 2003/02/25 23:38:23 $
* $Revision: 1.10 $
*
*******************************************************************************
*/
@ -718,6 +718,8 @@ public final class ConvertUCD implements UCD_Types {
static Set jtSet = new TreeSet();
static Set jgSet = new TreeSet();
static final boolean SHOW_SAMPLE = false;
/** Adds the character data. Signals duplicates with an exception
*/
@ -725,6 +727,11 @@ public final class ConvertUCD implements UCD_Types {
//if (cp < 10) System.out.println("A: " + Utility.hex(cp) + ", " + key + ", " + Utility.quoteJavaString(value));
UData charEntry = getEntry(cp);
//if (cp < 10) System.out.println(" " + charEntry);
if (SHOW_SAMPLE && cp == 0x221) {
System.out.println("Sample: " + cp + ", " + key + ", " + value);
System.out.println(charEntry);
}
if (key.equals("bm")) {
if (value.equals("Y")) charEntry.binaryProperties |= 1;
@ -780,6 +787,11 @@ public final class ConvertUCD implements UCD_Types {
} else {
setField(charEntry, key, value);
}
if (SHOW_SAMPLE && cp == 0x221) {
System.out.println("Sample Result:");
System.out.println(charEntry);
}
}
static public void setField(UData uData, String fieldName, String fieldValue) {

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
* $Date: 2002/08/04 21:38:45 $
* $Revision: 1.17 $
* $Date: 2003/02/25 23:38:23 $
* $Revision: 1.18 $
*
*******************************************************************************
*/
@ -14,11 +14,20 @@
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import java.util.*;
import java.io.PrintWriter;
public final class DerivedProperty implements UCD_Types {
UCD ucdData;
Normalizer nfc;
Normalizer nfd;
Normalizer nfkc;
Normalizer nfkd;
Normalizer[] nf = new Normalizer[4];
UnicodeSet XID_Start_Set = new UnicodeSet();
UnicodeSet XID_Continue_Set = new UnicodeSet();
// ADD CONSTANT to UCD_TYPES
@ -33,9 +42,6 @@ public final class DerivedProperty implements UCD_Types {
}
///////////////////////////////////////////////////////////
private DerivedProperty(UCD ucd) {
ucdData = ucd;
}
static Map cache = new HashMap();
static UCD lastUCD = null;
@ -101,7 +107,7 @@ public final class DerivedProperty implements UCD_Types {
Normalizer nfx;
ExDProp(int i) {
type = DERIVED_NORMALIZATION;
nfx = Default.nf[i];
nfx = nf[i];
name = "Expands_On_" + nfx.getName();
shortName = "XO_" + nfx.getName();
header = "# Derived Property: " + name
@ -125,7 +131,7 @@ public final class DerivedProperty implements UCD_Types {
NF_UnsafeStartProp(int i) {
isStandard = false;
type = DERIVED_NORMALIZATION;
nfx = Default.nf[i];
nfx = nf[i];
name = nfx.getName() + "_UnsafeStart";
shortName = nfx.getName() + "_SS";
header = "# Derived Property: " + name
@ -144,6 +150,35 @@ public final class DerivedProperty implements UCD_Types {
}
};
/*
class HangulSyllableType extends UnicodeProperty {
Normalizer nfx;
//int prop;
HangulSyllableType(int i) {
isStandard = false;
type = DERIVED_NORMALIZATION;
nfx = nf[i];
name = nfx.getName() + "_UnsafeStart";
shortName = nfx.getName() + "_SS";
header = "# Derived Property: " + name
+ "\r\n# Generated according to UAX #15."
+ "\r\n# Characters that are cc==0, BUT which may interact with previous characters."
;
}
public boolean hasValue(int cp) {
if (ucdData.getCombiningClass(cp) != 0) return false;
String norm = nfx.normalize(cp);
int first = UTF16.charAt(norm, 0);
if (ucdData.getCombiningClass(first) != 0) return true;
if (nfx.isComposition()
&& dprops[NFC_TrailingZero].hasValue(first)) return true; // 1,3 == composing
return false;
}
};
*/
class NFC_Prop extends UnicodeProperty {
BitSet bitset;
@ -161,7 +196,7 @@ public final class DerivedProperty implements UCD_Types {
case NFC_TrailingNonZero: bitsets[1] = bitset = new BitSet(); break;
}
filter = bitsets[1] != null;
Default.nfc.getCompositionStatus(bitsets[0], bitsets[1], bitsets[2]);
nfc.getCompositionStatus(bitsets[0], bitsets[1], bitsets[2]);
name = Names[i-NFC_Leading];
shortName = SNames[i-NFC_Leading];
@ -197,17 +232,17 @@ public final class DerivedProperty implements UCD_Types {
isStandard = false;
setValueType(NON_ENUMERATED);
type = DERIVED_NORMALIZATION;
nfx = Default.nf[i];
nfx = nf[i];
name = nfx.getName();
String compName = "the character itself";
if (i == NFKC || i == NFD) {
name += "-NFC";
nfComp = Default.nfc;
nfComp = nfc;
compName = "NFC for the character";
} else if (i == NFKD) {
name += "-NFD";
nfComp = Default.nfd;
nfComp = nfd;
compName = "NFD for the character";
}
header = "# Derived Property: " + name
@ -273,7 +308,7 @@ public final class DerivedProperty implements UCD_Types {
QuickDProp (int i) {
setValueType((i == NFC || i == NFKC) ? ENUMERATED : BINARY);
type = DERIVED_NORMALIZATION;
nfx = Default.nf[i];
nfx = nf[i];
NO = nfx.getName() + "_NO";
MAYBE = nfx.getName() + "_MAYBE";
name = nfx.getName() + "_QuickCheck";
@ -297,7 +332,14 @@ public final class DerivedProperty implements UCD_Types {
public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
};
{
private DerivedProperty(UCD ucd) {
ucdData = ucd;
nfd = nf[NFD] = new Normalizer(Normalizer.NFD, ucdData.getVersion());
nfc = nf[NFC] = new Normalizer(Normalizer.NFC, ucdData.getVersion());
nfkd = nf[NFKD] = new Normalizer(Normalizer.NFKD, ucdData.getVersion());
nfkc = nf[NFKC] = new Normalizer(Normalizer.NFKC, ucdData.getVersion());
for (int i = ExpandsOnNFD; i <= ExpandsOnNFKC; ++i) {
dprops[i] = new ExDProp(i-ExpandsOnNFD);
}
@ -321,10 +363,10 @@ public final class DerivedProperty implements UCD_Types {
shortName = "IDS";
header = "# Derived Property: " + name
+ "\r\n# Characters that can start an identifier."
+ "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl";
+ "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl+ID_Start_Exceptions";
}
public boolean hasValue(int cp) {
return ucdData.isIdentifierStart(cp, false);
return ucdData.isIdentifierStart(cp);
}
};
@ -339,10 +381,65 @@ public final class DerivedProperty implements UCD_Types {
+ "\r\n# NOTE: Cf characters should be filtered out.";
}
public boolean hasValue(int cp) {
return ucdData.isIdentifierContinue_NO_Cf(cp, false);
return ucdData.isIdentifierContinue_NO_Cf(cp);
}
};
// Build XID_Start_Set and XID_Continue_Set.
// A code point starts from its plain identifier status (start / continue / neither)
// and is then adjusted by looking at its NFKD form, so that the two sets stay
// consistent with what the decomposition would be classified as.
StringBuffer tempBuf = new StringBuffer();

//System.out.println("Deriving data for XID");

// special hack for middle dot
XID_Continue_Set.add(0x00B7);
//System.out.println("Adding (2)" + ucdData.getCodeAndName(0x00B7));

// NOTE(review): the bound is exclusive, so U+10FFFF itself is never examined — confirm that is intended.
for (int cp = 0; cp < 0x10FFFF; ++cp) {

    // skip cases that can't matter
    if (!ucdData.isAssigned(cp)) continue;

    // find out normal status: 1 = identifier start, 2 = continue only, 0 = neither
    int status = 0;
    if (ucdData.isIdentifierStart(cp)) status = 1;
    else if (ucdData.isIdentifierContinue_NO_Cf(cp)) status = 2;

    if (status != 0 && !nfkd.isNormalized(cp)) {
        // now find out NFKD status
        // if it is <start><extend>*, then it is start
        // else if it is <extend>*, then it is extend
        // else it is nothing
        int status2 = 0;
        tempBuf.setLength(0);
        nfkd.normalize(UTF32.valueOf32(cp), tempBuf);

        int i = 0;
        while (i < tempBuf.length()) {
            int cp2 = UTF32.char32At(tempBuf, i);
            if (i == 0) {
                if (ucdData.isIdentifierStart(cp2)) status2 = 1;
                else if (ucdData.isIdentifierContinue_NO_Cf(cp2)) status2 = 2;
                else {
                    status2 = 0;
                    break;
                }
            } else if (!ucdData.isIdentifierContinue_NO_Cf(cp2) && cp2 != 0xB7) {
                status2 = 0;
                break;
            }
            // Advance by the width of the code point just read from the buffer (cp2).
            // Stepping by the width of the source code point (cp) would skip or split
            // code points whenever cp and its decomposition differ in UTF-16 width.
            // (presumably UTF32.count16 returns the number of 16-bit units — verify against Utility package)
            i += UTF32.count16(cp2);
        }

        // Now see if the statuses are compatible.
        if (status != status2) {
            //System.out.println("Need to do something with:");
            //System.out.println(" " + status + ": " + ucdData.getCodeAndName(cp));
            //System.out.println(" " + status2 + ": " + ucdData.getCodeAndName(tempBuf.toString()));
            if (status2 == 0) status = 0;
            else if (status2 > status) status = status2;
            //System.out.println(" " + status + ": " + ucdData.getCodeAndName(cp));
        }
    }

    // every start character is also a continue character
    if (status == 1) XID_Start_Set.add(cp);
    if (status != 0) XID_Continue_Set.add(cp);
}
dprops[Mod_ID_Start] = new UnicodeProperty() {
{
type = DERIVED_CORE;
@ -355,7 +452,7 @@ public final class DerivedProperty implements UCD_Types {
+ "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
}
public boolean hasValue(int cp) {
return ucdData.isIdentifierStart(cp, true);
return XID_Start_Set.contains(cp);
}
};
@ -372,7 +469,7 @@ public final class DerivedProperty implements UCD_Types {
+ "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
}
public boolean hasValue(int cp) {
return ucdData.isIdentifierContinue_NO_Cf(cp, true);
return XID_Continue_Set.contains(cp);
}
};
@ -458,7 +555,6 @@ of characters, the first of which has a non-zero combining class.
shortName = "Comp_Ex";
defaultValueStyle = defaultPropertyStyle = SHORT;
header = "# Derived Property: " + name
+ ": Full Composition Exclusion"
+ "\r\n# Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions";
}
public boolean hasValue(int cp) {
@ -469,6 +565,9 @@ of characters, the first of which has a non-zero combining class.
if (isCompEx(cp)) return true;
return false;
}
/*public String getListingValue(int cp) {
return "Comp_Ex";
}*/
/*
public String getListingValue(int cp) {
if (getValueType() != BINARY) return getValue(cp, SHORT);
@ -511,8 +610,8 @@ of characters, the first of which has a non-zero combining class.
}
public String getValue(int cp, byte style) {
if (!ucdData.isRepresented(cp)) return "";
String b = Default.nfkc.normalize(fold(cp));
String c = Default.nfkc.normalize(fold(b));
String b = nfkc.normalize(fold(cp));
String c = nfkc.normalize(fold(b));
if (c.equals(b)) return "";
return "FNC; " + Utility.hex(c);
} // default
@ -533,8 +632,8 @@ of characters, the first of which has a non-zero combining class.
}
public String getValue(int cp, byte style) {
if (!ucdData.isRepresented(cp)) return "";
String b = Default.nfc.normalize(fold(cp));
String c = Default.nfc.normalize(fold(b));
String b = nfc.normalize(fold(cp));
String c = nfc.normalize(fold(b));
if (c.equals(b)) return "";
return "FN; " + Utility.hex(c);
} // default
@ -565,6 +664,94 @@ of characters, the first of which has a non-zero combining class.
}
};
// Case_Sensitive: a character has this property if it appears on either side of
// any case mapping (full or simple; lower, upper, title or fold).
// The set is computed lazily on the first hasValue() call and a log of the
// differences against the "cased" set is written to Case_Sensitive_Log.txt.
dprops[Case_Sensitive] = new UnicodeProperty() {
    {
        type = DERIVED_CORE;
        isStandard = false;
        name = "Case_Sensitive";
        hasUnassigned = false;
        shortName = "CS";
        header = "# Derived Property: " + name
            + "\r\n# Generated from all characters that are either on the right or left side of a case mapping";
    }

    // Lazily built cache; null until the first successful build.
    UnicodeSet case_sensitive = null;
    // Scratch set reused by addCase.
    UnicodeSet tempSet = new UnicodeSet();
    // Lowercase + Uppercase + Lt, used only for the log comparison.
    UnicodeSet cased = null;
    PrintWriter log;

    /**
     * Applies one case mapping to cps; if the result differs, adds every
     * character of both the source and the result to case_sensitive.
     * Additions that fall outside the "cased" set are written to the log.
     */
    private void addCase(String cps, byte c1, byte c2) {
        String temp = ucdData.getCase(cps, c1, c2);
        if (temp.equals(cps)) return;
        //temp = nfc.normalize(temp);
        //if (temp.equals(cps)) return;
        tempSet.clear();
        tempSet.addAll(cps);
        tempSet.addAll(temp);
        if (!case_sensitive.containsAll(tempSet)) {
            tempSet.removeAll(case_sensitive);
            if (!cased.containsAll(tempSet)) {
                log.println();
                log.println("Adding " + tempSet + " because of: ");
                log.println("\t" + ucdData.getCodeAndName(cps));
                log.println("=>\t" + ucdData.getCodeAndName(temp));
            }
            case_sensitive.addAll(tempSet);
        }
    }

    /**
     * Returns whether cp is in the Case_Sensitive set, building the set on first use.
     * If the build fails, the cache is discarded so a later call rebuilds it
     * instead of answering from a partially filled set.
     */
    public boolean hasValue(int cp) {
        if (case_sensitive == null) {
            try {
                log = Utility.openPrintWriter("Case_Sensitive_Log.txt", Utility.UTF8_UNIX);
                System.out.println("Building Case-Sensitive cache");
                case_sensitive = new UnicodeSet();
                // NOTE(review): addAll is applied directly to the set returned by getSet();
                // if getSet() hands out a shared instance this mutates it — confirm.
                cased = DerivedProperty.make(PropLowercase, ucdData).getSet()
                    .addAll(DerivedProperty.make(PropUppercase, ucdData).getSet())
                    .addAll(UnifiedBinaryProperty.make(CATEGORY | Lt).getSet());

                // NOTE(review): the bound is exclusive, so U+10FFFF itself is never examined — confirm that is intended.
                for (int c = 0; c < 0x10FFFF; ++c) {
                    Utility.dot(c);
                    // skip cases that can't matter
                    if (!ucdData.isAssigned(c)) continue;
                    String cps = UTF16.valueOf(c);
                    addCase(cps, FULL, LOWER);
                    addCase(cps, FULL, UPPER);
                    addCase(cps, FULL, TITLE);
                    addCase(cps, FULL, FOLD);
                    addCase(cps, SIMPLE, LOWER);
                    addCase(cps, SIMPLE, UPPER);
                    addCase(cps, SIMPLE, TITLE);
                    addCase(cps, SIMPLE, FOLD);
                }
                Utility.fixDot();

                UnicodeSet temp;
                log.println("Cased, but not Case_Sensitive");
                temp = new UnicodeSet().addAll(cased).removeAll(case_sensitive);
                Utility.showSetNames(log, "", temp, false, false, ucdData);
                log.println("Case_Sensitive, but not Cased");
                temp = new UnicodeSet().addAll(case_sensitive).removeAll(cased);
                Utility.showSetNames(log, "", temp, false, false, ucdData);
                log.println("Both Case_Sensitive, and Cased");
                temp = new UnicodeSet().addAll(case_sensitive).retainAll(cased);
                log.println(temp);
                System.out.println("Done Building Case-Sensitive cache");
            } catch (Exception e) {
                // Drop the partial cache so the next call does not silently use it.
                case_sensitive = null;
                throw new ChainException("internal error", null, e);
            } finally {
                // Always release the log file, including on failure.
                if (log != null) log.close();
            }
        }
        return case_sensitive.contains(cp);
    }
};
dprops[Other_Case_Ignorable] = new UnicodeProperty() {
{
name = "Other_Case_Ignorable";
@ -602,8 +789,8 @@ of characters, the first of which has a non-zero combining class.
}
public boolean hasValue(int cp) {
if (hasSoftDot(cp)) return true;
if (Default.nfkd.isNormalized(cp)) return false;
String decomp = Default.nfd.normalize(cp);
if (nfkd.isNormalized(cp)) return false;
String decomp = nfd.normalize(cp);
boolean ok = false;
for (int i = decomp.length()-1; i >= 0; --i) {
int ch = UTF16.charAt(decomp, i);
@ -650,16 +837,19 @@ of characters, the first of which has a non-zero combining class.
name = "Grapheme_Extend";
shortName = "GrExt";
header = header = "# Derived Property: " + name
+ "\r\n# Generated from: Me + Mn + Mc + Other_Grapheme_Extend - Grapheme_Link - CGJ"
+ "\r\n# (CGJ = U+034F)";
+ "\r\n# Generated from: Me + Mn + Other_Grapheme_Extend"
+ "\r\n# Note: depending on an application's interpretation of Co (private use),"
+ "\r\n# they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither."
;
}
public boolean hasValue(int cp) {
if (cp == 0x034F) return false;
if (ucdData.getBinaryProperty(cp, GraphemeLink)) return false;
//if (cp == 0x034F) return false;
//if (ucdData.getBinaryProperty(cp, GraphemeLink)) return false;
// || cat == Mc
byte cat = ucdData.getCategory(cp);
if (cat == Me || cat == Mn || cat == Mc
|| ucdData.getBinaryProperty(cp,Other_GraphemeExtend)) return true;
if (cat == Me || cat == Mn
|| ucdData.getBinaryProperty(cp,Other_GraphemeExtend)) return true;
return false;
}
};
@ -671,14 +861,16 @@ of characters, the first of which has a non-zero combining class.
shortName = "GrBase";
header = header = "# Derived Property: " + name
+ "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp"
+ "\r\n# - Grapheme_Extend - Grapheme_Link - CGJ";
+ "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend"
+ "\r\n# Note: depending on an application's interpretation of Co (private use),"
+ "\r\n# they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither."
;
}
public boolean hasValue(int cp) {
if (cp == 0x034F) return false;
//if (cp == 0x034F) return false;
byte cat = ucdData.getCategory(cp);
if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn || cat == Zl || cat == Zp
|| ucdData.getBinaryProperty(cp,GraphemeLink)) return false;
if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn || cat == Zl || cat == Zp) return false;
// || ucdData.getBinaryProperty(cp,GraphemeLink)
if (dprops[GraphemeExtend].hasValue(cp)) return false;
return true;
}
@ -702,11 +894,11 @@ of characters, the first of which has a non-zero combining class.
|| ucdData.getBinaryProperty(cp, Other_Lowercase)) return Ll;
if (cat == Lt || cat == Lo || cat == Lm || cat == Nl) return cat;
// if (true) throw new IllegalArgumentException("FIX Default.nf[2]");
// if (true) throw new IllegalArgumentException("FIX nf[2]");
if (Default.nf[NFKD].isNormalized(cp)) return Lo;
if (nf[NFKD].isNormalized(cp)) return Lo;
String norm = Default.nf[NFKD].normalize(cp);
String norm = nf[NFKD].normalize(cp);
int cp2;
boolean gotUpper = false;
boolean gotLower = false;

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java,v $
* $Date: 2002/06/22 01:21:09 $
* $Revision: 1.7 $
* $Date: 2003/02/25 23:38:23 $
* $Revision: 1.8 $
*
*******************************************************************************
*/
@ -31,6 +31,7 @@ class DiffPropertyLister extends PropertyLister {
}
breakByCategory = property != NOPROPERTY;
useKenName = false;
usePropertyComment = false;
}
public DiffPropertyLister(String oldUCDName, String newUCDName, PrintWriter output) {
@ -61,20 +62,27 @@ class DiffPropertyLister extends PropertyLister {
public String optionalComment(int cp) {
String normal = super.optionalComment(cp);
return oldUCD.getModCatID_fromIndex(
oldUCD.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : 0))
+ "/" + normal;
if (oldUCD != null && breakByCategory) {
byte modCat = oldUCD.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : 0);
normal = oldUCD.getModCatID_fromIndex(modCat) + "/" + normal;
}
return normal;
}
/**
 * Looks up the modified category of cp in ucdData.
 * The mask passed is CASED_LETTER_MASK when breakByCategory is set, otherwise -1.
 */
byte getModCat(int cp) {
    return ucdData.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : -1);
}
public byte status(int cp) {
if (newProp == null) {
if (ucdData.isAllocated(cp) && (oldUCD == null || !oldUCD.isAllocated(cp))) {
set.add(cp);
return INCLUDE;
}
else {
} else {
return EXCLUDE;
}
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $
* $Date: 2002/08/09 23:56:24 $
* $Revision: 1.2 $
* $Date: 2003/02/25 23:38:23 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -83,14 +83,14 @@ abstract public class GenerateBreakTest implements UCD_Types {
Default.setUCD();
}
static UnicodeSet extraAlpha = new UnicodeSet("[\\u02B9-\\u02BA\\u02C2-\\u02CF\\u02D2-\\u02DF\\u02E5\\u02ED\\u05F3]");
static UnicodeSet extraAlpha = new UnicodeSet("[\\u02B9-\\u02BA\\u02C2-\\u02CF\\u02D2-\\u02DF\\u02E5-\\u02ED\\u05F3]");
static UnicodeSet alphabeticSet = UnifiedBinaryProperty.make(DERIVED | PropAlphabetic).getSet()
.addAll(extraAlpha);
static UnicodeSet ideographicSet = UnifiedBinaryProperty.make(BINARY_PROPERTIES | Ideographic).getSet();
static {
System.out.println("alphabetic: " + alphabeticSet.toPattern(true));
if (false) System.out.println("alphabetic: " + alphabeticSet.toPattern(true));
}
@ -116,16 +116,16 @@ abstract public class GenerateBreakTest implements UCD_Types {
PrintWriter systemPrintWriter = new PrintWriter(System.out);
gwb.printLine(systemPrintWriter, "n\u0308't", true, true, false);
systemPrintWriter.flush();
}
if (false) {
GenerateSentenceBreakTest foo = new GenerateSentenceBreakTest();
foo.isBreak("(\"Go.\") (He did)", 5, true);
showSet("sepSet", GenerateSentenceBreakTest.sepSet);
showSet("atermSet", GenerateSentenceBreakTest.atermSet);
showSet("termSet", GenerateSentenceBreakTest.termSet);
}
if (true) {
GenerateSentenceBreakTest foo = new GenerateSentenceBreakTest();
//foo.isBreak("(\"Go.\") (He did)", 5, true);
foo.isBreak("3.4", 2, true);
}
new GenerateSentenceBreakTest().run();
@ -276,7 +276,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest.html", Utility.UTF8_WINDOWS);
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>"
+ fileName + "</title></head>");
+ fileName + " Break Chart</title></head>");
out.println("<body bgcolor='#FFFFFF'><h3>Current:</h3>");
@ -304,7 +304,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest"
+ (recommended & recommendedDiffers() ? "_NEW" : "")
+ (shortVersion ? "_SHORT" : "")
+ ".txt", Utility.LATIN1_WINDOWS);
+ ".txt", Utility.UTF8_WINDOWS);
int counter = 0;
out.println("# Default " + fileName + " Break Test");
@ -623,6 +623,60 @@ abstract public class GenerateBreakTest implements UCD_Types {
}
static public class Context {
public int cpBefore2, cpBefore, cpAfter, cpAfter2;
public byte tBefore2, tBefore, tAfter, tAfter2;
public String toString() {
return "["
+ Utility.hex(cpBefore2) + "(" + tBefore2 + "), "
+ Utility.hex(cpBefore) + "(" + tBefore + "), "
+ Utility.hex(cpAfter) + "(" + tAfter + "), "
+ Utility.hex(cpAfter2) + "(" + tAfter2 + ")]";
}
}
/**
 * Fills context with the nearest two base code points before offset and the
 * nearest two after it, skipping any whose resolved type equals ignoreType.
 * Slots that cannot be filled are left at -1.
 *
 * @param source      text being examined
 * @param offset      position in source to look around
 * @param recommended passed through to getResolvedType
 * @param ignoreType  resolved type to skip over
 * @param context     receives the code points and their resolved types
 */
public void getGraphemeBases(String source, int offset, boolean recommended, byte ignoreType, Context context) {
    // Reset every slot to "not found".
    context.cpBefore2 = -1;
    context.cpBefore = -1;
    context.cpAfter = -1;
    context.cpAfter2 = -1;
    context.tBefore2 = -1;
    context.tBefore = -1;
    context.tAfter = -1;
    context.tAfter2 = -1;
    //if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(source) + "; " + offset + "; " + ignoreType);

    MyBreakIterator walker = new MyBreakIterator();

    // Walk backwards from offset: first hit fills "before", second fills "before2".
    walker.set(source, offset);
    for (int base = walker.previousBase(); base != -1; base = walker.previousBase()) {
        byte resolved = getResolvedType(base, recommended);
        if (resolved == ignoreType) continue;

        if (context.cpBefore != -1) {
            context.cpBefore2 = base;
            context.tBefore2 = resolved;
            break;
        }
        context.cpBefore = base;
        context.tBefore = resolved;
    }

    // Walk forwards from offset: first hit fills "after", second fills "after2".
    walker.set(source, offset);
    for (int base = walker.nextBase(); base != -1; base = walker.nextBase()) {
        byte resolved = getResolvedType(base, recommended);
        if (resolved == ignoreType) continue;

        if (context.cpAfter != -1) {
            context.cpAfter2 = base;
            context.tAfter2 = resolved;
            break;
        }
        context.cpAfter = base;
        context.tAfter = resolved;
    }
}
// ========================================
static class GenerateLineBreakTest extends GenerateBreakTest {
@ -1050,7 +1104,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
if (cp == 0xA) return LF;
if (cp == 0xD) return CR;
if (recommended) {
if (cp == 0x034F) return CGJ;
if (cp == 0x034F) return Extend;
}
if (cp == 0x2028 || cp == 0x2029) return Control;
@ -1178,7 +1232,6 @@ abstract public class GenerateBreakTest implements UCD_Types {
static UnicodeSet extraKatakana = new UnicodeSet("[" + LENGTH + HALFWIDTH_KATAKANA + KATAKANA_ITERATION + "]");
//static UnicodeProperty LineBreakIdeographic = UnifiedBinaryProperty.make(LINE_BREAK | LB_ID);
static UnicodeProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase);
static UnicodeProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink);
@ -1325,52 +1378,6 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
return 3;
}
static public class Context {
public int cpBefore2, cpBefore, cpAfter, cpAfter2;
public byte tBefore2, tBefore, tAfter, tAfter2;
}
public void getGraphemeBases(String source, int offset, boolean recommended, Context context) {
context.cpBefore2 = context.cpBefore = context.cpAfter = context.cpAfter2 = -1;
context.tBefore2 = context.tBefore = context.tAfter = context.tAfter2 = -1;
MyBreakIterator graphemeIterator = new MyBreakIterator();
graphemeIterator.set(source, offset);
while (true) {
int cp = graphemeIterator.previousBase();
if (cp == -1) break;
byte t = getResolvedType(cp, recommended);
if (t == Format) continue;
if (context.cpBefore == -1) {
context.cpBefore = cp;
context.tBefore = t;
} else {
context.cpBefore2 = cp;
context.tBefore2 = t;
break;
}
}
graphemeIterator.set(source, offset);
while (true) {
int cp = graphemeIterator.nextBase();
if (cp == -1) break;
byte t = getResolvedType(cp, recommended);
if (t == Format) continue;
if (context.cpAfter == -1) {
context.cpAfter = cp;
context.tAfter = t;
} else {
context.cpAfter2 = cp;
context.tAfter2 = t;
break;
}
}
}
public boolean isBreak(String source, int offset, boolean recommended) {
recommended = true; // don't care about old stuff
@ -1391,7 +1398,7 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
// now get the base character before and after, and their types
getGraphemeBases(source, offset, recommended, context);
getGraphemeBases(source, offset, recommended, Format, context);
byte before = context.tBefore;
byte after = context.tAfter;
@ -1457,42 +1464,55 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
static class GenerateSentenceBreakTest extends GenerateBreakTest {
static final byte Format = 0, Sep = 1, Sp = 2, OLetter = 3, Lower = 4, Upper = 5,
Close = 6, ATerm = 7, Term = 8, Other = 9,
Numeric = 6, Close = 7, ATerm = 8, Term = 9, Other = 10,
LIMIT = Other + 1;
static final String[] Names = {"Format", "Sep", "Sp", "OLetter", "Lower", "Upper",
static final String[] Names = {"Format", "Sep", "Sp", "OLetter", "Lower", "Upper", "Numeric",
"Close", "ATerm", "Term", "Other" };
static GenerateGraphemeBreakTest grapheme = new GenerateGraphemeBreakTest();
static UnicodeSet sepSet = new UnicodeSet("[\\u000a\\u000d\\u0085\\u2029\\u2028]");
static UnicodeSet atermSet = new UnicodeSet("[\\u002E]");
static UnicodeSet termSet = new UnicodeSet("[\\u0021\\u003F\\u0589\\u061f\\u06d4\\u0700-\\u0702\\u0934"
+ "\\u1362\\u1367\\u1368\\u1803\\u1809\\u203c\\u203d\\u2048\\u2049\\u3002\\ufe52\\ufe57\\uff01\\uff0e\\uff1f\\uff61]");
static UnicodeSet termSet = new UnicodeSet(
"[\\u0021\\u003F\\u0589\\u061f\\u06d4\\u0700-\\u0702\\u0934"
+ "\\u1362\\u1367\\u1368\\u104A\\u104B\\u166E"
+ "\\u1803\\u1809\\u203c\\u203d"
+ "\\u2048\\u2049\\u3002\\ufe52\\ufe57\\uff01\\uff0e\\uff1f\\uff61]");
static UnicodeProperty lowercaseProp = UnifiedBinaryProperty.make(DERIVED | PropLowercase);
static UnicodeProperty uppercaseProp = UnifiedBinaryProperty.make(DERIVED | PropUppercase);
UnicodeSet linebreakNS = UnifiedBinaryProperty.make(LINE_BREAK | LB_NU).getSet();
{
fileName = "Sentence";
extraSamples = new String[] {
};
String[] temp = new String[] {
extraSingleSamples = new String[] {
"(\"Go.\") (He did.)",
"(\"Go?\") (He did.)",
"(\u201CGo?\u201D) (He did.)",
"U.S.A\u0300. is",
"U.S.A\u0300? He",
"U.S.A\u0300.",
"\u4e00.\u4300",
"\u4e00?\u4300",
"3.4",
"c.d",
"etc.)\u2019 \u2018(the",
"etc.)\u2019 \u2018(The",
"the resp. leaders are",
"\u5B57.\u5B57",
"etc.\u5B83",
"etc.\u3002",
"\u5B57\u3002\u5B83",
};
extraSingleSamples = new String [temp.length * 2];
System.arraycopy(temp, 0, extraSingleSamples, 0, temp.length);
for (int i = 0; i < temp.length; ++i) {
extraSingleSamples[i+temp.length] = insertEverywhere(temp[i], "\u2060", grapheme);
String[] temp = new String [extraSingleSamples.length * 2];
System.arraycopy(extraSingleSamples, 0, temp, 0, extraSingleSamples.length);
for (int i = 0; i < extraSingleSamples.length; ++i) {
temp[i+extraSingleSamples.length] = insertEverywhere(extraSingleSamples[i], "\u2060", grapheme);
}
extraSingleSamples = temp;
}
@ -1509,9 +1529,10 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
if (cat == Cf) return Format;
if (sepSet.contains(cp)) return Sep;
if (Default.ucd.getBinaryProperty(cp, White_space)) return Sp;
if (alphabeticSet.contains(cp)) return OLetter;
if (linebreakNS.contains(cp)) return Numeric;
if (lowercaseProp.hasValue(cp)) return Lower;
if (uppercaseProp.hasValue(cp) || cat == Lt) return Upper;
if (alphabeticSet.contains(cp)) return OLetter;
if (atermSet.contains(cp)) return ATerm;
if (termSet.contains(cp)) return Term;
if (cat == Po || cat == Pe
@ -1529,6 +1550,8 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
return 1;
}
static Context context = new Context();
public boolean isBreak(String source, int offset, boolean recommended) {
rule = "1";
@ -1541,8 +1564,8 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
// Sep ÷ (3)
rule = "3";
byte before = getResolvedType(source.charAt(offset-1), recommended);
if (before == Sep) return true;
byte beforeChar = getResolvedType(source.charAt(offset-1), recommended);
if (beforeChar == Sep) return true;
// Treat a grapheme cluster as if it were a single character:
// the first base character, if there is one; otherwise the first character.
@ -1556,17 +1579,29 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
rule="3";
if (!grapheme.isBreak( source, offset, recommended)) return false;
// Do not break after ambiguous terminators like period, if the first following letter is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
// ATerm Close* Sp*×(¬( OLetter | Upper ))* Lower(6)
// ATerm ×Upper (7)
// Break after sentence terminators, but include closing punctuation, trailing spaces, and (optionally) a paragraph separator.
// ( Term | ATerm ) Close*×( Close | Sp | Sep )(8)
// ( Term | ATerm ) Close* Sp×( Sp | Sep )(9)
// ( Term | ATerm ) Close* Sp*÷(10)
getGraphemeBases(source, offset, recommended, Format, context);
byte before = context.tBefore;
byte after = context.tAfter;
byte before2 = context.tBefore2;
byte after2 = context.tAfter2;
// Do not break after ambiguous terminators like period, if immediately followed by a number or lowercase letter, is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
// ATerm × (Lower | Numeric) (6)
// Upper ATerm × Upper (7)
if (before == ATerm) {
rule = "6";
if (after == Lower || after == Numeric) return false;
rule = "7";
if (DEBUG_GRAPHEMES) System.out.println(context + ", " + Upper);
if (before2 == Upper && after == Upper) return false;
}
// The following cases are all handled together.
// These cases are all handled together.
// First we loop backwards, checking for the different types.
MyBreakIterator graphemeIterator = new MyBreakIterator();
@ -1620,19 +1655,18 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
if (lookAfter == -1) {
// Otherwise, do not break
// Any × Any (12)
rule = "11";
rule = "12";
return false;
}
// Do not break after ambiguous terminators like period, if the first following letter is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
// ATerm Close* Sp*×(¬( OLetter | Upper ))* Lower(6)
// ATerm ×Upper (7)
// ATerm Close* Sp*×(¬( OLetter))* Lower(8)
// Break after sentence terminators, but include closing punctuation, trailing spaces, and (optionally) a paragraph separator.
// ( Term | ATerm ) Close*×( Close | Sp | Sep )(8)
// ( Term | ATerm ) Close* Sp×( Sp | Sep )(9)
// ( Term | ATerm ) Close* Sp*÷(10)
// ( Term | ATerm ) Close*×( Close | Sp | Sep )(9)
// ( Term | ATerm ) Close* Sp×( Sp | Sep )(10)
// ( Term | ATerm ) Close* Sp*÷(11)
// We DID find one. Loop to see if the right side is ok.
graphemeIterator.set(source, offset);
@ -1648,16 +1682,16 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
if (isFirst) {
isFirst = false;
if (lookAfter == ATerm && t == Upper) {
rule = "7";
rule = "8";
return false;
}
if (gotSpace) {
if (t == Sp || t == Sep) {
rule = "9";
rule = "10";
return false;
}
} else if (t == Close || t == Sp || t == Sep) {
rule = "8";
rule = "9";
return false;
}
if (lookAfter == Term) break;
@ -1666,16 +1700,18 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
// at this point, we have an ATerm. All other conditions are ok, but we need to verify 6
if (t != OLetter && t != Upper && t != Lower) continue;
if (t == Lower) {
rule = "6";
rule = "8";
return false;
}
break;
}
rule = "10";
rule = "11";
return true;
}
}
static final boolean DEBUG_GRAPHEMES = false;
static class MyBreakIterator {
int offset = 0;
String string = "";
@ -1683,6 +1719,7 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
boolean recommended = true;
public MyBreakIterator set(String source, int offset) {
//if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(string) + "; " + offset);
string = source;
this.offset = offset;
return this;
@ -1694,6 +1731,7 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
for (++offset; offset < string.length(); ++offset) {
if (breaker.isBreak(string, offset, recommended)) break;
}
//if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(result));
return result;
}
@ -1702,7 +1740,9 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
for (--offset; offset >= 0; --offset) {
if (breaker.isBreak(string, offset, recommended)) break;
}
return UTF16.charAt(string, offset);
int result = UTF16.charAt(string, offset);
//if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(result));
return result;
}
}
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
* $Date: 2002/10/05 01:28:58 $
* $Revision: 1.12 $
* $Date: 2003/02/25 23:38:23 $
* $Revision: 1.13 $
*
*******************************************************************************
*/
@ -45,10 +45,19 @@ public class GenerateCaseFolding implements UCD_Types {
System.out.println("Writing Log: " + "CaseFoldingLog" + GenerateData.getFileSuffix(true));
System.out.println("Making Full Data");
Map fullData = getCaseFolding(true, NF_CLOSURE);
Map fullData = getCaseFolding(true, NF_CLOSURE, "");
Utility.fixDot();
System.out.println("Making Simple Data");
Map simpleData = getCaseFolding(false, NF_CLOSURE);
Map simpleData = getCaseFolding(false, NF_CLOSURE, "");
// write the data
System.out.println("Making Turkish Full Data");
Map fullDataTurkish = getCaseFolding(true, NF_CLOSURE, "tr");
Utility.fixDot();
System.out.println("Making Simple Data");
Map simpleDataTurkish = getCaseFolding(false, NF_CLOSURE, "tr");
// write the data
Utility.fixDot();
@ -58,7 +67,8 @@ public class GenerateCaseFolding implements UCD_Types {
String directory = "DerivedData/";
String newFile = directory + filename + GenerateData.getFileSuffix(true);
PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
String mostRecent = GenerateData.generateBat(directory, filename, GenerateData.getFileSuffix(true));
String[] batName = {""};
String mostRecent = GenerateData.generateBat(directory, filename, GenerateData.getFileSuffix(true), batName);
out.println("# CaseFolding" + GenerateData.getFileSuffix(false));
out.println(GenerateData.generateDateLine());
@ -81,7 +91,10 @@ public class GenerateCaseFolding implements UCD_Types {
String rFull = (String)fullData.get(UTF32.valueOf32(ch));
String rSimple = (String)simpleData.get(UTF32.valueOf32(ch));
if (rFull == null && rSimple == null) continue;
String rFullTurkish = (String)fullDataTurkish.get(UTF32.valueOf32(ch));
String rSimpleTurkish = (String)simpleDataTurkish.get(UTF32.valueOf32(ch));
if (rFull == null && rSimple == null && rFullTurkish == null && rSimpleTurkish == null) continue;
if (rFull != null && rFull.equals(rSimple)
|| (PICK_SHORT && UTF16.countCodePoint(rFull) == 1)) {
String type = "C";
@ -105,10 +118,16 @@ public class GenerateCaseFolding implements UCD_Types {
drawLine(out, ch, "S", rSimple);
}
}
if (rFullTurkish != null && !rFullTurkish.equals(rFull)) {
drawLine(out, ch, "T", rFullTurkish);
}
if (rSimpleTurkish != null && !rSimpleTurkish.equals(rSimple)) {
drawLine(out, ch, "t", rSimpleTurkish);
}
}
out.close();
log.close();
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
}
/* Goal is following (with no entries for 0131 or 0069)
@ -146,7 +165,7 @@ public class GenerateCaseFolding implements UCD_Types {
static int probeCh = 0x01f0;
static String shower = UTF16.valueOf(probeCh);
static Map getCaseFolding(boolean full, boolean nfClose) throws java.io.IOException {
static Map getCaseFolding(boolean full, boolean nfClose, String condition) throws java.io.IOException {
Map data = new TreeMap();
Map repChar = new TreeMap();
//String option = "";
@ -157,7 +176,7 @@ public class GenerateCaseFolding implements UCD_Types {
Utility.dot(ch);
//if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch));
if (!Default.ucd.isRepresented(ch)) continue;
getClosure(ch, data, full, nfClose);
getClosure(ch, data, full, nfClose, condition);
}
// get the representative characters
@ -180,7 +199,7 @@ public class GenerateCaseFolding implements UCD_Types {
Iterator it2 = set.iterator();
while (it2.hasNext()) {
String s2 = (String)it2.next();
int s2Good = goodness(s2, full);
int s2Good = goodness(s2, full, condition);
if (s2Good > repGood) {
rep = s2;
repGood = s2Good;
@ -206,12 +225,20 @@ public class GenerateCaseFolding implements UCD_Types {
log.println(" Set:\t" + toString(set,true, true));
}
log.println();
log.println();
log.println(rep + "\t#" + Default.ucd.getName(rep));
// Add it for all the elements of the set
it2 = set.iterator();
while (it2.hasNext()) {
String s2 = (String)it2.next();
if (UTF16.countCodePoint(s2) == 1 && !s2.equals(rep)) {
if (s2.equals(rep)) continue;
log.println(s2 + "\t#" + Default.ucd.getName(s2));
if (UTF16.countCodePoint(s2) == 1) {
repChar.put(UTF32.getCodePointSubstring(s2,0), rep);
charsUsed.set(UTF16.charAt(s2, 0));
}
@ -225,14 +252,14 @@ public class GenerateCaseFolding implements UCD_Types {
static final int NFC_FORMAT = 64;
static final int ISLOWER = 128;
static int goodness(String s, boolean full) {
static int goodness(String s, boolean full, String condition) {
if (s == null) return 0;
int result = 32-s.length();
if (!PICK_SHORT) {
result = s.length();
}
if (!full) result <<= 8;
String low = lower(upper(s, full), full);
String low = lower(upper(s, full, condition), full, condition);
if (s.equals(low)) result |= ISLOWER;
else if (PICK_SHORT && Default.nfd.normalize(s).equals(Default.nfd.normalize(low))) result |= ISLOWER;
@ -295,11 +322,11 @@ public class GenerateCaseFolding implements UCD_Types {
}
*/
static void getClosure(int ch, Map data, boolean full, boolean nfClose) {
static void getClosure(int ch, Map data, boolean full, boolean nfClose, String condition) {
String charStr = UTF32.valueOf32(ch);
String lowerStr = lower(charStr, full);
String titleStr = title(charStr, full);
String upperStr = upper(charStr, full);
String lowerStr = lower(charStr, full, condition);
String titleStr = title(charStr, full, condition);
String upperStr = upper(charStr, full, condition);
if (charStr.equals(lowerStr) && charStr.equals(upperStr) && charStr.equals(titleStr)) return;
if (DEBUG) System.err.println("Closure for " + Utility.hex(ch));
@ -327,47 +354,47 @@ public class GenerateCaseFolding implements UCD_Types {
if (add(set, Default.nfkd.normalize(s), data)) continue main;
if (add(set, Default.nfkc.normalize(s), data)) continue main;
}
if (add(set, lower(s, full), data)) continue main;
if (add(set, title(s, full), data)) continue main;
if (add(set, upper(s, full), data)) continue main;
if (add(set, lower(s, full, condition), data)) continue main;
if (add(set, title(s, full, condition), data)) continue main;
if (add(set, upper(s, full, condition), data)) continue main;
}
break;
}
}
static String lower(String s, boolean full) {
String result = lower2(s,full);
static String lower(String s, boolean full, String condition) {
String result = lower2(s,full, condition);
return result.replace('\u03C2', '\u03C3'); // HACK for lower
}
// These functions are no longer necessary, since Default.ucd is parameterized,
// but it's not worth changing
static String lower2(String s, boolean full) {
static String lower2(String s, boolean full, String condition) {
/*if (!full) {
if (s.length() != 1) return s;
return Default.ucd.getCase(UTF32.char32At(s,0), SIMPLE, LOWER);
}
*/
return Default.ucd.getCase(s, full ? FULL : SIMPLE, LOWER);
return Default.ucd.getCase(s, full ? FULL : SIMPLE, LOWER, condition);
}
static String upper(String s, boolean full) {
static String upper(String s, boolean full, String condition) {
/* if (!full) {
if (s.length() != 1) return s;
return Default.ucd.getCase(UTF32.char32At(s,0), FULL, UPPER);
}
*/
return Default.ucd.getCase(s, full ? FULL : SIMPLE, UPPER);
return Default.ucd.getCase(s, full ? FULL : SIMPLE, UPPER, condition);
}
static String title(String s, boolean full) {
static String title(String s, boolean full, String condition) {
/*if (!full) {
if (s.length() != 1) return s;
return Default.ucd.getCase(UTF32.char32At(s,0), FULL, TITLE);
}
*/
return Default.ucd.getCase(s, full ? FULL : SIMPLE, TITLE);
return Default.ucd.getCase(s, full ? FULL : SIMPLE, TITLE, condition);
}
static boolean add(Set set, String s, Map data) {
@ -557,7 +584,8 @@ public class GenerateCaseFolding implements UCD_Types {
System.out.println("Writing");
String newFile = "DerivedData/SpecialCasing" + suffix2 + GenerateData.getFileSuffix(true);
PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
String mostRecent = GenerateData.generateBat("DerivedData/", "SpecialCasing", suffix2 + GenerateData.getFileSuffix(true));
String[] batName = {""};
String mostRecent = GenerateData.generateBat("DerivedData/", "SpecialCasing", suffix2 + GenerateData.getFileSuffix(true), batName);
out.println("# SpecialCasing" + GenerateData.getFileSuffix(false));
out.println(GenerateData.generateDateLine());
out.println("#");
@ -594,6 +622,6 @@ public class GenerateCaseFolding implements UCD_Types {
}
Utility.appendFile("SpecialCasingFooter.txt", Utility.UTF8, out);
out.close();
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
}
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
* $Date: 2002/10/05 01:28:58 $
* $Revision: 1.23 $
* $Date: 2003/02/25 23:38:22 $
* $Revision: 1.24 $
*
*******************************************************************************
*/
@ -116,16 +116,13 @@ public class GenerateData implements UCD_Types {
output.println(generateDateLine());
output.println("#");
if (headerChoice == HEADER_SCRIPTS) {
output.println("# For documentation, see UTR #24: Script Names");
output.println("# http://www.unicode.org/unicode/reports/tr24/");
} else if (headerChoice == HEADER_EXTEND) {
output.println("# Unicode Character Database: Extended Properties");
output.println("# For documentation, see PropList.html");
} else {
output.println("# Unicode Character Database: Derived Property Data");
output.println("# Generated algorithmically from the Unicode Character Database");
output.println("# For documentation, see DerivedProperties.html");
}
output.println("# For documentation, see UCD.html");
output.println("# Note: Unassigned and Noncharacter codepoints are omitted,");
output.println("# except when listing Noncharacter or Cn.");
output.println(HORIZONTAL_LINE);
@ -144,12 +141,14 @@ public class GenerateData implements UCD_Types {
String newFile = directory + fileName + getFileSuffix(true);
System.out.println("New File: " + newFile);
PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
String mostRecent = generateBat(directory, fileName, getFileSuffix(true));
String[] batName = {""};
String mostRecent = generateBat(directory, fileName, getFileSuffix(true), batName);
System.out.println("Most recent: " + mostRecent);
doHeader(fileName + getFileSuffix(false), output, headerChoice);
for (int i = 0; i < DERIVED_PROPERTY_LIMIT; ++i) {
UnicodeProperty up = DerivedProperty.make(i, Default.ucd);
if (up == null) continue;
boolean keepGoing = true;
if (!up.isStandard()) keepGoing = false;
if ((up.getType() & type) == 0) keepGoing = false;
@ -164,7 +163,7 @@ public class GenerateData implements UCD_Types {
output.flush();
}
output.close();
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
}
/*
@ -192,7 +191,8 @@ public class GenerateData implements UCD_Types {
Default.setUCD();
String newFile = "DerivedData/CompositionExclusions" + getFileSuffix(true);
PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
String mostRecent = generateBat("DerivedData/", "CompositionExclusions", getFileSuffix(true));
String[] batName = {""};
String mostRecent = generateBat("DerivedData/", "CompositionExclusions", getFileSuffix(true), batName);
output.println("# CompositionExclusions" + getFileSuffix(false));
output.println(generateDateLine());
@ -248,7 +248,7 @@ public class GenerateData implements UCD_Types {
new CompLister(output, 4).print();
output.close();
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
}
static String generateDateLine() {
@ -538,12 +538,14 @@ public class GenerateData implements UCD_Types {
addLine(sorted, "qc", "M", "Maybe");
checkDuplicate(duplicates, accumulation, "M", "qc=Maybe");
addLine(sorted, "blk", "n/a", Utility.getUnskeleton("no block", true));
for (int i = 0; i < LIMIT_ENUM; ++i) {
int type = i & 0xFF00;
if (type == AGE) continue;
if (i == (BINARY_PROPERTIES | CaseFoldTurkishI)) continue;
if (i == (BINARY_PROPERTIES | Non_break)) continue;
if (i == (BINARY_PROPERTIES | Case_Sensitive)) continue;
if (type == NUMERIC_TYPE) {
//System.out.println("debug");
@ -658,7 +660,8 @@ public class GenerateData implements UCD_Types {
String filename = "PropertyAliases";
String newFile = "DerivedData/" + filename + getFileSuffix(true);
PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
String mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true));
String[] batName = {""};
String mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true), batName);
log.println("# " + filename + getFileSuffix(false));
log.println(generateDateLine());
@ -669,12 +672,12 @@ public class GenerateData implements UCD_Types {
Utility.print(log, sorted, "\r\n", new MyBreaker(true));
log.println();
log.close();
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
filename = "PropertyValueAliases";
newFile = "DerivedData/" + filename + getFileSuffix(true);
log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true));
mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true), batName);
log.println("# " + filename + getFileSuffix(false));
log.println(generateDateLine());
@ -685,12 +688,13 @@ public class GenerateData implements UCD_Types {
Utility.print(log, sorted, "\r\n", new MyBreaker(false));
log.println();
log.close();
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
filename = "PropertyAliasSummary";
newFile = "OtherData/" + filename + getFileSuffix(true);
log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
mostRecent = generateBat("OtherData/", filename, getFileSuffix(true));
mostRecent = generateBat("OtherData/", filename, getFileSuffix(true), batName);
log.println();
log.println(HORIZONTAL_LINE);
log.println();
@ -702,7 +706,7 @@ public class GenerateData implements UCD_Types {
Utility.print(log, accumulation, "\r\n", new MyBreaker(false));
log.println();
log.close();
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
}
static void addLine(Set sorted, String f1, String f2, String f3) {
@ -821,10 +825,10 @@ public class GenerateData implements UCD_Types {
*/
// static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1;
public static String generateBat(String directory, String fileRoot, String suffix) throws IOException {
public static String generateBat(String directory, String fileRoot, String suffix, String[] batName) throws IOException {
String mostRecent = Utility.getMostRecentUnicodeDataFile(fixFile(fileRoot), Default.ucd.getVersion(), true, true);
if (mostRecent != null) {
generateBatAux(directory + "DIFF/Diff_" + fileRoot + suffix,
batName[0] = generateBatAux(directory + "DIFF/Diff_" + fileRoot + suffix,
mostRecent, directory + fileRoot + suffix);
} else {
System.out.println("No previous version of: " + fileRoot + ".txt");
@ -839,8 +843,10 @@ public class GenerateData implements UCD_Types {
return mostRecent;
}
public static void generateBatAux(String batName, String oldName, String newName) throws IOException {
public static String generateBatAux(String batName, String oldName, String newName) throws IOException {
String fullBatName = batName + ".bat";
PrintWriter output = Utility.openPrintWriter(batName + ".bat", Utility.LATIN1_UNIX);
newName = Utility.getOutputName(newName);
System.out.println("Writing BAT to compare " + oldName + " and " + newName);
@ -851,6 +857,7 @@ public class GenerateData implements UCD_Types {
+ " "
+ newFile.getCanonicalFile());
output.close();
return new File(Utility.getOutputName(fullBatName)).getCanonicalFile().toString();
}
@ -860,20 +867,25 @@ public class GenerateData implements UCD_Types {
Default.setUCD();
String newFile = directory + file + getFileSuffix(true);
PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
String mostRecent = generateBat(directory, file, getFileSuffix(true));
String[] batName = {""};
String mostRecent = generateBat(directory, file, getFileSuffix(true), batName);
doHeader(file + getFileSuffix(false), output, headerChoice);
int last = -1;
for (int i = startEnum; i < endEnum; ++i) {
UnicodeProperty up = UnifiedBinaryProperty.make(i, Default.ucd);
if (up == null) continue;
if (up.isDefaultValue()) continue;
/*
if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
|| i == (BINARY_PROPERTIES | Non_break)
|| i == (BINARY_PROPERTIES | CaseFoldTurkishI)
|| i == (HANGUL_SYLLABLE_TYPE | NA)
|| i == (JOINING_TYPE | JT_U)
|| i == (JOINING_GROUP | NO_SHAPING)
) continue; // skip zero case
*/
/*if (skipSpecial == SKIP_SPECIAL
&& i >= (BINARY_PROPERTIES | CompositionExclusion)
&& i < (AGE + NEXT_ENUM)) continue;
@ -920,8 +932,8 @@ public class GenerateData implements UCD_Types {
output.flush();
}
output.close();
System.out.println("HERE");
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
//System.out.println("HERE");
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
System.out.println();
}
@ -929,7 +941,8 @@ public class GenerateData implements UCD_Types {
Default.setUCD();
String newFile = directory + fileName + getFileSuffix(true);
PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX);
String mostRecent = generateBat(directory, fileName, getFileSuffix(true));
String[] batName = {""};
String mostRecent = generateBat(directory, fileName, getFileSuffix(true), batName);
String[] example = new String[256];
@ -959,7 +972,7 @@ public class GenerateData implements UCD_Types {
log.println("# NFKD");
log.println("# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)");
log.println("#");
log.println("# 2. For every assigned Unicode 3.1.0 code point X that is not specifically");
log.println("# 2. For every code point X assigned in this version of Unicode that is not specifically");
log.println("# listed in Part 1, the following invariants must be true for all conformant");
log.println("# implementations:");
log.println("#");
@ -1038,7 +1051,7 @@ public class GenerateData implements UCD_Types {
log.println("#");
log.println("# END OF FILE");
log.close();
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
}
static void handleIdentical() throws IOException {
@ -1130,7 +1143,8 @@ public class GenerateData implements UCD_Types {
Default.setUCD();
String newFile = directory + filename + getFileSuffix(true);
PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
String mostRecent = generateBat(directory, filename, getFileSuffix(true));
String[] batName = {""};
String mostRecent = generateBat(directory, filename, getFileSuffix(true), batName);
DiffPropertyLister dpl;
UnicodeSet cummulative = new UnicodeSet();
@ -1203,7 +1217,7 @@ public class GenerateData implements UCD_Types {
} finally {
if (log != null) {
log.close();
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
}
}
}
@ -1212,7 +1226,8 @@ public class GenerateData implements UCD_Types {
Default.setUCD();
String newFile = directory + filename + getFileSuffix(true);
PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
String mostRecent = generateBat(directory, filename, getFileSuffix(true));
String[] batName = {""};
String mostRecent = generateBat(directory, filename, getFileSuffix(true), batName);
try {
log.println("# " + filename + getFileSuffix(false));
log.println(generateDateLine());
@ -1253,6 +1268,9 @@ public class GenerateData implements UCD_Types {
log.println(HORIZONTAL_LINE);
log.println();
new DiffPropertyLister("3.1.0", "3.2.0", log).print();
log.println(HORIZONTAL_LINE);
log.println();
new DiffPropertyLister("3.2.0", "4.0.0", log).print();
/*
printDiff("110", "200");
UnicodeSet u11 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-1.1.txt", false);
@ -1298,7 +1316,7 @@ public class GenerateData implements UCD_Types {
} finally {
if (log != null) {
log.close();
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
}
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
* $Date: 2002/10/05 01:28:58 $
* $Revision: 1.10 $
* $Date: 2003/02/25 23:38:22 $
* $Revision: 1.11 $
*
*******************************************************************************
*/
@ -73,7 +73,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
String property = line.substring(tabPos+1, tabPos2).trim();
String propertyValue = line.substring(tabPos2+1).trim();
if (propertyValue.indexOf("U+") >= 0) propertyValue = fixHex.transliterate(propertyValue);
if (propertyValue.indexOf("U+") >= 0) propertyValue = fromHexUnicode.transliterate(propertyValue);
HanInfo values = (HanInfo) properties.get(property);
if (values == null) {
@ -203,13 +203,15 @@ public final class GenerateHanTransliterator implements UCD_Types {
return (radical << 8) + strokes;
}
static Transliterator fixHex = Transliterator.getInstance("hex-any/unicode");
static Transliterator fromHexUnicode = Transliterator.getInstance("hex-any/unicode");
static Transliterator toHexUnicode = Transliterator.getInstance("any-hex/unicode");
/*
static String convertUPlus(String other) {
int pos1 = other.indexOf("U+");
if (pos1 < 0) return other;
return fixHex(
return fromHexUnicode(
pos1 += 2;
StringBuffer result = new StringBuffer();
@ -297,6 +299,47 @@ public final class GenerateHanTransliterator implements UCD_Types {
readFrequencyData(type);
Iterator it = fullPinyin.iterator();
while (it.hasNext()) {
String s = (String) it.next();
if (!isValidPinyin2(s)) {
err.println("?Valid Pinyin: " + s);
}
}
it = unihanMap.keySet().iterator();
Map badPinyin = new TreeMap();
PrintWriter out2 = Utility.openPrintWriter("Raw_mapping.txt", Utility.UTF8_WINDOWS);
try {
while (it.hasNext()) {
String keyChar = (String) it.next();
String def = (String) unihanMap.get(keyChar);
if (!isValidPinyin(def)) {
String fixedDef = fixPinyin(def);
err.println(Default.ucd.getCode(keyChar) + "\t" + keyChar + "\t" + fixedDef + "\t#" + def
+ (fixedDef.equals(def) ? " FAIL" : ""));
Utility.addToSet(badPinyin, def, keyChar);
}
// check both ways
String digitDef = accentPinyin_digitPinyin.transliterate(def);
String accentDef = digitPinyin_accentPinyin.transliterate(digitDef);
if (!accentDef.equals(def)) {
err.println("Failed Digit Pinyin: "
+ Default.ucd.getCode(keyChar) + "\t" + keyChar + "\t"
+ def + " => " + digitDef + " => " + accentDef);
}
out2.println(toHexUnicode.transliterate(keyChar)
+ "\tkMandarin\t" + digitDef.toUpperCase() + "\t# " + keyChar + ";\t" + def);
}
err.println();
err.println("Summary of Bad syllables");
Utility.printMapOfCollection(err, badPinyin, "\r\n", ":\t", ", ");
} finally {
out2.close();
}
out = Utility.openPrintWriter(filename, Utility.UTF8_WINDOWS);
out.println("# Start RAW data for converting CJK characters");
/*
@ -315,13 +358,12 @@ public final class GenerateHanTransliterator implements UCD_Types {
*/
Set gotAlready = new HashSet();
Iterator it = rankList.iterator();
Set lenSet = new TreeSet();
Set backSet = new TreeSet();
int rank = 0;
Map definitionCount = new HashMap();
it = rankList.iterator();
while (it.hasNext()) {
String keyChar = (String) it.next();
String def = (String) unihanMap.get(keyChar);
@ -478,6 +520,578 @@ public final class GenerateHanTransliterator implements UCD_Types {
}
}
// Pinyin syllable initials. Reference: http://fog.ccsf.cc.ca.us/~jliou/phonetic.htm
// Ordering note from the original author: longer ones must be AFTER!
// (e.g. "zh" comes after "z"); presumably the code that scans this table relies on it -- confirm before reordering.
static final String[] initialPinyin = {
    // a syllable may have no initial at all
    "",
    // single-letter initials
    "b", "p", "m", "f", "d", "t", "n", "l", "z", "c", "s",
    // two-letter initials (plus r), listed after their one-letter prefixes
    "zh", "ch", "sh", "r",
    "j", "q", "x", "g", "k", "h",
    // y and w are not true initials; added to make checking simpler
    "y", "w"
};
// Pinyin syllable finals, grouped by their leading vowel.
static final String[] finalPinyin = {
    // a-
    "a", "ai", "ao", "an", "ang",
    // o-
    "o", "ou", "ong",
    // e-
    "e", "ei", "er", "en", "eng",
    // i-
    "i", "ia", "iao", "ie", "iu",
    "ian", "in", "iang", "ing", "iong",
    // u-
    "u", "ua", "uo", "uai", "ui",
    "uan", "un", "uang", "ueng",
    // ü-
    "ü", "üe", "üan", "ün"
};
// Don't bother with the following rules; just add w,y to initials
// When i stands alone, a y will be added before it as yi.
// If i is the first letter of the syllable it will be changed to y.
// When u stands alone, a w will be added before it as wu.
// If u is the first letter of the syllable it will be changed to w. e.g. uang -> wang.
// When ü stands alone, a y will be added before it and ü will be changed to u as yu.
// If ü is the first letter of the syllable, then the spelling will be changed to yu. e.g. üan -> yuan.
// Note: The nasal final ueng never occurs after an initial; it always forms a syllable by itself.
// The o in iou is hidden, so it is written as iu. But don't forget to pronounce it.
// The e in uei is hidden, so it is written as ui. But don't forget to pronounce it.
/**
 * Flat lookup table of toneless pinyin syllables and their bopomofo (zhuyin) spellings.
 * Entries come in pairs: every even index holds the pinyin syllable (the key) and the
 * following odd index holds the bopomofo string for it. Keys are written with a literal
 * u-diaeresis where pinyin requires it (for example the syllables after l and n).
 * Every key must be non-empty and unique, because the keys are collected into the set
 * of valid pinyin syllables.
 */
public static final String[] pinyin_bopomofo = {
    "a", "\u311a",
    "ai", "\u311e",
    "an", "\u3122",
    "ang", "\u3124",
    "ao", "\u3120",
    "ba", "\u3105\u311a",
    "bai", "\u3105\u311e",
    "ban", "\u3105\u3122",
    "bang", "\u3105\u3124",
    "bao", "\u3105\u3120",
    "bei", "\u3105\u311f",
    "ben", "\u3105\u3123",
    "beng", "\u3105\u3125",
    "bi", "\u3105\u3127",
    "bian", "\u3105\u3127\u3122",
    "biao", "\u3105\u3127\u3120",
    "bie", "\u3105\u3127\u311d",
    "bin", "\u3105\u3127\u3123",
    "bing", "\u3105\u3127\u3125",
    "bo", "\u3105\u311b",
    "bu", "\u3105\u3128",
    "ca", "\u3118\u311a",
    "cai", "\u3118\u311e",
    "can", "\u3118\u3122",
    "cang", "\u3118\u3124",
    "cao", "\u3118\u3120",
    "ce", "\u3118\u311c",
    "cen", "\u3118\u3123",
    "ceng", "\u3118\u3125",
    "cha", "\u3114\u311a",
    "chai", "\u3114\u311e",
    "chan", "\u3114\u3122",
    "chang", "\u3114\u3124",
    "chao", "\u3114\u3120",
    "che", "\u3114\u311c",
    "chen", "\u3114\u3123",
    "cheng", "\u3114\u3125",
    "chi", "\u3114",
    "chong", "\u3114\u3128\u3125",
    "chou", "\u3114\u3121",
    "chu", "\u3114\u3128",
    //"chua", "XXX",
    "chuai", "\u3114\u3128\u311e",
    "chuan", "\u3114\u3128\u3122",
    "chuang", "\u3114\u3128\u3124",
    "chui", "\u3114\u3128\u311f",
    "chun", "\u3114\u3128\u3123",
    "chuo", "\u3114\u3128\u311b",
    "ci", "\u3118",
    "cong", "\u3118\u3128\u3125",
    "cou", "\u3118\u3121",
    "cu", "\u3118\u3128",
    "cuan", "\u3118\u3128\u3122",
    "cui", "\u3118\u3128\u311f",
    "cun", "\u3118\u3128\u3123",
    "cuo", "\u3118\u3128\u311b",
    "da", "\u3109\u311a",
    "dai", "\u3109\u311e",
    "dan", "\u3109\u3122",
    "dang", "\u3109\u3124",
    "dao", "\u3109\u3120",
    "de", "\u3109\u311c",
    "dei", "\u3109\u311f",
    "den", "\u3109\u3123",
    "deng", "\u3109\u3125",
    "di", "\u3109\u3127",
    "dia", "\u3109\u3127\u311a",
    "dian", "\u3109\u3127\u3122",
    "diao", "\u3109\u3127\u3120",
    "die", "\u3109\u3127\u311d",
    "ding", "\u3109\u3127\u3125",
    "diu", "\u3109\u3127\u3121",
    "dong", "\u3109\u3128\u3125",
    "dou", "\u3109\u3121",
    "du", "\u3109\u3128",
    "duan", "\u3109\u3128\u3122",
    "dui", "\u3109\u3128\u311f",
    "dun", "\u3109\u3128\u3123",
    "duo", "\u3109\u3128\u311b",
    "e", "\u311c",
    "ei", "\u311f",
    "en", "\u3123",
    "eng", "\u3125",
    "er", "\u3126",
    "fa", "\u3108\u311a",
    "fan", "\u3108\u3122",
    "fang", "\u3108\u3124",
    "fei", "\u3108\u311f",
    "fen", "\u3108\u3123",
    "feng", "\u3108\u3125",
    "fo", "\u3108\u311b",
    "fou", "\u3108\u3121",
    "fu", "\u3108\u3128",
    "ga", "\u310d\u311a",
    "gai", "\u310d\u311e",
    "gan", "\u310d\u3122",
    "gang", "\u310d\u3124",
    "gao", "\u310d\u3120",
    "ge", "\u310d\u311c",
    "gei", "\u310d\u311f",
    "gen", "\u310d\u3123",
    "geng", "\u310d\u3125",
    "gong", "\u310d\u3128\u3125",
    "gou", "\u310d\u3121",
    "gu", "\u310d\u3128",
    "gua", "\u310d\u3128\u311a",
    "guai", "\u310d\u3128\u311e",
    "guan", "\u310d\u3128\u3122",
    "guang", "\u310d\u3128\u3124",
    "gui", "\u310d\u3128\u311f",
    "gun", "\u310d\u3128\u3123",
    "guo", "\u310d\u3128\u311b",
    "ha", "\u310f\u311a",
    "hai", "\u310f\u311e",
    "han", "\u310f\u3122",
    "hang", "\u310f\u3124",
    "hao", "\u310f\u3120",
    "he", "\u310f\u311c",
    "hei", "\u310f\u311f",
    "hen", "\u310f\u3123",
    "heng", "\u310f\u3125",
    "hm", "\u310f\u3107",
    "hng", "\u310f\u312b", // 'dialect of n'
    "hong", "\u310f\u3128\u3125",
    "hou", "\u310f\u3121",
    "hu", "\u310f\u3128",
    "hua", "\u310f\u3128\u311a",
    "huai", "\u310f\u3128\u311e",
    "huan", "\u310f\u3128\u3122",
    "huang", "\u310f\u3128\u3124",
    "hui", "\u310f\u3128\u311f",
    "hun", "\u310f\u3128\u3123",
    "huo", "\u310f\u3128\u311b",
    "ji", "\u3110\u3127",
    "jia", "\u3110\u3127\u311a",
    "jian", "\u3110\u3127\u3122",
    "jiang", "\u3110\u3127\u3124",
    "jiao", "\u3110\u3127\u3120",
    "jie", "\u3110\u3127\u311d",
    "jin", "\u3110\u3127\u3123",
    "jing", "\u3110\u3127\u3125",
    "jiong", "\u3110\u3129\u3125",
    "jiu", "\u3110\u3127\u3121",
    "ju", "\u3110\u3129",
    "juan", "\u3110\u3129\u3122",
    "jue", "\u3110\u3129\u311d",
    "jun", "\u3110\u3129\u3123",
    "ka", "\u310e\u311a",
    "kai", "\u310e\u311e",
    "kan", "\u310e\u3122",
    "kang", "\u310e\u3124",
    "kao", "\u310e\u3120",
    "ke", "\u310e\u311c",
    "kei", "\u310e\u311f",
    "ken", "\u310e\u3123",
    "keng", "\u310e\u3125",
    "kong", "\u310e\u3128\u3125",
    "kou", "\u310e\u3121",
    "ku", "\u310e\u3128",
    "kua", "\u310e\u3128\u311a",
    "kuai", "\u310e\u3128\u311e",
    "kuan", "\u310e\u3128\u3122",
    "kuang", "\u310e\u3128\u3124",
    "kui", "\u310e\u3128\u311f",
    "kun", "\u310e\u3128\u3123",
    "kuo", "\u310e\u3128\u311b",
    "la", "\u310c\u311a",
    "lai", "\u310c\u311e",
    "lan", "\u310c\u3122",
    "lang", "\u310c\u3124",
    "lao", "\u310c\u3120",
    "le", "\u310c\u311c",
    "lei", "\u310c\u311f",
    "leng", "\u310c\u3125",
    "li", "\u310c\u3127",
    "lia", "\u310c\u3127\u311a",
    "lian", "\u310c\u3127\u3122",
    "liang", "\u310c\u3127\u3124",
    "liao", "\u310c\u3127\u3120",
    "lie", "\u310c\u3127\u311d",
    "lin", "\u310c\u3127\u3123",
    "ling", "\u310c\u3127\u3125",
    "liu", "\u310c\u3127\u3121",
    "lo", "\u310c\u311b",
    "long", "\u310c\u3128\u3125",
    "lou", "\u310c\u3121",
    "lu", "\u310c\u3128",
    "lü", "\u310c\u3129",
    "luan", "\u310c\u3128\u3122",
    "lüe", "\u310c\u3129\u311d",
    "lun", "\u310c\u3128\u3123",
    "luo", "\u310c\u3128\u311b",
    "m", "\u3107",
    "ma", "\u3107\u311a",
    "mai", "\u3107\u311e",
    "man", "\u3107\u3122",
    "mang", "\u3107\u3124",
    "mao", "\u3107\u3120",
    "me", "\u3107\u311c",
    "mei", "\u3107\u311f",
    "men", "\u3107\u3123",
    "meng", "\u3107\u3125",
    "mi", "\u3107\u3127",
    "mian", "\u3107\u3127\u3122",
    "miao", "\u3107\u3127\u3120",
    "mie", "\u3107\u3127\u311d",
    "min", "\u3107\u3127\u3123",
    "ming", "\u3107\u3127\u3125",
    "miu", "\u3107\u3127\u3121",
    "mo", "\u3107\u311b",
    "mou", "\u3107\u3121",
    "mu", "\u3107\u3128",
    "n", "\u310b",
    "na", "\u310b\u311a",
    "nai", "\u310b\u311e",
    "nan", "\u310b\u3122",
    "nang", "\u310b\u3124",
    "nao", "\u310b\u3120",
    "ne", "\u310b\u311c",
    "nei", "\u310b\u311f",
    "nen", "\u310b\u3123",
    "neng", "\u310b\u3125",
    "ng", "\u312b",
    "ni", "\u310b\u3127",
    "nian", "\u310b\u3127\u3122",
    "niang", "\u310b\u3127\u3124",
    "niao", "\u310b\u3127\u3120",
    "nie", "\u310b\u3127\u311d",
    "nin", "\u310b\u3127\u3123",
    "ning", "\u310b\u3127\u3125",
    "niu", "\u310b\u3127\u3121",
    "nong", "\u310b\u3128\u3125",
    "nou", "\u310b\u3121",
    "nu", "\u310b\u3128",
    "nü", "\u310b\u3129",
    "nuan", "\u310b\u3128\u3122",
    "nüe", "\u310b\u3129\u311d",
    "nuo", "\u310b\u3128\u311b",
    "o", "\u311b",
    "ou", "\u3121",
    "pa", "\u3106\u311a",
    "pai", "\u3106\u311e",
    "pan", "\u3106\u3122",
    "pang", "\u3106\u3124",
    "pao", "\u3106\u3120",
    "pei", "\u3106\u311f",
    "pen", "\u3106\u3123",
    "peng", "\u3106\u3125",
    "pi", "\u3106\u3127",
    "pian", "\u3106\u3127\u3122",
    "piao", "\u3106\u3127\u3120",
    "pie", "\u3106\u3127\u311d",
    "pin", "\u3106\u3127\u3123",
    "ping", "\u3106\u3127\u3125",
    "po", "\u3106\u311b",
    "pou", "\u3106\u3121",
    "pu", "\u3106\u3128",
    "qi", "\u3111\u3127",
    "qia", "\u3111\u3127\u311a",
    "qian", "\u3111\u3127\u3122",
    "qiang", "\u3111\u3127\u3124",
    "qiao", "\u3111\u3127\u3120",
    "qie", "\u3111\u3127\u311d",
    "qin", "\u3111\u3127\u3123",
    "qing", "\u3111\u3127\u3125",
    "qiong", "\u3111\u3129\u3125",
    "qiu", "\u3111\u3127\u3121",
    "qu", "\u3111\u3129",
    "quan", "\u3111\u3129\u3122",
    "que", "\u3111\u3129\u311d",
    "qun", "\u3111\u3129\u3123",
    "ran", "\u3116\u3122",
    "rang", "\u3116\u3124",
    "rao", "\u3116\u3120",
    "re", "\u3116\u311c",
    "ren", "\u3116\u3123",
    "reng", "\u3116\u3125",
    "ri", "\u3116",
    "rong", "\u3116\u3128\u3125",
    "rou", "\u3116\u3121",
    "ru", "\u3116\u3128",
    "ruan", "\u3116\u3128\u3122",
    "rui", "\u3116\u3128\u311f",
    "run", "\u3116\u3128\u3123",
    "ruo", "\u3116\u3128\u311b",
    "sa", "\u3119\u311a",
    "sai", "\u3119\u311e",
    "san", "\u3119\u3122",
    "sang", "\u3119\u3124",
    "sao", "\u3119\u3120",
    "se", "\u3119\u311c",
    "sen", "\u3119\u3123",
    "seng", "\u3119\u3125",
    "sha", "\u3115\u311a",
    "shai", "\u3115\u311e",
    "shan", "\u3115\u3122",
    "shang", "\u3115\u3124",
    "shao", "\u3115\u3120",
    "she", "\u3115\u311c",
    "shei", "\u3115\u311f",
    "shen", "\u3115\u3123",
    "sheng", "\u3115\u3125",
    "shi", "\u3115",
    "shou", "\u3115\u3121",
    "shu", "\u3115\u3128",
    "shua", "\u3115\u3128\u311a",
    "shuai", "\u3115\u3128\u311e",
    "shuan", "\u3115\u3128\u3122",
    "shuang", "\u3115\u3128\u3124",
    "shui", "\u3115\u3128\u311f",
    "shun", "\u3115\u3128\u3123",
    "shuo", "\u3115\u3128\u311b",
    "si", "\u3119",
    "song", "\u3119\u3128\u3125",
    "sou", "\u3119\u3121",
    "su", "\u3119\u3128",
    "suan", "\u3119\u3128\u3122",
    "sui", "\u3119\u3128\u311f",
    "sun", "\u3119\u3128\u3123",
    "suo", "\u3119\u3128\u311b",
    "ta", "\u310a\u311a",
    "tai", "\u310a\u311e",
    "tan", "\u310a\u3122",
    "tang", "\u310a\u3124",
    "tao", "\u310a\u3120",
    "te", "\u310a\u311c",
    "teng", "\u310a\u3125",
    "ti", "\u310a\u3127",
    "tian", "\u310a\u3127\u3122",
    "tiao", "\u310a\u3127\u3120",
    "tie", "\u310a\u3127\u311d",
    "ting", "\u310a\u3127\u3125",
    "tong", "\u310a\u3128\u3125",
    "tou", "\u310a\u3121",
    "tu", "\u310a\u3128",
    "tuan", "\u310a\u3128\u3122",
    "tui", "\u310a\u3128\u311f",
    "tun", "\u310a\u3128\u3123",
    "tuo", "\u310a\u3128\u311b",
    "wa", "\u3128\u311a",
    "wai", "\u3128\u311e",
    "wan", "\u3128\u3122",
    "wang", "\u3128\u3124",
    "wei", "\u3128\u311f",
    "wen", "\u3128\u3123",
    "weng", "\u3128\u3125",
    "wo", "\u3128\u311b",
    "wu", "\u3128",
    "xi", "\u3112\u3127",
    "xia", "\u3112\u3127\u311a",
    "xian", "\u3112\u3127\u3122",
    "xiang", "\u3112\u3127\u3124",
    "xiao", "\u3112\u3127\u3120",
    "xie", "\u3112\u3127\u311d",
    "xin", "\u3112\u3127\u3123",
    "xing", "\u3112\u3127\u3125",
    "xiong", "\u3112\u3129\u3125",
    "xiu", "\u3112\u3127\u3121",
    "xu", "\u3112\u3129",
    "xuan", "\u3112\u3129\u3122",
    "xue", "\u3112\u3129\u311d",
    "xun", "\u3112\u3129\u3123",
    "ya", "\u3127\u311a",
    "yai", "\u3127\u311e", // not in xinhua zidian index, but listed as alternate pronunciation
    "yan", "\u3127\u3122",
    "yang", "\u3127\u3124",
    "yao", "\u3127\u3120",
    "ye", "\u3127\u311d",
    "yi", "\u3127",
    "yin", "\u3127\u3123",
    "ying", "\u3127\u3125",
    "yo", "\u3127\u311b",
    "yong", "\u3129\u3125",
    "you", "\u3127\u3121",
    "yu", "\u3129",
    "yuan", "\u3129\u3122",
    "yue", "\u3129\u311d",
    "yun", "\u3129\u3123",
    "za", "\u3117\u311a",
    "zai", "\u3117\u311e",
    "zan", "\u3117\u3122",
    "zang", "\u3117\u3124",
    "zao", "\u3117\u3120",
    "ze", "\u3117\u311c",
    "zei", "\u3117\u311f",
    "zen", "\u3117\u3123",
    "zeng", "\u3117\u3125",
    "zha", "\u3113\u311a",
    "zhai", "\u3113\u311e",
    "zhan", "\u3113\u3122",
    "zhang", "\u3113\u3124",
    "zhao", "\u3113\u3120",
    "zhe", "\u3113\u311c",
    "zhei", "\u3113\u311f",
    "zhen", "\u3113\u3123",
    "zheng", "\u3113\u3125",
    "zhi", "\u3113",
    "zhong", "\u3113\u3128\u3125",
    "zhou", "\u3113\u3121",
    "zhu", "\u3113\u3128",
    "zhua", "\u3113\u3128\u311a",
    "zhuai", "\u3113\u3128\u311e",
    "zhuan", "\u3113\u3128\u3122",
    "zhuang", "\u3113\u3128\u3124",
    "zhui", "\u3113\u3128\u311f",
    "zhun", "\u3113\u3128\u3123",
    "zhuo", "\u3113\u3128\u311b",
    "zi", "\u3117",
    "zong", "\u3117\u3128\u3125",
    "zou", "\u3117\u3121",
    "zu", "\u3117\u3128",
    "zuan", "\u3117\u3128\u3122",
    "zui", "\u3117\u3128\u311f",
    "zun", "\u3117\u3128\u3123",
    "zuo", "\u3117\u3128\u311b",
};
/** Sorted set of every pinyin key found in pinyin_bopomofo. */
static final Set fullPinyin = new TreeSet();
static {
    // The table alternates key, value, key, value ...; only the keys (even slots) are collected.
    int slot = 0;
    while (slot < pinyin_bopomofo.length) {
        fullPinyin.add(pinyin_bopomofo[slot]);
        slot += 2;
    }
}
/**
 * Tests whether a pinyin syllable is known.
 * The tone marks are removed with dropTones and the remainder is looked up in fullPinyin.
 *
 * @param s the syllable to check, possibly carrying tone marks
 * @return true if the toneless form is a member of fullPinyin
 */
static boolean isValidPinyin(String s) {
    String toneless = dropTones.transliterate(s);
    return fullPinyin.contains(toneless);
}
/**
 * Alternative validity check that splits a syllable into an initial and a final.
 * After dropping tone marks, initialPinyin is scanned from its last entry to its first;
 * the first entry that prefixes the syllable decides the outcome: the result is true
 * only if the rest of the syllable equals some entry of finalPinyin.
 *
 * @param s the syllable to check, possibly carrying tone marks
 * @return true if the syllable is (matching initial) + (known final); false otherwise,
 *         including when no initial matches
 */
static boolean isValidPinyin2(String s) {
    String syllable = dropTones.transliterate(s);
    int initialIndex = initialPinyin.length;
    while (--initialIndex >= 0) {
        String initial = initialPinyin[initialIndex];
        if (!syllable.startsWith(initial)) {
            continue;
        }
        // Only this first matching initial is considered; no other initial is tried afterwards.
        String remainder = syllable.substring(initial.length());
        boolean knownFinal = false;
        for (int k = 0; k < finalPinyin.length && !knownFinal; ++k) {
            knownFinal = remainder.equals(finalPinyin[k]);
        }
        return knownFinal;
    }
    return false;
}
/*
U+347C · liù #lyuè
U+3500 · lüè #lvè
U+3527 · liù #lyù
U+3729 · ào #àu
U+380E · #jjí
U+3825 · l· #lv·
U+3A3C · lüè #luè
U+3B5A · li· #ly· *** ?
U+3CB6 · l· #lv·
U+3D56 · niù #nyù *** ?
U+3D88 · li·ng #li·ng
U+3EF2 · li· #ly·*** ?
U+3F94 · li· #ly·*** ?
U+4071 · ào #àu
U+40AE · liù #lyuè *** lüe?
U+430E · liù #lyuè *** lüe?
U+451E · liù #lyù *** ?
U+4588 · nüè #nuè
U+458B · nüè #nuè
U+45A1 · niù #nyù *** ?
U+4610 · niù #nyù *** ?
U+46BC · niù #nyù *** ?
U+46DA · liù #lyuè *** lüe?
U+4896 · liù #lyù *** ?
U+4923 · liù #lyuè *** lüe?
U+4968 · liù #lyù *** ?
U+4A0B · niù #nyuè *** nüe?
U+4AC4 · chuò #chuà
U+4D08 · ·o #·u
U+4D8A · niù #nyù *** ?
U+51CA · qíng #qýng
U+51D6 · zhu·n #zhu·n *** this is probably zh·n
U+5481 · gàn #gèm
U+5838 · féng #fúng
U+639F · · #lu· *** this pronunciation surprises me, but I don't know...
U+66D5 · yàn #yiàn
U+6B3B · chu· #chu· *** chua _is_ ok after all, my table missed an entry
U+6B56 · chu· #chu· *** chua
U+6C7C · ni· #ni·u
U+6E6D · qiú #qióu
U+6F71 · y· #yi·
U+7493 · xiù #xiòu
U+7607 · zh·ng #zh·ng *** I suspect zh·ng
U+7674 · luán #lüán
U+7867 · y·ng #i·ng
U+7878 · nüè #nuè
*/
// Rule-based transliterator that rewrites commonly mistyped pinyin spellings.
// fixPinyin applies it to text that has already been converted from accented to
// digit-toned form, so the rules see plain letters followed by a tone digit.
// $cons is the set of consonant letters; $nlet is any character that is neither a
// Letter nor a Mark. In "A{B}C > D" the text B is replaced by D only when preceded
// by A and followed by C (ICU rule syntax; confirm details against the ICU rule documentation).
// NOTE(review): rule order matters, since earlier rules get the first chance to match.
static Transliterator fixTypos = Transliterator.createFromRules("fix_typos",
"$cons=[bcdfghjklmnpqrstvwxyz];"
+"$nlet=[^[:Letter:][:Mark:]];"
// finals that follow a consonant and end the syllable
+"$cons{iou}$nlet > iu;"
+"$cons{em}$nlet > an;"
+"$cons{uen}$nlet > ueng;"
+"$cons{ve}$nlet > üe;"
+"$cons{v}$nlet > ü;"
+"$cons{yue}$nlet > iu;"
+"$cons{yng}$nlet > ing;"
+"$cons{yu}$nlet > iu;"
//+"$cons{ue} > üe;"
// doubled j anywhere collapses to a single j
+"jj > j;"
//+"$nlet{ng}$nlet > eng;"
//+"$nlet{n}$nlet > en;"
//+"$nlet{m}$nlet > en;"
// a stand-alone "au" (non-letters on both sides) becomes "ao"
+"$nlet{au}$nlet > ao;"
// new fixes
+"zhueng}$nlet > zhong;"
+"zhuen}$nlet > zhuan;"
+"lue > lüe;"
+"liong > liang;"
+"nue > nüe;"
+"chua > chuo;"
+"yian > yan;"
+"yie > ye;"
+"lüan > luan;"
+"iong > yong;"
, Transliterator.FORWARD);
/**
 * Attempts to repair a misspelled pinyin string.
 * The text is converted from accented to digit-toned form, passed through fixTypos,
 * and converted back to accented form. The repaired text is used only if
 * isValidPinyin accepts it.
 *
 * @param s the pinyin text as read from the data
 * @return the repaired text when it is valid pinyin, otherwise s unchanged
 */
static String fixPinyin(String s) {
    String withDigits = accentPinyin_digitPinyin.transliterate(s);
    String repaired = fixTypos.transliterate(withDigits);
    String candidate = digitPinyin_accentPinyin.transliterate(repaired);
    return isValidPinyin(candidate) ? candidate : s;
}
// Shared output writers for this class. None is initialized at declaration.
// log: diagnostic messages; assigned in fixChineseOverrides and written by digitToPinyin.
static PrintWriter log;
// out: generated data; assigned in fixChineseOverrides.
static PrintWriter out;
// err: problem reports (e.g. "Overriding Failure" lines); where it is opened is not visible in this section.
static PrintWriter err;
@ -734,7 +1348,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
if (type == JAPANESE) {
processEdict(word, definition, line);
} else {
definition = convertPinyin.transliterate(definition);
definition = digitToPinyin(definition, line);
//definition = Utility.replace(definition, " ", "\\ ");
addCheck(word, definition, line);
}
@ -755,20 +1369,37 @@ public final class GenerateHanTransliterator implements UCD_Types {
int counter = 0;
String[] pieces = new String[50];
String line = "";
boolean noOverrideFailure = true;
try {
while (true) {
line = Utility.readDataLine(br);
if (line == null) break;
if (line.length() == 0) continue;
Utility.dot(counter++);
//System.out.println(line);
// skip code
line=line.toLowerCase();
int wordStart = line.indexOf('\t') + 1;
int wordEnd = line.indexOf('\t', wordStart);
String word = line.substring(wordStart, wordEnd);
String definition = line.substring(wordEnd+1);
addCheck(word, definition, line);
overrideSet.add(word);
String definition = fixPinyin(line.substring(wordEnd+1));
String old = (String) unihanMap.get(word);
if (old != null) {
if (!old.equals(definition)) {
if (noOverrideFailure) {
System.out.println("Overriding Failure");
noOverrideFailure = false;
}
err.println("Overriding Failure: " + word
+ "\t" + old + " " + toHexUnicode.transliterate(old)
+ "\t" + definition + " " + toHexUnicode.transliterate(definition));
}
} else {
addCheck(word, definition, line);
overrideSet.add(word);
}
}
br.close();
} catch (Exception e) {
@ -776,6 +1407,81 @@ public final class GenerateHanTransliterator implements UCD_Types {
}
}
/*
@Unihan Data
Bad pinyin data: \u4E7F ? LE
\u7684 ? de, de, ,
*/
/**
 * Regenerates the Chinese override list from a hand-corrected transliteration log.
 *
 * Reads fixed_Chinese_transliterate_log.txt (UTF-8) from the dict directory under BASE_DIR.
 * For each data line it writes "hex(word) TAB word TAB definition" to
 * new_Chinese_override.txt. Diagnostics go to Transliterate_log.txt.
 *
 * Line handling:
 *   - empty lines and lines starting with '@' are skipped;
 *   - a leading byte-order mark and the prefix "Bad pinyin data: " are stripped;
 *   - the line is lower-cased;
 *   - the word is the trimmed text between the first and second tab;
 *   - the definition is the trimmed text after the second tab, up to the first comma
 *     (or end of line), passed through fixCircumflex;
 *   - a definition without any tone mark gets tone "1" appended and is converted to
 *     accented form; the change is recorded in the log.
 *
 * Both writers and the reader are closed before returning, including on failure, so
 * buffered log and output text is not lost.
 *
 * @throws IOException if a file cannot be opened or closed
 * @throws ChainException wrapping any failure while processing a line, with the line
 *         counter and the offending line
 */
static void fixChineseOverrides() throws IOException {
    log = Utility.openPrintWriter("Transliterate_log.txt", Utility.UTF8_WINDOWS);
    try {
        out = Utility.openPrintWriter("new_Chinese_override.txt", Utility.UTF8_WINDOWS);
        try {
            String fname = "fixed_Chinese_transliterate_log.txt";

            int counter = 0;
            String line = "";
            String pinyinPrefix = "Bad pinyin data: ";

            System.out.println("Reading " + fname);
            BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, Utility.UTF8);
            try {
                while (true) {
                    line = Utility.readDataLine(br);
                    if (line == null) break;
                    if (line.length() == 0) continue;
                    if (line.charAt(0) == 0xFEFF) {
                        line = line.substring(1); // remove BOM
                        if (line.length() == 0) continue;
                    }
                    Utility.dot(counter++);
                    if (line.charAt(0) == '@') continue;
                    if (line.startsWith(pinyinPrefix)) {
                        line = line.substring(pinyinPrefix.length());
                    }
                    line = line.toLowerCase();
                    //System.out.println(Default.ucd.getCode(line));
                    // skip code
                    int wordStart = line.indexOf('\t') + 1;
                    int wordEnd = line.indexOf('\t', wordStart);
                    String word = line.substring(wordStart, wordEnd).trim();
                    // only the first comma-separated reading is kept
                    int defStart = wordEnd+1;
                    int defEnd = line.indexOf(',', defStart);
                    if (defEnd < 0) defEnd = line.length();
                    String definition = fixCircumflex.transliterate(line.substring(defStart, defEnd).trim());
                    String notones = dropTones.transliterate(definition);
                    // unchanged by dropTones means the reading carried no tone mark at all
                    if (definition.equals(notones)) {
                        definition = digitPinyin_accentPinyin.transliterate(definition + "1");
                        if (definition == null) {
                            System.out.println("Huh? " + notones);
                        }
                        log.println("Fixing: " + notones + " => " + definition + "; " + line);
                    }
                    out.println(hex.transliterate(word) + "\t" + word + "\t" + definition);
                }
            } catch (Exception e) {
                throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e);
            } finally {
                br.close();
            }
        } finally {
            out.close();
        }
    } finally {
        // the log was previously left open, so its buffered tail could be lost
        log.close();
    }
}
static Set overrideSet = new HashSet();
static void processEdict(String word, String definition, String line) {
@ -997,7 +1703,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
static void readCDICT() throws IOException {
System.out.println("Reading cdict.txt");
BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\cdict.txt", Utility.UTF8);
String fname = "cdict.txt";
BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, Utility.UTF8);
int counter = 0;
String[] pieces = new String[50];
String line = "";
@ -1026,7 +1734,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
}
for (int i = 0; i < len; ++i) {
String chr = word.substring(i, i+1);
String piece = convertPinyin.transliterate(pieces[i]);
String piece = digitToPinyin(pieces[i], line);
Map oldMap = (Map) cdict.get(chr);
if (oldMap == null) {
oldMap = new TreeMap();
@ -1069,6 +1779,11 @@ public final class GenerateHanTransliterator implements UCD_Types {
}
}
/**
 * Converts digit-toned pinyin to accented pinyin via digitPinyin_accentPinyin.
 * If the text contains the digit 5, the originating line is first reported to the log.
 *
 * @param source the digit-toned pinyin text
 * @param line the input line the text came from; used only in the log message
 * @return the transliterated text
 */
static String digitToPinyin(String source, String line) {
    boolean mentionsToneFive = source.indexOf('5') != -1;
    if (mentionsToneFive) {
        log.println("Pinyin Tone5 at: " + line);
    }
    return digitPinyin_accentPinyin.transliterate(source);
}
static Map cdict = new TreeMap();
static Map simplifiedToTraditional = new HashMap();
static Map traditionalToSimplified = new HashMap();
@ -1098,7 +1813,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
String property = line.substring(tabPos+1, tabPos2).trim();
String propertyValue = line.substring(tabPos2+1).trim();
if (propertyValue.indexOf("U+") >= 0) propertyValue = fixHex.transliterate(propertyValue);
if (propertyValue.indexOf("U+") >= 0) propertyValue = fromHexUnicode.transliterate(propertyValue);
// gather traditional mapping
if (property.equals("kTraditionalVariant")) {
@ -1160,7 +1875,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
}
definition = definition.substring(0, end3);
definition = convertPinyin.transliterate(definition);
definition = digitToPinyin(definition, line);
}
if (type == DEFINITION) {
definition = removeMatched(definition,'(', ')', line);
@ -1220,7 +1935,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
return source;
}
static Map unihanMap = new HashMap();
static Map unihanMap = new TreeMap(); // could be hashmap
static Map duplicates = new TreeMap();
static boolean unihanNonSingular = false;
@ -1274,14 +1989,26 @@ public final class GenerateHanTransliterator implements UCD_Types {
}
}
static Transliterator convertPinyin;
static Transliterator digitPinyin_accentPinyin;
// Accented pinyin -> digit-toned pinyin.
// Works on NFD text: the first rule moves a tone mark (U+0304 macron, U+0301 acute,
// U+030C caron, U+0300 grave, U+0306 breve) behind the run of marks/letters that follows it;
// the second line replaces the marks by digits: macron 1, acute 2, caron and breve 3, grave 4.
// The result is recomposed with NFC.
// NOTE(review): the '|' in "$2 | $1" presumably positions the cursor so the moved mark is
// examined again - confirm against the ICU rule syntax.
static Transliterator accentPinyin_digitPinyin = Transliterator.createFromRules("accentPinyin_digitPinyin",
"::NFD; "
+ " ([\u0304\u0301\u030C\u0300\u0306]) ([[:Mark:][:Letter:]]+) > $2 | $1;"
+ "\u0304 > '1'; \u0301 > '2'; \u030C > '3'; \u0300 > '4'; \u0306 > '3';"
+ " ::NFC;", Transliterator.FORWARD);
// Normalizes the third-tone mark: despite the name, the rule replaces U+0306
// (combining breve) with U+030C (combining caron), between NFD and NFC passes.
static Transliterator fixCircumflex = Transliterator.createFromRules("fix_circumflex",
"::NFD; \u0306 > \u030C; ::NFC;", Transliterator.FORWARD);
// Strips tone marks: deletes macron, acute, caron, grave and breve from NFD text, then recomposes (NFC).
static Transliterator dropTones = Transliterator.createFromRules("drop_tones",
"::NFD; \u0304 > ; \u0301 > ; \u030C > ; \u0300 > ; \u0306 > ; ::NFC;", Transliterator.FORWARD);
static {
String dt = "1 > ;\n"
String dt = "1 > \u0304;\n"
+ "2 <> \u0301;\n"
+ "3 <> \u0306;\n"
+ "3 <> \u030C;\n"
+ "4 <> \u0300;\n"
+ "5 <> \u0304;";
+ "5 <> ;";
String dp = "# syllable is ...vowel+ consonant* number\n"
+ "# 'a', 'e' are the preferred bases\n"
@ -1301,8 +2028,8 @@ public final class GenerateHanTransliterator implements UCD_Types {
System.out.println(at.transliterate("a1a2a3a4a5"));
DummyFactory.add(at.getID(), at);
convertPinyin = Transliterator.createFromRules("digit-pinyin", dp, Transliterator.FORWARD);
System.out.println(convertPinyin.transliterate("an2 aon2 oan2 ion2 oin2 uin2 iun2"));
digitPinyin_accentPinyin = Transliterator.createFromRules("digit-pinyin", dp, Transliterator.FORWARD);
System.out.println(digitPinyin_accentPinyin.transliterate("an2 aon2 oan2 ion2 oin2 uin2 iun2"));
}
/*

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
* $Date: 2002/10/05 01:28:58 $
* $Revision: 1.25 $
* $Date: 2003/02/25 23:38:22 $
* $Revision: 1.26 $
*
*******************************************************************************
*/
@ -37,7 +37,10 @@ public final class Main implements UCD_Types {
"PropList",
"Scripts",
"SpecialCasing",
"HangulSyllableType",
"DerivedAge",
"StandardizedVariants",
//"HangulSyllable",
//"OtherDerivedProperties",
};
@ -71,6 +74,10 @@ public final class Main implements UCD_Types {
else if (arg.equalsIgnoreCase("pinYinTransliterator")) GenerateHanTransliterator.main(2);
else if (arg.equalsIgnoreCase("hanproperties")) GenerateHanTransliterator.readUnihan();
else if (arg.equalsIgnoreCase("fixChineseOverrides")) GenerateHanTransliterator.fixChineseOverrides();
else if (arg.equalsIgnoreCase("compareBlueberry")) VerifyUCD.compareBlueberry();
else if (arg.equalsIgnoreCase("testenum")) SampleEnum.test();
@ -115,6 +122,7 @@ public final class Main implements UCD_Types {
else if (arg.equalsIgnoreCase("JavascriptProperties")) WriteJavaScriptInfo.assigned();
else if (arg.equalsIgnoreCase("TestDirectoryIterator")) DirectoryIterator.test();
else if (arg.equalsIgnoreCase("checkIdentical")) GenerateData.handleIdentical();
else if (arg.equalsIgnoreCase("testnameuniqueness")) TestNameUniqueness.test();
//else if (arg.equalsIgnoreCase("NormalizationCharts")) ChartGenerator.writeNormalizationCharts();
@ -191,10 +199,17 @@ public final class Main implements UCD_Types {
GenerateData.generateVerticalSlice(NUMERIC_TYPE, NUMERIC_TYPE+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/extracted/", "DerivedNumericType" );
} else if (arg.equalsIgnoreCase("HangulSyllableType")) {
GenerateData.generateVerticalSlice(HANGUL_SYLLABLE_TYPE,HANGUL_SYLLABLE_TYPE+NEXT_ENUM, GenerateData.HEADER_EXTEND,
"DerivedData/", "HangulSyllableType" );
} else if (arg.equalsIgnoreCase("DerivedNumericValues")) {
GenerateData.generateVerticalSlice(LIMIT_ENUM, LIMIT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/extracted/", "DerivedNumericValues" );
} else if (arg.equalsIgnoreCase("StandardizedVariants")) {
GenerateStandardizedVariants.generate();
// OTHER STANDARD PROPERTIES
} else if (arg.equalsIgnoreCase("CaseFolding")) {
@ -239,7 +254,7 @@ public final class Main implements UCD_Types {
} else if (arg.equalsIgnoreCase("OtherDerivedProperties")) {
//mask = Utility.setBits(0, NFC_Leading, NFC_Resulting);
GenerateData.generateDerived(ALL, false, GenerateData.HEADER_DERIVED, "OtherData/", "OtherDerivedProperties");
GenerateData.generateDerived((byte)(ALL & ~DERIVED_CORE & ~DERIVED_NORMALIZATION), false, GenerateData.HEADER_DERIVED, "OtherData/", "OtherDerivedProperties");
} else if (arg.equalsIgnoreCase("AllBinary")) {
GenerateData.generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES + NEXT_ENUM,

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
* $Date: 2002/07/30 09:56:41 $
* $Revision: 1.13 $
* $Date: 2003/02/25 23:38:22 $
* $Revision: 1.14 $
*
*******************************************************************************
*/
@ -416,7 +416,11 @@ public final class Normalizer implements UCD_Types {
String s = ucd.getDecompositionMapping(i);
int len = UTF16.countCodePoint(s);
if (len != 2) {
if (len > 2) throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
if (len > 2) {
if (ucd.getVersion().compareTo("3.0.0") >= 0) {
throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
}
}
continue;
}
int a = UTF16.charAt(s, 0);

View File

@ -1,9 +1,7 @@
# This file contains aliases for properties used in the UCD.
# These names can be used for XML formats of UCD data, for regular-expression
# property tests, and other programmatic textual descriptions of Unicode data.
# The names are not normative, except where they correspond to normative
# properties in the UCD. For information on which properties are normative,
# see UnicodeCharacterDatabase.html.
# For information on which properties are normative, see UCD.html.
#
# The names may be translated in appropriate environments, and additional
# aliases may be useful.
@ -20,16 +18,14 @@
# and '_' are ignored.
#
# NOTE: Currently there is at most one abbreviated name and one long name for
# each property. However, in the future additional aliases
# may be added. In such a case, the first line for the property
# would have the preferred alias for output.
# each property. However, in the future additional aliases may be added.
#
# NOTE: The property value names are NOT unique across properties, especially
# with loose matches. For example,
# with loose matches. For example:
#
# AL means Arabic Letter for the Bidi_Class property, and
# AL means Alpha_Left for the Combining_Class property, and
# AL means Alphabetic for the Line_Break property.
# AL means Arabic Letter for the Bidi_Class property, and
# AL means Alpha_Left for the Combining_Class property, and
# AL means Alphabetic for the Line_Break property.
#
# In addition, some property names may be the same as some property value names.
#

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/PropertyLister.java,v $
* $Date: 2002/05/29 02:01:00 $
* $Revision: 1.9 $
* $Date: 2003/02/25 23:38:22 $
* $Revision: 1.10 $
*
*******************************************************************************
*/
@ -57,7 +57,7 @@ abstract public class PropertyLister implements UCD_Types {
}
public String optionalComment(int cp) {
if (!usePropertyComment || !breakByCategory) return "";
if (!usePropertyComment) return "";
return ucdData.getModCatID_fromIndex(getModCat(cp));
}
@ -143,7 +143,8 @@ abstract public class PropertyLister implements UCD_Types {
}
byte getModCat(int cp) {
return ucdData.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : 0);
byte result = ucdData.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : 0);
return result;
}

View File

@ -1,9 +1,7 @@
# This file contains aliases for property values used in the UCD.
# These names can be used for XML formats of UCD data, for regular-expression
# property tests, and other programmatic textual descriptions of Unicode data.
# The names are not normative, except where they correspond to normative property
# values in the UCD. For information on which properties are normative, see
# UnicodeCharacterDatabase.html.
# For information on which properties are normative, see UCD.html.
#
# The names may be translated in appropriate environments, and additional
# aliases may be useful.
@ -22,29 +20,29 @@
#
# Third Field: The third field is a long name.
#
# In the case of ccc, their are 4 fields. The second field is numeric, third
# In the case of ccc, there are 4 fields. The second field is numeric, third
# is abbreviated, and fourth is long.
#
# With loose matching of property names, the case distinctions, whitespace,
# and '_' are ignored.
#
# NOTE: The Block property values are in Blocks.txt, and not repeated here.
# For more information on the use of blocks, see UTR #18: Regular Expression Guidelines
#
# NOTE: Currently there is at most one abbreviated name and one long name for
# property value. However, in the future additional aliases
# may be added. In such a case, the first line for the property value
# would have the preferred alias for output.
# property value. However, in the future additional aliases may be added.
# In such a case, the first line for the property value would have
# the preferred alias for output.
#
# NOTE: The property value names are NOT unique across properties, especially
# with loose matches. For example,
# with loose matches. For example:
#
# AL means Arabic Letter for the Bidi_Class property, and
# AL means Alpha_Left for the Combining_Class property, and
# AL means Alphabetic for the Line_Break property.
#
# In addition, some property names may be the same as some property value names:
# cc means Combining_Class property, and
# cc means the General_Category property value Control (cc)
# In addition, some property names may be the same as some property value names.
# For example:
#
# cc means Combining_Class property, and
# cc means the General_Category property value Control (cc)
#
# The combination of property value and property name is, however, unique.
# For more information, see UTR #24: Regular Expression Guidelines
# For more information, see UTR #18: Regular Expression Guidelines

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $
* $Date: 2002/10/05 01:28:58 $
* $Revision: 1.1 $
* $Date: 2003/02/25 23:38:22 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
@ -23,25 +23,38 @@ import com.ibm.text.utility.*;
public class QuickTest implements UCD_Types {
static final void test() {
Default.setUCD();
UnicodeSet format = new UnicodeSet("[:Cf:]");
/*
[4] NameStartChar := ":" | [A-Z] | "_" | [a-z] |
[#xC0 - #x2FF] | [#x370 - #x37D] | [#x37F - #x1FFF] |
[#x200C - #x200D] | [#x2070 - #x218F] | [#x2C00 - #x2FEF] |
[#x3001 - #xD7FF] | [#xF900 - #xF9FF] | [#x10000 - #xDFFFF]
[4a] NameChar := NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F]
[4] NameStartChar := ":" | [A-Z] | "_" | [a-z] |
[#xC0-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] |
[#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
[#x3001-#xD7FF] | [#xF900-#xEFFFF]
[4a] NameChar := NameStartChar | "-" | "." | [0-9] | #xB7 |
[#x0300-#x036F] | [#x203F-#x2040]
*/
UnicodeSet nameStartChar = new UnicodeSet("[\\: A-Z \\_ a-z"
+ "\\u00c0-\\u02FF \\u0370-\\u037D \\u037F-\\u1FFF"
+ "\\u200C-\\u200D \\u2070-\\u218F \\u2C00-\\u2FEF"
+ "\\u3001-\\uD7FF \\uF900-\\uF9FF \\U00010000-\\U000DFFFF]");
+ "\\u3001-\\uD7FF \\uF900-\\U000EFFFF]");
UnicodeSet nameChar = new UnicodeSet("[\\- \\. 0-9 \\u00B7 \\u0300-\\u036F]")
UnicodeSet nameChar = new UnicodeSet("[\\- \\. 0-9 \\u00B7 "
+ "\\u0300-\\u036F \\u203F-\\u2040]")
.addAll(nameStartChar);
UnicodeSet nameAll = new UnicodeSet(nameChar).addAll(nameStartChar);
showSet("NameStartChar", nameStartChar);
showDiffs("NameChar", nameChar, "NameStartChar", nameStartChar);
UnicodeSet ID_Start = new UnicodeSet("[:ID_Start:]");
UnicodeSet ID_Continue = new UnicodeSet("[:ID_Continue:]").removeAll(format);
UnicodeSet ID_All = new UnicodeSet(ID_Start).addAll(ID_Continue);
showDiffs("ID_All", ID_All, "nameAll", nameAll);
showDiffs("ID_Start", ID_Start, "nameStartChar", nameStartChar);
UnicodeSet defaultIgnorable = UnifiedBinaryProperty.make(DERIVED | DefaultIgnorable).getSet();
UnicodeSet whitespace = UnifiedBinaryProperty.make(BINARY_PROPERTIES | White_space).getSet();
@ -49,7 +62,6 @@ public class QuickTest implements UCD_Types {
UnicodeSet notNFKC = new UnicodeSet();
UnicodeSet privateUse = new UnicodeSet();
UnicodeSet noncharacter = new UnicodeSet();
UnicodeSet format = new UnicodeSet("[:Cf:]");
for (int i = 0; i <= 0x10FFFF; ++i) {
if (!Default.ucd.isAllocated(i)) continue;

View File

@ -48,14 +48,14 @@
# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
# The following rules handle those cases.
0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE
# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
# This matches the behavior of the canonically equivalent I-dot_above
0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.

View File

@ -4,8 +4,7 @@
# It contains additional information about the casing of Unicode characters.
# (For compatibility, the UnicodeData.txt file only contains case mappings for
# characters where they are 1-1, and does not have locale-specific mappings.)
# For more information, see
# UTR #21 Case Mappings, at http://www.unicode.org/unicode/reports/tr21/
# For more information, see the discussion of Case Mappings in the Unicode Standard.
#
# ================================================================================
# Format
@ -31,10 +30,10 @@
# <ISO_3166_code> := 2-letter ISO country code,
# <ISO_639_code> := 2-letter ISO language code
#
# A context is one of the following, as defined in UAX #21: Case Mappings:
# Final_Sigma, After_Soft_Dotted, More_Above, Before_Dot
# A context is one of the following, as defined in the Unicode Standard:
# Final_Sigma, After_Soft_Dotted, More_Above, Before_Dot, Not_Before_Dot, After_I
#
# Parsers of this file must be prepared to deal future additions to this format:
# Parsers of this file must be prepared to deal with future additions to this format:
# * Additional contexts
# * Additional fields
# ================================================================================

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
* $Date: 2002/10/05 01:28:58 $
* $Revision: 1.19 $
* $Date: 2003/02/25 23:38:22 $
* $Revision: 1.20 $
*
*******************************************************************************
*/
@ -35,7 +35,7 @@ public final class UCD implements UCD_Types {
/**
* Used for the default version.
*/
public static final String latestVersion = "3.2.1";
public static final String latestVersion = "4.0.0";
/**
* Create singleton instance for default (latest) version
@ -79,17 +79,19 @@ public final class UCD implements UCD_Types {
*/
public boolean isAllocated(int codePoint) {
if (getCategory(codePoint) != Cn) return true;
if (major >= 2 && codePoint >= 0xF0000 && codePoint <= 0x10FFFD) return true;
if (compositeVersion >= 0x20000 && codePoint >= 0xF0000 && codePoint <= 0x10FFFD) return true;
if (isNoncharacter(codePoint)) return true;
return false;
}
public boolean isNoncharacter(int codePoint) {
if ((codePoint & 0xFFFE) == 0xFFFE) {
if (major < 2 && codePoint > 0xFFFF) return false;
if (compositeVersion < 0x20000 && codePoint > 0xFFFF) return false;
// major < 2
return true;
}
if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF && major >= 3 && minor >= 1) return true;
if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF && compositeVersion >= 0x30100) return true;
// major >= 3 && minor >= 1
return false;
}
@ -239,8 +241,9 @@ public final class UCD implements UCD_Types {
public byte getModCat(int cp, int collapseBits) {
byte cat = getCategory(cp);
if (cat == UNASSIGNED && isNoncharacter(cp)) cat = FAKENC;
if (((1<<cat) & collapseBits) != 0) {
if (cat == UNASSIGNED && isNoncharacter(cp)) {
cat = FAKENC;
} else if (((1<<cat) & collapseBits) != 0) {
switch (cat) {
case UNASSIGNED: cat = FAKE_OTHER; break;
case FAKENC: cat = FAKE_OTHER; break;
@ -281,7 +284,17 @@ public final class UCD implements UCD_Types {
case CURRENCY_SYMBOL: cat = FAKE_SYMBOL; break;
case MODIFIER_SYMBOL: cat = FAKE_SYMBOL; break;
case OTHER_SYMBOL: cat = FAKE_SYMBOL; break;
}
if (collapseBits == -1) {
switch (cat) {
case FAKE_MARK:
case FAKE_NUMBER:
case FAKE_SEPERATOR:
case FAKE_PUNCTUATION:
case FAKE_SYMBOL:
cat = FAKE_LETTER;
break;
}
}
}
return cat;
@ -832,7 +845,7 @@ public final class UCD implements UCD_Types {
return style == SHORT ? UCD_Names.SHORT_BP[bit] : UCD_Names.BP[bit];
}
public static int mapToRepresentative(int ch, boolean old) {
public static int mapToRepresentative(int ch, boolean lessThan20105) {
if (ch <= 0xFFFD) {
//if (ch <= 0x2800) return ch;
//if (ch <= 0x28FF) return 0x2800; // braille
@ -850,7 +863,7 @@ public final class UCD implements UCD_Types {
if (ch <= 0xDFFF) return 0xDC00;
if (ch <= 0xE000) return ch; // Private Use
if (ch <= 0xF8FF) return 0xE000;
if (old) {
if (lessThan20105) {
if (ch <= 0xF900) return ch; // CJK Compatibility Ideograp
if (ch <= 0xFA2D) return 0xF900;
}
@ -870,37 +883,43 @@ public final class UCD implements UCD_Types {
return ch;
}
public boolean isIdentifierStart(int cp, boolean extended) {
public boolean isIdentifierStart(int cp) {
/*
if (extended) {
if (cp == 0x0E33 || cp == 0x0EB3 || cp == 0xFF9E || cp == 0xFF9F) return false;
if (cp == 0x037A || cp >= 0xFC5E && cp <= 0xFC63 || cp == 0xFDFA || cp == 0xFDFB) return false;
if (cp >= 0xFE70 && cp <= 0xFE7E && (cp & 1) == 0) return false;
}
*/
byte cat = getCategory(cp);
if (cat == Lu || cat == Ll || cat == Lt || cat == Lm || cat == Lo || cat == Nl) return true;
if (getBinaryProperty(cp, ID_Start_Exceptions)) return true;
return false;
}
public boolean isIdentifierContinue_NO_Cf(int cp, boolean extended) {
if (isIdentifierStart(cp, extended)) return true;
public boolean isIdentifierContinue_NO_Cf(int cp) {
if (isIdentifierStart(cp)) return true;
/*
if (extended) {
if (cp == 0x00B7) return true;
if (cp == 0x0E33 || cp == 0x0EB3 || cp == 0xFF9E || cp == 0xFF9F) return true;
}
*/
byte cat = getCategory(cp);
if (cat == Mn || cat == Mc || cat == Nd || cat == Pc) return true;
if (getBinaryProperty(cp, ID_Start_Exceptions)) return true;
return false;
}
public boolean isIdentifier(String s, boolean extended) {
public boolean isIdentifier(String s) {
if (s.length() == 0) return false; // at least one!
int cp;
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
cp = UTF32.char32At(s, i);
if (i == 0) {
if (!isIdentifierStart(cp, extended)) return false;
if (!isIdentifierStart(cp)) return false;
} else {
if (!isIdentifierContinue_NO_Cf(cp, extended)) return false;
if (!isIdentifierContinue_NO_Cf(cp)) return false;
}
}
return true;
@ -940,9 +959,10 @@ to guarantee identifier closure.
private String file;
private long date = -1;
private byte format = -1;
private byte major = -1;
private byte minor = -1;
private byte update = -1;
//private byte major = -1;
//private byte minor = -1;
//private byte update = -1;
private int compositeVersion = -1;
private int size = -1;
// cache last UData
@ -971,7 +991,7 @@ to guarantee identifier closure.
if (codePoint >= 0x2800 && codePoint <= 0x28FF) return true;
if (codePoint >= 0x2F800 && codePoint <= 0x2FA1D) return true;
int rangeStart = mapToRepresentative(codePoint, major < 2);
int rangeStart = mapToRepresentative(codePoint, compositeVersion < 0x020105);
switch (rangeStart) {
default:
return getRaw(codePoint) == null;
@ -999,6 +1019,11 @@ to guarantee identifier closure.
// access data for codepoint
UData get(int codePoint, boolean fixStrings) {
/*if (codePoint == 0xF901) {
System.out.println(version + ", " + Integer.toString(compositeVersion, 16));
System.out.println("debug: ");
}
*/
if (codePoint < 0 || codePoint > 0x10FFFF) {
throw new IllegalArgumentException("Illegal Code Point: " + Utility.hex(codePoint));
}
@ -1024,11 +1049,11 @@ to guarantee identifier closure.
// do range stuff
String constructedName = null;
int rangeStart = mapToRepresentative(codePoint, major < 2);
int rangeStart = mapToRepresentative(codePoint, compositeVersion < 0x020105);
boolean isHangul = false;
switch (rangeStart) {
case 0xF900:
if (major < 2) {
if (compositeVersion < 0x020105) {
if (fixStrings) constructedName = "CJK COMPATIBILITY IDEOGRAPH-" + Utility.hex(codePoint, 4);
break;
}
@ -1198,9 +1223,11 @@ to guarantee identifier closure.
}
static boolean isLeadingJamoComposition(int char1) {
return (LBase <= char1 && char1 < LLimit
|| SBase <= char1 && char1 < SLimit
&& ((char1 - SBase) % TCount) == 0);
return isLeadingJamo(char1) || isLV(char1);
}
/**
 * Tests whether a code point lies in the range [SBase, SLimit) at an
 * offset from SBase that is an exact multiple of TCount.
 * Such code points are reported as LV by getHangulSyllableType.
 *
 * @param char1 the code point to test
 * @return true if the code point satisfies both conditions
 */
static boolean isLV(int char1) {
    // Outside the [SBase, SLimit) block: cannot qualify.
    if (char1 < SBase || char1 >= SLimit) {
        return false;
    }
    int offset = char1 - SBase;
    return offset % TCount == 0;
}
static boolean isVowelJamo(int cp) {
@ -1218,6 +1245,24 @@ to guarantee identifier closure.
/**
 * Tests whether a code point falls in either the [VBase, VLimit) range
 * or the [TBase, TLimit) range.
 *
 * @param cp the code point to test
 * @return true if cp is in one of the two ranges
 */
static boolean isNonLeadJamo(int cp) {
    boolean inVRange = VBase <= cp && cp < VLimit;
    boolean inTRange = TBase <= cp && cp < TLimit;
    return inVRange || inTRange;
}
/**
 * Classifies a code point into one of the Hangul syllable type codes
 * L, V, T, LV, LVT or NA.
 * The checks run in a fixed order and the first match wins; in
 * particular isLV is consulted before isHangulSyllable, so a code point
 * that satisfies both is reported as LV.
 *
 * @param cp the code point to classify
 * @return the matching type code, or NA when no check matches
 */
static byte getHangulSyllableType(int cp) {
    if (isLeadingJamo(cp)) {
        return L;
    }
    if (isVowelJamo(cp)) {
        return V;
    }
    if (isTrailingJamo(cp)) {
        return T;
    }
    if (isLV(cp)) {
        return LV;
    }
    if (isHangulSyllable(cp)) {
        return LVT;
    }
    return NA;
}
/**
 * Looks up the name of a Hangul syllable type code.
 *
 * @param index the type code, used directly as an array index
 * @param style LONG selects UCD_Names.LONG_HANGUL_SYLLABLE_TYPE; any
 *              other value selects UCD_Names.HANGUL_SYLLABLE_TYPE
 * @return the entry at position index in the selected table
 */
static String getHangulSyllableTypeID_fromIndex(byte index, byte style) {
    String[] names = (style == LONG)
            ? UCD_Names.LONG_HANGUL_SYLLABLE_TYPE
            : UCD_Names.HANGUL_SYLLABLE_TYPE;
    return names[index];
}
/**
 * Returns the Hangul syllable type name for a code point.
 * Classifies the code point with getHangulSyllableType and converts the
 * resulting code with getHangulSyllableTypeID_fromIndex.
 *
 * @param char1 the code point to look up
 * @param style name style, passed through to the index-based lookup
 * @return the type name in the requested style
 */
static String getHangulSyllableTypeID(int char1, byte style) {
    byte typeCode = getHangulSyllableType(char1);
    return getHangulSyllableTypeID_fromIndex(typeCode, style);
}
private void fillFromFile(String version) {
try {
@ -1243,9 +1288,11 @@ to guarantee identifier closure.
128*1024));
// header
format = dataIn.readByte();
major = dataIn.readByte();
minor = dataIn.readByte();
update = dataIn.readByte();
byte major = dataIn.readByte();
byte minor = dataIn.readByte();
byte update = dataIn.readByte();
compositeVersion = (major << 16) | (minor << 8) | update;
String foundVersion = major + "." + minor + "." + update;
if (format != BINARY_FORMAT || !version.equals(foundVersion)) {
throw new ChainException("Illegal data file format for {0}: {1}, {2}",
@ -1262,7 +1309,7 @@ to guarantee identifier closure.
UData uData = new UData();
uData.readBytes(dataIn);
if (DEBUG && uData.codePoint == 0x2801) {
if (uData.codePoint == 0x0221) {
System.out.println("SPOT-CHECK: " + uData);
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
* $Date: 2002/10/05 01:28:58 $
* $Revision: 1.15 $
* $Date: 2003/02/25 23:38:22 $
* $Revision: 1.16 $
*
*******************************************************************************
*/
@ -53,6 +53,7 @@ final class UCD_Names implements UCD_Types {
"BidiMirrored (listing UnicodeData.txt, field 9: see UnicodeData.html)",
"Script",
"Age (from a comparison of UCD versions 1.1 [minus Hangul], 2.0, 2.1, 3.0, 3.1)",
"Hangul Syllable Type\r\n# All codepoints not explicitly listed here have the value NA",
"Derived"
};
@ -69,6 +70,7 @@ final class UCD_Names implements UCD_Types {
"",
"Script",
"Age",
"Hangul_Syllable_Type",
""
};
@ -85,6 +87,7 @@ final class UCD_Names implements UCD_Types {
"",
"sc",
"ag",
"hst",
"",
};
@ -121,6 +124,7 @@ final class UCD_Names implements UCD_Types {
"Deprecated",
"Soft_Dotted",
"Logical_Order_Exception",
"ID_Start_Exceptions",
};
static final String[] SHORT_BP = {
@ -155,6 +159,7 @@ final class UCD_Names implements UCD_Types {
"Dep",
"SD",
"LOE",
"IDSX",
};
/*
@ -273,6 +278,14 @@ final class UCD_Names implements UCD_Types {
"HANUNOO",
"BUHID",
"TAGBANWA",
"LIMBU",
"TAI_LE",
"LINEAR_B",
"UGARITIC",
"SHAVIAN",
"OSMANYA",
"CYPRIOT",
};
public static final String[] ABB_SCRIPT = {
@ -322,6 +335,13 @@ final class UCD_Names implements UCD_Types {
"Hano",
"Buhd",
"Tagb",
"LIMBU",
"TAI_LE",
"LINEAR_B",
"UGARITIC",
"SHAVIAN",
"OSMANYA",
"CYPRIOT",
};
@ -330,7 +350,8 @@ final class UCD_Names implements UCD_Types {
"UNSPECIFIED",
"1.1",
"2.0", "2.1",
"3.0", "3.1"
"3.0", "3.1", "3.2",
"4.0"
};
@ -573,6 +594,24 @@ final class UCD_Names implements UCD_Types {
public static byte ON = Utility.lookup("ON", BC, true);
// Short names for the Hangul syllable type values.
// The array is indexed by the UCD_Types constants
// NA = 0, L = 1, V = 2, T = 3, LV = 4, LVT = 5, so the entry order
// must stay in step with those constants.
public static String[] HANGUL_SYLLABLE_TYPE = {
"NA",
"L",
"V",
"T",
"LV",
"LVT",
};
// Long names for the Hangul syllable type values.
// Entries are in the same order as HANGUL_SYLLABLE_TYPE and are indexed
// by the same UCD_Types constants (NA = 0 ... LVT = 5).
public static String[] LONG_HANGUL_SYLLABLE_TYPE = {
"Not_Applicable",
"Leading_Jamo",
"Vowel_Jamo",
"Trailing_Jamo",
"LV_Syllable",
"LVT_Syllable",
};
public static String[] JOINING_TYPE = {
"C",
"D",
@ -643,6 +682,9 @@ final class UCD_Names implements UCD_Types {
"YUDH",
"YUDH_HE",
"ZAIN",
"ZHAIN",
"KHAPH",
"FE",
};
public static String[] OLD_JOINING_GROUP = {
@ -697,6 +739,9 @@ final class UCD_Names implements UCD_Types {
"YUDH",
"YUDH_HE",
"ZAIN",
"ZHAIN",
"KHAPH",
"FE",
};

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
* $Date: 2002/10/05 01:28:58 $
* $Revision: 1.16 $
* $Date: 2003/02/25 23:38:22 $
* $Revision: 1.17 $
*
*******************************************************************************
*/
@ -15,7 +15,7 @@ package com.ibm.text.UCD;
public interface UCD_Types {
public static final int dVersion = 2; // change to fix the generated file D version. If less than zero, no "d"
public static final int dVersion = 10; // change to fix the generated file D version. If less than zero, no "d"
public static final String BASE_DIR = "C:\\DATA\\";
public static final String UCD_DIR = BASE_DIR + "UCD\\";
@ -41,7 +41,7 @@ public interface UCD_Types {
NOT_DERIVED = 1,
DERIVED_CORE = 2,
DERIVED_NORMALIZATION = 4,
DERIVED_ALL = 6,
DERIVED_ALL = 0x6,
ALL = (byte)-1;
static final byte
@ -86,9 +86,10 @@ public interface UCD_Types {
BINARY_PROPERTIES = 0x900,
SCRIPT = 0xA00,
AGE = 0xB00,
DERIVED = 0xC00,
NEXT_ENUM = 0x100,
LIMIT_ENUM = DERIVED + 0x100;
HANGUL_SYLLABLE_TYPE = 0xC00,
DERIVED = 0xD00,
LIMIT_ENUM = DERIVED + 0x100,
NEXT_ENUM = 0x100;
public static final int LIMIT_COMBINING_CLASS = 256;
@ -207,7 +208,8 @@ public interface UCD_Types {
Deprecated = 28,
Soft_Dotted = 29,
Logical_Order_Exception = 30,
LIMIT_BINARY_PROPERTIES = 31;
ID_Start_Exceptions = 31,
LIMIT_BINARY_PROPERTIES = 32;
/*
static final int
@ -309,6 +311,9 @@ public interface UCD_Types {
// numericType
static final byte NUMERIC_NONE = 0, NUMERIC = 1, DIGIT = 2, DECIMAL = 3,
LIMIT_NUMERIC_TYPE = 4;
static final byte NA = 0, L = 1, V = 2, T = 3, LV = 4, LVT = 5,
HANGUL_SYLLABLE_TYPE_LIMIT = 6;
public static final byte // SCRIPT CODE
COMMON_SCRIPT = 0,
@ -357,7 +362,14 @@ public interface UCD_Types {
HANUNOO_SCRIPT = 43,
BUHID_SCRIPT = 44,
TAGBANWA_SCRIPT = 45,
LIMIT_SCRIPT = 46;
LIMBU = 46,
TAI_LE = 47,
LINEAR_B = 48,
UGARITIC = 49,
SHAVIAN = 50,
OSMANYA = 51,
CYPRIOT = 52,
LIMIT_SCRIPT = 53;
static final int
UNKNOWN = 0,
@ -366,7 +378,9 @@ public interface UCD_Types {
AGE21 = 3,
AGE30 = 4,
AGE31 = 5,
LIMIT_AGE = 6;
AGE32 = 6,
AGE40 = 7,
LIMIT_AGE = 8;
@ -431,7 +445,11 @@ public static byte
YUDH = 48,
YUDH_HE = 49,
ZAIN = 50,
LIMIT_JOINING_GROUP = 51;
ZHAIN = 51,
KHAPH = 52,
FE = 53,
LIMIT_JOINING_GROUP = 54;
static final byte NFD = 0, NFC = 1, NFKD = 2, NFKC = 3;
public static final int
@ -500,7 +518,9 @@ public static byte
NFC_Skippable = 42,
NFKD_Skippable = 43,
NFKC_Skippable = 44,
Case_Sensitive = 45,
DERIVED_PROPERTY_LIMIT = 41;
DERIVED_PROPERTY_LIMIT = 46;
}

View File

@ -35,6 +35,8 @@ public abstract class UnicodeProperty implements UCD_Types {
/**
 * Returns the current value of the isStandard flag.
 *
 * @return the stored flag
 */
public boolean isStandard() {
    return this.isStandard;
}
/**
 * Sets the isStandard flag.
 *
 * @param in the new flag value
 */
public void setStandard(boolean in) {
    this.isStandard = in;
}
/**
 * Reports whether this property represents a default value.
 * This base implementation always answers false.
 *
 * @return false
 */
public boolean isDefaultValue() {
    return false;
}
/**
* What type is it? DERIVED..
*/

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java,v $
* $Date: 2002/10/05 01:28:57 $
* $Revision: 1.10 $
* $Date: 2003/02/25 23:38:22 $
* $Revision: 1.11 $
*
*******************************************************************************
*/
@ -122,7 +122,11 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
propValue = propMask & 0xFF;
//System.out.println("A: " + getValueType());
if (majorProp <= (JOINING_GROUP>>8) || majorProp == SCRIPT>>8) setValueType(FLATTENED_BINARY);
if (majorProp <= (JOINING_GROUP>>8)
|| majorProp == (SCRIPT>>8)
|| majorProp==(HANGUL_SYLLABLE_TYPE>>8)) {
setValueType(FLATTENED_BINARY);
}
//System.out.println("B: " + getValueType());
header = UCD_Names.UNIFIED_PROPERTY_HEADERS[majorProp];
@ -217,6 +221,8 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
return true;
case AGE>>8: if (propValue >= LIMIT_AGE) break;
return true;
case HANGUL_SYLLABLE_TYPE>>8: if (propValue >= HANGUL_SYLLABLE_TYPE_LIMIT) break;
return true;
/*
case DERIVED>>8:
UnicodeProperty up = DerivedProperty.make(propValue, ucd);
@ -227,6 +233,28 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
return false;
}
/**
 * Reports whether this property/value pair is one of the combinations
 * hard-coded below as a default value.
 * The pair is encoded as (majorProp << 8) | propValue and compared
 * against the listed constants.
 *
 * @return true for a listed default combination, false otherwise
 */
public boolean isDefaultValue() {
    final int combined = (majorProp << 8) | propValue;
    switch (combined) {
        // Currently disabled entries, kept for reference:
        //   CATEGORY | Cn
        //   COMBINING_CLASS | 0
        //   BIDI_CLASS | BIDI_L
        //   EAST_ASIAN_WIDTH | EAN
        //   LINE_BREAK | LB_XX
        case DECOMPOSITION_TYPE | NONE:
        case NUMERIC_TYPE | NUMERIC_NONE:
        case JOINING_TYPE | JT_U:
        case JOINING_GROUP | NO_SHAPING:
        case BINARY_PROPERTIES | Non_break:
        case BINARY_PROPERTIES | CaseFoldTurkishI:
        case SCRIPT | COMMON_SCRIPT:
        case HANGUL_SYLLABLE_TYPE | NA:
            return true;
        default:
            return false;
    }
}
public boolean hasValue(int cp) {
try {
switch (majorProp) {
@ -242,6 +270,8 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
case BINARY_PROPERTIES>>8: return ucd.getBinaryProperty(cp, propValue);
case SCRIPT>>8: return ucd.getScript(cp) == propValue;
case AGE>>8: return ucd.getAge(cp) == propValue;
case HANGUL_SYLLABLE_TYPE>>8: return ucd.getHangulSyllableType(cp) == propValue;
// return true;
/*
case DERIVED>>8:
UnicodeProperty up = DerivedProperty.make(propValue, ucd);
@ -307,6 +337,7 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
case BINARY_PROPERTIES>>8: return ucd.getBinaryPropertiesID_fromIndex((byte)propValue, style);
case SCRIPT>>8: return ucd.getScriptID_fromIndex((byte)propValue, style);
case AGE>>8: return ucd.getAgeID_fromIndex((byte)propValue);
case HANGUL_SYLLABLE_TYPE>>8: return ucd.getHangulSyllableTypeID_fromIndex((byte)propValue, style);
/*
case DERIVED>>8:
UnicodeProperty up = DerivedProperty.make(propValue, ucd);
@ -337,6 +368,7 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
case BINARY_PROPERTIES>>8: return LONG;
case SCRIPT>>8: return LONG;
case AGE>>8: return LONG;
case HANGUL_SYLLABLE_TYPE>>8: return SHORT;
}
} catch (RuntimeException e) {
throw new ChainException("Illegal property Number {0}, {1}", new Object[]{

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedProperty.java,v $
* $Date: 2002/10/05 01:28:57 $
* $Revision: 1.2 $
* $Date: 2003/02/25 23:38:22 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -142,6 +142,7 @@ public final class UnifiedProperty extends UnicodeProperty {
case JOINING_GROUP>>8:
case SCRIPT>>8:
case AGE>>8:
case HANGUL_SYLLABLE_TYPE>>8:
return true;
/*
case DERIVED>>8:
@ -181,7 +182,9 @@ public final class UnifiedProperty extends UnicodeProperty {
case JOINING_GROUP>>8: return ucd.getJoiningGroupID_fromIndex(ucd.getJoiningGroup(cp), style);
case SCRIPT>>8: return ucd.getScriptID_fromIndex(ucd.getScript(cp), style);
case AGE>>8: return ucd.getAgeID_fromIndex(ucd.getAge(cp), style);
case HANGUL_SYLLABLE_TYPE>>8:
return ucd.getHangulSyllableTypeID(cp,style);
default: throw new IllegalArgumentException("Internal Error");
}
}
}
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
* $Date: 2002/08/09 23:56:24 $
* $Revision: 1.19 $
* $Date: 2003/02/25 23:38:22 $
* $Revision: 1.20 $
*
*******************************************************************************
*/
@ -1806,8 +1806,11 @@ E0020-E007F; [TAGGING CHARACTERS]
String x_cp = 'x' + UTF32.valueOf32(cp);
String nfx_x_cp = normalize(x_cp, j);
plain = Default.ucd.isIdentifier(x_cp, true);
norm = Default.ucd.isIdentifier(nfx_x_cp, true);
if (true) {
throw new RuntimeException("Fix plain & norm, 4 instances!!");
}
// plain = Default.ucd.isIdentifier(x_cp, true);
//norm = Default.ucd.isIdentifier(nfx_x_cp, true);
if (plain & !norm) {
Utility.fixDot();
System.out.println("*Not Identifier: " + Default.ucd.getCodeAndName(cp));
@ -1822,8 +1825,8 @@ E0020-E007F; [TAGGING CHARACTERS]
}
String nfx_cp = normalize(UTF32.valueOf32(cp), j);
plain = Default.ucd.isIdentifierStart(cp, true);
norm = Default.ucd.isIdentifier(nfx_cp, true);
// plain = Default.ucd.isIdentifierStart(cp, true);
// norm = Default.ucd.isIdentifier(nfx_cp, true);
if (plain & !norm) {
Utility.fixDot();
System.out.println(" Changes Category: " + Default.ucd.getCodeAndName(cp));

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/FileLineIterator.java,v $
* $Date: 2002/10/01 01:12:10 $
* $Revision: 1.1 $
* $Date: 2003/02/25 23:38:22 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
@ -43,18 +43,18 @@ public class FileLineIterator {
public int counter = 0;
private BufferedReader br = null;
private boolean isUTF8 = false;
private Utility.Encoding encoding = Utility.UTF8;
/**
* Open the file for reading. If useGenDir is set, use the normal generation directory
*/
public void open(String filename, boolean isUTF8) throws IOException {
public void open(String filename, Utility.Encoding encoding) throws IOException {
if (showFilename) {
Utility.fixDot();
System.out.println("Reading File: " + new File(filename).getCanonicalPath());
}
br = Utility.openReadFile(filename, isUTF8);
this.isUTF8 = isUTF8;
br = Utility.openReadFile(filename, encoding);
this.encoding = encoding;
}
/**
@ -68,7 +68,7 @@ public class FileLineIterator {
if (cleanedLine == null) return null;
// drop BOM
if (isUTF8 && counter == 0 && cleanedLine.length() > 0 && cleanedLine.charAt(0) == 0xFEFF) {
if (encoding == Utility.UTF8 && counter == 0 && cleanedLine.length() > 0 && cleanedLine.charAt(0) == 0xFEFF) {
cleanedLine = cleanedLine.substring(1);
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
* $Date: 2002/10/05 01:28:56 $
* $Revision: 1.26 $
* $Date: 2003/02/25 23:38:22 $
* $Revision: 1.27 $
*
*******************************************************************************
*/
@ -144,7 +144,10 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
boolean haveFirstCased = true;
for (int i = 0; i < source.length(); ++i) {
char c = source.charAt(i);
if (c == ' ' || c == '-') c = '_';
if (c == ' ' || c == '-' || c == '_') {
c = '_';
haveFirstCased = true;
}
int cat = Character.getType(c);
if (lastCat == Character.LOWERCASE_LETTER && cat == Character.UPPERCASE_LETTER) {
result.append('_');
@ -616,6 +619,7 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
private static final String[] searchPath = {
"EXTRAS",
"4.0.0",
"3.2.0",
"3.1.1",
"3.1.0",
@ -654,8 +658,13 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
UTF8_UNIX = Encoding.add("UTF8_UNIX"),
UTF8_WINDOWS = Encoding.add("UTF8_WINDOWS"),
UTF8 = Encoding.add("UTF8"), // for read-only
LATIN1 = Encoding.add("LATIN1"), // for read-only
//UTF8 = Encoding.add("UTF8"), // for read-only
//LATIN1 = Encoding.add("LATIN1"), // for read-only
// read-only (platform doesn't matter, since it is only line-end)
UTF8 = UTF8_WINDOWS,
LATIN1 = LATIN1_WINDOWS,
FIRST = LATIN1_UNIX;
@ -700,6 +709,24 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
public boolean filter(Object current); // true is keep
}
/**
 * Prints a map whose values are collections.
 * Each entry is written as the key, then itemSeparator, then the
 * value collection printed via print(pw, value, subseparator).
 * Consecutive entries are separated by mainSeparator; nothing is
 * written before the first entry or after the last one.
 *
 * @param pw            destination writer
 * @param c             map to print; every value must be a Collection
 * @param mainSeparator text written between entries
 * @param itemSeparator text written between a key and its collection
 * @param subseparator  separator handed to print for the collection items
 * @throws ClassCastException if a map value is not a Collection
 */
public static void printMapOfCollection(PrintWriter pw, Map c, String mainSeparator, String itemSeparator, String subseparator) {
    boolean first = true;
    // Walk the entries directly rather than looking each key up again.
    for (Iterator it = c.entrySet().iterator(); it.hasNext();) {
        Map.Entry entry = (Map.Entry) it.next();
        // Cast before emitting anything for this entry, so a bad value
        // fails before partial output is written for it.
        Collection value = (Collection) entry.getValue();
        if (first) {
            first = false;
        } else {
            pw.print(mainSeparator);
        }
        pw.print(entry.getKey());
        pw.print(itemSeparator);
        print(pw, value, subseparator);
    }
}
public static void print(PrintWriter pw, Collection c, String separator, Breaker b) {
Iterator it = c.iterator();
boolean first = true;
@ -745,7 +772,12 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
public static BufferedReader openReadFile(String filename, Encoding encoding) throws FileNotFoundException, UnsupportedEncodingException {
FileInputStream fis = new FileInputStream(filename);
InputStreamReader isr = (encoding == UTF8_UNIX || encoding == UTF8_WINDOWS) ? new InputStreamReader(fis, "UTF8") : new InputStreamReader(fis);
InputStreamReader isr;
if (encoding == UTF8_UNIX || encoding == UTF8_WINDOWS) {
isr = new InputStreamReader(fis, "UTF8");
} else {
isr = new InputStreamReader(fis);
}
BufferedReader br = new BufferedReader(isr, 32*1024);
return br;
}
@ -817,10 +849,10 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
}
}
public static void renameIdentical(String file1, String file2) throws IOException {
public static boolean renameIdentical(String file1, String file2, String batFile) throws IOException {
if (file1 == null) {
System.out.println("Null file");
return;
return false;
}
boolean identical = false;
@ -845,25 +877,34 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
br2.close();
}
if (identical) {
File foo = new File(file2);
File newName = new File(foo.getParent(), "UNCHANGED-" + foo.getName());
if (newName.exists()) {
for (int i = 1; newName.exists(); ++i) {
newName = new File(foo.getParent(), "UNCHANGED" + i + "-" + foo.getName());
}
}
System.out.println("IDENTICAL TO PREVIOUS, RENAMING : " + foo);
System.out.println("TO : " + newName);
boolean renameResult = foo.renameTo(newName);
if (!renameResult) System.out.println("Couldn't rename!");
renameIdentical(file2);
if (batFile != null) renameIdentical(batFile);
return true;
} else {
if (line1 == null) line1 = "<end of file>";
if (line2 == null) line2 = "<end of file>";
System.out.println("Found difference in : " + file1 + ", " + file2);
int diff = compare(line1, line2);
System.out.println(" Line1: '" + line1.substring(0,diff) + "', '" + line1.substring(diff));
System.out.println(" Line2: '" + line2.substring(0,diff) + "', '" + line2.substring(diff));
return false;
}
}
/**
 * Renames a file so that its name carries an "UNCHANGED" prefix.
 * The first candidate name is "UNCHANGED-" plus the original name; if
 * that already exists, "UNCHANGED1-", "UNCHANGED2-", ... are tried until
 * an unused name is found. Progress and a failure notice are written to
 * System.out; a failed rename is reported but not thrown.
 *
 * @param file2 path of the file to rename
 */
static void renameIdentical(String file2) {
    File source = new File(file2);
    String parentDir = source.getParent();
    String baseName = source.getName();

    File target = new File(parentDir, "UNCHANGED-" + baseName);
    int suffix = 1;
    // Keep trying numbered prefixes until the name is free.
    while (target.exists()) {
        target = new File(parentDir, "UNCHANGED" + suffix + "-" + baseName);
        ++suffix;
    }

    System.out.println("IDENTICAL TO PREVIOUS, RENAMING : " + source);
    System.out.println("TO : " + target);
    if (!source.renameTo(target)) {
        System.out.println("Couldn't rename!");
    }
}
static String getLineWithoutFluff(BufferedReader br1, boolean first) throws IOException {
while (true) {
String line1 = br1.readLine();