updated for 4.0
X-SVN-Rev: 11161
This commit is contained in:
parent
c31688a777
commit
07a8be151c
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $
|
||||
* $Date: 2002/10/05 01:28:58 $
|
||||
* $Revision: 1.9 $
|
||||
* $Date: 2003/02/25 23:38:23 $
|
||||
* $Revision: 1.10 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -718,6 +718,8 @@ public final class ConvertUCD implements UCD_Types {
|
||||
|
||||
static Set jtSet = new TreeSet();
|
||||
static Set jgSet = new TreeSet();
|
||||
|
||||
static final boolean SHOW_SAMPLE = false;
|
||||
|
||||
/** Adds the character data. Signals duplicates with an exception
|
||||
*/
|
||||
@ -725,6 +727,11 @@ public final class ConvertUCD implements UCD_Types {
|
||||
//if (cp < 10) System.out.println("A: " + Utility.hex(cp) + ", " + key + ", " + Utility.quoteJavaString(value));
|
||||
UData charEntry = getEntry(cp);
|
||||
//if (cp < 10) System.out.println(" " + charEntry);
|
||||
|
||||
if (SHOW_SAMPLE && cp == 0x221) {
|
||||
System.out.println("Sample: " + cp + ", " + key + ", " + value);
|
||||
System.out.println(charEntry);
|
||||
}
|
||||
|
||||
if (key.equals("bm")) {
|
||||
if (value.equals("Y")) charEntry.binaryProperties |= 1;
|
||||
@ -780,6 +787,11 @@ public final class ConvertUCD implements UCD_Types {
|
||||
} else {
|
||||
setField(charEntry, key, value);
|
||||
}
|
||||
if (SHOW_SAMPLE && cp == 0x221) {
|
||||
System.out.println("Sample Result:");
|
||||
System.out.println(charEntry);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static public void setField(UData uData, String fieldName, String fieldValue) {
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
|
||||
* $Date: 2002/08/04 21:38:45 $
|
||||
* $Revision: 1.17 $
|
||||
* $Date: 2003/02/25 23:38:23 $
|
||||
* $Revision: 1.18 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -14,11 +14,20 @@
|
||||
package com.ibm.text.UCD;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import java.util.*;
|
||||
import java.io.PrintWriter;
|
||||
|
||||
public final class DerivedProperty implements UCD_Types {
|
||||
|
||||
UCD ucdData;
|
||||
Normalizer nfc;
|
||||
Normalizer nfd;
|
||||
Normalizer nfkc;
|
||||
Normalizer nfkd;
|
||||
Normalizer[] nf = new Normalizer[4];
|
||||
UnicodeSet XID_Start_Set = new UnicodeSet();
|
||||
UnicodeSet XID_Continue_Set = new UnicodeSet();
|
||||
|
||||
// ADD CONSTANT to UCD_TYPES
|
||||
|
||||
@ -33,9 +42,6 @@ public final class DerivedProperty implements UCD_Types {
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
private DerivedProperty(UCD ucd) {
|
||||
ucdData = ucd;
|
||||
}
|
||||
|
||||
static Map cache = new HashMap();
|
||||
static UCD lastUCD = null;
|
||||
@ -101,7 +107,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
Normalizer nfx;
|
||||
ExDProp(int i) {
|
||||
type = DERIVED_NORMALIZATION;
|
||||
nfx = Default.nf[i];
|
||||
nfx = nf[i];
|
||||
name = "Expands_On_" + nfx.getName();
|
||||
shortName = "XO_" + nfx.getName();
|
||||
header = "# Derived Property: " + name
|
||||
@ -125,7 +131,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
NF_UnsafeStartProp(int i) {
|
||||
isStandard = false;
|
||||
type = DERIVED_NORMALIZATION;
|
||||
nfx = Default.nf[i];
|
||||
nfx = nf[i];
|
||||
name = nfx.getName() + "_UnsafeStart";
|
||||
shortName = nfx.getName() + "_SS";
|
||||
header = "# Derived Property: " + name
|
||||
@ -144,6 +150,35 @@ public final class DerivedProperty implements UCD_Types {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
class HangulSyllableType extends UnicodeProperty {
|
||||
Normalizer nfx;
|
||||
//int prop;
|
||||
|
||||
HangulSyllableType(int i) {
|
||||
isStandard = false;
|
||||
type = DERIVED_NORMALIZATION;
|
||||
nfx = nf[i];
|
||||
name = nfx.getName() + "_UnsafeStart";
|
||||
shortName = nfx.getName() + "_SS";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated according to UAX #15."
|
||||
+ "\r\n# Characters that are cc==0, BUT which may interact with previous characters."
|
||||
;
|
||||
}
|
||||
public boolean hasValue(int cp) {
|
||||
if (ucdData.getCombiningClass(cp) != 0) return false;
|
||||
String norm = nfx.normalize(cp);
|
||||
int first = UTF16.charAt(norm, 0);
|
||||
if (ucdData.getCombiningClass(first) != 0) return true;
|
||||
if (nfx.isComposition()
|
||||
&& dprops[NFC_TrailingZero].hasValue(first)) return true; // 1,3 == composing
|
||||
return false;
|
||||
}
|
||||
};
|
||||
*/
|
||||
|
||||
|
||||
class NFC_Prop extends UnicodeProperty {
|
||||
BitSet bitset;
|
||||
@ -161,7 +196,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
case NFC_TrailingNonZero: bitsets[1] = bitset = new BitSet(); break;
|
||||
}
|
||||
filter = bitsets[1] != null;
|
||||
Default.nfc.getCompositionStatus(bitsets[0], bitsets[1], bitsets[2]);
|
||||
nfc.getCompositionStatus(bitsets[0], bitsets[1], bitsets[2]);
|
||||
|
||||
name = Names[i-NFC_Leading];
|
||||
shortName = SNames[i-NFC_Leading];
|
||||
@ -197,17 +232,17 @@ public final class DerivedProperty implements UCD_Types {
|
||||
isStandard = false;
|
||||
setValueType(NON_ENUMERATED);
|
||||
type = DERIVED_NORMALIZATION;
|
||||
nfx = Default.nf[i];
|
||||
nfx = nf[i];
|
||||
name = nfx.getName();
|
||||
String compName = "the character itself";
|
||||
|
||||
if (i == NFKC || i == NFD) {
|
||||
name += "-NFC";
|
||||
nfComp = Default.nfc;
|
||||
nfComp = nfc;
|
||||
compName = "NFC for the character";
|
||||
} else if (i == NFKD) {
|
||||
name += "-NFD";
|
||||
nfComp = Default.nfd;
|
||||
nfComp = nfd;
|
||||
compName = "NFD for the character";
|
||||
}
|
||||
header = "# Derived Property: " + name
|
||||
@ -273,7 +308,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
QuickDProp (int i) {
|
||||
setValueType((i == NFC || i == NFKC) ? ENUMERATED : BINARY);
|
||||
type = DERIVED_NORMALIZATION;
|
||||
nfx = Default.nf[i];
|
||||
nfx = nf[i];
|
||||
NO = nfx.getName() + "_NO";
|
||||
MAYBE = nfx.getName() + "_MAYBE";
|
||||
name = nfx.getName() + "_QuickCheck";
|
||||
@ -297,7 +332,14 @@ public final class DerivedProperty implements UCD_Types {
|
||||
public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
|
||||
};
|
||||
|
||||
{
|
||||
private DerivedProperty(UCD ucd) {
|
||||
ucdData = ucd;
|
||||
|
||||
nfd = nf[NFD] = new Normalizer(Normalizer.NFD, ucdData.getVersion());
|
||||
nfc = nf[NFC] = new Normalizer(Normalizer.NFC, ucdData.getVersion());
|
||||
nfkd = nf[NFKD] = new Normalizer(Normalizer.NFKD, ucdData.getVersion());
|
||||
nfkc = nf[NFKC] = new Normalizer(Normalizer.NFKC, ucdData.getVersion());
|
||||
|
||||
for (int i = ExpandsOnNFD; i <= ExpandsOnNFKC; ++i) {
|
||||
dprops[i] = new ExDProp(i-ExpandsOnNFD);
|
||||
}
|
||||
@ -321,10 +363,10 @@ public final class DerivedProperty implements UCD_Types {
|
||||
shortName = "IDS";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Characters that can start an identifier."
|
||||
+ "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl";
|
||||
+ "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl+ID_Start_Exceptions";
|
||||
}
|
||||
public boolean hasValue(int cp) {
|
||||
return ucdData.isIdentifierStart(cp, false);
|
||||
return ucdData.isIdentifierStart(cp);
|
||||
}
|
||||
};
|
||||
|
||||
@ -339,10 +381,65 @@ public final class DerivedProperty implements UCD_Types {
|
||||
+ "\r\n# NOTE: Cf characters should be filtered out.";
|
||||
}
|
||||
public boolean hasValue(int cp) {
|
||||
return ucdData.isIdentifierContinue_NO_Cf(cp, false);
|
||||
return ucdData.isIdentifierContinue_NO_Cf(cp);
|
||||
}
|
||||
};
|
||||
|
||||
// Derive XID_Start_Set / XID_Continue_Set: start from the plain identifier
// start/continue status of each assigned code point, then adjust it using the
// status of the code point's NFKD form.
StringBuffer tempBuf = new StringBuffer();

// special hack for middle dot
XID_Continue_Set.add(0x00B7);

// Inclusive upper bound so that the last code point, U+10FFFF, is examined too.
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
    // skip cases that can't matter
    if (!ucdData.isAssigned(cp)) continue;

    // find out normal status: 0 = neither, 1 = start, 2 = continue
    int status = 0;
    if (ucdData.isIdentifierStart(cp)) status = 1;
    else if (ucdData.isIdentifierContinue_NO_Cf(cp)) status = 2;

    if (status != 0 && !nfkd.isNormalized(cp)) {
        // now find out NFKD status
        // if it is <start><extend>*, then it is start
        // else if it is <extend>*, then it is extend
        // else it is nothing
        int status2 = 0;
        tempBuf.setLength(0);
        nfkd.normalize(UTF32.valueOf32(cp), tempBuf);

        // Step by the UTF-16 width of the code point just read from the buffer
        // (cp2), not by the width of the original code point (cp): the two can
        // differ, and stepping by the wrong width would skip or split characters.
        int cp2;
        for (int i = 0; i < tempBuf.length(); i += UTF32.count16(cp2)) {
            cp2 = UTF32.char32At(tempBuf, i);
            if (i == 0) {
                if (ucdData.isIdentifierStart(cp2)) status2 = 1;
                else if (ucdData.isIdentifierContinue_NO_Cf(cp2)) status2 = 2;
                else {
                    status2 = 0;
                    break;
                }
            } else if (!ucdData.isIdentifierContinue_NO_Cf(cp2) && cp2 != 0xB7) {
                // a trailing character that is neither a continue character
                // nor the middle dot disqualifies the whole decomposition
                status2 = 0;
                break;
            }
        }

        // Now see if the statuses are compatible.
        if (status != status2) {
            if (status2 == 0) status = 0;
            else if (status2 > status) status = status2;
        }
    }

    if (status == 1) XID_Start_Set.add(cp);
    if (status != 0) XID_Continue_Set.add(cp);
}
|
||||
|
||||
dprops[Mod_ID_Start] = new UnicodeProperty() {
|
||||
{
|
||||
type = DERIVED_CORE;
|
||||
@ -355,7 +452,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
+ "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
|
||||
}
|
||||
public boolean hasValue(int cp) {
|
||||
return ucdData.isIdentifierStart(cp, true);
|
||||
return XID_Start_Set.contains(cp);
|
||||
}
|
||||
};
|
||||
|
||||
@ -372,7 +469,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
+ "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
|
||||
}
|
||||
public boolean hasValue(int cp) {
|
||||
return ucdData.isIdentifierContinue_NO_Cf(cp, true);
|
||||
return XID_Continue_Set.contains(cp);
|
||||
}
|
||||
};
|
||||
|
||||
@ -458,7 +555,6 @@ of characters, the first of which has a non-zero combining class.
|
||||
shortName = "Comp_Ex";
|
||||
defaultValueStyle = defaultPropertyStyle = SHORT;
|
||||
header = "# Derived Property: " + name
|
||||
+ ": Full Composition Exclusion"
|
||||
+ "\r\n# Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions";
|
||||
}
|
||||
public boolean hasValue(int cp) {
|
||||
@ -469,6 +565,9 @@ of characters, the first of which has a non-zero combining class.
|
||||
if (isCompEx(cp)) return true;
|
||||
return false;
|
||||
}
|
||||
/*public String getListingValue(int cp) {
|
||||
return "Comp_Ex";
|
||||
}*/
|
||||
/*
|
||||
public String getListingValue(int cp) {
|
||||
if (getValueType() != BINARY) return getValue(cp, SHORT);
|
||||
@ -511,8 +610,8 @@ of characters, the first of which has a non-zero combining class.
|
||||
}
|
||||
public String getValue(int cp, byte style) {
|
||||
if (!ucdData.isRepresented(cp)) return "";
|
||||
String b = Default.nfkc.normalize(fold(cp));
|
||||
String c = Default.nfkc.normalize(fold(b));
|
||||
String b = nfkc.normalize(fold(cp));
|
||||
String c = nfkc.normalize(fold(b));
|
||||
if (c.equals(b)) return "";
|
||||
return "FNC; " + Utility.hex(c);
|
||||
} // default
|
||||
@ -533,8 +632,8 @@ of characters, the first of which has a non-zero combining class.
|
||||
}
|
||||
public String getValue(int cp, byte style) {
|
||||
if (!ucdData.isRepresented(cp)) return "";
|
||||
String b = Default.nfc.normalize(fold(cp));
|
||||
String c = Default.nfc.normalize(fold(b));
|
||||
String b = nfc.normalize(fold(cp));
|
||||
String c = nfc.normalize(fold(b));
|
||||
if (c.equals(b)) return "";
|
||||
return "FN; " + Utility.hex(c);
|
||||
} // default
|
||||
@ -565,6 +664,94 @@ of characters, the first of which has a non-zero combining class.
|
||||
}
|
||||
};
|
||||
|
||||
// Case_Sensitive: true for every code point that appears on either side of a
// case mapping (full or simple; lower, upper, title or fold) whose result
// differs from its input. The set is built lazily on the first hasValue() call
// and a report is written to Case_Sensitive_Log.txt while building.
dprops[Case_Sensitive] = new UnicodeProperty() {
    {
        type = DERIVED_CORE;
        isStandard = false;
        name = "Case_Sensitive";
        hasUnassigned = false;
        shortName = "CS";
        header = "# Derived Property: " + name
            + "\r\n# Generated from all characters that are either on the right or left side of a case mapping";
    }

    // Lazily built result set; null until the first successful build.
    UnicodeSet case_sensitive = null;
    // Scratch set reused by addCase().
    UnicodeSet tempSet = new UnicodeSet();
    // Lowercase + Uppercase + Lt; only used to decide what gets logged/reported.
    UnicodeSet cased = null;
    // Log writer; only open while the cache is being built.
    PrintWriter log;

    /**
     * Applies one case mapping to cps and, if the result differs, adds every
     * code point of both the source and the result to case_sensitive.
     * Additions that are not all contained in cased are written to the log.
     *
     * @param cps string to map
     * @param c1  first argument passed through to ucdData.getCase (called here with FULL or SIMPLE)
     * @param c2  second argument passed through to ucdData.getCase (called here with LOWER, UPPER, TITLE or FOLD)
     */
    private void addCase(String cps, byte c1, byte c2) {
        String temp = ucdData.getCase(cps, c1, c2);
        if (temp.equals(cps)) return;

        tempSet.clear();
        tempSet.addAll(cps);
        tempSet.addAll(temp);
        if (!case_sensitive.containsAll(tempSet)) {
            // keep only the code points that are actually new
            tempSet.removeAll(case_sensitive);
            if (!cased.containsAll(tempSet)) {
                log.println();
                log.println("Adding " + tempSet + " because of: ");
                log.println("\t" + ucdData.getCodeAndName(cps));
                log.println("=>\t" + ucdData.getCodeAndName(temp));
            }
            case_sensitive.addAll(tempSet);
        }
    }

    /**
     * Builds case_sensitive (and cased) and writes the comparison report.
     * On failure the cache is reset to null, so a partially filled set is never
     * used by later calls; the log writer is closed on every path.
     *
     * @throws ChainException wrapping any exception raised during the build
     */
    private void buildCache() {
        try {
            log = Utility.openPrintWriter("Case_Sensitive_Log.txt", Utility.UTF8_UNIX);

            System.out.println("Building Case-Sensitive cache");
            case_sensitive = new UnicodeSet();
            cased = DerivedProperty.make(PropLowercase, ucdData).getSet()
                .addAll(DerivedProperty.make(PropUppercase, ucdData).getSet())
                .addAll(UnifiedBinaryProperty.make(CATEGORY | Lt).getSet());

            // Inclusive upper bound so that the last code point, U+10FFFF, is examined too.
            for (int c = 0; c <= 0x10FFFF; ++c) {
                Utility.dot(c);
                // skip cases that can't matter
                if (!ucdData.isAssigned(c)) continue;

                String cps = UTF16.valueOf(c);
                addCase(cps, FULL, LOWER);
                addCase(cps, FULL, UPPER);
                addCase(cps, FULL, TITLE);
                addCase(cps, FULL, FOLD);
                addCase(cps, SIMPLE, LOWER);
                addCase(cps, SIMPLE, UPPER);
                addCase(cps, SIMPLE, TITLE);
                addCase(cps, SIMPLE, FOLD);
            }
            Utility.fixDot();

            UnicodeSet temp;
            log.println("Cased, but not Case_Sensitive");
            temp = new UnicodeSet().addAll(cased).removeAll(case_sensitive);
            Utility.showSetNames(log, "", temp, false, false, ucdData);

            log.println("Case_Sensitive, but not Cased");
            temp = new UnicodeSet().addAll(case_sensitive).removeAll(cased);
            Utility.showSetNames(log, "", temp, false, false, ucdData);

            log.println("Both Case_Sensitive, and Cased");
            temp = new UnicodeSet().addAll(case_sensitive).retainAll(cased);
            log.println(temp);
            System.out.println("Done Building Case-Sensitive cache");
        } catch (Exception e) {
            // Do not leave a half-built set behind: a later call must not trust it.
            case_sensitive = null;
            throw new ChainException("internal error", null, e);
        } finally {
            if (log != null) {
                log.close();
                log = null;
            }
        }
    }

    /**
     * Returns whether cp is in the Case_Sensitive set, building the set first
     * if this is the first call.
     */
    public boolean hasValue(int cp) {
        if (case_sensitive == null) {
            buildCache();
        }
        return case_sensitive.contains(cp);
    }
};
|
||||
|
||||
dprops[Other_Case_Ignorable] = new UnicodeProperty() {
|
||||
{
|
||||
name = "Other_Case_Ignorable";
|
||||
@ -602,8 +789,8 @@ of characters, the first of which has a non-zero combining class.
|
||||
}
|
||||
public boolean hasValue(int cp) {
|
||||
if (hasSoftDot(cp)) return true;
|
||||
if (Default.nfkd.isNormalized(cp)) return false;
|
||||
String decomp = Default.nfd.normalize(cp);
|
||||
if (nfkd.isNormalized(cp)) return false;
|
||||
String decomp = nfd.normalize(cp);
|
||||
boolean ok = false;
|
||||
for (int i = decomp.length()-1; i >= 0; --i) {
|
||||
int ch = UTF16.charAt(decomp, i);
|
||||
@ -650,16 +837,19 @@ of characters, the first of which has a non-zero combining class.
|
||||
name = "Grapheme_Extend";
|
||||
shortName = "GrExt";
|
||||
header = header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: Me + Mn + Mc + Other_Grapheme_Extend - Grapheme_Link - CGJ"
|
||||
+ "\r\n# (CGJ = U+034F)";
|
||||
+ "\r\n# Generated from: Me + Mn + Other_Grapheme_Extend"
|
||||
+ "\r\n# Note: depending on an application's interpretation of Co (private use),"
|
||||
+ "\r\n# they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither."
|
||||
;
|
||||
|
||||
}
|
||||
public boolean hasValue(int cp) {
|
||||
if (cp == 0x034F) return false;
|
||||
if (ucdData.getBinaryProperty(cp, GraphemeLink)) return false;
|
||||
//if (cp == 0x034F) return false;
|
||||
//if (ucdData.getBinaryProperty(cp, GraphemeLink)) return false;
|
||||
// || cat == Mc
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Me || cat == Mn || cat == Mc
|
||||
|| ucdData.getBinaryProperty(cp,Other_GraphemeExtend)) return true;
|
||||
if (cat == Me || cat == Mn
|
||||
|| ucdData.getBinaryProperty(cp,Other_GraphemeExtend)) return true;
|
||||
return false;
|
||||
}
|
||||
};
|
||||
@ -671,14 +861,16 @@ of characters, the first of which has a non-zero combining class.
|
||||
shortName = "GrBase";
|
||||
|
||||
header = header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp"
|
||||
+ "\r\n# - Grapheme_Extend - Grapheme_Link - CGJ";
|
||||
+ "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend"
|
||||
+ "\r\n# Note: depending on an application's interpretation of Co (private use),"
|
||||
+ "\r\n# they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither."
|
||||
;
|
||||
}
|
||||
public boolean hasValue(int cp) {
|
||||
if (cp == 0x034F) return false;
|
||||
//if (cp == 0x034F) return false;
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn || cat == Zl || cat == Zp
|
||||
|| ucdData.getBinaryProperty(cp,GraphemeLink)) return false;
|
||||
if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn || cat == Zl || cat == Zp) return false;
|
||||
// || ucdData.getBinaryProperty(cp,GraphemeLink)
|
||||
if (dprops[GraphemeExtend].hasValue(cp)) return false;
|
||||
return true;
|
||||
}
|
||||
@ -702,11 +894,11 @@ of characters, the first of which has a non-zero combining class.
|
||||
|| ucdData.getBinaryProperty(cp, Other_Lowercase)) return Ll;
|
||||
if (cat == Lt || cat == Lo || cat == Lm || cat == Nl) return cat;
|
||||
|
||||
// if (true) throw new IllegalArgumentException("FIX Default.nf[2]");
|
||||
// if (true) throw new IllegalArgumentException("FIX nf[2]");
|
||||
|
||||
if (Default.nf[NFKD].isNormalized(cp)) return Lo;
|
||||
if (nf[NFKD].isNormalized(cp)) return Lo;
|
||||
|
||||
String norm = Default.nf[NFKD].normalize(cp);
|
||||
String norm = nf[NFKD].normalize(cp);
|
||||
int cp2;
|
||||
boolean gotUpper = false;
|
||||
boolean gotLower = false;
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java,v $
|
||||
* $Date: 2002/06/22 01:21:09 $
|
||||
* $Revision: 1.7 $
|
||||
* $Date: 2003/02/25 23:38:23 $
|
||||
* $Revision: 1.8 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -31,6 +31,7 @@ class DiffPropertyLister extends PropertyLister {
|
||||
}
|
||||
breakByCategory = property != NOPROPERTY;
|
||||
useKenName = false;
|
||||
usePropertyComment = false;
|
||||
}
|
||||
|
||||
public DiffPropertyLister(String oldUCDName, String newUCDName, PrintWriter output) {
|
||||
@ -61,20 +62,27 @@ class DiffPropertyLister extends PropertyLister {
|
||||
|
||||
public String optionalComment(int cp) {
|
||||
String normal = super.optionalComment(cp);
|
||||
return oldUCD.getModCatID_fromIndex(
|
||||
oldUCD.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : 0))
|
||||
+ "/" + normal;
|
||||
if (oldUCD != null && breakByCategory) {
|
||||
byte modCat = oldUCD.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : 0);
|
||||
normal = oldUCD.getModCatID_fromIndex(modCat) + "/" + normal;
|
||||
}
|
||||
return normal;
|
||||
}
|
||||
|
||||
|
||||
byte getModCat(int cp) {
|
||||
byte result = ucdData.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : -1);
|
||||
//System.out.println(breakByCategory + ", " + ucdData.getModCatID_fromIndex(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public byte status(int cp) {
|
||||
if (newProp == null) {
|
||||
if (ucdData.isAllocated(cp) && (oldUCD == null || !oldUCD.isAllocated(cp))) {
|
||||
set.add(cp);
|
||||
return INCLUDE;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
return EXCLUDE;
|
||||
}
|
||||
}
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $
|
||||
* $Date: 2002/08/09 23:56:24 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2003/02/25 23:38:23 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -83,14 +83,14 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
Default.setUCD();
|
||||
}
|
||||
|
||||
static UnicodeSet extraAlpha = new UnicodeSet("[\\u02B9-\\u02BA\\u02C2-\\u02CF\\u02D2-\\u02DF\\u02E5\\u02ED\\u05F3]");
|
||||
static UnicodeSet extraAlpha = new UnicodeSet("[\\u02B9-\\u02BA\\u02C2-\\u02CF\\u02D2-\\u02DF\\u02E5-\\u02ED\\u05F3]");
|
||||
static UnicodeSet alphabeticSet = UnifiedBinaryProperty.make(DERIVED | PropAlphabetic).getSet()
|
||||
.addAll(extraAlpha);
|
||||
|
||||
static UnicodeSet ideographicSet = UnifiedBinaryProperty.make(BINARY_PROPERTIES | Ideographic).getSet();
|
||||
|
||||
static {
|
||||
System.out.println("alphabetic: " + alphabeticSet.toPattern(true));
|
||||
if (false) System.out.println("alphabetic: " + alphabeticSet.toPattern(true));
|
||||
}
|
||||
|
||||
|
||||
@ -116,16 +116,16 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
PrintWriter systemPrintWriter = new PrintWriter(System.out);
|
||||
gwb.printLine(systemPrintWriter, "n\u0308't", true, true, false);
|
||||
systemPrintWriter.flush();
|
||||
}
|
||||
|
||||
if (false) {
|
||||
GenerateSentenceBreakTest foo = new GenerateSentenceBreakTest();
|
||||
foo.isBreak("(\"Go.\") (He did)", 5, true);
|
||||
|
||||
showSet("sepSet", GenerateSentenceBreakTest.sepSet);
|
||||
showSet("atermSet", GenerateSentenceBreakTest.atermSet);
|
||||
showSet("termSet", GenerateSentenceBreakTest.termSet);
|
||||
}
|
||||
|
||||
if (true) {
|
||||
GenerateSentenceBreakTest foo = new GenerateSentenceBreakTest();
|
||||
//foo.isBreak("(\"Go.\") (He did)", 5, true);
|
||||
foo.isBreak("3.4", 2, true);
|
||||
}
|
||||
|
||||
new GenerateSentenceBreakTest().run();
|
||||
|
||||
@ -276,7 +276,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
|
||||
PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest.html", Utility.UTF8_WINDOWS);
|
||||
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>"
|
||||
+ fileName + "</title></head>");
|
||||
+ fileName + " Break Chart</title></head>");
|
||||
out.println("<body bgcolor='#FFFFFF'><h3>Current:</h3>");
|
||||
|
||||
|
||||
@ -304,7 +304,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest"
|
||||
+ (recommended & recommendedDiffers() ? "_NEW" : "")
|
||||
+ (shortVersion ? "_SHORT" : "")
|
||||
+ ".txt", Utility.LATIN1_WINDOWS);
|
||||
+ ".txt", Utility.UTF8_WINDOWS);
|
||||
int counter = 0;
|
||||
|
||||
out.println("# Default " + fileName + " Break Test");
|
||||
@ -623,6 +623,60 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
}
|
||||
|
||||
|
||||
static public class Context {
|
||||
public int cpBefore2, cpBefore, cpAfter, cpAfter2;
|
||||
public byte tBefore2, tBefore, tAfter, tAfter2;
|
||||
public String toString() {
|
||||
return "["
|
||||
+ Utility.hex(cpBefore2) + "(" + tBefore2 + "), "
|
||||
+ Utility.hex(cpBefore) + "(" + tBefore + "), "
|
||||
+ Utility.hex(cpAfter) + "(" + tAfter + "), "
|
||||
+ Utility.hex(cpAfter2) + "(" + tAfter2 + ")]";
|
||||
}
|
||||
}
|
||||
|
||||
public void getGraphemeBases(String source, int offset, boolean recommended, byte ignoreType, Context context) {
|
||||
context.cpBefore2 = context.cpBefore = context.cpAfter = context.cpAfter2 = -1;
|
||||
context.tBefore2 = context.tBefore = context.tAfter = context.tAfter2 = -1;
|
||||
//if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(source) + "; " + offset + "; " + ignoreType);
|
||||
|
||||
MyBreakIterator graphemeIterator = new MyBreakIterator();
|
||||
|
||||
graphemeIterator.set(source, offset);
|
||||
while (true) {
|
||||
int cp = graphemeIterator.previousBase();
|
||||
if (cp == -1) break;
|
||||
byte t = getResolvedType(cp, recommended);
|
||||
if (t == ignoreType) continue;
|
||||
|
||||
if (context.cpBefore == -1) {
|
||||
context.cpBefore = cp;
|
||||
context.tBefore = t;
|
||||
} else {
|
||||
context.cpBefore2 = cp;
|
||||
context.tBefore2 = t;
|
||||
break;
|
||||
}
|
||||
}
|
||||
graphemeIterator.set(source, offset);
|
||||
while (true) {
|
||||
int cp = graphemeIterator.nextBase();
|
||||
if (cp == -1) break;
|
||||
byte t = getResolvedType(cp, recommended);
|
||||
if (t == ignoreType) continue;
|
||||
|
||||
if (context.cpAfter == -1) {
|
||||
context.cpAfter = cp;
|
||||
context.tAfter = t;
|
||||
} else {
|
||||
context.cpAfter2 = cp;
|
||||
context.tAfter2 = t;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// ========================================
|
||||
|
||||
static class GenerateLineBreakTest extends GenerateBreakTest {
|
||||
@ -1050,7 +1104,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
if (cp == 0xA) return LF;
|
||||
if (cp == 0xD) return CR;
|
||||
if (recommended) {
|
||||
if (cp == 0x034F) return CGJ;
|
||||
if (cp == 0x034F) return Extend;
|
||||
}
|
||||
if (cp == 0x2028 || cp == 0x2029) return Control;
|
||||
|
||||
@ -1178,7 +1232,6 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
||||
|
||||
static UnicodeSet extraKatakana = new UnicodeSet("[" + LENGTH + HALFWIDTH_KATAKANA + KATAKANA_ITERATION + "]");
|
||||
|
||||
//static UnicodeProperty LineBreakIdeographic = UnifiedBinaryProperty.make(LINE_BREAK | LB_ID);
|
||||
static UnicodeProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase);
|
||||
static UnicodeProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink);
|
||||
|
||||
@ -1325,52 +1378,6 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
||||
return 3;
|
||||
}
|
||||
|
||||
static public class Context {
|
||||
public int cpBefore2, cpBefore, cpAfter, cpAfter2;
|
||||
public byte tBefore2, tBefore, tAfter, tAfter2;
|
||||
}
|
||||
|
||||
public void getGraphemeBases(String source, int offset, boolean recommended, Context context) {
|
||||
context.cpBefore2 = context.cpBefore = context.cpAfter = context.cpAfter2 = -1;
|
||||
context.tBefore2 = context.tBefore = context.tAfter = context.tAfter2 = -1;
|
||||
|
||||
MyBreakIterator graphemeIterator = new MyBreakIterator();
|
||||
|
||||
graphemeIterator.set(source, offset);
|
||||
while (true) {
|
||||
int cp = graphemeIterator.previousBase();
|
||||
if (cp == -1) break;
|
||||
byte t = getResolvedType(cp, recommended);
|
||||
if (t == Format) continue;
|
||||
|
||||
if (context.cpBefore == -1) {
|
||||
context.cpBefore = cp;
|
||||
context.tBefore = t;
|
||||
} else {
|
||||
context.cpBefore2 = cp;
|
||||
context.tBefore2 = t;
|
||||
break;
|
||||
}
|
||||
}
|
||||
graphemeIterator.set(source, offset);
|
||||
while (true) {
|
||||
int cp = graphemeIterator.nextBase();
|
||||
if (cp == -1) break;
|
||||
byte t = getResolvedType(cp, recommended);
|
||||
if (t == Format) continue;
|
||||
|
||||
if (context.cpAfter == -1) {
|
||||
context.cpAfter = cp;
|
||||
context.tAfter = t;
|
||||
} else {
|
||||
context.cpAfter2 = cp;
|
||||
context.tAfter2 = t;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public boolean isBreak(String source, int offset, boolean recommended) {
|
||||
recommended = true; // don't care about old stuff
|
||||
|
||||
@ -1391,7 +1398,7 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
||||
|
||||
// now get the base character before and after, and their types
|
||||
|
||||
getGraphemeBases(source, offset, recommended, context);
|
||||
getGraphemeBases(source, offset, recommended, Format, context);
|
||||
|
||||
byte before = context.tBefore;
|
||||
byte after = context.tAfter;
|
||||
@ -1457,42 +1464,55 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
||||
static class GenerateSentenceBreakTest extends GenerateBreakTest {
|
||||
|
||||
static final byte Format = 0, Sep = 1, Sp = 2, OLetter = 3, Lower = 4, Upper = 5,
|
||||
Close = 6, ATerm = 7, Term = 8, Other = 9,
|
||||
Numeric = 6, Close = 7, ATerm = 8, Term = 9, Other = 10,
|
||||
LIMIT = Other + 1;
|
||||
|
||||
static final String[] Names = {"Format", "Sep", "Sp", "OLetter", "Lower", "Upper",
|
||||
static final String[] Names = {"Format", "Sep", "Sp", "OLetter", "Lower", "Upper", "Numeric",
|
||||
"Close", "ATerm", "Term", "Other" };
|
||||
|
||||
static GenerateGraphemeBreakTest grapheme = new GenerateGraphemeBreakTest();
|
||||
|
||||
static UnicodeSet sepSet = new UnicodeSet("[\\u000a\\u000d\\u0085\\u2029\\u2028]");
|
||||
static UnicodeSet atermSet = new UnicodeSet("[\\u002E]");
|
||||
static UnicodeSet termSet = new UnicodeSet("[\\u0021\\u003F\\u0589\\u061f\\u06d4\\u0700-\\u0702\\u0934"
|
||||
+ "\\u1362\\u1367\\u1368\\u1803\\u1809\\u203c\\u203d\\u2048\\u2049\\u3002\\ufe52\\ufe57\\uff01\\uff0e\\uff1f\\uff61]");
|
||||
static UnicodeSet termSet = new UnicodeSet(
|
||||
"[\\u0021\\u003F\\u0589\\u061f\\u06d4\\u0700-\\u0702\\u0934"
|
||||
+ "\\u1362\\u1367\\u1368\\u104A\\u104B\\u166E"
|
||||
+ "\\u1803\\u1809\\u203c\\u203d"
|
||||
+ "\\u2048\\u2049\\u3002\\ufe52\\ufe57\\uff01\\uff0e\\uff1f\\uff61]");
|
||||
|
||||
static UnicodeProperty lowercaseProp = UnifiedBinaryProperty.make(DERIVED | PropLowercase);
|
||||
static UnicodeProperty uppercaseProp = UnifiedBinaryProperty.make(DERIVED | PropUppercase);
|
||||
|
||||
UnicodeSet linebreakNS = UnifiedBinaryProperty.make(LINE_BREAK | LB_NU).getSet();
|
||||
|
||||
{
|
||||
|
||||
fileName = "Sentence";
|
||||
extraSamples = new String[] {
|
||||
|
||||
};
|
||||
String[] temp = new String[] {
|
||||
|
||||
extraSingleSamples = new String[] {
|
||||
"(\"Go.\") (He did.)",
|
||||
"(\"Go?\") (He did.)",
|
||||
"(\u201CGo?\u201D) (He did.)",
|
||||
"U.S.A\u0300. is",
|
||||
"U.S.A\u0300? He",
|
||||
"U.S.A\u0300.",
|
||||
"\u4e00.\u4300",
|
||||
"\u4e00?\u4300",
|
||||
"3.4",
|
||||
"c.d",
|
||||
"etc.)\u2019 \u2018(the",
|
||||
"etc.)\u2019 \u2018(The",
|
||||
"the resp. leaders are",
|
||||
"\u5B57.\u5B57",
|
||||
"etc.\u5B83",
|
||||
"etc.\u3002",
|
||||
"\u5B57\u3002\u5B83",
|
||||
};
|
||||
extraSingleSamples = new String [temp.length * 2];
|
||||
System.arraycopy(temp, 0, extraSingleSamples, 0, temp.length);
|
||||
for (int i = 0; i < temp.length; ++i) {
|
||||
extraSingleSamples[i+temp.length] = insertEverywhere(temp[i], "\u2060", grapheme);
|
||||
String[] temp = new String [extraSingleSamples.length * 2];
|
||||
System.arraycopy(extraSingleSamples, 0, temp, 0, extraSingleSamples.length);
|
||||
for (int i = 0; i < extraSingleSamples.length; ++i) {
|
||||
temp[i+extraSingleSamples.length] = insertEverywhere(extraSingleSamples[i], "\u2060", grapheme);
|
||||
}
|
||||
extraSingleSamples = temp;
|
||||
|
||||
}
|
||||
|
||||
@ -1509,9 +1529,10 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
||||
if (cat == Cf) return Format;
|
||||
if (sepSet.contains(cp)) return Sep;
|
||||
if (Default.ucd.getBinaryProperty(cp, White_space)) return Sp;
|
||||
if (alphabeticSet.contains(cp)) return OLetter;
|
||||
if (linebreakNS.contains(cp)) return Numeric;
|
||||
if (lowercaseProp.hasValue(cp)) return Lower;
|
||||
if (uppercaseProp.hasValue(cp) || cat == Lt) return Upper;
|
||||
if (alphabeticSet.contains(cp)) return OLetter;
|
||||
if (atermSet.contains(cp)) return ATerm;
|
||||
if (termSet.contains(cp)) return Term;
|
||||
if (cat == Po || cat == Pe
|
||||
@ -1529,6 +1550,8 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
||||
return 1;
|
||||
}
|
||||
|
||||
static Context context = new Context();
|
||||
|
||||
public boolean isBreak(String source, int offset, boolean recommended) {
|
||||
|
||||
rule = "1";
|
||||
@ -1541,8 +1564,8 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
||||
|
||||
// Sep ÷ (3)
|
||||
rule = "3";
|
||||
byte before = getResolvedType(source.charAt(offset-1), recommended);
|
||||
if (before == Sep) return true;
|
||||
byte beforeChar = getResolvedType(source.charAt(offset-1), recommended);
|
||||
if (beforeChar == Sep) return true;
|
||||
|
||||
// Treat a grapheme cluster as if it were a single character:
|
||||
// the first base character, if there is one; otherwise the first character.
|
||||
@ -1556,17 +1579,29 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
||||
rule="3";
|
||||
if (!grapheme.isBreak( source, offset, recommended)) return false;
|
||||
|
||||
// Do not break after ambiguous terminators like period, if the first following letter is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
|
||||
// ATerm Close* Sp*×(¬( OLetter | Upper ))* Lower(6)
|
||||
// ATerm ×Upper (7)
|
||||
|
||||
// Break after sentence terminators, but include closing punctuation, trailing spaces, and (optionally) a paragraph separator.
|
||||
// ( Term | ATerm ) Close*×( Close | Sp | Sep )(8)
|
||||
// ( Term | ATerm ) Close* Sp×( Sp | Sep )(9)
|
||||
// ( Term | ATerm ) Close* Sp*÷(10)
|
||||
getGraphemeBases(source, offset, recommended, Format, context);
|
||||
|
||||
byte before = context.tBefore;
|
||||
byte after = context.tAfter;
|
||||
byte before2 = context.tBefore2;
|
||||
byte after2 = context.tAfter2;
|
||||
|
||||
|
||||
// Do not break after ambiguous terminators like period, if immediately followed by a number or lowercase letter, is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
|
||||
|
||||
// ATerm × (Lower | Numeric) (6)
|
||||
// Upper ATerm × Upper (7)
|
||||
|
||||
if (before == ATerm) {
|
||||
rule = "6";
|
||||
if (after == Lower || after == Numeric) return false;
|
||||
rule = "7";
|
||||
if (DEBUG_GRAPHEMES) System.out.println(context + ", " + Upper);
|
||||
if (before2 == Upper && after == Upper) return false;
|
||||
}
|
||||
|
||||
// The following cases are all handled together.
|
||||
|
||||
// These cases are all handled together.
|
||||
// First we loop backwards, checking for the different types.
|
||||
|
||||
MyBreakIterator graphemeIterator = new MyBreakIterator();
|
||||
@ -1620,19 +1655,18 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
||||
if (lookAfter == -1) {
|
||||
// Otherwise, do not break
|
||||
// Any × Any (11)
|
||||
rule = "11";
|
||||
rule = "12";
|
||||
return false;
|
||||
}
|
||||
|
||||
// Do not break after ambiguous terminators like period, if the first following letter is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
|
||||
// ATerm Close* Sp*×(¬( OLetter | Upper ))* Lower(6)
|
||||
// ATerm ×Upper (7)
|
||||
// ATerm Close* Sp*×(¬( OLetter))* Lower(8)
|
||||
|
||||
// Break after sentence terminators, but include closing punctuation, trailing spaces, and (optionally) a paragraph separator.
|
||||
// ( Term | ATerm ) Close*×( Close | Sp | Sep )(8)
|
||||
// ( Term | ATerm ) Close* Sp×( Sp | Sep )(9)
|
||||
// ( Term | ATerm ) Close* Sp*÷(10)
|
||||
|
||||
// ( Term | ATerm ) Close*×( Close | Sp | Sep )(9)
|
||||
// ( Term | ATerm ) Close* Sp×( Sp | Sep )(10)
|
||||
// ( Term | ATerm ) Close* Sp*÷(11)
|
||||
|
||||
|
||||
// We DID find one. Loop to see if the right side is ok.
|
||||
|
||||
graphemeIterator.set(source, offset);
|
||||
@ -1648,16 +1682,16 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
||||
if (isFirst) {
|
||||
isFirst = false;
|
||||
if (lookAfter == ATerm && t == Upper) {
|
||||
rule = "7";
|
||||
rule = "8";
|
||||
return false;
|
||||
}
|
||||
if (gotSpace) {
|
||||
if (t == Sp || t == Sep) {
|
||||
rule = "9";
|
||||
rule = "10";
|
||||
return false;
|
||||
}
|
||||
} else if (t == Close || t == Sp || t == Sep) {
|
||||
rule = "8";
|
||||
rule = "9";
|
||||
return false;
|
||||
}
|
||||
if (lookAfter == Term) break;
|
||||
@ -1666,16 +1700,18 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
||||
// at this point, we have an ATerm. All other conditions are ok, but we need to verify 6
|
||||
if (t != OLetter && t != Upper && t != Lower) continue;
|
||||
if (t == Lower) {
|
||||
rule = "6";
|
||||
rule = "8";
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
rule = "10";
|
||||
rule = "11";
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
static final boolean DEBUG_GRAPHEMES = false;
|
||||
|
||||
static class MyBreakIterator {
|
||||
int offset = 0;
|
||||
String string = "";
|
||||
@ -1683,6 +1719,7 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
||||
boolean recommended = true;
|
||||
|
||||
public MyBreakIterator set(String source, int offset) {
|
||||
//if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(string) + "; " + offset);
|
||||
string = source;
|
||||
this.offset = offset;
|
||||
return this;
|
||||
@ -1694,6 +1731,7 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
||||
for (++offset; offset < string.length(); ++offset) {
|
||||
if (breaker.isBreak(string, offset, recommended)) break;
|
||||
}
|
||||
//if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(result));
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -1702,7 +1740,9 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
||||
for (--offset; offset >= 0; --offset) {
|
||||
if (breaker.isBreak(string, offset, recommended)) break;
|
||||
}
|
||||
return UTF16.charAt(string, offset);
|
||||
int result = UTF16.charAt(string, offset);
|
||||
//if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(result));
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
|
||||
* $Date: 2002/10/05 01:28:58 $
|
||||
* $Revision: 1.12 $
|
||||
* $Date: 2003/02/25 23:38:23 $
|
||||
* $Revision: 1.13 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -45,10 +45,19 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
System.out.println("Writing Log: " + "CaseFoldingLog" + GenerateData.getFileSuffix(true));
|
||||
|
||||
System.out.println("Making Full Data");
|
||||
Map fullData = getCaseFolding(true, NF_CLOSURE);
|
||||
Map fullData = getCaseFolding(true, NF_CLOSURE, "");
|
||||
Utility.fixDot();
|
||||
|
||||
System.out.println("Making Simple Data");
|
||||
Map simpleData = getCaseFolding(false, NF_CLOSURE);
|
||||
Map simpleData = getCaseFolding(false, NF_CLOSURE, "");
|
||||
// write the data
|
||||
|
||||
System.out.println("Making Turkish Full Data");
|
||||
Map fullDataTurkish = getCaseFolding(true, NF_CLOSURE, "tr");
|
||||
Utility.fixDot();
|
||||
|
||||
System.out.println("Making Simple Data");
|
||||
Map simpleDataTurkish = getCaseFolding(false, NF_CLOSURE, "tr");
|
||||
// write the data
|
||||
|
||||
Utility.fixDot();
|
||||
@ -58,7 +67,8 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
String directory = "DerivedData/";
|
||||
String newFile = directory + filename + GenerateData.getFileSuffix(true);
|
||||
PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
String mostRecent = GenerateData.generateBat(directory, filename, GenerateData.getFileSuffix(true));
|
||||
String[] batName = {""};
|
||||
String mostRecent = GenerateData.generateBat(directory, filename, GenerateData.getFileSuffix(true), batName);
|
||||
|
||||
out.println("# CaseFolding" + GenerateData.getFileSuffix(false));
|
||||
out.println(GenerateData.generateDateLine());
|
||||
@ -81,7 +91,10 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
|
||||
String rFull = (String)fullData.get(UTF32.valueOf32(ch));
|
||||
String rSimple = (String)simpleData.get(UTF32.valueOf32(ch));
|
||||
if (rFull == null && rSimple == null) continue;
|
||||
String rFullTurkish = (String)fullDataTurkish.get(UTF32.valueOf32(ch));
|
||||
String rSimpleTurkish = (String)simpleDataTurkish.get(UTF32.valueOf32(ch));
|
||||
if (rFull == null && rSimple == null && rFullTurkish == null && rSimpleTurkish == null) continue;
|
||||
|
||||
if (rFull != null && rFull.equals(rSimple)
|
||||
|| (PICK_SHORT && UTF16.countCodePoint(rFull) == 1)) {
|
||||
String type = "C";
|
||||
@ -105,10 +118,16 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
drawLine(out, ch, "S", rSimple);
|
||||
}
|
||||
}
|
||||
if (rFullTurkish != null && !rFullTurkish.equals(rFull)) {
|
||||
drawLine(out, ch, "T", rFullTurkish);
|
||||
}
|
||||
if (rSimpleTurkish != null && !rSimpleTurkish.equals(rSimple)) {
|
||||
drawLine(out, ch, "t", rSimpleTurkish);
|
||||
}
|
||||
}
|
||||
out.close();
|
||||
log.close();
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
|
||||
}
|
||||
|
||||
/* Goal is following (with no entries for 0131 or 0069)
|
||||
@ -146,7 +165,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
static int probeCh = 0x01f0;
|
||||
static String shower = UTF16.valueOf(probeCh);
|
||||
|
||||
static Map getCaseFolding(boolean full, boolean nfClose) throws java.io.IOException {
|
||||
static Map getCaseFolding(boolean full, boolean nfClose, String condition) throws java.io.IOException {
|
||||
Map data = new TreeMap();
|
||||
Map repChar = new TreeMap();
|
||||
//String option = "";
|
||||
@ -157,7 +176,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
Utility.dot(ch);
|
||||
//if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch));
|
||||
if (!Default.ucd.isRepresented(ch)) continue;
|
||||
getClosure(ch, data, full, nfClose);
|
||||
getClosure(ch, data, full, nfClose, condition);
|
||||
}
|
||||
|
||||
// get the representative characters
|
||||
@ -180,7 +199,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
Iterator it2 = set.iterator();
|
||||
while (it2.hasNext()) {
|
||||
String s2 = (String)it2.next();
|
||||
int s2Good = goodness(s2, full);
|
||||
int s2Good = goodness(s2, full, condition);
|
||||
if (s2Good > repGood) {
|
||||
rep = s2;
|
||||
repGood = s2Good;
|
||||
@ -206,12 +225,20 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
log.println(" Set:\t" + toString(set,true, true));
|
||||
}
|
||||
|
||||
log.println();
|
||||
log.println();
|
||||
log.println(rep + "\t#" + Default.ucd.getName(rep));
|
||||
|
||||
// Add it for all the elements of the set
|
||||
|
||||
it2 = set.iterator();
|
||||
while (it2.hasNext()) {
|
||||
String s2 = (String)it2.next();
|
||||
if (UTF16.countCodePoint(s2) == 1 && !s2.equals(rep)) {
|
||||
if (s2.equals(rep)) continue;
|
||||
|
||||
log.println(s2 + "\t#" + Default.ucd.getName(s2));
|
||||
|
||||
if (UTF16.countCodePoint(s2) == 1) {
|
||||
repChar.put(UTF32.getCodePointSubstring(s2,0), rep);
|
||||
charsUsed.set(UTF16.charAt(s2, 0));
|
||||
}
|
||||
@ -225,14 +252,14 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
static final int NFC_FORMAT = 64;
|
||||
static final int ISLOWER = 128;
|
||||
|
||||
static int goodness(String s, boolean full) {
|
||||
static int goodness(String s, boolean full, String condition) {
|
||||
if (s == null) return 0;
|
||||
int result = 32-s.length();
|
||||
if (!PICK_SHORT) {
|
||||
result = s.length();
|
||||
}
|
||||
if (!full) result <<= 8;
|
||||
String low = lower(upper(s, full), full);
|
||||
String low = lower(upper(s, full, condition), full, condition);
|
||||
if (s.equals(low)) result |= ISLOWER;
|
||||
else if (PICK_SHORT && Default.nfd.normalize(s).equals(Default.nfd.normalize(low))) result |= ISLOWER;
|
||||
|
||||
@ -295,11 +322,11 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
}
|
||||
*/
|
||||
|
||||
static void getClosure(int ch, Map data, boolean full, boolean nfClose) {
|
||||
static void getClosure(int ch, Map data, boolean full, boolean nfClose, String condition) {
|
||||
String charStr = UTF32.valueOf32(ch);
|
||||
String lowerStr = lower(charStr, full);
|
||||
String titleStr = title(charStr, full);
|
||||
String upperStr = upper(charStr, full);
|
||||
String lowerStr = lower(charStr, full, condition);
|
||||
String titleStr = title(charStr, full, condition);
|
||||
String upperStr = upper(charStr, full, condition);
|
||||
if (charStr.equals(lowerStr) && charStr.equals(upperStr) && charStr.equals(titleStr)) return;
|
||||
if (DEBUG) System.err.println("Closure for " + Utility.hex(ch));
|
||||
|
||||
@ -327,47 +354,47 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
if (add(set, Default.nfkd.normalize(s), data)) continue main;
|
||||
if (add(set, Default.nfkc.normalize(s), data)) continue main;
|
||||
}
|
||||
if (add(set, lower(s, full), data)) continue main;
|
||||
if (add(set, title(s, full), data)) continue main;
|
||||
if (add(set, upper(s, full), data)) continue main;
|
||||
if (add(set, lower(s, full, condition), data)) continue main;
|
||||
if (add(set, title(s, full, condition), data)) continue main;
|
||||
if (add(set, upper(s, full, condition), data)) continue main;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static String lower(String s, boolean full) {
|
||||
String result = lower2(s,full);
|
||||
static String lower(String s, boolean full, String condition) {
|
||||
String result = lower2(s,full, condition);
|
||||
return result.replace('\u03C2', '\u03C3'); // HACK for lower
|
||||
}
|
||||
|
||||
// These functions are no longer necessary, since Default.ucd is parameterized,
|
||||
// but it's not worth changing
|
||||
|
||||
static String lower2(String s, boolean full) {
|
||||
static String lower2(String s, boolean full, String condition) {
|
||||
/*if (!full) {
|
||||
if (s.length() != 1) return s;
|
||||
return Default.ucd.getCase(UTF32.char32At(s,0), SIMPLE, LOWER);
|
||||
}
|
||||
*/
|
||||
return Default.ucd.getCase(s, full ? FULL : SIMPLE, LOWER);
|
||||
return Default.ucd.getCase(s, full ? FULL : SIMPLE, LOWER, condition);
|
||||
}
|
||||
|
||||
static String upper(String s, boolean full) {
|
||||
static String upper(String s, boolean full, String condition) {
|
||||
/* if (!full) {
|
||||
if (s.length() != 1) return s;
|
||||
return Default.ucd.getCase(UTF32.char32At(s,0), FULL, UPPER);
|
||||
}
|
||||
*/
|
||||
return Default.ucd.getCase(s, full ? FULL : SIMPLE, UPPER);
|
||||
return Default.ucd.getCase(s, full ? FULL : SIMPLE, UPPER, condition);
|
||||
}
|
||||
|
||||
static String title(String s, boolean full) {
|
||||
static String title(String s, boolean full, String condition) {
|
||||
/*if (!full) {
|
||||
if (s.length() != 1) return s;
|
||||
return Default.ucd.getCase(UTF32.char32At(s,0), FULL, TITLE);
|
||||
}
|
||||
*/
|
||||
return Default.ucd.getCase(s, full ? FULL : SIMPLE, TITLE);
|
||||
return Default.ucd.getCase(s, full ? FULL : SIMPLE, TITLE, condition);
|
||||
}
|
||||
|
||||
static boolean add(Set set, String s, Map data) {
|
||||
@ -557,7 +584,8 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
System.out.println("Writing");
|
||||
String newFile = "DerivedData/SpecialCasing" + suffix2 + GenerateData.getFileSuffix(true);
|
||||
PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
String mostRecent = GenerateData.generateBat("DerivedData/", "SpecialCasing", suffix2 + GenerateData.getFileSuffix(true));
|
||||
String[] batName = {""};
|
||||
String mostRecent = GenerateData.generateBat("DerivedData/", "SpecialCasing", suffix2 + GenerateData.getFileSuffix(true), batName);
|
||||
out.println("# SpecialCasing" + GenerateData.getFileSuffix(false));
|
||||
out.println(GenerateData.generateDateLine());
|
||||
out.println("#");
|
||||
@ -594,6 +622,6 @@ public class GenerateCaseFolding implements UCD_Types {
|
||||
}
|
||||
Utility.appendFile("SpecialCasingFooter.txt", Utility.UTF8, out);
|
||||
out.close();
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
|
||||
}
|
||||
}
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
|
||||
* $Date: 2002/10/05 01:28:58 $
|
||||
* $Revision: 1.23 $
|
||||
* $Date: 2003/02/25 23:38:22 $
|
||||
* $Revision: 1.24 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -116,16 +116,13 @@ public class GenerateData implements UCD_Types {
|
||||
output.println(generateDateLine());
|
||||
output.println("#");
|
||||
if (headerChoice == HEADER_SCRIPTS) {
|
||||
output.println("# For documentation, see UTR #24: Script Names");
|
||||
output.println("# http://www.unicode.org/unicode/reports/tr24/");
|
||||
} else if (headerChoice == HEADER_EXTEND) {
|
||||
output.println("# Unicode Character Database: Extended Properties");
|
||||
output.println("# For documentation, see PropList.html");
|
||||
} else {
|
||||
output.println("# Unicode Character Database: Derived Property Data");
|
||||
output.println("# Generated algorithmically from the Unicode Character Database");
|
||||
output.println("# For documentation, see DerivedProperties.html");
|
||||
}
|
||||
output.println("# For documentation, see UCD.html");
|
||||
output.println("# Note: Unassigned and Noncharacter codepoints are omitted,");
|
||||
output.println("# except when listing Noncharacter or Cn.");
|
||||
output.println(HORIZONTAL_LINE);
|
||||
@ -144,12 +141,14 @@ public class GenerateData implements UCD_Types {
|
||||
String newFile = directory + fileName + getFileSuffix(true);
|
||||
System.out.println("New File: " + newFile);
|
||||
PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
String mostRecent = generateBat(directory, fileName, getFileSuffix(true));
|
||||
String[] batName = {""};
|
||||
String mostRecent = generateBat(directory, fileName, getFileSuffix(true), batName);
|
||||
System.out.println("Most recent: " + mostRecent);
|
||||
|
||||
doHeader(fileName + getFileSuffix(false), output, headerChoice);
|
||||
for (int i = 0; i < DERIVED_PROPERTY_LIMIT; ++i) {
|
||||
UnicodeProperty up = DerivedProperty.make(i, Default.ucd);
|
||||
if (up == null) continue;
|
||||
boolean keepGoing = true;
|
||||
if (!up.isStandard()) keepGoing = false;
|
||||
if ((up.getType() & type) == 0) keepGoing = false;
|
||||
@ -164,7 +163,7 @@ public class GenerateData implements UCD_Types {
|
||||
output.flush();
|
||||
}
|
||||
output.close();
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -192,7 +191,8 @@ public class GenerateData implements UCD_Types {
|
||||
Default.setUCD();
|
||||
String newFile = "DerivedData/CompositionExclusions" + getFileSuffix(true);
|
||||
PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
String mostRecent = generateBat("DerivedData/", "CompositionExclusions", getFileSuffix(true));
|
||||
String[] batName = {""};
|
||||
String mostRecent = generateBat("DerivedData/", "CompositionExclusions", getFileSuffix(true), batName);
|
||||
|
||||
output.println("# CompositionExclusions" + getFileSuffix(false));
|
||||
output.println(generateDateLine());
|
||||
@ -248,7 +248,7 @@ public class GenerateData implements UCD_Types {
|
||||
new CompLister(output, 4).print();
|
||||
|
||||
output.close();
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
|
||||
}
|
||||
|
||||
static String generateDateLine() {
|
||||
@ -538,12 +538,14 @@ public class GenerateData implements UCD_Types {
|
||||
addLine(sorted, "qc", "M", "Maybe");
|
||||
checkDuplicate(duplicates, accumulation, "M", "qc=Maybe");
|
||||
|
||||
addLine(sorted, "blk", "n/a", Utility.getUnskeleton("no block", true));
|
||||
|
||||
for (int i = 0; i < LIMIT_ENUM; ++i) {
|
||||
int type = i & 0xFF00;
|
||||
if (type == AGE) continue;
|
||||
if (i == (BINARY_PROPERTIES | CaseFoldTurkishI)) continue;
|
||||
if (i == (BINARY_PROPERTIES | Non_break)) continue;
|
||||
if (i == (BINARY_PROPERTIES | Case_Sensitive)) continue;
|
||||
|
||||
if (type == NUMERIC_TYPE) {
|
||||
//System.out.println("debug");
|
||||
@ -658,7 +660,8 @@ public class GenerateData implements UCD_Types {
|
||||
String filename = "PropertyAliases";
|
||||
String newFile = "DerivedData/" + filename + getFileSuffix(true);
|
||||
PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
String mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true));
|
||||
String[] batName = {""};
|
||||
String mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true), batName);
|
||||
|
||||
log.println("# " + filename + getFileSuffix(false));
|
||||
log.println(generateDateLine());
|
||||
@ -669,12 +672,12 @@ public class GenerateData implements UCD_Types {
|
||||
Utility.print(log, sorted, "\r\n", new MyBreaker(true));
|
||||
log.println();
|
||||
log.close();
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
|
||||
|
||||
filename = "PropertyValueAliases";
|
||||
newFile = "DerivedData/" + filename + getFileSuffix(true);
|
||||
log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true));
|
||||
mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true), batName);
|
||||
|
||||
log.println("# " + filename + getFileSuffix(false));
|
||||
log.println(generateDateLine());
|
||||
@ -685,12 +688,13 @@ public class GenerateData implements UCD_Types {
|
||||
Utility.print(log, sorted, "\r\n", new MyBreaker(false));
|
||||
log.println();
|
||||
log.close();
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
|
||||
|
||||
filename = "PropertyAliasSummary";
|
||||
newFile = "OtherData/" + filename + getFileSuffix(true);
|
||||
log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
mostRecent = generateBat("OtherData/", filename, getFileSuffix(true));
|
||||
mostRecent = generateBat("OtherData/", filename, getFileSuffix(true), batName);
|
||||
|
||||
log.println();
|
||||
log.println(HORIZONTAL_LINE);
|
||||
log.println();
|
||||
@ -702,7 +706,7 @@ public class GenerateData implements UCD_Types {
|
||||
Utility.print(log, accumulation, "\r\n", new MyBreaker(false));
|
||||
log.println();
|
||||
log.close();
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
|
||||
}
|
||||
|
||||
static void addLine(Set sorted, String f1, String f2, String f3) {
|
||||
@ -821,10 +825,10 @@ public class GenerateData implements UCD_Types {
|
||||
*/
|
||||
// static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1;
|
||||
|
||||
public static String generateBat(String directory, String fileRoot, String suffix) throws IOException {
|
||||
public static String generateBat(String directory, String fileRoot, String suffix, String[] batName) throws IOException {
|
||||
String mostRecent = Utility.getMostRecentUnicodeDataFile(fixFile(fileRoot), Default.ucd.getVersion(), true, true);
|
||||
if (mostRecent != null) {
|
||||
generateBatAux(directory + "DIFF/Diff_" + fileRoot + suffix,
|
||||
batName[0] = generateBatAux(directory + "DIFF/Diff_" + fileRoot + suffix,
|
||||
mostRecent, directory + fileRoot + suffix);
|
||||
} else {
|
||||
System.out.println("No previous version of: " + fileRoot + ".txt");
|
||||
@ -839,8 +843,10 @@ public class GenerateData implements UCD_Types {
|
||||
return mostRecent;
|
||||
}
|
||||
|
||||
public static void generateBatAux(String batName, String oldName, String newName) throws IOException {
|
||||
public static String generateBatAux(String batName, String oldName, String newName) throws IOException {
|
||||
String fullBatName = batName + ".bat";
|
||||
PrintWriter output = Utility.openPrintWriter(batName + ".bat", Utility.LATIN1_UNIX);
|
||||
|
||||
newName = Utility.getOutputName(newName);
|
||||
System.out.println("Writing BAT to compare " + oldName + " and " + newName);
|
||||
|
||||
@ -851,6 +857,7 @@ public class GenerateData implements UCD_Types {
|
||||
+ " "
|
||||
+ newFile.getCanonicalFile());
|
||||
output.close();
|
||||
return new File(Utility.getOutputName(fullBatName)).getCanonicalFile().toString();
|
||||
}
|
||||
|
||||
|
||||
@ -860,20 +867,25 @@ public class GenerateData implements UCD_Types {
|
||||
Default.setUCD();
|
||||
String newFile = directory + file + getFileSuffix(true);
|
||||
PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
String mostRecent = generateBat(directory, file, getFileSuffix(true));
|
||||
String[] batName = {""};
|
||||
String mostRecent = generateBat(directory, file, getFileSuffix(true), batName);
|
||||
|
||||
doHeader(file + getFileSuffix(false), output, headerChoice);
|
||||
int last = -1;
|
||||
for (int i = startEnum; i < endEnum; ++i) {
|
||||
UnicodeProperty up = UnifiedBinaryProperty.make(i, Default.ucd);
|
||||
if (up == null) continue;
|
||||
if (up.isDefaultValue()) continue;
|
||||
|
||||
/*
|
||||
if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
|
||||
|| i == (BINARY_PROPERTIES | Non_break)
|
||||
|| i == (BINARY_PROPERTIES | CaseFoldTurkishI)
|
||||
|| i == (HANGUL_SYLLABLE_TYPE | NA)
|
||||
|| i == (JOINING_TYPE | JT_U)
|
||||
|| i == (JOINING_GROUP | NO_SHAPING)
|
||||
) continue; // skip zero case
|
||||
*/
|
||||
/*if (skipSpecial == SKIP_SPECIAL
|
||||
&& i >= (BINARY_PROPERTIES | CompositionExclusion)
|
||||
&& i < (AGE + NEXT_ENUM)) continue;
|
||||
@ -920,8 +932,8 @@ public class GenerateData implements UCD_Types {
|
||||
output.flush();
|
||||
}
|
||||
output.close();
|
||||
System.out.println("HERE");
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
|
||||
//System.out.println("HERE");
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
@ -929,7 +941,8 @@ public class GenerateData implements UCD_Types {
|
||||
Default.setUCD();
|
||||
String newFile = directory + fileName + getFileSuffix(true);
|
||||
PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX);
|
||||
String mostRecent = generateBat(directory, fileName, getFileSuffix(true));
|
||||
String[] batName = {""};
|
||||
String mostRecent = generateBat(directory, fileName, getFileSuffix(true), batName);
|
||||
|
||||
String[] example = new String[256];
|
||||
|
||||
@ -959,7 +972,7 @@ public class GenerateData implements UCD_Types {
|
||||
log.println("# NFKD");
|
||||
log.println("# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)");
|
||||
log.println("#");
|
||||
log.println("# 2. For every assigned Unicode 3.1.0 code point X that is not specifically");
|
||||
log.println("# 2. For every code point X assigned in this version of Unicode that is not specifically");
|
||||
log.println("# listed in Part 1, the following invariants must be true for all conformant");
|
||||
log.println("# implementations:");
|
||||
log.println("#");
|
||||
@ -1038,7 +1051,7 @@ public class GenerateData implements UCD_Types {
|
||||
log.println("#");
|
||||
log.println("# END OF FILE");
|
||||
log.close();
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
|
||||
}
|
||||
|
||||
static void handleIdentical() throws IOException {
|
||||
@ -1130,7 +1143,8 @@ public class GenerateData implements UCD_Types {
|
||||
Default.setUCD();
|
||||
String newFile = directory + filename + getFileSuffix(true);
|
||||
PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
String mostRecent = generateBat(directory, filename, getFileSuffix(true));
|
||||
String[] batName = {""};
|
||||
String mostRecent = generateBat(directory, filename, getFileSuffix(true), batName);
|
||||
DiffPropertyLister dpl;
|
||||
UnicodeSet cummulative = new UnicodeSet();
|
||||
|
||||
@ -1203,7 +1217,7 @@ public class GenerateData implements UCD_Types {
|
||||
} finally {
|
||||
if (log != null) {
|
||||
log.close();
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1212,7 +1226,8 @@ public class GenerateData implements UCD_Types {
|
||||
Default.setUCD();
|
||||
String newFile = directory + filename + getFileSuffix(true);
|
||||
PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
String mostRecent = generateBat(directory, filename, getFileSuffix(true));
|
||||
String[] batName = {""};
|
||||
String mostRecent = generateBat(directory, filename, getFileSuffix(true), batName);
|
||||
try {
|
||||
log.println("# " + filename + getFileSuffix(false));
|
||||
log.println(generateDateLine());
|
||||
@ -1253,6 +1268,9 @@ public class GenerateData implements UCD_Types {
|
||||
log.println(HORIZONTAL_LINE);
|
||||
log.println();
|
||||
new DiffPropertyLister("3.1.0", "3.2.0", log).print();
|
||||
log.println(HORIZONTAL_LINE);
|
||||
log.println();
|
||||
new DiffPropertyLister("3.2.0", "4.0.0", log).print();
|
||||
/*
|
||||
printDiff("110", "200");
|
||||
UnicodeSet u11 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-1.1.txt", false);
|
||||
@ -1298,7 +1316,7 @@ public class GenerateData implements UCD_Types {
|
||||
} finally {
|
||||
if (log != null) {
|
||||
log.close();
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile));
|
||||
Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
|
||||
* $Date: 2002/10/05 01:28:58 $
|
||||
* $Revision: 1.10 $
|
||||
* $Date: 2003/02/25 23:38:22 $
|
||||
* $Revision: 1.11 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -73,7 +73,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
String property = line.substring(tabPos+1, tabPos2).trim();
|
||||
|
||||
String propertyValue = line.substring(tabPos2+1).trim();
|
||||
if (propertyValue.indexOf("U+") >= 0) propertyValue = fixHex.transliterate(propertyValue);
|
||||
if (propertyValue.indexOf("U+") >= 0) propertyValue = fromHexUnicode.transliterate(propertyValue);
|
||||
|
||||
HanInfo values = (HanInfo) properties.get(property);
|
||||
if (values == null) {
|
||||
@ -203,13 +203,15 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
return (radical << 8) + strokes;
|
||||
}
|
||||
|
||||
static Transliterator fixHex = Transliterator.getInstance("hex-any/unicode");
|
||||
static Transliterator fromHexUnicode = Transliterator.getInstance("hex-any/unicode");
|
||||
|
||||
static Transliterator toHexUnicode = Transliterator.getInstance("any-hex/unicode");
|
||||
|
||||
/*
|
||||
static String convertUPlus(String other) {
|
||||
int pos1 = other.indexOf("U+");
|
||||
if (pos1 < 0) return other;
|
||||
return fixHex(
|
||||
return fromHexUnicode(
|
||||
pos1 += 2;
|
||||
|
||||
StringBuffer result = new StringBuffer();
|
||||
@ -297,6 +299,47 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
|
||||
readFrequencyData(type);
|
||||
|
||||
Iterator it = fullPinyin.iterator();
|
||||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
if (!isValidPinyin2(s)) {
|
||||
err.println("?Valid Pinyin: " + s);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
it = unihanMap.keySet().iterator();
|
||||
Map badPinyin = new TreeMap();
|
||||
PrintWriter out2 = Utility.openPrintWriter("Raw_mapping.txt", Utility.UTF8_WINDOWS);
|
||||
try {
|
||||
while (it.hasNext()) {
|
||||
String keyChar = (String) it.next();
|
||||
String def = (String) unihanMap.get(keyChar);
|
||||
if (!isValidPinyin(def)) {
|
||||
String fixedDef = fixPinyin(def);
|
||||
err.println(Default.ucd.getCode(keyChar) + "\t" + keyChar + "\t" + fixedDef + "\t#" + def
|
||||
+ (fixedDef.equals(def) ? " FAIL" : ""));
|
||||
Utility.addToSet(badPinyin, def, keyChar);
|
||||
}
|
||||
// check both ways
|
||||
String digitDef = accentPinyin_digitPinyin.transliterate(def);
|
||||
String accentDef = digitPinyin_accentPinyin.transliterate(digitDef);
|
||||
if (!accentDef.equals(def)) {
|
||||
err.println("Failed Digit Pinyin: "
|
||||
+ Default.ucd.getCode(keyChar) + "\t" + keyChar + "\t"
|
||||
+ def + " => " + digitDef + " => " + accentDef);
|
||||
}
|
||||
|
||||
out2.println(toHexUnicode.transliterate(keyChar)
|
||||
+ "\tkMandarin\t" + digitDef.toUpperCase() + "\t# " + keyChar + ";\t" + def);
|
||||
}
|
||||
err.println();
|
||||
err.println("Summary of Bad syllables");
|
||||
Utility.printMapOfCollection(err, badPinyin, "\r\n", ":\t", ", ");
|
||||
} finally {
|
||||
out2.close();
|
||||
}
|
||||
|
||||
out = Utility.openPrintWriter(filename, Utility.UTF8_WINDOWS);
|
||||
out.println("# Start RAW data for converting CJK characters");
|
||||
/*
|
||||
@ -315,13 +358,12 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
*/
|
||||
|
||||
Set gotAlready = new HashSet();
|
||||
Iterator it = rankList.iterator();
|
||||
Set lenSet = new TreeSet();
|
||||
Set backSet = new TreeSet();
|
||||
int rank = 0;
|
||||
Map definitionCount = new HashMap();
|
||||
|
||||
|
||||
it = rankList.iterator();
|
||||
while (it.hasNext()) {
|
||||
String keyChar = (String) it.next();
|
||||
String def = (String) unihanMap.get(keyChar);
|
||||
@ -478,6 +520,578 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
}
|
||||
}
|
||||
|
||||
//http://fog.ccsf.cc.ca.us/~jliou/phonetic.htm
|
||||
// longer ones must be AFTER!
|
||||
// longer ones must be AFTER!
|
||||
static final String[] initialPinyin = {
|
||||
"",
|
||||
"b", "p", "m", "f",
|
||||
"d", "t", "n", "l",
|
||||
"z", "c", "s",
|
||||
"zh", "ch", "sh", "r",
|
||||
"j", "q", "x",
|
||||
"g", "k", "h",
|
||||
"y", "w"}; // added to make checking simpler
|
||||
|
||||
static final String[] finalPinyin = {
|
||||
"a", "ai", "ao", "an", "ang",
|
||||
"o", "ou", "ong",
|
||||
"e", "ei", "er", "en", "eng",
|
||||
"i", "ia", "iao", "ie", "iu", "ian", "in", "iang", "ing", "iong",
|
||||
"u", "ua", "uo", "uai", "ui", "uan", "un", "uang", "ueng",
|
||||
"ü", "üe", "üan", "ün"
|
||||
};
|
||||
// Don't bother with the following rules; just add w,y to initials
|
||||
// When “i” stands alone, a “y” will be added before it as “yi”.
|
||||
// If “i” is the first letter of the syllable it will be changed to “y”.
|
||||
// When “u” stands alone, a “w” will be added before it as “wu”.
|
||||
// If “u” is the first letter of the syllable it will be changed to “w”. e.g. “uang -> wang”.
|
||||
// When “ü” stands alone, a “y” will be added before it and “ü” will be changed to “u” as “yu”.
|
||||
// If “ü” is the first letter of the syllable, then the spelling will be changed to “yu”. e.g. “üan -> yuan”.
|
||||
// Note: The nasal final “ueng” never occurs after an initial but always forms a syllable by itself.
|
||||
// The “o” in “iou” is hidden, so it is written as “iu”. But don’t forget to pronounce it.
|
||||
// The “e” in “uei” is hidden, so it is written as “ui”. But don’t forget to pronounce it.
|
||||
|
||||
|
||||
public static final String[] pinyin_bopomofo = {
|
||||
"a", "\u311a",
|
||||
"ai", "\u311e",
|
||||
"an", "\u3122",
|
||||
"ang", "\u3124",
|
||||
"ao", "\u3120",
|
||||
"ba", "\u3105\u311a",
|
||||
"bai", "\u3105\u311e",
|
||||
"ban", "\u3105\u3122",
|
||||
"bang", "\u3105\u3124",
|
||||
"bao", "\u3105\u3120",
|
||||
"bei", "\u3105\u311f",
|
||||
"ben", "\u3105\u3123",
|
||||
"beng", "\u3105\u3125",
|
||||
"bi", "\u3105\u3127",
|
||||
"bian", "\u3105\u3127\u3122",
|
||||
"biao", "\u3105\u3127\u3120",
|
||||
"bie", "\u3105\u3127\u311d",
|
||||
"bin", "\u3105\u3127\u3123",
|
||||
"bing", "\u3105\u3127\u3125",
|
||||
"bo", "\u3105\u311b",
|
||||
"bu", "\u3105\u3128",
|
||||
"ca", "\u3118\u311a",
|
||||
"cai", "\u3118\u311e",
|
||||
"can", "\u3118\u3122",
|
||||
"cang", "\u3118\u3124",
|
||||
"cao", "\u3118\u3120",
|
||||
"ce", "\u3118",
|
||||
"cen", "\u3118\u3123",
|
||||
"ceng", "\u3118\u3125",
|
||||
"cha", "\u3114\u311a",
|
||||
"chai", "\u3114\u311e",
|
||||
"chan", "\u3114\u3122",
|
||||
"chang", "\u3114\u3124",
|
||||
"chao", "\u3114\u3120",
|
||||
"che", "\u3114\u311c",
|
||||
"chen", "\u3114\u3123",
|
||||
"cheng", "\u3114\u3125",
|
||||
"chi", "\u3114",
|
||||
"chong", "\u3114\u3121\u3125",
|
||||
"chou", "\u3114\u3121",
|
||||
"chu", "\u3114\u3128",
|
||||
//"chua", "XXX",
|
||||
"chuai", "\u3114\u3128\u311e",
|
||||
"chuan", "\u3114\u3128\u3122",
|
||||
"chuang", "\u3114\u3128\u3124",
|
||||
"chui", "\u3114\u3128\u311f",
|
||||
"chun", "\u3114\u3128\u3123",
|
||||
"chuo", "\u3114\u3128\u311b",
|
||||
"ci", "\u3118",
|
||||
"cong", "\u3118\u3128\u3125",
|
||||
"cou", "\u3118\u3121",
|
||||
"cu", "\u3118\u3128",
|
||||
"cuan", "\u3118\u3128\u3122",
|
||||
"cui", "\u3118\u3128\u311f",
|
||||
"cun", "\u3118\u3128\u3123",
|
||||
"cuo", "\u3118\u3128\u311b",
|
||||
"da", "\u3109\u311a",
|
||||
"dai", "\u3109\u311e",
|
||||
"dan", "\u3109\u3122",
|
||||
"dang", "\u3109\u3124",
|
||||
"dao", "\u3109\u3120",
|
||||
"de", "\u3109\u311c",
|
||||
"dei", "\u3109\u311f",
|
||||
"den", "\u3109\u3123",
|
||||
"deng", "\u3109\u3125",
|
||||
"di", "\u3109\u3127",
|
||||
"dia", "\u3109\u3127\u311a",
|
||||
"dian", "\u3109\u3127\u3122",
|
||||
"diao", "\u3109\u3127\u3120",
|
||||
"die", "\u3109\u3127\u311d",
|
||||
"ding", "\u3109\u3127\u3125",
|
||||
"diu", "\u3109\u3127\u3121",
|
||||
"dong", "\u3109\u3128\u3125",
|
||||
"dou", "\u3109\u3121",
|
||||
"du", "\u3109\u3128",
|
||||
"duan", "\u3109\u3128\u3122",
|
||||
"dui", "\u3109\u3128\u311f",
|
||||
"dun", "\u3109\u3128\u3123",
|
||||
"duo", "\u3109\u3128\u311b",
|
||||
"e", "\u311c",
|
||||
"ei", "\u311f",
|
||||
"en", "\u3123",
|
||||
"eng", "\u3125",
|
||||
"er", "\u3126",
|
||||
"fa", "\u3108\u311a",
|
||||
"fan", "\u3108\u3122",
|
||||
"fang", "\u3108\u3124",
|
||||
"fei", "\u3108\u311f",
|
||||
"fen", "\u3108\u3123",
|
||||
"feng", "\u3108\u3125",
|
||||
"fo", "\u3108\u311b",
|
||||
"fou", "\u3108\u3121",
|
||||
"fu", "\u3108\u3128",
|
||||
"ga", "\u310d\u311a",
|
||||
"gai", "\u310d\u311e",
|
||||
"gan", "\u310d\u3122",
|
||||
"gang", "\u310d\u3124",
|
||||
"gao", "\u310d\u3120",
|
||||
"ge", "\u310d\u311c",
|
||||
"gei", "\u310d\u311f",
|
||||
"gen", "\u310d\u3123",
|
||||
"geng", "\u310d\u3125",
|
||||
"gong", "\u310d\u3128\u3125",
|
||||
"gou", "\u310d\u3121",
|
||||
"gu", "\u310d\u3128",
|
||||
"gua", "\u310d\u3128\u311a",
|
||||
"guai", "\u310d\u3128\u311e",
|
||||
"guan", "\u310d\u3128\u3122",
|
||||
"guang", "\u310d\u3128\u3124",
|
||||
"gui", "\u310d\u3128\u311f",
|
||||
"gun", "\u310d\u3128\u3123",
|
||||
"guo", "\u310d\u3128\u311b",
|
||||
"ha", "\u310f\u311a",
|
||||
"hai", "\u310f\u311e",
|
||||
"han", "\u310f\u3122",
|
||||
"hang", "\u310f\u3124",
|
||||
"hao", "\u310f\u3120",
|
||||
"he", "\u310f\u311c",
|
||||
"hei", "\u310f\u311f",
|
||||
"hen", "\u310f\u3123",
|
||||
"heng", "\u310f\u3125",
|
||||
"hm", "\u310f\u3107",
|
||||
"hng", "\u310f\u312b", // 'dialect of n'
|
||||
"hong", "\u310f\u3128\u3125",
|
||||
"hou", "\u310f\u3121",
|
||||
"hu", "\u310f\u3128",
|
||||
"hua", "\u310f\u3128\u311a",
|
||||
"huai", "\u310f\u3128\u311e",
|
||||
"huan", "\u310f\u3128\u3122",
|
||||
"huang", "\u310f\u3128\u3124",
|
||||
"hui", "\u310f\u3128\u311f",
|
||||
"hun", "\u310f\u3128\u3123",
|
||||
"huo", "\u310f\u3128\u311b",
|
||||
"ji", "\u3110\u3127",
|
||||
"jia", "\u3110\u3127\u311a",
|
||||
"jian", "\u3110\u3127\u3122",
|
||||
"jiang", "\u3110\u3127\u3124",
|
||||
"jiao", "\u3110\u3127\u3120",
|
||||
"jie", "\u3110\u3127\u311d",
|
||||
"jin", "\u3110\u3127\u3123",
|
||||
"jing", "\u3110\u3127\u3125",
|
||||
"jiong", "\u3110\u3129\u3125",
|
||||
"jiu", "\u3110\u3127\u3121",
|
||||
"ju", "\u3110\u3129",
|
||||
"juan", "\u3110\u3129\u3122",
|
||||
"jue", "\u3110\u3129\u311d",
|
||||
"jun", "\u3110\u3129\u3123",
|
||||
"ka", "\u310e\u311a",
|
||||
"kai", "\u310e\u311e",
|
||||
"kan", "\u310e\u3122",
|
||||
"kang", "\u310e\u3124",
|
||||
"kao", "\u310e\u3120",
|
||||
"ke", "\u310e\u311c",
|
||||
"kei", "\u310e\u311f",
|
||||
"ken", "\u310e\u3123",
|
||||
"keng", "\u310e\u3125",
|
||||
"kong", "\u310e\u3128\u3125",
|
||||
"kou", "\u310e\u3121",
|
||||
"ku", "\u310e\u3128",
|
||||
"kua", "\u310e\u3128\u311a",
|
||||
"kuai", "\u310e\u3128\u311e",
|
||||
"kuan", "\u310e\u3128\u3122",
|
||||
"kuang", "\u310e\u3128\u3124",
|
||||
"kui", "\u310e\u3128\u311f",
|
||||
"kun", "\u310e\u3128\u3123",
|
||||
"kuo", "\u310e\u3128\u311b",
|
||||
"la", "\u310c\u311a",
|
||||
"lai", "\u310c\u311e",
|
||||
"lan", "\u310c\u3122",
|
||||
"lang", "\u310c\u3124",
|
||||
"lao", "\u310c\u3120",
|
||||
"le", "\u310c\u311c",
|
||||
"lei", "\u310c\u311f",
|
||||
"leng", "\u310c\u3125",
|
||||
"li", "\u310c\u3127",
|
||||
"lia", "\u310c\u3127\u311a",
|
||||
"lian", "\u310c\u3127\u3122",
|
||||
"liang", "\u310c\u3127\u3124",
|
||||
"liao", "\u310c\u3127\u3120",
|
||||
"lie", "\u310c\u3127\u311d",
|
||||
"lin", "\u310c\u3127\u3123",
|
||||
"ling", "\u310c\u3127\u3125",
|
||||
"liu", "\u310c\u3127\u3121",
|
||||
"lo", "\u310c\u311b",
|
||||
"long", "\u310c\u3128\u3125",
|
||||
"lou", "\u310c\u3121",
|
||||
"lu", "\u310c\u3128",
|
||||
"lü", "\u310c\u3129",
|
||||
"luan", "\u310c\u3128\u3122",
|
||||
"lüe", "\u310c\u3129\u311d",
|
||||
"lun", "\u310c\u3128\u3123",
|
||||
"luo", "\u310c\u3128\u311b",
|
||||
"m", "\u3107",
|
||||
"ma", "\u3107\u311a",
|
||||
"mai", "\u3107\u311e",
|
||||
"man", "\u3107\u3122",
|
||||
"mang", "\u3107\u3124",
|
||||
"mao", "\u3107\u3120",
|
||||
"me", "\u3107\u311c",
|
||||
"mei", "\u3107\u311f",
|
||||
"men", "\u3107\u3123",
|
||||
"meng", "\u3107\u3125",
|
||||
"mi", "\u3107\u3127",
|
||||
"mian", "\u3107\u3127\u3122",
|
||||
"miao", "\u3107\u3127\u3120",
|
||||
"mie", "\u3107\u3127\u311d",
|
||||
"min", "\u3107\u3127\u3123",
|
||||
"ming", "\u3107\u3127\u3125",
|
||||
"miu", "\u3107\u3127\u3121",
|
||||
"mo", "\u3107\u311b",
|
||||
"mou", "\u3107\u3121",
|
||||
"mu", "\u3107\u3128",
|
||||
"n", "\u310b",
|
||||
"na", "\u310b\u311a",
|
||||
"nai", "\u310b\u311e",
|
||||
"nan", "\u310b\u3122",
|
||||
"nang", "\u310b\u3124",
|
||||
"nao", "\u310b\u3120",
|
||||
"ne", "\u310b\u311c",
|
||||
"nei", "\u310b\u311f",
|
||||
"nen", "\u310b\u3123",
|
||||
"neng", "\u310b\u3125",
|
||||
"ng", "\u312b",
|
||||
"ni", "\u310b\u3127",
|
||||
"nian", "\u310b\u3127\u3122",
|
||||
"niang", "\u310b\u3127\u3124",
|
||||
"niao", "\u310b\u3127\u3120",
|
||||
"nie", "\u310b\u3127\u311d",
|
||||
"nin", "\u310b\u3127\u3123",
|
||||
"ning", "\u310b\u3127\u3125",
|
||||
"niu", "\u310b\u3127\u3121",
|
||||
"nong", "\u310b\u3128\u3125",
|
||||
"nou", "\u310b\u3121",
|
||||
"nu", "\u310b\u3128",
|
||||
"nü", "\u310b\u3129",
|
||||
"nuan", "\u310b\u3128\u3122",
|
||||
"nüe", "\u310b\u3129\u311d",
|
||||
"nuo", "\u310b\u3128\u311b",
|
||||
"o", "\u311b",
|
||||
"ou", "\u3121",
|
||||
"pa", "\u3106\u311a",
|
||||
"pai", "\u3106\u311e",
|
||||
"pan", "\u3106\u3122",
|
||||
"pang", "\u3106\u3124",
|
||||
"pao", "\u3106\u3120",
|
||||
"pei", "\u3106\u311f",
|
||||
"pen", "\u3106\u3123",
|
||||
"peng", "\u3106\u3125",
|
||||
"pi", "\u3106\u3127",
|
||||
"pian", "\u3106\u3127\u3122",
|
||||
"piao", "\u3106\u3127\u3120",
|
||||
"pie", "\u3106\u3127\u311d",
|
||||
"pin", "\u3106\u3127\u3123",
|
||||
"ping", "\u3106\u3127\u3125",
|
||||
"po", "\u3106\u311b",
|
||||
"pou", "\u3106\u3121",
|
||||
"pu", "\u3106\u3128",
|
||||
"qi", "\u3111",
|
||||
"qia", "\u3111\u3127\u311a",
|
||||
"qian", "\u3111\u3127\u3122",
|
||||
"qiang", "\u3111\u3127\u3124",
|
||||
"qiao", "\u3111\u3127\u3120",
|
||||
"qie", "\u3111\u3127\u311d",
|
||||
"qin", "\u3111\u3127\u3123",
|
||||
"qing", "\u3111\u3127\u3125",
|
||||
"qiong", "\u3111\u3129\u3125",
|
||||
"qiu", "\u3111\u3129\u3121",
|
||||
"qu", "\u3111\u3129",
|
||||
"quan", "\u3111\u3129\u3122",
|
||||
"que", "\u3111\u3129\u311d",
|
||||
"qun", "\u3111\u3129\u3123",
|
||||
"ran", "\u3116\u3122",
|
||||
"rang", "\u3116\u3124",
|
||||
"rao", "\u3116\u3120",
|
||||
"re", "\u3116\u311c",
|
||||
"ren", "\u3116\u3123",
|
||||
"reng", "\u3116\u3125",
|
||||
"ri", "\u3116",
|
||||
"rong", "\u3116\u3128\u3125",
|
||||
"rou", "\u3116\u3121",
|
||||
"ru", "\u3116\u3128",
|
||||
"ruan", "\u3116\u3128\u3122",
|
||||
"rui", "\u3116\u3128\u311f",
|
||||
"run", "\u3116\u3128\u3123",
|
||||
"ruo", "\u3116\u3128\u311b",
|
||||
"sa", "\u3119\u311a",
|
||||
"sai", "\u3119\u311e",
|
||||
"san", "\u3119\u3122",
|
||||
"sang", "\u3119\u3124",
|
||||
"sao", "\u3119\u3120",
|
||||
"se", "\u3119\u311c",
|
||||
"sen", "\u3119\u3123",
|
||||
"seng", "\u3119\u3125",
|
||||
"sha", "\u3115\u311a",
|
||||
"shai", "\u3115\u311e",
|
||||
"shan", "\u3115\u3122",
|
||||
"shang", "\u3115\u3124",
|
||||
"shao", "\u3115\u3120",
|
||||
"she", "\u3115\u311c",
|
||||
"shei", "\u3115\u311f",
|
||||
"shen", "\u3115\u3123",
|
||||
"sheng", "\u3115\u3125",
|
||||
"shi", "\u3115",
|
||||
"shou", "\u3115\u3121",
|
||||
"shu", "\u3115\u3128",
|
||||
"shua", "\u3115\u3128\u311a",
|
||||
"shuai", "\u3115\u3128\u311e",
|
||||
"shuan", "\u3115\u3128\u3122",
|
||||
"shuang", "\u3115\u3128\u3124",
|
||||
"shui", "\u3115\u3128\u311f",
|
||||
"shun", "\u3115\u3128\u3123",
|
||||
"shuo", "\u3115\u3128\u311b",
|
||||
"si", "\u3119",
|
||||
"song", "\u3119\u3128\u3125",
|
||||
"sou", "\u3119\u3121",
|
||||
"su", "\u3119\u3128",
|
||||
"suan", "\u3119\u3128\u3122",
|
||||
"sui", "\u3119\u3128\u311f",
|
||||
"sun", "\u3119\u3128\u3123",
|
||||
"suo", "\u3119\u3128\u311b",
|
||||
"ta", "\u310a\u311a",
|
||||
"tai", "\u310a\u311e",
|
||||
"tan", "\u310a\u3122",
|
||||
"tang", "\u310a\u3124",
|
||||
"tao", "\u310a\u3120",
|
||||
"te", "\u310a\u311c",
|
||||
"teng", "\u310a\u3125",
|
||||
"ti", "\u310a\u3127",
|
||||
"tian", "\u310a\u3127\u3122",
|
||||
"tiao", "\u310a\u3127\u3120",
|
||||
"tie", "\u310a\u3127\u311d",
|
||||
"ting", "\u310a\u3127\u3125",
|
||||
"tong", "\u310a\u3128\u3125",
|
||||
"tou", "\u310a\u3121",
|
||||
"tu", "\u310a\u3128",
|
||||
"tuan", "\u310a\u3128\u3122",
|
||||
"tui", "\u310a\u3128\u311f",
|
||||
"tun", "\u310a\u3128\u3123",
|
||||
"tuo", "\u310a\u3128\u311b",
|
||||
"wa", "\u3128\u311a",
|
||||
"wai", "\u3128\u311e",
|
||||
"wan", "\u3128\u3122",
|
||||
"wang", "\u3128\u3124",
|
||||
"wei", "\u3128\u311f",
|
||||
"wen", "\u3128\u3123",
|
||||
"weng", "\u3128\u3125",
|
||||
"wo", "\u3128\u311b",
|
||||
"wu", "\u3128",
|
||||
"xi", "\u3112\u3127",
|
||||
"xia", "\u3112\u3127\u311a",
|
||||
"xian", "\u3112\u3127\u3122",
|
||||
"xiang", "\u3112\u3127\u3124",
|
||||
"xiao", "\u3112\u3127\u3120",
|
||||
"xie", "\u3112\u3127\u311d",
|
||||
"xin", "\u3112\u3127\u3123",
|
||||
"xing", "\u3112\u3127\u3125",
|
||||
"xiong", "\u3112\u3129\u3125",
|
||||
"xiu", "\u3112\u3127\u3121",
|
||||
"xu", "\u3112\u3129",
|
||||
"xuan", "\u3112\u3129\u3122",
|
||||
"xue", "\u3112\u3129\u311d",
|
||||
"xun", "\u3112\u3129\u3123",
|
||||
"ya", "\u3127\u311a",
|
||||
"yai", "\u3127\u311e", // not in xinhua zidian index, but listed as alternate pronunciation
|
||||
"yan", "\u3127\u3122",
|
||||
"yang", "\u3127\u3124",
|
||||
"yao", "\u3127\u3120",
|
||||
"ye", "\u3127\u311d",
|
||||
"yi", "\u3127",
|
||||
"yin", "\u3127\u3123",
|
||||
"ying", "\u3127\u3125",
|
||||
"yo", "\u3127\u311b",
|
||||
"yong", "\u3129\u3125",
|
||||
"you", "\u3127\u3121",
|
||||
"yu", "\u3129",
|
||||
"yuan", "\u3129\u3122",
|
||||
"yue", "\u3129\u311d",
|
||||
"yun", "\u3129\u3123",
|
||||
"za", "\u3117\u311a",
|
||||
"zai", "\u3117\u311e",
|
||||
"zan", "\u3117\u3122",
|
||||
"zang", "\u3117\u3124",
|
||||
"zao", "\u3117\u3120",
|
||||
"ze", "\u3117",
|
||||
"zei", "\u3117\u311f",
|
||||
"zen", "\u3117\u3123",
|
||||
"zeng", "\u3117\u3125",
|
||||
"zha", "\u3113\u311a",
|
||||
"zhai", "\u3113\u311e",
|
||||
"zhan", "\u3113\u3122",
|
||||
"zhang", "\u3113\u3124",
|
||||
"zhao", "\u3113\u3120",
|
||||
"zhe", "\u3113\u311d",
|
||||
"zhei", "\u3113\u311f",
|
||||
"zhen", "\u3113\u3123",
|
||||
"zheng", "\u3113\u3125",
|
||||
"zhi", "\u3113",
|
||||
"zhong", "\u3113\u3128\u3125",
|
||||
"zhou", "\u3113\u3121",
|
||||
"zhu", "\u3113\u3128",
|
||||
"zhua", "\u3113\u3128\u311a",
|
||||
"zhuai", "\u3113\u3128\u311e",
|
||||
"zhuan", "\u3113\u3128\u3122",
|
||||
"zhuang", "\u3113\u3128\u3124",
|
||||
"zhui", "\u3113\u3128\u311f",
|
||||
"zhun", "\u3113\u3128\u3123",
|
||||
"zhuo", "\u3113\u3128\u311b",
|
||||
"zi", "\u3117",
|
||||
"zong", "\u3117\u3128\u3125",
|
||||
"zou", "\u3117\u3121",
|
||||
"zu", "\u3117\u3128",
|
||||
"zuan", "\u3117\u3128\u3122",
|
||||
"zui", "\u3117\u3128\u311f",
|
||||
"zun", "\u3117\u3128\u3123",
|
||||
"zuo", "\u3117\u3128\u311b",
|
||||
};
|
||||
|
||||
/** The toneless pinyin syllables of pinyin_bopomofo (its even-indexed entries), kept sorted. */
static final Set fullPinyin = new TreeSet();
static {
    // The table alternates pinyin, bopomofo; only the pinyin half is collected.
    for (int i = 0; i < pinyin_bopomofo.length; i += 2) {
        fullPinyin.add(pinyin_bopomofo[i]);
    }
}
|
||||
|
||||
static boolean isValidPinyin(String s) {
|
||||
s = dropTones.transliterate(s);
|
||||
if (fullPinyin.contains(s)) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static boolean isValidPinyin2(String s) {
|
||||
s = dropTones.transliterate(s);
|
||||
for (int i = initialPinyin.length-1; i >= 0; --i) {
|
||||
if (s.startsWith(initialPinyin[i])) {
|
||||
String end = s.substring(initialPinyin[i].length());
|
||||
for (int j = finalPinyin.length-1; j >= 0; --j) {
|
||||
if (end.equals(finalPinyin[j])) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
U+347C · liù #lyuè
|
||||
U+3500 · lüè #lvè
|
||||
U+3527 · liù #lyù
|
||||
U+3729 · ào #àu
|
||||
U+380E · jí #jjí
|
||||
U+3825 · l· #lv·
|
||||
U+3A3C · lüè #luè
|
||||
U+3B5A · li· #ly· *** lü?
|
||||
U+3CB6 · l· #lv·
|
||||
U+3D56 · niù #nyù *** nü?
|
||||
U+3D88 · li·ng #li·ng
|
||||
U+3EF2 · li· #ly·*** lü?
|
||||
U+3F94 · li· #ly·*** lü?
|
||||
U+4071 · ào #àu
|
||||
U+40AE · liù #lyuè *** lüe?
|
||||
U+430E · liù #lyuè *** lüe?
|
||||
U+451E · liù #lyù *** lü?
|
||||
U+4588 · nüè #nuè
|
||||
U+458B · nüè #nuè
|
||||
U+45A1 · niù #nyù *** nü?
|
||||
U+4610 · niù #nyù *** nü?
|
||||
U+46BC · niù #nyù *** nü?
|
||||
U+46DA · liù #lyuè *** lüe?
|
||||
U+4896 · liù #lyù *** lü?
|
||||
U+4923 · liù #lyuè *** lüe?
|
||||
U+4968 · liù #lyù *** lü?
|
||||
U+4A0B · niù #nyuè *** nüe?
|
||||
U+4AC4 · chuò #chuà
|
||||
U+4D08 · ·o #·u
|
||||
U+4D8A · niù #nyù *** nü?
|
||||
U+51CA · qíng #qýng
|
||||
U+51D6 · zhu·n #zhu·n *** this is probably zh·n
|
||||
U+5481 · gàn #gèm
|
||||
U+5838 · féng #fúng
|
||||
U+639F · lü· #lu· *** this pronunciation surprises me, but I don't know...
|
||||
U+66D5 · yàn #yiàn
|
||||
U+6B3B · chu· #chu· *** chua _is_ ok after all, my table missed an entry
|
||||
U+6B56 · chu· #chu· *** chua
|
||||
U+6C7C · ni· #ni·u
|
||||
U+6E6D · qiú #qióu
|
||||
U+6F71 · y· #yi·
|
||||
U+7493 · xiù #xiòu
|
||||
U+7607 · zh·ng #zh·ng *** I suspect zh·ng
|
||||
U+7674 · luán #lüán
|
||||
U+7867 · y·ng #i·ng
|
||||
U+7878 · nüè #nuè
|
||||
*/
|
||||
|
||||
static Transliterator fixTypos = Transliterator.createFromRules("fix_typos",
|
||||
"$cons=[bcdfghjklmnpqrstvwxyz];"
|
||||
+"$nlet=[^[:Letter:][:Mark:]];"
|
||||
+"$cons{iou}$nlet > iu;"
|
||||
+"$cons{em}$nlet > an;"
|
||||
+"$cons{uen}$nlet > ueng;"
|
||||
+"$cons{ve}$nlet > üe;"
|
||||
+"$cons{v}$nlet > ü;"
|
||||
+"$cons{yue}$nlet > iu;"
|
||||
+"$cons{yng}$nlet > ing;"
|
||||
+"$cons{yu}$nlet > iu;"
|
||||
//+"$cons{ue} > üe;"
|
||||
+"jj > j;"
|
||||
//+"$nlet{ng}$nlet > eng;"
|
||||
//+"$nlet{n}$nlet > en;"
|
||||
//+"$nlet{m}$nlet > en;"
|
||||
+"$nlet{au}$nlet > ao;"
|
||||
|
||||
// new fixes
|
||||
+"zhueng}$nlet > zhong;"
|
||||
+"zhuen}$nlet > zhuan;"
|
||||
+"lue > lüe;"
|
||||
+"liong > liang;"
|
||||
+"nue > nüe;"
|
||||
+"chua > chuo;"
|
||||
+"yian > yan;"
|
||||
+"yie > ye;"
|
||||
+"lüan > luan;"
|
||||
+"iong > yong;"
|
||||
, Transliterator.FORWARD);
|
||||
|
||||
|
||||
static String fixPinyin(String s) {
|
||||
String original = s;
|
||||
//err.println("Source: " + s);
|
||||
s = accentPinyin_digitPinyin.transliterate(s);
|
||||
//err.println("Digit: " + s);
|
||||
s = fixTypos.transliterate(s);
|
||||
//err.println("fixed: " + s);
|
||||
s = digitPinyin_accentPinyin.transliterate(s);
|
||||
//err.println("Result: " + s);
|
||||
if (isValidPinyin(s)) return s;
|
||||
return original;
|
||||
}
|
||||
|
||||
static PrintWriter log;
|
||||
static PrintWriter out;
|
||||
static PrintWriter err;
|
||||
@ -734,7 +1348,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
if (type == JAPANESE) {
|
||||
processEdict(word, definition, line);
|
||||
} else {
|
||||
definition = convertPinyin.transliterate(definition);
|
||||
definition = digitToPinyin(definition, line);
|
||||
//definition = Utility.replace(definition, " ", "\\ ");
|
||||
addCheck(word, definition, line);
|
||||
}
|
||||
@ -755,20 +1369,37 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
int counter = 0;
|
||||
String[] pieces = new String[50];
|
||||
String line = "";
|
||||
boolean noOverrideFailure = true;
|
||||
try {
|
||||
while (true) {
|
||||
line = Utility.readDataLine(br);
|
||||
if (line == null) break;
|
||||
if (line.length() == 0) continue;
|
||||
Utility.dot(counter++);
|
||||
//System.out.println(line);
|
||||
|
||||
// skip code
|
||||
line=line.toLowerCase();
|
||||
|
||||
int wordStart = line.indexOf('\t') + 1;
|
||||
int wordEnd = line.indexOf('\t', wordStart);
|
||||
String word = line.substring(wordStart, wordEnd);
|
||||
String definition = line.substring(wordEnd+1);
|
||||
addCheck(word, definition, line);
|
||||
overrideSet.add(word);
|
||||
String definition = fixPinyin(line.substring(wordEnd+1));
|
||||
String old = (String) unihanMap.get(word);
|
||||
if (old != null) {
|
||||
if (!old.equals(definition)) {
|
||||
if (noOverrideFailure) {
|
||||
System.out.println("Overriding Failure");
|
||||
noOverrideFailure = false;
|
||||
}
|
||||
err.println("Overriding Failure: " + word
|
||||
+ "\t" + old + " " + toHexUnicode.transliterate(old)
|
||||
+ "\t" + definition + " " + toHexUnicode.transliterate(definition));
|
||||
}
|
||||
} else {
|
||||
addCheck(word, definition, line);
|
||||
overrideSet.add(word);
|
||||
}
|
||||
}
|
||||
br.close();
|
||||
} catch (Exception e) {
|
||||
@ -776,6 +1407,81 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
@Unihan Data
|
||||
|
||||
Bad pinyin data: \u4E7F ? LE
|
||||
\u7684 ? de, de, dí, dì
|
||||
*/
|
||||
|
||||
static void fixChineseOverrides() throws IOException {
|
||||
|
||||
log = Utility.openPrintWriter("Transliterate_log.txt", Utility.UTF8_WINDOWS);
|
||||
out = Utility.openPrintWriter("new_Chinese_override.txt", Utility.UTF8_WINDOWS);
|
||||
try {
|
||||
|
||||
String fname = "fixed_Chinese_transliterate_log.txt";
|
||||
|
||||
int counter = 0;
|
||||
String line = "";
|
||||
String pinyinPrefix = "Bad pinyin data: ";
|
||||
|
||||
System.out.println("Reading " + fname);
|
||||
BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, Utility.UTF8);
|
||||
try {
|
||||
while (true) {
|
||||
line = Utility.readDataLine(br);
|
||||
if (line == null) break;
|
||||
if (line.length() == 0) continue;
|
||||
if (line.charAt(0) == 0xFEFF) {
|
||||
line = line.substring(1); // remove BOM
|
||||
if (line.length() == 0) continue;
|
||||
}
|
||||
Utility.dot(counter++);
|
||||
|
||||
|
||||
if (line.charAt(0) == '@') continue;
|
||||
if (line.startsWith(pinyinPrefix)) {
|
||||
line = line.substring(pinyinPrefix.length());
|
||||
}
|
||||
line = line.toLowerCase();
|
||||
|
||||
//System.out.println(Default.ucd.getCode(line));
|
||||
// skip code
|
||||
int wordStart = line.indexOf('\t') + 1;
|
||||
int wordEnd = line.indexOf('\t', wordStart);
|
||||
String word = line.substring(wordStart, wordEnd).trim();
|
||||
|
||||
int defStart = wordEnd+1;
|
||||
int defEnd = line.indexOf(',', defStart);
|
||||
if (defEnd < 0) defEnd = line.length();
|
||||
|
||||
String definition = fixCircumflex.transliterate(line.substring(defStart, defEnd).trim());
|
||||
|
||||
String notones = dropTones.transliterate(definition);
|
||||
if (definition.equals(notones)) {
|
||||
definition = digitPinyin_accentPinyin.transliterate(definition + "1");
|
||||
if (definition == null) {
|
||||
System.out.println("Huh? " + notones);
|
||||
}
|
||||
log.println("Fixing: " + notones + " => " + definition + "; " + line);
|
||||
}
|
||||
|
||||
out.println(hex.transliterate(word) + "\t" + word + "\t" + definition);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e);
|
||||
} finally {
|
||||
br.close();
|
||||
}
|
||||
} finally {
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Words registered through overrideSet.add(word) while the override data is read.
static Set overrideSet = new HashSet();
|
||||
|
||||
static void processEdict(String word, String definition, String line) {
|
||||
@ -997,7 +1703,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
|
||||
static void readCDICT() throws IOException {
|
||||
System.out.println("Reading cdict.txt");
|
||||
BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\cdict.txt", Utility.UTF8);
|
||||
String fname = "cdict.txt";
|
||||
|
||||
BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, Utility.UTF8);
|
||||
int counter = 0;
|
||||
String[] pieces = new String[50];
|
||||
String line = "";
|
||||
@ -1026,7 +1734,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
}
|
||||
for (int i = 0; i < len; ++i) {
|
||||
String chr = word.substring(i, i+1);
|
||||
String piece = convertPinyin.transliterate(pieces[i]);
|
||||
|
||||
String piece = digitToPinyin(pieces[i], line);
|
||||
|
||||
Map oldMap = (Map) cdict.get(chr);
|
||||
if (oldMap == null) {
|
||||
oldMap = new TreeMap();
|
||||
@ -1069,6 +1779,11 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
}
|
||||
}
|
||||
|
||||
static String digitToPinyin(String source, String line) {
|
||||
if (source.indexOf('5') >= 0) log.println("Pinyin Tone5 at: " + line);
|
||||
return digitPinyin_accentPinyin.transliterate(source);
|
||||
}
|
||||
|
||||
static Map cdict = new TreeMap();
|
||||
static Map simplifiedToTraditional = new HashMap();
|
||||
static Map traditionalToSimplified = new HashMap();
|
||||
@ -1098,7 +1813,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
String property = line.substring(tabPos+1, tabPos2).trim();
|
||||
|
||||
String propertyValue = line.substring(tabPos2+1).trim();
|
||||
if (propertyValue.indexOf("U+") >= 0) propertyValue = fixHex.transliterate(propertyValue);
|
||||
if (propertyValue.indexOf("U+") >= 0) propertyValue = fromHexUnicode.transliterate(propertyValue);
|
||||
|
||||
// gather traditional mapping
|
||||
if (property.equals("kTraditionalVariant")) {
|
||||
@ -1160,7 +1875,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
}
|
||||
definition = definition.substring(0, end3);
|
||||
|
||||
definition = convertPinyin.transliterate(definition);
|
||||
definition = digitToPinyin(definition, line);
|
||||
}
|
||||
if (type == DEFINITION) {
|
||||
definition = removeMatched(definition,'(', ')', line);
|
||||
@ -1220,7 +1935,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
return source;
|
||||
}
|
||||
|
||||
static Map unihanMap = new HashMap();
|
||||
static Map unihanMap = new TreeMap(); // could be hashmap
|
||||
static Map duplicates = new TreeMap();
|
||||
|
||||
static boolean unihanNonSingular = false;
|
||||
@ -1274,14 +1989,26 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
}
|
||||
}
|
||||
|
||||
static Transliterator convertPinyin;
|
||||
static Transliterator digitPinyin_accentPinyin;
|
||||
|
||||
static Transliterator accentPinyin_digitPinyin = Transliterator.createFromRules("accentPinyin_digitPinyin",
|
||||
"::NFD; "
|
||||
+ " ([\u0304\u0301\u030C\u0300\u0306]) ([[:Mark:][:Letter:]]+) > $2 | $1;"
|
||||
+ "\u0304 > '1'; \u0301 > '2'; \u030C > '3'; \u0300 > '4'; \u0306 > '3';"
|
||||
+ " ::NFC;", Transliterator.FORWARD);
|
||||
|
||||
static Transliterator fixCircumflex = Transliterator.createFromRules("fix_circumflex",
|
||||
"::NFD; \u0306 > \u030C; ::NFC;", Transliterator.FORWARD);
|
||||
|
||||
static Transliterator dropTones = Transliterator.createFromRules("drop_tones",
|
||||
"::NFD; \u0304 > ; \u0301 > ; \u030C > ; \u0300 > ; \u0306 > ; ::NFC;", Transliterator.FORWARD);
|
||||
|
||||
static {
|
||||
String dt = "1 > ;\n"
|
||||
String dt = "1 > \u0304;\n"
|
||||
+ "2 <> \u0301;\n"
|
||||
+ "3 <> \u0306;\n"
|
||||
+ "3 <> \u030C;\n"
|
||||
+ "4 <> \u0300;\n"
|
||||
+ "5 <> \u0304;";
|
||||
+ "5 <> ;";
|
||||
|
||||
String dp = "# syllable is ...vowel+ consonant* number\n"
|
||||
+ "# 'a', 'e' are the preferred bases\n"
|
||||
@ -1301,8 +2028,8 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
System.out.println(at.transliterate("a1a2a3a4a5"));
|
||||
DummyFactory.add(at.getID(), at);
|
||||
|
||||
convertPinyin = Transliterator.createFromRules("digit-pinyin", dp, Transliterator.FORWARD);
|
||||
System.out.println(convertPinyin.transliterate("an2 aon2 oan2 ion2 oin2 uin2 iun2"));
|
||||
digitPinyin_accentPinyin = Transliterator.createFromRules("digit-pinyin", dp, Transliterator.FORWARD);
|
||||
System.out.println(digitPinyin_accentPinyin.transliterate("an2 aon2 oan2 ion2 oin2 uin2 iun2"));
|
||||
|
||||
}
|
||||
/*
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
|
||||
* $Date: 2002/10/05 01:28:58 $
|
||||
* $Revision: 1.25 $
|
||||
* $Date: 2003/02/25 23:38:22 $
|
||||
* $Revision: 1.26 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -37,7 +37,10 @@ public final class Main implements UCD_Types {
|
||||
"PropList",
|
||||
"Scripts",
|
||||
"SpecialCasing",
|
||||
"HangulSyllableType",
|
||||
"DerivedAge",
|
||||
"StandardizedVariants",
|
||||
//"HangulSyllable",
|
||||
//"OtherDerivedProperties",
|
||||
};
|
||||
|
||||
@ -71,6 +74,10 @@ public final class Main implements UCD_Types {
|
||||
else if (arg.equalsIgnoreCase("pinYinTransliterator")) GenerateHanTransliterator.main(2);
|
||||
else if (arg.equalsIgnoreCase("hanproperties")) GenerateHanTransliterator.readUnihan();
|
||||
|
||||
else if (arg.equalsIgnoreCase("fixChineseOverrides")) GenerateHanTransliterator.fixChineseOverrides();
|
||||
|
||||
|
||||
|
||||
else if (arg.equalsIgnoreCase("compareBlueberry")) VerifyUCD.compareBlueberry();
|
||||
|
||||
else if (arg.equalsIgnoreCase("testenum")) SampleEnum.test();
|
||||
@ -115,6 +122,7 @@ public final class Main implements UCD_Types {
|
||||
else if (arg.equalsIgnoreCase("JavascriptProperties")) WriteJavaScriptInfo.assigned();
|
||||
else if (arg.equalsIgnoreCase("TestDirectoryIterator")) DirectoryIterator.test();
|
||||
else if (arg.equalsIgnoreCase("checkIdentical")) GenerateData.handleIdentical();
|
||||
else if (arg.equalsIgnoreCase("testnameuniqueness")) TestNameUniqueness.test();
|
||||
|
||||
//else if (arg.equalsIgnoreCase("NormalizationCharts")) ChartGenerator.writeNormalizationCharts();
|
||||
|
||||
@ -191,10 +199,17 @@ public final class Main implements UCD_Types {
|
||||
GenerateData.generateVerticalSlice(NUMERIC_TYPE, NUMERIC_TYPE+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/extracted/", "DerivedNumericType" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("HangulSyllableType")) {
|
||||
GenerateData.generateVerticalSlice(HANGUL_SYLLABLE_TYPE,HANGUL_SYLLABLE_TYPE+NEXT_ENUM, GenerateData.HEADER_EXTEND,
|
||||
"DerivedData/", "HangulSyllableType" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedNumericValues")) {
|
||||
GenerateData.generateVerticalSlice(LIMIT_ENUM, LIMIT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/extracted/", "DerivedNumericValues" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("StandardizedVariants")) {
|
||||
GenerateStandardizedVariants.generate();
|
||||
|
||||
// OTHER STANDARD PROPERTIES
|
||||
|
||||
} else if (arg.equalsIgnoreCase("CaseFolding")) {
|
||||
@ -239,7 +254,7 @@ public final class Main implements UCD_Types {
|
||||
|
||||
} else if (arg.equalsIgnoreCase("OtherDerivedProperties")) {
|
||||
//mask = Utility.setBits(0, NFC_Leading, NFC_Resulting);
|
||||
GenerateData.generateDerived(ALL, false, GenerateData.HEADER_DERIVED, "OtherData/", "OtherDerivedProperties");
|
||||
GenerateData.generateDerived((byte)(ALL & ~DERIVED_CORE & ~DERIVED_NORMALIZATION), false, GenerateData.HEADER_DERIVED, "OtherData/", "OtherDerivedProperties");
|
||||
|
||||
} else if (arg.equalsIgnoreCase("AllBinary")) {
|
||||
GenerateData.generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES + NEXT_ENUM,
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
|
||||
* $Date: 2002/07/30 09:56:41 $
|
||||
* $Revision: 1.13 $
|
||||
* $Date: 2003/02/25 23:38:22 $
|
||||
* $Revision: 1.14 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -416,7 +416,11 @@ public final class Normalizer implements UCD_Types {
|
||||
String s = ucd.getDecompositionMapping(i);
|
||||
int len = UTF16.countCodePoint(s);
|
||||
if (len != 2) {
|
||||
if (len > 2) throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
|
||||
if (len > 2) {
|
||||
if (ucd.getVersion().compareTo("3.0.0") >= 0) {
|
||||
throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
int a = UTF16.charAt(s, 0);
|
||||
|
@ -1,9 +1,7 @@
|
||||
# This file contains aliases for properties used in the UCD.
|
||||
# These names can be used for XML formats of UCD data, for regular-expression
|
||||
# property tests, and other programmatic textual descriptions of Unicode data.
|
||||
# The names are not normative, except where they correspond to normative
|
||||
# properties in the UCD. For information on which properties are normative,
|
||||
# see UnicodeCharacterDatabase.html.
|
||||
# For information on which properties are normative, see UCD.html.
|
||||
#
|
||||
# The names may be translated in appropriate environments, and additional
|
||||
# aliases may be useful.
|
||||
@ -20,16 +18,14 @@
|
||||
# and '_' are ignored.
|
||||
#
|
||||
# NOTE: Currently there is at most one abbreviated name and one long name for
|
||||
# each property. However, in the future additional aliases
|
||||
# may be added. In such a case, the first line for the property
|
||||
# would have the preferred alias for output.
|
||||
# each property. However, in the future additional aliases may be added.
|
||||
#
|
||||
# NOTE: The property value names are NOT unique across properties, especially
|
||||
# with loose matches. For example,
|
||||
# with loose matches. For example:
|
||||
#
|
||||
# AL means Arabic Letter for the Bidi_Class property, and
|
||||
# AL means Alpha_Left for the Combining_Class property, and
|
||||
# AL means Alphabetic for the Line_Break property.
|
||||
# AL means Arabic Letter for the Bidi_Class property, and
|
||||
# AL means Alpha_Left for the Combining_Class property, and
|
||||
# AL means Alphabetic for the Line_Break property.
|
||||
#
|
||||
# In addition, some property names may be the same as some property value names.
|
||||
#
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/PropertyLister.java,v $
|
||||
* $Date: 2002/05/29 02:01:00 $
|
||||
* $Revision: 1.9 $
|
||||
* $Date: 2003/02/25 23:38:22 $
|
||||
* $Revision: 1.10 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -57,7 +57,7 @@ abstract public class PropertyLister implements UCD_Types {
|
||||
}
|
||||
|
||||
public String optionalComment(int cp) {
|
||||
if (!usePropertyComment || !breakByCategory) return "";
|
||||
if (!usePropertyComment) return "";
|
||||
return ucdData.getModCatID_fromIndex(getModCat(cp));
|
||||
}
|
||||
|
||||
@ -143,7 +143,8 @@ abstract public class PropertyLister implements UCD_Types {
|
||||
}
|
||||
|
||||
byte getModCat(int cp) {
|
||||
return ucdData.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : 0);
|
||||
byte result = ucdData.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : 0);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,9 +1,7 @@
|
||||
# This file contains aliases for property values used in the UCD.
|
||||
# These names can be used for XML formats of UCD data, for regular-expression
|
||||
# property tests, and other programmatic textual descriptions of Unicode data.
|
||||
# The names are not normative, except where they correspond to normative property
|
||||
# values in the UCD. For information on which properties are normative, see
|
||||
# UnicodeCharacterDatabase.html.
|
||||
# For information on which properties are normative, see UCD.html.
|
||||
#
|
||||
# The names may be translated in appropriate environments, and additional
|
||||
# aliases may be useful.
|
||||
@ -22,29 +20,29 @@
|
||||
#
|
||||
# Third Field: The third field is a long name.
|
||||
#
|
||||
# In the case of ccc, their are 4 fields. The second field is numeric, third
|
||||
# In the case of ccc, there are 4 fields. The second field is numeric, third
|
||||
# is abbreviated, and fourth is long.
|
||||
#
|
||||
# With loose matching of property names, the case distinctions, whitespace,
|
||||
# and '_' are ignored.
|
||||
#
|
||||
# NOTE: The Block property values are in Blocks.txt, and not repeated here.
|
||||
# For more information on the use of blocks, see UTR #18: Regular Expression Guidelines
|
||||
#
|
||||
# NOTE: Currently there is at most one abbreviated name and one long name for
|
||||
# property value. However, in the future additional aliases
|
||||
# may be added. In such a case, the first line for the property value
|
||||
# would have the preferred alias for output.
|
||||
# each property value. However, in the future additional aliases may be added.
|
||||
# In such a case, the first line for the property value would have
|
||||
# the preferred alias for output.
|
||||
#
|
||||
# NOTE: The property value names are NOT unique across properties, especially
|
||||
# with loose matches. For example,
|
||||
# with loose matches. For example:
|
||||
#
|
||||
# AL means Arabic Letter for the Bidi_Class property, and
|
||||
# AL means Alpha_Left for the Combining_Class property, and
|
||||
# AL means Alphabetic for the Line_Break property.
|
||||
#
|
||||
# In addition, some property names may be the same as some property value names:
|
||||
# cc means Combining_Class property, and
|
||||
# cc means the General_Category property value Control (cc)
|
||||
# In addition, some property names may be the same as some property value names.
|
||||
# For example:
|
||||
#
|
||||
# cc means Combining_Class property, and
|
||||
# cc means the General_Category property value Control (cc)
|
||||
#
|
||||
# The combination of property value and property name is, however, unique.
|
||||
# For more information, see UTR #24: Regular Expression Guidelines
|
||||
# For more information, see UTR #18: Regular Expression Guidelines
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $
|
||||
* $Date: 2002/10/05 01:28:58 $
|
||||
* $Revision: 1.1 $
|
||||
* $Date: 2003/02/25 23:38:22 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -23,25 +23,38 @@ import com.ibm.text.utility.*;
|
||||
public class QuickTest implements UCD_Types {
|
||||
static final void test() {
|
||||
Default.setUCD();
|
||||
UnicodeSet format = new UnicodeSet("[:Cf:]");
|
||||
/*
|
||||
[4] NameStartChar := ":" | [A-Z] | "_" | [a-z] |
|
||||
[#xC0 - #x2FF] | [#x370 - #x37D] | [#x37F - #x1FFF] |
|
||||
[#x200C - #x200D] | [#x2070 - #x218F] | [#x2C00 - #x2FEF] |
|
||||
[#x3001 - #xD7FF] | [#xF900 - #xF9FF] | [#x10000 - #xDFFFF]
|
||||
|
||||
[4a] NameChar := NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F]
|
||||
[4] NameStartChar := ":" | [A-Z] | "_" | [a-z] |
|
||||
[#xC0-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] |
|
||||
[#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
|
||||
[#x3001-#xD7FF] | [#xF900-#xEFFFF]
|
||||
[4a] NameChar := NameStartChar | "-" | "." | [0-9] | #xB7 |
|
||||
[#x0300-#x036F] | [#x203F-#x2040]
|
||||
*/
|
||||
UnicodeSet nameStartChar = new UnicodeSet("[\\: A-Z \\_ a-z"
|
||||
+ "\\u00c0-\\u02FF \\u0370-\\u037D \\u037F-\\u1FFF"
|
||||
+ "\\u200C-\\u200D \\u2070-\\u218F \\u2C00-\\u2FEF"
|
||||
+ "\\u3001-\\uD7FF \\uF900-\\uF9FF \\U00010000-\\U000DFFFF]");
|
||||
+ "\\u3001-\\uD7FF \\uF900-\\U000EFFFF]");
|
||||
|
||||
UnicodeSet nameChar = new UnicodeSet("[\\- \\. 0-9 \\u00B7 \\u0300-\\u036F]")
|
||||
UnicodeSet nameChar = new UnicodeSet("[\\- \\. 0-9 \\u00B7 "
|
||||
+ "\\u0300-\\u036F \\u203F-\\u2040]")
|
||||
.addAll(nameStartChar);
|
||||
|
||||
UnicodeSet nameAll = new UnicodeSet(nameChar).addAll(nameStartChar);
|
||||
|
||||
showSet("NameStartChar", nameStartChar);
|
||||
showDiffs("NameChar", nameChar, "NameStartChar", nameStartChar);
|
||||
|
||||
|
||||
UnicodeSet ID_Start = new UnicodeSet("[:ID_Start:]");
|
||||
UnicodeSet ID_Continue = new UnicodeSet("[:ID_Continue:]").removeAll(format);
|
||||
|
||||
UnicodeSet ID_All = new UnicodeSet(ID_Start).addAll(ID_Continue);
|
||||
|
||||
showDiffs("ID_All", ID_All, "nameAll", nameAll);
|
||||
showDiffs("ID_Start", ID_Start, "nameStartChar", nameStartChar);
|
||||
|
||||
|
||||
UnicodeSet defaultIgnorable = UnifiedBinaryProperty.make(DERIVED | DefaultIgnorable).getSet();
|
||||
UnicodeSet whitespace = UnifiedBinaryProperty.make(BINARY_PROPERTIES | White_space).getSet();
|
||||
@ -49,7 +62,6 @@ public class QuickTest implements UCD_Types {
|
||||
UnicodeSet notNFKC = new UnicodeSet();
|
||||
UnicodeSet privateUse = new UnicodeSet();
|
||||
UnicodeSet noncharacter = new UnicodeSet();
|
||||
UnicodeSet format = new UnicodeSet("[:Cf:]");
|
||||
|
||||
for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
if (!Default.ucd.isAllocated(i)) continue;
|
||||
|
@ -48,14 +48,14 @@
|
||||
# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
|
||||
# The following rules handle those cases.
|
||||
|
||||
0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
|
||||
# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
|
||||
# This matches the behavior of the canonically equivalent I-dot_above
|
||||
|
||||
0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
|
||||
0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
|
||||
0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
|
||||
0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
|
||||
|
||||
# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
|
||||
|
||||
|
@ -4,8 +4,7 @@
|
||||
# It contains additional information about the casing of Unicode characters.
|
||||
# (For compatibility, the UnicodeData.txt file only contains case mappings for
|
||||
# characters where they are 1-1, and does not have locale-specific mappings.)
|
||||
# For more information, see
|
||||
# UTR #21 Case Mappings, at http://www.unicode.org/unicode/reports/tr21/
|
||||
# For more information, see the discussion of Case Mappings in the Unicode Standard.
|
||||
#
|
||||
# ================================================================================
|
||||
# Format
|
||||
@ -31,10 +30,10 @@
|
||||
# <ISO_3166_code> := 2-letter ISO country code,
|
||||
# <ISO_639_code> := 2-letter ISO language code
|
||||
#
|
||||
# A context is one of the following, as defined in UAX #21: Case Mappings:
|
||||
# Final_Sigma, After_Soft_Dotted, More_Above, Before_Dot
|
||||
# A context is one of the following, as defined in the Unicode Standard:
|
||||
# Final_Sigma, After_Soft_Dotted, More_Above, Before_Dot, Not_Before_Dot, After_I
|
||||
#
|
||||
# Parsers of this file must be prepared to deal future additions to this format:
|
||||
# Parsers of this file must be prepared to deal with future additions to this format:
|
||||
# * Additional contexts
|
||||
# * Additional fields
|
||||
# ================================================================================
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
|
||||
* $Date: 2002/10/05 01:28:58 $
|
||||
* $Revision: 1.19 $
|
||||
* $Date: 2003/02/25 23:38:22 $
|
||||
* $Revision: 1.20 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -35,7 +35,7 @@ public final class UCD implements UCD_Types {
|
||||
/**
|
||||
* Used for the default version.
|
||||
*/
|
||||
public static final String latestVersion = "3.2.1";
|
||||
public static final String latestVersion = "4.0.0";
|
||||
|
||||
/**
|
||||
* Create singleton instance for default (latest) version
|
||||
@ -79,17 +79,19 @@ public final class UCD implements UCD_Types {
|
||||
*/
|
||||
public boolean isAllocated(int codePoint) {
|
||||
if (getCategory(codePoint) != Cn) return true;
|
||||
if (major >= 2 && codePoint >= 0xF0000 && codePoint <= 0x10FFFD) return true;
|
||||
if (compositeVersion >= 0x20000 && codePoint >= 0xF0000 && codePoint <= 0x10FFFD) return true;
|
||||
if (isNoncharacter(codePoint)) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isNoncharacter(int codePoint) {
|
||||
if ((codePoint & 0xFFFE) == 0xFFFE) {
|
||||
if (major < 2 && codePoint > 0xFFFF) return false;
|
||||
if (compositeVersion < 0x20000 && codePoint > 0xFFFF) return false;
|
||||
// major < 2
|
||||
return true;
|
||||
}
|
||||
if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF && major >= 3 && minor >= 1) return true;
|
||||
if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF && compositeVersion >= 0x30100) return true;
|
||||
// major >= 3 && minor >= 1
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -239,8 +241,9 @@ public final class UCD implements UCD_Types {
|
||||
|
||||
public byte getModCat(int cp, int collapseBits) {
|
||||
byte cat = getCategory(cp);
|
||||
if (cat == UNASSIGNED && isNoncharacter(cp)) cat = FAKENC;
|
||||
if (((1<<cat) & collapseBits) != 0) {
|
||||
if (cat == UNASSIGNED && isNoncharacter(cp)) {
|
||||
cat = FAKENC;
|
||||
} else if (((1<<cat) & collapseBits) != 0) {
|
||||
switch (cat) {
|
||||
case UNASSIGNED: cat = FAKE_OTHER; break;
|
||||
case FAKENC: cat = FAKE_OTHER; break;
|
||||
@ -281,7 +284,17 @@ public final class UCD implements UCD_Types {
|
||||
case CURRENCY_SYMBOL: cat = FAKE_SYMBOL; break;
|
||||
case MODIFIER_SYMBOL: cat = FAKE_SYMBOL; break;
|
||||
case OTHER_SYMBOL: cat = FAKE_SYMBOL; break;
|
||||
|
||||
}
|
||||
if (collapseBits == -1) {
|
||||
switch (cat) {
|
||||
case FAKE_MARK:
|
||||
case FAKE_NUMBER:
|
||||
case FAKE_SEPERATOR:
|
||||
case FAKE_PUNCTUATION:
|
||||
case FAKE_SYMBOL:
|
||||
cat = FAKE_LETTER;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return cat;
|
||||
@ -832,7 +845,7 @@ public final class UCD implements UCD_Types {
|
||||
return style == SHORT ? UCD_Names.SHORT_BP[bit] : UCD_Names.BP[bit];
|
||||
}
|
||||
|
||||
public static int mapToRepresentative(int ch, boolean old) {
|
||||
public static int mapToRepresentative(int ch, boolean lessThan20105) {
|
||||
if (ch <= 0xFFFD) {
|
||||
//if (ch <= 0x2800) return ch;
|
||||
//if (ch <= 0x28FF) return 0x2800; // braille
|
||||
@ -850,7 +863,7 @@ public final class UCD implements UCD_Types {
|
||||
if (ch <= 0xDFFF) return 0xDC00;
|
||||
if (ch <= 0xE000) return ch; // Private Use
|
||||
if (ch <= 0xF8FF) return 0xE000;
|
||||
if (old) {
|
||||
if (lessThan20105) {
|
||||
if (ch <= 0xF900) return ch; // CJK Compatibility Ideograp
|
||||
if (ch <= 0xFA2D) return 0xF900;
|
||||
}
|
||||
@ -870,37 +883,43 @@ public final class UCD implements UCD_Types {
|
||||
return ch;
|
||||
}
|
||||
|
||||
public boolean isIdentifierStart(int cp, boolean extended) {
|
||||
public boolean isIdentifierStart(int cp) {
|
||||
/*
|
||||
if (extended) {
|
||||
if (cp == 0x0E33 || cp == 0x0EB3 || cp == 0xFF9E || cp == 0xFF9F) return false;
|
||||
if (cp == 0x037A || cp >= 0xFC5E && cp <= 0xFC63 || cp == 0xFDFA || cp == 0xFDFB) return false;
|
||||
if (cp >= 0xFE70 && cp <= 0xFE7E && (cp & 1) == 0) return false;
|
||||
}
|
||||
*/
|
||||
byte cat = getCategory(cp);
|
||||
if (cat == Lu || cat == Ll || cat == Lt || cat == Lm || cat == Lo || cat == Nl) return true;
|
||||
if (getBinaryProperty(cp, ID_Start_Exceptions)) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isIdentifierContinue_NO_Cf(int cp, boolean extended) {
|
||||
if (isIdentifierStart(cp, extended)) return true;
|
||||
public boolean isIdentifierContinue_NO_Cf(int cp) {
|
||||
if (isIdentifierStart(cp)) return true;
|
||||
/*
|
||||
if (extended) {
|
||||
if (cp == 0x00B7) return true;
|
||||
if (cp == 0x0E33 || cp == 0x0EB3 || cp == 0xFF9E || cp == 0xFF9F) return true;
|
||||
}
|
||||
*/
|
||||
byte cat = getCategory(cp);
|
||||
if (cat == Mn || cat == Mc || cat == Nd || cat == Pc) return true;
|
||||
if (getBinaryProperty(cp, ID_Start_Exceptions)) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isIdentifier(String s, boolean extended) {
|
||||
public boolean isIdentifier(String s) {
|
||||
if (s.length() == 0) return false; // at least one!
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
if (i == 0) {
|
||||
if (!isIdentifierStart(cp, extended)) return false;
|
||||
if (!isIdentifierStart(cp)) return false;
|
||||
} else {
|
||||
if (!isIdentifierContinue_NO_Cf(cp, extended)) return false;
|
||||
if (!isIdentifierContinue_NO_Cf(cp)) return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
@ -940,9 +959,10 @@ to guarantee identifier closure.
|
||||
private String file;
|
||||
private long date = -1;
|
||||
private byte format = -1;
|
||||
private byte major = -1;
|
||||
private byte minor = -1;
|
||||
private byte update = -1;
|
||||
//private byte major = -1;
|
||||
//private byte minor = -1;
|
||||
//private byte update = -1;
|
||||
private int compositeVersion = -1;
|
||||
private int size = -1;
|
||||
|
||||
// cache last UData
|
||||
@ -971,7 +991,7 @@ to guarantee identifier closure.
|
||||
if (codePoint >= 0x2800 && codePoint <= 0x28FF) return true;
|
||||
if (codePoint >= 0x2F800 && codePoint <= 0x2FA1D) return true;
|
||||
|
||||
int rangeStart = mapToRepresentative(codePoint, major < 2);
|
||||
int rangeStart = mapToRepresentative(codePoint, compositeVersion < 0x020105);
|
||||
switch (rangeStart) {
|
||||
default:
|
||||
return getRaw(codePoint) == null;
|
||||
@ -999,6 +1019,11 @@ to guarantee identifier closure.
|
||||
|
||||
// access data for codepoint
|
||||
UData get(int codePoint, boolean fixStrings) {
|
||||
/*if (codePoint == 0xF901) {
|
||||
System.out.println(version + ", " + Integer.toString(compositeVersion, 16));
|
||||
System.out.println("debug: ");
|
||||
}
|
||||
*/
|
||||
if (codePoint < 0 || codePoint > 0x10FFFF) {
|
||||
throw new IllegalArgumentException("Illegal Code Point: " + Utility.hex(codePoint));
|
||||
}
|
||||
@ -1024,11 +1049,11 @@ to guarantee identifier closure.
|
||||
|
||||
// do range stuff
|
||||
String constructedName = null;
|
||||
int rangeStart = mapToRepresentative(codePoint, major < 2);
|
||||
int rangeStart = mapToRepresentative(codePoint, compositeVersion < 0x020105);
|
||||
boolean isHangul = false;
|
||||
switch (rangeStart) {
|
||||
case 0xF900:
|
||||
if (major < 2) {
|
||||
if (compositeVersion < 0x020105) {
|
||||
if (fixStrings) constructedName = "CJK COMPATIBILITY IDEOGRAPH-" + Utility.hex(codePoint, 4);
|
||||
break;
|
||||
}
|
||||
@ -1198,9 +1223,11 @@ to guarantee identifier closure.
|
||||
}
|
||||
|
||||
static boolean isLeadingJamoComposition(int char1) {
|
||||
return (LBase <= char1 && char1 < LLimit
|
||||
|| SBase <= char1 && char1 < SLimit
|
||||
&& ((char1 - SBase) % TCount) == 0);
|
||||
return isLeadingJamo(char1) || isLV(char1);
|
||||
}
|
||||
|
||||
/**
 * Tests whether a code point lies in [SBase, SLimit) at an offset from SBase
 * that is an exact multiple of TCount.
 * NOTE(review): presumably this identifies precomposed Hangul syllables of the
 * form LV (no trailing consonant), as the name suggests - confirm against the
 * definitions of SBase, SLimit and TCount.
 *
 * @param char1 code point to test
 * @return true if char1 is in range and (char1 - SBase) % TCount == 0
 */
static boolean isLV(int char1) {
|
||||
return (SBase <= char1 && char1 < SLimit && ((char1 - SBase) % TCount) == 0);
|
||||
}
|
||||
|
||||
static boolean isVowelJamo(int cp) {
|
||||
@ -1218,6 +1245,24 @@ to guarantee identifier closure.
|
||||
/**
 * Tests whether a code point falls in either of the half-open ranges
 * [VBase, VLimit) or [TBase, TLimit).
 * NOTE(review): presumably these are the vowel and trailing-consonant jamo
 * ranges - confirm against the constant definitions.
 *
 * @param cp code point to test
 * @return true if cp is in one of the two ranges
 */
static boolean isNonLeadJamo(int cp) {
|
||||
return (VBase <= cp && cp < VLimit) || (TBase <= cp && cp < TLimit);
|
||||
}
|
||||
|
||||
/**
 * Classifies a code point into one of the Hangul syllable type constants
 * L, V, T, LV, LVT or NA. The predicates are tried in a fixed order and the
 * first one that matches decides the result; NA is returned when none match.
 *
 * @param cp code point to classify
 * @return L, V, T, LV, LVT or NA
 */
static byte getHangulSyllableType(int cp) {
|
||||
if (isLeadingJamo(cp)) return L;
|
||||
else if (isVowelJamo(cp)) return V;
|
||||
else if (isTrailingJamo(cp)) return T;
|
||||
// isLV is tested before isHangulSyllable, so a syllable reaching the next
// branch is reported as LVT only if it is not LV.
// NOTE(review): assumes isHangulSyllable also accepts LV syllables - confirm.
else if (isLV(cp)) return LV;
|
||||
else if (isHangulSyllable(cp)) return LVT;
|
||||
else return NA;
|
||||
}
|
||||
|
||||
/**
 * Looks up the name of a Hangul syllable type by its numeric index.
 *
 * @param index position in the name table; no bounds check is done here, so
 *              an out-of-range value fails with the usual array index exception
 * @param style LONG selects UCD_Names.LONG_HANGUL_SYLLABLE_TYPE; any other
 *              value selects UCD_Names.HANGUL_SYLLABLE_TYPE
 * @return the name stored at that index in the selected table
 */
static String getHangulSyllableTypeID_fromIndex(byte index, byte style) {
|
||||
if (style == LONG) return UCD_Names.LONG_HANGUL_SYLLABLE_TYPE[index];
|
||||
return UCD_Names.HANGUL_SYLLABLE_TYPE[index];
|
||||
}
|
||||
|
||||
/**
 * Returns the Hangul syllable type name for a code point by classifying it
 * with getHangulSyllableType and mapping the result to a name with
 * getHangulSyllableTypeID_fromIndex.
 *
 * @param char1 code point to classify
 * @param style name style passed through to getHangulSyllableTypeID_fromIndex
 * @return the type name in the requested style
 */
static String getHangulSyllableTypeID(int char1, byte style) {
|
||||
return getHangulSyllableTypeID_fromIndex(getHangulSyllableType(char1),style);
|
||||
}
|
||||
|
||||
private void fillFromFile(String version) {
|
||||
try {
|
||||
@ -1243,9 +1288,11 @@ to guarantee identifier closure.
|
||||
128*1024));
|
||||
// header
|
||||
format = dataIn.readByte();
|
||||
major = dataIn.readByte();
|
||||
minor = dataIn.readByte();
|
||||
update = dataIn.readByte();
|
||||
byte major = dataIn.readByte();
|
||||
byte minor = dataIn.readByte();
|
||||
byte update = dataIn.readByte();
|
||||
compositeVersion = (major << 16) | (minor << 8) | update;
|
||||
|
||||
String foundVersion = major + "." + minor + "." + update;
|
||||
if (format != BINARY_FORMAT || !version.equals(foundVersion)) {
|
||||
throw new ChainException("Illegal data file format for {0}: {1}, {2}",
|
||||
@ -1262,7 +1309,7 @@ to guarantee identifier closure.
|
||||
UData uData = new UData();
|
||||
uData.readBytes(dataIn);
|
||||
|
||||
if (DEBUG && uData.codePoint == 0x2801) {
|
||||
if (uData.codePoint == 0x0221) {
|
||||
System.out.println("SPOT-CHECK: " + uData);
|
||||
}
|
||||
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
|
||||
* $Date: 2002/10/05 01:28:58 $
|
||||
* $Revision: 1.15 $
|
||||
* $Date: 2003/02/25 23:38:22 $
|
||||
* $Revision: 1.16 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -53,6 +53,7 @@ final class UCD_Names implements UCD_Types {
|
||||
"BidiMirrored (listing UnicodeData.txt, field 9: see UnicodeData.html)",
|
||||
"Script",
|
||||
"Age (from a comparison of UCD versions 1.1 [minus Hangul], 2.0, 2.1, 3.0, 3.1)",
|
||||
"Hangul Syllable Type\r\n# All codepoints not explicitly listed here have the value NA",
|
||||
"Derived"
|
||||
};
|
||||
|
||||
@ -69,6 +70,7 @@ final class UCD_Names implements UCD_Types {
|
||||
"",
|
||||
"Script",
|
||||
"Age",
|
||||
"Hangul_Syllable_Type",
|
||||
""
|
||||
};
|
||||
|
||||
@ -85,6 +87,7 @@ final class UCD_Names implements UCD_Types {
|
||||
"",
|
||||
"sc",
|
||||
"ag",
|
||||
"hst",
|
||||
"",
|
||||
};
|
||||
|
||||
@ -121,6 +124,7 @@ final class UCD_Names implements UCD_Types {
|
||||
"Deprecated",
|
||||
"Soft_Dotted",
|
||||
"Logical_Order_Exception",
|
||||
"ID_Start_Exceptions",
|
||||
};
|
||||
|
||||
static final String[] SHORT_BP = {
|
||||
@ -155,6 +159,7 @@ final class UCD_Names implements UCD_Types {
|
||||
"Dep",
|
||||
"SD",
|
||||
"LOE",
|
||||
"IDSX",
|
||||
};
|
||||
|
||||
/*
|
||||
@ -273,6 +278,14 @@ final class UCD_Names implements UCD_Types {
|
||||
"HANUNOO",
|
||||
"BUHID",
|
||||
"TAGBANWA",
|
||||
"LIMBU",
|
||||
"TAI_LE",
|
||||
"LINEAR_B",
|
||||
"UGARITIC",
|
||||
"SHAVIAN",
|
||||
"OSMANYA",
|
||||
"CYPRIOT",
|
||||
|
||||
};
|
||||
|
||||
public static final String[] ABB_SCRIPT = {
|
||||
@ -322,6 +335,13 @@ final class UCD_Names implements UCD_Types {
|
||||
"Hano",
|
||||
"Buhd",
|
||||
"Tagb",
|
||||
"LIMBU",
|
||||
"TAI_LE",
|
||||
"LINEAR_B",
|
||||
"UGARITIC",
|
||||
"SHAVIAN",
|
||||
"OSMANYA",
|
||||
"CYPRIOT",
|
||||
};
|
||||
|
||||
|
||||
@ -330,7 +350,8 @@ final class UCD_Names implements UCD_Types {
|
||||
"UNSPECIFIED",
|
||||
"1.1",
|
||||
"2.0", "2.1",
|
||||
"3.0", "3.1"
|
||||
"3.0", "3.1", "3.2",
|
||||
"4.0"
|
||||
};
|
||||
|
||||
|
||||
@ -573,6 +594,24 @@ final class UCD_Names implements UCD_Types {
|
||||
|
||||
public static byte ON = Utility.lookup("ON", BC, true);
|
||||
|
||||
/**
 * Short names of the Hangul syllable types, indexed by type value.
 * NOTE(review): the order is expected to line up with the byte constants
 * NA, L, V, T, LV, LVT (0..5) used as indices - confirm when adding entries.
 */
public static String[] HANGUL_SYLLABLE_TYPE = {
|
||||
"NA",
|
||||
"L",
|
||||
"V",
|
||||
"T",
|
||||
"LV",
|
||||
"LVT",
|
||||
};
|
||||
|
||||
/**
 * Long names of the Hangul syllable types, indexed by type value.
 * NOTE(review): entries are expected to be in the same order as the short
 * names (NA, L, V, T, LV, LVT) - confirm when adding entries.
 */
public static String[] LONG_HANGUL_SYLLABLE_TYPE = {
|
||||
"Not_Applicable",
|
||||
"Leading_Jamo",
|
||||
"Vowel_Jamo",
|
||||
"Trailing_Jamo",
|
||||
"LV_Syllable",
|
||||
"LVT_Syllable",
|
||||
};
|
||||
|
||||
public static String[] JOINING_TYPE = {
|
||||
"C",
|
||||
"D",
|
||||
@ -643,6 +682,9 @@ final class UCD_Names implements UCD_Types {
|
||||
"YUDH",
|
||||
"YUDH_HE",
|
||||
"ZAIN",
|
||||
"ZHAIN",
|
||||
"KHAPH",
|
||||
"FE",
|
||||
};
|
||||
|
||||
public static String[] OLD_JOINING_GROUP = {
|
||||
@ -697,6 +739,9 @@ final class UCD_Names implements UCD_Types {
|
||||
"YUDH",
|
||||
"YUDH_HE",
|
||||
"ZAIN",
|
||||
"ZHAIN",
|
||||
"KHAPH",
|
||||
"FE",
|
||||
};
|
||||
|
||||
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
|
||||
* $Date: 2002/10/05 01:28:58 $
|
||||
* $Revision: 1.16 $
|
||||
* $Date: 2003/02/25 23:38:22 $
|
||||
* $Revision: 1.17 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -15,7 +15,7 @@ package com.ibm.text.UCD;
|
||||
|
||||
public interface UCD_Types {
|
||||
|
||||
public static final int dVersion = 2; // change to fix the generated file D version. If less than zero, no "d"
|
||||
public static final int dVersion = 10; // change to fix the generated file D version. If less than zero, no "d"
|
||||
|
||||
public static final String BASE_DIR = "C:\\DATA\\";
|
||||
public static final String UCD_DIR = BASE_DIR + "UCD\\";
|
||||
@ -41,7 +41,7 @@ public interface UCD_Types {
|
||||
NOT_DERIVED = 1,
|
||||
DERIVED_CORE = 2,
|
||||
DERIVED_NORMALIZATION = 4,
|
||||
DERIVED_ALL = 6,
|
||||
DERIVED_ALL = 0x6,
|
||||
ALL = (byte)-1;
|
||||
|
||||
static final byte
|
||||
@ -86,9 +86,10 @@ public interface UCD_Types {
|
||||
BINARY_PROPERTIES = 0x900,
|
||||
SCRIPT = 0xA00,
|
||||
AGE = 0xB00,
|
||||
DERIVED = 0xC00,
|
||||
NEXT_ENUM = 0x100,
|
||||
LIMIT_ENUM = DERIVED + 0x100;
|
||||
HANGUL_SYLLABLE_TYPE = 0xC00,
|
||||
DERIVED = 0xD00,
|
||||
LIMIT_ENUM = DERIVED + 0x100,
|
||||
NEXT_ENUM = 0x100;
|
||||
|
||||
public static final int LIMIT_COMBINING_CLASS = 256;
|
||||
|
||||
@ -207,7 +208,8 @@ public interface UCD_Types {
|
||||
Deprecated = 28,
|
||||
Soft_Dotted = 29,
|
||||
Logical_Order_Exception = 30,
|
||||
LIMIT_BINARY_PROPERTIES = 31;
|
||||
ID_Start_Exceptions = 31,
|
||||
LIMIT_BINARY_PROPERTIES = 32;
|
||||
|
||||
/*
|
||||
static final int
|
||||
@ -309,6 +311,9 @@ public interface UCD_Types {
|
||||
// numericType
|
||||
static final byte NUMERIC_NONE = 0, NUMERIC = 1, DIGIT = 2, DECIMAL = 3,
|
||||
LIMIT_NUMERIC_TYPE = 4;
|
||||
|
||||
static final byte NA = 0, L = 1, V = 2, T = 3, LV = 4, LVT = 5,
|
||||
HANGUL_SYLLABLE_TYPE_LIMIT = 6;
|
||||
|
||||
public static final byte // SCRIPT CODE
|
||||
COMMON_SCRIPT = 0,
|
||||
@ -357,7 +362,14 @@ public interface UCD_Types {
|
||||
HANUNOO_SCRIPT = 43,
|
||||
BUHID_SCRIPT = 44,
|
||||
TAGBANWA_SCRIPT = 45,
|
||||
LIMIT_SCRIPT = 46;
|
||||
LIMBU = 46,
|
||||
TAI_LE = 47,
|
||||
LINEAR_B = 48,
|
||||
UGARITIC = 49,
|
||||
SHAVIAN = 50,
|
||||
OSMANYA = 51,
|
||||
CYPRIOT = 52,
|
||||
LIMIT_SCRIPT = 53;
|
||||
|
||||
static final int
|
||||
UNKNOWN = 0,
|
||||
@ -366,7 +378,9 @@ public interface UCD_Types {
|
||||
AGE21 = 3,
|
||||
AGE30 = 4,
|
||||
AGE31 = 5,
|
||||
LIMIT_AGE = 6;
|
||||
AGE32 = 6,
|
||||
AGE40 = 7,
|
||||
LIMIT_AGE = 8;
|
||||
|
||||
|
||||
|
||||
@ -431,7 +445,11 @@ public static byte
|
||||
YUDH = 48,
|
||||
YUDH_HE = 49,
|
||||
ZAIN = 50,
|
||||
LIMIT_JOINING_GROUP = 51;
|
||||
ZHAIN = 51,
|
||||
KHAPH = 52,
|
||||
FE = 53,
|
||||
|
||||
LIMIT_JOINING_GROUP = 54;
|
||||
|
||||
static final byte NFD = 0, NFC = 1, NFKD = 2, NFKC = 3;
|
||||
public static final int
|
||||
@ -500,7 +518,9 @@ public static byte
|
||||
NFC_Skippable = 42,
|
||||
NFKD_Skippable = 43,
|
||||
NFKC_Skippable = 44,
|
||||
|
||||
Case_Sensitive = 45,
|
||||
|
||||
DERIVED_PROPERTY_LIMIT = 41;
|
||||
DERIVED_PROPERTY_LIMIT = 46;
|
||||
|
||||
}
|
@ -35,6 +35,8 @@ public abstract class UnicodeProperty implements UCD_Types {
|
||||
public boolean isStandard() { return isStandard; }
|
||||
public void setStandard(boolean in) { isStandard = in; }
|
||||
|
||||
public boolean isDefaultValue() {return false;}
|
||||
|
||||
/**
|
||||
* What type is it? DERIVED..
|
||||
*/
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java,v $
|
||||
* $Date: 2002/10/05 01:28:57 $
|
||||
* $Revision: 1.10 $
|
||||
* $Date: 2003/02/25 23:38:22 $
|
||||
* $Revision: 1.11 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -122,7 +122,11 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
|
||||
propValue = propMask & 0xFF;
|
||||
|
||||
//System.out.println("A: " + getValueType());
|
||||
if (majorProp <= (JOINING_GROUP>>8) || majorProp == SCRIPT>>8) setValueType(FLATTENED_BINARY);
|
||||
if (majorProp <= (JOINING_GROUP>>8)
|
||||
|| majorProp == (SCRIPT>>8)
|
||||
|| majorProp==(HANGUL_SYLLABLE_TYPE>>8)) {
|
||||
setValueType(FLATTENED_BINARY);
|
||||
}
|
||||
//System.out.println("B: " + getValueType());
|
||||
|
||||
header = UCD_Names.UNIFIED_PROPERTY_HEADERS[majorProp];
|
||||
@ -217,6 +221,8 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
|
||||
return true;
|
||||
case AGE>>8: if (propValue >= LIMIT_AGE) break;
|
||||
return true;
|
||||
case HANGUL_SYLLABLE_TYPE>>8: if (propValue >= HANGUL_SYLLABLE_TYPE_LIMIT) break;
|
||||
return true;
|
||||
/*
|
||||
case DERIVED>>8:
|
||||
UnicodeProperty up = DerivedProperty.make(propValue, ucd);
|
||||
@ -227,6 +233,28 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isDefaultValue() {
|
||||
switch ((majorProp<<8) | propValue) {
|
||||
//case CATEGORY | Cn:
|
||||
//case COMBINING_CLASS | 0:
|
||||
//case BIDI_CLASS | BIDI_L:
|
||||
case DECOMPOSITION_TYPE | NONE:
|
||||
case NUMERIC_TYPE | NUMERIC_NONE:
|
||||
// case EAST_ASIAN_WIDTH | EAN:
|
||||
// case LINE_BREAK | LB_XX:
|
||||
case JOINING_TYPE | JT_U:
|
||||
case JOINING_GROUP | NO_SHAPING:
|
||||
case BINARY_PROPERTIES | Non_break:
|
||||
case BINARY_PROPERTIES | CaseFoldTurkishI:
|
||||
case SCRIPT | COMMON_SCRIPT:
|
||||
case HANGUL_SYLLABLE_TYPE | NA:
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public boolean hasValue(int cp) {
|
||||
try {
|
||||
switch (majorProp) {
|
||||
@ -242,6 +270,8 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
|
||||
case BINARY_PROPERTIES>>8: return ucd.getBinaryProperty(cp, propValue);
|
||||
case SCRIPT>>8: return ucd.getScript(cp) == propValue;
|
||||
case AGE>>8: return ucd.getAge(cp) == propValue;
|
||||
case HANGUL_SYLLABLE_TYPE>>8: return ucd.getHangulSyllableType(cp) == propValue;
|
||||
// return true;
|
||||
/*
|
||||
case DERIVED>>8:
|
||||
UnicodeProperty up = DerivedProperty.make(propValue, ucd);
|
||||
@ -307,6 +337,7 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
|
||||
case BINARY_PROPERTIES>>8: return ucd.getBinaryPropertiesID_fromIndex((byte)propValue, style);
|
||||
case SCRIPT>>8: return ucd.getScriptID_fromIndex((byte)propValue, style);
|
||||
case AGE>>8: return ucd.getAgeID_fromIndex((byte)propValue);
|
||||
case HANGUL_SYLLABLE_TYPE>>8: return ucd.getHangulSyllableTypeID_fromIndex((byte)propValue, style);
|
||||
/*
|
||||
case DERIVED>>8:
|
||||
UnicodeProperty up = DerivedProperty.make(propValue, ucd);
|
||||
@ -337,6 +368,7 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
|
||||
case BINARY_PROPERTIES>>8: return LONG;
|
||||
case SCRIPT>>8: return LONG;
|
||||
case AGE>>8: return LONG;
|
||||
case HANGUL_SYLLABLE_TYPE>>8: return SHORT;
|
||||
}
|
||||
} catch (RuntimeException e) {
|
||||
throw new ChainException("Illegal property Number {0}, {1}", new Object[]{
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedProperty.java,v $
|
||||
* $Date: 2002/10/05 01:28:57 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2003/02/25 23:38:22 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -142,6 +142,7 @@ public final class UnifiedProperty extends UnicodeProperty {
|
||||
case JOINING_GROUP>>8:
|
||||
case SCRIPT>>8:
|
||||
case AGE>>8:
|
||||
case HANGUL_SYLLABLE_TYPE>>8:
|
||||
return true;
|
||||
/*
|
||||
case DERIVED>>8:
|
||||
@ -181,7 +182,9 @@ public final class UnifiedProperty extends UnicodeProperty {
|
||||
case JOINING_GROUP>>8: return ucd.getJoiningGroupID_fromIndex(ucd.getJoiningGroup(cp), style);
|
||||
case SCRIPT>>8: return ucd.getScriptID_fromIndex(ucd.getScript(cp), style);
|
||||
case AGE>>8: return ucd.getAgeID_fromIndex(ucd.getAge(cp), style);
|
||||
case HANGUL_SYLLABLE_TYPE>>8:
|
||||
return ucd.getHangulSyllableTypeID(cp,style);
|
||||
default: throw new IllegalArgumentException("Internal Error");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
|
||||
* $Date: 2002/08/09 23:56:24 $
|
||||
* $Revision: 1.19 $
|
||||
* $Date: 2003/02/25 23:38:22 $
|
||||
* $Revision: 1.20 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -1806,8 +1806,11 @@ E0020-E007F; [TAGGING CHARACTERS]
|
||||
|
||||
String x_cp = 'x' + UTF32.valueOf32(cp);
|
||||
String nfx_x_cp = normalize(x_cp, j);
|
||||
plain = Default.ucd.isIdentifier(x_cp, true);
|
||||
norm = Default.ucd.isIdentifier(nfx_x_cp, true);
|
||||
if (true) {
|
||||
throw new RuntimeException("Fix plain & norm, 4 instances!!");
|
||||
}
|
||||
// plain = Default.ucd.isIdentifier(x_cp, true);
|
||||
//norm = Default.ucd.isIdentifier(nfx_x_cp, true);
|
||||
if (plain & !norm) {
|
||||
Utility.fixDot();
|
||||
System.out.println("*Not Identifier: " + Default.ucd.getCodeAndName(cp));
|
||||
@ -1822,8 +1825,8 @@ E0020-E007F; [TAGGING CHARACTERS]
|
||||
}
|
||||
|
||||
String nfx_cp = normalize(UTF32.valueOf32(cp), j);
|
||||
plain = Default.ucd.isIdentifierStart(cp, true);
|
||||
norm = Default.ucd.isIdentifier(nfx_cp, true);
|
||||
// plain = Default.ucd.isIdentifierStart(cp, true);
|
||||
// norm = Default.ucd.isIdentifier(nfx_cp, true);
|
||||
if (plain & !norm) {
|
||||
Utility.fixDot();
|
||||
System.out.println(" Changes Category: " + Default.ucd.getCodeAndName(cp));
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/FileLineIterator.java,v $
|
||||
* $Date: 2002/10/01 01:12:10 $
|
||||
* $Revision: 1.1 $
|
||||
* $Date: 2003/02/25 23:38:22 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -43,18 +43,18 @@ public class FileLineIterator {
|
||||
public int counter = 0;
|
||||
|
||||
private BufferedReader br = null;
|
||||
private boolean isUTF8 = false;
|
||||
private Utility.Encoding encoding = Utility.UTF8;
|
||||
|
||||
/**
|
||||
* Open the file for reading. If useGenDir is set, use the normal generation directory
|
||||
*/
|
||||
public void open(String filename, boolean isUTF8) throws IOException {
|
||||
public void open(String filename, Utility.Encoding encoding) throws IOException {
|
||||
if (showFilename) {
|
||||
Utility.fixDot();
|
||||
System.out.println("Reading File: " + new File(filename).getCanonicalPath());
|
||||
}
|
||||
br = Utility.openReadFile(filename, isUTF8);
|
||||
this.isUTF8 = isUTF8;
|
||||
br = Utility.openReadFile(filename, encoding);
|
||||
this.encoding = encoding;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -68,7 +68,7 @@ public class FileLineIterator {
|
||||
if (cleanedLine == null) return null;
|
||||
|
||||
// drop BOM
|
||||
if (isUTF8 && counter == 0 && cleanedLine.length() > 0 && cleanedLine.charAt(0) == 0xFEFF) {
|
||||
if (encoding == Utility.UTF8 && counter == 0 && cleanedLine.length() > 0 && cleanedLine.charAt(0) == 0xFEFF) {
|
||||
cleanedLine = cleanedLine.substring(1);
|
||||
}
|
||||
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
|
||||
* $Date: 2002/10/05 01:28:56 $
|
||||
* $Revision: 1.26 $
|
||||
* $Date: 2003/02/25 23:38:22 $
|
||||
* $Revision: 1.27 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -144,7 +144,10 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
|
||||
boolean haveFirstCased = true;
|
||||
for (int i = 0; i < source.length(); ++i) {
|
||||
char c = source.charAt(i);
|
||||
if (c == ' ' || c == '-') c = '_';
|
||||
if (c == ' ' || c == '-' || c == '_') {
|
||||
c = '_';
|
||||
haveFirstCased = true;
|
||||
}
|
||||
int cat = Character.getType(c);
|
||||
if (lastCat == Character.LOWERCASE_LETTER && cat == Character.UPPERCASE_LETTER) {
|
||||
result.append('_');
|
||||
@ -616,6 +619,7 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
|
||||
|
||||
private static final String[] searchPath = {
|
||||
"EXTRAS",
|
||||
"4.0.0",
|
||||
"3.2.0",
|
||||
"3.1.1",
|
||||
"3.1.0",
|
||||
@ -654,8 +658,13 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
|
||||
UTF8_UNIX = Encoding.add("UTF8_UNIX"),
|
||||
UTF8_WINDOWS = Encoding.add("UTF8_WINDOWS"),
|
||||
|
||||
UTF8 = Encoding.add("UTF8"), // for read-only
|
||||
LATIN1 = Encoding.add("LATIN1"), // for read-only
|
||||
//UTF8 = Encoding.add("UTF8"), // for read-only
|
||||
//LATIN1 = Encoding.add("LATIN1"), // for read-only
|
||||
|
||||
// read-only (platform doesn't matter, since it is only line-end)
|
||||
|
||||
UTF8 = UTF8_WINDOWS,
|
||||
LATIN1 = LATIN1_WINDOWS,
|
||||
|
||||
FIRST = LATIN1_UNIX;
|
||||
|
||||
@ -700,6 +709,24 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
|
||||
public boolean filter(Object current); // true is keep
|
||||
}
|
||||
|
||||
public static void printMapOfCollection(PrintWriter pw, Map c, String mainSeparator, String itemSeparator, String subseparator) {
|
||||
Iterator it = c.keySet().iterator();
|
||||
boolean first = true;
|
||||
Object last = null;
|
||||
while (it.hasNext()) {
|
||||
Object key = it.next();
|
||||
Collection value = (Collection) c.get(key);
|
||||
if (first) {
|
||||
first = false;
|
||||
} else {
|
||||
pw.print(mainSeparator);
|
||||
}
|
||||
pw.print(key);
|
||||
pw.print(itemSeparator);
|
||||
print(pw, value, subseparator);
|
||||
}
|
||||
}
|
||||
|
||||
public static void print(PrintWriter pw, Collection c, String separator, Breaker b) {
|
||||
Iterator it = c.iterator();
|
||||
boolean first = true;
|
||||
@ -745,7 +772,12 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
|
||||
|
||||
public static BufferedReader openReadFile(String filename, Encoding encoding) throws FileNotFoundException, UnsupportedEncodingException {
|
||||
FileInputStream fis = new FileInputStream(filename);
|
||||
InputStreamReader isr = (encoding == UTF8_UNIX || encoding == UTF8_WINDOWS) ? new InputStreamReader(fis, "UTF8") : new InputStreamReader(fis);
|
||||
InputStreamReader isr;
|
||||
if (encoding == UTF8_UNIX || encoding == UTF8_WINDOWS) {
|
||||
isr = new InputStreamReader(fis, "UTF8");
|
||||
} else {
|
||||
isr = new InputStreamReader(fis);
|
||||
}
|
||||
BufferedReader br = new BufferedReader(isr, 32*1024);
|
||||
return br;
|
||||
}
|
||||
@ -817,10 +849,10 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
|
||||
}
|
||||
}
|
||||
|
||||
public static void renameIdentical(String file1, String file2) throws IOException {
|
||||
public static boolean renameIdentical(String file1, String file2, String batFile) throws IOException {
|
||||
if (file1 == null) {
|
||||
System.out.println("Null file");
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
boolean identical = false;
|
||||
@ -845,25 +877,34 @@ public final class Utility implements UCD_Types { // COMMON UTILITIES
|
||||
br2.close();
|
||||
}
|
||||
if (identical) {
|
||||
File foo = new File(file2);
|
||||
File newName = new File(foo.getParent(), "UNCHANGED-" + foo.getName());
|
||||
if (newName.exists()) {
|
||||
for (int i = 1; newName.exists(); ++i) {
|
||||
newName = new File(foo.getParent(), "UNCHANGED" + i + "-" + foo.getName());
|
||||
}
|
||||
}
|
||||
System.out.println("IDENTICAL TO PREVIOUS, RENAMING : " + foo);
|
||||
System.out.println("TO : " + newName);
|
||||
boolean renameResult = foo.renameTo(newName);
|
||||
if (!renameResult) System.out.println("Couldn't rename!");
|
||||
renameIdentical(file2);
|
||||
if (batFile != null) renameIdentical(batFile);
|
||||
return true;
|
||||
} else {
|
||||
if (line1 == null) line1 = "<end of file>";
|
||||
if (line2 == null) line2 = "<end of file>";
|
||||
System.out.println("Found difference in : " + file1 + ", " + file2);
|
||||
int diff = compare(line1, line2);
|
||||
System.out.println(" Line1: '" + line1.substring(0,diff) + "', '" + line1.substring(diff));
|
||||
System.out.println(" Line2: '" + line2.substring(0,diff) + "', '" + line2.substring(diff));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static void renameIdentical(String file2) {
|
||||
File foo = new File(file2);
|
||||
File newName = new File(foo.getParent(), "UNCHANGED-" + foo.getName());
|
||||
if (newName.exists()) {
|
||||
for (int i = 1; newName.exists(); ++i) {
|
||||
newName = new File(foo.getParent(), "UNCHANGED" + i + "-" + foo.getName());
|
||||
}
|
||||
}
|
||||
System.out.println("IDENTICAL TO PREVIOUS, RENAMING : " + foo);
|
||||
System.out.println("TO : " + newName);
|
||||
boolean renameResult = foo.renameTo(newName);
|
||||
if (!renameResult) System.out.println("Couldn't rename!");
|
||||
}
|
||||
|
||||
static String getLineWithoutFluff(BufferedReader br1, boolean first) throws IOException {
|
||||
while (true) {
|
||||
String line1 = br1.readLine();
|
||||
|
Loading…
Reference in New Issue
Block a user