More changes to check the boundary conditions

X-SVN-Rev: 9574
This commit is contained in:
Mark Davis 2002-08-04 21:38:45 +00:00
parent 1a7dc3a128
commit c0a9dd3bda
10 changed files with 620 additions and 194 deletions

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
* $Date: 2002/06/22 21:02:16 $
* $Revision: 1.16 $
* $Date: 2002/08/04 21:38:45 $
* $Revision: 1.17 $
*
*******************************************************************************
*/
@ -110,7 +110,7 @@ public final class DerivedProperty implements UCD_Types {
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
+ "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
if (ucdData.getDecompositionType(cp) == NONE) return false;
String norm = nfx.normalize(cp);
if (UTF16.countCodePoint(norm) != 1) return true;
@ -133,7 +133,7 @@ public final class DerivedProperty implements UCD_Types {
+ "\r\n# Characters that are cc==0, BUT which may interact with previous characters."
;
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
if (ucdData.getCombiningClass(cp) != 0) return false;
String norm = nfx.normalize(cp);
int first = UTF16.charAt(norm, 0);
@ -172,7 +172,7 @@ public final class DerivedProperty implements UCD_Types {
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
+ "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
boolean result = bitset.get(cp);
if (result && filter) {
result = (ucdData.getCombiningClass(cp) != 0) == keepNonZero;
@ -243,7 +243,7 @@ public final class DerivedProperty implements UCD_Types {
//if (cp >= 0xAC00 && cp <= 0xD7A3) return true;
//System.out.println(Utility.hex(cps) + " => " + Utility.hex(nf[i-4].normalize(cps)));
} // default
boolean hasValue(int cp) { return getValue(cp).length() != 0; }
public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
};
class CaseDProp extends UnicodeProperty {
@ -256,7 +256,7 @@ public final class DerivedProperty implements UCD_Types {
header = "# Derived Property: " + name
+ "\r\n# Generated from: NFKD has >0 " + CaseNames[i-Missing_Uppercase] + ", no other cases";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
byte cat = ucdData.getCategory(cp);
if (cat == val
|| val != Lt && ucdData.getBinaryProperty(cp, Other_Uppercase)) return false;
@ -294,7 +294,7 @@ public final class DerivedProperty implements UCD_Types {
return getValue(cp, LONG);
}
boolean hasValue(int cp) { return getValue(cp).length() != 0; }
public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
};
{
@ -323,7 +323,7 @@ public final class DerivedProperty implements UCD_Types {
+ "\r\n# Characters that can start an identifier."
+ "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
return ucdData.isIdentifierStart(cp, false);
}
};
@ -338,7 +338,7 @@ public final class DerivedProperty implements UCD_Types {
+ "\r\n# Generated from: ID_Start + Mn+Mc+Nd+Pc"
+ "\r\n# NOTE: Cf characters should be filtered out.";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
return ucdData.isIdentifierContinue_NO_Cf(cp, false);
}
};
@ -354,7 +354,7 @@ public final class DerivedProperty implements UCD_Types {
+ "\r\n# NOTE: Does NOT remove the non-NFKx characters."
+ "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
return ucdData.isIdentifierStart(cp, true);
}
};
@ -371,7 +371,7 @@ public final class DerivedProperty implements UCD_Types {
+ "\r\n# NOTE: Does NOT remove the non-NFKx characters."
+ "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
return ucdData.isIdentifierContinue_NO_Cf(cp, true);
}
};
@ -384,7 +384,7 @@ public final class DerivedProperty implements UCD_Types {
header = "# Derived Property: " + name
+ "\r\n# Generated from: Sm + Other_Math";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
byte cat = ucdData.getCategory(cp);
if (cat == Sm
|| ucdData.getBinaryProperty(cp,Math_Property)) return true;
@ -400,7 +400,7 @@ public final class DerivedProperty implements UCD_Types {
header = "# Derived Property: " + name
+ "\r\n# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
byte cat = ucdData.getCategory(cp);
if (cat == Lu || cat == Ll || cat == Lt || cat == Lm || cat == Lo || cat == Nl
|| ucdData.getBinaryProperty(cp, Alphabetic)) return true;
@ -416,7 +416,7 @@ public final class DerivedProperty implements UCD_Types {
header = "# Derived Property: " + name
+ "\r\n# Generated from: Ll + Other_Lowercase";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
byte cat = ucdData.getCategory(cp);
if (cat == Ll
|| ucdData.getBinaryProperty(cp, Other_Lowercase)) return true;
@ -432,7 +432,7 @@ public final class DerivedProperty implements UCD_Types {
header = "# Derived Property: " + name
+ "\r\n# Generated from: Lu + Other_Uppercase";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
byte cat = ucdData.getCategory(cp);
if (cat == Lu
|| ucdData.getBinaryProperty(cp, Other_Uppercase)) return true;
@ -461,7 +461,7 @@ of characters, the first of which has a non-zero combining class.
+ ": Full Composition Exclusion"
+ "\r\n# Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
if (!ucdData.isRepresented(cp)) return false;
byte dtype = ucdData.getDecompositionType(cp);
if (dtype != CANONICAL) return false;
@ -488,7 +488,7 @@ of characters, the first of which has a non-zero combining class.
+ ": Full Composition Inclusion"
+ "\r\n# characters with Canonical Decompositions MINUS Full Composition Exclusion";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
if (!ucdData.isRepresented(cp)) return false;
byte dtype = ucdData.getDecompositionType(cp);
if (dtype != CANONICAL) return false;
@ -516,7 +516,7 @@ of characters, the first of which has a non-zero combining class.
if (c.equals(b)) return "";
return "FNC; " + Utility.hex(c);
} // default
boolean hasValue(int cp) { return getValue(cp).length() != 0; }
public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
};
dprops[FC_NFC_Closure] = new UnicodeProperty() {
@ -538,7 +538,7 @@ of characters, the first of which has a non-zero combining class.
if (c.equals(b)) return "";
return "FN; " + Utility.hex(c);
} // default
boolean hasValue(int cp) { return getValue(cp).length() != 0; }
public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
};
for (int i = QuickNFD; i <= QuickNFKC; ++i) {
@ -555,7 +555,7 @@ of characters, the first of which has a non-zero combining class.
+ "\r\n# Generated from <2060..206F, FFF0..FFFB, E0000..E0FFF>"
+ "\r\n# + Other_Default_Ignorable_Code_Point + (Cf + Cc + Cs - White_Space)";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
if (0x2060 <= cp && cp <= 0x206F || 0xFFF0 <= cp && cp <= 0xFFFB || 0xE0000 <= cp && cp <= 0xE0FFF) return true;
if (ucdData.getBinaryProperty(cp,Other_Default_Ignorable_Code_Point)) return true;
if (ucdData.getBinaryProperty(cp, White_space)) return false;
@ -573,7 +573,7 @@ of characters, the first of which has a non-zero combining class.
header = header = "# Binary Property";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
switch(cp) {
case 0x27: case 0x2019: case 0xAD: return true;
// case 0x2d: case 0x2010: case 0x2011:
@ -600,7 +600,7 @@ of characters, the first of which has a non-zero combining class.
+ "\r\n# - has no combining marks with zero canonical combining class"
;
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
if (hasSoftDot(cp)) return true;
if (Default.nfkd.isNormalized(cp)) return false;
String decomp = Default.nfd.normalize(cp);
@ -629,7 +629,7 @@ of characters, the first of which has a non-zero combining class.
header = header = "# Derived Property: " + name
+ "\r\n# Generated from: Other_Case_Ignorable + Lm + Mn + Me + Cf";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
byte cat = ucdData.getCategory(cp);
if (cat == Lm || cat == Cf || cat == Mn || cat == Me) return true;
if (dprops[Other_Case_Ignorable].hasValue(cp)) return true;
@ -654,7 +654,7 @@ of characters, the first of which has a non-zero combining class.
+ "\r\n# (CGJ = U+034F)";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
if (cp == 0x034F) return false;
if (ucdData.getBinaryProperty(cp, GraphemeLink)) return false;
byte cat = ucdData.getCategory(cp);
@ -674,7 +674,7 @@ of characters, the first of which has a non-zero combining class.
+ "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp"
+ "\r\n# - Grapheme_Extend - Grapheme_Link - CGJ";
}
boolean hasValue(int cp) {
public boolean hasValue(int cp) {
if (cp == 0x034F) return false;
byte cat = ucdData.getCategory(cp);
if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn || cat == Zl || cat == Zp

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
* $Date: 2002/07/30 09:56:41 $
* $Revision: 1.8 $
* $Date: 2002/08/04 21:38:45 $
* $Revision: 1.9 $
*
*******************************************************************************
*/
@ -275,6 +275,11 @@ public final class GenerateHanTransliterator implements UCD_Types {
log = Utility.openPrintWriter("Transliterate_log.txt", Utility.UTF8_WINDOWS);
log.print('\uFEFF');
log.println();
log.println("@*Override Data");
log.println();
readOverrides(type);
log.println();
log.println("@*DICT Data");
log.println();
@ -426,7 +431,27 @@ public final class GenerateHanTransliterator implements UCD_Types {
System.out.println("Defined Count: " + count);
log.println();
log.println("@Duplicates");
log.println("@Duplicates (Frequency Order");
log.println();
it = rankList.iterator();
while (it.hasNext()) {
String word = (String) it.next();
Collection dups = (Collection) duplicates.get(word);
if (dups == null) continue;
log.print(hex.transliterate(word) + "\t" + word + "\t");
Iterator it2 = dups.iterator();
boolean gotFirst = false;
while (it2.hasNext()) {
if (!gotFirst) gotFirst = true;
else log.print(", ");
log.print(it2.next());
}
if (overrideSet.contains(word)) log.print(" *override*");
log.println();
}
log.println();
log.println("@Duplicates (Character Order)");
log.println();
it = duplicates.keySet().iterator();
while (it.hasNext()) {
@ -440,6 +465,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
else log.print(", ");
log.print(it2.next());
}
if (overrideSet.contains(word)) log.print(" *override*");
log.println();
}
@ -536,13 +562,19 @@ public final class GenerateHanTransliterator implements UCD_Types {
int overallRank = 0;
it = combinedRank.iterator();
log.println();
log.println("@Frequency data: Rank of Character");
log.println();
boolean showFrequency = false;
if (showFrequency) {
log.println();
log.println("@Frequency data: Rank of Character");
log.println();
}
// make up rankMap, rankList
while(it.hasNext()) {
Pair p = (Pair) it.next();
log.println(p.first + ", " + p.second);
if (showFrequency) log.println(p.first + ", " + p.second);
Object rank = rankMap.get(p.second);
if (rank == null) {
rankMap.put(p.second, new Integer(++overallRank));
@ -550,16 +582,18 @@ public final class GenerateHanTransliterator implements UCD_Types {
}
}
log.println();
log.println("@Frequency data: Character to Rank");
log.println();
// get full order
it = rankList.iterator();
while (it.hasNext()) {
Comparable key = (Comparable) it.next();
Comparable val = (Comparable) rankMap.get(key);
log.println(key + ", " + val);
if (showFrequency) {
log.println();
log.println("@Frequency data: Character to Rank");
log.println();
// get full order
it = rankList.iterator();
while (it.hasNext()) {
Comparable key = (Comparable) it.next();
Comparable val = (Comparable) rankMap.get(key);
log.println(key + ", " + val);
}
}
} catch (Exception e) {
@ -712,6 +746,38 @@ public final class GenerateHanTransliterator implements UCD_Types {
}
}
/**
 * Reads the override dictionary "Chinese_override.txt" and registers each entry.
 *
 * Does nothing unless {@code type} is CHINESE. Every non-empty data line is split
 * on tabs: the text before the first tab is skipped, the text between the first
 * and second tab is the word, and the rest of the line is the definition. Each
 * word is passed to addCheck and recorded in overrideSet.
 *
 * @param type the transliterator type being generated; only CHINESE has overrides
 * @throws IOException if the file cannot be opened or the reader cannot be closed
 * @throws ChainException if reading or parsing a line fails; the message carries
 *         the running line counter and the offending line
 */
static void readOverrides(int type) throws IOException {
    if (type != CHINESE) return;
    String fname = "Chinese_override.txt";
    System.out.println("Reading " + fname);
    BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, true);
    int counter = 0;
    String line = "";
    try {
        while (true) {
            line = Utility.readDataLine(br);
            if (line == null) break;
            if (line.length() == 0) continue;
            Utility.dot(counter++);
            // skip code
            int wordStart = line.indexOf('\t') + 1;
            int wordEnd = line.indexOf('\t', wordStart);
            String word = line.substring(wordStart, wordEnd);
            String definition = line.substring(wordEnd+1);
            addCheck(word, definition, line);
            overrideSet.add(word);
        }
    } catch (Exception e) {
        throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e);
    } finally {
        // Release the file handle on the failure path too, not only after a clean read.
        if (br != null) br.close();
    }
}
static Set overrideSet = new HashSet();
static void processEdict(String word, String definition, String line) {
// We have a situation where we have words of the form CCCHHHKKKCCHHCCH > HHHHHHKKKHHHHHHHH
// C = CJK, H = Hiragana, K = katakana

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java,v $
* $Date: 2002/07/30 09:57:18 $
* $Revision: 1.1 $
* $Date: 2002/08/04 21:38:45 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
@ -21,61 +21,126 @@ import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
public class GenerateLineBreakTest implements UCD_Types {
static String[] samples = new String[LB_LIMIT + 3];
static byte[] TROrder = {
// COMMON STUFF for Hangul
static final byte hNot = -1, hL = 0, hV = 1, hT = 2, hLV = 3, hLVT = 4, hLIMIT = 5;
static final String[] hNames = {"L", "V", "T", "LV", "LVT"};
/**
 * Maps a code point to one of the Hangul type constants.
 *
 * The checks run in a fixed order: leading jamo (hL), vowel jamo (hV),
 * trailing jamo (hT), then Hangul syllable. A syllable for which
 * Default.ucd.isDoubleHangul is true yields hLV, any other syllable hLVT.
 *
 * @param cp the code point to classify
 * @return hL, hV, hT, hLV or hLVT, or hNot when none of the checks match
 */
static byte getHangulType(int cp) {
    byte kind = hNot;
    if (Default.ucd.isLeadingJamo(cp)) {
        kind = hL;
    } else if (Default.ucd.isVowelJamo(cp)) {
        kind = hV;
    } else if (Default.ucd.isTrailingJamo(cp)) {
        kind = hT;
    } else if (Default.ucd.isHangulSyllable(cp)) {
        kind = Default.ucd.isDoubleHangul(cp) ? hLV : hLVT;
    }
    return kind;
}
//============================
protected String rule;
protected String fileName = "Line";
// all the other items are supplied in UCD_TYPES
static byte LB_L = LB_LIMIT + hL, LB_V = LB_LIMIT + hV, LB_T = LB_LIMIT + hT,
LB_LV = LB_LIMIT + hLV, LB_LVT = LB_LIMIT + hLVT, LB_SUP = LB_LIMIT + hLIMIT,
LB2_LIMIT = (byte)(LB_SUP + 1);
String[] samples = new String[100];
byte[] TypeOrder = {
LB_OP, LB_CL, LB_QU, LB_GL, LB_NS, LB_EX, LB_SY, LB_IS, LB_PR, LB_PO,
LB_NU, LB_AL, LB_ID, LB_IN, LB_HY, LB_BA, LB_BB, LB_B2, LB_ZW, LB_CM,
// missing from Pair Table
LB_SP, LB_BK, LB_CR, LB_LF,
// resolved types below
LB_CB, LB_AI, LB_SA, LB_SG, LB_XX,
// 3 JAMO CLASSES
29, 30, 31
// 3 JAMO CLASSES, plus supplementary
LB_L, LB_V, LB_T, LB_LV, LB_LVT, LB_SUP
};
static final int TABLE_LIMIT = 25;
public static void main(String[] args) throws IOException {
Default.setUCD();
new GenerateLineBreakTest().run();
new GenerateWordBreakTest().run();
}
// stuff that subclasses need to override
public void run() throws IOException {
findSamples();
// test individual cases
//printLine(out, samples[LB_ZW], "", samples[LB_CL]);
//printLine(out, samples[LB_ZW], " ", samples[LB_CL]);
PrintWriter out = Utility.openPrintWriter("LineBreakTest.html", Utility.UTF8_WINDOWS);
out.println("<html><body><h1>Current (fixed only for consistency):</h1>");
PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest.html", Utility.UTF8_WINDOWS);
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>"
+ fileName + "</title></head>");
out.println("<body bgcolor='#FFFFFF'><h3>Current (fixed only for consistency):</h3>");
generateTable(out, false);
out.println("<h1>Recommended:</h1>");
out.println("<h3>Recommended:</h3>");
generateTable(out, true);
out.println("</body></html>");
out.close();
String[] testCase = new String[50];
// do main test
for (int k = 0; k < 2; ++k) {
out = Utility.openPrintWriter(k == 0 ? "LineBreakTest_SHORT.txt" : "LineBreakTest.txt", Utility.UTF8_WINDOWS);
out = Utility.openPrintWriter(fileName + (k == 0 ? "Test_SHORT.txt" : "Test.txt"), Utility.LATIN1_WINDOWS);
int counter = 0;
out.println("# Default Linebreak conformance test");
out.println("# " + Default.getDate() + ", MED");
out.println("# Default " + fileName + " Break Test");
out.println("# Generated: " + Default.getDate() + ", MED");
out.println("#");
out.println("# Format:");
out.println("# <string> (# <comment>)? ");
out.println("# <string> contains hex Unicode code points, with ");
out.println("#\t" + BREAK + " wherever there is a break opportunity, and ");
out.println("#\t" + NOBREAK + " wherever there is not.");
out.println("# <comment> the format can change, but currently it shows:");
out.println("#\t- the sample character name");
out.println("#\t- (x) the line_break property* for the sample character");
out.println("#\t- [x] the rule that determines whether there is a break or not");
out.println("#");
out.println("# Samples:");
out.println("# The test currently takes all pairs of linebreak types*,");
out.println("# picks a sample for each type, and generates three strings: ");
out.println("#\t- the pair alone");
out.println("#\t- the pair alone with an imbeded space");
out.println("#\t- the pair alone with embedded combining marks");
out.println("# The sample for each type is simply the first code point (above NULL)");
out.println("# with that property.");
out.println("# * Note:");
out.println("#\t- SG is omitted");
out.println("#\t- 3 different Jamo characters and a supplementary character are added");
out.println("#\t The syllable types for the Jamo (L, V, T) are displayed in comments");
out.println("#\t instead of the linebreak property");
out.println("# These samples may be extended in the future.");
out.println("#");
for (int ii = 0; ii < samples.length; ++ii) {
int i = TROrder[ii];
for (int ii = 0; ii < getLimit(); ++ii) {
int i = TypeOrder[ii];
if (i == LB_SG) continue;
String before = samples[i];
for (int jj = 0; jj < samples.length; ++jj) {
Utility.dot(counter++);
int j = TROrder[jj];
for (int jj = 0; jj < getLimit(); ++jj) {
Utility.dot(counter);
int j = TypeOrder[jj];
if (j == LB_SG) continue;
String after = samples[j];
// do line straight
printLine(out, before, "", after, k != 0);
printLine(out, before, " ", after, k != 0);
printLine(out, before, "\u0301\u0308", after, k != 0);
int len = genTestItems(before, after, testCase);
for (int q = 0; q < len; ++q) {
printLine(out, testCase[q], k != 0 && q == 0, false);
++counter;
}
}
}
out.println("# Lines: " + counter);
@ -83,25 +148,80 @@ public class GenerateLineBreakTest implements UCD_Types {
}
}
public static void generateTable(PrintWriter out, boolean recommended) {
out.print("<table border='1' cellspacing='0'><tr><th></th>");
for (int i = 0; i < TABLE_LIMIT; ++i) {
String h = getLBID(samples[TROrder[i]]);
out.print("<th>" + h + "</th>");
// stuff that subclasses need to override
/**
 * Fills {@code results} with the test strings generated for one pair of samples.
 *
 * Three strings are produced, in this order: the pair alone, the pair with a
 * space between, and the pair with U+0301 U+0308 between.
 *
 * @param before sample string placed first
 * @param after sample string placed last
 * @param results output array; slots 0..2 are overwritten
 * @return the number of slots written (3)
 */
public int genTestItems(String before, String after, String[] results) {
    final String[] fillers = {"", " ", "\u0301\u0308"};
    for (int slot = 0; slot < fillers.length; ++slot) {
        results[slot] = before + fillers[slot] + after;
    }
    return fillers.length;
}
// stuff that subclasses need to override
/**
 * Tells generateTable which types to leave out of the pair table.
 *
 * @param type a type value taken from TypeOrder
 * @return true for LB_AI, LB_SA, LB_SG and LB_XX; false for everything else
 */
boolean skipType(byte type) {
    if (type == LB_AI) return true;
    if (type == LB_SA) return true;
    if (type == LB_SG) return true;
    return type == LB_XX;
}
// stuff that subclasses need to override
/**
 * Returns the short display name of the type that getType assigns to a code point.
 *
 * LB_SUP is shown as "SUP". Other values at or above LB_LIMIT are looked up in
 * hNames. Everything else is resolved through
 * Default.ucd.getLineBreakID_fromIndex.
 *
 * @param cp the code point to describe
 * @return the type name used in tables and test comments
 */
public String getTypeID(int cp) {
    final byte type = getType(cp);
    if (type == LB_SUP) {
        return "SUP";
    }
    if (type < LB_LIMIT) {
        return Default.ucd.getLineBreakID_fromIndex(type);
    }
    return hNames[type - LB_LIMIT];
}
// stuff that subclasses need to override
/**
 * Returns the extended type of a code point as used by this generator.
 *
 * Code points above U+FFFF map to LB_SUP. Code points with a Hangul type
 * (see getHangulType) map to LB_LIMIT plus that Hangul type. All others get
 * the value of Default.ucd.getLineBreak.
 *
 * @param cp the code point to classify
 * @return the type value, usable as an index into samples
 */
public byte getType(int cp) {
    if (cp > 0xFFFF) {
        return LB_SUP;
    }
    final byte hangul = getHangulType(cp);
    if (hangul == hNot) {
        return Default.ucd.getLineBreak(cp);
    }
    return (byte)(hangul + LB_LIMIT);
}
/**
 * Returns the number of TypeOrder entries that run() iterates over when it
 * generates the before/after test pairs. Here that is LB2_LIMIT.
 */
public int getLimit() {
return LB2_LIMIT;
}
/**
 * Returns the number of TypeOrder entries that generateTable iterates over for
 * its rows and columns. Here that is LB_SUP, so the loop stops before the
 * index LB_SUP itself.
 */
public int getTableLimit() {
return LB_SUP; // skip last;
}
public void generateTable(PrintWriter out, boolean recommended) {
String width = "width='" + (100 / (getTableLimit() + 1)) + "%'";
out.print("<table border='1' cellspacing='0'><tr><th " + width + "></th>");
byte type;
for (int i = 0; i < getTableLimit(); ++i) {
type = TypeOrder[i];
if (skipType(type)) continue;
String h = getTypeID(samples[TypeOrder[i]]);
out.print("<th " + width + ">" + h + "</th>");
}
out.print("</tr>");
String[] rule = new String[1];
String[] rule2 = new String[1];
for (int i = 0; i < TABLE_LIMIT; ++i) {
String before = samples[TROrder[i]];
String line = "<tr><th>" + getLBID(before) + "</th>";
for (int j = 0; j < TABLE_LIMIT; ++j) {
String after = samples[TROrder[j]];
for (int i = 0; i < getTableLimit(); ++i) {
type = TypeOrder[i];
if (skipType(type)) continue;
String before = samples[type];
String line = "<tr><th>" + getTypeID(before) + "</th>";
for (int j = 0; j < getTableLimit(); ++j) {
type = TypeOrder[j];
if (skipType(type)) continue;
String after = samples[type];
String t = getTableEntry(before, after, recommended, rule);
String background = "";
if (recommended) {
String t2 = getTableEntry(before, after, false, rule2);
if (!t.equals(t2)) background = " bgcolor='#FFFF00'";
String t2 = getTableEntry(before, after, !recommended, rule2);
if (!t.equals(t2)) {
if (t.equals(NOBREAK)) {
background = " bgcolor='#CCFFFF'";
} else {
background = " bgcolor='#FFFF00'";
}
} else if (t.equals(NOBREAK)) {
background = " bgcolor='#CCCCFF'";
}
line += "<th title='" + rule[0] + "'" + background + ">" + t + "</th>";
}
@ -110,7 +230,7 @@ public class GenerateLineBreakTest implements UCD_Types {
out.println("</table>");
}
public static String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
String t = "_";
boolean spaceBreak = isBreak(before + " " + after, before.length() + 1, recommended);
String spaceRule = rule;
@ -137,75 +257,83 @@ public class GenerateLineBreakTest implements UCD_Types {
return t;
}
static final String BREAK = "\u00F7";
static final String NOBREAK = "\u00D7";
public static void printLine(PrintWriter out, String before, String filler, String after, boolean comments) {
String s = before + filler + after;
int offset = before.length() + filler.length();
public void printLine(PrintWriter out, String source, boolean comments, boolean recommended) {
int cp;
StringBuffer string = new StringBuffer();
StringBuffer comment = new StringBuffer("\t# ");
String status = isBreak(source, 0, recommended) ? BREAK : NOBREAK;
string.append(status);
comment.append(' ').append(status).append(" [").append(rule).append(']');
boolean lb = isBreak(s, offset, false);
String tlb = (lb ? "b" : "n");
String comment = "";
if (comments) comment =
" # " + getLBID(before + filler)
+ " " + tlb
+ " " + getLBID(after)
+ " # " + Default.ucd.getName(before + filler)
+ " " + tlb
+ " " + Default.ucd.getName(after);
for (int offset = 0; offset < source.length(); offset += UTF16.getCharCount(cp)) {
out.println(Utility.hex(before + filler)
+ "; " + tlb
+ "; " + Utility.hex(after)
+ comment);
cp = UTF16.charAt(source, offset);
if (string.length() > 0) {
string.append(' ');
comment.append(' ');
}
string.append(Utility.hex(cp));
comment.append(Default.ucd.getName(cp) + " (" + getTypeID(cp) + ")");
status = isBreak(source, offset + UTF16.getCharCount(cp), recommended) ? BREAK : NOBREAK;
string.append(' ').append(status);
comment.append(' ').append(status).append(" [").append(rule).append(']');
}
if (comments) string.append(comment);
out.println(string);
}
public static void findSamples() {
public void findSamples() {
for (int i = 1; i <= 0x10FFFF; ++i) {
if (!Default.ucd.isAllocated(i)) continue;
if (Default.ucd.isLeadingJamo(i)
|| Default.ucd.isVowelJamo(i)
|| Default.ucd.isTrailingJamo(i)) continue;
byte lb = Default.ucd.getLineBreak(i);
if (0xD800 <= i && i <= 0xDFFF) continue;
if(i == 0x1100) {
System.out.print("here");
}
byte lb = getType(i);
if (samples[lb] == null) {
samples[lb] = UTF16.valueOf(i);
}
}
// fill the last with special cases
samples[LB_LIMIT] = "\u1100";
samples[LB_LIMIT+1] = "\u1162";
samples[LB_LIMIT+2] = "\u11A8";
for (int i = 0; i < TypeOrder.length; ++i) {
String sample = samples[i];
System.out.println(getTypeID(sample) + ":\t" + Default.ucd.getCodeAndName(sample));
}
}
public static String getLBID(String s) {
if (s.length() == 1) return Default.ucd.getLineBreakID(s.charAt(0));
public String getTypeID(String s) {
if (s == null) return "<null>";
if (s.length() == 1) return getTypeID(s.charAt(0));
StringBuffer result = new StringBuffer();
int cp;
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
cp = UTF32.char32At(s, i);
if (i > 0) result.append(" ");
result.append(Default.ucd.getLineBreakID(cp));
result.append(getTypeID(cp));
}
return result.toString();
}
static String rule;
public static int findLastNon(String source, int offset, byte notLBType) {
public int findLastNon(String source, int offset, byte notLBType, boolean recommended) {
int cp;
for (int i = offset-2; i >= 0; i -= UTF16.getCharCount(cp)) {
for (int i = offset-1; i >= 0; i -= UTF16.getCharCount(cp)) {
cp = UTF16.charAt(source, i);
byte f = getResolvedLB(cp);
if (f != notLBType) return cp;
byte f = getResolvedType(cp, recommended);
if (f != notLBType) return i;
}
return 0;
return -1;
}
public static byte getResolvedLB (int cp) {
public byte getResolvedType (int cp, boolean recommended) {
// LB 1 Assign a line break category to each character of the input.
// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
byte result = Default.ucd.getLineBreak(cp);
byte result = getType(cp);
switch (result) {
case LB_AI: result = LB_AI; break;
// case LB_CB: result = LB_ID; break;
@ -213,17 +341,31 @@ public class GenerateLineBreakTest implements UCD_Types {
// case LB_SG: result = LB_XX; break; Surrogates; will never occur
case LB_XX: result = LB_AL; break;
}
if (recommended) {
if (getHangulType(cp) != hNot) {
result = LB_ID;
}
}
return result;
}
/**
 * Checks whether a UTF-16 offset falls between code points rather than inside
 * a surrogate pair.
 *
 * @param s the string being examined
 * @param offset a char offset into {@code s}
 * @return false when the offset is outside 0..s.length(), or when it sits
 *         between a lead surrogate and the trail surrogate that follows it;
 *         true otherwise (including the start and end of the string)
 */
public boolean onCodepointBoundary(String s, int offset) {
    final int length = s.length();
    if (offset < 0 || offset > length) {
        return false;
    }
    // The two ends are always boundaries; charAt must not be called there.
    final boolean atEdge = (offset == 0) || (offset == length);
    if (atEdge) {
        return true;
    }
    final boolean splitsPair = UTF16.isLeadSurrogate(s.charAt(offset - 1))
            && UTF16.isTrailSurrogate(s.charAt(offset));
    return !splitsPair;
}
// find out whether there is a break at offset
// WARNING: as a side effect, sets "rule"
public static boolean isBreak(String source, int offset, boolean recommended) {
public boolean isBreak(String source, int offset, boolean recommended) {
// LB 1 Assign a line break category to each character of the input.
// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
// this is taken care of in the getResolvedLB function
// this is taken care of in the getResolvedType function
// LB 2a Never break at the start of text
@ -237,8 +379,7 @@ public class GenerateLineBreakTest implements UCD_Types {
// UTF-16: never break in the middle of a code point
if (UTF16.isLeadSurrogate(source.charAt(offset-1))
&& UTF16.isTrailSurrogate(source.charAt(offset))) return false;
if (!onCodepointBoundary(source, offset)) return false;
// now get the character before and after, and their types
@ -247,8 +388,8 @@ public class GenerateLineBreakTest implements UCD_Types {
int cpBefore = UTF16.charAt(source, offset-1);
int cpAfter = UTF16.charAt(source, offset);
byte before = getResolvedLB(cpBefore);
byte after = getResolvedLB(cpAfter);
byte before = getResolvedType(cpBefore, recommended);
byte after = getResolvedType(cpAfter, recommended);
rule="3a";
@ -276,22 +417,21 @@ public class GenerateLineBreakTest implements UCD_Types {
// LB 6 Don't break graphemes (before combining marks, around virama, or on sequences of conjoining Jamos).
rule="6";
if (after == LB_CM) return false;
if (Default.ucd.isLeadingJamo(cpBefore)) {
if (Default.ucd.isLeadingJamo(cpAfter) || Default.ucd.isVowelJamo(cpAfter)) return false;
} else if (Default.ucd.isVowelJamo(cpBefore)) {
if (Default.ucd.isVowelJamo(cpAfter) || Default.ucd.isTrailingJamo(cpAfter)) return false;
} else if (Default.ucd.isTrailingJamo(cpBefore)) {
if (Default.ucd.isTrailingJamo(cpAfter)) return false;
}
if (before == LB_L && (after == LB_L || after == LB_V || after == LB_LV || after == LB_LVT)) return false;
if ((before == LB_LV || before == LB_V) && (after == LB_V || after == LB_T)) return false;
if ((before == LB_LVT || before == LB_T) && (after == LB_T)) return false;
boolean setBase = false;
if (before == LB_CM) {
setBase = true;
int cp = findLastNon(source, offset, LB_CM);
if (cp == 0) {
int backOffset = findLastNon(source, offset, LB_CM, recommended);
if (backOffset < 0) {
before = LB_ID;
} else {
before = getResolvedLB(cp);
before = getResolvedType(UTF16.charAt(source, backOffset), recommended);
}
}
@ -310,9 +450,9 @@ public class GenerateLineBreakTest implements UCD_Types {
// find the last non-space character; we will need it
byte lastNonSpace = before;
if (lastNonSpace == LB_SP) {
int cp = findLastNon(source, offset, LB_CM);
if (cp != 0) {
lastNonSpace = getResolvedLB(cp);
int backOffset = findLastNon(source, offset, LB_CM, recommended);
if (backOffset >= 0) {
lastNonSpace = getResolvedType(UTF16.charAt(source, backOffset), recommended);
}
}
@ -476,4 +616,162 @@ public class GenerateLineBreakTest implements UCD_Types {
rule="20";
return true;
}
static class GenerateWordBreakTest extends GenerateLineBreakTest {
static final byte CR = 0, LF = 1, Control = 2, Extend = 3, Link = 4, CGJ = 5, Base = 6, LetterBase = 7, Other = 8,
oLIMIT = 9, // RESET THIS IF LIST ABOVE CHANGES!
L = oLIMIT + hL, V = oLIMIT + hV, T = oLIMIT + hT, LV = oLIMIT + hLV, LVT = oLIMIT + hLVT,
LIMIT = LVT + 1;
static final String[] Names = {"CR", "LF", "CTL", "Extend", "Link", "CGJ", "Base", "LetterBase", "Other" };
static UnicodeProperty extendProp = UnifiedBinaryProperty.make(DERIVED | GraphemeExtend);
static UnicodeProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase);
static UnicodeProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink);
{
fileName = "Word";
TypeOrder = new byte[LIMIT];
for (byte i = 0; i < TypeOrder.length; ++i) {
TypeOrder[i] = i;
}
}
/**
 * Override for this subclass: no type is skipped, so every type reaches
 * whatever code consults skipType.
 */
boolean skipType(byte type) {
return false;
}
/**
 * Returns LIMIT, the number of types defined by this subclass (LVT + 1).
 */
public int getLimit() {
return LIMIT;
}
/**
 * Returns LIMIT, the same value as getLimit(), so no trailing type is left
 * out of the table.
 */
public int getTableLimit() {
return LIMIT;
}
// stuff that subclasses need to override
/**
 * Produces the single test string for a pair of samples: the two samples
 * concatenated with nothing between them.
 *
 * @param before sample string placed first
 * @param after sample string placed last
 * @param results output array; only slot 0 is written
 * @return the number of slots written (1)
 */
public int genTestItems(String before, String after, String[] results) {
    final String pair = before + after;
    results[0] = pair;
    return 1;
}
/**
 * Computes one table cell: whether a break is allowed between two samples.
 *
 * @param before sample string placed first
 * @param after sample string placed last
 * @param recommended passed through to isBreak to select the rule set
 * @param ruleOut one-element output array; slot 0 receives the rule that decided the result
 * @return BREAK if a break is allowed at the junction, NOBREAK otherwise
 */
public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
    boolean normalBreak = isBreak(before + after, before.length(), recommended);
    // isBreak sets the "rule" field as a side effect, so it must be read after the call.
    ruleOut[0] = rule;
    return normalBreak ? BREAK : NOBREAK;
}
// stuff that subclasses need to override
/**
 * Returns the display name of the type that getType assigns to a code point.
 *
 * Types below oLIMIT are looked up in Names. Types at or above oLIMIT are the
 * Hangul types and are looked up in hNames.
 *
 * @param cp the code point to describe
 * @return the type name used in tables and test comments
 */
public String getTypeID(int cp) {
    final byte kind = getType(cp);
    return kind < oLIMIT ? Names[kind] : hNames[kind - oLIMIT];
}
// stuff that subclasses need to override
/**
 * Classifies a code point into one of this subclass's types.
 *
 * The first matching test wins, in this order:
 * individual code points (U+000A, U+000D, U+034F, U+2028/U+2029), Hangul
 * types, general category (Cc, Cf, letters via LETTER_MASK), then the
 * link / extend / base binary properties. Anything left is Other.
 *
 * @param cp the code point to classify
 * @return the type constant for {@code cp}
 */
public byte getType(int cp) {
    // single characters
    switch (cp) {
        case 0xA:
            return LF;
        case 0xD:
            return CR;
        case 0x034F:
            return CGJ;
        case 0x2028:
        case 0x2029:
            return Control;
    }

    // Hangul
    final byte hangul = getHangulType(cp);
    if (hangul != hNot) {
        return (byte)(hangul + oLIMIT);
    }

    // category based
    final byte category = Default.ucd.getCategory(cp);
    if (category == Cc) {
        return Control;
    }
    if (category == Cf) {
        return Extend;
    }
    final boolean isLetter = ((1<<category) & LETTER_MASK) != 0;
    if (isLetter) {
        return LetterBase;
    }

    // other binary properties
    if (linkProp.hasValue(cp)) {
        return Link;
    }
    if (extendProp.hasValue(cp)) {
        return Extend;
    }
    return baseProp.hasValue(cp) ? Base : Other;
}
public byte getResolvedType(int cp, boolean recommended) {
return getType(cp);
}
public boolean isBreak(String source, int offset, boolean recommended) {
rule="1";
if (offset < 0 || offset > source.length()) return false;
if (offset == 0) return true;
rule = "2";
if (offset == source.length()) return true;
// UTF-16: never break in the middle of a code point
if (!onCodepointBoundary(source, offset)) return false;
// now get the character before and after, and their types
int cpBefore = UTF16.charAt(source, offset-1);
int cpAfter = UTF16.charAt(source, offset);
byte before = getResolvedType(cpBefore, recommended);
byte after = getResolvedType(cpAfter, recommended);
rule = "3";
if (before == CR && after == LF) return false;
rule = "4";
if (before == CR || before == LF || before == Control
|| after == Control || after == LF || after == CR) return true;
rule = "6";
if (before == L && (after == L || after == V || after == LV || after == LVT)) return false;
rule = "7";
if ((before == LV || before == V) && (after == V || after == T)) return false;
rule = "8";
if ((before == LVT || before == T) && (after == T)) return false;
rule = "9";
if (after == Extend) return false;
if (recommended) {
if (after == Link || after == CGJ) return false;
} else {
// Do not break around a CGJ.
rule = "10";
if (before == CGJ && (after == Base
|| after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT)) return false;
rule = "11";
if (after == CGJ) return false;
// Do not break between linking characters and letters, or before linking characters. This provides for Indic graphemes, where virama (halant) will link character clusters together.
rule = "12";
//Link Extend* × LetterBase (12)
if (after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT) {
int backOffset = findLastNon(source, offset, Extend, recommended);
if (backOffset >= 0) {
byte last = getResolvedType(UTF16.charAt(source, backOffset), recommended);
if (last == Link) return false;
}
}
rule = "13";
if (after == Link) return false;
}
// Otherwise break after all characters.
rule = "14";
return true;
}
}
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java,v $
* $Date: 2002/07/30 09:56:41 $
* $Revision: 1.2 $
* $Date: 2002/08/04 21:38:45 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -23,20 +23,23 @@ public class GenerateThaiBreaks {
BufferedReader br = new BufferedReader(
new InputStreamReader(
new FileInputStream("\\icu4j\\src\\data\\thai6.ucs"), "UnicodeLittle"));
new FileInputStream("c:\\icu4j\\src\\com\\ibm\\icu\\dev\\data\\thai6.ucs"), "UnicodeLittle"));
PrintWriter out = null;
try {
Default.setUCD();
UnicodeSet ignorables = new UnicodeSet(0xE30, 0xE3A);
UnicodeSet ignorables = new UnicodeSet();
/* new UnicodeSet(0xE30, 0xE3A);
ignorables.add(0x0E40, 0x0E44); // add logical order exception
ignorables.add(0x0E47, 0x0E4E);
*/
ignorables.add(0, ' '); // add controls
ignorables.add('.');
Set initials = new TreeSet();
Set finals = new TreeSet();
Set medials = new TreeSet();
UnicodeSet initials = new UnicodeSet();
UnicodeSet finals = new UnicodeSet();
UnicodeSet medials = new UnicodeSet();
char[] buffer = new char[100];
@ -60,34 +63,58 @@ public class GenerateThaiBreaks {
}
initials.add(temp.substring(0,1));
initials.add(temp.substring(0,2));
finals.add(temp.substring(temp.length()-2));
//initials.add(temp.substring(0,2));
finals.add(temp.substring(temp.length()-1));
//finals.add(temp.substring(temp.length()-1));
for (int i = 1; i < temp.length() - 3; ++i) {
medials.add(temp.substring(i, i+2));
for (int i = 1; i < temp.length() - 1; ++i) {
//medials.add(temp.substring(i, i+2));
medials.add(temp.substring(i, i+1));
}
medials.add(temp.substring(temp.length() - 2, temp.length() - 1));
//medials.add(temp.substring(temp.length() - 2, temp.length() - 1));
}
System.out.println("initials size: " + initials.size());
System.out.println("finals size: " + finals.size());
System.out.println("medials size: " + medials.size());
//out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS);
// out.write('\uFEFF');
UnicodeSet marks = new UnicodeSet("[[\u0e00-\u0e7f]&[[:mn:][:me:]]]");
finals.addAll(marks);
UnicodeSet all = new UnicodeSet(initials).addAll(medials).addAll(finals);
UnicodeSet missingThai = new UnicodeSet("[[\u0e00-\u0e7f]-[:Cn:]]").removeAll(all);
System.out.println("Never occur: " + missingThai.toPattern(true));
Utility.showSetNames("", missingThai, true, Default.ucd);
System.out.println();
UnicodeSet neverInitial = new UnicodeSet(all).removeAll(initials);
UnicodeSet neverFinal = new UnicodeSet(all).removeAll(finals);
System.out.println("Never initial: " + neverInitial.toPattern(true));
Utility.showSetNames("", neverInitial, true, Default.ucd);
System.out.println();
System.out.println("Never final: " + neverFinal.toPattern(true));
Utility.showSetNames("", neverFinal, true, Default.ucd);
System.out.println();
initials.removeAll(medials);
finals.removeAll(medials);
System.out.println("initials size: " + initials.size());
System.out.println("finals size: " + finals.size());
out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS);
out.write('\uFEFF');
out.println("Only Initials");
Utility.print(out, initials, ", ", new MyBreaker());
out.println();
out.println("Only Finals");
Utility.print(out, finals, ", ", new MyBreaker());
System.out.println("Only Initials" + initials.toPattern(true));
Utility.showSetNames("", initials, true, Default.ucd);
System.out.println();
System.out.println("Only Finals" + finals.toPattern(true));
Utility.showSetNames("", finals, true, Default.ucd);
} finally {
br.close();
if (out != null) out.close();

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
* $Date: 2002/07/30 09:56:41 $
* $Revision: 1.19 $
* $Date: 2002/08/04 21:38:45 $
* $Revision: 1.20 $
*
*******************************************************************************
*/
@ -78,7 +78,7 @@ public final class Main implements UCD_Types {
else if (arg.equalsIgnoreCase("TestNormalization")) TestNormalization.main(null);
else if (arg.equalsIgnoreCase("linebreaktest")) GenerateLineBreakTest.main(null);
else if (arg.equalsIgnoreCase("breaktest")) GenerateBreakTest.main(null);
else if (arg.equalsIgnoreCase("genSplit")) GenerateData.genSplit();
else if (arg.equalsIgnoreCase("iana")) IANANames.testSensitivity();

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
* $Date: 2002/07/30 09:56:40 $
* $Revision: 1.16 $
* $Date: 2002/08/04 21:38:45 $
* $Revision: 1.17 $
*
*******************************************************************************
*/
@ -737,6 +737,10 @@ public final class UCD implements UCD_Types {
return UCD_Names.NT[prop];
}
/**
 * Looks up the name of a numeric-type value.
 *
 * @param prop  index of the numeric type
 * @param style SHORT selects UCD_Names.SHORT_NT; any other style selects UCD_Names.NT
 */
public static String getNumericTypeID_fromIndex(byte prop, byte style) {
    if (style == SHORT) {
        return UCD_Names.SHORT_NT[prop];
    }
    return UCD_Names.NT[prop];
}
/** Returns the East Asian width name for the given code point. */
public String getEastAsianWidthID(int codePoint) {
    final byte widthIndex = getEastAsianWidth(codePoint);
    return getEastAsianWidthID_fromIndex(widthIndex);
}
@ -745,6 +749,10 @@ public final class UCD implements UCD_Types {
return UCD_Names.EA[prop];
}
/**
 * Looks up the name of an East Asian width value.
 *
 * @param prop  index of the East Asian width
 * @param style LONG selects UCD_Names.EA; any other style selects UCD_Names.SHORT_EA
 */
public static String getEastAsianWidthID_fromIndex(byte prop, byte style) {
    if (style == LONG) {
        return UCD_Names.EA[prop];
    }
    return UCD_Names.SHORT_EA[prop];
}
/** Returns the line-break class name for the given code point. */
public String getLineBreakID(int codePoint) {
    final byte lineBreakIndex = getLineBreak(codePoint);
    return getLineBreakID_fromIndex(lineBreakIndex);
}
@ -753,6 +761,10 @@ public final class UCD implements UCD_Types {
return UCD_Names.LB[prop];
}
/**
 * Looks up the name of a line-break class.
 *
 * @param prop  index of the line-break class
 * @param style LONG selects UCD_Names.LONG_LB; any other style selects UCD_Names.LB
 */
public static String getLineBreakID_fromIndex(byte prop, byte style) {
    if (style == LONG) {
        return UCD_Names.LONG_LB[prop];
    }
    return UCD_Names.LB[prop];
}
/** Returns the joining-type name for the given code point. */
public String getJoiningTypeID(int codePoint) {
    final byte joiningTypeIndex = getJoiningType(codePoint);
    return getJoiningTypeID_fromIndex(joiningTypeIndex);
}
@ -761,6 +773,10 @@ public final class UCD implements UCD_Types {
return UCD_Names.JOINING_TYPE[prop];
}
/**
 * Looks up the name of a joining type.
 *
 * @param prop  index of the joining type
 * @param style LONG selects UCD_Names.LONG_JOINING_TYPE; any other style selects
 *              UCD_Names.JOINING_TYPE
 */
public static String getJoiningTypeID_fromIndex(byte prop, byte style) {
    if (style == LONG) {
        return UCD_Names.LONG_JOINING_TYPE[prop];
    }
    return UCD_Names.JOINING_TYPE[prop];
}
/** Returns the joining-group name for the given code point. */
public String getJoiningGroupID(int codePoint) {
    final byte joiningGroupIndex = getJoiningGroup(codePoint);
    return getJoiningGroupID_fromIndex(joiningGroupIndex);
}
@ -769,6 +785,11 @@ public final class UCD implements UCD_Types {
return UCD_Names.JOINING_GROUP[prop];
}
/**
 * Looks up the name of a joining group.
 *
 * @param prop  index of the joining group
 * @param style ignored: only one form of the name exists, so every style yields
 *              the same entry of UCD_Names.JOINING_GROUP
 */
public static String getJoiningGroupID_fromIndex(byte prop, byte style) {
    final String groupName = UCD_Names.JOINING_GROUP[prop];
    return groupName;
}
/** Returns the script name for the given code point. */
public String getScriptID(int codePoint) {
    final byte scriptIndex = getScript(codePoint);
    return getScriptID_fromIndex(scriptIndex);
}
@ -790,6 +811,11 @@ public final class UCD implements UCD_Types {
return UCD_Names.AGE[prop];
}
/**
 * Looks up the name of an age value.
 *
 * @param prop  index of the age
 * @param style ignored: there is no short form, so every style yields the same
 *              entry of UCD_Names.AGE
 */
public static String getAgeID_fromIndex(byte prop, byte style) {
    final String ageName = UCD_Names.AGE[prop];
    return ageName;
}
/**
 * Reports whether one binary property is set for a code point.
 *
 * @param codePoint the code point to test
 * @param bit       bit position of the property within the binary-properties mask
 * @return "Y" if the bit is set, otherwise "N"
 */
public String getBinaryPropertiesID(int codePoint, byte bit) {
    final boolean isSet = (getBinaryProperties(codePoint) & (1<<bit)) != 0;
    if (isSet) {
        return "Y";
    }
    return "N";
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
* $Date: 2002/07/30 09:56:40 $
* $Revision: 1.14 $
* $Date: 2002/08/04 21:38:45 $
* $Revision: 1.15 $
*
*******************************************************************************
*/
@ -22,6 +22,8 @@ public interface UCD_Types {
public static final String BIN_DIR = BASE_DIR + "BIN\\";
public static final String GEN_DIR = BASE_DIR + "GEN\\";
public static final char DOTTED_CIRCLE = '\u25CC';
public static final int
CJK_BASE = 0x4E00,
CJK_LIMIT = 0x9FFF+1,
@ -166,7 +168,10 @@ public interface UCD_Types {
CONTROL_MASK = (1<<Cc) | (1<<Cf) | (1<<Cs) | (1<<Co),
PUNCTUATION_MASK = (1<<Pc) | (1<<Pd) | (1<<Ps) | (1<<Pe) | (1<<Po) | (1<<Pi) | (1<<Pf),
SYMBOL_MASK = (1<<Sm) | (1<<Sc) | (1<<Sk) | (1<<So),
UNASSIGNED_MASK = (1<<Cn);
UNASSIGNED_MASK = (1<<Cn),
BASE_MASK = LETTER_MASK | NUMBER_MASK | PUNCTUATION_MASK | SYMBOL_MASK | (1<<Mc),
NONSPACING_MARK_MASK = (1<<Mn) | (1<<Me);
// Binary Properties

View File

@ -148,7 +148,7 @@ public abstract class UnicodeProperty implements UCD_Types {
/**
* Does it have the propertyValue?
*/
abstract boolean hasValue(int cp);
abstract public boolean hasValue(int cp);
/**
* Get the set of characters it contains

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java,v $
* $Date: 2002/07/03 02:15:47 $
* $Revision: 1.8 $
* $Date: 2002/08/04 21:38:44 $
* $Revision: 1.9 $
*
*******************************************************************************
*/
@ -299,26 +299,14 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
case COMBINING_CLASS>>8: return ucd.getCombiningClassID_fromIndex((byte)propValue, style);
case BIDI_CLASS>>8: return ucd.getBidiClassID_fromIndex((byte)propValue, style);
case DECOMPOSITION_TYPE>>8: return ucd.getDecompositionTypeID_fromIndex((byte)propValue, style);
case NUMERIC_TYPE>>8: if (propValue >= LIMIT_NUMERIC_TYPE) break;
if (style != SHORT) return ucd.getNumericTypeID_fromIndex((byte)propValue);
return UCD_Names.SHORT_NT[propValue];
case EAST_ASIAN_WIDTH>>8: if (propValue >= LIMIT_EAST_ASIAN_WIDTH) break;
if (style != LONG) return ucd.getEastAsianWidthID_fromIndex((byte)propValue);
return UCD_Names.SHORT_EA[propValue];
case LINE_BREAK>>8: if (propValue >= LIMIT_LINE_BREAK) break;
if (style != LONG) return ucd.getLineBreakID_fromIndex((byte)propValue);
return UCD_Names.LONG_LB[propValue];
case JOINING_TYPE>>8: if (propValue >= LIMIT_JOINING_TYPE) break;
if (style != LONG) return ucd.getJoiningTypeID_fromIndex((byte)propValue);
return UCD_Names.LONG_JOINING_TYPE[propValue];
case JOINING_GROUP>>8: if (propValue >= LIMIT_JOINING_GROUP) break;
return ucd.getJoiningGroupID_fromIndex((byte)propValue);
// Return the numeric-type name; without the return the result is discarded and control
// falls through to the EAST_ASIAN_WIDTH case.
case NUMERIC_TYPE>>8: return ucd.getNumericTypeID_fromIndex((byte)propValue, style);
case EAST_ASIAN_WIDTH>>8: return ucd.getEastAsianWidthID_fromIndex((byte)propValue);
case LINE_BREAK>>8: return ucd.getLineBreakID_fromIndex((byte)propValue, style);
case JOINING_TYPE>>8: return ucd.getJoiningTypeID_fromIndex((byte)propValue);
case JOINING_GROUP>>8: return ucd.getJoiningGroupID_fromIndex((byte)propValue);
case BINARY_PROPERTIES>>8: return ucd.getBinaryPropertiesID_fromIndex((byte)propValue, style);
case SCRIPT>>8: if (propValue >= LIMIT_SCRIPT) break;
if (style != SHORT) return ucd.getScriptID_fromIndex((byte)propValue);
return UCD_Names.ABB_SCRIPT[propValue];
case AGE>>8: if (propValue >= LIMIT_AGE) break;
return ucd.getAgeID_fromIndex((byte)propValue);
case SCRIPT>>8: return ucd.getScriptID_fromIndex((byte)propValue);
case AGE>>8: return ucd.getAgeID_fromIndex((byte)propValue);
/*
case DERIVED>>8:
UnicodeProperty up = DerivedProperty.make(propValue, ucd);

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
* $Date: 2002/07/30 09:56:41 $
* $Revision: 1.23 $
* $Date: 2002/08/04 21:38:44 $
* $Revision: 1.24 $
*
*******************************************************************************
*/
@ -17,9 +17,10 @@ import java.util.*;
import java.text.*;
import java.io.*;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UTF16;
import com.ibm.text.UCD.*;
public final class Utility { // COMMON UTILITIES
public final class Utility implements UCD_Types { // COMMON UTILITIES
static final boolean UTF8 = true; // TODO -- make argument
@ -470,7 +471,22 @@ public final class Utility { // COMMON UTILITIES
return quoteXML(source, false);
}
private static UnicodeProperty defaultIgnorable = null;
/**
 * Builds a displayable string for a code point.
 *
 * Mn and Me characters are prefixed with DOTTED_CIRCLE. Cf and Cc characters, U+034F,
 * U+00AD, U+1806, and anything for which the DefaultIgnorable derived property holds are
 * replaced by U+25A1. Every other code point is returned as its own UTF-16 string.
 *
 * The DefaultIgnorable property object is created on first need and cached in
 * defaultIgnorable.
 */
public static String getDisplay(int cp) {
    final String text = UTF16.valueOf(cp);
    final byte category = Default.ucd.getCategory(cp);

    // Nonspacing and enclosing marks get a dotted circle to sit on.
    if (category == Mn || category == Me) {
        return String.valueOf(DOTTED_CIRCLE) + text;
    }

    // Format/control characters and a few specific code points are shown as a box.
    final boolean isSpecialCodePoint = cp == 0x034F || cp == 0x00AD || cp == 0x1806;
    if (category == Cf || category == Cc || isSpecialCodePoint) {
        return "\u25A1";
    }

    // Remaining cases: consult the lazily created DefaultIgnorable property.
    if (defaultIgnorable == null) {
        defaultIgnorable = DerivedProperty.make(DefaultIgnorable);
    }
    if (defaultIgnorable.hasValue(cp)) {
        return "\u25A1";
    }
    return text;
}
public static int compare(char[] a, int aStart, int aEnd, char[] b, int bStart, int bEnd) {
while (aStart < aEnd && bStart < bEnd) {
int diff = a[aStart++] - b[bStart++];