More changes to check the boundary conditions
X-SVN-Rev: 9574
This commit is contained in:
parent
1a7dc3a128
commit
c0a9dd3bda
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
|
||||
* $Date: 2002/06/22 21:02:16 $
|
||||
* $Revision: 1.16 $
|
||||
* $Date: 2002/08/04 21:38:45 $
|
||||
* $Revision: 1.17 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -110,7 +110,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
|
||||
+ "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
if (ucdData.getDecompositionType(cp) == NONE) return false;
|
||||
String norm = nfx.normalize(cp);
|
||||
if (UTF16.countCodePoint(norm) != 1) return true;
|
||||
@ -133,7 +133,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
+ "\r\n# Characters that are cc==0, BUT which may interact with previous characters."
|
||||
;
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
if (ucdData.getCombiningClass(cp) != 0) return false;
|
||||
String norm = nfx.normalize(cp);
|
||||
int first = UTF16.charAt(norm, 0);
|
||||
@ -172,7 +172,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
|
||||
+ "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
boolean result = bitset.get(cp);
|
||||
if (result && filter) {
|
||||
result = (ucdData.getCombiningClass(cp) != 0) == keepNonZero;
|
||||
@ -243,7 +243,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
//if (cp >= 0xAC00 && cp <= 0xD7A3) return true;
|
||||
//System.out.println(Utility.hex(cps) + " => " + Utility.hex(nf[i-4].normalize(cps)));
|
||||
} // default
|
||||
boolean hasValue(int cp) { return getValue(cp).length() != 0; }
|
||||
public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
|
||||
};
|
||||
|
||||
class CaseDProp extends UnicodeProperty {
|
||||
@ -256,7 +256,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: NFKD has >0 " + CaseNames[i-Missing_Uppercase] + ", no other cases";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == val
|
||||
|| val != Lt && ucdData.getBinaryProperty(cp, Other_Uppercase)) return false;
|
||||
@ -294,7 +294,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
return getValue(cp, LONG);
|
||||
}
|
||||
|
||||
boolean hasValue(int cp) { return getValue(cp).length() != 0; }
|
||||
public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
|
||||
};
|
||||
|
||||
{
|
||||
@ -323,7 +323,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
+ "\r\n# Characters that can start an identifier."
|
||||
+ "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
return ucdData.isIdentifierStart(cp, false);
|
||||
}
|
||||
};
|
||||
@ -338,7 +338,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
+ "\r\n# Generated from: ID_Start + Mn+Mc+Nd+Pc"
|
||||
+ "\r\n# NOTE: Cf characters should be filtered out.";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
return ucdData.isIdentifierContinue_NO_Cf(cp, false);
|
||||
}
|
||||
};
|
||||
@ -354,7 +354,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
+ "\r\n# NOTE: Does NOT remove the non-NFKx characters."
|
||||
+ "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
return ucdData.isIdentifierStart(cp, true);
|
||||
}
|
||||
};
|
||||
@ -371,7 +371,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
+ "\r\n# NOTE: Does NOT remove the non-NFKx characters."
|
||||
+ "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
return ucdData.isIdentifierContinue_NO_Cf(cp, true);
|
||||
}
|
||||
};
|
||||
@ -384,7 +384,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: Sm + Other_Math";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Sm
|
||||
|| ucdData.getBinaryProperty(cp,Math_Property)) return true;
|
||||
@ -400,7 +400,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Lu || cat == Ll || cat == Lt || cat == Lm || cat == Lo || cat == Nl
|
||||
|| ucdData.getBinaryProperty(cp, Alphabetic)) return true;
|
||||
@ -416,7 +416,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: Ll + Other_Lowercase";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Ll
|
||||
|| ucdData.getBinaryProperty(cp, Other_Lowercase)) return true;
|
||||
@ -432,7 +432,7 @@ public final class DerivedProperty implements UCD_Types {
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: Lu + Other_Uppercase";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Lu
|
||||
|| ucdData.getBinaryProperty(cp, Other_Uppercase)) return true;
|
||||
@ -461,7 +461,7 @@ of characters, the first of which has a non-zero combining class.
|
||||
+ ": Full Composition Exclusion"
|
||||
+ "\r\n# Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
if (!ucdData.isRepresented(cp)) return false;
|
||||
byte dtype = ucdData.getDecompositionType(cp);
|
||||
if (dtype != CANONICAL) return false;
|
||||
@ -488,7 +488,7 @@ of characters, the first of which has a non-zero combining class.
|
||||
+ ": Full Composition Inclusion"
|
||||
+ "\r\n# characters with Canonical Decompositions MINUS Full Composition Exclusion";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
if (!ucdData.isRepresented(cp)) return false;
|
||||
byte dtype = ucdData.getDecompositionType(cp);
|
||||
if (dtype != CANONICAL) return false;
|
||||
@ -516,7 +516,7 @@ of characters, the first of which has a non-zero combining class.
|
||||
if (c.equals(b)) return "";
|
||||
return "FNC; " + Utility.hex(c);
|
||||
} // default
|
||||
boolean hasValue(int cp) { return getValue(cp).length() != 0; }
|
||||
public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
|
||||
};
|
||||
|
||||
dprops[FC_NFC_Closure] = new UnicodeProperty() {
|
||||
@ -538,7 +538,7 @@ of characters, the first of which has a non-zero combining class.
|
||||
if (c.equals(b)) return "";
|
||||
return "FN; " + Utility.hex(c);
|
||||
} // default
|
||||
boolean hasValue(int cp) { return getValue(cp).length() != 0; }
|
||||
public boolean hasValue(int cp) { return getValue(cp).length() != 0; }
|
||||
};
|
||||
|
||||
for (int i = QuickNFD; i <= QuickNFKC; ++i) {
|
||||
@ -555,7 +555,7 @@ of characters, the first of which has a non-zero combining class.
|
||||
+ "\r\n# Generated from <2060..206F, FFF0..FFFB, E0000..E0FFF>"
|
||||
+ "\r\n# + Other_Default_Ignorable_Code_Point + (Cf + Cc + Cs - White_Space)";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
if (0x2060 <= cp && cp <= 0x206F || 0xFFF0 <= cp && cp <= 0xFFFB || 0xE0000 <= cp && cp <= 0xE0FFF) return true;
|
||||
if (ucdData.getBinaryProperty(cp,Other_Default_Ignorable_Code_Point)) return true;
|
||||
if (ucdData.getBinaryProperty(cp, White_space)) return false;
|
||||
@ -573,7 +573,7 @@ of characters, the first of which has a non-zero combining class.
|
||||
|
||||
header = header = "# Binary Property";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
switch(cp) {
|
||||
case 0x27: case 0x2019: case 0xAD: return true;
|
||||
// case 0x2d: case 0x2010: case 0x2011:
|
||||
@ -600,7 +600,7 @@ of characters, the first of which has a non-zero combining class.
|
||||
+ "\r\n# - has no combining marks with zero canonical combining class"
|
||||
;
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
if (hasSoftDot(cp)) return true;
|
||||
if (Default.nfkd.isNormalized(cp)) return false;
|
||||
String decomp = Default.nfd.normalize(cp);
|
||||
@ -629,7 +629,7 @@ of characters, the first of which has a non-zero combining class.
|
||||
header = header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: Other_Case_Ignorable + Lm + Mn + Me + Cf";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Lm || cat == Cf || cat == Mn || cat == Me) return true;
|
||||
if (dprops[Other_Case_Ignorable].hasValue(cp)) return true;
|
||||
@ -654,7 +654,7 @@ of characters, the first of which has a non-zero combining class.
|
||||
+ "\r\n# (CGJ = U+034F)";
|
||||
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
if (cp == 0x034F) return false;
|
||||
if (ucdData.getBinaryProperty(cp, GraphemeLink)) return false;
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
@ -674,7 +674,7 @@ of characters, the first of which has a non-zero combining class.
|
||||
+ "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp"
|
||||
+ "\r\n# - Grapheme_Extend - Grapheme_Link - CGJ";
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
public boolean hasValue(int cp) {
|
||||
if (cp == 0x034F) return false;
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn || cat == Zl || cat == Zp
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
|
||||
* $Date: 2002/07/30 09:56:41 $
|
||||
* $Revision: 1.8 $
|
||||
* $Date: 2002/08/04 21:38:45 $
|
||||
* $Revision: 1.9 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -275,6 +275,11 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
log = Utility.openPrintWriter("Transliterate_log.txt", Utility.UTF8_WINDOWS);
|
||||
log.print('\uFEFF');
|
||||
|
||||
log.println();
|
||||
log.println("@*Override Data");
|
||||
log.println();
|
||||
readOverrides(type);
|
||||
|
||||
log.println();
|
||||
log.println("@*DICT Data");
|
||||
log.println();
|
||||
@ -426,7 +431,27 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
System.out.println("Defined Count: " + count);
|
||||
|
||||
log.println();
|
||||
log.println("@Duplicates");
|
||||
log.println("@Duplicates (Frequency Order");
|
||||
log.println();
|
||||
it = rankList.iterator();
|
||||
while (it.hasNext()) {
|
||||
String word = (String) it.next();
|
||||
Collection dups = (Collection) duplicates.get(word);
|
||||
if (dups == null) continue;
|
||||
log.print(hex.transliterate(word) + "\t" + word + "\t");
|
||||
Iterator it2 = dups.iterator();
|
||||
boolean gotFirst = false;
|
||||
while (it2.hasNext()) {
|
||||
if (!gotFirst) gotFirst = true;
|
||||
else log.print(", ");
|
||||
log.print(it2.next());
|
||||
}
|
||||
if (overrideSet.contains(word)) log.print(" *override*");
|
||||
log.println();
|
||||
}
|
||||
|
||||
log.println();
|
||||
log.println("@Duplicates (Character Order)");
|
||||
log.println();
|
||||
it = duplicates.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
@ -440,6 +465,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
else log.print(", ");
|
||||
log.print(it2.next());
|
||||
}
|
||||
if (overrideSet.contains(word)) log.print(" *override*");
|
||||
log.println();
|
||||
}
|
||||
|
||||
@ -536,13 +562,19 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
int overallRank = 0;
|
||||
it = combinedRank.iterator();
|
||||
|
||||
log.println();
|
||||
log.println("@Frequency data: Rank of Character");
|
||||
log.println();
|
||||
boolean showFrequency = false;
|
||||
|
||||
if (showFrequency) {
|
||||
log.println();
|
||||
log.println("@Frequency data: Rank of Character");
|
||||
log.println();
|
||||
}
|
||||
|
||||
// make up rankMap, rankList
|
||||
|
||||
while(it.hasNext()) {
|
||||
Pair p = (Pair) it.next();
|
||||
log.println(p.first + ", " + p.second);
|
||||
if (showFrequency) log.println(p.first + ", " + p.second);
|
||||
Object rank = rankMap.get(p.second);
|
||||
if (rank == null) {
|
||||
rankMap.put(p.second, new Integer(++overallRank));
|
||||
@ -550,16 +582,18 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
}
|
||||
}
|
||||
|
||||
log.println();
|
||||
log.println("@Frequency data: Character to Rank");
|
||||
log.println();
|
||||
|
||||
// get full order
|
||||
it = rankList.iterator();
|
||||
while (it.hasNext()) {
|
||||
Comparable key = (Comparable) it.next();
|
||||
Comparable val = (Comparable) rankMap.get(key);
|
||||
log.println(key + ", " + val);
|
||||
if (showFrequency) {
|
||||
log.println();
|
||||
log.println("@Frequency data: Character to Rank");
|
||||
log.println();
|
||||
|
||||
// get full order
|
||||
it = rankList.iterator();
|
||||
while (it.hasNext()) {
|
||||
Comparable key = (Comparable) it.next();
|
||||
Comparable val = (Comparable) rankMap.get(key);
|
||||
log.println(key + ", " + val);
|
||||
}
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
@ -712,6 +746,38 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
||||
}
|
||||
}
|
||||
|
||||
static void readOverrides(int type) throws IOException {
|
||||
if (type != CHINESE) return;
|
||||
String fname = "Chinese_override.txt";
|
||||
|
||||
System.out.println("Reading " + fname);
|
||||
BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\" + fname, true);
|
||||
int counter = 0;
|
||||
String[] pieces = new String[50];
|
||||
String line = "";
|
||||
try {
|
||||
while (true) {
|
||||
line = Utility.readDataLine(br);
|
||||
if (line == null) break;
|
||||
if (line.length() == 0) continue;
|
||||
Utility.dot(counter++);
|
||||
|
||||
// skip code
|
||||
int wordStart = line.indexOf('\t') + 1;
|
||||
int wordEnd = line.indexOf('\t', wordStart);
|
||||
String word = line.substring(wordStart, wordEnd);
|
||||
String definition = line.substring(wordEnd+1);
|
||||
addCheck(word, definition, line);
|
||||
overrideSet.add(word);
|
||||
}
|
||||
br.close();
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e);
|
||||
}
|
||||
}
|
||||
|
||||
static Set overrideSet = new HashSet();
|
||||
|
||||
static void processEdict(String word, String definition, String line) {
|
||||
// We have a situation where we have words of the form CCCHHHKKKCCHHCCH > HHHHHHKKKHHHHHHHH
|
||||
// C = CJK, H = Hiragana, K = katakana
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java,v $
|
||||
* $Date: 2002/07/30 09:57:18 $
|
||||
* $Revision: 1.1 $
|
||||
* $Date: 2002/08/04 21:38:45 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -21,61 +21,126 @@ import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
public class GenerateLineBreakTest implements UCD_Types {
|
||||
|
||||
static String[] samples = new String[LB_LIMIT + 3];
|
||||
|
||||
static byte[] TROrder = {
|
||||
// COMMON STUFF for Hangul
|
||||
static final byte hNot = -1, hL = 0, hV = 1, hT = 2, hLV = 3, hLVT = 4, hLIMIT = 5;
|
||||
static final String[] hNames = {"L", "V", "T", "LV", "LVT"};
|
||||
|
||||
static byte getHangulType(int cp) {
|
||||
if (Default.ucd.isLeadingJamo(cp)) return hL;
|
||||
if (Default.ucd.isVowelJamo(cp)) return hV;
|
||||
if (Default.ucd.isTrailingJamo(cp)) return hT;
|
||||
if (Default.ucd.isHangulSyllable(cp)) {
|
||||
if (Default.ucd.isDoubleHangul(cp)) return hLV;
|
||||
return hLVT;
|
||||
}
|
||||
return hNot;
|
||||
}
|
||||
|
||||
//============================
|
||||
|
||||
protected String rule;
|
||||
protected String fileName = "Line";
|
||||
|
||||
// all the other items are supplied in UCD_TYPES
|
||||
static byte LB_L = LB_LIMIT + hL, LB_V = LB_LIMIT + hV, LB_T = LB_LIMIT + hT,
|
||||
LB_LV = LB_LIMIT + hLV, LB_LVT = LB_LIMIT + hLVT, LB_SUP = LB_LIMIT + hLIMIT,
|
||||
LB2_LIMIT = (byte)(LB_SUP + 1);
|
||||
|
||||
String[] samples = new String[100];
|
||||
|
||||
|
||||
byte[] TypeOrder = {
|
||||
LB_OP, LB_CL, LB_QU, LB_GL, LB_NS, LB_EX, LB_SY, LB_IS, LB_PR, LB_PO,
|
||||
LB_NU, LB_AL, LB_ID, LB_IN, LB_HY, LB_BA, LB_BB, LB_B2, LB_ZW, LB_CM,
|
||||
// missing from Pair Table
|
||||
LB_SP, LB_BK, LB_CR, LB_LF,
|
||||
// resolved types below
|
||||
LB_CB, LB_AI, LB_SA, LB_SG, LB_XX,
|
||||
// 3 JAMO CLASSES
|
||||
29, 30, 31
|
||||
// 3 JAMO CLASSES, plus supplementary
|
||||
LB_L, LB_V, LB_T, LB_LV, LB_LVT, LB_SUP
|
||||
};
|
||||
static final int TABLE_LIMIT = 25;
|
||||
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
Default.setUCD();
|
||||
new GenerateLineBreakTest().run();
|
||||
|
||||
new GenerateWordBreakTest().run();
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public void run() throws IOException {
|
||||
findSamples();
|
||||
|
||||
// test individual cases
|
||||
//printLine(out, samples[LB_ZW], "", samples[LB_CL]);
|
||||
//printLine(out, samples[LB_ZW], " ", samples[LB_CL]);
|
||||
|
||||
PrintWriter out = Utility.openPrintWriter("LineBreakTest.html", Utility.UTF8_WINDOWS);
|
||||
out.println("<html><body><h1>Current (fixed only for consistency):</h1>");
|
||||
PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest.html", Utility.UTF8_WINDOWS);
|
||||
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>"
|
||||
+ fileName + "</title></head>");
|
||||
out.println("<body bgcolor='#FFFFFF'><h3>Current (fixed only for consistency):</h3>");
|
||||
|
||||
|
||||
|
||||
generateTable(out, false);
|
||||
out.println("<h1>Recommended:</h1>");
|
||||
out.println("<h3>Recommended:</h3>");
|
||||
generateTable(out, true);
|
||||
out.println("</body></html>");
|
||||
out.close();
|
||||
|
||||
String[] testCase = new String[50];
|
||||
// do main test
|
||||
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
out = Utility.openPrintWriter(k == 0 ? "LineBreakTest_SHORT.txt" : "LineBreakTest.txt", Utility.UTF8_WINDOWS);
|
||||
out = Utility.openPrintWriter(fileName + (k == 0 ? "Test_SHORT.txt" : "Test.txt"), Utility.LATIN1_WINDOWS);
|
||||
int counter = 0;
|
||||
|
||||
out.println("# Default Linebreak conformance test");
|
||||
out.println("# " + Default.getDate() + ", MED");
|
||||
out.println("# Default " + fileName + " Break Test");
|
||||
out.println("# Generated: " + Default.getDate() + ", MED");
|
||||
out.println("#");
|
||||
out.println("# Format:");
|
||||
out.println("# <string> (# <comment>)? ");
|
||||
out.println("# <string> contains hex Unicode code points, with ");
|
||||
out.println("#\t" + BREAK + " wherever there is a break opportunity, and ");
|
||||
out.println("#\t" + NOBREAK + " wherever there is not.");
|
||||
out.println("# <comment> the format can change, but currently it shows:");
|
||||
out.println("#\t- the sample character name");
|
||||
out.println("#\t- (x) the line_break property* for the sample character");
|
||||
out.println("#\t- [x] the rule that determines whether there is a break or not");
|
||||
out.println("#");
|
||||
out.println("# Samples:");
|
||||
out.println("# The test currently takes all pairs of linebreak types*,");
|
||||
out.println("# picks a sample for each type, and generates three strings: ");
|
||||
out.println("#\t- the pair alone");
|
||||
out.println("#\t- the pair alone with an imbeded space");
|
||||
out.println("#\t- the pair alone with embedded combining marks");
|
||||
out.println("# The sample for each type is simply the first code point (above NULL)");
|
||||
out.println("# with that property.");
|
||||
out.println("# * Note:");
|
||||
out.println("#\t- SG is omitted");
|
||||
out.println("#\t- 3 different Jamo characters and a supplementary character are added");
|
||||
out.println("#\t The syllable types for the Jamo (L, V, T) are displayed in comments");
|
||||
out.println("#\t instead of the linebreak property");
|
||||
out.println("# These samples may be extended in the future.");
|
||||
out.println("#");
|
||||
|
||||
for (int ii = 0; ii < samples.length; ++ii) {
|
||||
int i = TROrder[ii];
|
||||
for (int ii = 0; ii < getLimit(); ++ii) {
|
||||
int i = TypeOrder[ii];
|
||||
if (i == LB_SG) continue;
|
||||
String before = samples[i];
|
||||
|
||||
for (int jj = 0; jj < samples.length; ++jj) {
|
||||
Utility.dot(counter++);
|
||||
int j = TROrder[jj];
|
||||
for (int jj = 0; jj < getLimit(); ++jj) {
|
||||
Utility.dot(counter);
|
||||
int j = TypeOrder[jj];
|
||||
if (j == LB_SG) continue;
|
||||
String after = samples[j];
|
||||
// do line straight
|
||||
printLine(out, before, "", after, k != 0);
|
||||
printLine(out, before, " ", after, k != 0);
|
||||
printLine(out, before, "\u0301\u0308", after, k != 0);
|
||||
int len = genTestItems(before, after, testCase);
|
||||
for (int q = 0; q < len; ++q) {
|
||||
printLine(out, testCase[q], k != 0 && q == 0, false);
|
||||
++counter;
|
||||
}
|
||||
}
|
||||
}
|
||||
out.println("# Lines: " + counter);
|
||||
@ -83,25 +148,80 @@ public class GenerateLineBreakTest implements UCD_Types {
|
||||
}
|
||||
}
|
||||
|
||||
public static void generateTable(PrintWriter out, boolean recommended) {
|
||||
out.print("<table border='1' cellspacing='0'><tr><th></th>");
|
||||
for (int i = 0; i < TABLE_LIMIT; ++i) {
|
||||
String h = getLBID(samples[TROrder[i]]);
|
||||
out.print("<th>" + h + "</th>");
|
||||
// stuff that subclasses need to override
|
||||
public int genTestItems(String before, String after, String[] results) {
|
||||
results[0] = before + after;
|
||||
results[1] = before + " " + after;
|
||||
results[2] = before + "\u0301\u0308" + after;
|
||||
return 3;
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
boolean skipType(byte type) {
|
||||
return type == LB_AI || type == LB_SA || type == LB_SG || type == LB_XX;
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public String getTypeID(int cp) {
|
||||
byte result = getType(cp);
|
||||
if (result == LB_SUP) return "SUP";
|
||||
if (result >= LB_LIMIT) return hNames[result - LB_LIMIT];
|
||||
return Default.ucd.getLineBreakID_fromIndex(result);
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public byte getType(int cp) {
|
||||
if (cp > 0xFFFF) return LB_SUP;
|
||||
byte result = getHangulType(cp);
|
||||
if (result != hNot) return (byte)(result + LB_LIMIT);
|
||||
return Default.ucd.getLineBreak(cp);
|
||||
}
|
||||
|
||||
public int getLimit() {
|
||||
return LB2_LIMIT;
|
||||
}
|
||||
|
||||
public int getTableLimit() {
|
||||
return LB_SUP; // skip last;
|
||||
}
|
||||
|
||||
|
||||
public void generateTable(PrintWriter out, boolean recommended) {
|
||||
String width = "width='" + (100 / (getTableLimit() + 1)) + "%'";
|
||||
out.print("<table border='1' cellspacing='0'><tr><th " + width + "></th>");
|
||||
byte type;
|
||||
for (int i = 0; i < getTableLimit(); ++i) {
|
||||
type = TypeOrder[i];
|
||||
if (skipType(type)) continue;
|
||||
|
||||
String h = getTypeID(samples[TypeOrder[i]]);
|
||||
out.print("<th " + width + ">" + h + "</th>");
|
||||
}
|
||||
out.print("</tr>");
|
||||
String[] rule = new String[1];
|
||||
String[] rule2 = new String[1];
|
||||
for (int i = 0; i < TABLE_LIMIT; ++i) {
|
||||
String before = samples[TROrder[i]];
|
||||
String line = "<tr><th>" + getLBID(before) + "</th>";
|
||||
for (int j = 0; j < TABLE_LIMIT; ++j) {
|
||||
String after = samples[TROrder[j]];
|
||||
for (int i = 0; i < getTableLimit(); ++i) {
|
||||
type = TypeOrder[i];
|
||||
if (skipType(type)) continue;
|
||||
|
||||
String before = samples[type];
|
||||
String line = "<tr><th>" + getTypeID(before) + "</th>";
|
||||
for (int j = 0; j < getTableLimit(); ++j) {
|
||||
type = TypeOrder[j];
|
||||
if (skipType(type)) continue;
|
||||
|
||||
String after = samples[type];
|
||||
String t = getTableEntry(before, after, recommended, rule);
|
||||
String background = "";
|
||||
if (recommended) {
|
||||
String t2 = getTableEntry(before, after, false, rule2);
|
||||
if (!t.equals(t2)) background = " bgcolor='#FFFF00'";
|
||||
String t2 = getTableEntry(before, after, !recommended, rule2);
|
||||
if (!t.equals(t2)) {
|
||||
if (t.equals(NOBREAK)) {
|
||||
background = " bgcolor='#CCFFFF'";
|
||||
} else {
|
||||
background = " bgcolor='#FFFF00'";
|
||||
}
|
||||
} else if (t.equals(NOBREAK)) {
|
||||
background = " bgcolor='#CCCCFF'";
|
||||
}
|
||||
line += "<th title='" + rule[0] + "'" + background + ">" + t + "</th>";
|
||||
}
|
||||
@ -110,7 +230,7 @@ public class GenerateLineBreakTest implements UCD_Types {
|
||||
out.println("</table>");
|
||||
}
|
||||
|
||||
public static String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
|
||||
public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
|
||||
String t = "_";
|
||||
boolean spaceBreak = isBreak(before + " " + after, before.length() + 1, recommended);
|
||||
String spaceRule = rule;
|
||||
@ -137,75 +257,83 @@ public class GenerateLineBreakTest implements UCD_Types {
|
||||
return t;
|
||||
}
|
||||
|
||||
static final String BREAK = "\u00F7";
|
||||
static final String NOBREAK = "\u00D7";
|
||||
|
||||
public static void printLine(PrintWriter out, String before, String filler, String after, boolean comments) {
|
||||
String s = before + filler + after;
|
||||
int offset = before.length() + filler.length();
|
||||
public void printLine(PrintWriter out, String source, boolean comments, boolean recommended) {
|
||||
int cp;
|
||||
StringBuffer string = new StringBuffer();
|
||||
StringBuffer comment = new StringBuffer("\t# ");
|
||||
String status = isBreak(source, 0, recommended) ? BREAK : NOBREAK;
|
||||
string.append(status);
|
||||
comment.append(' ').append(status).append(" [").append(rule).append(']');
|
||||
|
||||
boolean lb = isBreak(s, offset, false);
|
||||
|
||||
String tlb = (lb ? "b" : "n");
|
||||
String comment = "";
|
||||
if (comments) comment =
|
||||
" # " + getLBID(before + filler)
|
||||
+ " " + tlb
|
||||
+ " " + getLBID(after)
|
||||
+ " # " + Default.ucd.getName(before + filler)
|
||||
+ " " + tlb
|
||||
+ " " + Default.ucd.getName(after);
|
||||
for (int offset = 0; offset < source.length(); offset += UTF16.getCharCount(cp)) {
|
||||
|
||||
out.println(Utility.hex(before + filler)
|
||||
+ "; " + tlb
|
||||
+ "; " + Utility.hex(after)
|
||||
+ comment);
|
||||
cp = UTF16.charAt(source, offset);
|
||||
if (string.length() > 0) {
|
||||
string.append(' ');
|
||||
comment.append(' ');
|
||||
}
|
||||
|
||||
string.append(Utility.hex(cp));
|
||||
comment.append(Default.ucd.getName(cp) + " (" + getTypeID(cp) + ")");
|
||||
|
||||
status = isBreak(source, offset + UTF16.getCharCount(cp), recommended) ? BREAK : NOBREAK;
|
||||
string.append(' ').append(status);
|
||||
comment.append(' ').append(status).append(" [").append(rule).append(']');
|
||||
}
|
||||
|
||||
if (comments) string.append(comment);
|
||||
out.println(string);
|
||||
}
|
||||
|
||||
public static void findSamples() {
|
||||
|
||||
public void findSamples() {
|
||||
for (int i = 1; i <= 0x10FFFF; ++i) {
|
||||
if (!Default.ucd.isAllocated(i)) continue;
|
||||
if (Default.ucd.isLeadingJamo(i)
|
||||
|| Default.ucd.isVowelJamo(i)
|
||||
|| Default.ucd.isTrailingJamo(i)) continue;
|
||||
byte lb = Default.ucd.getLineBreak(i);
|
||||
if (0xD800 <= i && i <= 0xDFFF) continue;
|
||||
if(i == 0x1100) {
|
||||
System.out.print("here");
|
||||
}
|
||||
byte lb = getType(i);
|
||||
if (samples[lb] == null) {
|
||||
samples[lb] = UTF16.valueOf(i);
|
||||
}
|
||||
}
|
||||
// fill the last with special cases
|
||||
samples[LB_LIMIT] = "\u1100";
|
||||
samples[LB_LIMIT+1] = "\u1162";
|
||||
samples[LB_LIMIT+2] = "\u11A8";
|
||||
for (int i = 0; i < TypeOrder.length; ++i) {
|
||||
String sample = samples[i];
|
||||
System.out.println(getTypeID(sample) + ":\t" + Default.ucd.getCodeAndName(sample));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static String getLBID(String s) {
|
||||
if (s.length() == 1) return Default.ucd.getLineBreakID(s.charAt(0));
|
||||
public String getTypeID(String s) {
|
||||
if (s == null) return "<null>";
|
||||
if (s.length() == 1) return getTypeID(s.charAt(0));
|
||||
StringBuffer result = new StringBuffer();
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
if (i > 0) result.append(" ");
|
||||
result.append(Default.ucd.getLineBreakID(cp));
|
||||
result.append(getTypeID(cp));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
static String rule;
|
||||
|
||||
public static int findLastNon(String source, int offset, byte notLBType) {
|
||||
public int findLastNon(String source, int offset, byte notLBType, boolean recommended) {
|
||||
int cp;
|
||||
for (int i = offset-2; i >= 0; i -= UTF16.getCharCount(cp)) {
|
||||
for (int i = offset-1; i >= 0; i -= UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(source, i);
|
||||
byte f = getResolvedLB(cp);
|
||||
if (f != notLBType) return cp;
|
||||
byte f = getResolvedType(cp, recommended);
|
||||
if (f != notLBType) return i;
|
||||
}
|
||||
return 0;
|
||||
return -1;
|
||||
}
|
||||
|
||||
public static byte getResolvedLB (int cp) {
|
||||
public byte getResolvedType (int cp, boolean recommended) {
|
||||
// LB 1 Assign a line break category to each character of the input.
|
||||
// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
|
||||
byte result = Default.ucd.getLineBreak(cp);
|
||||
byte result = getType(cp);
|
||||
switch (result) {
|
||||
case LB_AI: result = LB_AI; break;
|
||||
// case LB_CB: result = LB_ID; break;
|
||||
@ -213,17 +341,31 @@ public class GenerateLineBreakTest implements UCD_Types {
|
||||
// case LB_SG: result = LB_XX; break; Surrogates; will never occur
|
||||
case LB_XX: result = LB_AL; break;
|
||||
}
|
||||
if (recommended) {
|
||||
if (getHangulType(cp) != hNot) {
|
||||
result = LB_ID;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public boolean onCodepointBoundary(String s, int offset) {
|
||||
if (offset < 0 || offset > s.length()) return false;
|
||||
if (offset == 0 || offset == s.length()) return true;
|
||||
if (UTF16.isLeadSurrogate(s.charAt(offset-1))
|
||||
&& UTF16.isTrailSurrogate(s.charAt(offset))) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// find out whether there is a break at offset
|
||||
// WARNING: as a side effect, sets "rule"
|
||||
|
||||
public static boolean isBreak(String source, int offset, boolean recommended) {
|
||||
public boolean isBreak(String source, int offset, boolean recommended) {
|
||||
|
||||
// LB 1 Assign a line break category to each character of the input.
|
||||
// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
|
||||
// this is taken care of in the getResolvedLB function
|
||||
// this is taken care of in the getResolvedType function
|
||||
|
||||
// LB 2a Never break at the start of text
|
||||
|
||||
@ -237,8 +379,7 @@ public class GenerateLineBreakTest implements UCD_Types {
|
||||
|
||||
|
||||
// UTF-16: never break in the middle of a code point
|
||||
if (UTF16.isLeadSurrogate(source.charAt(offset-1))
|
||||
&& UTF16.isTrailSurrogate(source.charAt(offset))) return false;
|
||||
if (!onCodepointBoundary(source, offset)) return false;
|
||||
|
||||
|
||||
// now get the character before and after, and their types
|
||||
@ -247,8 +388,8 @@ public class GenerateLineBreakTest implements UCD_Types {
|
||||
int cpBefore = UTF16.charAt(source, offset-1);
|
||||
int cpAfter = UTF16.charAt(source, offset);
|
||||
|
||||
byte before = getResolvedLB(cpBefore);
|
||||
byte after = getResolvedLB(cpAfter);
|
||||
byte before = getResolvedType(cpBefore, recommended);
|
||||
byte after = getResolvedType(cpAfter, recommended);
|
||||
|
||||
|
||||
rule="3a";
|
||||
@ -276,22 +417,21 @@ public class GenerateLineBreakTest implements UCD_Types {
|
||||
// LB 6 Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos.
|
||||
rule="6";
|
||||
if (after == LB_CM) return false;
|
||||
if (Default.ucd.isLeadingJamo(cpBefore)) {
|
||||
if (Default.ucd.isLeadingJamo(cpAfter) || Default.ucd.isVowelJamo(cpAfter)) return false;
|
||||
} else if (Default.ucd.isVowelJamo(cpBefore)) {
|
||||
if (Default.ucd.isVowelJamo(cpAfter) || Default.ucd.isTrailingJamo(cpAfter)) return false;
|
||||
} else if (Default.ucd.isTrailingJamo(cpBefore)) {
|
||||
if (Default.ucd.isTrailingJamo(cpAfter)) return false;
|
||||
}
|
||||
|
||||
|
||||
if (before == LB_L && (after == LB_L || after == LB_V || after == LB_LV || after == LB_LVT)) return false;
|
||||
|
||||
if ((before == LB_LV || before == LB_V) && (after == LB_V || after == LB_T)) return false;
|
||||
|
||||
if ((before == LB_LVT || before == LB_T) && (after == LB_T)) return false;
|
||||
|
||||
boolean setBase = false;
|
||||
if (before == LB_CM) {
|
||||
setBase = true;
|
||||
int cp = findLastNon(source, offset, LB_CM);
|
||||
if (cp == 0) {
|
||||
int backOffset = findLastNon(source, offset, LB_CM, recommended);
|
||||
if (backOffset < 0) {
|
||||
before = LB_ID;
|
||||
} else {
|
||||
before = getResolvedLB(cp);
|
||||
before = getResolvedType(UTF16.charAt(source, backOffset), recommended);
|
||||
}
|
||||
}
|
||||
|
||||
@ -310,9 +450,9 @@ public class GenerateLineBreakTest implements UCD_Types {
|
||||
// find the last non-space character; we will need it
|
||||
byte lastNonSpace = before;
|
||||
if (lastNonSpace == LB_SP) {
|
||||
int cp = findLastNon(source, offset, LB_CM);
|
||||
if (cp != 0) {
|
||||
lastNonSpace = getResolvedLB(cp);
|
||||
int backOffset = findLastNon(source, offset, LB_CM, recommended);
|
||||
if (backOffset >= 0) {
|
||||
lastNonSpace = getResolvedType(UTF16.charAt(source, backOffset), recommended);
|
||||
}
|
||||
}
|
||||
|
||||
@ -476,4 +616,162 @@ public class GenerateLineBreakTest implements UCD_Types {
|
||||
rule="20";
|
||||
return true;
|
||||
}
|
||||
|
||||
static class GenerateWordBreakTest extends GenerateLineBreakTest {
|
||||
|
||||
static final byte CR = 0, LF = 1, Control = 2, Extend = 3, Link = 4, CGJ = 5, Base = 6, LetterBase = 7, Other = 8,
|
||||
oLIMIT = 9, // RESET THIS IF LIST ABOVE CHANGES!
|
||||
L = oLIMIT + hL, V = oLIMIT + hV, T = oLIMIT + hT, LV = oLIMIT + hLV, LVT = oLIMIT + hLVT,
|
||||
LIMIT = LVT + 1;
|
||||
|
||||
static final String[] Names = {"CR", "LF", "CTL", "Extend", "Link", "CGJ", "Base", "LetterBase", "Other" };
|
||||
|
||||
static UnicodeProperty extendProp = UnifiedBinaryProperty.make(DERIVED | GraphemeExtend);
|
||||
static UnicodeProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase);
|
||||
static UnicodeProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink);
|
||||
|
||||
{
|
||||
fileName = "Word";
|
||||
TypeOrder = new byte[LIMIT];
|
||||
for (byte i = 0; i < TypeOrder.length; ++i) {
|
||||
TypeOrder[i] = i;
|
||||
}
|
||||
}
|
||||
|
||||
boolean skipType(byte type) {
|
||||
return false;
|
||||
}
|
||||
|
||||
public int getLimit() {
|
||||
return LIMIT;
|
||||
}
|
||||
|
||||
public int getTableLimit() {
|
||||
return LIMIT;
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public int genTestItems(String before, String after, String[] results) {
|
||||
results[0] = before + after;
|
||||
return 1;
|
||||
}
|
||||
|
||||
public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
|
||||
boolean normalBreak = isBreak(before + after, before.length(), recommended);
|
||||
String normalRule = rule;
|
||||
ruleOut[0] = rule;
|
||||
return normalBreak ? BREAK : NOBREAK;
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public String getTypeID(int cp) {
|
||||
byte type = getType(cp);
|
||||
if (type >= oLIMIT) return hNames[type - oLIMIT];
|
||||
return Names[type];
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public byte getType(int cp) {
|
||||
// single characters
|
||||
if (cp == 0xA) return LF;
|
||||
if (cp == 0xD) return CR;
|
||||
if (cp == 0x034F) return CGJ;
|
||||
if (cp == 0x2028 || cp == 0x2029) return Control;
|
||||
|
||||
// Hangul
|
||||
byte result = getHangulType(cp);
|
||||
if (result != hNot) return (byte)(result + oLIMIT);
|
||||
|
||||
// other properties
|
||||
// category based
|
||||
byte cat = Default.ucd.getCategory(cp);
|
||||
if (cat == Cc) return Control;
|
||||
if (cat == Cf) return Extend;
|
||||
if (((1<<cat) & LETTER_MASK) != 0) return LetterBase;
|
||||
|
||||
// other binary properties
|
||||
if (linkProp.hasValue(cp)) return Link;
|
||||
if (extendProp.hasValue(cp)) return Extend;
|
||||
if (baseProp.hasValue(cp)) return Base;
|
||||
|
||||
return Other;
|
||||
}
|
||||
|
||||
public byte getResolvedType(int cp, boolean recommended) {
|
||||
return getType(cp);
|
||||
}
|
||||
|
||||
public boolean isBreak(String source, int offset, boolean recommended) {
|
||||
rule="1";
|
||||
if (offset < 0 || offset > source.length()) return false;
|
||||
if (offset == 0) return true;
|
||||
|
||||
rule = "2";
|
||||
if (offset == source.length()) return true;
|
||||
|
||||
// UTF-16: never break in the middle of a code point
|
||||
if (!onCodepointBoundary(source, offset)) return false;
|
||||
|
||||
// now get the character before and after, and their types
|
||||
|
||||
|
||||
int cpBefore = UTF16.charAt(source, offset-1);
|
||||
int cpAfter = UTF16.charAt(source, offset);
|
||||
|
||||
byte before = getResolvedType(cpBefore, recommended);
|
||||
byte after = getResolvedType(cpAfter, recommended);
|
||||
|
||||
rule = "3";
|
||||
if (before == CR && after == LF) return false;
|
||||
|
||||
rule = "4";
|
||||
if (before == CR || before == LF || before == Control
|
||||
|| after == Control || after == LF || after == CR) return true;
|
||||
|
||||
rule = "6";
|
||||
if (before == L && (after == L || after == V || after == LV || after == LVT)) return false;
|
||||
|
||||
rule = "7";
|
||||
if ((before == LV || before == V) && (after == V || after == T)) return false;
|
||||
|
||||
rule = "8";
|
||||
if ((before == LVT || before == T) && (after == T)) return false;
|
||||
|
||||
rule = "9";
|
||||
if (after == Extend) return false;
|
||||
|
||||
if (recommended) {
|
||||
if (after == Link || after == CGJ) return false;
|
||||
} else {
|
||||
|
||||
// Do not break around a CGJ.
|
||||
rule = "10";
|
||||
if (before == CGJ && (after == Base
|
||||
|| after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT)) return false;
|
||||
rule = "11";
|
||||
if (after == CGJ) return false;
|
||||
|
||||
// Do not break between linking characters and letters, or before linking characters. This provides for Indic graphemes, where virama (halant) will link character clusters together.
|
||||
|
||||
rule = "12";
|
||||
//Link Extend* × LetterBase (12)
|
||||
if (after == LetterBase || after == L || after == V || after == T || after == LV || after == LVT) {
|
||||
int backOffset = findLastNon(source, offset, Extend, recommended);
|
||||
if (backOffset >= 0) {
|
||||
byte last = getResolvedType(UTF16.charAt(source, backOffset), recommended);
|
||||
if (last == Link) return false;
|
||||
}
|
||||
}
|
||||
|
||||
rule = "13";
|
||||
if (after == Link) return false;
|
||||
}
|
||||
|
||||
// Otherwise break after all characters.
|
||||
rule = "14";
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java,v $
|
||||
* $Date: 2002/07/30 09:56:41 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2002/08/04 21:38:45 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -23,20 +23,23 @@ public class GenerateThaiBreaks {
|
||||
|
||||
BufferedReader br = new BufferedReader(
|
||||
new InputStreamReader(
|
||||
new FileInputStream("\\icu4j\\src\\data\\thai6.ucs"), "UnicodeLittle"));
|
||||
new FileInputStream("c:\\icu4j\\src\\com\\ibm\\icu\\dev\\data\\thai6.ucs"), "UnicodeLittle"));
|
||||
PrintWriter out = null;
|
||||
|
||||
try {
|
||||
Default.setUCD();
|
||||
UnicodeSet ignorables = new UnicodeSet(0xE30, 0xE3A);
|
||||
UnicodeSet ignorables = new UnicodeSet();
|
||||
/* new UnicodeSet(0xE30, 0xE3A);
|
||||
ignorables.add(0x0E40, 0x0E44); // add logical order exception
|
||||
ignorables.add(0x0E47, 0x0E4E);
|
||||
*/
|
||||
ignorables.add(0, ' '); // add controls
|
||||
ignorables.add('.');
|
||||
|
||||
Set initials = new TreeSet();
|
||||
Set finals = new TreeSet();
|
||||
Set medials = new TreeSet();
|
||||
|
||||
UnicodeSet initials = new UnicodeSet();
|
||||
UnicodeSet finals = new UnicodeSet();
|
||||
UnicodeSet medials = new UnicodeSet();
|
||||
|
||||
char[] buffer = new char[100];
|
||||
|
||||
@ -60,34 +63,58 @@ public class GenerateThaiBreaks {
|
||||
}
|
||||
|
||||
initials.add(temp.substring(0,1));
|
||||
initials.add(temp.substring(0,2));
|
||||
finals.add(temp.substring(temp.length()-2));
|
||||
//initials.add(temp.substring(0,2));
|
||||
finals.add(temp.substring(temp.length()-1));
|
||||
//finals.add(temp.substring(temp.length()-1));
|
||||
|
||||
for (int i = 1; i < temp.length() - 3; ++i) {
|
||||
medials.add(temp.substring(i, i+2));
|
||||
for (int i = 1; i < temp.length() - 1; ++i) {
|
||||
//medials.add(temp.substring(i, i+2));
|
||||
medials.add(temp.substring(i, i+1));
|
||||
}
|
||||
medials.add(temp.substring(temp.length() - 2, temp.length() - 1));
|
||||
//medials.add(temp.substring(temp.length() - 2, temp.length() - 1));
|
||||
}
|
||||
|
||||
System.out.println("initials size: " + initials.size());
|
||||
System.out.println("finals size: " + finals.size());
|
||||
System.out.println("medials size: " + medials.size());
|
||||
|
||||
//out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS);
|
||||
// out.write('\uFEFF');
|
||||
|
||||
UnicodeSet marks = new UnicodeSet("[[\u0e00-\u0e7f]&[[:mn:][:me:]]]");
|
||||
finals.addAll(marks);
|
||||
|
||||
UnicodeSet all = new UnicodeSet(initials).addAll(medials).addAll(finals);
|
||||
|
||||
UnicodeSet missingThai = new UnicodeSet("[[\u0e00-\u0e7f]-[:Cn:]]").removeAll(all);
|
||||
|
||||
System.out.println("Never occur: " + missingThai.toPattern(true));
|
||||
Utility.showSetNames("", missingThai, true, Default.ucd);
|
||||
System.out.println();
|
||||
|
||||
UnicodeSet neverInitial = new UnicodeSet(all).removeAll(initials);
|
||||
UnicodeSet neverFinal = new UnicodeSet(all).removeAll(finals);
|
||||
|
||||
System.out.println("Never initial: " + neverInitial.toPattern(true));
|
||||
Utility.showSetNames("", neverInitial, true, Default.ucd);
|
||||
System.out.println();
|
||||
|
||||
System.out.println("Never final: " + neverFinal.toPattern(true));
|
||||
Utility.showSetNames("", neverFinal, true, Default.ucd);
|
||||
System.out.println();
|
||||
|
||||
initials.removeAll(medials);
|
||||
finals.removeAll(medials);
|
||||
|
||||
System.out.println("initials size: " + initials.size());
|
||||
System.out.println("finals size: " + finals.size());
|
||||
|
||||
out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS);
|
||||
out.write('\uFEFF');
|
||||
out.println("Only Initials");
|
||||
Utility.print(out, initials, ", ", new MyBreaker());
|
||||
out.println();
|
||||
out.println("Only Finals");
|
||||
Utility.print(out, finals, ", ", new MyBreaker());
|
||||
System.out.println("Only Initials" + initials.toPattern(true));
|
||||
Utility.showSetNames("", initials, true, Default.ucd);
|
||||
System.out.println();
|
||||
|
||||
System.out.println("Only Finals" + finals.toPattern(true));
|
||||
Utility.showSetNames("", finals, true, Default.ucd);
|
||||
} finally {
|
||||
br.close();
|
||||
if (out != null) out.close();
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
|
||||
* $Date: 2002/07/30 09:56:41 $
|
||||
* $Revision: 1.19 $
|
||||
* $Date: 2002/08/04 21:38:45 $
|
||||
* $Revision: 1.20 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -78,7 +78,7 @@ public final class Main implements UCD_Types {
|
||||
else if (arg.equalsIgnoreCase("TestNormalization")) TestNormalization.main(null);
|
||||
|
||||
|
||||
else if (arg.equalsIgnoreCase("linebreaktest")) GenerateLineBreakTest.main(null);
|
||||
else if (arg.equalsIgnoreCase("breaktest")) GenerateBreakTest.main(null);
|
||||
|
||||
else if (arg.equalsIgnoreCase("genSplit")) GenerateData.genSplit();
|
||||
else if (arg.equalsIgnoreCase("iana")) IANANames.testSensitivity();
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
|
||||
* $Date: 2002/07/30 09:56:40 $
|
||||
* $Revision: 1.16 $
|
||||
* $Date: 2002/08/04 21:38:45 $
|
||||
* $Revision: 1.17 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -737,6 +737,10 @@ public final class UCD implements UCD_Types {
|
||||
return UCD_Names.NT[prop];
|
||||
}
|
||||
|
||||
public static String getNumericTypeID_fromIndex(byte prop, byte style) {
|
||||
return style == SHORT ? UCD_Names.SHORT_NT[prop] : UCD_Names.NT[prop];
|
||||
}
|
||||
|
||||
public String getEastAsianWidthID(int codePoint) {
|
||||
return getEastAsianWidthID_fromIndex(getEastAsianWidth(codePoint));
|
||||
}
|
||||
@ -745,6 +749,10 @@ public final class UCD implements UCD_Types {
|
||||
return UCD_Names.EA[prop];
|
||||
}
|
||||
|
||||
public static String getEastAsianWidthID_fromIndex(byte prop, byte style) {
|
||||
return style != LONG ? UCD_Names.SHORT_EA[prop] : UCD_Names.EA[prop];
|
||||
}
|
||||
|
||||
public String getLineBreakID(int codePoint) {
|
||||
return getLineBreakID_fromIndex(getLineBreak(codePoint));
|
||||
}
|
||||
@ -753,6 +761,10 @@ public final class UCD implements UCD_Types {
|
||||
return UCD_Names.LB[prop];
|
||||
}
|
||||
|
||||
public static String getLineBreakID_fromIndex(byte prop, byte style) {
|
||||
return style != LONG ? UCD_Names.LB[prop] : UCD_Names.LONG_LB[prop];
|
||||
}
|
||||
|
||||
public String getJoiningTypeID(int codePoint) {
|
||||
return getJoiningTypeID_fromIndex(getJoiningType(codePoint));
|
||||
}
|
||||
@ -761,6 +773,10 @@ public final class UCD implements UCD_Types {
|
||||
return UCD_Names.JOINING_TYPE[prop];
|
||||
}
|
||||
|
||||
public static String getJoiningTypeID_fromIndex(byte prop, byte style) {
|
||||
return style != LONG ? UCD_Names.JOINING_TYPE[prop] : UCD_Names.LONG_JOINING_TYPE[prop];
|
||||
}
|
||||
|
||||
public String getJoiningGroupID(int codePoint) {
|
||||
return getJoiningGroupID_fromIndex(getJoiningGroup(codePoint));
|
||||
}
|
||||
@ -769,6 +785,11 @@ public final class UCD implements UCD_Types {
|
||||
return UCD_Names.JOINING_GROUP[prop];
|
||||
}
|
||||
|
||||
public static String getJoiningGroupID_fromIndex(byte prop, byte style) {
|
||||
// no short version
|
||||
return UCD_Names.JOINING_GROUP[prop];
|
||||
}
|
||||
|
||||
public String getScriptID(int codePoint) {
|
||||
return getScriptID_fromIndex(getScript(codePoint));
|
||||
}
|
||||
@ -790,6 +811,11 @@ public final class UCD implements UCD_Types {
|
||||
return UCD_Names.AGE[prop];
|
||||
}
|
||||
|
||||
public static String getAgeID_fromIndex(byte prop, byte style) {
|
||||
// no short for
|
||||
return UCD_Names.AGE[prop];
|
||||
}
|
||||
|
||||
public String getBinaryPropertiesID(int codePoint, byte bit) {
|
||||
return (getBinaryProperties(codePoint) & (1<<bit)) != 0 ? "Y" : "N";
|
||||
}
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
|
||||
* $Date: 2002/07/30 09:56:40 $
|
||||
* $Revision: 1.14 $
|
||||
* $Date: 2002/08/04 21:38:45 $
|
||||
* $Revision: 1.15 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -22,6 +22,8 @@ public interface UCD_Types {
|
||||
public static final String BIN_DIR = BASE_DIR + "BIN\\";
|
||||
public static final String GEN_DIR = BASE_DIR + "GEN\\";
|
||||
|
||||
public static final char DOTTED_CIRCLE = '\u25CC';
|
||||
|
||||
public static final int
|
||||
CJK_BASE = 0x4E00,
|
||||
CJK_LIMIT = 0x9FFF+1,
|
||||
@ -166,7 +168,10 @@ public interface UCD_Types {
|
||||
CONTROL_MASK = (1<<Cc) | (1<<Cf) | (1<<Cs) | (1<<Co),
|
||||
PUNCTUATION_MASK = (1<<Pc) | (1<<Pd) | (1<<Ps) | (1<<Pe) | (1<<Po) | (1<<Pi) | (1<<Pf),
|
||||
SYMBOL_MASK = (1<<Sm) | (1<<Sc) | (1<<Sk) | (1<<So),
|
||||
UNASSIGNED_MASK = (1<<Cn);
|
||||
UNASSIGNED_MASK = (1<<Cn),
|
||||
BASE_MASK = LETTER_MASK | NUMBER_MASK | PUNCTUATION_MASK | SYMBOL_MASK | (1<<Mc),
|
||||
NONSPACING_MARK_MASK = (1<<Mn) | (1<<Me);
|
||||
|
||||
|
||||
// Binary Properties
|
||||
|
||||
|
@ -148,7 +148,7 @@ public abstract class UnicodeProperty implements UCD_Types {
|
||||
/**
|
||||
* Does it have the propertyValue?
|
||||
*/
|
||||
abstract boolean hasValue(int cp);
|
||||
abstract public boolean hasValue(int cp);
|
||||
|
||||
/**
|
||||
* Get the set of characters it contains
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java,v $
|
||||
* $Date: 2002/07/03 02:15:47 $
|
||||
* $Revision: 1.8 $
|
||||
* $Date: 2002/08/04 21:38:44 $
|
||||
* $Revision: 1.9 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -299,26 +299,14 @@ public final class UnifiedBinaryProperty extends UnicodeProperty {
|
||||
case COMBINING_CLASS>>8: return ucd.getCombiningClassID_fromIndex((byte)propValue, style);
|
||||
case BIDI_CLASS>>8: return ucd.getBidiClassID_fromIndex((byte)propValue, style);
|
||||
case DECOMPOSITION_TYPE>>8: return ucd.getDecompositionTypeID_fromIndex((byte)propValue, style);
|
||||
case NUMERIC_TYPE>>8: if (propValue >= LIMIT_NUMERIC_TYPE) break;
|
||||
if (style != SHORT) return ucd.getNumericTypeID_fromIndex((byte)propValue);
|
||||
return UCD_Names.SHORT_NT[propValue];
|
||||
case EAST_ASIAN_WIDTH>>8: if (propValue >= LIMIT_EAST_ASIAN_WIDTH) break;
|
||||
if (style != LONG) return ucd.getEastAsianWidthID_fromIndex((byte)propValue);
|
||||
return UCD_Names.SHORT_EA[propValue];
|
||||
case LINE_BREAK>>8: if (propValue >= LIMIT_LINE_BREAK) break;
|
||||
if (style != LONG) return ucd.getLineBreakID_fromIndex((byte)propValue);
|
||||
return UCD_Names.LONG_LB[propValue];
|
||||
case JOINING_TYPE>>8: if (propValue >= LIMIT_JOINING_TYPE) break;
|
||||
if (style != LONG) return ucd.getJoiningTypeID_fromIndex((byte)propValue);
|
||||
return UCD_Names.LONG_JOINING_TYPE[propValue];
|
||||
case JOINING_GROUP>>8: if (propValue >= LIMIT_JOINING_GROUP) break;
|
||||
return ucd.getJoiningGroupID_fromIndex((byte)propValue);
|
||||
case NUMERIC_TYPE>>8: ucd.getNumericTypeID_fromIndex((byte)propValue, style);
|
||||
case EAST_ASIAN_WIDTH>>8: return ucd.getEastAsianWidthID_fromIndex((byte)propValue);
|
||||
case LINE_BREAK>>8: return ucd.getLineBreakID_fromIndex((byte)propValue, style);
|
||||
case JOINING_TYPE>>8: return ucd.getJoiningTypeID_fromIndex((byte)propValue);
|
||||
case JOINING_GROUP>>8: return ucd.getJoiningGroupID_fromIndex((byte)propValue);
|
||||
case BINARY_PROPERTIES>>8: return ucd.getBinaryPropertiesID_fromIndex((byte)propValue, style);
|
||||
case SCRIPT>>8: if (propValue >= LIMIT_SCRIPT) break;
|
||||
if (style != SHORT) return ucd.getScriptID_fromIndex((byte)propValue);
|
||||
return UCD_Names.ABB_SCRIPT[propValue];
|
||||
case AGE>>8: if (propValue >= LIMIT_AGE) break;
|
||||
return ucd.getAgeID_fromIndex((byte)propValue);
|
||||
case SCRIPT>>8: return ucd.getScriptID_fromIndex((byte)propValue);
|
||||
case AGE>>8: return ucd.getAgeID_fromIndex((byte)propValue);
|
||||
/*
|
||||
case DERIVED>>8:
|
||||
UnicodeProperty up = DerivedProperty.make(propValue, ucd);
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
|
||||
* $Date: 2002/07/30 09:56:41 $
|
||||
* $Revision: 1.23 $
|
||||
* $Date: 2002/08/04 21:38:44 $
|
||||
* $Revision: 1.24 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -17,9 +17,10 @@ import java.util.*;
|
||||
import java.text.*;
|
||||
import java.io.*;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.text.UCD.*;
|
||||
|
||||
public final class Utility { // COMMON UTILITIES
|
||||
public final class Utility implements UCD_Types { // COMMON UTILITIES
|
||||
|
||||
static final boolean UTF8 = true; // TODO -- make argument
|
||||
|
||||
@ -470,7 +471,22 @@ public final class Utility { // COMMON UTILITIES
|
||||
return quoteXML(source, false);
|
||||
}
|
||||
|
||||
|
||||
private static UnicodeProperty defaultIgnorable = null;
|
||||
|
||||
public static String getDisplay(int cp) {
|
||||
String result = UTF16.valueOf(cp);
|
||||
byte cat = Default.ucd.getCategory(cp);
|
||||
if (cat == Mn || cat == Me) {
|
||||
result = String.valueOf(DOTTED_CIRCLE) + result;
|
||||
} else if (cat == Cf || cat == Cc || cp == 0x034F || cp == 0x00AD || cp == 0x1806) {
|
||||
result = "\u25A1";
|
||||
} else {
|
||||
if (defaultIgnorable == null) defaultIgnorable = DerivedProperty.make(DefaultIgnorable);
|
||||
if (defaultIgnorable.hasValue(cp)) result = "\u25A1";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public static int compare(char[] a, int aStart, int aEnd, char[] b, int bStart, int bEnd) {
|
||||
while (aStart < aEnd && bStart < bEnd) {
|
||||
int diff = a[aStart++] - b[bStart++];
|
||||
|
Loading…
Reference in New Issue
Block a user