e8122316a2
X-SVN-Rev: 17314
2345 lines
93 KiB
Java
2345 lines
93 KiB
Java
/**
|
|
*******************************************************************************
|
|
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
|
* others. All Rights Reserved. *
|
|
*******************************************************************************
|
|
*
|
|
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
|
|
* $Date: 2005/03/10 02:37:20 $
|
|
* $Revision: 1.27 $
|
|
*
|
|
*******************************************************************************
|
|
*/
|
|
|
|
package com.ibm.text.UCD;
|
|
|
|
import java.io.IOException;
|
|
import java.math.BigDecimal;
|
|
|
|
//import com.ibm.text.unicode.UInfo;
|
|
import java.util.*;
|
|
import java.io.*;
|
|
//import java.text.Un;
|
|
import com.ibm.icu.text.CanonicalIterator;
|
|
import com.ibm.icu.text.UnicodeSet;
|
|
import com.ibm.icu.text.UnicodeSetIterator;
|
|
|
|
import com.ibm.icu.text.UTF16;
|
|
import com.ibm.text.utility.*;
|
|
import java.text.NumberFormat;
|
|
|
|
public class VerifyUCD implements UCD_Types {
|
|
static final boolean DEBUG = false;
|
|
|
|
static void checkDecompFolding() {
|
|
|
|
UnicodeSet sum = new UnicodeSet();
|
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
|
Utility.dot(cp);
|
|
if (!Default.ucd().isAllocated(cp)) continue;
|
|
byte cat = Default.ucd().getCategory(cp);
|
|
if (cat == UNASSIGNED || cat == PRIVATE_USE) continue;
|
|
String decomp = Default.nfd().normalize(cp);
|
|
String foldDecomp = Default.ucd().getCase(decomp, FULL, FOLD);
|
|
int d0 = Default.ucd().getCombiningClass(decomp.charAt(0));
|
|
int dL = Default.ucd().getCombiningClass(decomp.charAt(decomp.length()-1));
|
|
int f0 = Default.ucd().getCombiningClass(foldDecomp.charAt(0));
|
|
int fL = Default.ucd().getCombiningClass(foldDecomp.charAt(decomp.length()-1));
|
|
if (d0 != f0 || dL != fL) {
|
|
Utility.fixDot();
|
|
System.out.println();
|
|
System.out.println("Exception: " + Default.ucd().getCodeAndName(cp));
|
|
System.out.println("Decomp: " + Default.ucd().getCodeAndName(decomp));
|
|
System.out.println("FoldedDecomp: " + Default.ucd().getCodeAndName(foldDecomp));
|
|
System.out.println("d0: " + d0 + ", "
|
|
+ "dL: " + dL + ", "
|
|
+ "f0: " + f0 + ", "
|
|
+ "fL: " + fL
|
|
);
|
|
sum.add(cp);
|
|
}
|
|
}
|
|
System.out.println("Set: " + sum.toPattern(true));
|
|
}
|
|
|
|
static void oneTime() {
|
|
|
|
int[] testSet = {0x10000, 'a', 0xE0000, '\u0221'}; // 10000
|
|
for (int i = 0; i < testSet.length; ++i) {
|
|
int item = testSet[i];
|
|
System.out.println(Default.ucd().getCode(item));
|
|
|
|
boolean ass = Default.ucd().isAssigned(item);
|
|
System.out.println(ass ? " assigned" : " unassigned");
|
|
ass = Default.ucd().isAllocated(item);
|
|
System.out.println(ass ? " allocated" : " unallocated");
|
|
|
|
String name = Default.ucd().getName(item, SHORT);
|
|
System.out.println(" " + name);
|
|
name = Default.ucd().getName(item);
|
|
System.out.println(" " + name);
|
|
|
|
System.out.println();
|
|
}
|
|
}
|
|
|
|
static final byte NC = UNUSED_CATEGORY;
|
|
|
|
static final NumberFormat format = NumberFormat.getInstance();
|
|
static {
|
|
format.setMinimumFractionDigits(0);
|
|
format.setGroupingUsed(true);
|
|
}
|
|
|
|
static abstract class SimpleProp {
|
|
abstract String getTitle();
|
|
abstract byte getUnallocatedProp();
|
|
abstract byte getProp(int cp);
|
|
abstract String getName(byte prop);
|
|
abstract String getCode(byte prop);
|
|
|
|
byte[] subtotalBreaks = null;
|
|
|
|
byte[] cumulativeTotalBreaks = null;
|
|
|
|
byte[] permute = null;
|
|
|
|
byte getPermutation(byte prop) {
|
|
if (permute == null) return prop;
|
|
if (prop >= permute.length) return prop;
|
|
return permute[prop];
|
|
}
|
|
|
|
boolean doTotal(byte prop, boolean sub) {
|
|
byte[] myBreak = sub ? subtotalBreaks : cumulativeTotalBreaks;
|
|
if (myBreak == null) return false;
|
|
for (int k = 0; k < myBreak.length; ++k) {
|
|
if (myBreak[k] == prop) return true;
|
|
}
|
|
return false;
|
|
}
|
|
}
|
|
|
|
static class CatProp extends SimpleProp {
|
|
String getTitle() {
|
|
return "General Category";
|
|
}
|
|
byte getUnallocatedProp() {
|
|
return Cn;
|
|
}
|
|
|
|
byte getProp(int cp) {
|
|
byte cat = Default.ucd().getCategory(cp);
|
|
if (cat == Cn && Default.ucd().getBinaryProperty(cp, Noncharacter_Code_Point)) {
|
|
return NC;
|
|
}
|
|
return cat;
|
|
}
|
|
String getCode(byte prop) {
|
|
if (prop >= LIMIT_CATEGORY) return "???" + prop;
|
|
if (prop == NC) {
|
|
return "NC";
|
|
}
|
|
return Default.ucd().getCategoryID_fromIndex(prop);
|
|
}
|
|
String getName(byte prop) {
|
|
if (prop >= LIMIT_CATEGORY) return "???" + prop;
|
|
if (prop == NC) {
|
|
return "Noncharacter";
|
|
}
|
|
String name = Default.ucd().getCategoryID_fromIndex(prop, LONG);
|
|
if (prop == Cn) name += " - NC";
|
|
return name;
|
|
}
|
|
|
|
{
|
|
permute = new byte[] {
|
|
Lu, Ll, Lt, Lo, Lm,
|
|
Mn, Me, Mc,
|
|
Nd, Nl, No,
|
|
Pd, Pc, Ps, Pi, Pe, Pf, Po,
|
|
Sc, Sm, Sk, So,
|
|
Zs, Zl, Zp,
|
|
Cc, Cf, Co, Cs, NC, Cn};
|
|
|
|
subtotalBreaks = new byte[] {Lm, Mc, No, Po, So, Zp, Cs, Cn};
|
|
|
|
cumulativeTotalBreaks = new byte[] {Cf};
|
|
}
|
|
}
|
|
|
|
static class ScriptProp extends SimpleProp {
|
|
String getTitle() {
|
|
return "Script";
|
|
}
|
|
byte getUnallocatedProp() {
|
|
return COMMON_SCRIPT;
|
|
}
|
|
|
|
byte getProp(int cp) {
|
|
return Default.ucd().getScript(cp);
|
|
}
|
|
String getCode(byte prop) {
|
|
if (prop >= LIMIT_SCRIPT) return "???" + prop;
|
|
return Default.ucd().getScriptID_fromIndex(prop, SHORT);
|
|
}
|
|
String getName(byte prop) {
|
|
if (prop >= LIMIT_SCRIPT) return "???" + prop;
|
|
return Default.ucd().getScriptID_fromIndex(prop, LONG);
|
|
}
|
|
byte getPermutation(byte prop) {
|
|
if (prop == LIMIT_SCRIPT-1) return COMMON_SCRIPT;
|
|
if (prop == LIMIT_SCRIPT-2) return INHERITED_SCRIPT;
|
|
if (prop >= LIMIT_SCRIPT) return prop;
|
|
if (prop >= INHERITED_SCRIPT-1) return (byte)(prop+2);
|
|
return (byte)(prop+1);
|
|
}
|
|
{
|
|
cumulativeTotalBreaks = new byte[] {TAGBANWA_SCRIPT};
|
|
}
|
|
}
|
|
|
|
static SimpleProp CAT_PROP = new CatProp();
|
|
static SimpleProp SCRIPT_PROP = new ScriptProp();
|
|
|
|
public static void statistics() throws IOException {
|
|
statistics(CAT_PROP);
|
|
System.out.println("<p>");
|
|
statistics(SCRIPT_PROP);
|
|
}
|
|
|
|
public static void statistics(SimpleProp prop) throws IOException {
|
|
int[][] count = new int[100][5];
|
|
int[][] sample = new int[100][5];
|
|
int[] subtotalCount = new int[5];
|
|
int[] totalCount = new int[5];
|
|
|
|
|
|
|
|
byte cat;
|
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
|
Utility.dot(cp);
|
|
if (!Default.ucd().isAllocated(cp)) {
|
|
cat = prop.getUnallocatedProp();
|
|
setSample(count[cat], sample[cat], 0, cp);
|
|
continue;
|
|
}
|
|
cat = prop.getProp(cp);
|
|
setSample(count[cat], sample[cat], 0, cp);
|
|
|
|
if (checkNormalizer(Default.nfd(), cp)) {
|
|
setSample(count[cat], sample[cat], NFD+1, cp);
|
|
}
|
|
if (checkNormalizer(Default.nfc(), cp)) {
|
|
setSample(count[cat], sample[cat], NFC+1, cp);
|
|
}
|
|
if (checkNormalizer(Default.nfkd(), cp)) {
|
|
setSample(count[cat], sample[cat], NFKD+1, cp);
|
|
}
|
|
if (checkNormalizer(Default.nfkc(), cp)) {
|
|
setSample(count[cat], sample[cat], NFKC+1, cp);
|
|
}
|
|
|
|
}
|
|
|
|
Utility.fixDot();
|
|
|
|
System.out.println("<table border='1' cellspacing='0' cellpadding='4'>");
|
|
System.out.print("<tr><th class='tt' colspan='2'>" + prop.getTitle() + "</th><th class='tn' colspan='2'>Count");
|
|
for (byte j = 0; j < 4; ++j) {
|
|
System.out.println("</th><th class='tn' colspan='2'>" + UCD_Names.NF_NAME[j]);
|
|
}
|
|
System.out.println("</th></tr>");
|
|
|
|
for (byte ii = 0; ii < count.length; ++ii) {
|
|
byte i = prop.getPermutation(ii);
|
|
// System.out.println(prop.getCode(ii) + ", " + ii + " => " + prop.getCode(i) + ", " + i);
|
|
if (count[i][0] == 0) continue;
|
|
|
|
String code = prop.getCode(i);
|
|
String name = prop.getName(i);
|
|
|
|
System.out.println(" <tr><th class='t'>" + code + "</th><th class='t'>" + name + "</th>");
|
|
for (byte j = 0; j < 5; ++j) {
|
|
if (count[i][j] == 0) System.out.println("<td colspan='2'> </td>");
|
|
else {
|
|
System.out.println(" <td class='n'><b>" + format.format(count[i][j]) + "</b></td>");
|
|
System.out.println(" <td class='s'><div title='" +
|
|
Default.ucd().getCodeAndName(sample[i][j]) + "'>" + quote(sample[i][j]) + "</div></td>");
|
|
}
|
|
subtotalCount[j] += count[i][j];
|
|
totalCount[j] += count[i][j];
|
|
}
|
|
System.out.println(" </tr>");
|
|
if (prop.doTotal(i, true)) printTotals("Subtotal", subtotalCount, true);
|
|
if (prop.doTotal(i, false)) printTotals("Cummulative Total", totalCount, false);
|
|
}
|
|
printTotals("Total", totalCount, false);
|
|
System.out.println("</table>");
|
|
}
|
|
|
|
static public String quote(int cp) {
|
|
byte cat2 = Default.ucd().getCategory(cp);
|
|
if (cat2 == Zs || cat2 == Zp || cat2 == Zl) return " ";
|
|
if (cat2 == Cc || cat2 == Cs) return "??";
|
|
if (cat2 == Mn || cat2 == Me || cat2 == Mc) return "◌&#" + cp + ";";
|
|
return "&#" + cp + ";";
|
|
}
|
|
|
|
static public void setSample(int[] count, int[] array, int index, int cp) {
|
|
count[index]++;
|
|
int value = array[index];
|
|
if (value == 0) {
|
|
array[index] = cp;
|
|
} else if (Default.ucd().isAllocated(cp)) {
|
|
int ncount1 = getNFCount(value, index);
|
|
int ncount2 = getNFCount(cp, index);
|
|
if (ncount1 != ncount2) {
|
|
if (ncount1 > ncount2) array[index] = cp;
|
|
return;
|
|
}
|
|
byte cat1 = CAT_PROP.getPermutation(CAT_PROP.getProp(value));
|
|
byte cat2 = CAT_PROP.getPermutation(CAT_PROP.getProp(cp));
|
|
if (cat1 > cat2) array[index] = cp;
|
|
}
|
|
}
|
|
|
|
public static int getNFCount(int cp, int index) {
|
|
int count = 0;
|
|
boolean nfc1 = checkNormalizer(Default.nfc(), cp);
|
|
boolean nfd1 = checkNormalizer(Default.nfd(), cp);
|
|
boolean nfkc1 = checkNormalizer(Default.nfkc(), cp);
|
|
boolean nfkd1 = checkNormalizer(Default.nfkd(), cp);
|
|
if (nfc1) count += 1;
|
|
if (nfd1) count += 2;
|
|
if (nfkc1) count += 4;
|
|
if (nfkd1) count += 8;
|
|
return count;
|
|
}
|
|
|
|
|
|
public static void printTotals(String title, int[] subtotalCount, boolean zeroit) {
|
|
System.out.println(" <tr><th class='tt' colspan='2'>" + title + "</th>");
|
|
for (byte j = 0; j < subtotalCount.length; ++j) {
|
|
System.out.println(" <td class='tn' colspan='2'>"
|
|
+ (subtotalCount[j] == 0 ? "" : format.format(subtotalCount[j])) + "</td>");
|
|
if (zeroit) subtotalCount[j] = 0;
|
|
}
|
|
}
|
|
|
|
public static boolean checkNormalizer(Normalizer x, int cp) {
|
|
boolean result = !x.isNormalized(cp);
|
|
if (false) {
|
|
String s = x.normalize(cp);
|
|
boolean sResult = !s.equals(UTF16.valueOf(cp));
|
|
if (result != sResult) {
|
|
System.out.println("Failure with " + x + " at " + Default.ucd().getCodeAndName(cp));
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
public static void checkBIDI() {
|
|
|
|
|
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
|
Utility.dot(cp);
|
|
if (!Default.ucd().isAllocated(cp)) continue;
|
|
|
|
if (Default.nfd().isNormalized(cp)) continue;
|
|
|
|
String decomp = Default.nfd().normalize(cp);
|
|
String comp = Default.nfc().normalize(cp);
|
|
String source = UTF16.valueOf(cp);
|
|
|
|
String bidiDecomp = getBidi(decomp, true);
|
|
String bidiComp = getBidi(comp, true);
|
|
String bidiSource = getBidi(source, true);
|
|
|
|
if (!bidiDecomp.equals(bidiSource) || !bidiComp.equals(bidiSource)) {
|
|
Utility.fixDot();
|
|
System.out.println(Default.ucd().getCodeAndName(cp) + ": " + getBidi(source, false));
|
|
System.out.println("\tNFC: " + Default.ucd().getCodeAndName(comp) + ": " + getBidi(comp, false));
|
|
System.out.println("\tNFD: " + Default.ucd().getCodeAndName(decomp) + ": " + getBidi(decomp, false));
|
|
}
|
|
}
|
|
}
|
|
|
|
public static String getBidi(String s, boolean compact) {
|
|
String result = "";
|
|
byte lastBidi = -1;
|
|
int cp;
|
|
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
|
|
cp = UTF16.charAt(s, i);
|
|
byte bidi = Default.ucd().getBidiClass(cp);
|
|
if (compact) {
|
|
if (bidi == BIDI_NSM) {
|
|
if (lastBidi != -1) bidi = lastBidi;
|
|
}
|
|
if (bidi == lastBidi && bidi != BIDI_ES && bidi != BIDI_CS) {
|
|
continue;
|
|
}
|
|
}
|
|
result += Default.ucd().getCase(
|
|
Default.ucd().getBidiClassID_fromIndex(bidi, SHORT), FULL, TITLE);
|
|
lastBidi = bidi;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
public static void verify() throws IOException {
|
|
|
|
|
|
checkIdentical("ea=h", "dt=nar");
|
|
checkIdentical("ea=f", "dt=wide");
|
|
checkIdentical("gc=ps", "lb=op");
|
|
checkIdentical("lb=sg", "gc=cs");
|
|
|
|
/*
|
|
For LB we now have:
|
|
|
|
GC:Ps == LB:OP
|
|
GC:Nd && !(EA:F)
|
|
|
|
Try these on for size, and report any discrepancies
|
|
|
|
>GC:L& && EA:W -> LB:ID
|
|
>GC:L& && EA:A -> LB:AI
|
|
>GC:L& && EA:N -> LB:AL
|
|
>GC:L& && EA:Na -> LB:AL
|
|
|
|
plus
|
|
|
|
>LB:ID contains Ideo:T
|
|
|
|
Also, try these rules
|
|
|
|
GC:S# && EA:W -> LB:ID
|
|
GC:S# && EA:A -> LB:AI
|
|
GC:S# && EA:N -> LB:AL
|
|
GC:S# && EA:Na -> LB:AL
|
|
|
|
where S# is Sm | Sk | So
|
|
|
|
these will generate exceptions, but I need to see the list to them before I
|
|
can help you narrow these down.
|
|
|
|
>The trivial ones that I could glean from reading the TR are
|
|
>LB:SG == GC:Cs
|
|
>GC:Pi -> LB:QU
|
|
>GC:Pf -> LB:QU
|
|
>GC:Mc -> LB:CM
|
|
>GC:Me -> LB:CM
|
|
>GC:Mn -> LB:CM
|
|
>GC:Pe -> LB:CL
|
|
*/
|
|
}
|
|
|
|
static final void checkCase3 () {
|
|
|
|
|
|
checkNF_AndCase("\u0130", true);
|
|
checkNF_AndCase("\u0131", true);
|
|
|
|
UCDProperty softdot = null;
|
|
CanonicalIterator cit = new CanonicalIterator("a");
|
|
UnicodeSet badChars = new UnicodeSet();
|
|
|
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
|
Utility.dot(cp);
|
|
if (!Default.ucd().isAllocated(cp)) continue;
|
|
byte cat = Default.ucd().getCategory(cp);
|
|
// check if canonical equivalents are case-mapped to canonical equivalents
|
|
if (cat != PRIVATE_USE && cat != SURROGATE) {
|
|
String str = UTF16.valueOf(cp);
|
|
if (!checkNF_AndCase(str, false)) badChars.add(cp);
|
|
//if (Default.ucd.getScript(cp) != GREEK_SCRIPT) continue;
|
|
str += "\u0334";
|
|
try {
|
|
//System.out.println("Check " + Default.ucd.getCodeAndName(str));
|
|
cit.setSource(str);
|
|
while (true) {
|
|
String s = cit.next();
|
|
if (s == null) break;
|
|
if (s.equals(str)) continue; // don't check twice
|
|
|
|
//System.out.println(" Checking " + Default.ucd.getCodeAndName(s));
|
|
if (!checkNF_AndCase(s, false)) badChars.add(cp);
|
|
}
|
|
} catch (StringIndexOutOfBoundsException e) {
|
|
System.out.println("Problem with " + Default.ucd().getCodeAndName(str));
|
|
throw e;
|
|
}
|
|
|
|
}
|
|
|
|
if (false) {
|
|
if (softdot == null) softdot = DerivedProperty.make(Type_i, Default.ucd());
|
|
if (Default.ucd().getBinaryProperty(cp, Soft_Dotted) !=
|
|
softdot.hasValue(cp)) {
|
|
System.out.println("FAIL: " + Default.ucd().getCodeAndName(cp));
|
|
System.out.println("Soft_Dotted='" + Default.ucd().getBinaryPropertiesID(cp, Soft_Dotted)
|
|
+ "', DerivedSD=" + softdot.getValue(cp) + "'");
|
|
}
|
|
}
|
|
|
|
}
|
|
System.out.println();
|
|
Utility.showSetNames("", badChars, false, Default.ucd());
|
|
}
|
|
|
|
static void checkIdentical(String ubpName1, String ubpName2) {
|
|
UCDProperty prop1 = UnifiedBinaryProperty.make(ubpName1, Default.ucd());
|
|
UnicodeSet set1 = prop1.getSet();
|
|
UCDProperty prop2 = UnifiedBinaryProperty.make(ubpName2, Default.ucd());
|
|
UnicodeSet set2 = prop2.getSet();
|
|
UnicodeSet set1minus2 = new UnicodeSet(set1);
|
|
set1minus2.removeAll(set2);
|
|
UnicodeSet set2minus1 = new UnicodeSet(set2);
|
|
set2minus1.removeAll(set1);
|
|
|
|
if (set1minus2.isEmpty() && set2minus1.isEmpty()) {
|
|
System.out.println("PASS: " + prop1.getFullName(LONG) + " == " + prop2.getFullName(LONG));
|
|
System.out.println();
|
|
return;
|
|
}
|
|
System.out.println("FAIL: " + prop1.getFullName(LONG) + " != " + prop2.getFullName(LONG));
|
|
if (!set1minus2.isEmpty()) {
|
|
System.out.println(" In " + prop1.getFullName(LONG) + " but not " + prop2.getFullName(LONG));
|
|
Utility.showSetNames(" " + prop1.getFullName(SHORT) + ": ", set1minus2, false, Default.ucd());
|
|
}
|
|
if (!set2minus1.isEmpty()) {
|
|
System.out.println(" In " + prop2.getFullName(LONG) + " but not " + prop1.getFullName(LONG));
|
|
Utility.showSetNames(" " + prop2.getFullName(SHORT) + ": ", set2minus1, false, Default.ucd());
|
|
}
|
|
System.out.println();
|
|
}
|
|
|
|
static boolean checkNF_AndCase(String source, boolean both) {
|
|
boolean result = true;
|
|
String decomp = Default.nfd().normalize(source);
|
|
if (!decomp.equals(source)) {
|
|
|
|
result &= checkNFC("Lower", source, decomp, Default.ucd().getCase(source, FULL, LOWER), Default.ucd().getCase(decomp, FULL, LOWER));
|
|
result &= checkNFC("Upper", source, decomp, Default.ucd().getCase(source, FULL, UPPER), Default.ucd().getCase(decomp, FULL, UPPER));
|
|
result &= checkNFC("Title", source, decomp, Default.ucd().getCase(source, FULL, TITLE), Default.ucd().getCase(decomp, FULL, TITLE));
|
|
result &= checkNFC("Fold", source, decomp, Default.ucd().getCase(source, FULL, FOLD), Default.ucd().getCase(decomp, FULL, FOLD));
|
|
|
|
if (!both) return result;
|
|
|
|
result &= checkNFC("SLower", source, decomp, Default.ucd().getCase(source, SIMPLE, LOWER), Default.ucd().getCase(decomp, SIMPLE, LOWER));
|
|
result &= checkNFC("SUpper", source, decomp, Default.ucd().getCase(source, SIMPLE, UPPER), Default.ucd().getCase(decomp, SIMPLE, UPPER));
|
|
result &= checkNFC("STitle", source, decomp, Default.ucd().getCase(source, SIMPLE, TITLE), Default.ucd().getCase(decomp, SIMPLE, TITLE));
|
|
result &= checkNFC("SFold", source, decomp, Default.ucd().getCase(source, SIMPLE, TITLE), Default.ucd().getCase(decomp, SIMPLE, TITLE));
|
|
}
|
|
return result;
|
|
}
|
|
|
|
static final boolean SHOW_NFC_DIFFERENCE = false;
|
|
|
|
static boolean checkNFC(String label, String source, String decomp, String casedCp, String casedDecomp) {
|
|
if (!Default.nfd().normalize(casedCp).equals(Default.nfd().normalize(casedDecomp))) {
|
|
if (SHOW_NFC_DIFFERENCE) {
|
|
Utility.fixDot();
|
|
System.out.println("FAIL CASE CE: " + label + " (" + Default.ucd().getCodeAndName(source) + ")");
|
|
System.out.println("\t" + Default.ucd().getCode(source) + " => " + Default.ucd().getCode(casedCp));
|
|
System.out.println("\t" + Default.ucd().getCode(decomp) + " => " + Default.ucd().getCode(casedDecomp));
|
|
}
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
public static final String IDN_DIR = BASE_DIR + "\\IDN\\";
|
|
|
|
/*
|
|
System.out.println(Default.ucd.toString(0x0387));
|
|
System.out.println(Default.ucd.toString(0x00B7));
|
|
System.out.println(Default.ucd.toString(0x03a3));
|
|
System.out.println(Default.ucd.toString(0x03c2));
|
|
System.out.println(Default.ucd.toString(0x03c3));
|
|
System.out.println(Default.ucd.toString(0x0069));
|
|
System.out.println(Default.ucd.toString(0x0130));
|
|
System.out.println(Default.ucd.toString(0x0131));
|
|
System.out.println(Default.ucd.toString(0x0345));
|
|
*/
|
|
|
|
static void checkAgainstOtherVersion(String otherVersion) {
|
|
|
|
UCD ucd2 = UCD.make(otherVersion);
|
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
|
UData curr = Default.ucd().get(cp, true);
|
|
UData other = ucd2.get(cp, true);
|
|
if (!curr.equals(other)) {
|
|
System.out.println("Difference at " + Default.ucd().getCodeAndName(cp));
|
|
System.out.println(curr);
|
|
System.out.println(curr);
|
|
System.out.println();
|
|
}
|
|
}
|
|
}
|
|
|
|
static void generateXML() throws IOException {
|
|
|
|
String filename = "UCD.xml";
|
|
PrintWriter log = Utility.openPrintWriter(filename, Utility.LATIN1_UNIX);
|
|
|
|
//log.println('\uFEFF');
|
|
log.println("<ucd>");
|
|
|
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
|
Utility.dot(cp);
|
|
if (!Default.ucd().isRepresented(cp)) continue;
|
|
if (cp == 0xE0026 || cp == 0x20000) {
|
|
System.out.println("debug");
|
|
}
|
|
log.println(Default.ucd().toString(cp));
|
|
}
|
|
|
|
log.println("</ucd>");
|
|
log.close();
|
|
}
|
|
|
|
static final byte MIXED = (byte)(UNCASED + 1);
|
|
|
|
public static void checkCase() throws IOException {
|
|
|
|
Utility.fixDot();
|
|
System.out.println("checkCase");
|
|
|
|
String test = "The qui'ck br\u2019own 'fox jum\u00ADped ov\u200Ber th\u200Ce lazy dog.";
|
|
|
|
String ttest = Default.ucd().getCase(test, FULL, TITLE);
|
|
|
|
PrintWriter titleTest = Utility.openPrintWriter("TestTitle.txt", Utility.LATIN1_UNIX);
|
|
titleTest.println(test);
|
|
titleTest.println(ttest);
|
|
titleTest.close();
|
|
|
|
System.out.println(Default.ucd().getCase("ABC,DE'F G\u0308H", FULL, TITLE));
|
|
String fileName = "CaseDifferences.txt";
|
|
PrintWriter log = Utility.openPrintWriter(fileName, Utility.LATIN1_UNIX);
|
|
|
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
|
Utility.dot(cp);
|
|
if (!Default.ucd().isRepresented(cp) || Default.ucd().isPUA(cp)) continue;
|
|
if (cp == '\u3371') {
|
|
System.out.println("debug");
|
|
}
|
|
String x = Default.nfkd().normalize(cp);
|
|
String xu = Default.ucd().getCase(x, FULL, UPPER);
|
|
String xl = Default.ucd().getCase(x, FULL, LOWER);
|
|
String xt = Default.ucd().getCase(x, FULL, TITLE);
|
|
|
|
byte caseCat = MIXED;
|
|
if (xu.equals(xl)) caseCat = UNCASED;
|
|
else if (x.equals(xl)) caseCat = LOWER;
|
|
else if (x.equals(xu)) caseCat = UPPER;
|
|
else if (x.equals(xt)) caseCat = TITLE;
|
|
|
|
byte cat = Default.ucd().getCategory(cp);
|
|
boolean otherLower = Default.ucd().getBinaryProperty(cp, Other_Lowercase);
|
|
boolean otherUpper = Default.ucd().getBinaryProperty(cp, Other_Uppercase);
|
|
byte oldCaseCat = (cat == Lu || otherUpper) ? UPPER
|
|
: (cat == Ll || otherLower) ? LOWER
|
|
: (cat == Lt) ? TITLE
|
|
: UNCASED;
|
|
|
|
if (caseCat != oldCaseCat) {
|
|
log.println(UTF32.valueOf32(cp)
|
|
+ "\t" + names[caseCat]
|
|
+ "\t" + names[oldCaseCat]
|
|
+ "\t" + Default.ucd().getCategoryID_fromIndex(cat)
|
|
+ "\t" + lowerNames[otherLower ? 1 : 0]
|
|
+ "\t" + upperNames[otherUpper ? 1 : 0]
|
|
+ "\t" + Default.ucd().getCodeAndName(cp)
|
|
+ "\t" + Default.ucd().getCodeAndName(x)
|
|
+ "\t" + Default.ucd().getCodeAndName(xu)
|
|
+ "\t" + Default.ucd().getCodeAndName(xl)
|
|
+ "\t" + Default.ucd().getCodeAndName(xt)
|
|
);
|
|
}
|
|
}
|
|
|
|
log.close();
|
|
}
|
|
|
|
public static void checkCase2(boolean longForm) throws IOException {
|
|
|
|
Utility.fixDot();
|
|
System.out.println("checkCase");
|
|
|
|
/*String tx1 = "\u0391\u0342\u0345";
|
|
String ux1 = "\u0391\u0342\u0399";
|
|
String ctx1 = nfc.normalize(tx1);
|
|
String ctx2 = nfc.normalize(ux1); // wrong??
|
|
|
|
//System.out.println(Default.ucd.getCase("ABC,DE'F G\u0308H", FULL, TITLE));
|
|
*/
|
|
|
|
|
|
String fileName = "CaseNormalizationDifferences.txt";
|
|
PrintWriter log = Utility.openPrintWriter(fileName, Utility.LATIN1_UNIX);
|
|
|
|
log.println("Differences between case(normalize(cp)) and normalize(case(cp))");
|
|
log.println("u, l, t - upper, lower, title");
|
|
log.println("c, d - nfc, nfd");
|
|
|
|
//Utility.DOTMASK = 0x7F;
|
|
|
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
|
Utility.dot(cp);
|
|
if (!Default.ucd().isRepresented(cp) || Default.ucd().isPUA(cp)) continue;
|
|
if (cp == '\u0130') {
|
|
System.out.println("debug");
|
|
}
|
|
|
|
String x = UTF32.valueOf32(cp);
|
|
String dx = Default.nfd().normalize(cp);
|
|
String cx = Default.nfc().normalize(cp);
|
|
|
|
String ux = Default.ucd().getCase(x, FULL, UPPER);
|
|
String lx = Default.ucd().getCase(x, FULL, LOWER);
|
|
String tx = Default.ucd().getCase(x, FULL, TITLE);
|
|
|
|
if (x.equals(dx) && dx.equals(cx) && cx.equals(ux) && ux.equals(lx) && lx.equals(tx)) continue;
|
|
|
|
String cux = Default.nfc().normalize(ux);
|
|
String clx = Default.nfc().normalize(lx);
|
|
String ctx = Default.nfc().normalize(tx);
|
|
|
|
if (x.equals(cx)) {
|
|
boolean needBreak = false;
|
|
if (!clx.equals(lx)) needBreak = true;
|
|
if (!ctx.equals(tx)) needBreak = true;
|
|
if (!cux.equals(ux)) needBreak = true;
|
|
|
|
if (needBreak) {
|
|
log.println("# Was not NFC:");
|
|
log.println(
|
|
"## " + Utility.hex(x) + "; "
|
|
+ Utility.hex(lx) + "; "
|
|
+ Utility.hex(tx) + "; "
|
|
+ Utility.hex(ux) + "; # "
|
|
+ Default.ucd().getName(x));
|
|
log.println("# should be:");
|
|
log.println(
|
|
Utility.hex(x) + "; "
|
|
+ Utility.hex(clx) + "; "
|
|
+ Utility.hex(ctx) + "; "
|
|
+ Utility.hex(cux) + "; # "
|
|
+ Default.ucd().getName(x));
|
|
log.println();
|
|
}
|
|
}
|
|
|
|
String dux = Default.nfd().normalize(ux);
|
|
String dlx = Default.nfd().normalize(lx);
|
|
String dtx = Default.nfd().normalize(tx);
|
|
|
|
|
|
|
|
String startdx = getMarks(dx, false);
|
|
String enddx = getMarks(dx, true);
|
|
|
|
String startdux = getMarks(dux, false);
|
|
String enddux = getMarks(dux, true);
|
|
|
|
String startdtx = getMarks(dtx, false);
|
|
String enddtx = getMarks(dtx, true);
|
|
|
|
String startdlx = getMarks(dlx, false);
|
|
String enddlx = getMarks(dlx, true);
|
|
|
|
// If the new marks don't occur in the old decomposition, we got a problem!
|
|
|
|
if (!startdx.startsWith(startdux) || !startdx.startsWith(startdtx) || !startdx.startsWith(startdlx)
|
|
|| !enddx.endsWith(enddux) || !enddx.endsWith(enddtx) || !enddx.endsWith(enddlx)) {
|
|
log.println("Combining Class Difference for " + Default.ucd().getCodeAndName(x));
|
|
log.println("x: " + Default.ucd().getCodeAndName(dx) + ", " + Utility.hex(startdx) + ", " + Utility.hex(enddx));
|
|
log.println("ux: " + Default.ucd().getCodeAndName(dux) + ", " + Utility.hex(startdux) + ", " + Utility.hex(enddux));
|
|
log.println("tx: " + Default.ucd().getCodeAndName(dtx) + ", " + Utility.hex(startdtx) + ", " + Utility.hex(enddtx));
|
|
log.println("lx: " + Default.ucd().getCodeAndName(dlx) + ", " + Utility.hex(startdlx) + ", " + Utility.hex(enddlx));
|
|
log.println();
|
|
}
|
|
|
|
|
|
if (!longForm) continue;
|
|
|
|
String udx = Default.ucd().getCase(dx, FULL, UPPER);
|
|
String ldx = Default.ucd().getCase(dx, FULL, LOWER);
|
|
String tdx = Default.ucd().getCase(dx, FULL, TITLE);
|
|
|
|
String ucx = Default.ucd().getCase(cx, FULL, UPPER);
|
|
String lcx = Default.ucd().getCase(cx, FULL, LOWER);
|
|
String tcx = Default.ucd().getCase(cx, FULL, TITLE);
|
|
|
|
String dudx = Default.nfd().normalize(udx);
|
|
String dldx = Default.nfd().normalize(ldx);
|
|
String dtdx = Default.nfd().normalize(tdx);
|
|
|
|
String cucx = Default.nfc().normalize(ucx);
|
|
String clcx = Default.nfc().normalize(lcx);
|
|
String ctcx = Default.nfc().normalize(tcx);
|
|
|
|
|
|
if (!dux.equals(udx)
|
|
|| !dlx.equals(ldx)
|
|
|| !dtx.equals(tdx)
|
|
|| !cux.equals(ucx)
|
|
|| !clx.equals(lcx)
|
|
|| !ctx.equals(tcx)
|
|
|| !dux.equals(dudx)
|
|
|| !dlx.equals(dldx)
|
|
|| !dtx.equals(dtdx)
|
|
|| !cux.equals(cucx)
|
|
|| !clx.equals(clcx)
|
|
|| !ctx.equals(ctcx)
|
|
) {
|
|
log.println();
|
|
log.println("Difference at " + Default.ucd().getCodeAndName(cp));
|
|
if (!x.equals(ux)) log.println("\tu(cp):\t" + Default.ucd().getCodeAndName(ux));
|
|
if (!x.equals(lx)) log.println("\tl(cp):\t" + Default.ucd().getCodeAndName(lx));
|
|
if (!tx.equals(ux)) log.println("\tt(cp):\t" + Default.ucd().getCodeAndName(tx));
|
|
if (!x.equals(dx)) log.println("\td(cp):\t" + Default.ucd().getCodeAndName(dx));
|
|
if (!x.equals(cx)) log.println("\tc(cp):\t" + Default.ucd().getCodeAndName(cx));
|
|
|
|
if (!dux.equals(udx)) {
|
|
log.println();
|
|
log.println("\td(u(cp)):\t" + Default.ucd().getCodeAndName(dux));
|
|
log.println("\tu(d(cp)):\t" + Default.ucd().getCodeAndName(udx));
|
|
}
|
|
if (!dlx.equals(ldx)) {
|
|
log.println();
|
|
log.println("\td(l(cp)):\t" + Default.ucd().getCodeAndName(dlx));
|
|
log.println("\tl(d(cp)):\t" + Default.ucd().getCodeAndName(ldx));
|
|
}
|
|
if (!dtx.equals(tdx)) {
|
|
log.println();
|
|
log.println("\td(t(cp)):\t" + Default.ucd().getCodeAndName(dtx));
|
|
log.println("\tt(d(cp)):\t" + Default.ucd().getCodeAndName(tdx));
|
|
}
|
|
|
|
if (!cux.equals(ucx)) {
|
|
log.println();
|
|
log.println("\tc(u(cp)):\t" + Default.ucd().getCodeAndName(cux));
|
|
log.println("\tu(c(cp)):\t" + Default.ucd().getCodeAndName(ucx));
|
|
}
|
|
if (!clx.equals(lcx)) {
|
|
log.println();
|
|
log.println("\tc(l(cp)):\t" + Default.ucd().getCodeAndName(clx));
|
|
log.println("\tl(c(cp)):\t" + Default.ucd().getCodeAndName(lcx));
|
|
}
|
|
if (!ctx.equals(tcx)) {
|
|
log.println();
|
|
log.println("\tc(t(cp)):\t" + Default.ucd().getCodeAndName(ctx));
|
|
log.println("\tt(c(cp)):\t" + Default.ucd().getCodeAndName(tcx));
|
|
}
|
|
|
|
// ...........
|
|
|
|
if (!udx.equals(dudx)) {
|
|
log.println();
|
|
log.println("\tu(d(cp)):\t" + Default.ucd().getCodeAndName(udx));
|
|
log.println("\td(u(d(cp))):\t" + Default.ucd().getCodeAndName(dudx));
|
|
}
|
|
if (!ldx.equals(dldx)) {
|
|
log.println();
|
|
log.println("\tl(d(cp)):\t" + Default.ucd().getCodeAndName(ldx));
|
|
log.println("\td(l(d(cp))):\t" + Default.ucd().getCodeAndName(dldx));
|
|
}
|
|
if (!tdx.equals(dtdx)) {
|
|
log.println();
|
|
log.println("\tt(d(cp)):\t" + Default.ucd().getCodeAndName(tdx));
|
|
log.println("\td(t(d(cp))):\t" + Default.ucd().getCodeAndName(dtdx));
|
|
}
|
|
|
|
if (!ucx.equals(cucx)) {
|
|
log.println();
|
|
log.println("\tu(c(cp)):\t" + Default.ucd().getCodeAndName(ucx));
|
|
log.println("\tc(u(c(cp))):\t" + Default.ucd().getCodeAndName(cucx));
|
|
}
|
|
if (!lcx.equals(clcx)) {
|
|
log.println();
|
|
log.println("\tl(c(cp)):\t" + Default.ucd().getCodeAndName(lcx));
|
|
log.println("\tc(l(c(cp))):\t" + Default.ucd().getCodeAndName(clcx));
|
|
}
|
|
if (!tcx.equals(ctcx)) {
|
|
log.println();
|
|
log.println("\tt(c(cp)):\t" + Default.ucd().getCodeAndName(tcx));
|
|
log.println("\tc(t(c(cp))):\t" + Default.ucd().getCodeAndName(ctcx));
|
|
}
|
|
}
|
|
}
|
|
|
|
log.close();
|
|
}
|
|
|
|
public static String getMarks(String s, boolean doEnd) {
|
|
int cp;
|
|
if (!doEnd) {
|
|
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
|
|
cp = UTF16.charAt(s, i);
|
|
int cc = Default.ucd().getCombiningClass(cp);
|
|
if (cc == 0) {
|
|
return s.substring(0, i);
|
|
}
|
|
}
|
|
} else {
|
|
for (int i = s.length(); i > 0; i -= UTF16.getCharCount(cp)) {
|
|
cp = UTF16.charAt(s, i-1); // will go 2 before if necessary
|
|
int cc = Default.ucd().getCombiningClass(cp);
|
|
if (cc == 0) {
|
|
return s.substring(i);
|
|
}
|
|
}
|
|
}
|
|
return s;
|
|
}
|
|
|
|
static final String names[] = {"LOWER", "TITLE", "UPPER", "(UNC)", "MIXED"};
|
|
static final String names2[] = {"LOWER", "TITLE", "UPPER", "FOLD"};
|
|
static final String lowerNames[] = {"", "Other_Lower"};
|
|
static final String upperNames[] = {"", "Other_Upper"};
|
|
|
|
public static void CheckCaseFold() {
|
|
|
|
System.out.println("Checking Case Fold");
|
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
|
Utility.dot(cp);
|
|
if (!Default.ucd().isAssigned(cp) || Default.ucd().isPUA(cp)) continue;
|
|
|
|
boolean failed = false;
|
|
String fullTest = Default.ucd().getCase(Default.ucd().getCase(cp, FULL, UPPER), FULL, LOWER);
|
|
String simpleTest = Default.ucd().getCase(Default.ucd().getCase(cp, SIMPLE, UPPER), SIMPLE, LOWER);
|
|
|
|
String full = Default.ucd().getCase(cp, FULL, FOLD);
|
|
String simple = Default.ucd().getCase(cp, SIMPLE, FOLD);
|
|
|
|
String realTest = "\u0360" + UTF16.valueOf(cp) + "\u0334";
|
|
|
|
int ccc = Default.ucd().getCombiningClass(cp);
|
|
|
|
for (byte style = FOLD; style < LIMIT_CASE; ++style) {
|
|
|
|
String fold_NFD = Default.nfd().normalize(Default.ucd().getCase(realTest, FULL, style));
|
|
String NFD_fold = Default.ucd().getCase(Default.nfd().normalize(realTest), FULL, style);
|
|
if (!fold_NFD.equals(NFD_fold)) {
|
|
Utility.fixDot();
|
|
System.out.println("Case check fails at " + Default.ucd().getCodeAndName(cp));
|
|
System.out.println("\t" + names2[style] + ", then NFD: " + Default.ucd().getCodeAndName(fold_NFD));
|
|
System.out.println("\tNFD, then " + names2[style] + ": " + Default.ucd().getCodeAndName(NFD_fold));
|
|
failed = true;
|
|
}
|
|
}
|
|
|
|
/*
|
|
|
|
int ccc = Default.ucd.getCombiningClass(cp);
|
|
|
|
int cp2;
|
|
for (int i = 0; i < full.length(); i += UTF16.getCharCount(cp2)) {
|
|
cp2 = UTF16.charAt(full, i);
|
|
int ccc2 = Default.ucd.getCombiningClass(cp2);
|
|
if (ccc2 != ccc) {
|
|
System.out.println("Case fold CCC fails at " + Default.ucd.getCodeAndName(cp));
|
|
System.out.println("\tFull case folding:" + ccc2 + ", " + Default.ucd.getCodeAndName(full));
|
|
System.out.println("\tccc:" + ccc);
|
|
System.out.println("\tccc:" + ccc2 + ", " + Default.ucd.getCodeAndName(cp2));
|
|
failed = true;
|
|
}
|
|
}
|
|
|
|
*/
|
|
|
|
if (!full.equals(fullTest)) {
|
|
Utility.fixDot();
|
|
System.out.println("Case fold fails at " + Default.ucd().getCodeAndName(cp));
|
|
System.out.println(" fullFold(ch): " + Default.ucd().getCodeAndName(full));
|
|
System.out.println(" fullUpper(fullLower(ch)): " + Default.ucd().getCodeAndName(fullTest));
|
|
failed = true;
|
|
}
|
|
if (!simple.equals(simpleTest)) {
|
|
Utility.fixDot();
|
|
if (!failed) System.out.println("Case fold fails at " + Default.ucd().getCodeAndName(cp));
|
|
System.out.println(" simpleFold(ch): " + Default.ucd().getCodeAndName(simple));
|
|
System.out.println(" simpleUpper(simpleLower(ch)): " + Default.ucd().getCodeAndName(simpleTest));
|
|
failed = true;
|
|
}
|
|
if (failed) System.out.println();
|
|
}
|
|
}
|
|
|
|
public static void compareBlueberry() {
|
|
|
|
|
|
UnicodeSet NameStartChar = new UnicodeSet("[A-Z:_a-z\\u00C0-\\u02FF"
|
|
+ "\\u0370-\\u037D\\u037F-\\u2027\\u202A-\\u218F\\u2800-\\uD7FF"
|
|
+ "\\uE000-\\uFDCF\\uFDE0-\\uFFEF\\U00010000-\\U0010FFFF]");
|
|
System.out.println("NameStartChar:");
|
|
System.out.println("\t" + NameStartChar.toPattern(true));
|
|
|
|
UnicodeSet NameChar = new UnicodeSet("[-.0-9\\u00b7\\u0300-\\u036F]");
|
|
System.out.println("NameChar-:");
|
|
System.out.println("\t" + NameChar.toPattern(true));
|
|
NameChar.addAll(NameStartChar);
|
|
System.out.println("NameChar:");
|
|
System.out.println("\t" + NameChar.toPattern(true));
|
|
|
|
UCDProperty IDstart = DerivedProperty.make(Mod_ID_Start, Default.ucd());
|
|
UCDProperty IDcontinue = DerivedProperty.make(Mod_ID_Continue_NO_Cf, Default.ucd());
|
|
|
|
UnicodeSet IDContinueMinusNameChar = new UnicodeSet();
|
|
UnicodeSet IDStartMinusNameChar = new UnicodeSet();
|
|
UnicodeSet IDStartMinusNameStartChar = new UnicodeSet();
|
|
UnicodeSet UnassignedMinusNameChar = new UnicodeSet();
|
|
|
|
for (int cp = 0; cp < 0x10FFFF; ++cp) {
|
|
Utility.dot(cp);
|
|
|
|
if (Default.ucd().isPUA(cp)) continue;
|
|
if (!Default.ucd().isAssigned(cp) && !NameChar.contains(cp)) {
|
|
UnassignedMinusNameChar.add(cp);
|
|
} else if (IDcontinue.hasValue(cp) && !NameChar.contains(cp)) {
|
|
IDContinueMinusNameChar.add(cp);
|
|
} else if (IDstart.hasValue(cp)) {
|
|
if (!NameChar.contains(cp)) {
|
|
IDStartMinusNameChar.add(cp);
|
|
} else if (!NameStartChar.contains(cp)) {
|
|
IDStartMinusNameStartChar.add(cp);
|
|
}
|
|
}
|
|
}
|
|
System.out.println("IDContinueMinusNameChar: ");
|
|
System.out.println("\t" + IDContinueMinusNameChar.toPattern(true));
|
|
Utility.showSetNames("\t", IDContinueMinusNameChar, false, Default.ucd());
|
|
System.out.println("IDStartMinusNameChar: ");
|
|
System.out.println("\t" + IDStartMinusNameChar.toPattern(true));
|
|
System.out.println("IDStartMinusNameStartChar: ");
|
|
System.out.println("\t" + IDStartMinusNameStartChar.toPattern(true));
|
|
System.out.println("UnassignedMinusNameChar: ");
|
|
System.out.println("\t" + UnassignedMinusNameChar.toPattern(true));
|
|
}
|
|
|
|
public static void VerifyIDN() throws IOException {
|
|
|
|
System.out.println("VerifyIDN");
|
|
|
|
System.out.println();
|
|
System.out.println("Checking Map");
|
|
System.out.println();
|
|
|
|
BitSet mappedOut = new BitSet();
|
|
int errorCount = verifyUTFMap(mappedOut);
|
|
|
|
BitSet unassigned = getIDNList("IDN-Unassigned.txt");
|
|
BitSet prohibited = getIDNList("IDN-Prohibited.txt");
|
|
BitSet guessSet = guessIDN();
|
|
|
|
System.out.println();
|
|
System.out.println("Checking Prohibited and Unassigned");
|
|
System.out.println();
|
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
|
Utility.dot(cp);
|
|
if (mappedOut.get(cp)) continue;
|
|
|
|
boolean ucdUnassigned = !Default.ucd().isAllocated(cp);
|
|
boolean idnUnassigned = unassigned.get(cp);
|
|
boolean guess = guessSet.get(cp);
|
|
boolean idnProhibited = prohibited.get(cp);
|
|
|
|
if (ucdUnassigned && !idnUnassigned) {
|
|
showError("?UCD Unassigned but not IDN Unassigned", cp, "");
|
|
++errorCount;
|
|
} else if (!ucdUnassigned && idnUnassigned) {
|
|
showError("?Not UCD Unassigned but IDN Unassigned", cp, "");
|
|
++errorCount;
|
|
}
|
|
|
|
if (idnProhibited && unassigned.get(cp)) {
|
|
showError("?Both IDN Unassigned AND IDN Prohibited", cp, "");
|
|
++errorCount;
|
|
}
|
|
|
|
if (guess && !idnProhibited) {
|
|
showError("?UCD ?prohibited? but not IDN Prohibited ", cp, "");
|
|
++errorCount;
|
|
} else if (!guess && idnProhibited) {
|
|
showError("?Not UCD ?prohibited? but IDN Prohibited ", cp, "");
|
|
++errorCount;
|
|
}
|
|
|
|
if (cp == 0x3131) {
|
|
System.out.println("Debug: " + idnProhibited
|
|
+ ", " + idnUnassigned
|
|
+ ", " + !Default.nfkd().isNormalized(cp)
|
|
+ ", " + Default.ucd().getCodeAndName(Default.nfkc().normalize(cp))
|
|
+ ", " + Default.ucd().getCodeAndName(Default.nfc().normalize(cp)));
|
|
}
|
|
|
|
if (!idnProhibited && ! idnUnassigned && !Default.nfkd().isNormalized(cp)) {
|
|
String kc = Default.nfkc().normalize(cp);
|
|
String c = Default.nfc().normalize(cp);
|
|
if (kc.equals(c)) continue;
|
|
int cp2;
|
|
boolean excluded = false;
|
|
for (int j = 0; j < kc.length(); j += UTF16.getCharCount(cp2)) {
|
|
cp2 = UTF16.charAt(kc, j);
|
|
if (prohibited.get(cp2)) {
|
|
showError("Prohibited with NFKC, but output with NFC", cp, "");
|
|
excluded = true;
|
|
break;
|
|
}
|
|
}
|
|
if (!excluded) {
|
|
showError("Remapped to core abstract character with NFKC (but not NFC)", cp, ""); // , "\t=> " + Default.ucd.getCodeAndName(kc));
|
|
}
|
|
}
|
|
|
|
}
|
|
System.out.println("Writing IDNCheck.txt");
|
|
|
|
|
|
PrintWriter log = Utility.openPrintWriter("IDNCheck.txt", Utility.LATIN1_UNIX);
|
|
log.println("IDN Check");
|
|
log.println("Total Errors: " + errorCount);
|
|
|
|
Iterator it = idnMap.keySet().iterator();
|
|
while (it.hasNext()) {
|
|
String description = (String) it.next();
|
|
Map map = (Map) idnMap.get(description);
|
|
log.println();
|
|
log.println(description);
|
|
log.println("Total: " + map.size());
|
|
log.println();
|
|
|
|
Iterator it2 = map.keySet().iterator();
|
|
while (it2.hasNext()) {
|
|
Object key = it2.next();
|
|
String line = (String) map.get(key);
|
|
log.println(" " + line);
|
|
}
|
|
}
|
|
log.close();
|
|
}
|
|
|
|
static Map idnMap = new java.util.HashMap();
|
|
|
|
static void showError(String description, int cp, String option) {
|
|
Map probe = (Map) idnMap.get(description);
|
|
if (probe == null) {
|
|
probe = new TreeMap();
|
|
idnMap.put(description, probe);
|
|
}
|
|
probe.put(new Integer(cp), Default.ucd().getCodeAndName(cp) + " (" + Default.ucd().getCategoryID(cp) + ")" + option);
|
|
}
|
|
|
|
static void showDifferences(PrintWriter log, UnicodeSet s1, String name1, UnicodeSet s2, String name2, boolean both) {
|
|
if (!s1.equals(s2)) {
|
|
log.println();
|
|
log.println("In " + name1 + ", but NOT " + name2);
|
|
Utility.showSetNames(log," ", new UnicodeSet(s1).removeAll(s2), false, false, Default.ucd());
|
|
log.println();
|
|
log.println("NOT in " + name1 + ", but in " + name2);
|
|
Utility.showSetNames(log," ", new UnicodeSet(s2).removeAll(s1), false, false, Default.ucd());
|
|
log.println();
|
|
if (both) {
|
|
log.println("In both " + name1 + " AND " + name2);
|
|
Utility.showSetNames(log," ", new UnicodeSet(s2).retainAll(s1), false, false, Default.ucd());
|
|
log.println();
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
public static void genIDN() throws IOException {
|
|
PrintWriter out = new PrintWriter(System.out);
|
|
|
|
PrintWriter log = Utility.openPrintWriter("IDN-tables.txt", Utility.LATIN1_UNIX);
|
|
|
|
/*UnicodeSet y = UnifiedBinaryProperty.make(CATEGORY + FORMAT).getSet();
|
|
UnicodeSet x = new UnicodeSet(0xE0001,0xE007F).retainAll(y);
|
|
|
|
System.out.println("y: " + y.toPattern(true));
|
|
System.out.println("x: " + x.toPattern(true));
|
|
Utility.showSetNames(out, "* ", x, false, true, Default.ucd);
|
|
out.flush();
|
|
*/
|
|
|
|
|
|
// table1
|
|
System.out.println("Getting Basics");
|
|
UnicodeSet unassigned = UnifiedBinaryProperty.make(CATEGORY + UNASSIGNED).getSet();
|
|
System.out.print(".");
|
|
UnicodeSet lineSeparators = UnifiedBinaryProperty.make(CATEGORY+LINE_SEPARATOR).getSet();
|
|
System.out.print(".");
|
|
UnicodeSet paraSeparators = UnifiedBinaryProperty.make(CATEGORY+PARAGRAPH_SEPARATOR).getSet();
|
|
System.out.print(".");
|
|
UnicodeSet spaceSeparators = UnifiedBinaryProperty.make(CATEGORY+SPACE_SEPARATOR).getSet();
|
|
System.out.print(".");
|
|
UnicodeSet noncharacters = UnifiedBinaryProperty.make(BINARY_PROPERTIES + Noncharacter_Code_Point).getSet();
|
|
System.out.print(".");
|
|
UnicodeSet deprecated = UnifiedBinaryProperty.make(BINARY_PROPERTIES + Deprecated).getSet();
|
|
System.out.print(".");
|
|
UnicodeSet format = UnifiedBinaryProperty.make(CATEGORY + FORMAT).getSet();
|
|
System.out.print(".");
|
|
UnicodeSet bidi_control = UnifiedBinaryProperty.make(BINARY_PROPERTIES+Bidi_Control).getSet();
|
|
System.out.print(".");
|
|
UnicodeSet binary_IDS = UnifiedBinaryProperty.make(BINARY_PROPERTIES+IDS_BinaryOperator).getSet();
|
|
System.out.print(".");
|
|
UnicodeSet trinary_IDS = UnifiedBinaryProperty.make(BINARY_PROPERTIES+IDS_TrinaryOperator).getSet();
|
|
System.out.print(".");
|
|
UnicodeSet whitespace = UnifiedBinaryProperty.make(BINARY_PROPERTIES+White_space).getSet();
|
|
whitespace.addAll(spaceSeparators); // bug.
|
|
System.out.print(".");
|
|
|
|
UnicodeSet defaultIgnorable = UnifiedBinaryProperty.make(DERIVED + DefaultIgnorable).getSet();
|
|
System.out.print(".");
|
|
|
|
UnicodeSet privateUse = UnifiedBinaryProperty.make(CATEGORY+PRIVATE_USE).getSet();
|
|
System.out.print(".");
|
|
UnicodeSet control = UnifiedBinaryProperty.make(CATEGORY+Cc).getSet();
|
|
System.out.print(".");
|
|
UnicodeSet surrogate = UnifiedBinaryProperty.make(CATEGORY+SURROGATE).getSet();
|
|
|
|
System.out.println("Building Sets");
|
|
// small test:
|
|
|
|
if (DEBUG) {
|
|
showDifferences(log, whitespace, "White_Space",
|
|
new UnicodeSet(spaceSeparators).addAll(lineSeparators).addAll(paraSeparators), "Separators", true);
|
|
|
|
showDifferences(log, UnifiedBinaryProperty.make(DERIVED + ID_Start).getSet(), "ID_Start",
|
|
UnifiedBinaryProperty.make(DERIVED + Mod_ID_Start).getSet(), "XID_Start", false);
|
|
|
|
showDifferences(log, UnifiedBinaryProperty.make(DERIVED + ID_Continue_NO_Cf).getSet(), "ID_Continue",
|
|
UnifiedBinaryProperty.make(DERIVED + Mod_ID_Continue_NO_Cf).getSet(), "XID_Continue", false);
|
|
|
|
System.out.println("Done with Test");
|
|
}
|
|
|
|
UnicodeSet A1 = new UnicodeSet(unassigned).removeAll(noncharacters);
|
|
|
|
// special code for B1
|
|
|
|
/*
|
|
B1, old
|
|
00AD; SOFT HYPHEN
|
|
1806; MONGOLIAN TODO SOFT HYPHEN
|
|
180B; MONGOLIAN FREE VARIATION SELECTOR ONE
|
|
180C; MONGOLIAN FREE VARIATION SELECTOR TWO
|
|
180D; MONGOLIAN FREE VARIATION SELECTOR THREE
|
|
200B; ZERO WIDTH SPACE
|
|
200C; ZERO WIDTH NON-JOINER
|
|
200D; ZERO WIDTH JOINER
|
|
FEFF; ZERO WIDTH NO-BREAK SPACE
|
|
*/
|
|
|
|
UnicodeSet B1 = new UnicodeSet().add(0xAD).add(0x1806).add(0x034F); // START WITH soft hyphen, mongolian soft hyphen, grapheme joiner
|
|
// THEN ADD default ignorables or format characters that are *variation* or *zero width*
|
|
UnicodeSet temp = new UnicodeSet(defaultIgnorable).addAll(format).addAll(spaceSeparators)
|
|
.removeAll(surrogate).removeAll(control); // remove some just to avoid clutter when debugging.
|
|
UnicodeSetIterator it = new UnicodeSetIterator(temp);
|
|
while(it.next()) {
|
|
if (!Default.ucd().isAssigned(it.codepoint)) continue;
|
|
String name = Default.ucd().getName(it.codepoint);
|
|
System.out.print(Default.ucd().getCodeAndName(it.codepoint));
|
|
|
|
if (name.indexOf("VARIATION") >= 0 || name.indexOf("ZERO") >= 0
|
|
|| name.indexOf("WORD JOINER") >= 0) {
|
|
B1.add(it.codepoint);
|
|
System.out.print("*");
|
|
}
|
|
System.out.println();
|
|
}
|
|
|
|
UnicodeSet C1 = new UnicodeSet(whitespace).removeAll(control).removeAll(lineSeparators)
|
|
.removeAll(paraSeparators);
|
|
|
|
UnicodeSet C2 = new UnicodeSet(defaultIgnorable).removeAll(unassigned).removeAll(surrogate)
|
|
.addAll(control).addAll(format).addAll(lineSeparators).addAll(paraSeparators);
|
|
|
|
UnicodeSet C3 = new UnicodeSet(privateUse);
|
|
|
|
UnicodeSet C4 = new UnicodeSet(noncharacters);
|
|
|
|
UnicodeSet C5 = new UnicodeSet(surrogate);
|
|
|
|
UnicodeSet C6 = new UnicodeSet(0xFFF9, 0xFFFC).add(0xFFFD);
|
|
|
|
UnicodeSet C7 = new UnicodeSet(binary_IDS).addAll(trinary_IDS);
|
|
|
|
UnicodeSet C8 = new UnicodeSet(deprecated).addAll(bidi_control);
|
|
|
|
UnicodeSet C9 = new UnicodeSet(0xE0001,0xE007F).retainAll(format);
|
|
//Utility.showSetNames(out, "\t&&& ", C9, false, true, Default.ucd);
|
|
//out.flush();
|
|
|
|
|
|
// FIX UP SETS!!
|
|
B1.removeAll(C6);
|
|
B1.removeAll(C8);
|
|
B1.removeAll(C9);
|
|
|
|
C1.removeAll(B1);
|
|
|
|
C2.removeAll(B1);
|
|
C2.removeAll(C6);
|
|
C2.removeAll(C8);
|
|
C2.removeAll(C9);
|
|
|
|
System.out.println("Check that A1, B1, C1..9 are disjoint");
|
|
|
|
UnicodeSet[] test = {A1, B1, C1, C2, C3, C4, C5, C6, C7, C8, C9};
|
|
String[] testNames = {"A1", "B1", "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9"};
|
|
UnicodeSet union = new UnicodeSet();
|
|
|
|
for (int i = 0; i < test.length; ++i) {
|
|
union.addAll(test[i]);
|
|
for (int j = i + 1; j < test.length; ++j) {
|
|
if (test[i].containsNone(test[j])) continue;
|
|
log.println(testNames[i] + " and " + testNames[j] + " intersect!");
|
|
UnicodeSet intersection = new UnicodeSet(test[i]).retainAll(test[j]);
|
|
Utility.showSetNames(log," ", intersection, false, true, Default.ucd());
|
|
log.println();
|
|
}
|
|
}
|
|
|
|
System.out.println("Check that union works");
|
|
|
|
UnicodeSet[] badChars = {unassigned, noncharacters, deprecated, format,
|
|
control, surrogate, privateUse, binary_IDS, trinary_IDS, whitespace, defaultIgnorable,
|
|
lineSeparators, paraSeparators, spaceSeparators};
|
|
UnicodeSet badCharUnion = new UnicodeSet();
|
|
for (int i = 0; i < badChars.length; ++i) {
|
|
badCharUnion.addAll(badChars[i]);
|
|
}
|
|
|
|
showDifferences(log, union, "(A1+B1+C1-C9)",
|
|
badCharUnion,
|
|
"(Whitespace+Deprecated+DefaultIgnorable+Separator+Other (cont/format/surr/priv/unass))", false);
|
|
|
|
System.out.println("Generating B2, B3");
|
|
|
|
log.println("Generating B2, B3");
|
|
Map B2 = new TreeMap();
|
|
Map B3 = new TreeMap();
|
|
Integer tempInteger = null;
|
|
|
|
for (int i = 0; i < 0x10FFFF; ++i) {
|
|
int cat = Default.ucd().getCategory(i);
|
|
if (!Default.ucd().isAssigned(i)) continue;
|
|
//if (cat == Cc || cat == Cf || cat == Co || cat == Cn) continue; // we can skip these
|
|
//if (Default.ucd.hasComputableName(i)) continue;
|
|
tempInteger = null;
|
|
|
|
String original = UTF16.valueOf(i);
|
|
String caseFold = Default.ucd().getCase(i, FULL, FOLD);
|
|
if (!original.equals(caseFold)) {
|
|
tempInteger = new Integer(i);
|
|
B2.put(tempInteger, caseFold);
|
|
B3.put(tempInteger, caseFold);
|
|
}
|
|
|
|
String b = Default.nfkc().normalize(caseFold);
|
|
String c = Default.nfkc().normalize(Default.ucd().getCase(b, FULL, FOLD));
|
|
|
|
if (!c.equals(b)) {
|
|
if (tempInteger != null) {
|
|
if (DEBUG) {
|
|
log.println("Possible Conflict");
|
|
log.println(" " + Default.ucd().getCodeAndName(i));
|
|
log.println(" => " + Default.ucd().getCodeAndName(caseFold));
|
|
log.println(" => " + Default.ucd().getCodeAndName(c));
|
|
}
|
|
} else {
|
|
tempInteger = new Integer(i);
|
|
if (DEBUG) {
|
|
log.println(" " + Default.ucd().getCodeAndName(i));
|
|
log.println(" => " + Default.ucd().getCodeAndName(c));
|
|
}
|
|
}
|
|
if (DEBUG) log.println();
|
|
B2.put(tempInteger, c);
|
|
}
|
|
}
|
|
|
|
|
|
// PRINTOUT
|
|
|
|
printIDN_Table(log, "A.1", "Unassigned code points in Unicode " + Default.ucd().getVersion(), A1);
|
|
printIDN_Table(log, "B.1", "Commonly mapped to nothing", B1);
|
|
|
|
printIDN_Map(log, "B.2", "Mapping for lowercase used with NFKC", B2, B3);
|
|
|
|
printIDN_Map(log, "B.3", "Mapping for lowercase used with no normalization", B3, B2);
|
|
|
|
printIDN_Table(log, "C.1", "Space characters", C1);
|
|
printIDN_Table(log, "C.2", "Control characters", C2);
|
|
printIDN_Table(log, "C.3", "Private use", C3);
|
|
printIDN_Table(log, "C.4", "Non-character code points", C4);
|
|
printIDN_Table(log, "C.5", "Surrogate codes", C5);
|
|
printIDN_Table(log, "C.6", "Inappropriate for plain text", C6);
|
|
printIDN_Table(log, "C.7", "Inappropriate for canonical representation", C7);
|
|
printIDN_Table(log, "C.8", "Change display properties (or deprecated)", C8);
|
|
printIDN_Table(log, "C.9", "Tagging characters", C9);
|
|
|
|
System.out.println("Done");
|
|
log.close();
|
|
}
|
|
|
|
public static void printIDN_Map(PrintWriter log, String tableNumber, String description, Map map, Map other) {
|
|
System.out.println(tableNumber+ " " + description);
|
|
log.println("");
|
|
log.println(tableNumber+ " " + description);
|
|
log.println("");
|
|
log.println("----- Start Table " + tableNumber + " -----");
|
|
Iterator it = map.keySet().iterator();
|
|
while(it.hasNext()) {
|
|
Integer key = (Integer) it.next();
|
|
String value = (String) map.get(key);
|
|
int cp = key.intValue();
|
|
log.println(Utility.hex(cp, 4) + "; " + Utility.hex(value, 4) + "; "
|
|
+ (!value.equals(other.get(key))? "***" : "")
|
|
+ Default.ucd().getName(cp));
|
|
}
|
|
log.println("----- End Table " + tableNumber + " -----");
|
|
}
|
|
|
|
public static void printIDN_Table(PrintWriter log, String tableNumber, String description, UnicodeSet set) {
|
|
System.out.println(tableNumber+ " " + description);
|
|
log.println("");
|
|
log.println(tableNumber+ " " + description);
|
|
log.println("");
|
|
log.println("----- Start Table " + tableNumber + " -----");
|
|
Utility.showSetNames(log, "", set, false, true, Default.ucd());
|
|
log.println("----- End Table " + tableNumber + " -----");
|
|
}
|
|
|
|
public static BitSet guessIDN() {
|
|
BitSet result = new BitSet();
|
|
for (int cp = 0; cp < 0x10FFFF; ++cp) {
|
|
int cat = Default.ucd().getCategory(cp);
|
|
// 5.1 Currently-prohibited ASCII characters
|
|
|
|
if (cp < 0x80 && cp != '-' && !(cat == Lu || cat == Ll || cat == Nd)) result.set(cp);
|
|
|
|
// 5.2 Space characters
|
|
|
|
if (cat == Zs) result.set(cp);
|
|
|
|
// 5.3 Control characters
|
|
if (cat == Cc || cat == Zp || cat == Zl) result.set(cp);
|
|
|
|
// exclude those reserved for Cf
|
|
/*if (0x2060 <= cp && cp <= 0x206F) result.set(cp);
|
|
if (0xFFF0 <= cp && cp <= 0xFFFC) result.set(cp);
|
|
if (0xE0000 <= cp && cp <= 0xE0FFF) result.set(cp);
|
|
*/
|
|
|
|
// 5.4 Private use and replacement characters
|
|
|
|
if (cat == Co) result.set(cp);
|
|
if (cp == 0xFFFD) result.set(cp);
|
|
|
|
// 5.5 Non-character code points
|
|
if (Default.ucd().getBinaryProperty(cp, Noncharacter_Code_Point)) result.set(cp);
|
|
|
|
// 5.6 Surrogate codes
|
|
if (cat == Cs) result.set(cp);
|
|
|
|
// 5.7 Inappropriate for plain text
|
|
|
|
if (cat == Cf) result.set(cp);
|
|
if (cp == 0xFFFC) result.set(cp);
|
|
|
|
// 5.8 Inappropriate for domain names
|
|
|
|
if (isIDS(cp)) result.set(cp);
|
|
|
|
// 5.9 Change display properties
|
|
// Cf, checked above
|
|
|
|
// 5.10 Inappropriate characters from common input mechanisms
|
|
if (cp == 0x3002) result.set(cp);
|
|
|
|
// 5.11 Tagging characters
|
|
// Cf, checked above
|
|
}
|
|
return result;
|
|
}
|
|
|
|
static boolean isIDS(int cp) { return 0x2FF0 <= cp && cp <= 0x2FFB; }
|
|
|
|
|
|
/*
|
|
5.1 Currently-prohibited ASCII characters
|
|
|
|
Some of the ASCII characters that are currently prohibited in host names
|
|
by [STD13] are also used in protocol elements such as URIs [URI]. The other
|
|
characters in the range U+0000 to U+007F that are not currently allowed
|
|
are also prohibited in host name parts to reserve them for future use in
|
|
protocol elements.
|
|
|
|
0000-002C; [ASCII CONTROL CHARACTERS and SPACE through ,]
|
|
002E-002F; [ASCII . through /]
|
|
003A-0040; [ASCII : through @]
|
|
005B-0060; [ASCII [ through `]
|
|
007B-007F; [ASCII { through DEL]
|
|
|
|
5.2 Space characters
|
|
|
|
Space characters would make visual transcription of URLs nearly
|
|
impossible and could lead to user entry errors in many ways.
|
|
|
|
0020; SPACE
|
|
00A0; NO-BREAK SPACE
|
|
1680; OGHAM SPACE MARK
|
|
2000; EN QUAD
|
|
2001; EM QUAD
|
|
2002; EN SPACE
|
|
2003; EM SPACE
|
|
2004; THREE-PER-EM SPACE
|
|
2005; FOUR-PER-EM SPACE
|
|
2006; SIX-PER-EM SPACE
|
|
2007; FIGURE SPACE
|
|
2008; PUNCTUATION SPACE
|
|
2009; THIN SPACE
|
|
200A; HAIR SPACE
|
|
202F; NARROW NO-BREAK SPACE
|
|
3000; IDEOGRAPHIC SPACE
|
|
|
|
5.3 Control characters
|
|
|
|
Control characters cannot be seen and can cause unpredictable results
|
|
when displayed.
|
|
|
|
0000-001F; [CONTROL CHARACTERS]
|
|
007F; DELETE
|
|
0080-009F; [CONTROL CHARACTERS]
|
|
2028; LINE SEPARATOR
|
|
2029; PARAGRAPH SEPARATOR
|
|
206A-206F; [CONTROL CHARACTERS]
|
|
FFF9-FFFC; [CONTROL CHARACTERS]
|
|
1D173-1D17A; [MUSICAL CONTROL CHARACTERS]
|
|
|
|
5.4 Private use and replacement characters
|
|
|
|
Because private-use characters do not have defined meanings, they are
|
|
prohibited. The private-use characters are:
|
|
|
|
E000-F8FF; [PRIVATE USE, PLANE 0]
|
|
F0000-FFFFD; [PRIVATE USE, PLANE 15]
|
|
100000-10FFFD; [PRIVATE USE, PLANE 16]
|
|
|
|
The replacement character (U+FFFD) has no known semantic definition in a
|
|
name, and is often displayed by renderers to indicate "there would be
|
|
some character here, but it cannot be rendered". For example, on a
|
|
computer with no Asian fonts, a name with three ideographs might be
|
|
rendered with three replacement characters.
|
|
|
|
FFFD; REPLACEMENT CHARACTER
|
|
|
|
5.5 Non-character code points
|
|
|
|
Non-character code points are code points that have been allocated in
|
|
ISO/IEC 10646 but are not characters. Because they are already assigned,
|
|
they are guaranteed not to later change into characters.
|
|
|
|
FDD0-FDEF; [NONCHARACTER CODE POINTS]
|
|
FFFE-FFFF; [NONCHARACTER CODE POINTS]
|
|
1FFFE-1FFFF; [NONCHARACTER CODE POINTS]
|
|
2FFFE-2FFFF; [NONCHARACTER CODE POINTS]
|
|
3FFFE-3FFFF; [NONCHARACTER CODE POINTS]
|
|
4FFFE-4FFFF; [NONCHARACTER CODE POINTS]
|
|
5FFFE-5FFFF; [NONCHARACTER CODE POINTS]
|
|
6FFFE-6FFFF; [NONCHARACTER CODE POINTS]
|
|
7FFFE-7FFFF; [NONCHARACTER CODE POINTS]
|
|
8FFFE-8FFFF; [NONCHARACTER CODE POINTS]
|
|
9FFFE-9FFFF; [NONCHARACTER CODE POINTS]
|
|
AFFFE-AFFFF; [NONCHARACTER CODE POINTS]
|
|
BFFFE-BFFFF; [NONCHARACTER CODE POINTS]
|
|
CFFFE-CFFFF; [NONCHARACTER CODE POINTS]
|
|
DFFFE-DFFFF; [NONCHARACTER CODE POINTS]
|
|
EFFFE-EFFFF; [NONCHARACTER CODE POINTS]
|
|
FFFFE-FFFFF; [NONCHARACTER CODE POINTS]
|
|
10FFFE-10FFFF; [NONCHARACTER CODE POINTS]
|
|
|
|
5.6 Surrogate codes
|
|
|
|
The following code points are permanently reserved for use as surrogate
|
|
code values in the UTF-16 encoding, will never be assigned to
|
|
characters, and are therefore prohibited:
|
|
|
|
D800-DFFF; [SURROGATE CODES]
|
|
|
|
5.7 Inappropriate for plain text
|
|
|
|
The following characters should not appear in regular text.
|
|
|
|
FFF9; INTERLINEAR ANNOTATION ANCHOR
|
|
FFFA; INTERLINEAR ANNOTATION SEPARATOR
|
|
FFFB; INTERLINEAR ANNOTATION TERMINATOR
|
|
FFFC; OBJECT REPLACEMENT CHARACTER
|
|
|
|
5.8 Inappropriate for domain names
|
|
|
|
The ideographic description characters allow different sequences of
|
|
characters to be rendered the same way, which makes them inappropriate
|
|
for host names that must have a single canonical representation.
|
|
|
|
2FF0-2FFB; [IDEOGRAPHIC DESCRIPTION CHARACTERS]
|
|
|
|
5.9 Change display properties
|
|
|
|
The following characters, some of which are deprecated in ISO/IEC 10646,
|
|
can cause changes in display or the order in which characters appear
|
|
when rendered.
|
|
|
|
200E; LEFT-TO-RIGHT MARK
|
|
200F; RIGHT-TO-LEFT MARK
|
|
202A; LEFT-TO-RIGHT EMBEDDING
|
|
202B; RIGHT-TO-LEFT EMBEDDING
|
|
202C; POP DIRECTIONAL FORMATTING
|
|
202D; LEFT-TO-RIGHT OVERRIDE
|
|
202E; RIGHT-TO-LEFT OVERRIDE
|
|
206A; INHIBIT SYMMETRIC SWAPPING
|
|
206B; ACTIVATE SYMMETRIC SWAPPING
|
|
206C; INHIBIT ARABIC FORM SHAPING
|
|
206D; ACTIVATE ARABIC FORM SHAPING
|
|
206E; NATIONAL DIGIT SHAPES
|
|
206F; NOMINAL DIGIT SHAPES
|
|
|
|
5.10 Inappropriate characters from common input mechanisms
|
|
|
|
U+3002 is used as if it were U+002E in many input mechanisms,
|
|
particularly in Asia. This prohibition allows input mechanisms to safely
|
|
map U+3002 to U+002E before doing nameprep without worrying about
|
|
preventing users from accessing legitimate host name parts.
|
|
|
|
3002; IDEOGRAPHIC FULL STOP
|
|
|
|
5.11 Tagging characters
|
|
|
|
The following characters are used for tagging text and are invisible.
|
|
|
|
E0001; LANGUAGE TAG
|
|
E0020-E007F; [TAGGING CHARACTERS]
|
|
*/
|
|
|
|
|
|
public static int verifyUTFMap(BitSet mappedOut) throws IOException {
|
|
int errorCount = 0;
|
|
BufferedReader input = new BufferedReader(new FileReader(IDN_DIR + "IDN-Mapping.txt"),32*1024);
|
|
String line = "";
|
|
Map idnFold = new TreeMap();
|
|
Map idnWhy = new HashMap();
|
|
try {
|
|
String[] parts = new String[20];
|
|
for (int lineNumber = 1; ; ++lineNumber) {
|
|
line = input.readLine();
|
|
if (line == null) break;
|
|
if ((lineNumber % 500) == 0) {
|
|
Utility.fixDot();
|
|
System.out.println("//" + lineNumber + ": '" + line + "'");
|
|
}
|
|
|
|
if (line.length() == 0) continue;
|
|
if (line.charAt(0) == '-') continue;
|
|
|
|
int count = Utility.split(line,';',parts);
|
|
if (count != 3) throw new ChainException("Incorrect # of fields in IDN folding, line = {0}",
|
|
new String[] {line});
|
|
|
|
String key = Utility.fromHex(parts[0]);
|
|
if (UTF32.length32(key) != 1) throw new ChainException("First IDN field not single character: " + line, null);
|
|
int cp = UTF32.char32At(key, 0);
|
|
if (!Default.ucd().isAssigned(cp) || Default.ucd().isPUA(cp)) throw new ChainException("IDN character unassigned or PUA: " + line, null);
|
|
String value = Utility.fromHex(parts[1]);
|
|
String reason = parts[2].trim();
|
|
|
|
if (reason.equals("Map out")) {
|
|
value = Utility.fromHex(parts[1]);
|
|
Utility.fixDot();
|
|
showError("Mapping Out: ", cp, "");
|
|
mappedOut.set(cp);
|
|
}
|
|
idnFold.put(key, value);
|
|
idnWhy.put(key, reason);
|
|
}
|
|
|
|
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
|
Utility.dot(cp);
|
|
if (!Default.ucd().isAssigned(cp) || Default.ucd().isPUA(cp)) continue;
|
|
if (mappedOut.get(cp)) continue;
|
|
|
|
String key = UTF32.valueOf32(cp);
|
|
String value = (String)idnFold.get(key);
|
|
if (value == null) value = key;
|
|
String reason = (String)idnWhy.get(key);
|
|
String ucdFold = Default.ucd().getCase(cp, FULL, FOLD, "I");
|
|
if (!ucdFold.equals(value)) {
|
|
String b = Default.nfkc().normalize(Default.ucd().getCase(cp, FULL, FOLD, "I"));
|
|
String c = Default.nfkc().normalize(Default.ucd().getCase(b, FULL, FOLD, "I"));
|
|
|
|
if (c.equals(value)) continue;
|
|
Utility.fixDot();
|
|
|
|
System.out.println("Mismatch: " + Default.ucd().getCodeAndName(cp));
|
|
System.out.println(" UCD Case Fold: <" + Default.ucd().getCodeAndName(ucdFold) + ">");
|
|
System.out.println(" IDN Map [" + reason + "]: <" + Default.ucd().getCodeAndName(value) + ">");
|
|
errorCount++;
|
|
}
|
|
}
|
|
} finally {
|
|
input.close();
|
|
}
|
|
return errorCount;
|
|
}
|
|
|
|
static BitSet getIDNList(String file) throws IOException {
|
|
BufferedReader input = new BufferedReader(new FileReader(IDN_DIR + file),32*1024);
|
|
BitSet result = new BitSet();
|
|
String line;
|
|
try {
|
|
String[] parts = new String[20];
|
|
for (int lineNumber = 1; ; ++lineNumber) {
|
|
line = input.readLine();
|
|
if (line == null) break;
|
|
if ((lineNumber % 500) == 0) {
|
|
Utility.fixDot();
|
|
System.out.println("//" + lineNumber + ": '" + line + "'");
|
|
}
|
|
|
|
int commentPos = line.indexOf(';');
|
|
if (commentPos >= 0) line = line.substring(0,commentPos);
|
|
line = line.trim();
|
|
if (line.length() == 0) continue;
|
|
if (line.charAt(0) == '-') continue;
|
|
|
|
int count = Utility.split(line,'-',parts);
|
|
if (count > 2) throw new ChainException("Incorrect # of fields in IDN list", null);
|
|
int start = Utility.codePointFromHex(parts[0]);
|
|
int end = count == 1 ? start : Utility.codePointFromHex(parts[1]);
|
|
|
|
for (int i = start; i <= end; ++i) {
|
|
result.set(i);
|
|
}
|
|
}
|
|
} finally {
|
|
input.close();
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
+ "\r\n# Generated from <2060..206F, FFF0..FFFB, E0000..E0FFF>"
|
|
+ "\r\n# + Other_Default_Ignorable_Code_Point + (Cf + Cc + Cs - White_Space)";
|
|
*/
|
|
|
|
public static void diffIgnorable () {
|
|
|
|
|
|
UnicodeSet control = UnifiedBinaryProperty.make(CATEGORY + Cf, Default.ucd()).getSet();
|
|
|
|
System.out.println("Cf");
|
|
Utility.showSetNames("", control, false, Default.ucd());
|
|
|
|
control.addAll(UnifiedBinaryProperty.make(CATEGORY + Cc, Default.ucd()).getSet());
|
|
|
|
System.out.println("Cf + Cc");
|
|
Utility.showSetNames("", control, false, Default.ucd());
|
|
|
|
control.addAll(UnifiedBinaryProperty.make(CATEGORY + Cs, Default.ucd()).getSet());
|
|
|
|
System.out.println("Cf + Cc + Cs");
|
|
Utility.showSetNames("", control, false, Default.ucd());
|
|
|
|
control.removeAll(UnifiedBinaryProperty.make(BINARY_PROPERTIES + White_space, Default.ucd()).getSet());
|
|
|
|
System.out.println("Cf + Cc + Cs - WhiteSpace");
|
|
Utility.showSetNames("", control, false, Default.ucd());
|
|
|
|
control.add(0x2060,0x206f).add(0xFFF0,0xFFFB).add(0xE0000,0xE0FFF);
|
|
|
|
System.out.println("(Cf + Cc + Cs - WhiteSpace) + ranges");
|
|
Utility.showSetNames("", control, false, Default.ucd());
|
|
|
|
UnicodeSet odicp = UnifiedBinaryProperty.make(BINARY_PROPERTIES + Other_Default_Ignorable_Code_Point, Default.ucd()).getSet();
|
|
|
|
odicp.removeAll(control);
|
|
|
|
System.out.println("Minimal Default Ignorable Code Points");
|
|
Utility.showSetNames("", odicp, true, Default.ucd());
|
|
}
|
|
|
|
|
|
public static void IdentifierTest() {
|
|
String x = normalize(UTF32.valueOf32(0x10300), 4) ;
|
|
getCategoryID(x);
|
|
|
|
/*
|
|
Changes Category: U+10300 OLD ITALIC LETTER A
|
|
nfx_cp: U+D800 <surrogate-D800>
|
|
isIdentifier(nfx_cp, true): false
|
|
cat(nfx_cp): Cs
|
|
isIdentifierStart(cp, true): true
|
|
cat(cp): Lo
|
|
*/
|
|
|
|
for (int j = 0; j < 5; ++j) {
|
|
System.out.println();
|
|
System.out.println("Testing Identifier Closure for " + NAMES[j]);
|
|
System.out.println();
|
|
for (int cp = 0; cp < 0x10FFFF; ++cp) {
|
|
Utility.dot(cp);
|
|
if (!Default.ucd().isAssigned(cp)) continue;
|
|
if (Default.ucd().isPUA(cp)) continue;
|
|
if (isNormalized(cp, j)) continue;
|
|
|
|
if (cp == 0xFDFB || cp == 0x0140) {
|
|
System.out.println("debug point");
|
|
}
|
|
|
|
boolean norm;
|
|
boolean plain;
|
|
|
|
String x_cp = 'x' + UTF32.valueOf32(cp);
|
|
String nfx_x_cp = normalize(x_cp, j);
|
|
if (true) {
|
|
throw new RuntimeException("Fix plain & norm, 4 instances!!");
|
|
}
|
|
// plain = Default.ucd.isIdentifier(x_cp, true);
|
|
//norm = Default.ucd.isIdentifier(nfx_x_cp, true);
|
|
if (plain & !norm) {
|
|
Utility.fixDot();
|
|
System.out.println("*Not Identifier: " + Default.ucd().getCodeAndName(cp));
|
|
System.out.println(" nfx_x_cp: " + Default.ucd().getCodeAndName(nfx_x_cp));
|
|
|
|
System.out.println(" isIdentifier(nfx_x_cp, true): " + norm);
|
|
System.out.println(" cat(nfx_x_cp): " + getCategoryID(nfx_x_cp));
|
|
|
|
System.out.println(" isIdentifier(x_cp, true): " + plain);
|
|
System.out.println(" cat(x_cp): " + getCategoryID(x_cp));
|
|
continue;
|
|
}
|
|
|
|
String nfx_cp = normalize(UTF32.valueOf32(cp), j);
|
|
// plain = Default.ucd.isIdentifierStart(cp, true);
|
|
// norm = Default.ucd.isIdentifier(nfx_cp, true);
|
|
if (plain & !norm) {
|
|
Utility.fixDot();
|
|
System.out.println(" Changes Category: " + Default.ucd().getCodeAndName(cp));
|
|
System.out.println(" nfx_cp: " + Default.ucd().getCodeAndName(nfx_cp));
|
|
|
|
System.out.println(" isIdentifier(nfx_cp, true): " + norm);
|
|
System.out.println(" cat(nfx_cp): " + getCategoryID(nfx_cp));
|
|
|
|
System.out.println(" isIdentifierStart(cp, true): " + plain);
|
|
System.out.println(" cat(cp): " + Default.ucd().getCategoryID(cp));
|
|
System.out.println();
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static String getCategoryID(String s) {
|
|
if (UTF32.length32(s) == 1) return Default.ucd().getCategoryID(UTF32.char32At(s, 0));
|
|
StringBuffer result = new StringBuffer();
|
|
int cp;
|
|
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
|
cp = UTF32.char32At(s, i);
|
|
if (i != 0) result.append(' ');
|
|
result.append(Default.ucd().getCategoryID(cp));
|
|
}
|
|
return result.toString();
|
|
}
|
|
|
|
static String normalize(String s, int j) {
|
|
if (j < 4) return Default.nf(j).normalize(s);
|
|
return Default.ucd().getCase(s, FULL, FOLD);
|
|
}
|
|
|
|
static boolean isNormalized(int cp, int j) {
|
|
if (j < 4) return !Default.nf(j).isNormalized(cp);
|
|
return false;
|
|
}
|
|
|
|
private static final String[] NAMES = {"Default.nfd", "NFC", "NFKD", "NFKC", "Fold"};
|
|
|
|
public static void NFTest() {
|
|
for (int j = 0; j < 4; ++j) {
|
|
Normalizer nfx = Default.nf(j);
|
|
System.out.println();
|
|
System.out.println("Testing isNormalized for " + NAMES[j]);
|
|
System.out.println();
|
|
for (int i = 0; i < 0x10FFFF; ++i) {
|
|
Utility.dot(i);
|
|
if (!Default.ucd().isAssigned(i)) continue;
|
|
if (Default.ucd().isPUA(i)) continue;
|
|
String s = nfx.normalize(i);
|
|
boolean differs = !s.equals(UTF32.valueOf32(i));
|
|
boolean call = !nfx.isNormalized(i);
|
|
if (differs != call) {
|
|
Utility.fixDot();
|
|
System.out.println("Problem: differs: " + differs
|
|
+ ", call: " + call + " " + Default.ucd().getCodeAndName(i));
|
|
}
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
static final int EXCEPTION_FLAG = 0x8000000;
|
|
|
|
public static void checkScripts() throws IOException {
|
|
|
|
boolean ok;
|
|
Map m = new TreeMap();
|
|
UnicodeSet exceptions = ScriptExceptions.getExceptions();
|
|
int maxScriptLen = 0;
|
|
UnicodeSet show = new UnicodeSet();
|
|
show.add(0x2071);
|
|
show.add(0x207F);
|
|
|
|
for (int i = 0; i < 0x10FFFF; ++i) {
|
|
if (!Default.ucd().isAssigned(i)) continue;
|
|
byte cat = Default.ucd().getCategory(i);
|
|
byte script = Default.ucd().getScript(i);
|
|
switch (cat) {
|
|
case Lo: case Lt: case Ll: case Lu: case Lm: case Mc: case Sk:
|
|
ok = script != INHERITED_SCRIPT && script != COMMON_SCRIPT;
|
|
break;
|
|
case Mn: case Me:
|
|
ok = script == INHERITED_SCRIPT;
|
|
break;
|
|
default:
|
|
ok = script == COMMON_SCRIPT;
|
|
break;
|
|
}
|
|
if (show.contains(i)) {
|
|
System.out.println(Default.ucd().getCodeAndName(i)
|
|
+ "; " + Default.ucd().getScriptID(i)
|
|
+ "; " + Default.ucd().getCategoryID(i)
|
|
);
|
|
}
|
|
if (!ok) {
|
|
if (cat == Ll || cat == Lt) cat = Lu;
|
|
int intKey = (cat << 8) + script;
|
|
if (exceptions.contains(i)) intKey |= EXCEPTION_FLAG;
|
|
Integer key = new Integer(intKey);
|
|
UnicodeSet us = (UnicodeSet) m.get(key);
|
|
if (us == null) {
|
|
us = new UnicodeSet();
|
|
m.put(key, us);
|
|
}
|
|
us.add(i);
|
|
int len = Default.ucd().getScriptID(i).length();
|
|
if (maxScriptLen < len) maxScriptLen = len;
|
|
}
|
|
}
|
|
|
|
PrintWriter log = Utility.openPrintWriter("CheckScriptsLog.txt", Utility.LATIN1_UNIX);
|
|
|
|
Iterator it = m.keySet().iterator();
|
|
while (it.hasNext()) {
|
|
Integer key = (Integer) it.next();
|
|
int intKey = key.intValue();
|
|
UnicodeSet badChars = (UnicodeSet) m.get(key);
|
|
int ranges = badChars.getRangeCount();
|
|
for (int kk = 0; kk < ranges; ++kk) {
|
|
int start = badChars.getRangeStart(kk);
|
|
int end = badChars.getRangeEnd(kk);
|
|
String code = Utility.hex(start) + (start != end ? ".." + Utility.hex(end) : "");
|
|
String scriptName = Default.ucd().getScriptID(start);
|
|
String title = "FAIL";
|
|
if ((intKey & EXCEPTION_FLAG) != 0) title = "EXCEPTION";
|
|
log.println(title + ": " + code + "; " + Utility.repeat(" ", 14 - code.length())
|
|
+ scriptName + Utility.repeat(" ", maxScriptLen-scriptName.length())
|
|
+ " # (" + LCgetCategoryID(start) + ") " + Default.ucd().getName(start)
|
|
+ (start != end ? ".." + Default.ucd().getName(end) : "")
|
|
);
|
|
}
|
|
log.println();
|
|
}
|
|
log.close();
|
|
}
|
|
|
|
static public String LCgetCategoryID(int cp) {
|
|
byte cat = Default.ucd().getCategory(cp);
|
|
if (cat == Lu || cat == Lt || cat == Ll) return "LC";
|
|
return Default.ucd().getCategoryID(cp);
|
|
}
|
|
|
|
static public void verifyNormalizationStability() {
|
|
|
|
verifyNormalizationStability2("3.1.0");
|
|
verifyNormalizationStability2("3.0.0");
|
|
}
|
|
|
|
static public void verifyNormalizationStability2(String version) {
|
|
|
|
// Default.nfd.normalizationDiffers(0x10300);
|
|
|
|
UCD older = UCD.make(version); // Default.ucd.getPreviousVersion();
|
|
|
|
Normalizer oldNFC = new Normalizer(Normalizer.NFC, older.getVersion());
|
|
Normalizer oldNFD = new Normalizer(Normalizer.NFD, older.getVersion());
|
|
Normalizer oldNFKC = new Normalizer(Normalizer.NFKC, older.getVersion());
|
|
Normalizer oldNFKD = new Normalizer(Normalizer.NFKD, older.getVersion());
|
|
|
|
System.out.println("Testing " + Default.nfd().getUCDVersion() + " against " + oldNFD.getUCDVersion());
|
|
|
|
for (int i = 0; i <= 0x10FFFF; ++i) {
|
|
Utility.dot(i);
|
|
if (!Default.ucd().isAssigned(i)) continue;
|
|
byte cat = Default.ucd().getCategory(i);
|
|
if (cat == Cs || cat == PRIVATE_USE) continue;
|
|
|
|
if (i == 0x5e) {
|
|
System.out.println("debug");
|
|
String test1 = Default.nfkd().normalize(i);
|
|
String test2 = oldNFKD.normalize(i);
|
|
System.out.println("Testing (new/old)" + Default.ucd().getCodeAndName(i));
|
|
System.out.println("\t" + Default.ucd().getCodeAndName(test1));
|
|
System.out.println("\t" + Default.ucd().getCodeAndName(test2));
|
|
}
|
|
|
|
if (older.isAssigned(i)) {
|
|
|
|
int newCan = Default.ucd().getCombiningClass(i);
|
|
int oldCan = older.getCombiningClass(i);
|
|
if (newCan != oldCan) {
|
|
System.out.println("FAILS CCC STABILITY: " + newCan + " != " + oldCan
|
|
+ "; " + Default.ucd().getCodeAndName(i));
|
|
}
|
|
|
|
verifyEquals(i, "NFD STABILITY (new/old)", Default.nfd().normalize(i), oldNFD.normalize(i));
|
|
verifyEquals(i, "NFC STABILITY (new/old)", Default.nfc().normalize(i), oldNFC.normalize(i));
|
|
verifyEquals(i, "NFKD STABILITY (new/old)", Default.nfkd().normalize(i), oldNFKD.normalize(i));
|
|
verifyEquals(i, "NFKC STABILITY (new/old)", Default.nfkc().normalize(i), oldNFKC.normalize(i));
|
|
|
|
} else {
|
|
// not in older version.
|
|
// (1) If there is a decomp, and it is composed of all OLD characters, then it must NOT compose
|
|
if (!Default.nfd().isNormalized(i)) {
|
|
String decomp = Default.nfd().normalize(i);
|
|
if (noneHaveCategory(decomp, Cn, older)) {
|
|
String recomp = Default.nfc().normalize(decomp);
|
|
if (recomp.equals(UTF16.valueOf(i))) {
|
|
Utility.fixDot();
|
|
System.out.println("FAILS COMP STABILITY: " + Default.ucd().getCodeAndName(i));
|
|
System.out.println("\t" + Default.ucd().getCodeAndName(decomp));
|
|
System.out.println("\t" + Default.ucd().getCodeAndName(recomp));
|
|
System.out.println();
|
|
throw new IllegalArgumentException("Comp stability");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
public static boolean noneHaveCategory(String s, byte cat, UCD ucd) {
|
|
int cp;
|
|
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
|
|
cp = UTF16.charAt(s, i);
|
|
byte cat2 = ucd.getCategory(i);
|
|
if (cat == cat2) return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
public static void verifyEquals(int cp, String message, String a, String b) {
|
|
if (!a.equals(b)) {
|
|
Utility.fixDot();
|
|
System.out.println("FAILS " + message + ": " + Default.ucd().getCodeAndName(cp));
|
|
System.out.println("\t" + Default.ucd().getCodeAndName(a));
|
|
System.out.println("\t" + Default.ucd().getCodeAndName(b));
|
|
System.out.println();
|
|
}
|
|
}
|
|
|
|
public static void checkAgainstUInfo() {
|
|
/*
|
|
Default.ucd = UCD.make(Default.Default.ucdVersion);
|
|
UData x = new UData();
|
|
x.fleshOut();
|
|
|
|
System.out.println(Default.ucd.toString(0x1E0A));
|
|
|
|
UInfo.init();
|
|
System.out.println("Cross-checking against old implementation");
|
|
System.out.println("Version: " + Default.ucd.getVersion() + ", " + new Date(Default.ucd.getDate()));
|
|
for (int i = 0; i <= 0xFFFF; ++i) {
|
|
Utility.dot(i);
|
|
|
|
if ((i & 0x0FFF) == 0) System.out.println("#" + Utility.hex(i));
|
|
try {
|
|
check(i, Default.ucd.getName(i), UInfo.getName((char)i), "Name");
|
|
check(i, Default.ucd.getCategory(i), UInfo.getCategory((char)i), UCD_Names.GC, "GeneralCategory");
|
|
check(i, Default.ucd.getCombiningClass(i), UInfo.getCanonicalClass((char)i), "CanonicalClass");
|
|
check(i, Default.ucd.getBidiClass(i), UInfo.getBidiClass((char)i), UCD_Names.BC, "BidiClass");
|
|
check(i, Default.ucd.getDecompositionMapping(i), UInfo.getDecomposition((char)i), "Decomposition");
|
|
check(i, Default.ucd.getDecompositionType(i), UInfo.getDecompositionType((char)i), UCD_Names.DT, "DecompositionType");
|
|
check(i, Default.ucd.getNumericValue(i), UInfo.getNumeric((char)i), "NumericValue");
|
|
check(i, Default.ucd.getNumericType(i), UInfo.getNumericType((char)i), UCD_Names.NT, "NumericType");
|
|
|
|
check(i, Default.ucd.getCase(i, SIMPLE, LOWER), UInfo.getLowercase((char)i), "SimpleLowercase");
|
|
check(i, Default.ucd.getCase(i, SIMPLE, UPPER), UInfo.getUppercase((char)i), "SimpleUppercase");
|
|
check(i, Default.ucd.getCase(i, SIMPLE, TITLE), UInfo.getTitlecase((char)i), "SimpleTitlecase");
|
|
//check(i, Default.ucd.getSimpleCaseFolding(i), UInfo.getSimpleCaseFolding((char)i));
|
|
|
|
if (Default.ucd.getSpecialCase(i).length() == 0) { // NORMAL
|
|
check(i, Default.ucd.getCase(i, FULL, LOWER), UInfo.toLowercase((char)i, ""), "FullLowercase");
|
|
check(i, Default.ucd.getCase(i, FULL, UPPER), UInfo.toUppercase((char)i, ""), "FullUppercase");
|
|
check(i, Default.ucd.getCase(i, FULL, TITLE), UInfo.toTitlecase((char)i, ""), "FullTitlecase");
|
|
} else { // SPECIAL
|
|
check(i, Default.ucd.getCase(i, SIMPLE, LOWER), UInfo.toLowercase((char)i, ""), "FullLowercase");
|
|
check(i, Default.ucd.getCase(i, SIMPLE, UPPER), UInfo.toUppercase((char)i, ""), "FullUppercase");
|
|
check(i, Default.ucd.getCase(i, SIMPLE, TITLE), UInfo.toTitlecase((char)i, ""), "FullTitlecase");
|
|
}
|
|
// check(i, Default.ucd.getFullCaseFolding(i), UInfo.getFullCaseFolding((char)i));
|
|
|
|
check(i, Default.ucd.getSpecialCase(i).toUpperCase(), UInfo.getCaseCondition((char)i).toUpperCase(), "SpecialCase");
|
|
check(i, Default.ucd.getLineBreak(i), UInfo.getLineBreakType((char)i), UCD_Names.LB, "LineBreak");
|
|
check(i, Default.ucd.getEastAsianWidth(i), UInfo.getEastAsianWidthType((char)i), UCD_Names.EA, "EastAsian");
|
|
|
|
int props = Default.ucd.getBinaryProperties(i);
|
|
check(i, (props>>BidiMirrored) & 1, UInfo.getMirrored((char)i), UCD_Names.YN_TABLE, "BidiMirroring");
|
|
check(i, (props>>CompositionExclusion) & 1, UInfo.isCompositionExcluded((char)i)?1:0, UCD_Names.YN_TABLE, "Comp-Exclusion");
|
|
|
|
} catch (Exception e) {
|
|
Utility.fixDot();
|
|
|
|
System.out.println("Error: " + Utility.hex(i) + " " + e.getClass().getName() + e.getMessage());
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
*/
|
|
}
|
|
|
|
|
|
public static void check(int cp, boolean x, boolean y, String[] names, String type) {
|
|
check(cp, x ? 1 : 0, y ? 1 : 0, names, type);
|
|
}
|
|
|
|
public static void check(int cp, int x, int y, String[] names, String type) {
|
|
if (x == y) return;
|
|
showLast(cp);
|
|
Utility.fixDot();
|
|
System.out.println(" " + type + ": "
|
|
+ Utility.getName(x, names) + " (" + x + ") " + " != "
|
|
+ Utility.getName(y, names) + " (" + y + ") ") ;
|
|
}
|
|
|
|
public static void check(int cp, int x, int y, String type) {
|
|
if (x == y) return;
|
|
showLast(cp);
|
|
Utility.fixDot();
|
|
System.out.println(" " + type + ": " + x + " != " + y) ;
|
|
}
|
|
|
|
public static void check(int cp, double x, double y, String type) {
|
|
if (!(x > y) && !(x < y)) return; // funny syntax to catch NaN
|
|
showLast(cp);
|
|
Utility.fixDot();
|
|
System.out.println(" " + type + ": " + x + " != " + y) ;
|
|
}
|
|
|
|
public static void check(int cp, String x, String y, String type) {
|
|
if (x != null && x.equals(y)) return;
|
|
if (x != null && y != null
|
|
&& x.length() > 0 && y.length() > 0
|
|
&& x.charAt(0) == '<' && y.charAt(0) == '<') {
|
|
if (x.startsWith("<unassigned") && y.equals("<reserved>")) return;
|
|
if (y.equals("<control>")) return;
|
|
if (x.startsWith("<surrogate") && y.indexOf("Surrogate") != -1) return;
|
|
if (x.startsWith("<private use") && y.startsWith("<Private Use")) return;
|
|
}
|
|
showLast(cp);
|
|
Utility.fixDot();
|
|
System.out.println(" " + type + ": " + Utility.quoteJavaString(x) + " != " + Utility.quoteJavaString(y));
|
|
}
|
|
|
|
|
|
static int lastShowed = -1;
|
|
static boolean showCanonicalDecomposition = false;
|
|
|
|
static void showLast(int cp) {
|
|
if (lastShowed != cp) {
|
|
Utility.fixDot();
|
|
System.out.println();
|
|
String s = Default.ucd().getDecompositionMapping(cp);
|
|
System.out.print(Default.ucd().getCodeAndName(cp));
|
|
if (showCanonicalDecomposition && !s.equals(UTF32.valueOf32(cp))) {
|
|
System.out.print(" => " + Default.ucd().getCodeAndName(s));
|
|
}
|
|
System.out.println();
|
|
lastShowed = cp;
|
|
}
|
|
}
|
|
|
|
public static void test1() {
|
|
|
|
|
|
for (int i = 0x19; i < 0x10FFFF; ++i) {
|
|
|
|
System.out.println(Utility.hex(i) + " " + Utility.quoteJavaString(Default.ucd().getName(i)));
|
|
|
|
System.out.print(" "
|
|
+ ", gc=" + Default.ucd().getCategoryID(i)
|
|
+ ", bc=" + Default.ucd().getBidiClassID(i)
|
|
+ ", cc=" + Default.ucd().getCombiningClassID(i)
|
|
+ ", ea=" + Default.ucd().getEastAsianWidthID(i)
|
|
+ ", lb=" + Default.ucd().getLineBreakID(i)
|
|
+ ", dt=" + Default.ucd().getDecompositionTypeID(i)
|
|
+ ", nt=" + Default.ucd().getNumericTypeID(i)
|
|
+ ", nv=" + Default.ucd().getNumericValue(i)
|
|
);
|
|
for (int j = 0; j < UCD_Types.LIMIT_BINARY_PROPERTIES; ++j) {
|
|
if (Default.ucd().getBinaryProperty(i,j)) System.out.print(", " + UCD_Names.BP[j]);
|
|
}
|
|
System.out.println();
|
|
|
|
System.out.println(" "
|
|
+ ", dm=" + Utility.quoteJavaString(Default.ucd().getDecompositionMapping(i))
|
|
+ ", slc=" + Utility.quoteJavaString(Default.ucd().getCase(i, SIMPLE, LOWER))
|
|
+ ", stc=" + Utility.quoteJavaString(Default.ucd().getCase(i, SIMPLE, TITLE))
|
|
+ ", suc=" + Utility.quoteJavaString(Default.ucd().getCase(i, SIMPLE, UPPER))
|
|
+ ", flc=" + Utility.quoteJavaString(Default.ucd().getCase(i, FULL, LOWER))
|
|
+ ", ftc=" + Utility.quoteJavaString(Default.ucd().getCase(i, FULL, TITLE))
|
|
+ ", fuc=" + Utility.quoteJavaString(Default.ucd().getCase(i, FULL, UPPER))
|
|
+ ", sc=" + Utility.quoteJavaString(Default.ucd().getSpecialCase(i))
|
|
);
|
|
|
|
if (i > 0x180) i = 3 * i / 2;
|
|
}
|
|
}
|
|
|
|
static void checkCanonicalProperties() {
|
|
|
|
System.out.println(Default.ucd().toString(0x1E0A));
|
|
|
|
System.out.println("Cross-checking canonical equivalence");
|
|
System.out.println("Version: " + Default.ucd().getVersion() + ", " + new Date(Default.ucd().getDate()));
|
|
showCanonicalDecomposition = true;
|
|
for (int q = 1; q < 2; ++q)
|
|
for (int i = 0; i <= 0x10FFFF; ++i) {
|
|
Utility.dot(i);
|
|
if (i == 0x0387) {
|
|
System.out.println("debug?");
|
|
}
|
|
byte type = Default.ucd().getDecompositionType(i);
|
|
if (type != CANONICAL) continue;
|
|
|
|
String s = Default.ucd().getDecompositionMapping(i);
|
|
int slen = UTF32.length32(s);
|
|
int j = UTF32.char32At(s, 0);
|
|
try {
|
|
if (q == 0) {
|
|
check(i, Default.ucd().getCategory(i), Default.ucd().getCategory(j), UCD_Names.GENERAL_CATEGORY, "GeneralCategory");
|
|
check(i, Default.ucd().getCombiningClass(i), Default.ucd().getCombiningClass(j), "CanonicalClass");
|
|
check(i, Default.ucd().getBidiClass(i), Default.ucd().getBidiClass(j), UCD_Names.BIDI_CLASS, "BidiClass");
|
|
check(i, Default.ucd().getNumericValue(i), Default.ucd().getNumericValue(j), "NumericValue");
|
|
check(i, Default.ucd().getNumericType(i), Default.ucd().getNumericType(j), UCD_Names.LONG_NUMERIC_TYPE, "NumericType");
|
|
|
|
if (false) {
|
|
for (byte k = LOWER; k < LIMIT_CASE; ++k) {
|
|
check(i, Default.ucd().getCase(i, SIMPLE, k), Default.ucd().getCase(j, SIMPLE, k), "Simple("+k+")");
|
|
check(i, Default.ucd().getCase(i, FULL, k), Default.ucd().getCase(j, FULL, k), "Full("+k+")");
|
|
}
|
|
}
|
|
|
|
if (slen == 1) check(i, Default.ucd().getSpecialCase(i), Default.ucd().getSpecialCase(j), "SpecialCase");
|
|
|
|
for (byte k = 0; k < LIMIT_BINARY_PROPERTIES; ++k) {
|
|
if (k == Hex_Digit) continue;
|
|
if (k == Radical) continue;
|
|
if (k == UnifiedIdeograph) continue;
|
|
if (k == CompositionExclusion) continue;
|
|
check(i, Default.ucd().getBinaryProperty(i, k), Default.ucd().getBinaryProperty(j, k), UCD_Names.YN_TABLE, Default.ucd().getBinaryPropertiesID_fromIndex(k));
|
|
}
|
|
} else {
|
|
//check(i, Default.ucd.getLineBreak(i), Default.ucd.getLineBreak(j), UCD_Names.LB, "LineBreak");
|
|
//check(i, Default.ucd.getEastAsianWidth(i), Default.ucd.getEastAsianWidth(j), UCD_Names.EA, "EastAsian");
|
|
}
|
|
|
|
} catch (Exception e) {
|
|
System.out.println("Error: " + Utility.hex(i) + " " + e.getClass().getName() + e.getMessage());
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
}
|
|
|
|
static void checkSpeed() {
|
|
int count = 1000000;
|
|
int sum = 0;
|
|
long start, end;
|
|
|
|
java.text.NumberFormat nf = java.text.NumberFormat.getPercentInstance();
|
|
|
|
start = System.currentTimeMillis();
|
|
for (int i = count; i >= 0; --i) {
|
|
sum += dummy0(i).length();
|
|
}
|
|
end = System.currentTimeMillis();
|
|
double base = end - start;
|
|
|
|
System.out.println("unsynchronized static char[]: " + nf.format((end - start)/base));
|
|
|
|
start = System.currentTimeMillis();
|
|
for (int i = count; i >= 0; --i) {
|
|
sum += dummy2(i).length();
|
|
}
|
|
end = System.currentTimeMillis();
|
|
System.out.println("synchronized static char[]: " + nf.format((end - start)/base));
|
|
|
|
start = System.currentTimeMillis();
|
|
for (int i = count; i >= 0; --i) {
|
|
sum += dummy1(i).length();
|
|
}
|
|
end = System.currentTimeMillis();
|
|
System.out.println("char[] each time: " + nf.format((end - start)/base));
|
|
|
|
start = System.currentTimeMillis();
|
|
for (int i = count; i >= 0; --i) {
|
|
sum += dummy3(i).length();
|
|
}
|
|
end = System.currentTimeMillis();
|
|
System.out.println("two valueofs: " + nf.format((end - start)/base));
|
|
|
|
System.out.println(sum);
|
|
}
|
|
|
|
static String dummy1(int a) {
|
|
char[] temp = new char[2];
|
|
temp[0] = (char)(a >>> 16);
|
|
temp[1] = (char)a;
|
|
return new String(temp);
|
|
}
|
|
|
|
static char[] temp2 = new char[2];
|
|
|
|
static String dummy2(int a) {
|
|
synchronized (temp2) {
|
|
temp2[0] = (char)(a >>> 16);
|
|
temp2[1] = (char)a;
|
|
return new String(temp2);
|
|
}
|
|
}
|
|
|
|
static String dummy0(int a) {
|
|
temp2[0] = (char)(a >>> 16);
|
|
temp2[1] = (char)a;
|
|
return new String(temp2);
|
|
}
|
|
|
|
static String dummy3(int a) {
|
|
return String.valueOf((char)(a >>> 16)) + (char)a;
|
|
}
|
|
|
|
|
|
}
|