scuffed-code/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java
Mark Davis 16682de25d minor mods
X-SVN-Rev: 5703
2001-09-06 01:30:31 +00:00

553 lines
21 KiB
Java

/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
* $Date: 2001/09/06 01:29:48 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
import com.ibm.text.*;
public class DerivedProperty implements UCD_Types {
/** UCD instance queried by the property implementations of this class. */
UCD ucdData;

// Property numbers accepted by the public accessors of this class.
// Each value is the slot of the matching implementation in the dprops table.

// Basic character classes.
static final int PropMath = 0;
static final int PropAlphabetic = 1;
static final int PropLowercase = 2;
static final int PropUppercase = 3;

// Identifier properties (plain and "Mod_" variants).
static final int ID_Start = 4;
static final int ID_Continue_NO_Cf = 5;
static final int Mod_ID_Start = 6;
static final int Mod_ID_Continue_NO_Cf = 7;

// "Possible_Missing_*" case properties; the three values must stay
// consecutive and in the order of CaseNames.
static final int Missing_Uppercase = 8;
static final int Missing_Lowercase = 9;
static final int Missing_Mixedcase = 10;

static final int FC_NFKC_Closure = 11;
static final int FullCompExclusion = 12;
static final int FullCompInclusion = 13;

// Normalization-related groups. Within each group the values must stay
// consecutive and in the order NFD, NFC, NFKD, NFKC, because the
// implementations subtract the first value of the group to index NAME and nf.
static final int QuickNFD = 14;
static final int QuickNFC = 15;
static final int QuickNFKD = 16;
static final int QuickNFKC = 17;

static final int ExpandsOnNFD = 18;
static final int ExpandsOnNFC = 19;
static final int ExpandsOnNFKD = 20;
static final int ExpandsOnNFKC = 21;

static final int GenNFD = 22;
static final int GenNFC = 23;
static final int GenNFKD = 24;
static final int GenNFKC = 25;

static final int DefaultIgnorable = 26;
static final int GraphemeExtend = 27;
static final int GraphemeBase = 28;

/** One past the highest property number; used as the loop bound in test(). */
static final int LIMIT = 29;
/**
 * Creates a derived-property calculator backed by the given UCD.
 *
 * @param ucd source of character data; the reference is stored as-is
 */
public DerivedProperty(UCD ucd) {
    this.ucdData = ucd;
}
/**
 * Returns the header text of a derived property.
 *
 * @param propNumber property number (slot in the property table)
 * @return the header, or "Unimplemented!!" when no implementation occupies the slot
 */
public String getHeader(int propNumber) {
    final DProp impl = dprops[propNumber];
    if (impl == null) {
        return "Unimplemented!!";
    }
    return impl.getHeader();
}
/**
 * Returns the name of a derived property.
 *
 * @param propNumber property number (slot in the property table)
 * @return the name, or "Unimplemented!!" when no implementation occupies the slot
 */
public String getName(int propNumber) {
    final DProp impl = dprops[propNumber];
    if (impl == null) {
        return "Unimplemented!!";
    }
    return impl.getName();
}
/**
 * Returns the value of a derived property for one code point.
 *
 * @param cp code point to look up
 * @param propNumber property number (slot in the property table)
 * @return the value produced by the property implementation, or
 *         "Unimplemented!!" when no implementation occupies the slot
 */
public String getProperty(int cp, int propNumber) {
    final DProp impl = dprops[propNumber];
    if (impl == null) {
        return "Unimplemented!!";
    }
    return impl.getProperty(cp);
}
/**
 * Tells whether an implementation is registered for a property number.
 *
 * @param propNumber property number (slot in the property table)
 * @return true when the slot is non-null
 */
public boolean isDefined(int propNumber) {
    final DProp impl = dprops[propNumber];
    return impl != null;
}
/**
 * Tells whether a code point has the given derived property.
 * Unlike getProperty, this does not guard against an empty slot: an
 * unregistered property number fails with a NullPointerException, so
 * callers can check isDefined first.
 *
 * @param cp code point to test
 * @param propNumber property number (slot in the property table)
 * @return the answer of the registered implementation
 */
public boolean hasProperty(int cp, int propNumber) {
    final DProp impl = dprops[propNumber];
    return impl.hasProperty(cp);
}
/**
 * Reports the propertyVaries() flag of the registered implementation.
 * The base implementation answers false; the implementations whose
 * getProperty builds a per-character string override it to answer true.
 * An unregistered property number fails with a NullPointerException.
 *
 * @param propNumber property number (slot in the property table)
 * @return the implementation's propertyVaries() result
 */
public boolean propertyVaries(int propNumber) {
    final DProp impl = dprops[propNumber];
    return impl.propertyVaries();
}
/*
public String getProperty(int cp, int propNumber) {
return dprops[propNumber].getProperty(int cp);
}
*/
private DProp[] dprops = new DProp[50];
private Normalizer[] nf = new Normalizer[4];
private Normalizer nfd, nfc, nfkd, nfkc;
static final String[] NAME = {"NFD", "NFC", "NFKD", "NFKC"};
static final String[] CaseNames = {
"Uppercase",
"Lowercase",
"Mixedcase"};
/**
 * Base class of all derived-property implementations.
 * Subclasses set name and header and supply hasProperty; the default
 * getProperty yields the property name for members and "" otherwise.
 */
private abstract class DProp {
    String name;
    String header;

    String getName() {
        return name;
    }

    String getHeader() {
        return header;
    }

    /** Tells whether the code point belongs to this property. */
    abstract boolean hasProperty(int cp);

    /** Default answer is false; overridden where getProperty is overridden. */
    public boolean propertyVaries() {
        return false;
    }

    /** Returns the property name when cp has the property, else "". */
    public String getProperty(int cp) {
        if (hasProperty(cp)) {
            return name;
        }
        return "";
    }
}
/**
 * "Expands_On_*" properties: characters whose normalization under one
 * normalization form is not exactly one code point long.
 */
class ExDProp extends DProp {
    Normalizer nfx;

    /**
     * @param i property number in the range ExpandsOnNFD..ExpandsOnNFKC;
     *          its offset from ExpandsOnNFD selects the normalizer and form name
     */
    ExDProp(int i) {
        final int form = i - ExpandsOnNFD;
        nfx = nf[form];
        name = "Expands_On_" + NAME[form];
        header = "# Derived Property: " + name
            + "\r\n# Generated according to UAX #15."
            + "\r\n# Characters whose normalized length is not one."
            + "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
            + "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
    }

    boolean hasProperty(int cp) {
        // Characters without any decomposition are rejected before normalizing.
        if (ucdData.getDecompositionType(cp) == NONE) {
            return false;
        }
        final String normalized = nfx.normalize(cp);
        return UTF16.countCodePoint(normalized) != 1;
    }
}
/**
 * Generated normalization mappings: for a character with a decomposition,
 * the normalized form under nfx, reported only when it differs from a
 * comparison string (the character itself, or its NFC/NFD form).
 */
class GenDProp extends DProp {
    Normalizer nfx;
    // Normalizer producing the comparison string; null means "compare with the character itself".
    Normalizer nfComp = null;

    /**
     * @param i property number in the range GenNFD..GenNFKC;
     *          its offset from GenNFD selects the normalizer and form name
     */
    GenDProp(int i) {
        final String formName = NAME[i - GenNFD];
        nfx = nf[i - GenNFD];
        name = formName;
        String compName = "the character itself";
        switch (i) {
            case GenNFKC:
            case GenNFD:
                name += "-NFC";
                nfComp = nfc;
                compName = "NFC for the character";
                break;
            case GenNFKD:
                name += "-NFD";
                nfComp = nfd;
                compName = "NFD for the character";
                break;
            default:
                break;
        }
        header = "# Derived Property: " + name
            + "\r\n# Normalized form " + formName + ", where DIFFERENT from " + compName + "."
            + "\r\n# HANGUL SYLLABLES are algorithmically decomposed, and not listed explicitly."
            + "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
            + "\r\n# It is NOT sufficient to replace characters one-by-one with these results!";
    }

    public boolean propertyVaries() {
        return true;
    }

    // One-entry cache: the last code point queried and the value computed for it.
    int cacheCp = 0;
    String cacheStr = "";

    /**
     * Returns name + "; " + hex of the normalized form, padded with spaces
     * up to 14 characters of hex, or "" when the character has no
     * decomposition or its normalized form equals the comparison string.
     */
    public String getProperty(int cp) {
        if (cp == cacheCp) {
            return cacheStr;
        }
        cacheCp = cp;
        cacheStr = "";
        if (ucdData.getDecompositionType(cp) == NONE) {
            return cacheStr;
        }
        final String source = UTF32.valueOf32(cp);
        final String baseline = (nfComp == null) ? source : nfComp.normalize(source);
        final String normalized = nfx.normalize(source);
        if (!baseline.equals(normalized)) {
            final String hex = Utility.hex(normalized);
            final String padding = Utility.repeat(" ", 14 - hex.length());
            cacheStr = name + "; " + hex + padding;
        }
        return cacheStr;
    }

    boolean hasProperty(int cp) {
        return getProperty(cp).length() != 0;
    }
}
/**
 * "Possible_Missing_*" properties: characters that do not already count as
 * the target case but whose decomposition-based case (see getDecompCat)
 * is the target case.
 */
class CaseDProp extends DProp {
    // Target category: Lu, Ll or Lt for the Uppercase, Lowercase and Mixedcase variants.
    byte val;

    /**
     * @param i property number in the range Missing_Uppercase..Missing_Mixedcase;
     *          its offset from Missing_Uppercase indexes CaseNames
     */
    CaseDProp (int i) {
        val = (i == Missing_Uppercase ? Lu : i == Missing_Lowercase ? Ll : Lt);
        name = "Possible_Missing_" + CaseNames[i-Missing_Uppercase];
        header = "# Derived Property: " + name
            + "\r\n# Generated from: NFKD has >0 " + CaseNames[i-Missing_Uppercase] + ", no other cases";
    }

    boolean hasProperty(int cp) {
        byte cat = ucdData.getCategory(cp);
        // A character that already has the target case is not "missing" it.
        if (cat == val) return false;
        // The Other_* property must match the target case. The previous code
        // tested Other_Uppercase for the Lowercase variant as well, which let
        // every Other_Lowercase character through as Possible_Missing_Lowercase.
        if (val == Lu && ucdData.getBinaryProperty(cp, Other_Uppercase)) return false;
        if (val == Ll && ucdData.getBinaryProperty(cp, Other_Lowercase)) return false;
        return getDecompCat(cp) == val;
    }
}
/**
 * "*_QuickCheck" properties: per character, form + "_NO" when the
 * normalizer reports that normalization differs, form + "_MAYBE" when
 * it reports the character as trailing, and "" otherwise.
 */
class QuickDProp extends DProp {
    String NO;
    String MAYBE;
    Normalizer nfx;

    /**
     * @param i property number in the range QuickNFD..QuickNFKC;
     *          its offset from QuickNFD selects the normalizer and form name
     */
    QuickDProp(int i) {
        nfx = nf[i - QuickNFD];
        final String form = NAME[i - QuickNFD];
        NO = form + "_NO";
        MAYBE = form + "_MAYBE";
        name = form + "_QuickCheck";

        // Only the composing forms get the extra remark in the header.
        String composeNote = "";
        if (i == QuickNFC || i == QuickNFKC) {
            composeNote = " (and characters that may compose with previous ones)";
        }
        header = "# Derived Property: " + name
            + "\r\n# Generated from computing decomposibles"
            + composeNote;
    }

    public boolean propertyVaries() {
        return true;
    }

    public String getProperty(int cp) {
        if (nfx.normalizationDiffers(cp)) {
            return NO;
        }
        return nfx.isTrailing(cp) ? MAYBE : "";
    }

    boolean hasProperty(int cp) {
        return getProperty(cp).length() != 0;
    }
}
{
// Create the four normalizers first: the ExDProp and GenDProp constructors
// below read them from nf, nfc and nfd.
nfd = new Normalizer(Normalizer.NFD);
nf[0] = nfd;
nfc = new Normalizer(Normalizer.NFC);
nf[1] = nfc;
nfkd = new Normalizer(Normalizer.NFKD);
nf[2] = nfkd;
nfkc = new Normalizer(Normalizer.NFKC);
nf[3] = nfkc;

// Register the Expands_On_* properties.
for (int prop = ExpandsOnNFD; prop <= ExpandsOnNFKC; ++prop) {
    dprops[prop] = new ExDProp(prop);
}

// Register the generated normalization mappings.
for (int prop = GenNFD; prop <= GenNFKC; ++prop) {
    dprops[prop] = new GenDProp(prop);
}
dprops[ID_Start] = new DProp() {
{
name = "ID_Start";
header = "# Derived Property: " + name
+ "\r\n# Characters that can start an identifier."
+ "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl";
}
boolean hasProperty(int cp) {
return ucdData.isIdentifierStart(cp, false);
}
};
dprops[ID_Continue_NO_Cf] = new DProp() {
{
name = "ID_Continue";
header = "# Derived Property: " + name
+ "\r\n# Characters that can continue an identifier."
+ "\r\n# Generated from: ID_Start + Mn+Mc+Nd+Pc"
+ "\r\n# NOTE: Cf characters should be filtered out.";
}
boolean hasProperty(int cp) {
return ucdData.isIdentifierContinue_NO_Cf(cp, false);
}
};
dprops[Mod_ID_Start] = new DProp() {
{
name = "XID_Start";
header = "# Derived Property: " + name
+ "\r\n# ID_Start modified for closure under NFKx"
+ "\r\n# Modified as described in UAX #15"
+ "\r\n# NOTE: Does NOT remove the non-NFKx characters."
+ "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
}
boolean hasProperty(int cp) {
return ucdData.isIdentifierStart(cp, true);
}
};
dprops[Mod_ID_Continue_NO_Cf] = new DProp() {
{
name = "XID_Continue";
header = "# Derived Property: " + name
+ "\r\n# Mod_ID_Continue modified for closure under NFKx"
+ "\r\n# Modified as described in UAX #15"
+ "\r\n# NOTE: Cf characters should be filtered out."
+ "\r\n# NOTE: Does NOT remove the non-NFKx characters."
+ "\r\n# Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))";
}
boolean hasProperty(int cp) {
return ucdData.isIdentifierContinue_NO_Cf(cp, true);
}
};
// Math: general category Sm, or the Math_Property binary property.
dprops[PropMath] = new DProp() {
    {
        name = "Math";
        header = "# Derived Property: " + name
            + "\r\n# Generated from: Sm + Other_Math";
    }

    boolean hasProperty(int cp) {
        return ucdData.getCategory(cp) == Sm
            || ucdData.getBinaryProperty(cp, Math_Property);
    }
};
dprops[PropAlphabetic] = new DProp() {
{
name = "Alphabetic";
header = "# Derived Property: " + name
+ "\r\n# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic";
}
boolean hasProperty(int cp) {
byte cat = ucdData.getCategory(cp);
if (cat == Lu || cat == Ll || cat == Lt || cat == Lm || cat == Lo || cat == Nl
|| ucdData.getBinaryProperty(cp, Alphabetic)) return true;
return false;
}
};
// Lowercase: general category Ll, or the Other_Lowercase binary property.
dprops[PropLowercase] = new DProp() {
    {
        name = "Lowercase";
        header = "# Derived Property: " + name
            + "\r\n# Generated from: Ll + Other_Lowercase";
    }

    boolean hasProperty(int cp) {
        return ucdData.getCategory(cp) == Ll
            || ucdData.getBinaryProperty(cp, Other_Lowercase);
    }
};
// Uppercase: general category Lu, or the Other_Uppercase binary property.
dprops[PropUppercase] = new DProp() {
    {
        name = "Uppercase";
        header = "# Derived Property: " + name
            + "\r\n# Generated from: Lu + Other_Uppercase";
    }

    boolean hasProperty(int cp) {
        return ucdData.getCategory(cp) == Lu
            || ucdData.getBinaryProperty(cp, Other_Uppercase);
    }
};
// Register the three Possible_Missing_* case properties.
for (int prop = Missing_Uppercase; prop <= Missing_Mixedcase; ++prop) {
    dprops[prop] = new CaseDProp(prop);
}
/*
(3) Singleton Decompositions: characters that can be derived from the UnicodeData file by
including all characters whose canonical decomposition consists of a single character.
(4) Non-Starter Decompositions: characters that can be derived from the UnicodeData
file by including all characters whose canonical decomposition consists of a sequence
of characters, the first of which has a non-zero combining class.
*/
// Comp_Ex: represented characters with a canonical decomposition that
// isCompEx classifies as excluded from composition.
dprops[FullCompExclusion] = new DProp() {
    {
        name = "Comp_Ex";
        header = "# Derived Property: " + name
            + ": Full Composition Exclusion"
            + "\r\n# Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions";
    }

    boolean hasProperty(int cp) {
        // The checks run in this order; each later one is skipped once an earlier one fails.
        return ucdData.isRepresented(cp)
            && ucdData.getDecompositionType(cp) == CANONICAL
            && isCompEx(cp);
    }
};
// Comp_In: represented characters with a canonical decomposition that are
// NOT excluded from composition, i.e. the complement of Comp_Ex within the
// canonically decomposable characters.
dprops[FullCompInclusion] = new DProp() {
    {
        name = "Comp_In";
        header = "# Derived Property: " + name
            + ": Full Composition Inclusion"
            + "\r\n# characters with Canonical Decompositions MINUS Full Composition Exclusion";
    }

    boolean hasProperty(int cp) {
        if (!ucdData.isRepresented(cp)) return false;
        byte dtype = ucdData.getDecompositionType(cp);
        if (dtype != CANONICAL) return false;
        // "MINUS Full Composition Exclusion": excluded characters must be
        // rejected. The previous body was a copy of Comp_Ex and accepted them,
        // making Comp_In identical to Comp_Ex.
        return !isCompEx(cp);
    }
};
// FC_NFKC_Closure: case-fold and NFKC-normalize the character, repeat the
// step on the result, and report the second result when the two differ.
dprops[FC_NFKC_Closure] = new DProp() {
    {
        name = "FC_NFKC_Closure";
        header = "# Derived Property: " + name
            + "\r\n# Generated from computing: b = NFKC(Fold(a)); c = NFKC(Fold(b));"
            + "\r\n# Then if (c != b) add the mapping from a to c to the set of"
            + "\r\n# mappings that constitute the FC_NFKC_Closure list";
    }

    public boolean propertyVaries() {
        return true;
    }

    /** Returns "FNC; " + hex of the second-round result, or "" when there is no mapping. */
    public String getProperty(int cp) {
        if (!ucdData.isRepresented(cp)) {
            return "";
        }
        final String firstRound = nfkc.normalize(fold(cp));
        final String secondRound = nfkc.normalize(fold(firstRound));
        return secondRound.equals(firstRound) ? "" : "FNC; " + Utility.hex(secondRound);
    }

    boolean hasProperty(int cp) {
        return getProperty(cp).length() != 0;
    }
};
// Register the four *_QuickCheck properties.
for (int prop = QuickNFD; prop <= QuickNFKC; ++prop) {
    dprops[prop] = new QuickDProp(prop);
}
// Default_Ignorable_Code_Point: categories Cf, Cs, Cc or the
// Reserved_Cf_Code_Point binary property, minus anything with White_space.
dprops[DefaultIgnorable] = new DProp() {
    {
        name = "Default_Ignorable_Code_Point";
        header = "# Derived Property: " + name
            + "\r\n# Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs - WhiteSpace";
    }

    boolean hasProperty(int cp) {
        // White space is removed from the set before anything else is looked at.
        if (ucdData.getBinaryProperty(cp, White_space)) {
            return false;
        }
        final byte category = ucdData.getCategory(cp);
        return category == Cf
            || category == Cs
            || category == Cc
            || ucdData.getBinaryProperty(cp, Reserved_Cf_Code_Point);
    }
};
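/*
Grapheme cluster properties:
GraphemeExtend = 27,
GraphemeBase = 28,
# GraphemeExtend := Me + Mn + Mc + Other_GraphemeExtend - GraphemeLink
# GraphemeBase := [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - GraphemeLink - GraphemeExtend
*/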
// GraphemeExtend: categories Me, Mn, Mc or the Other_GraphemeExtend binary
// property, minus anything with the GraphemeLink binary property.
dprops[GraphemeExtend] = new DProp() {
    {
        name = "GraphemeExtend";
        header = "# Derived Property: " + name
            + "\r\n# Generated from: Me + Mn + Mc + Other_GraphemeExtend - GraphemeLink"
            + "\r\n# Used in the definition of GraphemeCluster: "
            + "\r\n# GraphemeCluster ::= GraphameBase? ( GraphemeExtend | GraphemeLink Join_Control? GraphemeBase? )*";
    }

    boolean hasProperty(int cp) {
        // The excluded set is GraphemeLink. The previous code passed
        // GraphemeExtend here, which in this class is the derived-property
        // number (27), not a binary-property selector.
        if (ucdData.getBinaryProperty(cp, GraphemeLink)) return false;
        byte cat = ucdData.getCategory(cp);
        return cat == Me || cat == Mn || cat == Mc
            || ucdData.getBinaryProperty(cp, Other_GraphemeExtend);
    }
};
dprops[GraphemeBase] = new DProp() {
{
name = "GraphemeBase";
header = header = "# Derived Property: " + name
+ "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - GraphemeLink - GraphemeExtend"
+ "\r\n# Used in the definition of GraphemeCluster: "
+ "\r\n# GraphemeCluster ::= GraphameBase? ( GraphemeExtend | GraphemeLink Join_Control? GraphemeBase? )*";
}
boolean hasProperty(int cp) {
byte cat = ucdData.getCategory(cp);
if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn || cat == Zl || cat == Zp
|| ucdData.getBinaryProperty(cp,GraphemeLink)) return false;
if (dprops[GraphemeExtend].hasProperty(cp)) return false;
return true;
}
};
}
/**
 * Determines a case category for a code point, looking at its NFKD form
 * when the character itself gives no answer.
 *
 * <ul>
 * <li>Lu / Ll when the character has that category or the matching
 *     Other_Uppercase / Other_Lowercase property;</li>
 * <li>its own category when that is Lt, Lo, Lm or Nl;</li>
 * <li>Lo when NFKD normalization does not change the character;</li>
 * <li>otherwise Ll, Lu or Lt depending on whether the characters of the
 *     NFKD form are only lowercase, only uppercase, or any other mix of
 *     cased characters; the original category when none of them is cased.</li>
 * </ul>
 *
 * @param cp code point to classify
 * @return one of the general-category constants as described above
 */
byte getDecompCat(int cp) {
    byte cat = ucdData.getCategory(cp);
    if (cat == Lu
        || ucdData.getBinaryProperty(cp, Other_Uppercase)) return Lu;
    if (cat == Ll
        || ucdData.getBinaryProperty(cp, Other_Lowercase)) return Ll;
    if (cat == Lt || cat == Lo || cat == Lm || cat == Nl) return cat;
    // nf[2] is the NFKD normalizer.
    if (!nf[2].normalizationDiffers(cp)) return Lo;

    String norm = nf[2].normalize(cp);
    int cp2;
    boolean gotUpper = false;
    boolean gotLower = false;
    boolean gotTitle = false;
    // Walk the NFKD form one code point at a time (count16 gives the UTF-16 length).
    for (int i = 0; i < norm.length(); i += UTF32.count16(cp2)) {
        cp2 = UTF32.char32At(norm, i);
        byte catx = ucdData.getCategory(cp2);
        // Query the properties of the decomposed character cp2. The previous
        // code asked about cp, for which both properties are already known
        // to be false at this point, so the Other_* properties never counted.
        boolean upx = ucdData.getBinaryProperty(cp2, Other_Uppercase);
        boolean lowx = ucdData.getBinaryProperty(cp2, Other_Lowercase);
        // U+0345 is counted as lowercase regardless of its category.
        if (catx == Ll || lowx || cp2 == 0x345) gotLower = true;
        if (catx == Lu || upx) gotUpper = true;
        if (catx == Lt) gotTitle = true;
    }
    if (gotLower && !gotUpper && !gotTitle) return Ll;
    if (!gotLower && gotUpper && !gotTitle) return Lu;
    if (gotLower || gotUpper || gotTitle) return Lt;
    return cat;
}
/**
 * Tells whether a character is excluded from composition: it has the
 * CompositionExclusion binary property, its decomposition mapping is a
 * single code point, or the first code point of that mapping has a
 * non-zero combining class.
 *
 * @param cp code point to test; callers in this class only pass
 *           characters with a canonical decomposition
 * @return true when any of the three conditions holds
 */
boolean isCompEx(int cp) {
    if (ucdData.getBinaryProperty(cp, CompositionExclusion)) {
        return true;
    }
    final String mapping = ucdData.getDecompositionMapping(cp);
    // Singleton decomposition.
    if (UTF32.length32(mapping) == 1) {
        return true;
    }
    // Non-starter decomposition: the mapping begins with a combining character.
    final int leading = UTF32.char32At(mapping, 0);
    return ucdData.getCombiningClass(leading) != 0;
}
/**
 * Returns the result of UCD.getCase for one code point with the FULL and FOLD selectors.
 *
 * @param cp code point to fold
 * @return the string returned by the UCD
 */
String fold(int cp) {
    final String folded = ucdData.getCase(cp, FULL, FOLD);
    return folded;
}
/**
 * Returns the result of UCD.getCase for a string with the FULL and FOLD selectors.
 *
 * @param s string to fold
 * @return the string returned by the UCD
 */
String fold(String s) {
    final String folded = ucdData.getCase(s, FULL, FOLD);
    return folded;
}
/**
 * Manual smoke test: for every code point from 0xA0 up to (but not
 * including) 0xFF, prints a blank line, the character's code and name,
 * and then each non-empty derived property value on its own tab-indented line.
 */
public static void test() {
    final UCD ucd = UCD.make();
    final DerivedProperty derived = new DerivedProperty(ucd);

    int cp = 0xA0;
    while (cp < 0xFF) {
        System.out.println();
        System.out.println(ucd.getCodeAndName(cp));
        for (int prop = 0; prop < LIMIT; ++prop) {
            final String value = derived.getProperty(cp, prop);
            if (value.length() > 0) {
                System.out.println("\t" + value);
            }
        }
        ++cp;
    }
}
}