scuffed-code/tools/unicodetools/com/ibm/text/UCD/Normalizer.java

574 lines
20 KiB
Java
Raw Normal View History

2001-08-31 00:30:17 +00:00
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
2003-02-25 23:38:23 +00:00
* $Date: 2003/02/25 23:38:22 $
* $Revision: 1.14 $
2001-08-31 00:30:17 +00:00
*
*******************************************************************************
*/
2001-08-30 20:50:18 +00:00
package com.ibm.text.UCD;
import java.util.*;
import com.ibm.icu.text.UTF16;
2001-08-30 20:50:18 +00:00
import com.ibm.text.utility.*;
/**
* Implements Unicode Normalization Forms C, D, KC, KD.<br>
* See UTR#15 for details.<br>
* Copyright (C) 1998-1999 Unicode, Inc. All Rights Reserved.<br>
* The Unicode Consortium makes no expressed or implied warranty of any
* kind, and assumes no liability for errors or omissions.
* No liability is assumed for incidental and consequential damages
* in connection with or arising out of the use of the information here.
* @author Mark Davis
*/
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
public final class Normalizer implements UCD_Types {
2001-08-31 00:30:17 +00:00
public static final String copyright =
2001-08-30 20:50:18 +00:00
"Copyright (C) 2000, IBM Corp. and others. All Rights Reserved.";
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
public static boolean SHOW_PROGRESS = false;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
* Create a normalizer for a given form.
*/
public Normalizer(byte form, String unicodeVersion) {
this.form = form;
this.composition = (form & NF_COMPOSITION_MASK) != 0;
this.compatibility = (form & NF_COMPATIBILITY_MASK) != 0;
2001-08-30 20:50:18 +00:00
this.data = getData(unicodeVersion);
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
* Create a normalizer for a given form.
*/
// public Normalizer(byte form) {
// this(form,"");
//}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
 * Returns the name of a normalization form.
 *
 * @param form normalization form, used directly as an index into
 *     UCD_Names.NF_NAME
 * @return the name stored for that form
 */
public static String getName(byte form) {
    final String name = UCD_Names.NF_NAME[form];
    return name;
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
 * Returns the name of the normalization form this normalizer was created with.
 *
 * @return the form name
 */
public String getName() {
    return getName(this.form);
}
2002-03-20 00:21:43 +00:00
/**
 * Returns the version reported by the character data backing this normalizer.
 *
 * @return the UCD version string
 */
public String getUCDVersion() {
    final String version = this.data.getUCDVersion();
    return version;
}
/**
 * Tells whether this normalizer's form has the composition bit set.
 *
 * @return true if normalization ends with a composition pass
 */
public boolean isComposition() {
    return this.composition;
}
/**
 * Tells whether this normalizer's form has the compatibility bit set.
 *
 * @return true if compatibility decompositions are applied in addition to
 *     canonical ones
 */
public boolean isCompatibility() {
    return this.compatibility;
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
2001-08-31 00:30:17 +00:00
* Normalizes text according to the chosen form,
2001-08-30 20:50:18 +00:00
* replacing contents of the target buffer.
* @param source the original text, unnormalized
* @param target the resulting normalized text
*/
public StringBuffer normalize(String source, StringBuffer target) {
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// First decompose the source into target,
// then compose if the form requires.
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
if (source.length() != 0) {
internalDecompose(source, target, true, compatibility);
2001-08-30 20:50:18 +00:00
if (composition) {
internalCompose(target);
}
}
return target;
}
/**
 * Checks whether canonical reordering has any effect on the canonical
 * decomposition of the text: the source is canonically decomposed once
 * without reordering and once with reordering, and the two results are
 * compared.
 *
 * @param source the text to test
 * @return true if both decompositions are identical, or if the source is empty
 */
public boolean isFCD(String source) {
    if (source.length() == 0) {
        return true;
    }

    StringBuffer unordered = new StringBuffer();
    internalDecompose(source, unordered, false, false);

    StringBuffer ordered = new StringBuffer();
    internalDecompose(source, ordered, true, false);

    return ordered.toString().equals(unordered.toString());
}
2001-08-30 20:50:18 +00:00
/**
 * Normalizes text according to the chosen form.
 *
 * @param source the original text, unnormalized
 * @return the normalized text as a new string
 */
public String normalize(String source) {
    StringBuffer result = new StringBuffer();
    normalize(source, result);
    return result.toString();
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
 * Normalizes a single code point according to the chosen form.
 *
 * @param cp the code point to normalize
 * @return the normalized text for that code point
 */
public String normalize(int cp) {
    final String asString = UTF16.valueOf(cp);
    return normalize(asString);
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
private StringBuffer hasDecompositionBuffer = new StringBuffer();
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
public boolean hasDecomposition(int cp) {
hasDecompositionBuffer.setLength(0);
normalize(UTF16.valueOf(cp), hasDecompositionBuffer);
if (hasDecompositionBuffer.length() != 1) return true;
return cp != hasDecompositionBuffer.charAt(0);
}
2002-03-20 00:21:43 +00:00
*/
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
* Does a quick check to see if the string is in the current form. Checks canonical order and
* isAllowed().
* @param source source text
* @return YES, NO, MAYBE
*/
/*
public static final int NO = 0, YES = 1, MAYBE = -1;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
public int quickCheck(String source) {
short lastCanonicalClass = 0;
int result = YES;
for (int i = 0; i < source.length(); ++i) {
char ch = source.charAt(i);
short canonicalClass = data.getCanonicalClass(ch);
if (lastCanonicalClass > canonicalClass && canonicalClass != 0) {
return NO;
}
int check = isAllowed(ch);
if (check == NO) return NO;
if (check == MAYBE) result = MAYBE;
}
return result;
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
* Find whether the given character is allowed in the current form.
* @return YES, NO, MAYBE
*/
/*
public int isAllowed(char ch) {
if (composition) {
if (compatibility) {
if (data.isCompatibilityExcluded(ch)) {
return NO;
}
} else {
if (data.isExcluded(ch)) {
return NO;
}
}
if (data.isTrailing(ch)) {
return MAYBE;
}
} else { // decomposition: both NFD and NFKD
if (data.normalizationDiffers(compatibility,ch)) return NO;
}
return YES;
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
* Utility: Gets the combining class of a character from the
* Unicode Character Database. Only a byte is needed, but since they are signed in Java
* return an int to forstall problems.
* @param ch the source character
* @return value from 0 to 255
*/
2001-08-31 00:30:17 +00:00
public short getCanonicalClass(int ch) {
2001-08-30 20:50:18 +00:00
return data.getCanonicalClass(ch);
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
2001-08-31 00:30:17 +00:00
* Utility: Checks whether there is a recursive decomposition of a character from the
2001-08-30 20:50:18 +00:00
* Unicode Character Database. It is compatibility or canonical according to the particular
* normalizer.
* @param ch the source character
*/
public boolean isNormalized(int ch) {
return !data.normalizationDiffers(ch, composition, compatibility);
2001-08-30 20:50:18 +00:00
}
2001-08-31 00:30:17 +00:00
/**
 * Utility: tells whether a string is already in this normalizer's form.
 * <p>
 * A string holding exactly one code point is answered from the per-character
 * data, the same lookup {@code isNormalized(int)} uses. Every other string
 * (empty, or two or more code points) is normalized and compared with the
 * original. The per-character shortcut must not be used for longer strings,
 * because it only inspects the first code point.
 *
 * @param s the source text
 * @return true if normalizing s would leave it unchanged
 */
public boolean isNormalized(String s) {
    if (UTF16.countCodePoint(s) == 1) {
        return !data.normalizationDiffers(UTF16.charAt(s, 0), composition, compatibility);
    }
    return s.equals(normalize(s)); // TODO: OPTIMIZE LATER
}
2001-08-30 20:50:18 +00:00
/**
 * Utility: gets the recursive decomposition of a character from the
 * character data. Whether compatibility mappings are applied on top of the
 * canonical ones follows this normalizer's own compatibility setting.
 *
 * @param ch the source character
 * @param buffer buffer that receives the decomposition
 */
public void getRecursiveDecomposition(char ch, StringBuffer buffer) {
    final int codePoint = ch;
    data.getRecursiveDecomposition(codePoint, buffer, this.compatibility);
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
* Utility: Gets composition mapping.
* @return IntEnumeration with the pair -> value mapping, where the
* pair is firstChar << 16 | secondChar.
* Will need to be fixed for surrogates.
*/
2001-08-31 00:30:17 +00:00
public void getCompositionStatus(BitSet leading, BitSet trailing, BitSet resulting) {
Iterator it = data.compTable.keySet().iterator();
while (it.hasNext()) {
Long key = (Long)it.next();
Integer result = (Integer)data.compTable.get(key);
long keyLong = key.longValue();
if (leading != null) leading.set((int)(keyLong >>> 32));
if (trailing != null) trailing.set((int)keyLong);
if (resulting != null) resulting.set(result.intValue());
}
for (int i = UCD.LBase; i < UCD.TLimit; ++i) {
if (leading != null && UCD.isLeadingJamo(i)) leading.set(i); // set all initial Jamo (that form syllables)
if (trailing != null && UCD.isNonLeadJamo(i)) trailing.set(i); // set all final Jamo (that form syllables)
}
if (leading != null) {
for (int i = UCD.SBase; i < UCD.SLimit; ++i) {
if (UCD.isDoubleHangul(i)) leading.set(i); // set all two-Jamo syllables
}
}
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
 * Tells whether the code point can be the second element of a composition.
 * Always false for a normalizer whose form does not compose.
 */
public boolean isTrailing(int cp) {
    return this.composition && data.isTrailing(cp);
}
2001-08-31 00:30:17 +00:00
/**
 * Tells whether the code point can be the first element of a composition.
 * Always false for a normalizer whose form does not compose.
 */
public boolean isLeading(int cp) {
    return this.composition && data.isLeading(cp);
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// ======================================
// PRIVATES
// ======================================
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
* The current form.
*/
private byte form;
2001-08-30 20:50:18 +00:00
private boolean composition;
private boolean compatibility;
/**
* Decomposes text, either canonical or compatibility,
* replacing contents of the target buffer.
* @param form the normalization form. If NF_COMPATIBILITY_MASK
2001-08-31 00:30:17 +00:00
* bit is on in this byte, then selects the recursive
2001-08-30 20:50:18 +00:00
* compatibility decomposition, otherwise selects
* the recursive canonical decomposition.
* @param source the original text, unnormalized
* @param target the resulting normalized text
*/
private void internalDecompose(String source, StringBuffer target, boolean reorder, boolean compat) {
2001-08-30 20:50:18 +00:00
StringBuffer buffer = new StringBuffer();
int ch32;
for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) {
buffer.setLength(0);
ch32 = UTF16.charAt(source, i);
data.getRecursiveDecomposition(ch32, buffer, compat);
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// add all of the characters in the decomposition.
// (may be just the original character, if there was
// no decomposition mapping)
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
int ch;
for (int j = 0; j < buffer.length(); j += UTF16.getCharCount(ch)) {
2001-09-06 01:30:31 +00:00
ch = UTF16.charAt(buffer, j);
2001-08-30 20:50:18 +00:00
int chClass = data.getCanonicalClass(ch);
int k = target.length(); // insertion point
if (chClass != 0 && reorder) {
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// bubble-sort combining marks as necessary
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
int ch2;
for (; k > 0; k -= UTF16.getCharCount(ch2)) {
2001-09-06 01:30:31 +00:00
ch2 = UTF16.charAt(target, k-1);
2001-08-30 20:50:18 +00:00
if (data.getCanonicalClass(ch2) <= chClass) break;
}
}
target.insert(k, UTF16.valueOf(ch));
}
}
}
/**
* Composes text in place. Target must already
* have been decomposed.
* Uses UTF16, which is a utility class for supplementary character support in Java.
* @param target input: decomposed text.
* output: the resulting normalized text.
*/
private void internalCompose(StringBuffer target) {
int starterPos = 0;
2001-09-06 01:30:31 +00:00
int starterCh = UTF16.charAt(target,0);
2001-08-30 20:50:18 +00:00
int compPos = UTF16.getCharCount(starterCh); // length of last composition
int lastClass = data.getCanonicalClass(starterCh);
if (lastClass != 0) lastClass = 256; // fix for strings staring with a combining mark
int oldLen = target.length();
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
// Loop on the decomposed characters, combining where possible
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
int ch;
for (int decompPos = compPos; decompPos < target.length(); decompPos += UTF16.getCharCount(ch)) {
2001-09-06 01:30:31 +00:00
ch = UTF16.charAt(target, decompPos);
2001-08-30 20:50:18 +00:00
if (SHOW_PROGRESS) System.out.println(Utility.hex(target)
+ ", decompPos: " + decompPos
+ ", compPos: " + compPos
+ ", ch: " + Utility.hex(ch)
);
int chClass = data.getCanonicalClass(ch);
int composite = data.getPairwiseComposition(starterCh, ch);
if (composite != data.NOT_COMPOSITE
&& (lastClass < chClass || lastClass == 0)) {
UTF16.setCharAt(target, starterPos, composite);
// we know that we will only be replacing non-supplementaries by non-supplementaries
// so we don't have to adjust the decompPos
starterCh = composite;
} else {
if (chClass == 0) {
starterPos = compPos;
starterCh = ch;
}
lastClass = chClass;
UTF16.setCharAt(target, compPos, ch);
if (target.length() != oldLen) { // MAY HAVE TO ADJUST!
System.out.println("ADJUSTING: " + Utility.hex(target));
decompPos += target.length() - oldLen;
oldLen = target.length();
}
compPos += UTF16.getCharCount(ch);
}
}
target.setLength(compPos);
}
static class Stub {
private UCD ucd;
private HashMap compTable = new HashMap();
private BitSet isSecond = new BitSet();
private BitSet isFirst = new BitSet();
2001-08-30 20:50:18 +00:00
private BitSet canonicalRecompose = new BitSet();
private BitSet compatibilityRecompose = new BitSet();
static final int NOT_COMPOSITE = 0xFFFF;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
 * Builds the composition tables for one Unicode version by scanning every
 * code point from U+0000 through U+10FFFF inclusive.
 * <p>
 * Unassigned and private-use code points are skipped. Non-leading Jamo are
 * marked as possible second elements, and code points for which
 * isLeadingJamoComposition holds as possible first elements. For every
 * remaining code point with a canonical decomposition type that is not a
 * composition exclusion, and whose mapping is exactly two code points with a
 * first element of combining class 0, the pair is recorded in the composition
 * table and the recomposition bits are set.
 *
 * @param version the UCD version to load
 * @throws ChainException wrapping any exception raised while processing a
 *     code point, including the IllegalArgumentException raised for a
 *     canonical mapping longer than two code points when the version string
 *     compares as "3.0.0" or later
 */
Stub(String version) {
    ucd = UCD.make(version);
    // Inclusive bound: U+10FFFF is the last code point and must be visited too.
    for (int i = 0; i <= 0x10FFFF; ++i) {
        if (!ucd.isAssigned(i)) continue;
        if (ucd.isPUA(i)) continue;
        if (ucd.isNonLeadJamo(i)) isSecond.set(i);
        if (ucd.isLeadingJamoComposition(i)) isFirst.set(i);
        byte dt = ucd.getDecompositionType(i);
        if (dt != CANONICAL) continue;
        if (!ucd.getBinaryProperty(i, CompositionExclusion)) {
            try {
                String s = ucd.getDecompositionMapping(i);
                int len = UTF16.countCodePoint(s);
                if (len != 2) {
                    // Singletons are silently skipped; longer mappings are
                    // only tolerated for versions before 3.0.0.
                    if (len > 2) {
                        if (ucd.getVersion().compareTo("3.0.0") >= 0) {
                            throw new IllegalArgumentException("BAD LENGTH: " + len + ucd.toString(i));
                        }
                    }
                    continue;
                }
                int a = UTF16.charAt(s, 0);
                // A pair whose first element is not a starter never recomposes.
                if (ucd.getCombiningClass(a) != 0) continue;
                isFirst.set(a);

                int b = UTF16.charAt(s, UTF16.getCharCount(a));
                isSecond.set(b);

                // have a recomposition, so set the bit
                canonicalRecompose.set(i);

                // set the compatibility recomposition bit
                // ONLY if the component characters
                // don't compatibility decompose
                if (ucd.getDecompositionType(a) <= CANONICAL
                    && ucd.getDecompositionType(b) <= CANONICAL) {
                    compatibilityRecompose.set(i);
                }

                // Key packs the pair as (first << 32) | second.
                long key = (((long)a)<<32) | b;
                compTable.put(new Long(key), new Integer(i));
            } catch (Exception e) {
                throw new ChainException("Error: {0}", new Object[]{ucd.toString(i)}, e);
            }
        }
    }
    // NOTE: a second pass over compTable to refine compatibilityRecompose
    // (once all characters have been processed) was sketched here but never
    // implemented.
}
2002-03-20 00:21:43 +00:00
/** Returns the version string reported by the underlying UCD instance. */
String getUCDVersion() {
    final String version = ucd.getVersion();
    return version;
}
2001-08-30 20:50:18 +00:00
/*
Problem: differs: true, call: false U+0385 GREEK DIALYTIKA TONOS
Problem: differs: true, call: false U+03D3 GREEK UPSILON WITH ACUTE AND HOOK SYMBOL
Problem: differs: true, call: false U+03D4 GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL
Problem: differs: true, call: false U+1E9B LATIN SMALL LETTER LONG S WITH DOT ABOVE
Problem: differs: true, call: false U+1FC1 GREEK DIALYTIKA AND PERISPOMENI
Problem: differs: true, call: false U+1FCD GREEK PSILI AND VARIA
Problem: differs: true, call: false U+1FCE GREEK PSILI AND OXIA
Problem: differs: true, call: false U+1FCF GREEK PSILI AND PERISPOMENI
Problem: differs: true, call: false U+1FDD GREEK DASIA AND VARIA
Problem: differs: true, call: false U+1FDE GREEK DASIA AND OXIA
Problem: differs: true, call: false U+1FDF GREEK DASIA AND PERISPOMENI
Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
*/
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/** Returns the combining class the underlying UCD instance reports for cp. */
short getCanonicalClass(int cp) {
    final short combiningClass = ucd.getCombiningClass(cp);
    return combiningClass;
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/** Tells whether cp was recorded as a possible second element of a composition. */
boolean isTrailing(int cp) {
    final boolean second = isSecond.get(cp);
    return second;
}
2001-08-31 00:30:17 +00:00
/** Tells whether cp was recorded as a possible first element of a composition. */
boolean isLeading(int cp) {
    final boolean first = isFirst.get(cp);
    return first;
}
boolean normalizationDiffers(int cp, boolean composition, boolean compat) {
2001-08-30 20:50:18 +00:00
byte dt = ucd.getDecompositionType(cp);
if (!composition) {
if (compat) return dt >= CANONICAL;
2001-08-30 20:50:18 +00:00
else return dt == CANONICAL;
} else {
// almost the same, except that we add back in the characters
// that RECOMPOSE
if (compat) return dt >= CANONICAL && !compatibilityRecompose.get(cp);
2001-08-30 20:50:18 +00:00
else return dt == CANONICAL && !canonicalRecompose.get(cp);
}
}
2001-08-31 00:30:17 +00:00
public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compat) {
2001-08-30 20:50:18 +00:00
byte dt = ucd.getDecompositionType(cp);
// we know we decompose all CANONICAL, plus > CANONICAL if compat is TRUE.
if (dt == CANONICAL || dt > CANONICAL && compat) {
2001-08-30 20:50:18 +00:00
String s = ucd.getDecompositionMapping(cp);
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(s, i);
getRecursiveDecomposition(cp, buffer, compat);
2001-08-30 20:50:18 +00:00
}
} else {
UTF16.append(buffer, cp);
}
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
 * Looks up the composite for a (starter, following character) pair.
 * Hangul composition is tried first; otherwise the composition table is
 * consulted using the key (starterCh &lt;&lt; 32) | ch.
 *
 * @param starterCh the first code point of the pair
 * @param ch the second code point of the pair
 * @return the composite, or 0xFFFF if the pair does not compose
 */
int getPairwiseComposition(int starterCh, int ch) {
    int hangul = UCD.composeHangul(starterCh, ch);
    if (hangul != 0xFFFF) {
        return hangul;
    }

    long packedPair = (((long) starterCh) << 32) | ch;
    Integer composite = (Integer) compTable.get(new Long(packedPair));
    return composite == null ? 0xFFFF : composite.intValue();
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
* Contains normalization data from the Unicode Character Database.
2001-08-31 00:30:17 +00:00
* use false for the minimal set, true for the real set.
2001-08-30 20:50:18 +00:00
*/
private Stub data;
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
private static HashMap versionCache = new HashMap();
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
 * Returns the normalization data for a Unicode version, building it on first
 * use and caching it per version string afterwards.
 *
 * @param version the UCD version; an empty string selects UCD.latestVersion
 * @return the cached or newly built data for that version
 */
private static Stub getData (String version) {
    String key = version.length() == 0 ? UCD.latestVersion : version;

    Stub cached = (Stub) versionCache.get(key);
    if (cached != null) {
        return cached;
    }

    Stub created = new Stub(key);
    versionCache.put(key, created);
    return created;
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
* Just accessible for testing.
*/
/*
boolean isExcluded (char ch) {
return data.isExcluded(ch);
}
2001-08-31 00:30:17 +00:00
2001-08-30 20:50:18 +00:00
/**
* Just accessible for testing.
*/
/*
String getRawDecompositionMapping (char ch) {
return data.getRawDecompositionMapping(ch);
}
//*/
}