ICU-7273 port FilteredNormalizer2 to Java and implement Unicode 3.2 normalization via new code

X-SVN-Rev: 27464
This commit is contained in:
Markus Scherer 2010-01-30 19:42:03 +00:00
parent 13bbe4f77d
commit f934780ab2
3 changed files with 182 additions and 149 deletions

View File

@ -554,6 +554,8 @@ public final class Normalizer2Impl {
* @param c code point
* @param buffer out-only buffer gets the decomposition appended
* @return true if c has a decomposition
* TODO: Look at the call sites and see if it would be better to return a String
* rather than writing to an Appendable.
*/
public boolean getDecomposition(int c, Appendable buffer) {
try {

View File

@ -6,6 +6,8 @@
*/
package com.ibm.icu.text;
import java.io.IOException;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;
@ -45,6 +47,11 @@ public class FilteredNormalizer2 extends Normalizer2 {
*/
@Override
public StringBuilder normalize(CharSequence src, StringBuilder dest) {
    // Source and destination must be different objects because dest is
    // emptied before src is read.
    if(src==dest) {
        throw new IllegalArgumentException();
    }
    // Start from an empty destination; the internal overload only appends.
    dest.setLength(0);
    normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
    return dest;
}
/** {@inheritDoc}
@ -52,8 +59,12 @@ public class FilteredNormalizer2 extends Normalizer2 {
* @provisional This API might change or be removed in a future release.
*/
public Appendable normalize(CharSequence src, Appendable dest) {
    // Writing into the object that is being read is not supported.
    if(dest==src) {
        throw new IllegalArgumentException();
    }
    // Begin with SIMPLE: the text is expected to start with an in-filter span.
    return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
}
/** {@inheritDoc}
* @draft ICU 4.4
* @provisional This API might change or be removed in a future release.
@ -61,7 +72,7 @@ public class FilteredNormalizer2 extends Normalizer2 {
@Override
public StringBuilder normalizeSecondAndAppend(
        StringBuilder first, CharSequence second) {
    // doNormalize=true: second is normalized while it is appended.
    return normalizeSecondAndAppend(first, second, true);
}
/** {@inheritDoc}
* @draft ICU 4.4
@ -69,7 +80,7 @@ public class FilteredNormalizer2 extends Normalizer2 {
*/
@Override
public StringBuilder append(StringBuilder first, CharSequence second) {
    // doNormalize=false: second is appended via the wrapped normalizer's append().
    return normalizeSecondAndAppend(first, second, false);
}
/** {@inheritDoc}
@ -78,7 +89,20 @@ public class FilteredNormalizer2 extends Normalizer2 {
*/
@Override
public boolean isNormalized(CharSequence s) {
    // Walk s as alternating spans: a run of code points inside the filter set
    // (SIMPLE), then a run outside it (NOT_CONTAINED), and so on.
    // Only in-filter runs are checked with the wrapped normalizer;
    // out-of-filter runs are skipped without any check.
    UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
    for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
        int spanLimit=set.span(s, prevSpanLimit, spanCondition);
        if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
            spanCondition=UnicodeSet.SpanCondition.SIMPLE;
        } else {
            if(!norm2.isNormalized(s.subSequence(prevSpanLimit, spanLimit))) {
                return false;
            }
            spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
        }
        prevSpanLimit=spanLimit;
    }
    // No in-filter run was reported as not normalized.
    return true;
}
/** {@inheritDoc}
* @draft ICU 4.4
@ -86,7 +110,25 @@ public class FilteredNormalizer2 extends Normalizer2 {
*/
@Override
public Normalizer.QuickCheckResult quickCheck(CharSequence s) {
    // Combine the quick-check results of all in-filter spans:
    // NO from any span wins immediately, MAYBE from any span downgrades
    // the overall result, otherwise the result stays YES.
    // Out-of-filter spans are skipped.
    Normalizer.QuickCheckResult result=Normalizer.YES;
    UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
    for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
        int spanLimit=set.span(s, prevSpanLimit, spanCondition);
        if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
            spanCondition=UnicodeSet.SpanCondition.SIMPLE;
        } else {
            Normalizer.QuickCheckResult qcResult=
                norm2.quickCheck(s.subSequence(prevSpanLimit, spanLimit));
            if(qcResult==Normalizer.NO) {
                return qcResult;
            } else if(qcResult==Normalizer.MAYBE) {
                result=qcResult;
            }
            spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
        }
        prevSpanLimit=spanLimit;
    }
    return result;
}
/** {@inheritDoc}
* @draft ICU 4.4
@ -94,7 +136,23 @@ public class FilteredNormalizer2 extends Normalizer2 {
*/
@Override
public int spanQuickCheckYes(CharSequence s) {
    // Alternate between in-filter and out-of-filter spans of s.
    // Out-of-filter spans are skipped; for an in-filter span the wrapped
    // normalizer reports how far its quick-check-yes prefix reaches.
    UnicodeSet.SpanCondition spanCondition=UnicodeSet.SpanCondition.SIMPLE;
    for(int prevSpanLimit=0; prevSpanLimit<s.length();) {
        int spanLimit=set.span(s, prevSpanLimit, spanCondition);
        if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
            spanCondition=UnicodeSet.SpanCondition.SIMPLE;
        } else {
            // The wrapped result is relative to the sub-sequence;
            // add the span start to get an index into s.
            int yesLimit=
                prevSpanLimit+
                norm2.spanQuickCheckYes(s.subSequence(prevSpanLimit, spanLimit));
            if(yesLimit<spanLimit) {
                // The yes-prefix ends inside this span.
                return yesLimit;
            }
            spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
        }
        prevSpanLimit=spanLimit;
    }
    return s.length();
}
/** {@inheritDoc}
@ -103,7 +161,7 @@ public class FilteredNormalizer2 extends Normalizer2 {
*/
@Override
public boolean hasBoundaryBefore(int c) {
    // A code point outside the filter set is reported as having a boundary;
    // for one inside the set the wrapped normalizer decides.
    return !set.contains(c) || norm2.hasBoundaryBefore(c);
}
/** {@inheritDoc}
@ -112,7 +170,7 @@ public class FilteredNormalizer2 extends Normalizer2 {
*/
@Override
public boolean hasBoundaryAfter(int c) {
    // A code point outside the filter set is reported as having a boundary;
    // for one inside the set the wrapped normalizer decides.
    return !set.contains(c) || norm2.hasBoundaryAfter(c);
}
/** {@inheritDoc}
@ -121,20 +179,89 @@ public class FilteredNormalizer2 extends Normalizer2 {
*/
@Override
public boolean isInert(int c) {
    // A code point outside the filter set is reported as inert;
    // for one inside the set the wrapped normalizer decides.
    return !set.contains(c) || norm2.isInert(c);
}
/*private StringBuilder normalize(CharSequence src, StringBuilder dest, USetSpanCondition spanCondition) {
// Internal: No argument checking, and appends to dest.
// Pass as input spanCondition the one that is likely to yield a non-zero
// span length at the start of src.
// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
// UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src
// and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue after
// an in-filter prefix.
private Appendable normalize(CharSequence src, Appendable dest,
UnicodeSet.SpanCondition spanCondition) {
// Don't throw away destination buffer between iterations.
StringBuilder tempDest=new StringBuilder();
try {
for(int prevSpanLimit=0; prevSpanLimit<src.length();) {
int spanLimit=set.span(src, prevSpanLimit, spanCondition);
int spanLength=spanLimit-prevSpanLimit;
if(spanCondition==UnicodeSet.SpanCondition.NOT_CONTAINED) {
if(spanLength!=0) {
dest.append(src, prevSpanLimit, spanLimit);
}
spanCondition=UnicodeSet.SpanCondition.SIMPLE;
} else {
if(spanLength!=0) {
// Not norm2.normalizeSecondAndAppend() because we do not want
// to modify the non-filter part of dest.
dest.append(norm2.normalize(src.subSequence(prevSpanLimit, spanLimit), tempDest));
}
spanCondition=UnicodeSet.SpanCondition.NOT_CONTAINED;
}
prevSpanLimit=spanLimit;
}
} catch(IOException e) {
throw new RuntimeException(e);
}
return dest;
} TODO: need UnicodeSet.span() */
}
/*private StringBuilder normalizeSecondAndAppend(
StringBuilder first, CharSequence second, boolean doNormalize) {
private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second,
boolean doNormalize) {
if(first==second) {
throw new IllegalArgumentException();
}
if(first.length()==0) {
if(doNormalize) {
return normalize(second, first);
} else {
return first.append(second);
}
}
// merge the in-filter suffix of the first string with the in-filter prefix of the second
int prefixLimit=set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE);
if(prefixLimit!=0) {
CharSequence prefix=second.subSequence(0, prefixLimit);
int suffixStart=set.spanBack(first, 0x7fffffff, UnicodeSet.SpanCondition.SIMPLE);
if(suffixStart==0) {
if(doNormalize) {
norm2.normalizeSecondAndAppend(first, prefix);
} else {
norm2.append(first, prefix);
}
} else {
StringBuilder middle=new StringBuilder(first.subSequence(suffixStart, 0x7fffffff));
if(doNormalize) {
norm2.normalizeSecondAndAppend(middle, prefix);
} else {
norm2.append(middle, prefix);
}
first.delete(suffixStart, 0x7fffffff).append(middle);
}
}
if(prefixLimit<second.length()) {
CharSequence rest=second.subSequence(prefixLimit, 0x7fffffff);
if(doNormalize) {
normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED);
} else {
first.append(rest);
}
}
return first;
} TODO: need UnicodeSet.span() */
}
// The wrapped normalizer; it is applied only to spans of text inside the filter set.
private Normalizer2 norm2;
// The filter set that decides which code points the wrapped normalizer may see.
private UnicodeSet set;
};

View File

@ -126,12 +126,7 @@ public final class Normalizer implements Cloneable {
private int bufferStart = 0;
private int bufferPos = 0;
private int bufferLimit = 0;
// This tells us what the bits in the "mode" object mean.
private static final int COMPAT_BIT = 1;
private static final int DECOMP_BIT = 2;
private static final int COMPOSE_BIT = 4;
// The input text and our position in it
private UCharacterIterator text;
private Mode mode = NFC;
@ -161,6 +156,10 @@ public final class Normalizer implements Cloneable {
public static class Mode {
protected Mode(Normalizer2 n2) {
    // Keep the given normalizer as is, plus a variant of it that is
    // filtered by UNI32_SET.
    this.normalizer2 = n2;
    this.uni32Normalizer2 = new FilteredNormalizer2(n2, UNI32_SET);
}
protected final Normalizer2 getNormalizer2(int options) {
    // Without the UNICODE_3_2 bit the unfiltered normalizer is used.
    if((options&UNICODE_3_2) == 0) {
        return normalizer2;
    }
    return uni32Normalizer2;
}
/**
@ -251,7 +250,9 @@ public final class Normalizer implements Cloneable {
// This implementation reports every code point as skippable.
// NOTE(review): presumably overridden by the concrete modes - confirm in the full file.
protected boolean isNFSkippable(int c) {
return true;
}
// Normalizer returned by getNormalizer2() when the UNICODE_3_2 option bit is not set.
private final Normalizer2 normalizer2;
// The same normalizer filtered by UNI32_SET; returned when the UNICODE_3_2 bit is set.
private final FilteredNormalizer2 uni32Normalizer2;
// Filter set built from the pattern [:age=3.2:].
private static final UnicodeSet UNI32_SET = new UnicodeSet("[:age=3.2:]");
}
/**
@ -876,31 +877,7 @@ public final class Normalizer implements Cloneable {
* @stable ICU 2.6
*/
public static String compose(String str, boolean compat, int options) {
    // getNormalizer2(options) hands back the Unicode 3.2-filtered normalizer
    // when the UNICODE_3_2 bit is set and the regular one otherwise, so no
    // separate code path is needed for non-zero options.
    return getComposeMode(compat).getNormalizer2(options).normalize(str);
}
/**
@ -944,34 +921,14 @@ public final class Normalizer implements Cloneable {
public static int compose(char[] src,int srcStart, int srcLimit,
                          char[] dest,int destStart, int destLimit,
                          boolean compat, int options) {
    // View src[srcStart..srcLimit) as a CharSequence without copying.
    CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
    // NOTE(review): CharsAppendable presumably writes into dest[destStart..destLimit)
    // and signals overflow itself - confirm against its definition.
    CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
    getComposeMode(compat).getNormalizer2(options).normalize(srcBuffer, app);
    return app.length();
}
private static final int MAX_BUF_SIZE_COMPOSE = 2;
private static final int MAX_BUF_SIZE_DECOMPOSE = 3;
/**
* Decompose a string.
* The string will be decomposed according to the specified mode.
@ -998,26 +955,9 @@ public final class Normalizer implements Cloneable {
* @stable ICU 2.6
*/
public static String decompose(String str, boolean compat, int options) {
    // getNormalizer2(options) hands back the Unicode 3.2-filtered normalizer
    // when the UNICODE_3_2 bit is set and the regular one otherwise.
    return getDecomposeMode(compat).getNormalizer2(options).normalize(str);
}
/**
* Decompose a string.
* The string will be decomposed according to the specified mode.
@ -1059,41 +999,14 @@ public final class Normalizer implements Cloneable {
public static int decompose(char[] src,int srcStart, int srcLimit,
                            char[] dest,int destStart, int destLimit,
                            boolean compat, int options) {
    // View src[srcStart..srcLimit) as a CharSequence without copying.
    CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
    // NOTE(review): CharsAppendable presumably writes into dest[destStart..destLimit)
    // and signals overflow itself - confirm against its definition.
    CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
    getDecomposeMode(compat).getNormalizer2(options).normalize(srcBuffer, app);
    return app.length();
}
private static String makeFCD(String src,int options) {
    // NOTE(review): options is not consulted here, so the UNICODE_3_2 bit
    // has no effect on this path - confirm that this is intended.
    return Norm2AllModes.getFCDNormalizer2NoIOException().normalize(src);
}
/**
@ -1112,10 +1025,7 @@ public final class Normalizer implements Cloneable {
* @stable ICU 2.6
*/
public static String normalize(String str, Mode mode, int options) {
    // The mode picks the regular or the Unicode 3.2-filtered normalizer
    // based on the UNICODE_3_2 bit in options.
    return mode.getNormalizer2(options).normalize(str);
}
/**
@ -1176,22 +1086,12 @@ public final class Normalizer implements Cloneable {
public static int normalize(char[] src,int srcStart, int srcLimit,
                            char[] dest,int destStart, int destLimit,
                            Mode mode, int options) {
    // View src[srcStart..srcLimit) as a CharSequence without copying.
    CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
    // NOTE(review): CharsAppendable presumably writes into dest[destStart..destLimit)
    // and signals overflow itself - confirm against its definition.
    CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
    mode.getNormalizer2(options).normalize(srcBuffer, app);
    return app.length();
}
/**
* Normalize a codepoint according to the given mode
* @param char32 The input code point to be normalized.
@ -1202,9 +1102,15 @@ public final class Normalizer implements Cloneable {
* @stable ICU 2.6
* @see #UNICODE_3_2
*/
// NFD without options is served directly from the code point's decomposition;
// every other mode/options combination normalizes the one-code-point string.
public static String normalize(int char32, Mode mode, int options) {
    if(mode != NFD || options != 0) {
        return normalize(UTF16.valueOf(char32), mode, options);
    }
    StringBuilder buf = new StringBuilder();
    boolean hasDecomposition =
        Norm2AllModes.getNFCInstanceNoIOException().impl.getDecomposition(char32, buf);
    // Without a decomposition the code point is returned as it is.
    return hasDecomposition ? buf.toString() : UTF16.valueOf(char32);
}
@ -1215,10 +1121,8 @@ public final class Normalizer implements Cloneable {
* @return String The normalized string
* @stable ICU 2.6
*/
// Same as normalize(char32, mode, 0); delegating to the code point overload
// lets this call use its NFD shortcut.
public static String normalize(int char32, Mode mode) {
    return normalize(char32, mode, 0);
}
/**