ICU-8227 Fixed many problems in the way the old code worked; now working much better.
X-SVN-Rev: 29207
parent e0872406eb
commit 9ffcb85ba1

.gitattributes (vendored)
@ -262,6 +262,7 @@ icu4j/main/classes/translit/.externalToolBuilders/copy-data-translit.launch -tex
icu4j/main/classes/translit/.settings/org.eclipse.core.resources.prefs -text
icu4j/main/classes/translit/.settings/org.eclipse.jdt.core.prefs -text
icu4j/main/classes/translit/.settings/org.eclipse.jdt.ui.prefs -text
icu4j/main/classes/translit/src/com/ibm/icu/text/SourceTargetUtility.java -text
icu4j/main/classes/translit/translit-build.launch -text
icu4j/main/shared/.project -text
icu4j/main/shared/.settings/org.eclipse.core.resources.prefs -text
@ -404,5 +404,18 @@ class AnyTransliterator extends Transliterator {
        }
        return new AnyTransliterator(getID(), filter, target, targetScript, widthFix, cache);
    }

    /* (non-Javadoc)
     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
     */
    @Override
    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
        UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
        // Assume that it can modify any character to any other character
        sourceSet.addAll(myFilter);
        if (myFilter.size() != 0) {
            targetSet.addAll(0, 0x10FFFF);
        }
    }
}
@ -387,5 +387,17 @@ final class BreakTransliterator extends Transliterator {
        }

    }
    /* (non-Javadoc)
     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
     */
    @Override
    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
        UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
        // Doesn't actually modify the source characters, so leave them alone.
        // add the characters inserted
        if (myFilter.size() != 0) {
            targetSet.addAll(insertion);
        }
    }

}
@ -7,6 +7,7 @@
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import com.ibm.icu.impl.UCaseProps;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
|
||||
/**
|
||||
* A transliterator that performs locale-sensitive toLower()
|
||||
@ -102,4 +103,23 @@ class CaseFoldTransliterator extends Transliterator{
|
||||
}
|
||||
offsets.start = offsets.limit;
|
||||
}
|
||||
|
||||
static SourceTargetUtility sourceTargetUtility = null;
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
|
||||
*/
|
||||
@Override
|
||||
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
|
||||
synchronized (UppercaseTransliterator.class) {
|
||||
if (sourceTargetUtility == null) {
|
||||
sourceTargetUtility = new SourceTargetUtility(new Transform<String,String>() {
|
||||
public String transform(String source) {
|
||||
return UCharacter.foldCase(source, true);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
|
||||
}
|
||||
}
|
||||
|
@ -305,26 +305,20 @@ class CompoundTransliterator extends Transliterator {
    }

    /**
     * Return the set of all characters that may be modified by this
     * Transliterator, ignoring the effect of our filter.
     * @internal
     */
    protected UnicodeSet handleGetSourceSet() {
        UnicodeSet set = new UnicodeSet();
    @Override
    public void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
        UnicodeSet myFilter = new UnicodeSet(getFilterAsUnicodeSet(filter));
        UnicodeSet tempTargetSet = new UnicodeSet();
        for (int i=0; i<trans.length; ++i) {
            set.addAll(trans[i].getSourceSet());
            // Take the example of Hiragana-Latin. This is really
            // Hiragana-Katakana; Katakana-Latin. The source set of
            // these two is roughly [:Hiragana:] and [:Katakana:].
            // But the source set for the entire transliterator is
            // actually [:Hiragana:] ONLY -- that is, the first
            // non-empty source set.

            // This is a heuristic, and not 100% reliable.
            if (!set.isEmpty()) {
                break;
            }
            // each time we produce targets, those can be used by subsequent items, despite the filter.
            // so we get just those items, and add them to the filter each time.
            tempTargetSet.clear();
            trans[i].addSourceTargetSet(myFilter, sourceSet, tempTargetSet);
            targetSet.addAll(tempTargetSet);
            myFilter.addAll(tempTargetSet);
        }
        return set;
    }
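A minimal usage sketch of the per-stage propagation above (not part of this commit; it assumes the standard "Hiragana-Latin" system transliterator, the same example the comment uses):

    Transliterator hiraLatin = Transliterator.getInstance("Hiragana-Latin");
    // Hiragana-Latin is internally Hiragana-Katakana; Katakana-Latin. Because the
    // Katakana produced by the first stage is folded into the working filter, the
    // second stage contributes its Latin output, so getTargetSet() is expected to
    // include Latin (as well as the intermediate Katakana) rather than stopping
    // after the first stage.
    UnicodeSet compoundTargets = hiraLatin.getTargetSet();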

    /**
@ -197,4 +197,23 @@ class EscapeTransliterator extends Transliterator {
|
||||
pos.limit = limit;
|
||||
pos.start = start;
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
|
||||
*/
|
||||
@Override
|
||||
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
|
||||
sourceSet.addAll(getFilterAsUnicodeSet(inputFilter));
|
||||
for (EscapeTransliterator it = this; it != null ; it = it.supplementalHandler) {
|
||||
if (inputFilter.size() != 0) {
|
||||
targetSet.addAll(it.prefix);
|
||||
targetSet.addAll(it.suffix);
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
for (int i = 0; i < it.radix; ++i) {
|
||||
Utility.appendNumber(buffer, i, it.radix, it.minDigits);
|
||||
}
|
||||
targetSet.addAll(buffer.toString()); // TODO drop once String is changed to CharSequence in UnicodeSet
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -7,6 +7,7 @@
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import com.ibm.icu.impl.UCaseProps;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
/**
|
||||
@ -109,4 +110,24 @@ class LowercaseTransliterator extends Transliterator{
|
||||
}
|
||||
offsets.start = offsets.limit;
|
||||
}
|
||||
|
||||
// NOTE: normally this would be static, but because the results vary by locale....
|
||||
SourceTargetUtility sourceTargetUtility = null;
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
|
||||
*/
|
||||
@Override
|
||||
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
|
||||
synchronized (this) {
|
||||
if (sourceTargetUtility == null) {
|
||||
sourceTargetUtility = new SourceTargetUtility(new Transform<String,String>() {
|
||||
public String transform(String source) {
|
||||
return UCharacter.toLowerCase(locale, source);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
|
||||
}
|
||||
}
|
||||
|
@ -165,4 +165,31 @@ class NameUnicodeTransliterator extends Transliterator {
|
||||
// open delimiter candidate.
|
||||
offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor;
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
|
||||
*/
|
||||
@Override
|
||||
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
|
||||
UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
|
||||
if (!myFilter.containsAll(UnicodeNameTransliterator.OPEN_DELIM) || !myFilter.contains(CLOSE_DELIM)) {
|
||||
return; // we have to contain both prefix and suffix
|
||||
}
|
||||
UnicodeSet items = new UnicodeSet()
|
||||
.addAll('0', '9')
|
||||
.addAll('A', 'F')
|
||||
.addAll('a', 'z') // for controls
|
||||
.add('<').add('>') // for controls
|
||||
.add('(').add(')') // for controls
|
||||
.add('-')
|
||||
.add(' ')
|
||||
.addAll(UnicodeNameTransliterator.OPEN_DELIM)
|
||||
.add(CLOSE_DELIM);
|
||||
items.retainAll(myFilter);
|
||||
if (items.size() > 0) {
|
||||
sourceSet.addAll(items);
|
||||
// could produce any character
|
||||
targetSet.addAll(0, 0x10FFFF);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,14 +1,17 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2001-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 06/08/01 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
**********************************************************************
|
||||
* Copyright (C) 2001-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 06/08/01 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.text;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import com.ibm.icu.impl.Norm2AllModes;
|
||||
import com.ibm.icu.impl.Normalizer2Impl;
|
||||
|
||||
@ -76,7 +79,7 @@ final class NormalizationTransliterator extends Transliterator {
|
||||
* Implements {@link Transliterator#handleTransliterate}.
|
||||
*/
|
||||
protected void handleTransliterate(Replaceable text,
|
||||
Position offsets, boolean isIncremental) {
|
||||
Position offsets, boolean isIncremental) {
|
||||
// start and limit of the input range
|
||||
int start = offsets.start;
|
||||
int limit = offsets.limit;
|
||||
@ -129,4 +132,34 @@ final class NormalizationTransliterator extends Transliterator {
|
||||
offsets.contextLimit += limit - offsets.limit;
|
||||
offsets.limit = limit;
|
||||
}
|
||||
|
||||
static final Map<Normalizer2, SourceTargetUtility> SOURCE_CACHE = new HashMap<Normalizer2, SourceTargetUtility>();
|
||||
|
||||
// TODO Get rid of this if Normalizer2 becomes a Transform
|
||||
static class NormalizingTransform implements Transform<String,String> {
|
||||
final Normalizer2 norm2;
|
||||
public NormalizingTransform(Normalizer2 norm2) {
|
||||
this.norm2 = norm2;
|
||||
}
|
||||
public String transform(String source) {
|
||||
return norm2.normalize(source);
|
||||
}
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
|
||||
*/
|
||||
@Override
|
||||
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
|
||||
SourceTargetUtility cache;
|
||||
synchronized (SOURCE_CACHE) {
|
||||
//String id = getID();
|
||||
cache = SOURCE_CACHE.get(norm2);
|
||||
if (cache == null) {
|
||||
cache = new SourceTargetUtility(new NormalizingTransform(norm2), norm2);
|
||||
SOURCE_CACHE.put(norm2, cache);
|
||||
}
|
||||
}
|
||||
cache.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
|
||||
}
|
||||
}
|
||||
|
@ -30,4 +30,12 @@ class NullTransliterator extends Transliterator {
|
||||
Position offsets, boolean incremental) {
|
||||
offsets.start = offsets.limit;
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see com.ibm.icu.text.Transliterator#addSourceTargetSet(boolean, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
|
||||
*/
|
||||
@Override
|
||||
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2004, International Business Machines Corporation and *
|
||||
* Copyright (C) 1996-2010, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -49,4 +49,15 @@ class RemoveTransliterator extends Transliterator {
|
||||
index.contextLimit -= len;
|
||||
index.limit -= len;
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see com.ibm.icu.text.Transliterator#addSourceTargetSet(boolean, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
|
||||
*/
|
||||
@Override
|
||||
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
|
||||
// intersect myFilter with the input filter
|
||||
UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
|
||||
sourceSet.addAll(myFilter);
|
||||
// do nothing with the target
|
||||
}
|
||||
}
|
||||
|
@ -448,24 +448,32 @@ public class RuleBasedTransliterator extends Transliterator {
|
||||
return data.ruleSet.toRules(escapeUnprintable);
|
||||
}
|
||||
|
||||
// /**
|
||||
// * Return the set of all characters that may be modified by this
|
||||
// * Transliterator, ignoring the effect of our filter.
|
||||
// * @internal
|
||||
// * @deprecated This API is ICU internal only.
|
||||
// */
|
||||
// protected UnicodeSet handleGetSourceSet() {
|
||||
// return data.ruleSet.getSourceTargetSet(false, unicodeFilter);
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * Returns the set of all characters that may be generated as
|
||||
// * replacement text by this transliterator.
|
||||
// * @internal
|
||||
// * @deprecated This API is ICU internal only.
|
||||
// */
|
||||
// public UnicodeSet getTargetSet() {
|
||||
// return data.ruleSet.getSourceTargetSet(true, unicodeFilter);
|
||||
// }
|
||||
|
||||
/**
|
||||
* Return the set of all characters that may be modified by this
|
||||
* Transliterator, ignoring the effect of our filter.
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
protected UnicodeSet handleGetSourceSet() {
|
||||
return data.ruleSet.getSourceTargetSet(false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the set of all characters that may be generated as
|
||||
* replacement text by this transliterator.
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
public UnicodeSet getTargetSet() {
|
||||
return data.ruleSet.getSourceTargetSet(true);
|
||||
@Override
|
||||
public void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
|
||||
data.ruleSet.addSourceTargetSet(filter, sourceSet, targetSet);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -0,0 +1,133 @@
/*
 *******************************************************************************
 * Copyright (C) 2010, Google, International Business Machines Corporation and *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.text;

import java.util.HashSet;
import java.util.Set;

import com.ibm.icu.lang.CharSequences;
import com.ibm.icu.text.Normalizer2.Mode;

/**
 * Simple internal utility class for helping with getSource/TargetSet
 */
class SourceTargetUtility {
    final Transform<String, String> transform;
    final UnicodeSet sourceCache;
    final Set<String> sourceStrings;
    static final UnicodeSet NON_STARTERS = new UnicodeSet("[:^ccc=0:]").freeze();
    static Normalizer2 NFC = Normalizer2.getInstance(null, "NFC", Mode.COMPOSE);
    //static final UnicodeSet TRAILING_COMBINING = new UnicodeSet();

    public SourceTargetUtility(Transform<String, String> transform) {
        this(transform, null);
    }

    public SourceTargetUtility(Transform<String, String> transform, Normalizer2 normalizer) {
        this.transform = transform;
        if (normalizer != null) {
//            synchronized (SourceTargetUtility.class) {
//                if (NFC == null) {
//                    NFC = Normalizer2.getInstance(null, "NFC", Mode.COMPOSE);
//                    for (int i = 0; i <= 0x10FFFF; ++i) {
//                        String d = NFC.getDecomposition(i);
//                        if (d == null) {
//                            continue;
//                        }
//                        String s = NFC.normalize(d);
//                        if (!CharSequences.equals(i, s)) {
//                            continue;
//                        }
//                        // composes
//                        boolean first = false;
//                        for (int trailing : CharSequences.codePoints(d)) {
//                            if (first) {
//                                first = false;
//                            } else {
//                                TRAILING_COMBINING.add(trailing);
//                            }
//                        }
//                    }
//                }
//            }
            sourceCache = new UnicodeSet("[:^ccc=0:]");
        } else {
            sourceCache = new UnicodeSet();
        }
        sourceStrings = new HashSet<String>();
        for (int i = 0; i <= 0x10FFFF; ++i) {
            String s = transform.transform(UTF16.valueOf(i));
            boolean added = false;
            if (!CharSequences.equals(i, s)) {
                sourceCache.add(i);
                added = true;
            }
            if (normalizer == null) {
                continue;
            }
            String d = NFC.getDecomposition(i);
            if (d == null) {
                continue;
            }
            s = transform.transform(d);
            if (!d.equals(s)) {
                sourceStrings.add(d);
            }
            if (added) {
                continue;
            }
            if (!normalizer.isInert(i)) {
                sourceCache.add(i);
                continue;
            }
            // see if any of the non-starters change s; if so, add i
//            for (String ns : TRAILING_COMBINING) {
//                String s2 = transform.transform(s + ns);
//                if (!s2.startsWith(s)) {
//                    sourceCache.add(i);
//                    break;
//                }
//            }

//            int endOfFirst = CharSequences.onCharacterBoundary(d, 1) ? 1 : 2;
//            if (endOfFirst >= d.length()) {
//                continue;
//            }
//            // now add all initial substrings
//            for (int j = 1; j < d.length(); ++j) {
//                if (!CharSequences.onCharacterBoundary(d, j)) {
//                    continue;
//                }
//                String dd = d.substring(0,j);
//                s = transform.transform(dd);
//                if (!dd.equals(s)) {
//                    sourceStrings.add(dd);
//                }
//            }
        }
        sourceCache.freeze();
    }

    public void addSourceTargetSet(Transliterator transliterator, UnicodeSet inputFilter, UnicodeSet sourceSet,
            UnicodeSet targetSet) {
        UnicodeSet myFilter = transliterator.getFilterAsUnicodeSet(inputFilter);
        UnicodeSet affectedCharacters = new UnicodeSet(sourceCache).retainAll(myFilter);
        sourceSet.addAll(affectedCharacters);
        for (String s : affectedCharacters) {
            targetSet.addAll(transform.transform(s));
        }
        for (String s : sourceStrings) {
            if (myFilter.containsAll(s)) {
                String t = transform.transform(s);
                if (!s.equals(t)) {
                    targetSet.addAll(t);
                    sourceSet.addAll(s);
                }
            }
        }
    }
}
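A minimal sketch of how a Transliterator subclass is expected to wire this utility, mirroring the lazy-initialization pattern the case transliterators use elsewhere in this commit (the uppercasing transform here is only an example):

    private SourceTargetUtility sourceTargetUtility = null;

    @Override
    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
        synchronized (this) {
            if (sourceTargetUtility == null) {
                sourceTargetUtility = new SourceTargetUtility(new Transform<String, String>() {
                    public String transform(String source) {
                        return UCharacter.toUpperCase(source); // example transform only
                    }
                });
            }
        }
        sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
    }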
@ -6,6 +6,7 @@
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import com.ibm.icu.impl.UCaseProps;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
/**
|
||||
@ -147,4 +148,24 @@ class TitlecaseTransliterator extends Transliterator {
|
||||
}
|
||||
offsets.start = offsets.limit;
|
||||
}
|
||||
|
||||
// NOTE: normally this would be static, but because the results vary by locale....
|
||||
SourceTargetUtility sourceTargetUtility = null;
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
|
||||
*/
|
||||
@Override
|
||||
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
|
||||
synchronized (this) {
|
||||
if (sourceTargetUtility == null) {
|
||||
sourceTargetUtility = new SourceTargetUtility(new Transform<String,String>() {
|
||||
public String transform(String source) {
|
||||
return UCharacter.toTitleCase(locale, source, null);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2007, International Business Machines Corporation and *
|
||||
* Copyright (C) 1996-2010, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -548,8 +548,11 @@ class TransliterationRule {
     * Union the set of all characters that may be modified by this rule
     * into the given set.
     */
    void addSourceSetTo(UnicodeSet toUnionTo) {
    void addSourceSetTo(UnicodeSet toUnionTo, UnicodeSet filter) {
        int limit = anteContextLength + keyLength;
        if (filter != null && !matches(filter)) {
            return;
        }
        for (int i=anteContextLength; i<limit; ) {
            int ch = UTF16.charAt(pattern, i);
            i += UTF16.getCharCount(ch);
@ -562,11 +565,55 @@ class TransliterationRule {
        }
    }

    /**
     * Sees whether the source of this rule can match the filter. There is a known issue with filters containing multiple-character strings.
     * @param filter the filter to check against; must not be null (checked by the caller)
     * @return true if every key position of the rule can match at least one character in the filter
     */
    // Problem: the rule is [{ab}]c > x
    // The filter is [a{bc}].
    // If the input is abc, then the rule will work.
    // However, the following code applying the filter won't catch that case.
    private boolean matches(UnicodeSet filter) {
        int limit = anteContextLength + keyLength;
        // We need to walk through the pattern.
        // Only if, at ALL of the positions, some of the characters are matched by the filter does the rule pass.
        for (int i=anteContextLength; i<limit; ) {
            int ch = UTF16.charAt(pattern, i);
            i += UTF16.getCharCount(ch);
            UnicodeMatcher matcher = data.lookupMatcher(ch);
            if (matcher == null) {
                if (!filter.contains(ch)) {
                    return false;
                }
            } else {
                try {
                    if (!filter.containsSome((UnicodeSet) matcher)) {
                        return false;
                    }
                } catch (ClassCastException e) {
                    UnicodeSet temp = new UnicodeSet();
                    matcher.addMatchSetTo(temp);
                    if (!filter.containsSome(temp)) {
                        return false;
                    }
                }
            }
        }
        return true;
    }
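A hedged illustration of the corner case described in the comment above (the rule and filter are the ones given there; the effect shows up in the computed source/target sets, not in transliteration itself):

    // Per the comment, input "abc" can still fire the rule, but checking each pattern
    // position against the filter independently does not detect that, so the rule may
    // be treated as filtered out when the sets are gathered.
    Transliterator t = Transliterator.createFromRules("tmp",
            "::[a{bc}]; [{ab}]c > x;", Transliterator.FORWARD);
    UnicodeSet targets = t.getTargetSet(); // may omit 'x' because of this limitation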

    /**
     * Union the set of all characters that may be emitted by this rule
     * into the given set.
     */
    void addTargetSetTo(UnicodeSet toUnionTo) {
    void addTargetSetTo(UnicodeSet toUnionTo, UnicodeSet filter) {
        if (filter != null && !matches(filter)) {
            return;
        }
        output.addReplacementSetTo(toUnionTo);
    }
}
@ -238,21 +238,15 @@ class TransliterationRuleSet {
        return ruleSource.toString();
    }

    /**
     * Return the set of all characters that may be modified (getTarget=false)
     * or emitted (getTarget=true) by this set.
     */
    UnicodeSet getSourceTargetSet(boolean getTarget) {
        UnicodeSet set = new UnicodeSet();
    // TODO Handle the case where we have :: [a] ; a > |b ; b > c ;
    // TODO Merge into r.addSourceTargetSet, to avoid duplicate testing
    void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
        int count = ruleVector.size();
        for (int i=0; i<count; ++i) {
            TransliterationRule r = ruleVector.get(i);
            if (getTarget) {
                r.addTargetSetTo(set);
            } else {
                r.addSourceSetTo(set);
            }
            r.addTargetSetTo(targetSet, filter);
            r.addSourceSetTo(sourceSet, filter);
        }
        return set;
    }

}
@ -26,226 +26,200 @@ import com.ibm.icu.util.ULocale;
|
||||
import com.ibm.icu.util.UResourceBundle;
|
||||
|
||||
/**
|
||||
* <code>Transliterator</code> is an abstract class that
|
||||
* transliterates text from one format to another. The most common
|
||||
* kind of transliterator is a script, or alphabet, transliterator.
|
||||
* For example, a Russian to Latin transliterator changes Russian text
|
||||
* written in Cyrillic characters to phonetically equivalent Latin
|
||||
* characters. It does not <em>translate</em> Russian to English!
|
||||
* Transliteration, unlike translation, operates on characters, without
|
||||
* reference to the meanings of words and sentences.
|
||||
*
|
||||
* <p>Although script conversion is its most common use, a
|
||||
* transliterator can actually perform a more general class of tasks.
|
||||
* In fact, <code>Transliterator</code> defines a very general API
|
||||
* which specifies only that a segment of the input text is replaced
|
||||
* by new text. The particulars of this conversion are determined
|
||||
* entirely by subclasses of <code>Transliterator</code>.
|
||||
*
|
||||
* <p><b>Transliterators are stateless</b>
|
||||
*
|
||||
* <p><code>Transliterator</code> objects are <em>stateless</em>; they
|
||||
* retain no information between calls to
|
||||
* <code>transliterate()</code>. As a result, threads may share
|
||||
* transliterators without synchronizing them. This might seem to
|
||||
* limit the complexity of the transliteration operation. In
|
||||
* practice, subclasses perform complex transliterations by delaying
|
||||
* the replacement of text until it is known that no other
|
||||
* replacements are possible. In other words, although the
|
||||
* <code>Transliterator</code> objects are stateless, the source text
|
||||
* itself embodies all the needed information, and delayed operation
|
||||
* allows arbitrary complexity.
|
||||
*
|
||||
* <p><b>Batch transliteration</b>
|
||||
*
|
||||
* <p>The simplest way to perform transliteration is all at once, on a
|
||||
* string of existing text. This is referred to as <em>batch</em>
|
||||
* transliteration. For example, given a string <code>input</code>
|
||||
* and a transliterator <code>t</code>, the call
|
||||
*
|
||||
* <code>Transliterator</code> is an abstract class that transliterates text from one format to another. The most common
|
||||
* kind of transliterator is a script, or alphabet, transliterator. For example, a Russian to Latin transliterator
|
||||
* changes Russian text written in Cyrillic characters to phonetically equivalent Latin characters. It does not
|
||||
* <em>translate</em> Russian to English! Transliteration, unlike translation, operates on characters, without reference
|
||||
* to the meanings of words and sentences.
|
||||
*
|
||||
* <p>
|
||||
* Although script conversion is its most common use, a transliterator can actually perform a more general class of
|
||||
* tasks. In fact, <code>Transliterator</code> defines a very general API which specifies only that a segment of the
|
||||
* input text is replaced by new text. The particulars of this conversion are determined entirely by subclasses of
|
||||
* <code>Transliterator</code>.
|
||||
*
|
||||
* <p>
|
||||
* <b>Transliterators are stateless</b>
|
||||
*
|
||||
* <p>
|
||||
* <code>Transliterator</code> objects are <em>stateless</em>; they retain no information between calls to
|
||||
* <code>transliterate()</code>. As a result, threads may share transliterators without synchronizing them. This might
|
||||
* seem to limit the complexity of the transliteration operation. In practice, subclasses perform complex
|
||||
* transliterations by delaying the replacement of text until it is known that no other replacements are possible. In
|
||||
* other words, although the <code>Transliterator</code> objects are stateless, the source text itself embodies all the
|
||||
* needed information, and delayed operation allows arbitrary complexity.
|
||||
*
|
||||
* <p>
|
||||
* <b>Batch transliteration</b>
|
||||
*
|
||||
* <p>
|
||||
* The simplest way to perform transliteration is all at once, on a string of existing text. This is referred to as
|
||||
* <em>batch</em> transliteration. For example, given a string <code>input</code> and a transliterator <code>t</code>,
|
||||
* the call
|
||||
*
|
||||
* <blockquote><code>String result = t.transliterate(input);
|
||||
* </code></blockquote>
|
||||
*
|
||||
* will transliterate it and return the result. Other methods allow
|
||||
* the client to specify a substring to be transliterated and to use
|
||||
* {@link Replaceable} objects instead of strings, in order to
|
||||
* preserve out-of-band information (such as text styles).
|
||||
*
|
||||
* <p><b>Keyboard transliteration</b>
|
||||
*
|
||||
* <p>Somewhat more involved is <em>keyboard</em>, or incremental
|
||||
* transliteration. This is the transliteration of text that is
|
||||
* arriving from some source (typically the user's keyboard) one
|
||||
* character at a time, or in some other piecemeal fashion.
|
||||
*
|
||||
* <p>In keyboard transliteration, a <code>Replaceable</code> buffer
|
||||
* stores the text. As text is inserted, as much as possible is
|
||||
* transliterated on the fly. This means a GUI that displays the
|
||||
* contents of the buffer may show text being modified as each new
|
||||
* character arrives.
|
||||
*
|
||||
* <p>Consider the simple <code>RuleBasedTransliterator</code>:
|
||||
*
|
||||
*
|
||||
* will transliterate it and return the result. Other methods allow the client to specify a substring to be
|
||||
* transliterated and to use {@link Replaceable} objects instead of strings, in order to preserve out-of-band
|
||||
* information (such as text styles).
|
||||
*
|
||||
* <p>
|
||||
* <b>Keyboard transliteration</b>
|
||||
*
|
||||
* <p>
|
||||
* Somewhat more involved is <em>keyboard</em>, or incremental transliteration. This is the transliteration of text that
|
||||
* is arriving from some source (typically the user's keyboard) one character at a time, or in some other piecemeal
|
||||
* fashion.
|
||||
*
|
||||
* <p>
|
||||
* In keyboard transliteration, a <code>Replaceable</code> buffer stores the text. As text is inserted, as much as
|
||||
* possible is transliterated on the fly. This means a GUI that displays the contents of the buffer may show text being
|
||||
* modified as each new character arrives.
|
||||
*
|
||||
* <p>
|
||||
* Consider the simple <code>RuleBasedTransliterator</code>:
|
||||
*
|
||||
* <blockquote><code>
|
||||
* th>{theta}<br>
|
||||
* t>{tau}
|
||||
* </code></blockquote>
|
||||
*
|
||||
* When the user types 't', nothing will happen, since the
|
||||
* transliterator is waiting to see if the next character is 'h'. To
|
||||
* remedy this, we introduce the notion of a cursor, marked by a '|'
|
||||
* in the output string:
|
||||
*
|
||||
*
|
||||
* When the user types 't', nothing will happen, since the transliterator is waiting to see if the next character is
|
||||
* 'h'. To remedy this, we introduce the notion of a cursor, marked by a '|' in the output string:
|
||||
*
|
||||
* <blockquote><code>
|
||||
* t>|{tau}<br>
|
||||
* {tau}h>{theta}
|
||||
* </code></blockquote>
|
||||
*
|
||||
* Now when the user types 't', tau appears, and if the next character
|
||||
* is 'h', the tau changes to a theta. This is accomplished by
|
||||
* maintaining a cursor position (independent of the insertion point,
|
||||
* and invisible in the GUI) across calls to
|
||||
* <code>transliterate()</code>. Typically, the cursor will
|
||||
* be coincident with the insertion point, but in a case like the one
|
||||
* above, it will precede the insertion point.
|
||||
*
|
||||
* <p>Keyboard transliteration methods maintain a set of three indices
|
||||
* that are updated with each call to
|
||||
* <code>transliterate()</code>, including the cursor, start,
|
||||
* and limit. These indices are changed by the method, and they are
|
||||
* passed in and out via a Position object. The <code>start</code> index
|
||||
* marks the beginning of the substring that the transliterator will
|
||||
* look at. It is advanced as text becomes committed (but it is not
|
||||
* the committed index; that's the <code>cursor</code>). The
|
||||
* <code>cursor</code> index, described above, marks the point at
|
||||
* which the transliterator last stopped, either because it reached
|
||||
* the end, or because it required more characters to disambiguate
|
||||
* between possible inputs. The <code>cursor</code> can also be
|
||||
* explicitly set by rules in a <code>RuleBasedTransliterator</code>.
|
||||
* Any characters before the <code>cursor</code> index are frozen;
|
||||
* future keyboard transliteration calls within this input sequence
|
||||
* will not change them. New text is inserted at the
|
||||
* <code>limit</code> index, which marks the end of the substring that
|
||||
* the transliterator looks at.
|
||||
*
|
||||
* <p>Because keyboard transliteration assumes that more characters
|
||||
* are to arrive, it is conservative in its operation. It only
|
||||
* transliterates when it can do so unambiguously. Otherwise it waits
|
||||
* for more characters to arrive. When the client code knows that no
|
||||
* more characters are forthcoming, perhaps because the user has
|
||||
* performed some input termination operation, then it should call
|
||||
* <code>finishTransliteration()</code> to complete any
|
||||
* pending transliterations.
|
||||
*
|
||||
* <p><b>Inverses</b>
|
||||
*
|
||||
* <p>Pairs of transliterators may be inverses of one another. For
|
||||
* example, if transliterator <b>A</b> transliterates characters by
|
||||
* incrementing their Unicode value (so "abc" -> "def"), and
|
||||
* transliterator <b>B</b> decrements character values, then <b>A</b>
|
||||
* is an inverse of <b>B</b> and vice versa. If we compose <b>A</b>
|
||||
* with <b>B</b> in a compound transliterator, the result is the
|
||||
* identity transliterator, that is, a transliterator that does not
|
||||
* change its input text.
|
||||
*
|
||||
* The <code>Transliterator</code> method <code>getInverse()</code>
|
||||
* returns a transliterator's inverse, if one exists, or
|
||||
* <code>null</code> otherwise. However, the result of
|
||||
* <code>getInverse()</code> usually will <em>not</em> be a true
|
||||
* mathematical inverse. This is because true inverse transliterators
|
||||
* are difficult to formulate. For example, consider two
|
||||
* transliterators: <b>AB</b>, which transliterates the character 'A'
|
||||
* to 'B', and <b>BA</b>, which transliterates 'B' to 'A'. It might
|
||||
* seem that these are exact inverses, since
|
||||
*
|
||||
*
|
||||
* Now when the user types 't', tau appears, and if the next character is 'h', the tau changes to a theta. This is
|
||||
* accomplished by maintaining a cursor position (independent of the insertion point, and invisible in the GUI) across
|
||||
* calls to <code>transliterate()</code>. Typically, the cursor will be coincident with the insertion point, but in a
|
||||
* case like the one above, it will precede the insertion point.
|
||||
*
|
||||
* <p>
|
||||
* Keyboard transliteration methods maintain a set of three indices that are updated with each call to
|
||||
* <code>transliterate()</code>, including the cursor, start, and limit. These indices are changed by the method, and
|
||||
* they are passed in and out via a Position object. The <code>start</code> index marks the beginning of the substring
|
||||
* that the transliterator will look at. It is advanced as text becomes committed (but it is not the committed index;
|
||||
* that's the <code>cursor</code>). The <code>cursor</code> index, described above, marks the point at which the
|
||||
* transliterator last stopped, either because it reached the end, or because it required more characters to
|
||||
* disambiguate between possible inputs. The <code>cursor</code> can also be explicitly set by rules in a
|
||||
* <code>RuleBasedTransliterator</code>. Any characters before the <code>cursor</code> index are frozen; future keyboard
|
||||
* transliteration calls within this input sequence will not change them. New text is inserted at the <code>limit</code>
|
||||
* index, which marks the end of the substring that the transliterator looks at.
|
||||
*
|
||||
* <p>
|
||||
* Because keyboard transliteration assumes that more characters are to arrive, it is conservative in its operation. It
|
||||
* only transliterates when it can do so unambiguously. Otherwise it waits for more characters to arrive. When the
|
||||
* client code knows that no more characters are forthcoming, perhaps because the user has performed some input
|
||||
* termination operation, then it should call <code>finishTransliteration()</code> to complete any pending
|
||||
* transliterations.
|
||||
*
|
||||
* <p>
|
||||
* <b>Inverses</b>
|
||||
*
|
||||
* <p>
|
||||
* Pairs of transliterators may be inverses of one another. For example, if transliterator <b>A</b> transliterates
|
||||
* characters by incrementing their Unicode value (so "abc" -> "def"), and transliterator <b>B</b> decrements character
|
||||
* values, then <b>A</b> is an inverse of <b>B</b> and vice versa. If we compose <b>A</b> with <b>B</b> in a compound
|
||||
* transliterator, the result is the identity transliterator, that is, a transliterator that does not change its input
|
||||
* text.
|
||||
*
|
||||
* The <code>Transliterator</code> method <code>getInverse()</code> returns a transliterator's inverse, if one exists,
|
||||
* or <code>null</code> otherwise. However, the result of <code>getInverse()</code> usually will <em>not</em> be a true
|
||||
* mathematical inverse. This is because true inverse transliterators are difficult to formulate. For example, consider
|
||||
* two transliterators: <b>AB</b>, which transliterates the character 'A' to 'B', and <b>BA</b>, which transliterates
|
||||
* 'B' to 'A'. It might seem that these are exact inverses, since
|
||||
*
|
||||
* <blockquote>"A" x <b>AB</b> -> "B"<br>
|
||||
* "B" x <b>BA</b> -> "A"</blockquote>
|
||||
*
|
||||
* where 'x' represents transliteration. However,
|
||||
*
|
||||
*
|
||||
* where 'x' represents transliteration. However,
|
||||
*
|
||||
* <blockquote>"ABCD" x <b>AB</b> -> "BBCD"<br>
|
||||
* "BBCD" x <b>BA</b> -> "AACD"</blockquote>
|
||||
*
|
||||
* so <b>AB</b> composed with <b>BA</b> is not the
|
||||
* identity. Nonetheless, <b>BA</b> may be usefully considered to be
|
||||
* <b>AB</b>'s inverse, and it is on this basis that
|
||||
* <b>AB</b><code>.getInverse()</code> could legitimately return
|
||||
*
|
||||
* so <b>AB</b> composed with <b>BA</b> is not the identity. Nonetheless, <b>BA</b> may be usefully considered to be
|
||||
* <b>AB</b>'s inverse, and it is on this basis that <b>AB</b><code>.getInverse()</code> could legitimately return
|
||||
* <b>BA</b>.
|
||||
*
|
||||
* <p><b>IDs and display names</b>
|
||||
*
|
||||
* <p>A transliterator is designated by a short identifier string or
|
||||
* <em>ID</em>. IDs follow the format <em>source-destination</em>,
|
||||
* where <em>source</em> describes the entity being replaced, and
|
||||
* <em>destination</em> describes the entity replacing
|
||||
* <em>source</em>. The entities may be the names of scripts,
|
||||
* particular sequences of characters, or whatever else it is that the
|
||||
* transliterator converts to or from. For example, a transliterator
|
||||
* from Russian to Latin might be named "Russian-Latin". A
|
||||
* transliterator from keyboard escape sequences to Latin-1 characters
|
||||
* might be named "KeyboardEscape-Latin1". By convention, system
|
||||
* entity names are in English, with the initial letters of words
|
||||
* capitalized; user entity names may follow any format so long as
|
||||
* they do not contain dashes.
|
||||
*
|
||||
* <p>In addition to programmatic IDs, transliterator objects have
|
||||
* display names for presentation in user interfaces, returned by
|
||||
* {@link #getDisplayName}.
|
||||
*
|
||||
* <p><b>Factory methods and registration</b>
|
||||
*
|
||||
* <p>In general, client code should use the factory method
|
||||
* <code>getInstance()</code> to obtain an instance of a
|
||||
* transliterator given its ID. Valid IDs may be enumerated using
|
||||
* <code>getAvailableIDs()</code>. Since transliterators are
|
||||
* stateless, multiple calls to <code>getInstance()</code> with the
|
||||
* same ID will return the same object.
|
||||
*
|
||||
* <p>In addition to the system transliterators registered at startup,
|
||||
* user transliterators may be registered by calling
|
||||
* <code>registerInstance()</code> at run time. To register a
|
||||
* transliterator subclass without instantiating it (until it is
|
||||
* needed), users may call <code>registerClass()</code>.
|
||||
*
|
||||
* <p><b>Composed transliterators</b>
|
||||
*
|
||||
* <p>In addition to built-in system transliterators like
|
||||
* "Latin-Greek", there are also built-in <em>composed</em>
|
||||
* transliterators. These are implemented by composing two or more
|
||||
* component transliterators. For example, if we have scripts "A",
|
||||
* "B", "C", and "D", and we want to transliterate between all pairs
|
||||
* of them, then we need to write 12 transliterators: "A-B", "A-C",
|
||||
* "A-D", "B-A",..., "D-A", "D-B", "D-C". If it is possible to
|
||||
* convert all scripts to an intermediate script "M", then instead of
|
||||
* writing 12 rule sets, we only need to write 8: "A~M", "B~M", "C~M",
|
||||
* "D~M", "M~A", "M~B", "M~C", "M~D". (This might not seem like a big
|
||||
* win, but it's really 2<em>n</em> vs. <em>n</em><sup>2</sup> -
|
||||
* <em>n</em>, so as <em>n</em> gets larger the gain becomes
|
||||
* significant. With 9 scripts, it's 18 vs. 72 rule sets, a big
|
||||
* difference.) Note the use of "~" rather than "-" for the script
|
||||
* separator here; this indicates that the given transliterator is
|
||||
* intended to be composed with others, rather than be used as is.
|
||||
*
|
||||
* <p>Composed transliterators can be instantiated as usual. For
|
||||
* example, the system transliterator "Devanagari-Gujarati" is a
|
||||
* composed transliterator built internally as
|
||||
* "Devanagari~InterIndic;InterIndic~Gujarati". When this
|
||||
* transliterator is instantiated, it appears externally to be a
|
||||
* standard transliterator (e.g., getID() returns
|
||||
*
|
||||
* <p>
* <b>Filtering</b>
* <p>Each transliterator has a filter, which restricts changes to those characters selected by the filter. The
* filter affects just the characters that are changed -- the characters outside of the filter are still part of the
* context for the rules. For example, in the following rules, even though 'x' is filtered out and does not convert to 'y', it still affects the conversion of 'a'.
*
* <pre>
* String rules = "x > y; x{a} > b; ";
* Transliterator tempTrans = Transliterator.createFromRules("temp", rules, Transliterator.FORWARD);
* tempTrans.setFilter(new UnicodeSet("[a]"));
* String tempResult = tempTrans.transform("xa");
* // results in "xb"
* </pre>
|
||||
* <p>
|
||||
* <b>IDs and display names</b>
|
||||
*
|
||||
* <p>
|
||||
* A transliterator is designated by a short identifier string or <em>ID</em>. IDs follow the format
|
||||
* <em>source-destination</em>, where <em>source</em> describes the entity being replaced, and <em>destination</em>
|
||||
* describes the entity replacing <em>source</em>. The entities may be the names of scripts, particular sequences of
|
||||
* characters, or whatever else it is that the transliterator converts to or from. For example, a transliterator from
|
||||
* Russian to Latin might be named "Russian-Latin". A transliterator from keyboard escape sequences to Latin-1
|
||||
* characters might be named "KeyboardEscape-Latin1". By convention, system entity names are in English, with the
|
||||
* initial letters of words capitalized; user entity names may follow any format so long as they do not contain dashes.
|
||||
*
|
||||
* <p>
|
||||
* In addition to programmatic IDs, transliterator objects have display names for presentation in user interfaces,
|
||||
* returned by {@link #getDisplayName}.
|
||||
*
|
||||
* <p>
|
||||
* <b>Factory methods and registration</b>
|
||||
*
|
||||
* <p>
|
||||
* In general, client code should use the factory method <code>getInstance()</code> to obtain an instance of a
|
||||
* transliterator given its ID. Valid IDs may be enumerated using <code>getAvailableIDs()</code>. Since transliterators
|
||||
* are stateless, multiple calls to <code>getInstance()</code> with the same ID will return the same object.
|
||||
*
|
||||
* <p>
|
||||
* In addition to the system transliterators registered at startup, user transliterators may be registered by calling
|
||||
* <code>registerInstance()</code> at run time. To register a transliterator subclass without instantiating it (until it
|
||||
* is needed), users may call <code>registerClass()</code>.
|
||||
*
|
||||
* <p>
|
||||
* <b>Composed transliterators</b>
|
||||
*
|
||||
* <p>
|
||||
* In addition to built-in system transliterators like "Latin-Greek", there are also built-in <em>composed</em>
|
||||
* transliterators. These are implemented by composing two or more component transliterators. For example, if we have
|
||||
* scripts "A", "B", "C", and "D", and we want to transliterate between all pairs of them, then we need to write 12
|
||||
* transliterators: "A-B", "A-C", "A-D", "B-A",..., "D-A", "D-B", "D-C". If it is possible to convert all scripts to an
|
||||
* intermediate script "M", then instead of writing 12 rule sets, we only need to write 8: "A~M", "B~M", "C~M", "D~M",
|
||||
* "M~A", "M~B", "M~C", "M~D". (This might not seem like a big win, but it's really 2<em>n</em> vs. <em>n</em>
|
||||
* <sup>2</sup> - <em>n</em>, so as <em>n</em> gets larger the gain becomes significant. With 9 scripts, it's 18 vs. 72
|
||||
* rule sets, a big difference.) Note the use of "~" rather than "-" for the script separator here; this indicates that
|
||||
* the given transliterator is intended to be composed with others, rather than be used as is.
|
||||
*
|
||||
* <p>
|
||||
* Composed transliterators can be instantiated as usual. For example, the system transliterator "Devanagari-Gujarati"
|
||||
* is a composed transliterator built internally as "Devanagari~InterIndic;InterIndic~Gujarati". When this
|
||||
* transliterator is instantiated, it appears externally to be a standard transliterator (e.g., getID() returns
|
||||
* "Devanagari-Gujarati").
|
||||
*
|
||||
* <p><b>Subclassing</b>
|
||||
*
|
||||
* <p>Subclasses must implement the abstract method
|
||||
* <code>handleTransliterate()</code>. <p>Subclasses should override
|
||||
* the <code>transliterate()</code> method taking a
|
||||
* <code>Replaceable</code> and the <code>transliterate()</code>
|
||||
* method taking a <code>String</code> and <code>StringBuffer</code>
|
||||
* if the performance of these methods can be improved over the
|
||||
* performance obtained by the default implementations in this class.
|
||||
*
|
||||
* <p>Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
*
|
||||
* <p>
|
||||
* <b>Subclassing</b>
|
||||
*
|
||||
* <p>
|
||||
* Subclasses must implement the abstract method <code>handleTransliterate()</code>.
|
||||
* <p>
|
||||
* Subclasses should override the <code>transliterate()</code> method taking a <code>Replaceable</code> and the
|
||||
* <code>transliterate()</code> method taking a <code>String</code> and <code>StringBuffer</code> if the performance of
|
||||
* these methods can be improved over the performance obtained by the default implementations in this class.
|
||||
*
|
||||
* <p>
|
||||
* Copyright © IBM Corporation 1999. All rights reserved.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
@ -1418,7 +1392,7 @@ public abstract class Transliterator implements StringTransform {
            t = new NullTransliterator();
        }
        else if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 1) {
            t = new RuleBasedTransliterator(ID, parser.dataVector.get(0), null);
            t = new RuleBasedTransliterator(ID, parser.dataVector.get(0), parser.compoundFilter);
        }
        else if (parser.idBlockVector.size() == 1 && parser.dataVector.size() == 0) {
            // idBlock, no data -- this is an alias. The ID has
@ -1536,6 +1510,8 @@ public abstract class Transliterator implements StringTransform {
        return result;
    }

    static final UnicodeSet ALL_CODEPOINTS = new UnicodeSet(0,0x10FFFF).freeze();

    /**
     * Returns the set of all characters that may be modified in the
     * input text by this Transliterator. This incorporates this
@ -1550,20 +1526,9 @@ public abstract class Transliterator implements StringTransform {
     * @stable ICU 2.2
     */
    public final UnicodeSet getSourceSet() {
        UnicodeSet set = handleGetSourceSet();
        if (filter != null) {
            UnicodeSet filterSet;
            // Most, but not all filters will be UnicodeSets. Optimize for
            // the high-runner case.
            try {
                filterSet = (UnicodeSet) filter;
            } catch (ClassCastException e) {
                filterSet = new UnicodeSet();
                filter.addMatchSetTo(filterSet);
            }
            set.retainAll(filterSet);
        }
        return set;
        UnicodeSet result = new UnicodeSet();
        addSourceTargetSet(getFilterAsUnicodeSet(ALL_CODEPOINTS), result, new UnicodeSet());
        return result;
    }

    /**
@ -1595,7 +1560,78 @@ public abstract class Transliterator implements StringTransform {
     * @stable ICU 2.2
     */
    public UnicodeSet getTargetSet() {
        return new UnicodeSet();
        UnicodeSet result = new UnicodeSet();
        addSourceTargetSet(getFilterAsUnicodeSet(ALL_CODEPOINTS), new UnicodeSet(), result);
        return result;
    }

    /**
     * Returns the set of all characters that may be generated as
     * replacement text by this transliterator, filtered by BOTH the input filter and the current getFilter().
     * <p>SHOULD BE OVERRIDDEN BY SUBCLASSES.
     * It is probably an error for any transliterator to NOT override this, but we can't force them to
     * for backwards compatibility.
     * <p>Other methods vector through this.
     * <p>When gathering the information on source and target, the compound transliterator makes things complicated.
     * For example, suppose we have:
     * <pre>
     * Global FILTER = [ax]
     * a > b;
     * :: NULL;
     * b > c;
     * x > d;
     * </pre>
     * While the filter just allows a and x, b is an intermediate result, which could produce c. So the source and target sets
     * cannot be gathered independently. What we have to do is filter the sources for the first transliterator according to
     * the global filter, intersected with that transliterator's own filter. Based on that we get the target.
     * The next transliterator then gets (global + last target) as its global filter, and so on.
     * <p>There is another complication:
     * <pre>
     * Global FILTER = [ax]
     * a > |b;
     * b > c;
     * </pre>
     * Even though b would be filtered from the input, whenever we have a backup, it could be part of the input. So ideally we will
     * change the global filter as we go.
     * @param targetSet TODO
     * @see #getTargetSet
     * @internal
     */
    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
        UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
        UnicodeSet temp = new UnicodeSet(handleGetSourceSet()).retainAll(myFilter);
        // use old method, if we don't have anything better
        sourceSet.addAll(temp);
        // clumsy guess with target
        for (String s : temp) {
            String t = transliterate(s);
            if (!s.equals(t)) {
                targetSet.addAll(t);
            }
        }
    }
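A small usage sketch of the scenario described in the Javadoc above (the rules are copied from it; the exact sets returned depend on the subclass overrides added in this commit):

    Transliterator t = Transliterator.createFromRules("test",
            "::[ax]; a > b; :: NULL; b > c; x > d;", Transliterator.FORWARD);
    // The global filter only passes 'a' and 'x', but the intermediate 'b' produced by
    // the first pass feeds the second pass, so 'c' belongs in the target set.
    UnicodeSet sources = t.getSourceSet(); // expected to include a and x (and the intermediate b)
    UnicodeSet targets = t.getTargetSet(); // expected to include c and d as well as the intermediate b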

    /**
     * Returns the intersection of this instance's filter with an external filter.
     * The externalFilter must be frozen (it is frozen if not).
     * The result may be frozen, so don't attempt to modify.
     * @internal
     */
    // TODO change to getMergedFilter
    public UnicodeSet getFilterAsUnicodeSet(UnicodeSet externalFilter) {
        if (filter == null) {
            return externalFilter;
        }
        UnicodeSet filterSet = new UnicodeSet(externalFilter);
        // Most, but not all filters will be UnicodeSets. Optimize for
        // the high-runner case.
        UnicodeSet temp;
        try {
            temp = (UnicodeSet) filter;
        } catch (ClassCastException e) {
            filter.addMatchSetTo(temp = new UnicodeSet());
        }
        return filterSet.retainAll(temp).freeze();
    }

    /**
@ -8,6 +8,7 @@
|
||||
**********************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
|
||||
/**
|
||||
@ -248,4 +249,38 @@ class UnescapeTransliterator extends Transliterator {
|
||||
pos.limit = limit;
|
||||
pos.start = start;
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
|
||||
*/
|
||||
@Override
|
||||
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
|
||||
// Each form consists of a prefix, suffix,
|
||||
// * radix, minimum digit count, and maximum digit count. These
|
||||
// * values are stored as a five character header. ...
|
||||
UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
|
||||
UnicodeSet items = new UnicodeSet();
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
for (int i = 0; spec[i] != END;) {
|
||||
// first 5 items are header
|
||||
int end = i + spec[i] + spec[i+1] + 5;
|
||||
int radix = spec[i+2];
|
||||
for (int j = 0; j < radix; ++j) {
|
||||
Utility.appendNumber(buffer, j, radix, 0);
|
||||
}
|
||||
// then add the characters
|
||||
for (int j = i + 5; j < end; ++j) {
|
||||
items.add(spec[j]);
|
||||
}
|
||||
// and go to next block
|
||||
i = end;
|
||||
}
|
||||
items.addAll(buffer.toString());
|
||||
items.retainAll(myFilter);
|
||||
|
||||
if (items.size() > 0) {
|
||||
sourceSet.addAll(items);
|
||||
targetSet.addAll(0,0x10FFFF); // assume we can produce any character
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -70,4 +70,25 @@ class UnicodeNameTransliterator extends Transliterator {
|
||||
offsets.limit = limit;
|
||||
offsets.start = cursor;
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
|
||||
*/
|
||||
@Override
|
||||
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
|
||||
UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
|
||||
if (myFilter.size() > 0) {
|
||||
sourceSet.addAll(myFilter);
|
||||
targetSet.addAll('0', '9')
|
||||
.addAll('A', 'Z')
|
||||
.add('-')
|
||||
.add(' ')
|
||||
.addAll(OPEN_DELIM)
|
||||
.add(CLOSE_DELIM)
|
||||
.addAll('a', 'z') // for controls
|
||||
.add('<').add('>') // for controls
|
||||
.add('(').add(')') // for controls
|
||||
;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -7,6 +7,7 @@
package com.ibm.icu.text;

import com.ibm.icu.impl.UCaseProps;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.util.ULocale;

/**
@ -105,4 +106,24 @@ class UppercaseTransliterator extends Transliterator {
        }
        offsets.start = offsets.limit;
    }

    // NOTE: normally this would be static, but the results vary by locale, so it is per-instance.
    SourceTargetUtility sourceTargetUtility = null;

    /* (non-Javadoc)
     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
     */
    @Override
    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
        synchronized (this) {
            if (sourceTargetUtility == null) {
                sourceTargetUtility = new SourceTargetUtility(new Transform<String,String>() {
                    public String transform(String source) {
                        return UCharacter.toUpperCase(locale, source);
                    }
                });
            }
        }
        sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
    }
}
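The utility above is kept per-instance because case mapping is locale-sensitive: the same input uppercases differently under different locales. A quick sketch of that effect using UCharacter directly (illustrative only; the class name LocaleUpperDemo is made up, Turkish chosen as the classic example):

    import com.ibm.icu.lang.UCharacter;
    import com.ibm.icu.util.ULocale;

    public class LocaleUpperDemo {
        public static void main(String[] args) {
            System.out.println(UCharacter.toUpperCase(ULocale.ROOT, "i"));       // "I"
            System.out.println(UCharacter.toUpperCase(new ULocale("tr"), "i"));  // "İ" (U+0130)
        }
    }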
@ -13,13 +13,18 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map.Entry;

import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.impl.UtilityExtensions;
import com.ibm.icu.lang.CharSequences;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.CanonicalIterator;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.Replaceable;
import com.ibm.icu.text.ReplaceableString;
import com.ibm.icu.text.StringTransform;
@ -28,6 +33,7 @@ import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeFilter;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.text.Normalizer2.Mode;
import com.ibm.icu.util.CaseInsensitiveString;
import com.ibm.icu.util.ULocale;
@ -480,6 +486,16 @@ public class TransliteratorTest extends TestFmwk {
     * Do some basic tests of filtering.
     */
    public void TestFiltering() {

        Transliterator tempTrans = Transliterator.createFromRules("temp", "x > y; x{a} > b; ", Transliterator.FORWARD);
        tempTrans.setFilter(new UnicodeSet("[a]"));
        String tempResult = tempTrans.transform("xa");
        assertEquals("context should not be filtered ", "xb", tempResult);

        tempTrans = Transliterator.createFromRules("temp", "::[a]; x > y; x{a} > b; ", Transliterator.FORWARD);
        tempResult = tempTrans.transform("xa");
        assertEquals("context should not be filtered ", "xb", tempResult);

        Transliterator hex = Transliterator.getInstance("Any-Hex");
        hex.setFilter(new UnicodeFilter() {
            public boolean contains(int c) {
@ -2997,6 +3013,358 @@ public class TransliteratorTest extends TestFmwk {
        }
    }

    public void TestSourceTargetSet2() {

        Normalizer2 nfkd = Normalizer2.getInstance(null, "NFKC", Mode.DECOMPOSE);
        Normalizer2 nfc = Normalizer2.getInstance(null, "NFC", Mode.COMPOSE);
        Normalizer2 nfd = Normalizer2.getInstance(null, "NFC", Mode.DECOMPOSE);

//        UnicodeSet nfkdSource = new UnicodeSet();
//        UnicodeSet nfkdTarget = new UnicodeSet();
//        for (int i = 0; i <= 0x10FFFF; ++i) {
//            if (nfkd.isInert(i)) {
//                continue;
//            }
//            nfkdSource.add(i);
//            String t = nfkd.getDecomposition(i);
//            if (t != null) {
//                nfkdTarget.addAll(t);
//            } else {
//                nfkdTarget.add(i);
//            }
//        }
//        nfkdSource.freeze();
//        nfkdTarget.freeze();
//        logln("NFKD Source: " + nfkdSource.toPattern(false));
//        logln("NFKD Target: " + nfkdTarget.toPattern(false));

        UnicodeMap<UnicodeSet> leadToTrail = new UnicodeMap();
        UnicodeMap<UnicodeSet> leadToSources = new UnicodeMap();
        UnicodeSet nonStarters = new UnicodeSet("[:^ccc=0:]").freeze();
        CanonicalIterator can = new CanonicalIterator("");

        UnicodeSet disorderedMarks = new UnicodeSet();

        for (int i = 0; i <= 0x10FFFF; ++i) {
            String s = nfd.getDecomposition(i);
            if (s == null) {
                continue;
            }

            can.setSource(s);
            for (String t = can.next(); t != null; t = can.next()) {
                disorderedMarks.add(t);
            }

            // if s has two (or more) code points, add the lead/trail information
            int first = s.codePointAt(0);
            int firstCount = Character.charCount(first);
            if (s.length() == firstCount) continue;
            String trailString = s.substring(firstCount);

            // add all the trail characters
            if (!nonStarters.containsSome(trailString)) {
                continue;
            }
            UnicodeSet trailSet = leadToTrail.get(first);
            if (trailSet == null) {
                leadToTrail.put(first, trailSet = new UnicodeSet());
            }
            trailSet.addAll(trailString); // add remaining trails

            // add the sources
            UnicodeSet sourcesSet = leadToSources.get(first);
            if (sourcesSet == null) {
                leadToSources.put(first, sourcesSet = new UnicodeSet());
            }
            sourcesSet.add(i);
        }

        for (Entry<String, UnicodeSet> x : leadToSources.entrySet()) {
            String lead = x.getKey();
            UnicodeSet sources = x.getValue();
            UnicodeSet trailSet = leadToTrail.get(lead);
            for (String source : sources) {
                for (String trail : trailSet) {
                    can.setSource(source + trail);
                    for (String t = can.next(); t != null; t = can.next()) {
                        if (t.endsWith(trail)) continue;
                        disorderedMarks.add(t);
                    }
                }
            }
        }

        for (String s : nonStarters) {
            disorderedMarks.add("\u0345" + s);
            disorderedMarks.add(s + "\u0323");
            String xx = nfc.normalize("Ǭ" + s);
            if (!xx.startsWith("Ǭ")) {
                logln("??");
            }
        }

//        for (int i = 0; i <= 0x10FFFF; ++i) {
//            String s = nfkd.getDecomposition(i);
//            if (s != null) {
//                disorderedMarks.add(s);
//                disorderedMarks.add(nfc.normalize(s));
//                addDerivedStrings(nfc, disorderedMarks, s);
//            }
//            s = nfd.getDecomposition(i);
//            if (s != null) {
//                disorderedMarks.add(s);
//            }
//            if (!nfc.isInert(i)) {
//                if (i == 0x00C0) {
//                    logln("À");
//                }
//                can.setSource(s + "\u0334");
//                for (String t = can.next(); t != null; t = can.next()) {
//                    addDerivedStrings(nfc, disorderedMarks, t);
//                }
//                can.setSource(s + "\u0345");
//                for (String t = can.next(); t != null; t = can.next()) {
//                    addDerivedStrings(nfc, disorderedMarks, t);
//                }
//                can.setSource(s + "\u0323");
//                for (String t = can.next(); t != null; t = can.next()) {
//                    addDerivedStrings(nfc, disorderedMarks, t);
//                }
//            }
//        }
        logln("Test cases: " + disorderedMarks.size());
        disorderedMarks.addAll(0, 0x10FFFF).freeze();
        logln("isInert \u0104 " + nfc.isInert('\u0104'));

        Object[][] rules = {
                {":: [:sc=COMMON:] any-name;", null},

                {":: [:Greek:] hex-any/C;", null},
                {":: [:Greek:] any-hex/C;", null},

                {":: [[:Mn:][:Me:]] remove;", null},
                {":: [[:Mn:][:Me:]] null;", null},

                {":: lower;", null},
                {":: upper;", null},
                {":: title;", null},
                {":: CaseFold;", null},

                {":: NFD;", null},
                {":: NFC;", null},
                {":: NFKD;", null},
                {":: NFKC;", null},

                {":: [[:Mn:][:Me:]] NFKD;", null},
                {":: Latin-Greek;", null},
                {":: [:Latin:] NFKD;", null},
                {":: NFKD;", null},
                {":: NFKD;\n" +
                    ":: [[:Mn:][:Me:]] remove;\n" +
                    ":: NFC;", null},
        };
        for (Object[] rulex : rules) {
            String rule = (String) rulex[0];
            Transliterator trans = Transliterator.createFromRules("temp", rule, Transliterator.FORWARD);
            UnicodeSet actualSource = trans.getSourceSet();
            UnicodeSet actualTarget = trans.getTargetSet();
            UnicodeSet empiricalSource = new UnicodeSet();
            UnicodeSet empiricalTarget = new UnicodeSet();
            String ruleDisplay = rule.replace("\n", "\t\t");
            UnicodeSet toTest = disorderedMarks;
//            if (rulex[1] != null) {
//                toTest = new UnicodeSet(disorderedMarks);
//                toTest.addAll((UnicodeSet) rulex[1]);
//            }

            String test = nfd.normalize("Ą");
            boolean DEBUG = true;
            int count = 0; // for debugging
            for (String s : toTest) {
                if (s.equals(test)) {
                    logln(test);
                }
                String t = trans.transform(s);
                if (!s.equals(t)) {
                    if (!isAtomic(s, t, trans)) {
                        isAtomic(s, t, trans);
                        continue;
                    }

                    // only keep the part that changed; so skip the front and end.
//                    int start = findSharedStartLength(s,t);
//                    int end = findSharedEndLength(s,t);
//                    if (start != 0 || end != 0) {
//                        s = s.substring(start, s.length() - end);
//                        t = t.substring(start, t.length() - end);
//                    }
                    if (DEBUG) {
                        if (!actualSource.containsAll(s)) {
                            count++;
                        }
                        if (!actualTarget.containsAll(t)) {
                            count++;
                        }
                    }
                    addSourceTarget(s, empiricalSource, t, empiricalTarget);
                }
            }
            assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, SetAssert.MISSING_OK);
            assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, SetAssert.MISSING_OK);
        }
    }

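The test above compares the reported sets against empirically derived ones. One practical, consumer-side use of getSourceSet() is as a cheap pre-filter before transforming (illustrative sketch only, not part of this commit; the helper name is made up, and it assumes getSourceSet() over-approximates the characters the transliterator can change):

    import com.ibm.icu.text.Transliterator;

    public class MaybeTransliterate {
        static String maybeTransliterate(Transliterator t, String s) {
            // Skip the transform when no character of s is in the reported source set.
            return t.getSourceSet().containsSome(s) ? t.transform(s) : s;
        }

        public static void main(String[] args) {
            Transliterator greekLatin = Transliterator.getInstance("Greek-Latin");
            System.out.println(maybeTransliterate(greekLatin, "abc"));       // left unchanged
            System.out.println(maybeTransliterate(greekLatin, "ελληνικά"));  // transliterated
        }
    }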
    /**
     * Returns true if the transform of s cannot be reproduced by splitting s at some
     * character boundary and transforming the two pieces independently. For example,
     * under NFC the input "A\u0300" maps to "\u00C0", which is not the concatenation of
     * transform("A") and transform("\u0300"), so it counts as atomic.
     */
    private boolean isAtomic(String s, String t, Transliterator trans) {
        for (int i = 1; i < s.length(); ++i) {
            if (!CharSequences.onCharacterBoundary(s, i)) {
                continue;
            }
            String q = trans.transform(s.substring(0, i));
            if (t.startsWith(q)) {
                String r = trans.transform(s.substring(i));
                if (t.length() == q.length() + r.length() && t.endsWith(r)) {
                    return false;
                }
            }
        }
        return true;
//        // make sure that every part is different
//        if (s.codePointCount(0, s.length()) > 1) {
//            int[] codePoints = It.codePoints(s);
//            for (int k = 0; k < codePoints.length; ++k) {
//                int pos = indexOf(t, codePoints[k]);
//                if (pos >= 0) {
//                    int x;
//                }
//            }
//            if (s.contains("À")) {
//                logln("À");
//            }
//        }
    }

    private void addSourceTarget(String s, UnicodeSet expectedSource, String t, UnicodeSet expectedTarget) {
        expectedSource.addAll(s);
        if (t.length() > 0) {
            expectedTarget.addAll(t);
        }
    }

    private void addDerivedStrings(Normalizer2 nfc, UnicodeSet disorderedMarks, String s) {
        disorderedMarks.add(s);
        for (int j = 1; j < s.length(); ++j) {
            if (CharSequences.onCharacterBoundary(s, j)) {
                String shorter = s.substring(0, j);
                disorderedMarks.add(shorter);
                disorderedMarks.add(nfc.normalize(shorter) + s.substring(j));
            }
        }
    }

    public void TestCharUtils() {
        String[][] startTests = {
                {"1", "a", "ab"},
                {"0", "a", "xb"},
                {"0", "\uD800", "\uD800\uDC01"},
                {"1", "\uD800a", "\uD800b"},
                {"0", "\uD800\uDC00", "\uD800\uDC01"},
        };
        for (String[] row : startTests) {
            int actual = findSharedStartLength(row[1], row[2]);
            assertEquals("findSharedStartLength(" + row[1] + "," + row[2] + ")",
                    Integer.parseInt(row[0]),
                    actual);
        }
        String[][] endTests = {
                {"0", "\uDC00", "\uD801\uDC00"},
                {"1", "a", "ba"},
                {"0", "a", "bx"},
                {"1", "a\uDC00", "b\uDC00"},
                {"0", "\uD800\uDC00", "\uD801\uDC00"},
        };
        for (String[] row : endTests) {
            int actual = findSharedEndLength(row[1], row[2]);
            assertEquals("findSharedEndLength(" + row[1] + "," + row[2] + ")",
                    Integer.parseInt(row[0]),
                    actual);
        }
    }

    /**
     * Returns the length, in chars, of the longest shared prefix of s and t,
     * backing up by one if that length would split a surrogate pair.
     * @param s first sequence
     * @param t second sequence
     * @return shared prefix length in chars
     */
    // TODO make generally available
    private static int findSharedStartLength(CharSequence s, CharSequence t) {
        int min = Math.min(s.length(), t.length());
        int i;
        char sch, tch;
        for (i = 0; i < min; ++i) {
            sch = s.charAt(i);
            tch = t.charAt(i);
            if (sch != tch) {
                break;
            }
        }
        return CharSequences.onCharacterBoundary(s, i) && CharSequences.onCharacterBoundary(t, i) ? i : i - 1;
    }

    /**
     * Returns the length, in chars, of the longest shared suffix of s and t,
     * backing up by one if that length would split a surrogate pair.
     * @param s first sequence
     * @param t second sequence
     * @return shared suffix length in chars
     */
    // TODO make generally available
    private static int findSharedEndLength(CharSequence s, CharSequence t) {
        int slength = s.length();
        int tlength = t.length();
        int min = Math.min(slength, tlength);
        int i;
        char sch, tch;
        // TODO the calculation could be made slightly faster; not sure it is worth the complication, though
        for (i = 0; i < min; ++i) {
            sch = s.charAt(slength - i - 1);
            tch = t.charAt(tlength - i - 1);
            if (sch != tch) {
                break;
            }
        }
        return CharSequences.onCharacterBoundary(s, slength - i) && CharSequences.onCharacterBoundary(t, tlength - i) ? i : i - 1;
    }

    enum SetAssert {EQUALS, MISSING_OK, EXTRA_OK}

    void assertEquals(String message, UnicodeSet empirical, UnicodeSet actual, SetAssert setAssert) {
        boolean haveError = false;
        if (!actual.containsAll(empirical)) {
            UnicodeSet missing = new UnicodeSet(empirical).removeAll(actual);
            errln(message + " \tgetXSet < empirical (" + missing.size() + "): " + toPattern(missing));
            haveError = true;
        }
        if (!empirical.containsAll(actual)) {
            UnicodeSet extra = new UnicodeSet(actual).removeAll(empirical);
            logln("WARNING: " + message + " \tgetXSet > empirical (" + extra.size() + "): " + toPattern(extra));
            haveError = true;
        }
        if (!haveError) {
            logln("OK " + message + ' ' + toPattern(empirical));
        }
    }

    private String toPattern(UnicodeSet missing) {
        String result = missing.toPattern(false);
        if (result.length() < 200) {
            return result;
        }
        return result.substring(0, CharSequences.onCharacterBoundary(result, 200) ? 200 : 199) + "…";
    }

    /**
     * Test handling of rule whitespace, for both RBT and UnicodeSet.
     */
@ -3741,7 +4109,7 @@ the ::BEGIN/::END stuff)
            Transliterator.createFromRules("gif", "\\", Transliterator.FORWARD);
        } catch(Exception e){
            errln("TransliteratorParser.nextLine() was not suppose to return an " +
                    "exception for a rule of '\\'");
        }
    }
}