ICU-8227 Whew, there were lots of problems in the way the old code was done. Now working much better.

X-SVN-Rev: 29207
This commit is contained in:
Mark Davis 2010-12-14 07:51:00 +00:00
parent e0872406eb
commit 9ffcb85ba1
21 changed files with 1121 additions and 278 deletions

1
.gitattributes vendored
View File

@ -262,6 +262,7 @@ icu4j/main/classes/translit/.externalToolBuilders/copy-data-translit.launch -tex
icu4j/main/classes/translit/.settings/org.eclipse.core.resources.prefs -text
icu4j/main/classes/translit/.settings/org.eclipse.jdt.core.prefs -text
icu4j/main/classes/translit/.settings/org.eclipse.jdt.ui.prefs -text
icu4j/main/classes/translit/src/com/ibm/icu/text/SourceTargetUtility.java -text
icu4j/main/classes/translit/translit-build.launch -text
icu4j/main/shared/.project -text
icu4j/main/shared/.settings/org.eclipse.core.resources.prefs -text

View File

@ -404,5 +404,18 @@ class AnyTransliterator extends Transliterator {
}
return new AnyTransliterator(getID(), filter, target, targetScript, widthFix, cache);
}
/**
 * Contributes this transliterator's characters to the given source and target sets.
 * Every character admitted by the effective filter is treated as a possible source,
 * and, when that filter is non-empty, every code point is treated as a possible target.
 *
 * @param inputFilter filter supplied by the caller; combined with this transliterator's
 *                    own filter through {@code getFilterAsUnicodeSet}
 * @param sourceSet   set that receives the characters that may be modified
 * @param targetSet   set that receives the characters that may be produced
 * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
 */
@Override
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    final UnicodeSet effectiveFilter = getFilterAsUnicodeSet(inputFilter);
    sourceSet.addAll(effectiveFilter);
    if (effectiveFilter.size() == 0) {
        // Nothing passes the filter, so nothing can be produced either.
        return;
    }
    // The working assumption is that any character can be turned into any other character.
    targetSet.addAll(0, 0x10FFFF);
}
}

View File

@ -387,5 +387,17 @@ final class BreakTransliterator extends Transliterator {
}
}
/**
 * Contributes this transliterator's characters to the given target set.
 * The source set is left alone because existing characters are not modified;
 * only the inserted text is reported, and only when the effective filter is non-empty.
 *
 * @param inputFilter filter supplied by the caller; combined with this transliterator's
 *                    own filter through {@code getFilterAsUnicodeSet}
 * @param sourceSet   not modified by this implementation
 * @param targetSet   set that receives the characters of {@code insertion}
 * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
 */
@Override
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    final UnicodeSet effectiveFilter = getFilterAsUnicodeSet(inputFilter);
    if (effectiveFilter.size() == 0) {
        return;
    }
    // Source characters stay as they are; the only output is the inserted text.
    targetSet.addAll(insertion);
}
}

View File

@ -7,6 +7,7 @@
package com.ibm.icu.text;
import com.ibm.icu.impl.UCaseProps;
import com.ibm.icu.lang.UCharacter;
/**
* A transliterator that performs locale-sensitive toLower()
@ -102,4 +103,23 @@ class CaseFoldTransliterator extends Transliterator{
}
offsets.start = offsets.limit;
}
// Shared by all instances; created lazily on first use under the class lock below.
static SourceTargetUtility sourceTargetUtility = null;

/**
 * Contributes the characters affected by case folding to the given source and target sets.
 * The work is delegated to a lazily created, shared {@link SourceTargetUtility} whose
 * transform is {@code UCharacter.foldCase(source, true)}.
 *
 * @param inputFilter filter supplied by the caller
 * @param sourceSet   set that receives the characters that may be modified
 * @param targetSet   set that receives the characters that may be produced
 * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
 */
@Override
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    // Guard this class's own static field with this class's own monitor. The previous
    // code locked on UppercaseTransliterator.class, an unrelated class's monitor.
    synchronized (CaseFoldTransliterator.class) {
        if (sourceTargetUtility == null) {
            sourceTargetUtility = new SourceTargetUtility(new Transform<String,String>() {
                public String transform(String source) {
                    return UCharacter.foldCase(source, true);
                }
            });
        }
    }
    sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
}
}

View File

@ -305,26 +305,20 @@ class CompoundTransliterator extends Transliterator {
}
/**
* Return the set of all characters that may be modified by this
* Transliterator, ignoring the effect of our filter.
* @internal
*/
protected UnicodeSet handleGetSourceSet() {
UnicodeSet set = new UnicodeSet();
@Override
public void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
UnicodeSet myFilter = new UnicodeSet(getFilterAsUnicodeSet(filter));
UnicodeSet tempTargetSet = new UnicodeSet();
for (int i=0; i<trans.length; ++i) {
set.addAll(trans[i].getSourceSet());
// Take the example of Hiragana-Latin. This is really
// Hiragana-Katakana; Katakana-Latin. The source set of
// these two is roughly [:Hiragana:] and [:Katakana:].
// But the source set for the entire transliterator is
// actually [:Hiragana:] ONLY -- that is, the first
// non-empty source set.
// This is a heuristic, and not 100% reliable.
if (!set.isEmpty()) {
break;
}
// each time we produce targets, those can be used by subsequent items, despite the filter.
// so we get just those items, and add them to the filter each time.
tempTargetSet.clear();
trans[i].addSourceTargetSet(myFilter, sourceSet, tempTargetSet);
targetSet.addAll(tempTargetSet);
myFilter.addAll(tempTargetSet);
}
return set;
}
/**

View File

@ -197,4 +197,23 @@ class EscapeTransliterator extends Transliterator {
pos.limit = limit;
pos.start = start;
}
/**
 * Contributes this transliterator's characters to the given source and target sets.
 * Every character admitted by the effective filter is a possible source. When that
 * filter is non-empty, the targets are the prefix, the suffix and the digits of every
 * handler in the {@code supplementalHandler} chain.
 *
 * @param inputFilter filter supplied by the caller; combined with this transliterator's
 *                    own filter through {@code getFilterAsUnicodeSet}
 * @param sourceSet   set that receives the characters that may be modified
 * @param targetSet   set that receives the characters that may be produced
 * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
 */
@Override
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
    sourceSet.addAll(myFilter);
    // Decide on the effective filter, as the sibling transliterators do, not on the raw
    // input filter: if nothing passes the effective filter, nothing can be escaped and
    // no target characters are produced. The check does not depend on the handler, so
    // it is made once, before walking the chain.
    if (myFilter.size() == 0) {
        return;
    }
    for (EscapeTransliterator it = this; it != null ; it = it.supplementalHandler) {
        targetSet.addAll(it.prefix);
        targetSet.addAll(it.suffix);
        // Collect every digit character the handler's radix can emit.
        StringBuilder buffer = new StringBuilder();
        for (int i = 0; i < it.radix; ++i) {
            Utility.appendNumber(buffer, i, it.radix, it.minDigits);
        }
        targetSet.addAll(buffer.toString()); // TODO drop once String is changed to CharSequence in UnicodeSet
    }
}
}

View File

@ -7,6 +7,7 @@
package com.ibm.icu.text;
import com.ibm.icu.impl.UCaseProps;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.util.ULocale;
/**
@ -109,4 +110,24 @@ class LowercaseTransliterator extends Transliterator{
}
offsets.start = offsets.limit;
}
// Kept per instance instead of static because the lowercasing results depend on the locale.
SourceTargetUtility sourceTargetUtility = null;

/**
 * Contributes the characters affected by lowercasing to the given source and target sets.
 * The work is delegated to a lazily created {@link SourceTargetUtility} whose transform
 * is {@code UCharacter.toLowerCase(locale, source)}.
 *
 * @param inputFilter filter supplied by the caller
 * @param sourceSet   set that receives the characters that may be modified
 * @param targetSet   set that receives the characters that may be produced
 * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
 */
@Override
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    synchronized (this) {
        final boolean needsInit = (sourceTargetUtility == null);
        if (needsInit) {
            final Transform<String,String> lowercaser = new Transform<String,String>() {
                public String transform(String source) {
                    return UCharacter.toLowerCase(locale, source);
                }
            };
            sourceTargetUtility = new SourceTargetUtility(lowercaser);
        }
    }
    sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
}
}

View File

@ -165,4 +165,31 @@ class NameUnicodeTransliterator extends Transliterator {
// open delimiter candidate.
offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor;
}
/**
 * Contributes this transliterator's characters to the given source and target sets.
 * Nothing is added unless the effective filter admits both the opening delimiter and
 * the closing delimiter. Otherwise the sources are the name characters and delimiters
 * that pass the filter, and the targets are all code points.
 *
 * @param inputFilter filter supplied by the caller; combined with this transliterator's
 *                    own filter through {@code getFilterAsUnicodeSet}
 * @param sourceSet   set that receives the characters that may be modified
 * @param targetSet   set that receives the characters that may be produced
 * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
 */
@Override
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
    if (!myFilter.containsAll(UnicodeNameTransliterator.OPEN_DELIM) || !myFilter.contains(CLOSE_DELIM)) {
        return; // we have to contain both prefix and suffix
    }
    UnicodeSet items = new UnicodeSet()
        .addAll('0', '9')
        // Character names use the whole uppercase alphabet (e.g. "LATIN SMALL LETTER G"),
        // not only the hex-digit letters A-F, so all of A-Z can be consumed.
        .addAll('A', 'Z')
        .addAll('a', 'z') // for controls
        .add('<').add('>') // for controls
        .add('(').add(')') // for controls
        .add('-')
        .add(' ')
        .addAll(UnicodeNameTransliterator.OPEN_DELIM)
        .add(CLOSE_DELIM);
    items.retainAll(myFilter);
    if (items.size() > 0) {
        sourceSet.addAll(items);
        // could produce any character
        targetSet.addAll(0, 0x10FFFF);
    }
}
}

View File

@ -1,14 +1,17 @@
/*
**********************************************************************
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 06/08/01 aliu Creation.
**********************************************************************
*/
**********************************************************************
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 06/08/01 aliu Creation.
**********************************************************************
*/
package com.ibm.icu.text;
import java.util.HashMap;
import java.util.Map;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Normalizer2Impl;
@ -76,7 +79,7 @@ final class NormalizationTransliterator extends Transliterator {
* Implements {@link Transliterator#handleTransliterate}.
*/
protected void handleTransliterate(Replaceable text,
Position offsets, boolean isIncremental) {
Position offsets, boolean isIncremental) {
// start and limit of the input range
int start = offsets.start;
int limit = offsets.limit;
@ -129,4 +132,34 @@ final class NormalizationTransliterator extends Transliterator {
offsets.contextLimit += limit - offsets.limit;
offsets.limit = limit;
}
// One SourceTargetUtility per Normalizer2; every access is made while holding the map's monitor.
static final Map<Normalizer2, SourceTargetUtility> SOURCE_CACHE = new HashMap<Normalizer2, SourceTargetUtility>();

// TODO Get rid of this if Normalizer2 becomes a Transform
/**
 * Adapts a {@link Normalizer2} to the {@code Transform<String,String>} interface.
 */
static class NormalizingTransform implements Transform<String,String> {
    final Normalizer2 norm2;

    public NormalizingTransform(Normalizer2 norm2) {
        this.norm2 = norm2;
    }

    /** Returns the result of {@code norm2.normalize(source)}. */
    public String transform(String source) {
        final String normalized = norm2.normalize(source);
        return normalized;
    }
}

/**
 * Contributes the characters affected by this normalization to the given source and
 * target sets. The work is delegated to a {@link SourceTargetUtility} that is created
 * on first use for this transliterator's {@code norm2} and then kept in
 * {@code SOURCE_CACHE}.
 *
 * @param inputFilter filter supplied by the caller
 * @param sourceSet   set that receives the characters that may be modified
 * @param targetSet   set that receives the characters that may be produced
 * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
 */
@Override
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    final SourceTargetUtility utility;
    synchronized (SOURCE_CACHE) {
        SourceTargetUtility cached = SOURCE_CACHE.get(norm2);
        if (cached == null) {
            cached = new SourceTargetUtility(new NormalizingTransform(norm2), norm2);
            SOURCE_CACHE.put(norm2, cached);
        }
        utility = cached;
    }
    // The delegate call is made outside the lock, as before.
    utility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
}
}

View File

@ -30,4 +30,12 @@ class NullTransliterator extends Transliterator {
Position offsets, boolean incremental) {
offsets.start = offsets.limit;
}
/* (non-Javadoc)
* Adds nothing to either set: this implementation has an empty body.
* @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
*/
@Override
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
// do nothing: neither sourceSet nor targetSet is touched
}
}

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 1996-2004, International Business Machines Corporation and *
* Copyright (C) 1996-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -49,4 +49,15 @@ class RemoveTransliterator extends Transliterator {
index.contextLimit -= len;
index.limit -= len;
}
/**
 * Contributes this transliterator's characters to the given source set.
 * Every character admitted by the effective filter is reported as a source; the target
 * set is left untouched.
 *
 * @param inputFilter filter supplied by the caller; intersected with this
 *                    transliterator's own filter
 * @param sourceSet   set that receives the characters that may be modified
 * @param targetSet   not modified by this implementation
 * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
 */
@Override
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    // Sources are whatever survives the combination of our filter and the input filter;
    // there is nothing to add to the target.
    sourceSet.addAll(getFilterAsUnicodeSet(inputFilter));
}
}

View File

@ -448,24 +448,32 @@ public class RuleBasedTransliterator extends Transliterator {
return data.ruleSet.toRules(escapeUnprintable);
}
// /**
// * Return the set of all characters that may be modified by this
// * Transliterator, ignoring the effect of our filter.
// * @internal
// * @deprecated This API is ICU internal only.
// */
// protected UnicodeSet handleGetSourceSet() {
// return data.ruleSet.getSourceTargetSet(false, unicodeFilter);
// }
//
// /**
// * Returns the set of all characters that may be generated as
// * replacement text by this transliterator.
// * @internal
// * @deprecated This API is ICU internal only.
// */
// public UnicodeSet getTargetSet() {
// return data.ruleSet.getSourceTargetSet(true, unicodeFilter);
// }
/**
* Return the set of all characters that may be modified by this
* Transliterator, ignoring the effect of our filter.
* @internal
* @deprecated This API is ICU internal only.
*/
protected UnicodeSet handleGetSourceSet() {
return data.ruleSet.getSourceTargetSet(false);
}
/**
* Returns the set of all characters that may be generated as
* replacement text by this transliterator.
* @internal
* @deprecated This API is ICU internal only.
*/
public UnicodeSet getTargetSet() {
return data.ruleSet.getSourceTargetSet(true);
@Override
public void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
data.ruleSet.addSourceTargetSet(filter, sourceSet, targetSet);
}
/**

View File

@ -0,0 +1,133 @@
/*
*******************************************************************************
* Copyright (C) 2010, Google, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.util.HashSet;
import java.util.Set;
import com.ibm.icu.lang.CharSequences;
import com.ibm.icu.text.Normalizer2.Mode;
/**
 * Simple internal utility class for helping with getSource/TargetSet.
 * On construction it runs the transform over every code point and records which code
 * points (and, when a normalizer is supplied, which decomposition strings) are changed.
 */
class SourceTargetUtility {
    final Transform<String, String> transform;
    final UnicodeSet sourceCache;
    final Set<String> sourceStrings;
    static final UnicodeSet NON_STARTERS = new UnicodeSet("[:^ccc=0:]").freeze();
    static Normalizer2 NFC = Normalizer2.getInstance(null, "NFC", Mode.COMPOSE);

    /**
     * Builds a utility for a transform that has no associated normalizer.
     *
     * @param transform the string transform to probe
     */
    public SourceTargetUtility(Transform<String, String> transform) {
        this(transform, null);
    }

    /**
     * Builds the cache of affected code points and strings.
     *
     * @param transform  the string transform to probe
     * @param normalizer may be null; when non-null, the cache is seeded with
     *                   {@code [:^ccc=0:]} and decompositions are probed as well
     */
    public SourceTargetUtility(Transform<String, String> transform, Normalizer2 normalizer) {
        this.transform = transform;
        final boolean normalizing = (normalizer != null);
        sourceCache = normalizing ? new UnicodeSet("[:^ccc=0:]") : new UnicodeSet();
        sourceStrings = new HashSet<String>();

        for (int cp = 0; cp <= 0x10FFFF; ++cp) {
            // A code point is a source if the transform changes it on its own.
            final String transformedSingle = transform.transform(UTF16.valueOf(cp));
            final boolean changedAlone = !CharSequences.equals(cp, transformedSingle);
            if (changedAlone) {
                sourceCache.add(cp);
            }
            if (!normalizing) {
                continue;
            }
            final String decomposition = NFC.getDecomposition(cp);
            if (decomposition != null) {
                // Remember decompositions that the transform alters.
                final String transformedDecomposition = transform.transform(decomposition);
                if (!decomposition.equals(transformedDecomposition)) {
                    sourceStrings.add(decomposition);
                }
                // Code points the normalizer does not report as inert are sources too.
                if (!changedAlone && !normalizer.isInert(cp)) {
                    sourceCache.add(cp);
                }
            }
        }
        sourceCache.freeze();
    }

    /**
     * Adds the cached sources that pass the effective filter, together with the result
     * of transforming them, to the given sets.
     *
     * @param transliterator transliterator whose {@code getFilterAsUnicodeSet} supplies
     *                       the effective filter
     * @param inputFilter    filter supplied by the caller
     * @param sourceSet      set that receives the characters that may be modified
     * @param targetSet      set that receives the characters that may be produced
     */
    public void addSourceTargetSet(Transliterator transliterator, UnicodeSet inputFilter, UnicodeSet sourceSet,
            UnicodeSet targetSet) {
        final UnicodeSet effectiveFilter = transliterator.getFilterAsUnicodeSet(inputFilter);
        final UnicodeSet affected = new UnicodeSet(sourceCache).retainAll(effectiveFilter);
        sourceSet.addAll(affected);
        for (String single : affected) {
            targetSet.addAll(transform.transform(single));
        }
        for (String sequence : sourceStrings) {
            if (!effectiveFilter.containsAll(sequence)) {
                continue;
            }
            final String result = transform.transform(sequence);
            if (sequence.equals(result)) {
                continue;
            }
            targetSet.addAll(result);
            sourceSet.addAll(sequence);
        }
    }
}

View File

@ -6,6 +6,7 @@
package com.ibm.icu.text;
import com.ibm.icu.impl.UCaseProps;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.util.ULocale;
/**
@ -147,4 +148,24 @@ class TitlecaseTransliterator extends Transliterator {
}
offsets.start = offsets.limit;
}
// Kept per instance instead of static because the titlecasing results depend on the locale.
SourceTargetUtility sourceTargetUtility = null;

/**
 * Contributes the characters affected by titlecasing to the given source and target sets.
 * The work is delegated to a lazily created {@link SourceTargetUtility} whose transform
 * is {@code UCharacter.toTitleCase(locale, source, null)}.
 *
 * @param inputFilter filter supplied by the caller
 * @param sourceSet   set that receives the characters that may be modified
 * @param targetSet   set that receives the characters that may be produced
 * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
 */
@Override
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    synchronized (this) {
        final boolean needsInit = (sourceTargetUtility == null);
        if (needsInit) {
            final Transform<String,String> titlecaser = new Transform<String,String>() {
                public String transform(String source) {
                    return UCharacter.toTitleCase(locale, source, null);
                }
            };
            sourceTargetUtility = new SourceTargetUtility(titlecaser);
        }
    }
    sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
}
}

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 1996-2007, International Business Machines Corporation and *
* Copyright (C) 1996-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -548,8 +548,11 @@ class TransliterationRule {
* Union the set of all characters that may be modified by this rule
* into the given set.
*/
void addSourceSetTo(UnicodeSet toUnionTo) {
void addSourceSetTo(UnicodeSet toUnionTo, UnicodeSet filter) {
int limit = anteContextLength + keyLength;
if (filter != null && !matches(filter)) {
return;
}
for (int i=anteContextLength; i<limit; ) {
int ch = UTF16.charAt(pattern, i);
i += UTF16.getCharCount(ch);
@ -562,11 +565,55 @@ class TransliterationRule {
}
}
/**
 * Sees if the source of the rule can match the filter. There is a known issue with
 * filters containing multiple characters.
 *
 * @param filter must not be null (check in caller)
 * @return true if, at every position of the key, the filter admits the literal character
 *         or at least one character of the matcher found at that position; false as soon
 *         as one position cannot be satisfied
 */
// Problem: the rule is [{ab}]c > x
// The filter is [a{bc}].
// If the input is abc, then the rule will work.
// However, following code applying the filter won't catch that case.
private boolean matches(UnicodeSet filter) {
    int limit = anteContextLength + keyLength;
    // We need to walk through the pattern.
    // Iff some of the characters at ALL of the positions are matched by the filter,
    // the rule can apply.
    for (int i=anteContextLength; i<limit; ) {
        int ch = UTF16.charAt(pattern, i);
        i += UTF16.getCharCount(ch);
        UnicodeMatcher matcher = data.lookupMatcher(ch);
        if (matcher == null) {
            // Plain literal: the filter must admit exactly this character.
            if (!filter.contains(ch)) {
                return false;
            }
        } else if (matcher instanceof UnicodeSet) {
            // Explicit type test instead of catching ClassCastException.
            if (!filter.containsSome((UnicodeSet) matcher)) {
                return false;
            }
        } else {
            // Any other matcher: collect its match set first, then compare.
            UnicodeSet temp = new UnicodeSet();
            matcher.addMatchSetTo(temp);
            if (!filter.containsSome(temp)) {
                return false;
            }
        }
    }
    return true;
}
/**
* Union the set of all characters that may be emitted by this rule
* into the given set.
*/
void addTargetSetTo(UnicodeSet toUnionTo) {
void addTargetSetTo(UnicodeSet toUnionTo, UnicodeSet filter) {
if (filter != null && !matches(filter)) {
return;
}
output.addReplacementSetTo(toUnionTo);
}
}

View File

@ -238,21 +238,15 @@ class TransliterationRuleSet {
return ruleSource.toString();
}
/**
* Return the set of all characters that may be modified (getTarget=false)
* or emitted (getTarget=true) by this set.
*/
UnicodeSet getSourceTargetSet(boolean getTarget) {
UnicodeSet set = new UnicodeSet();
// TODO Handle the case where we have :: [a] ; a > |b ; b > c ;
// TODO Merge into r.addSourceTargetSet, to avoid duplicate testing
void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
int count = ruleVector.size();
for (int i=0; i<count; ++i) {
TransliterationRule r = ruleVector.get(i);
if (getTarget) {
r.addTargetSetTo(set);
} else {
r.addSourceSetTo(set);
}
r.addTargetSetTo(targetSet, filter);
r.addSourceSetTo(sourceSet, filter);
}
return set;
}
}

View File

@ -26,226 +26,200 @@ import com.ibm.icu.util.ULocale;
import com.ibm.icu.util.UResourceBundle;
/**
* <code>Transliterator</code> is an abstract class that
* transliterates text from one format to another. The most common
* kind of transliterator is a script, or alphabet, transliterator.
* For example, a Russian to Latin transliterator changes Russian text
* written in Cyrillic characters to phonetically equivalent Latin
* characters. It does not <em>translate</em> Russian to English!
* Transliteration, unlike translation, operates on characters, without
* reference to the meanings of words and sentences.
*
* <p>Although script conversion is its most common use, a
* transliterator can actually perform a more general class of tasks.
* In fact, <code>Transliterator</code> defines a very general API
* which specifies only that a segment of the input text is replaced
* by new text. The particulars of this conversion are determined
* entirely by subclasses of <code>Transliterator</code>.
*
* <p><b>Transliterators are stateless</b>
*
* <p><code>Transliterator</code> objects are <em>stateless</em>; they
* retain no information between calls to
* <code>transliterate()</code>. As a result, threads may share
* transliterators without synchronizing them. This might seem to
* limit the complexity of the transliteration operation. In
* practice, subclasses perform complex transliterations by delaying
* the replacement of text until it is known that no other
* replacements are possible. In other words, although the
* <code>Transliterator</code> objects are stateless, the source text
* itself embodies all the needed information, and delayed operation
* allows arbitrary complexity.
*
* <p><b>Batch transliteration</b>
*
* <p>The simplest way to perform transliteration is all at once, on a
* string of existing text. This is referred to as <em>batch</em>
* transliteration. For example, given a string <code>input</code>
* and a transliterator <code>t</code>, the call
*
* <code>Transliterator</code> is an abstract class that transliterates text from one format to another. The most common
* kind of transliterator is a script, or alphabet, transliterator. For example, a Russian to Latin transliterator
* changes Russian text written in Cyrillic characters to phonetically equivalent Latin characters. It does not
* <em>translate</em> Russian to English! Transliteration, unlike translation, operates on characters, without reference
* to the meanings of words and sentences.
*
* <p>
* Although script conversion is its most common use, a transliterator can actually perform a more general class of
* tasks. In fact, <code>Transliterator</code> defines a very general API which specifies only that a segment of the
* input text is replaced by new text. The particulars of this conversion are determined entirely by subclasses of
* <code>Transliterator</code>.
*
* <p>
* <b>Transliterators are stateless</b>
*
* <p>
* <code>Transliterator</code> objects are <em>stateless</em>; they retain no information between calls to
* <code>transliterate()</code>. As a result, threads may share transliterators without synchronizing them. This might
* seem to limit the complexity of the transliteration operation. In practice, subclasses perform complex
* transliterations by delaying the replacement of text until it is known that no other replacements are possible. In
* other words, although the <code>Transliterator</code> objects are stateless, the source text itself embodies all the
* needed information, and delayed operation allows arbitrary complexity.
*
* <p>
* <b>Batch transliteration</b>
*
* <p>
* The simplest way to perform transliteration is all at once, on a string of existing text. This is referred to as
* <em>batch</em> transliteration. For example, given a string <code>input</code> and a transliterator <code>t</code>,
* the call
*
* <blockquote><code>String result = t.transliterate(input);
* </code></blockquote>
*
* will transliterate it and return the result. Other methods allow
* the client to specify a substring to be transliterated and to use
* {@link Replaceable} objects instead of strings, in order to
* preserve out-of-band information (such as text styles).
*
* <p><b>Keyboard transliteration</b>
*
* <p>Somewhat more involved is <em>keyboard</em>, or incremental
* transliteration. This is the transliteration of text that is
* arriving from some source (typically the user's keyboard) one
* character at a time, or in some other piecemeal fashion.
*
* <p>In keyboard transliteration, a <code>Replaceable</code> buffer
* stores the text. As text is inserted, as much as possible is
* transliterated on the fly. This means a GUI that displays the
* contents of the buffer may show text being modified as each new
* character arrives.
*
* <p>Consider the simple <code>RuleBasedTransliterator</code>:
*
*
* will transliterate it and return the result. Other methods allow the client to specify a substring to be
* transliterated and to use {@link Replaceable} objects instead of strings, in order to preserve out-of-band
* information (such as text styles).
*
* <p>
* <b>Keyboard transliteration</b>
*
* <p>
* Somewhat more involved is <em>keyboard</em>, or incremental transliteration. This is the transliteration of text that
* is arriving from some source (typically the user's keyboard) one character at a time, or in some other piecemeal
* fashion.
*
* <p>
* In keyboard transliteration, a <code>Replaceable</code> buffer stores the text. As text is inserted, as much as
* possible is transliterated on the fly. This means a GUI that displays the contents of the buffer may show text being
* modified as each new character arrives.
*
* <p>
* Consider the simple <code>RuleBasedTransliterator</code>:
*
* <blockquote><code>
* th&gt;{theta}<br>
* t&gt;{tau}
* </code></blockquote>
*
* When the user types 't', nothing will happen, since the
* transliterator is waiting to see if the next character is 'h'. To
* remedy this, we introduce the notion of a cursor, marked by a '|'
* in the output string:
*
*
* When the user types 't', nothing will happen, since the transliterator is waiting to see if the next character is
* 'h'. To remedy this, we introduce the notion of a cursor, marked by a '|' in the output string:
*
* <blockquote><code>
* t&gt;|{tau}<br>
* {tau}h&gt;{theta}
* </code></blockquote>
*
* Now when the user types 't', tau appears, and if the next character
* is 'h', the tau changes to a theta. This is accomplished by
* maintaining a cursor position (independent of the insertion point,
* and invisible in the GUI) across calls to
* <code>transliterate()</code>. Typically, the cursor will
* be coincident with the insertion point, but in a case like the one
* above, it will precede the insertion point.
*
* <p>Keyboard transliteration methods maintain a set of three indices
* that are updated with each call to
* <code>transliterate()</code>, including the cursor, start,
* and limit. These indices are changed by the method, and they are
* passed in and out via a Position object. The <code>start</code> index
* marks the beginning of the substring that the transliterator will
* look at. It is advanced as text becomes committed (but it is not
* the committed index; that's the <code>cursor</code>). The
* <code>cursor</code> index, described above, marks the point at
* which the transliterator last stopped, either because it reached
* the end, or because it required more characters to disambiguate
* between possible inputs. The <code>cursor</code> can also be
* explicitly set by rules in a <code>RuleBasedTransliterator</code>.
* Any characters before the <code>cursor</code> index are frozen;
* future keyboard transliteration calls within this input sequence
* will not change them. New text is inserted at the
* <code>limit</code> index, which marks the end of the substring that
* the transliterator looks at.
*
* <p>Because keyboard transliteration assumes that more characters
* are to arrive, it is conservative in its operation. It only
* transliterates when it can do so unambiguously. Otherwise it waits
* for more characters to arrive. When the client code knows that no
* more characters are forthcoming, perhaps because the user has
* performed some input termination operation, then it should call
* <code>finishTransliteration()</code> to complete any
* pending transliterations.
*
* <p><b>Inverses</b>
*
* <p>Pairs of transliterators may be inverses of one another. For
* example, if transliterator <b>A</b> transliterates characters by
* incrementing their Unicode value (so "abc" -> "def"), and
* transliterator <b>B</b> decrements character values, then <b>A</b>
* is an inverse of <b>B</b> and vice versa. If we compose <b>A</b>
* with <b>B</b> in a compound transliterator, the result is the
* identity transliterator, that is, a transliterator that does not
* change its input text.
*
* The <code>Transliterator</code> method <code>getInverse()</code>
* returns a transliterator's inverse, if one exists, or
* <code>null</code> otherwise. However, the result of
* <code>getInverse()</code> usually will <em>not</em> be a true
* mathematical inverse. This is because true inverse transliterators
* are difficult to formulate. For example, consider two
* transliterators: <b>AB</b>, which transliterates the character 'A'
* to 'B', and <b>BA</b>, which transliterates 'B' to 'A'. It might
* seem that these are exact inverses, since
*
*
* Now when the user types 't', tau appears, and if the next character is 'h', the tau changes to a theta. This is
* accomplished by maintaining a cursor position (independent of the insertion point, and invisible in the GUI) across
* calls to <code>transliterate()</code>. Typically, the cursor will be coincident with the insertion point, but in a
* case like the one above, it will precede the insertion point.
*
* <p>
* Keyboard transliteration methods maintain a set of three indices that are updated with each call to
* <code>transliterate()</code>, including the cursor, start, and limit. These indices are changed by the method, and
* they are passed in and out via a Position object. The <code>start</code> index marks the beginning of the substring
* that the transliterator will look at. It is advanced as text becomes committed (but it is not the committed index;
* that's the <code>cursor</code>). The <code>cursor</code> index, described above, marks the point at which the
* transliterator last stopped, either because it reached the end, or because it required more characters to
* disambiguate between possible inputs. The <code>cursor</code> can also be explicitly set by rules in a
* <code>RuleBasedTransliterator</code>. Any characters before the <code>cursor</code> index are frozen; future keyboard
* transliteration calls within this input sequence will not change them. New text is inserted at the <code>limit</code>
* index, which marks the end of the substring that the transliterator looks at.
*
* <p>
* Because keyboard transliteration assumes that more characters are to arrive, it is conservative in its operation. It
* only transliterates when it can do so unambiguously. Otherwise it waits for more characters to arrive. When the
* client code knows that no more characters are forthcoming, perhaps because the user has performed some input
* termination operation, then it should call <code>finishTransliteration()</code> to complete any pending
* transliterations.
*
* <p>
* <b>Inverses</b>
*
* <p>
* Pairs of transliterators may be inverses of one another. For example, if transliterator <b>A</b> transliterates
* characters by incrementing their Unicode value (so "abc" -> "def"), and transliterator <b>B</b> decrements character
* values, then <b>A</b> is an inverse of <b>B</b> and vice versa. If we compose <b>A</b> with <b>B</b> in a compound
* transliterator, the result is the identity transliterator, that is, a transliterator that does not change its input
* text.
*
* The <code>Transliterator</code> method <code>getInverse()</code> returns a transliterator's inverse, if one exists,
* or <code>null</code> otherwise. However, the result of <code>getInverse()</code> usually will <em>not</em> be a true
* mathematical inverse. This is because true inverse transliterators are difficult to formulate. For example, consider
* two transliterators: <b>AB</b>, which transliterates the character 'A' to 'B', and <b>BA</b>, which transliterates
* 'B' to 'A'. It might seem that these are exact inverses, since
*
* <blockquote>"A" x <b>AB</b> -> "B"<br>
* "B" x <b>BA</b> -> "A"</blockquote>
*
* where 'x' represents transliteration. However,
*
*
* where 'x' represents transliteration. However,
*
* <blockquote>"ABCD" x <b>AB</b> -> "BBCD"<br>
* "BBCD" x <b>BA</b> -> "AACD"</blockquote>
*
* so <b>AB</b> composed with <b>BA</b> is not the
* identity. Nonetheless, <b>BA</b> may be usefully considered to be
* <b>AB</b>'s inverse, and it is on this basis that
* <b>AB</b><code>.getInverse()</code> could legitimately return
*
* so <b>AB</b> composed with <b>BA</b> is not the identity. Nonetheless, <b>BA</b> may be usefully considered to be
* <b>AB</b>'s inverse, and it is on this basis that <b>AB</b><code>.getInverse()</code> could legitimately return
* <b>BA</b>.
*
* <p><b>IDs and display names</b>
*
* <p>A transliterator is designated by a short identifier string or
* <em>ID</em>. IDs follow the format <em>source-destination</em>,
* where <em>source</em> describes the entity being replaced, and
* <em>destination</em> describes the entity replacing
* <em>source</em>. The entities may be the names of scripts,
* particular sequences of characters, or whatever else it is that the
* transliterator converts to or from. For example, a transliterator
* from Russian to Latin might be named "Russian-Latin". A
* transliterator from keyboard escape sequences to Latin-1 characters
* might be named "KeyboardEscape-Latin1". By convention, system
* entity names are in English, with the initial letters of words
* capitalized; user entity names may follow any format so long as
* they do not contain dashes.
*
* <p>In addition to programmatic IDs, transliterator objects have
* display names for presentation in user interfaces, returned by
* {@link #getDisplayName}.
*
* <p><b>Factory methods and registration</b>
*
* <p>In general, client code should use the factory method
* <code>getInstance()</code> to obtain an instance of a
* transliterator given its ID. Valid IDs may be enumerated using
* <code>getAvailableIDs()</code>. Since transliterators are
* stateless, multiple calls to <code>getInstance()</code> with the
* same ID will return the same object.
*
* <p>In addition to the system transliterators registered at startup,
* user transliterators may be registered by calling
* <code>registerInstance()</code> at run time. To register a
* transliterator subclass without instantiating it (until it is
* needed), users may call <code>registerClass()</code>.
*
* <p><b>Composed transliterators</b>
*
* <p>In addition to built-in system transliterators like
* "Latin-Greek", there are also built-in <em>composed</em>
* transliterators. These are implemented by composing two or more
* component transliterators. For example, if we have scripts "A",
* "B", "C", and "D", and we want to transliterate between all pairs
* of them, then we need to write 12 transliterators: "A-B", "A-C",
* "A-D", "B-A",..., "D-A", "D-B", "D-C". If it is possible to
* convert all scripts to an intermediate script "M", then instead of
* writing 12 rule sets, we only need to write 8: "A~M", "B~M", "C~M",
* "D~M", "M~A", "M~B", "M~C", "M~D". (This might not seem like a big
* win, but it's really 2<em>n</em> vs. <em>n</em><sup>2</sup> -
* <em>n</em>, so as <em>n</em> gets larger the gain becomes
* significant. With 9 scripts, it's 18 vs. 72 rule sets, a big
* difference.) Note the use of "~" rather than "-" for the script
* separator here; this indicates that the given transliterator is
* intended to be composed with others, rather than be used as is.
*
* <p>Composed transliterators can be instantiated as usual. For
* example, the system transliterator "Devanagari-Gujarati" is a
* composed transliterator built internally as
* "Devanagari~InterIndic;InterIndic~Gujarati". When this
* transliterator is instantiated, it appears externally to be a
* standard transliterator (e.g., getID() returns
*
* <p>
* <b>Filtering</b>
* <p>Each transliterator has a filter, which restricts changes to those characters selected by the filter. The
* filter affects just the characters that are changed -- the characters outside of the filter are still part of the
* context for the filter. For example, in the following even though 'x' is filtered out, and doesn't convert to y, it does affect the conversion of 'a'.
*
* <pre>
* String rules = &quot;x &gt; y; x{a} &gt; b; &quot;;
* Transliterator tempTrans = Transliterator.createFromRules(&quot;temp&quot;, rules, Transliterator.FORWARD);
* tempTrans.setFilter(new UnicodeSet(&quot;[a]&quot;));
* String tempResult = tempTrans.transform(&quot;xa&quot;);
* // results in &quot;xb&quot;
*</pre>
* <p>
* <b>IDs and display names</b>
*
* <p>
* A transliterator is designated by a short identifier string or <em>ID</em>. IDs follow the format
* <em>source-destination</em>, where <em>source</em> describes the entity being replaced, and <em>destination</em>
* describes the entity replacing <em>source</em>. The entities may be the names of scripts, particular sequences of
* characters, or whatever else it is that the transliterator converts to or from. For example, a transliterator from
* Russian to Latin might be named "Russian-Latin". A transliterator from keyboard escape sequences to Latin-1
* characters might be named "KeyboardEscape-Latin1". By convention, system entity names are in English, with the
* initial letters of words capitalized; user entity names may follow any format so long as they do not contain dashes.
*
* <p>
* In addition to programmatic IDs, transliterator objects have display names for presentation in user interfaces,
* returned by {@link #getDisplayName}.
*
* <p>
* <b>Factory methods and registration</b>
*
* <p>
* In general, client code should use the factory method <code>getInstance()</code> to obtain an instance of a
* transliterator given its ID. Valid IDs may be enumerated using <code>getAvailableIDs()</code>. Since transliterators
* are stateless, multiple calls to <code>getInstance()</code> with the same ID will return the same object.
*
* <p>
* In addition to the system transliterators registered at startup, user transliterators may be registered by calling
* <code>registerInstance()</code> at run time. To register a transliterator subclass without instantiating it (until it
* is needed), users may call <code>registerClass()</code>.
*
* <p>
* <b>Composed transliterators</b>
*
* <p>
* In addition to built-in system transliterators like "Latin-Greek", there are also built-in <em>composed</em>
* transliterators. These are implemented by composing two or more component transliterators. For example, if we have
* scripts "A", "B", "C", and "D", and we want to transliterate between all pairs of them, then we need to write 12
* transliterators: "A-B", "A-C", "A-D", "B-A",..., "D-A", "D-B", "D-C". If it is possible to convert all scripts to an
* intermediate script "M", then instead of writing 12 rule sets, we only need to write 8: "A~M", "B~M", "C~M", "D~M",
* "M~A", "M~B", "M~C", "M~D". (This might not seem like a big win, but it's really 2<em>n</em> vs. <em>n</em>
* <sup>2</sup> - <em>n</em>, so as <em>n</em> gets larger the gain becomes significant. With 9 scripts, it's 18 vs. 72
* rule sets, a big difference.) Note the use of "~" rather than "-" for the script separator here; this indicates that
* the given transliterator is intended to be composed with others, rather than be used as is.
*
* <p>
* Composed transliterators can be instantiated as usual. For example, the system transliterator "Devanagari-Gujarati"
* is a composed transliterator built internally as "Devanagari~InterIndic;InterIndic~Gujarati". When this
* transliterator is instantiated, it appears externally to be a standard transliterator (e.g., getID() returns
* "Devanagari-Gujarati").
*
* <p><b>Subclassing</b>
*
* <p>Subclasses must implement the abstract method
* <code>handleTransliterate()</code>. <p>Subclasses should override
* the <code>transliterate()</code> method taking a
* <code>Replaceable</code> and the <code>transliterate()</code>
* method taking a <code>String</code> and <code>StringBuffer</code>
* if the performance of these methods can be improved over the
* performance obtained by the default implementations in this class.
*
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
*
*
* <p>
* <b>Subclassing</b>
*
* <p>
* Subclasses must implement the abstract method <code>handleTransliterate()</code>.
* <p>
* Subclasses should override the <code>transliterate()</code> method taking a <code>Replaceable</code> and the
* <code>transliterate()</code> method taking a <code>String</code> and <code>StringBuffer</code> if the performance of
* these methods can be improved over the performance obtained by the default implementations in this class.
*
* <p>
* Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @author Alan Liu
* @stable ICU 2.0
*/
@ -1418,7 +1392,7 @@ public abstract class Transliterator implements StringTransform {
t = new NullTransliterator();
}
else if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 1) {
t = new RuleBasedTransliterator(ID, parser.dataVector.get(0), null);
t = new RuleBasedTransliterator(ID, parser.dataVector.get(0), parser.compoundFilter);
}
else if (parser.idBlockVector.size() == 1 && parser.dataVector.size() == 0) {
// idBlock, no data -- this is an alias. The ID has
@ -1536,6 +1510,8 @@ public abstract class Transliterator implements StringTransform {
return result;
}
/**
 * Frozen set containing every code point from 0 through 0x10FFFF. Passed as the external filter by
 * getSourceSet() and getTargetSet(), so that intersecting it with this transliterator's filter leaves
 * just that filter.
 */
static final UnicodeSet ALL_CODEPOINTS = new UnicodeSet(0,0x10FFFF).freeze();
/**
* Returns the set of all characters that may be modified in the
* input text by this Transliterator. This incorporates this
@ -1550,20 +1526,9 @@ public abstract class Transliterator implements StringTransform {
* @stable ICU 2.2
*/
public final UnicodeSet getSourceSet() {
    // Delegate to addSourceTargetSet() so that the source set is derived by the same logic
    // (and the same filter merging) as the target set. The target set that the call also
    // fills in is not needed here, so it is collected into a throwaway set.
    UnicodeSet result = new UnicodeSet();
    addSourceTargetSet(getFilterAsUnicodeSet(ALL_CODEPOINTS), result, new UnicodeSet());
    return result;
}
/**
@ -1595,7 +1560,78 @@ public abstract class Transliterator implements StringTransform {
* @stable ICU 2.2
*/
public UnicodeSet getTargetSet() {
    // Delegate to addSourceTargetSet() so that the target set is derived together with the
    // source set. The source set that the call also fills in is not needed here, so it is
    // collected into a throwaway set.
    UnicodeSet result = new UnicodeSet();
    addSourceTargetSet(getFilterAsUnicodeSet(ALL_CODEPOINTS), new UnicodeSet(), result);
    return result;
}
/**
 * Adds to <code>sourceSet</code> the characters that this transliterator may modify, and to
 * <code>targetSet</code> the characters that it may generate as replacement text, filtered by BOTH the
 * input filter and the current getFilter().
 * <p>SHOULD BE OVERRIDDEN BY SUBCLASSES.
 * It is probably an error for any transliterator to NOT override this, but we can't force them to
 * for backwards compatibility.
 * <p>Other methods vector through this.
 * <p>This default implementation takes handleGetSourceSet() restricted to the merged filter as the source,
 * and makes a rough guess at the target: each source element is transliterated on its own, and the characters
 * of every result that differs from its input are added to the target.
 * <p>When gathering the information on source and target, the compound transliterator makes things complicated.
 * For example, suppose we have:
 * <pre>
 * Global FILTER = [ax]
 * a > b;
 * :: NULL;
 * b > c;
 * x > d;
 * </pre>
 * While the filter just allows a and x, b is an intermediate result, which could produce c. So the source and
 * target sets cannot be gathered independently. What we have to do is filter the sources for the first
 * transliterator according to the global filter, intersected with that transliterator's filter. Based on that
 * we get the target. The next transliterator gets as a global filter (global + last target). And so on.
 * <p>There is another complication:
 * <pre>
 * Global FILTER = [ax]
 * a > |b;
 * b > c;
 * </pre>
 * Even though b would be filtered from the input, whenever we have a backup, it could be part of the input.
 * So ideally we will change the global filter as we go.
 * @param inputFilter external filter that is merged with this transliterator's own filter
 * @param sourceSet set to which the filtered source characters are added
 * @param targetSet set to which the characters of changed transliteration results are added
 * @see #getTargetSet
 * @internal
 */
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    UnicodeSet mergedFilter = getFilterAsUnicodeSet(inputFilter);
    // Fall back on the old source-set method, since there is nothing better at this level.
    UnicodeSet filteredSources = new UnicodeSet(handleGetSourceSet()).retainAll(mergedFilter);
    sourceSet.addAll(filteredSources);
    // Rough guess at the target: keep whatever comes out differently from what went in.
    for (String source : filteredSources) {
        String result = transliterate(source);
        if (source.equals(result)) {
            continue;
        }
        targetSet.addAll(result);
    }
}
/**
 * Returns the intersection of this instance's filter with an external filter.
 * <p>If this instance has no filter, <code>externalFilter</code> itself is returned (not a copy);
 * otherwise a new, frozen set is returned. Either way the result may be frozen, so don't attempt to modify it.
 * <p>NOTE(review): earlier documentation stated that the external filter "is frozen if not" already frozen;
 * this method does not freeze it, so callers should pass a set they do not intend to change.
 * @param externalFilter filter to intersect with this instance's filter; it is not modified
 * @return the merged filter
 * @internal
 */
// TODO change to getMergedFilter
public UnicodeSet getFilterAsUnicodeSet(UnicodeSet externalFilter) {
    if (filter == null) {
        return externalFilter;
    }
    UnicodeSet merged = new UnicodeSet(externalFilter);
    // Most, but not all, filters will be UnicodeSets: use those directly, and only build a
    // match set for other filter implementations. An explicit type check replaces the former
    // reliance on catching ClassCastException.
    UnicodeSet ownFilterSet;
    if (filter instanceof UnicodeSet) {
        ownFilterSet = (UnicodeSet) filter;
    } else {
        ownFilterSet = new UnicodeSet();
        filter.addMatchSetTo(ownFilterSet);
    }
    return merged.retainAll(ownFilterSet).freeze();
}
/**

View File

@ -8,6 +8,7 @@
**********************************************************************
*/
package com.ibm.icu.text;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
/**
@ -248,4 +249,38 @@ class UnescapeTransliterator extends Transliterator {
pos.limit = limit;
pos.start = start;
}
/* (non-Javadoc)
 * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
 */
@Override
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    // spec is a sequence of forms terminated by END. Each form consists of a prefix, suffix,
    // radix, minimum digit count, and maximum digit count, stored as a five character header.
    // As read below, header slots 0 and 1 are lengths whose sum is the number of literal
    // characters that follow the header (presumably the prefix and suffix lengths), and
    // slot 2 is the radix.
    UnicodeSet mergedFilter = getFilterAsUnicodeSet(inputFilter);
    UnicodeSet candidates = new UnicodeSet();
    StringBuilder digits = new StringBuilder();
    int pos = 0;
    while (spec[pos] != END) {
        int radix = spec[pos + 2];
        int next = pos + spec[pos] + spec[pos + 1] + 5;
        // every single digit value of this form's radix can appear in the source
        for (int digit = 0; digit < radix; ++digit) {
            Utility.appendNumber(digits, digit, radix, 0);
        }
        // so can the literal characters stored after the header
        for (int k = pos + 5; k < next; ++k) {
            candidates.add(spec[k]);
        }
        // advance to the next form
        pos = next;
    }
    candidates.addAll(digits.toString());
    candidates.retainAll(mergedFilter);
    if (candidates.size() > 0) {
        sourceSet.addAll(candidates);
        // assume we can produce any character
        targetSet.addAll(0, 0x10FFFF);
    }
}
}

View File

@ -70,4 +70,25 @@ class UnicodeNameTransliterator extends Transliterator {
offsets.limit = limit;
offsets.start = cursor;
}
/* (non-Javadoc)
 * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
 */
@Override
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    UnicodeSet mergedFilter = getFilterAsUnicodeSet(inputFilter);
    if (mergedFilter.size() <= 0) {
        // nothing passes the filter, so nothing is read or produced
        return;
    }
    // Every character that passes the filter is treated as a source.
    sourceSet.addAll(mergedFilter);
    // The target is limited to the characters listed here plus the delimiters.
    targetSet.addAll('0', '9');
    targetSet.addAll('A', 'Z');
    targetSet.add('-');
    targetSet.add(' ');
    targetSet.addAll(OPEN_DELIM);
    targetSet.add(CLOSE_DELIM);
    // the remaining additions are for controls
    targetSet.addAll('a', 'z');
    targetSet.add('<');
    targetSet.add('>');
    targetSet.add('(');
    targetSet.add(')');
}
}

View File

@ -7,6 +7,7 @@
package com.ibm.icu.text;
import com.ibm.icu.impl.UCaseProps;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.util.ULocale;
/**
@ -105,4 +106,24 @@ class UppercaseTransliterator extends Transliterator {
}
offsets.start = offsets.limit;
}
// NOTE: normally this would be static, but because the results vary by locale....
// Created lazily, under this instance's lock, on the first addSourceTargetSet() call.
SourceTargetUtility sourceTargetUtility = null;

/* (non-Javadoc)
 * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
 */
@Override
public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
    synchronized (this) {
        if (sourceTargetUtility == null) {
            // The utility is driven by the same uppercasing, in this instance's locale.
            Transform<String,String> toUpper = new Transform<String,String>() {
                public String transform(String source) {
                    return UCharacter.toUpperCase(locale, source);
                }
            };
            sourceTargetUtility = new SourceTargetUtility(toUpper);
        }
    }
    sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
}
}

View File

@ -13,13 +13,18 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map.Entry;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.impl.UtilityExtensions;
import com.ibm.icu.lang.CharSequences;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.CanonicalIterator;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.Replaceable;
import com.ibm.icu.text.ReplaceableString;
import com.ibm.icu.text.StringTransform;
@ -28,6 +33,7 @@ import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeFilter;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.text.Normalizer2.Mode;
import com.ibm.icu.util.CaseInsensitiveString;
import com.ibm.icu.util.ULocale;
@ -480,6 +486,16 @@ public class TransliteratorTest extends TestFmwk {
* Do some basic tests of filtering.
*/
public void TestFiltering() {
Transliterator tempTrans = Transliterator.createFromRules("temp", "x > y; x{a} > b; ", Transliterator.FORWARD);
tempTrans.setFilter(new UnicodeSet("[a]"));
String tempResult = tempTrans.transform("xa");
assertEquals("context should not be filtered ", "xb", tempResult);
tempTrans = Transliterator.createFromRules("temp", "::[a]; x > y; x{a} > b; ", Transliterator.FORWARD);
tempResult = tempTrans.transform("xa");
assertEquals("context should not be filtered ", "xb", tempResult);
Transliterator hex = Transliterator.getInstance("Any-Hex");
hex.setFilter(new UnicodeFilter() {
public boolean contains(int c) {
@ -2997,6 +3013,358 @@ public class TransliteratorTest extends TestFmwk {
}
}
/**
 * Compares getSourceSet() and getTargetSet() of several rule-built transliterators with sets gathered
 * empirically by transforming a large collection of test strings.
 * <p>The test strings are built from CanonicalIterator results for decompositions, from combinations with
 * non-starter characters, and finally from every code point. For each string whose transform differs from
 * the input, and for which isAtomic() returns true, the characters of the input are recorded as empirical
 * sources and the characters of the output as empirical targets. The empirical sets are then checked against
 * the reported sets with assertEquals(String, UnicodeSet, UnicodeSet, SetAssert).
 */
public void TestSourceTargetSet2() {
// NOTE(review): nfkd is requested with the name "NFKC" plus Mode.DECOMPOSE, and is referenced only from
// commented-out code below.
Normalizer2 nfkd = Normalizer2.getInstance(null, "NFKC", Mode.DECOMPOSE);
Normalizer2 nfc = Normalizer2.getInstance(null, "NFC", Mode.COMPOSE);
Normalizer2 nfd = Normalizer2.getInstance(null, "NFC", Mode.DECOMPOSE);
// UnicodeSet nfkdSource = new UnicodeSet();
// UnicodeSet nfkdTarget = new UnicodeSet();
// for (int i = 0; i <= 0x10FFFF; ++i) {
// if (nfkd.isInert(i)) {
// continue;
// }
// nfkdSource.add(i);
// String t = nfkd.getDecomposition(i);
// if (t != null) {
// nfkdTarget.addAll(t);
// } else {
// nfkdTarget.add(i);
// }
// }
// nfkdSource.freeze();
// nfkdTarget.freeze();
// logln("NFKD Source: " + nfkdSource.toPattern(false));
// logln("NFKD Target: " + nfkdTarget.toPattern(false));
// Keyed by the first code point of a decomposition: the characters that follow it (leadToTrail),
// and the code points whose decomposition starts with it (leadToSources).
UnicodeMap<UnicodeSet> leadToTrail = new UnicodeMap();
UnicodeMap<UnicodeSet> leadToSources = new UnicodeMap();
UnicodeSet nonStarters = new UnicodeSet("[:^ccc=0:]").freeze();
CanonicalIterator can = new CanonicalIterator("");
// Accumulates the test strings; despite the name it also receives every code point further down.
UnicodeSet disorderedMarks = new UnicodeSet();
// Pass 1: for every code point with a decomposition, add all strings the CanonicalIterator
// yields for that decomposition, and record lead/trail information.
for (int i = 0; i <= 0x10FFFF; ++i) {
String s = nfd.getDecomposition(i);
if (s == null) {
continue;
}
can.setSource(s);
for (String t = can.next(); t != null; t = can.next()) {
disorderedMarks.add(t);
}
// if s has two code points, (or more), add the lead/trail information
int first = s.codePointAt(0);
int firstCount = Character.charCount(first);
if (s.length() == firstCount) continue;
String trailString = s.substring(firstCount);
// add all the trail characters
// (only when the trailing part contains at least one non-starter)
if (!nonStarters.containsSome(trailString)) {
continue;
}
UnicodeSet trailSet = leadToTrail.get(first);
if (trailSet == null) {
leadToTrail.put(first, trailSet = new UnicodeSet());
}
trailSet.addAll(trailString); // add remaining trails
// add the sources
UnicodeSet sourcesSet = leadToSources.get(first);
if (sourcesSet == null) {
leadToSources.put(first, sourcesSet = new UnicodeSet());
}
sourcesSet.add(i);
}
// Pass 2: append each trail recorded for a lead to each source recorded for that lead, and add
// every CanonicalIterator result that does not end with that trail.
for (Entry<String, UnicodeSet> x : leadToSources.entrySet()) {
String lead = x.getKey();
UnicodeSet sources = x.getValue();
UnicodeSet trailSet = leadToTrail.get(lead);
for (String source : sources) {
for (String trail : trailSet) {
can.setSource(source + trail);
for (String t = can.next(); t != null; t = can.next()) {
if (t.endsWith(trail)) continue;
disorderedMarks.add(t);
}
}
}
}
// Pass 3: pair every non-starter with U+0345 in front and with U+0323 behind. The nfc check only
// logs "??" when normalizing "Ǭ" + s does not keep "Ǭ" at the start.
for (String s : nonStarters) {
disorderedMarks.add("\u0345" + s);
disorderedMarks.add(s+"\u0323");
String xx = nfc.normalize("Ǭ" + s);
if (!xx.startsWith("Ǭ")) {
logln("??");
}
}
// for (int i = 0; i <= 0x10FFFF; ++i) {
// String s = nfkd.getDecomposition(i);
// if (s != null) {
// disorderedMarks.add(s);
// disorderedMarks.add(nfc.normalize(s));
// addDerivedStrings(nfc, disorderedMarks, s);
// }
// s = nfd.getDecomposition(i);
// if (s != null) {
// disorderedMarks.add(s);
// }
// if (!nfc.isInert(i)) {
// if (i == 0x00C0) {
// logln("À");
// }
// can.setSource(s+"\u0334");
// for (String t = can.next(); t != null; t = can.next()) {
// addDerivedStrings(nfc, disorderedMarks, t);
// }
// can.setSource(s+"\u0345");
// for (String t = can.next(); t != null; t = can.next()) {
// addDerivedStrings(nfc, disorderedMarks, t);
// }
// can.setSource(s+"\u0323");
// for (String t = can.next(); t != null; t = can.next()) {
// addDerivedStrings(nfc, disorderedMarks, t);
// }
// }
// }
// The size is logged before all code points are added, so it counts only the generated cases.
logln("Test cases: " + disorderedMarks.size());
disorderedMarks.addAll(0,0x10FFFF).freeze();
logln("isInert \u0104 " + nfc.isInert('\u0104'));
// Each row holds the rule text in slot 0. Slot 1 is null in every row and is referenced only from
// commented-out code below.
Object[][] rules = {
{":: [:sc=COMMON:] any-name;", null},
{":: [:Greek:] hex-any/C;", null},
{":: [:Greek:] any-hex/C;", null},
{":: [[:Mn:][:Me:]] remove;", null},
{":: [[:Mn:][:Me:]] null;", null},
{":: lower;", null},
{":: upper;", null},
{":: title;", null},
{":: CaseFold;", null},
{":: NFD;", null},
{":: NFC;", null},
{":: NFKD;", null},
{":: NFKC;", null},
{":: [[:Mn:][:Me:]] NFKD;", null},
{":: Latin-Greek;", null},
{":: [:Latin:] NFKD;", null},
{":: NFKD;", null},
{":: NFKD;\n" +
":: [[:Mn:][:Me:]] remove;\n" +
":: NFC;", null},
};
// For each rule: build the transliterator, ask it for its source and target sets, then gather the
// empirical sets by transforming every test string.
for (Object[] rulex : rules) {
String rule = (String) rulex[0];
Transliterator trans = Transliterator.createFromRules("temp", rule, Transliterator.FORWARD);
UnicodeSet actualSource = trans.getSourceSet();
UnicodeSet actualTarget = trans.getTargetSet();
UnicodeSet empiricalSource = new UnicodeSet();
UnicodeSet empiricalTarget = new UnicodeSet();
// Newlines in multi-line rules are replaced with tabs for the assertion messages.
String ruleDisplay = rule.replace("\n", "\t\t");
UnicodeSet toTest = disorderedMarks;
// if (rulex[1] != null) {
// toTest = new UnicodeSet(disorderedMarks);
// toTest.addAll((UnicodeSet) rulex[1]);
// }
// Debugging aids: 'test' is one specific string that is logged when encountered, and 'count' is
// only ever incremented, never read.
String test = nfd.normalize("Ą");
boolean DEBUG = true;
int count = 0; // for debugging
for (String s : toTest) {
if (s.equals(test)) {
logln(test);
}
String t = trans.transform(s);
if (!s.equals(t)) {
// Skip changes that isAtomic() reports as non-atomic.
// NOTE(review): the second isAtomic() call discards its result; it looks like a hook for
// stepping through the check in a debugger - confirm before removing.
if (!isAtomic(s, t, trans)) {
isAtomic(s, t, trans);
continue;
}
// only keep the part that changed; so skip the front and end.
// int start = findSharedStartLength(s,t);
// int end = findSharedEndLength(s,t);
// if (start != 0 || end != 0) {
// s = s.substring(start, s.length() - end);
// t = t.substring(start, t.length() - end);
// }
if (DEBUG) {
if (!actualSource.containsAll(s)) {
count++;
}
if (!actualTarget.containsAll(t)) {
count++;
}
}
addSourceTarget(s, empiricalSource, t, empiricalTarget);
}
}
// Compare the empirical sets with the sets reported by the transliterator.
assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, SetAssert.MISSING_OK);
assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, SetAssert.MISSING_OK);
}
}
/**
 * Reports whether the change from <code>s</code> to <code>t</code> cannot be reproduced by cutting
 * <code>s</code> in two at a character boundary and transforming the two pieces independently.
 *
 * @param s the source string
 * @param t the full result that the transformed pieces are compared against
 * @param trans the transliterator applied to the pieces of <code>s</code>
 * @return <code>false</code> if, for some split position, the transformed prefix followed by the
 *         transformed suffix is exactly <code>t</code>; <code>true</code> otherwise
 */
private boolean isAtomic(String s, String t, Transliterator trans) {
    for (int split = 1; split < s.length(); ++split) {
        if (CharSequences.onCharacterBoundary(s, split)) {
            String head = trans.transform(s.substring(0, split));
            if (!t.startsWith(head)) {
                // the prefix alone already disagrees; no need to transform the suffix
                continue;
            }
            String tail = trans.transform(s.substring(split));
            boolean lengthsAddUp = t.length() == head.length() + tail.length();
            if (lengthsAddUp && t.endsWith(tail)) {
                return false;
            }
        }
    }
    return true;
}
/**
 * Records one observed transformation: the characters of <code>s</code> go into the source set and,
 * unless <code>t</code> is empty, the characters of <code>t</code> go into the target set.
 *
 * @param s the input string
 * @param expectedSource set collecting source characters
 * @param t the transformed string
 * @param expectedTarget set collecting target characters
 */
private void addSourceTarget(String s, UnicodeSet expectedSource, String t, UnicodeSet expectedTarget) {
    expectedSource.addAll(s);
    if (t.length() == 0) {
        return;
    }
    expectedTarget.addAll(t);
}
/**
 * Adds <code>s</code> to the set and, for each interior character boundary of <code>s</code>, adds both
 * the prefix ending there and the string formed by normalizing that prefix and re-attaching the
 * untouched remainder.
 *
 * @param nfc normalizer applied to each prefix
 * @param disorderedMarks set that receives the strings
 * @param s the string to derive from
 */
private void addDerivedStrings(Normalizer2 nfc, UnicodeSet disorderedMarks, String s) {
    disorderedMarks.add(s);
    for (int split = 1; split < s.length(); ++split) {
        if (!CharSequences.onCharacterBoundary(s, split)) {
            continue;
        }
        String prefix = s.substring(0, split);
        disorderedMarks.add(prefix);
        String remainder = s.substring(split);
        disorderedMarks.add(nfc.normalize(prefix) + remainder);
    }
}
/**
 * Table-driven checks of findSharedStartLength() and findSharedEndLength(). In each row, slot 0 is the
 * expected result and slots 1 and 2 are the two strings to compare.
 */
public void TestCharUtils() {
    String[][] startTests = {
        {"1", "a", "ab"},
        {"0", "a", "xb"},
        {"0", "\uD800", "\uD800\uDC01"},
        {"1", "\uD800a", "\uD800b"},
        {"0", "\uD800\uDC00", "\uD800\uDC01"},
    };
    for (String[] testCase : startTests) {
        String first = testCase[1];
        String second = testCase[2];
        int actual = findSharedStartLength(first, second);
        int expected = Integer.parseInt(testCase[0]);
        assertEquals("findSharedStartLength(" + first + "," + second + ")", expected, actual);
    }

    String[][] endTests = {
        {"0", "\uDC00", "\uD801\uDC00"},
        {"1", "a", "ba"},
        {"0", "a", "bx"},
        {"1", "a\uDC00", "b\uDC00"},
        {"0", "\uD800\uDC00", "\uD801\uDC00"},
    };
    for (String[] testCase : endTests) {
        String first = testCase[1];
        String second = testCase[2];
        int actual = findSharedEndLength(first, second);
        int expected = Integer.parseInt(testCase[0]);
        assertEquals("findSharedEndLength(" + first + "," + second + ")", expected, actual);
    }
}
/**
 * Finds how many leading chars two sequences have in common, backing off by one char when the end of
 * the common run is not a character boundary (as reported by CharSequences.onCharacterBoundary) in
 * either sequence.
 *
 * @param s first sequence
 * @param t second sequence
 * @return the length, in chars, of the shared start
 */
// TODO make generally available
private static int findSharedStartLength(CharSequence s, CharSequence t) {
    final int limit = Math.min(s.length(), t.length());
    int shared = 0;
    while (shared < limit && s.charAt(shared) == t.charAt(shared)) {
        ++shared;
    }
    boolean onBoundary = CharSequences.onCharacterBoundary(s, shared)
            && CharSequences.onCharacterBoundary(t, shared);
    return onBoundary ? shared : shared - 1;
}
/**
 * Finds how many trailing chars two sequences have in common, backing off by one char when the start
 * of the common run is not a character boundary (as reported by CharSequences.onCharacterBoundary) in
 * either sequence.
 *
 * @param s first sequence
 * @param t second sequence
 * @return the length, in chars, of the shared end
 */
// TODO make generally available
private static int findSharedEndLength(CharSequence s, CharSequence t) {
    final int sLen = s.length();
    final int tLen = t.length();
    final int limit = Math.min(sLen, tLen);
    // TODO can make the calculations slightly faster... Not sure if it is worth the complication, tho'
    int shared = 0;
    while (shared < limit && s.charAt(sLen - shared - 1) == t.charAt(tLen - shared - 1)) {
        ++shared;
    }
    boolean onBoundary = CharSequences.onCharacterBoundary(s, sLen - shared)
            && CharSequences.onCharacterBoundary(t, tLen - shared);
    return onBoundary ? shared : shared - 1;
}
/**
 * Comparison mode passed to assertEquals(String, UnicodeSet, UnicodeSet, SetAssert).
 * NOTE(review): that method does not currently read the mode, so the intended meaning of each
 * constant cannot be confirmed from this file.
 */
enum SetAssert {EQUALS, MISSING_OK, EXTRA_OK}
/**
 * Compares a set reported by a transliterator with one gathered empirically. Elements of
 * <code>empirical</code> that are absent from <code>actual</code> are reported with errln(); elements of
 * <code>actual</code> that are absent from <code>empirical</code> are only logged as a warning. If neither
 * occurs, an "OK" line is logged.
 *
 * @param message prefix for every line that is emitted
 * @param empirical the set gathered by observation
 * @param actual the set reported by the transliterator
 * @param setAssert comparison mode; not consulted by this implementation
 */
void assertEquals(String message, UnicodeSet empirical, UnicodeSet actual, SetAssert setAssert) {
    boolean clean = true;
    if (!actual.containsAll(empirical)) {
        UnicodeSet missing = new UnicodeSet(empirical).removeAll(actual);
        String detail = " \tgetXSet < empirical (" + missing.size() + "): " + toPattern(missing);
        errln(message + detail);
        clean = false;
    }
    if (!empirical.containsAll(actual)) {
        UnicodeSet extra = new UnicodeSet(actual).removeAll(empirical);
        String detail = " \tgetXSet > empirical (" + extra.size() + "): " + toPattern(extra);
        logln("WARNING: " + message + detail);
        clean = false;
    }
    if (clean) {
        logln("OK " + message + ' ' + toPattern(empirical));
    }
}
/**
 * Renders a set as a pattern string for log output, cut to at most 200 chars (199 when index 200 is not
 * a character boundary according to CharSequences.onCharacterBoundary).
 *
 * @param missing the set to render
 * @return the pattern, truncated if it is 200 chars or longer
 */
private String toPattern(UnicodeSet missing) {
    final String pattern = missing.toPattern(false);
    if (pattern.length() >= 200) {
        int cut = CharSequences.onCharacterBoundary(pattern, 200) ? 200 : 199;
        // NOTE(review): the appended empty string is a no-op; possibly a truncation marker was
        // intended here - confirm.
        return pattern.substring(0, cut) + "";
    }
    return pattern;
}
/**
* Test handling of rule whitespace, for both RBT and UnicodeSet.
*/
@ -3741,7 +4109,7 @@ the ::BEGIN/::END stuff)
Transliterator.createFromRules("gif", "\\", Transliterator.FORWARD);
} catch(Exception e){
errln("TransliteratorParser.nextLine() was not suppose to return an " +
"exception for a rule of '\\'");
"exception for a rule of '\\'");
}
}
}