From 9ffcb85ba127a118317c1a4ba7e331cf4cf00c34 Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Tue, 14 Dec 2010 07:51:00 +0000 Subject: [PATCH] ICU-8227 Whew, there were lots of problems in the way the old code was done. Now working much better. X-SVN-Rev: 29207 --- .gitattributes | 1 + .../com/ibm/icu/text/AnyTransliterator.java | 13 + .../com/ibm/icu/text/BreakTransliterator.java | 12 + .../ibm/icu/text/CaseFoldTransliterator.java | 20 + .../ibm/icu/text/CompoundTransliterator.java | 28 +- .../ibm/icu/text/EscapeTransliterator.java | 19 + .../ibm/icu/text/LowercaseTransliterator.java | 21 + .../icu/text/NameUnicodeTransliterator.java | 27 + .../icu/text/NormalizationTransliterator.java | 51 +- .../com/ibm/icu/text/NullTransliterator.java | 8 + .../ibm/icu/text/RemoveTransliterator.java | 13 +- .../ibm/icu/text/RuleBasedTransliterator.java | 38 +- .../com/ibm/icu/text/SourceTargetUtility.java | 133 +++++ .../ibm/icu/text/TitlecaseTransliterator.java | 21 + .../com/ibm/icu/text/TransliterationRule.java | 53 +- .../ibm/icu/text/TransliterationRuleSet.java | 18 +- .../src/com/ibm/icu/text/Transliterator.java | 476 ++++++++++-------- .../ibm/icu/text/UnescapeTransliterator.java | 35 ++ .../icu/text/UnicodeNameTransliterator.java | 21 + .../ibm/icu/text/UppercaseTransliterator.java | 21 + .../dev/test/translit/TransliteratorTest.java | 370 +++++++++++++- 21 files changed, 1121 insertions(+), 278 deletions(-) create mode 100644 icu4j/main/classes/translit/src/com/ibm/icu/text/SourceTargetUtility.java diff --git a/.gitattributes b/.gitattributes index a161e067b3..1f16107fba 100644 --- a/.gitattributes +++ b/.gitattributes @@ -262,6 +262,7 @@ icu4j/main/classes/translit/.externalToolBuilders/copy-data-translit.launch -tex icu4j/main/classes/translit/.settings/org.eclipse.core.resources.prefs -text icu4j/main/classes/translit/.settings/org.eclipse.jdt.core.prefs -text icu4j/main/classes/translit/.settings/org.eclipse.jdt.ui.prefs -text 
+icu4j/main/classes/translit/src/com/ibm/icu/text/SourceTargetUtility.java -text icu4j/main/classes/translit/translit-build.launch -text icu4j/main/shared/.project -text icu4j/main/shared/.settings/org.eclipse.core.resources.prefs -text diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/AnyTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/AnyTransliterator.java index 295f599712..7f8809c397 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/AnyTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/AnyTransliterator.java @@ -404,5 +404,18 @@ class AnyTransliterator extends Transliterator { } return new AnyTransliterator(getID(), filter, target, targetScript, widthFix, cache); } + + /* (non-Javadoc) + * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet) + */ + @Override + public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { + UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter); + // Assume that it can modify any character to any other character + sourceSet.addAll(myFilter); + if (myFilter.size() != 0) { + targetSet.addAll(0, 0x10FFFF); + } + } } diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/BreakTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/BreakTransliterator.java index c27156e77f..49db4aa5e9 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/BreakTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/BreakTransliterator.java @@ -387,5 +387,17 @@ final class BreakTransliterator extends Transliterator { } } + /* (non-Javadoc) + * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet) + */ + @Override + public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { + 
UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter); + // Doesn't actually modify the source characters, so leave them alone. + // add the characters inserted + if (myFilter.size() != 0) { + targetSet.addAll(insertion); + } + } } diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/CaseFoldTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/CaseFoldTransliterator.java index 9d4f443b31..6955030b56 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/CaseFoldTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/CaseFoldTransliterator.java @@ -7,6 +7,7 @@ package com.ibm.icu.text; import com.ibm.icu.impl.UCaseProps; +import com.ibm.icu.lang.UCharacter; /** * A transliterator that performs locale-sensitive toLower() @@ -102,4 +103,23 @@ class CaseFoldTransliterator extends Transliterator{ } offsets.start = offsets.limit; } + + static SourceTargetUtility sourceTargetUtility = null; + + /* (non-Javadoc) + * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet) + */ + @Override + public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { + synchronized (UppercaseTransliterator.class) { + if (sourceTargetUtility == null) { + sourceTargetUtility = new SourceTargetUtility(new Transform() { + public String transform(String source) { + return UCharacter.foldCase(source, true); + } + }); + } + } + sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet); + } } diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/CompoundTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/CompoundTransliterator.java index 0192787549..a83f48dd9f 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/CompoundTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/CompoundTransliterator.java @@ -305,26 +305,20 @@ class 
CompoundTransliterator extends Transliterator { } /** - * Return the set of all characters that may be modified by this - * Transliterator, ignoring the effect of our filter. + * @internal */ - protected UnicodeSet handleGetSourceSet() { - UnicodeSet set = new UnicodeSet(); + @Override + public void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) { + UnicodeSet myFilter = new UnicodeSet(getFilterAsUnicodeSet(filter)); + UnicodeSet tempTargetSet = new UnicodeSet(); for (int i=0; i() { + public String transform(String source) { + return UCharacter.toLowerCase(locale, source); + } + }); + } + } + sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet); + } } diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/NameUnicodeTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/NameUnicodeTransliterator.java index 7e6a44aa92..f8c28506f5 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/NameUnicodeTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/NameUnicodeTransliterator.java @@ -165,4 +165,31 @@ class NameUnicodeTransliterator extends Transliterator { // open delimiter candidate. offsets.start = (isIncremental && openPos >= 0) ? 
openPos : cursor; } + + /* (non-Javadoc) + * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet) + */ + @Override + public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { + UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter); + if (!myFilter.containsAll(UnicodeNameTransliterator.OPEN_DELIM) || !myFilter.contains(CLOSE_DELIM)) { + return; // we have to contain both prefix and suffix + } + UnicodeSet items = new UnicodeSet() + .addAll('0', '9') + .addAll('A', 'F') + .addAll('a', 'z') // for controls + .add('<').add('>') // for controls + .add('(').add(')') // for controls + .add('-') + .add(' ') + .addAll(UnicodeNameTransliterator.OPEN_DELIM) + .add(CLOSE_DELIM); + items.retainAll(myFilter); + if (items.size() > 0) { + sourceSet.addAll(items); + // could produce any character + targetSet.addAll(0, 0x10FFFF); + } + } } diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/NormalizationTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/NormalizationTransliterator.java index e5d7613b47..02ae4c2dd3 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/NormalizationTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/NormalizationTransliterator.java @@ -1,14 +1,17 @@ /* -********************************************************************** -* Copyright (C) 2001-2010, International Business Machines -* Corporation and others. All Rights Reserved. -********************************************************************** -* Date Name Description -* 06/08/01 aliu Creation. -********************************************************************** -*/ + ********************************************************************** + * Copyright (C) 2001-2010, International Business Machines + * Corporation and others. All Rights Reserved. 
+ ********************************************************************** + * Date Name Description + * 06/08/01 aliu Creation. + ********************************************************************** + */ package com.ibm.icu.text; +import java.util.HashMap; +import java.util.Map; + import com.ibm.icu.impl.Norm2AllModes; import com.ibm.icu.impl.Normalizer2Impl; @@ -76,7 +79,7 @@ final class NormalizationTransliterator extends Transliterator { * Implements {@link Transliterator#handleTransliterate}. */ protected void handleTransliterate(Replaceable text, - Position offsets, boolean isIncremental) { + Position offsets, boolean isIncremental) { // start and limit of the input range int start = offsets.start; int limit = offsets.limit; @@ -129,4 +132,34 @@ final class NormalizationTransliterator extends Transliterator { offsets.contextLimit += limit - offsets.limit; offsets.limit = limit; } + + static final Map SOURCE_CACHE = new HashMap(); + + // TODO Get rid of this if Normalizer2 becomes a Transform + static class NormalizingTransform implements Transform { + final Normalizer2 norm2; + public NormalizingTransform(Normalizer2 norm2) { + this.norm2 = norm2; + } + public String transform(String source) { + return norm2.normalize(source); + } + } + + /* (non-Javadoc) + * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet) + */ + @Override + public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { + SourceTargetUtility cache; + synchronized (SOURCE_CACHE) { + //String id = getID(); + cache = SOURCE_CACHE.get(norm2); + if (cache == null) { + cache = new SourceTargetUtility(new NormalizingTransform(norm2), norm2); + SOURCE_CACHE.put(norm2, cache); + } + } + cache.addSourceTargetSet(this, inputFilter, sourceSet, targetSet); + } } diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/NullTransliterator.java 
b/icu4j/main/classes/translit/src/com/ibm/icu/text/NullTransliterator.java index 906017146c..8ae4bfc826 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/NullTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/NullTransliterator.java @@ -30,4 +30,12 @@ class NullTransliterator extends Transliterator { Position offsets, boolean incremental) { offsets.start = offsets.limit; } + + /* (non-Javadoc) + * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet) + */ + @Override + public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { + // do nothing + } } diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/RemoveTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/RemoveTransliterator.java index 3d020021f0..2e4afa21c5 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/RemoveTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/RemoveTransliterator.java @@ -1,6 +1,6 @@ /* ******************************************************************************* - * Copyright (C) 1996-2004, International Business Machines Corporation and * + * Copyright (C) 1996-2010, International Business Machines Corporation and * * others. All Rights Reserved. 
* ******************************************************************************* */ @@ -49,4 +49,15 @@ class RemoveTransliterator extends Transliterator { index.contextLimit -= len; index.limit -= len; } + + /* (non-Javadoc) + * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet) + */ + @Override + public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { + // intersect myFilter with the input filter + UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter); + sourceSet.addAll(myFilter); + // do nothing with the target + } } diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/RuleBasedTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/RuleBasedTransliterator.java index 08c62b152a..3edb796259 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/RuleBasedTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/RuleBasedTransliterator.java @@ -448,24 +448,32 @@ public class RuleBasedTransliterator extends Transliterator { return data.ruleSet.toRules(escapeUnprintable); } +// /** +// * Return the set of all characters that may be modified by this +// * Transliterator, ignoring the effect of our filter. +// * @internal +// * @deprecated This API is ICU internal only. +// */ +// protected UnicodeSet handleGetSourceSet() { +// return data.ruleSet.getSourceTargetSet(false, unicodeFilter); +// } +// +// /** +// * Returns the set of all characters that may be generated as +// * replacement text by this transliterator. +// * @internal +// * @deprecated This API is ICU internal only. +// */ +// public UnicodeSet getTargetSet() { +// return data.ruleSet.getSourceTargetSet(true, unicodeFilter); +// } + /** - * Return the set of all characters that may be modified by this - * Transliterator, ignoring the effect of our filter. * @internal - * @deprecated This API is ICU internal only. 
*/ - protected UnicodeSet handleGetSourceSet() { - return data.ruleSet.getSourceTargetSet(false); - } - - /** - * Returns the set of all characters that may be generated as - * replacement text by this transliterator. - * @internal - * @deprecated This API is ICU internal only. - */ - public UnicodeSet getTargetSet() { - return data.ruleSet.getSourceTargetSet(true); + @Override + public void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) { + data.ruleSet.addSourceTargetSet(filter, sourceSet, targetSet); } /** diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/SourceTargetUtility.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/SourceTargetUtility.java new file mode 100644 index 0000000000..57890353c1 --- /dev/null +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/SourceTargetUtility.java @@ -0,0 +1,133 @@ +/* + ******************************************************************************* + * Copyright (C) 2010, Google, International Business Machines Corporation and * + * others. All Rights Reserved. 
* + ******************************************************************************* + */ +package com.ibm.icu.text; + +import java.util.HashSet; +import java.util.Set; + +import com.ibm.icu.lang.CharSequences; +import com.ibm.icu.text.Normalizer2.Mode; + +/** + * Simple internal utility class for helping with getSource/TargetSet + */ +class SourceTargetUtility { + final Transform transform; + final UnicodeSet sourceCache; + final Set sourceStrings; + static final UnicodeSet NON_STARTERS = new UnicodeSet("[:^ccc=0:]").freeze(); + static Normalizer2 NFC = Normalizer2.getInstance(null, "NFC", Mode.COMPOSE); + //static final UnicodeSet TRAILING_COMBINING = new UnicodeSet(); + + public SourceTargetUtility(Transform transform) { + this(transform, null); + } + + public SourceTargetUtility(Transform transform, Normalizer2 normalizer) { + this.transform = transform; + if (normalizer != null) { +// synchronized (SourceTargetUtility.class) { +// if (NFC == null) { +// NFC = Normalizer2.getInstance(null, "NFC", Mode.COMPOSE); +// for (int i = 0; i <= 0x10FFFF; ++i) { +// String d = NFC.getDecomposition(i); +// if (d == null) { +// continue; +// } +// String s = NFC.normalize(d); +// if (!CharSequences.equals(i, s)) { +// continue; +// } +// // composes +// boolean first = false; +// for (int trailing : CharSequences.codePoints(d)) { +// if (first) { +// first = false; +// } else { +// TRAILING_COMBINING.add(trailing); +// } +// } +// } +// } +// } + sourceCache = new UnicodeSet("[:^ccc=0:]"); + } else { + sourceCache = new UnicodeSet(); + } + sourceStrings = new HashSet(); + for (int i = 0; i <= 0x10FFFF; ++i) { + String s = transform.transform(UTF16.valueOf(i)); + boolean added = false; + if (!CharSequences.equals(i, s)) { + sourceCache.add(i); + added = true; + } + if (normalizer == null) { + continue; + } + String d = NFC.getDecomposition(i); + if (d == null) { + continue; + } + s = transform.transform(d); + if (!d.equals(s)) { + sourceStrings.add(d); + } + if (added) { + 
continue; + } + if (!normalizer.isInert(i)) { + sourceCache.add(i); + continue; + } + // see if any of the non-starters change s; if so, add i +// for (String ns : TRAILING_COMBINING) { +// String s2 = transform.transform(s + ns); +// if (!s2.startsWith(s)) { +// sourceCache.add(i); +// break; +// } +// } + + // int endOfFirst = CharSequences.onCharacterBoundary(d, 1) ? 1 : 2; + // if (endOfFirst >= d.length()) { + // continue; + // } + // // now add all initial substrings + // for (int j = 1; j < d.length(); ++j) { + // if (!CharSequences.onCharacterBoundary(d, j)) { + // continue; + // } + // String dd = d.substring(0,j); + // s = transform.transform(dd); + // if (!dd.equals(s)) { + // sourceStrings.add(dd); + // } + // } + } + sourceCache.freeze(); + } + + public void addSourceTargetSet(Transliterator transliterator, UnicodeSet inputFilter, UnicodeSet sourceSet, + UnicodeSet targetSet) { + UnicodeSet myFilter = transliterator.getFilterAsUnicodeSet(inputFilter); + UnicodeSet affectedCharacters = new UnicodeSet(sourceCache).retainAll(myFilter); + sourceSet.addAll(affectedCharacters); + for (String s : affectedCharacters) { + targetSet.addAll(transform.transform(s)); + } + for (String s : sourceStrings) { + if (myFilter.containsAll(s)) { + String t = transform.transform(s); + if (!s.equals(t)) { + targetSet.addAll(t); + sourceSet.addAll(s); + } + } + } + } +} \ No newline at end of file diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java index 577592702e..fe2fc98af9 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/TitlecaseTransliterator.java @@ -6,6 +6,7 @@ package com.ibm.icu.text; import com.ibm.icu.impl.UCaseProps; +import com.ibm.icu.lang.UCharacter; import com.ibm.icu.util.ULocale; /** @@ -147,4 +148,24 @@ class TitlecaseTransliterator extends 
Transliterator { } offsets.start = offsets.limit; } + + // NOTE: normally this would be static, but because the results vary by locale.... + SourceTargetUtility sourceTargetUtility = null; + + /* (non-Javadoc) + * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet) + */ + @Override + public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { + synchronized (this) { + if (sourceTargetUtility == null) { + sourceTargetUtility = new SourceTargetUtility(new Transform() { + public String transform(String source) { + return UCharacter.toTitleCase(locale, source, null); + } + }); + } + } + sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet); + } } diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliterationRule.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliterationRule.java index 907e1ac6d9..494827f6ff 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliterationRule.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/TransliterationRule.java @@ -1,6 +1,6 @@ /* ******************************************************************************* - * Copyright (C) 1996-2007, International Business Machines Corporation and * + * Copyright (C) 1996-2010, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ @@ -548,8 +548,11 @@ class TransliterationRule { * Union the set of all characters that may be modified by this rule * into the given set. */ - void addSourceSetTo(UnicodeSet toUnionTo) { + void addSourceSetTo(UnicodeSet toUnionTo, UnicodeSet filter) { int limit = anteContextLength + keyLength; + if (filter != null && !matches(filter)) { + return; + } for (int i=anteContextLength; i x + // The filter is [a{bc}]. 
+ // If the input is abc, then the rule will work. + // However, following code applying the filter won't catch that case. + private boolean matches(UnicodeSet filter) { + int limit = anteContextLength + keyLength; + // We need to walk through the pattern. + // Iff some of the characters at ALL of the positions are matched by the filter, then we add temp to toUnionTo + for (int i=anteContextLength; i |b ; b > c ; + // TODO Merge into r.addSourceTargetSet, to avoid duplicate testing + void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) { int count = ruleVector.size(); for (int i=0; iTransliterator is an abstract class that - * transliterates text from one format to another. The most common - * kind of transliterator is a script, or alphabet, transliterator. - * For example, a Russian to Latin transliterator changes Russian text - * written in Cyrillic characters to phonetically equivalent Latin - * characters. It does not translate Russian to English! - * Transliteration, unlike translation, operates on characters, without - * reference to the meanings of words and sentences. - * - *

Although script conversion is its most common use, a - * transliterator can actually perform a more general class of tasks. - * In fact, Transliterator defines a very general API - * which specifies only that a segment of the input text is replaced - * by new text. The particulars of this conversion are determined - * entirely by subclasses of Transliterator. - * - *

Transliterators are stateless - * - *

Transliterator objects are stateless; they - * retain no information between calls to - * transliterate(). As a result, threads may share - * transliterators without synchronizing them. This might seem to - * limit the complexity of the transliteration operation. In - * practice, subclasses perform complex transliterations by delaying - * the replacement of text until it is known that no other - * replacements are possible. In other words, although the - * Transliterator objects are stateless, the source text - * itself embodies all the needed information, and delayed operation - * allows arbitrary complexity. - * - *

Batch transliteration - * - *

The simplest way to perform transliteration is all at once, on a - * string of existing text. This is referred to as batch - * transliteration. For example, given a string input - * and a transliterator t, the call - * + * Transliterator is an abstract class that transliterates text from one format to another. The most common + * kind of transliterator is a script, or alphabet, transliterator. For example, a Russian to Latin transliterator + * changes Russian text written in Cyrillic characters to phonetically equivalent Latin characters. It does not + * translate Russian to English! Transliteration, unlike translation, operates on characters, without reference + * to the meanings of words and sentences. + * + *

+ * Although script conversion is its most common use, a transliterator can actually perform a more general class of + * tasks. In fact, Transliterator defines a very general API which specifies only that a segment of the + * input text is replaced by new text. The particulars of this conversion are determined entirely by subclasses of + * Transliterator. + * + *

+ * Transliterators are stateless + * + *

+ * Transliterator objects are stateless; they retain no information between calls to + * transliterate(). As a result, threads may share transliterators without synchronizing them. This might + * seem to limit the complexity of the transliteration operation. In practice, subclasses perform complex + * transliterations by delaying the replacement of text until it is known that no other replacements are possible. In + * other words, although the Transliterator objects are stateless, the source text itself embodies all the + * needed information, and delayed operation allows arbitrary complexity. + * + *

+ * Batch transliteration + * + *

+ * The simplest way to perform transliteration is all at once, on a string of existing text. This is referred to as + * batch transliteration. For example, given a string input and a transliterator t, + * the call + * *

String result = t.transliterate(input); *
- * - * will transliterate it and return the result. Other methods allow - * the client to specify a substring to be transliterated and to use - * {@link Replaceable} objects instead of strings, in order to - * preserve out-of-band information (such as text styles). - * - *

Keyboard transliteration - * - *

Somewhat more involved is keyboard, or incremental - * transliteration. This is the transliteration of text that is - * arriving from some source (typically the user's keyboard) one - * character at a time, or in some other piecemeal fashion. - * - *

In keyboard transliteration, a Replaceable buffer - * stores the text. As text is inserted, as much as possible is - * transliterated on the fly. This means a GUI that displays the - * contents of the buffer may show text being modified as each new - * character arrives. - * - *

Consider the simple RuleBasedTransliterator: - * + * + * will transliterate it and return the result. Other methods allow the client to specify a substring to be + * transliterated and to use {@link Replaceable} objects instead of strings, in order to preserve out-of-band + * information (such as text styles). + * + *

+ * Keyboard transliteration + * + *

+ * Somewhat more involved is keyboard, or incremental transliteration. This is the transliteration of text that + * is arriving from some source (typically the user's keyboard) one character at a time, or in some other piecemeal + * fashion. + * + *

+ * In keyboard transliteration, a Replaceable buffer stores the text. As text is inserted, as much as + * possible is transliterated on the fly. This means a GUI that displays the contents of the buffer may show text being + * modified as each new character arrives. + * + *

+ * Consider the simple RuleBasedTransliterator: + * *

* th>{theta}
* t>{tau} *
- * - * When the user types 't', nothing will happen, since the - * transliterator is waiting to see if the next character is 'h'. To - * remedy this, we introduce the notion of a cursor, marked by a '|' - * in the output string: - * + * + * When the user types 't', nothing will happen, since the transliterator is waiting to see if the next character is + * 'h'. To remedy this, we introduce the notion of a cursor, marked by a '|' in the output string: + * *
* t>|{tau}
* {tau}h>{theta} *
- * - * Now when the user types 't', tau appears, and if the next character - * is 'h', the tau changes to a theta. This is accomplished by - * maintaining a cursor position (independent of the insertion point, - * and invisible in the GUI) across calls to - * transliterate(). Typically, the cursor will - * be coincident with the insertion point, but in a case like the one - * above, it will precede the insertion point. - * - *

Keyboard transliteration methods maintain a set of three indices - * that are updated with each call to - * transliterate(), including the cursor, start, - * and limit. These indices are changed by the method, and they are - * passed in and out via a Position object. The start index - * marks the beginning of the substring that the transliterator will - * look at. It is advanced as text becomes committed (but it is not - * the committed index; that's the cursor). The - * cursor index, described above, marks the point at - * which the transliterator last stopped, either because it reached - * the end, or because it required more characters to disambiguate - * between possible inputs. The cursor can also be - * explicitly set by rules in a RuleBasedTransliterator. - * Any characters before the cursor index are frozen; - * future keyboard transliteration calls within this input sequence - * will not change them. New text is inserted at the - * limit index, which marks the end of the substring that - * the transliterator looks at. - * - *

Because keyboard transliteration assumes that more characters - * are to arrive, it is conservative in its operation. It only - * transliterates when it can do so unambiguously. Otherwise it waits - * for more characters to arrive. When the client code knows that no - * more characters are forthcoming, perhaps because the user has - * performed some input termination operation, then it should call - * finishTransliteration() to complete any - * pending transliterations. - * - *

Inverses - * - *

Pairs of transliterators may be inverses of one another. For - * example, if transliterator A transliterates characters by - * incrementing their Unicode value (so "abc" -> "def"), and - * transliterator B decrements character values, then A - * is an inverse of B and vice versa. If we compose A - * with B in a compound transliterator, the result is the - * indentity transliterator, that is, a transliterator that does not - * change its input text. - * - * The Transliterator method getInverse() - * returns a transliterator's inverse, if one exists, or - * null otherwise. However, the result of - * getInverse() usually will not be a true - * mathematical inverse. This is because true inverse transliterators - * are difficult to formulate. For example, consider two - * transliterators: AB, which transliterates the character 'A' - * to 'B', and BA, which transliterates 'B' to 'A'. It might - * seem that these are exact inverses, since - * + * + * Now when the user types 't', tau appears, and if the next character is 'h', the tau changes to a theta. This is + * accomplished by maintaining a cursor position (independent of the insertion point, and invisible in the GUI) across + * calls to transliterate(). Typically, the cursor will be coincident with the insertion point, but in a + * case like the one above, it will precede the insertion point. + * + *

+ * Keyboard transliteration methods maintain a set of three indices that are updated with each call to + * transliterate(), including the cursor, start, and limit. These indices are changed by the method, and + * they are passed in and out via a Position object. The start index marks the beginning of the substring + * that the transliterator will look at. It is advanced as text becomes committed (but it is not the committed index; + * that's the cursor). The cursor index, described above, marks the point at which the + * transliterator last stopped, either because it reached the end, or because it required more characters to + * disambiguate between possible inputs. The cursor can also be explicitly set by rules in a + * RuleBasedTransliterator. Any characters before the cursor index are frozen; future keyboard + * transliteration calls within this input sequence will not change them. New text is inserted at the limit + * index, which marks the end of the substring that the transliterator looks at. + * + *

+ * Because keyboard transliteration assumes that more characters are to arrive, it is conservative in its operation. It + * only transliterates when it can do so unambiguously. Otherwise it waits for more characters to arrive. When the + * client code knows that no more characters are forthcoming, perhaps because the user has performed some input + * termination operation, then it should call finishTransliteration() to complete any pending + * transliterations. + * + *

+ * Inverses + * + *

+ * Pairs of transliterators may be inverses of one another. For example, if transliterator A transliterates + * characters by incrementing their Unicode value (so "abc" -> "def"), and transliterator B decrements character + * values, then A is an inverse of B and vice versa. If we compose A with B in a compound + * transliterator, the result is the identity transliterator, that is, a transliterator that does not change its input + * text. + * + * The Transliterator method getInverse() returns a transliterator's inverse, if one exists, + * or null otherwise. However, the result of getInverse() usually will not be a true + * mathematical inverse. This is because true inverse transliterators are difficult to formulate. For example, consider + * two transliterators: AB, which transliterates the character 'A' to 'B', and BA, which transliterates + * 'B' to 'A'. It might seem that these are exact inverses, since + * *

"A" x AB -> "B"
* "B" x BA -> "A"
- * - * where 'x' represents transliteration. However, - * + * + * where 'x' represents transliteration. However, + * *
"ABCD" x AB -> "BBCD"
* "BBCD" x BA -> "AACD"
- * - * so AB composed with BA is not the - * identity. Nonetheless, BA may be usefully considered to be - * AB's inverse, and it is on this basis that - * AB.getInverse() could legitimately return + * + * so AB composed with BA is not the identity. Nonetheless, BA may be usefully considered to be + * AB's inverse, and it is on this basis that AB.getInverse() could legitimately return * BA. - * - *

IDs and display names - * - *

A transliterator is designated by a short identifier string or - * ID. IDs follow the format source-destination, - * where source describes the entity being replaced, and - * destination describes the entity replacing - * source. The entities may be the names of scripts, - * particular sequences of characters, or whatever else it is that the - * transliterator converts to or from. For example, a transliterator - * from Russian to Latin might be named "Russian-Latin". A - * transliterator from keyboard escape sequences to Latin-1 characters - * might be named "KeyboardEscape-Latin1". By convention, system - * entity names are in English, with the initial letters of words - * capitalized; user entity names may follow any format so long as - * they do not contain dashes. - * - *

In addition to programmatic IDs, transliterator objects have - * display names for presentation in user interfaces, returned by - * {@link #getDisplayName}. - * - *

Factory methods and registration - * - *

In general, client code should use the factory method - * getInstance() to obtain an instance of a - * transliterator given its ID. Valid IDs may be enumerated using - * getAvailableIDs(). Since transliterators are - * stateless, multiple calls to getInstance() with the - * same ID will return the same object. - * - *

In addition to the system transliterators registered at startup, - * user transliterators may be registered by calling - * registerInstance() at run time. To register a - * transliterator subclass without instantiating it (until it is - * needed), users may call registerClass(). - * - *

Composed transliterators - * - *

In addition to built-in system transliterators like - * "Latin-Greek", there are also built-in composed - * transliterators. These are implemented by composing two or more - * component transliterators. For example, if we have scripts "A", - * "B", "C", and "D", and we want to transliterate between all pairs - * of them, then we need to write 12 transliterators: "A-B", "A-C", - * "A-D", "B-A",..., "D-A", "D-B", "D-C". If it is possible to - * convert all scripts to an intermediate script "M", then instead of - * writing 12 rule sets, we only need to write 8: "A~M", "B~M", "C~M", - * "D~M", "M~A", "M~B", "M~C", "M~D". (This might not seem like a big - * win, but it's really 2n vs. n2 - - * n, so as n gets larger the gain becomes - * significant. With 9 scripts, it's 18 vs. 72 rule sets, a big - * difference.) Note the use of "~" rather than "-" for the script - * separator here; this indicates that the given transliterator is - * intended to be composed with others, rather than be used as is. - * - *

Composed transliterators can be instantiated as usual. For - * example, the system transliterator "Devanagari-Gujarati" is a - * composed transliterator built internally as - * "Devanagari~InterIndic;InterIndic~Gujarati". When this - * transliterator is instantiated, it appears externally to be a - * standard transliterator (e.g., getID() returns + * + *

+ * Filtering + *

Each transliterator has a filter, which restricts changes to those characters selected by the filter. The + * filter affects just the characters that are changed -- the characters outside of the filter are still part of the + * context for the filter. For example, in the following even though 'x' is filtered out, and doesn't convert to y, it does affect the conversion of 'a'. + * + *

+ * String rules = "x > y; x{a} > b; ";
+ * Transliterator tempTrans = Transliterator.createFromRules("temp", rules, Transliterator.FORWARD);
+ * tempTrans.setFilter(new UnicodeSet("[a]"));
+ * String tempResult = tempTrans.transform("xa");
+ * // results in "xb"
+ *
+ *

+ * IDs and display names + * + *

+ * A transliterator is designated by a short identifier string or ID. IDs follow the format + * source-destination, where source describes the entity being replaced, and destination + * describes the entity replacing source. The entities may be the names of scripts, particular sequences of + * characters, or whatever else it is that the transliterator converts to or from. For example, a transliterator from + * Russian to Latin might be named "Russian-Latin". A transliterator from keyboard escape sequences to Latin-1 + * characters might be named "KeyboardEscape-Latin1". By convention, system entity names are in English, with the + * initial letters of words capitalized; user entity names may follow any format so long as they do not contain dashes. + * + *

+ * In addition to programmatic IDs, transliterator objects have display names for presentation in user interfaces, + * returned by {@link #getDisplayName}. + * + *

+ * Factory methods and registration + * + *

+ * In general, client code should use the factory method getInstance() to obtain an instance of a + * transliterator given its ID. Valid IDs may be enumerated using getAvailableIDs(). Since transliterators + * are stateless, multiple calls to getInstance() with the same ID will return the same object. + * + *

+ * In addition to the system transliterators registered at startup, user transliterators may be registered by calling + * registerInstance() at run time. To register a transliterator subclass without instantiating it (until it + * is needed), users may call registerClass(). + * + *

+ * Composed transliterators + * + *

+ * In addition to built-in system transliterators like "Latin-Greek", there are also built-in composed + * transliterators. These are implemented by composing two or more component transliterators. For example, if we have + * scripts "A", "B", "C", and "D", and we want to transliterate between all pairs of them, then we need to write 12 + * transliterators: "A-B", "A-C", "A-D", "B-A",..., "D-A", "D-B", "D-C". If it is possible to convert all scripts to an + * intermediate script "M", then instead of writing 12 rule sets, we only need to write 8: "A~M", "B~M", "C~M", "D~M", + * "M~A", "M~B", "M~C", "M~D". (This might not seem like a big win, but it's really 2n vs. n<sup> + * 2</sup> - n, so as n gets larger the gain becomes significant. With 9 scripts, it's 18 vs. 72 + * rule sets, a big difference.) Note the use of "~" rather than "-" for the script separator here; this indicates that + * the given transliterator is intended to be composed with others, rather than be used as is. + * + *

+ * Composed transliterators can be instantiated as usual. For example, the system transliterator "Devanagari-Gujarati" + * is a composed transliterator built internally as "Devanagari~InterIndic;InterIndic~Gujarati". When this + * transliterator is instantiated, it appears externally to be a standard transliterator (e.g., getID() returns * "Devanagari-Gujarati"). - * - *

Subclassing - * - *

Subclasses must implement the abstract method - * handleTransliterate().

Subclasses should override - * the transliterate() method taking a - * Replaceable and the transliterate() - * method taking a String and StringBuffer - * if the performance of these methods can be improved over the - * performance obtained by the default implementations in this class. - * - *

Copyright © IBM Corporation 1999. All rights reserved. - * + * + *

+ * Subclassing + * + *

+ * Subclasses must implement the abstract method handleTransliterate(). + *

+ * Subclasses should override the transliterate() method taking a Replaceable and the + * transliterate() method taking a String and StringBuffer if the performance of + * these methods can be improved over the performance obtained by the default implementations in this class. + * + *

+ * Copyright © IBM Corporation 1999. All rights reserved. + * * @author Alan Liu * @stable ICU 2.0 */ @@ -1418,7 +1392,7 @@ public abstract class Transliterator implements StringTransform { t = new NullTransliterator(); } else if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 1) { - t = new RuleBasedTransliterator(ID, parser.dataVector.get(0), null); + t = new RuleBasedTransliterator(ID, parser.dataVector.get(0), parser.compoundFilter); } else if (parser.idBlockVector.size() == 1 && parser.dataVector.size() == 0) { // idBlock, no data -- this is an alias. The ID has @@ -1536,6 +1510,8 @@ public abstract class Transliterator implements StringTransform { return result; } + static final UnicodeSet ALL_CODEPOINTS = new UnicodeSet(0,0x10FFFF).freeze(); + /** * Returns the set of all characters that may be modified in the * input text by this Transliterator. This incorporates this @@ -1550,20 +1526,9 @@ public abstract class Transliterator implements StringTransform { * @stable ICU 2.2 */ public final UnicodeSet getSourceSet() { - UnicodeSet set = handleGetSourceSet(); - if (filter != null) { - UnicodeSet filterSet; - // Most, but not all filters will be UnicodeSets. Optimize for - // the high-runner case. 
- try { - filterSet = (UnicodeSet) filter; - } catch (ClassCastException e) { - filterSet = new UnicodeSet(); - filter.addMatchSetTo(filterSet); - } - set.retainAll(filterSet); - } - return set; + UnicodeSet result = new UnicodeSet(); + addSourceTargetSet(getFilterAsUnicodeSet(ALL_CODEPOINTS), result, new UnicodeSet()); + return result; } /** @@ -1595,7 +1560,78 @@ public abstract class Transliterator implements StringTransform { * @stable ICU 2.2 */ public UnicodeSet getTargetSet() { - return new UnicodeSet(); + UnicodeSet result = new UnicodeSet(); + addSourceTargetSet(getFilterAsUnicodeSet(ALL_CODEPOINTS), new UnicodeSet(), result); + return result; + } + + /** + * Returns the set of all characters that may be generated as + * replacement text by this transliterator, filtered by BOTH the input filter, and the current getFilter(). + *

SHOULD BE OVERRIDDEN BY SUBCLASSES. + * It is probably an error for any transliterator to NOT override this, but we can't force them to + * for backwards compatibility. + *

Other methods vector through this. + *

When gathering the information on source and target, the compound transliterator makes things complicated. + * For example, suppose we have: + *

+     * Global FILTER = [ax]
+     * a > b;
+     * :: NULL;
+     * b > c;
+     * x > d;
+     * 
+ * While the filter just allows a and x, b is an intermediate result, which could produce c. So the source and target sets + * cannot be gathered independently. What we have to do is filter the sources for the first transliterator according to + * the global filter, intersected with that transliterator's filter. Based on that we get the target. + * The next transliterator gets as a global filter (global + last target). And so on. + *

There is another complication: + *

+     * Global FILTER = [ax]
+     * a > |b;
+     * b > c;
+     * 
+ * Even though b would be filtered from the input, whenever we have a backup, it could be part of the input. So ideally we will + * change the global filter as we go. + * @param targetSet TODO + * @see #getTargetSet + * @internal + */ + public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { + UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter); + UnicodeSet temp = new UnicodeSet(handleGetSourceSet()).retainAll(myFilter); + // use old method, if we don't have anything better + sourceSet.addAll(temp); + // clumsy guess with target + for (String s : temp) { + String t = transliterate(s); + if (!s.equals(t)) { + targetSet.addAll(t); + } + } + } + + /** + * Returns the intersection of this instance's filter with an external filter. + * The externalFilter must be frozen (it is frozen if not). + * The result may be frozen, so don't attempt to modify. + * @internal + */ + // TODO change to getMergedFilter + public UnicodeSet getFilterAsUnicodeSet(UnicodeSet externalFilter) { + if (filter == null) { + return externalFilter; + } + UnicodeSet filterSet = new UnicodeSet(externalFilter); + // Most, but not all filters will be UnicodeSets. Optimize for + // the high-runner case. 
+ UnicodeSet temp; + try { + temp = (UnicodeSet) filter; + } catch (ClassCastException e) { + filter.addMatchSetTo(temp = new UnicodeSet()); + } + return filterSet.retainAll(temp).freeze(); } /** diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/UnescapeTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/UnescapeTransliterator.java index a9f3840b9b..dc573cb612 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/UnescapeTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/UnescapeTransliterator.java @@ -8,6 +8,7 @@ ********************************************************************** */ package com.ibm.icu.text; +import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; /** @@ -248,4 +249,38 @@ class UnescapeTransliterator extends Transliterator { pos.limit = limit; pos.start = start; } + + /* (non-Javadoc) + * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet) + */ + @Override + public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { + // Each form consists of a prefix, suffix, + // * radix, minimum digit count, and maximum digit count. These + // * values are stored as a five character header. ... 
+ UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter); + UnicodeSet items = new UnicodeSet(); + StringBuilder buffer = new StringBuilder(); + for (int i = 0; spec[i] != END;) { + // first 5 items are header + int end = i + spec[i] + spec[i+1] + 5; + int radix = spec[i+2]; + for (int j = 0; j < radix; ++j) { + Utility.appendNumber(buffer, j, radix, 0); + } + // then add the characters + for (int j = i + 5; j < end; ++j) { + items.add(spec[j]); + } + // and go to next block + i = end; + } + items.addAll(buffer.toString()); + items.retainAll(myFilter); + + if (items.size() > 0) { + sourceSet.addAll(items); + targetSet.addAll(0,0x10FFFF); // assume we can produce any character + } + } } diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/UnicodeNameTransliterator.java b/icu4j/main/classes/translit/src/com/ibm/icu/text/UnicodeNameTransliterator.java index 12b832fd2f..f5bb5ad89d 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/UnicodeNameTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/UnicodeNameTransliterator.java @@ -70,4 +70,25 @@ class UnicodeNameTransliterator extends Transliterator { offsets.limit = limit; offsets.start = cursor; } + + /* (non-Javadoc) + * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet) + */ + @Override + public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { + UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter); + if (myFilter.size() > 0) { + sourceSet.addAll(myFilter); + targetSet.addAll('0', '9') + .addAll('A', 'Z') + .add('-') + .add(' ') + .addAll(OPEN_DELIM) + .add(CLOSE_DELIM) + .addAll('a', 'z') // for controls + .add('<').add('>') // for controls + .add('(').add(')') // for controls + ; + } + } } diff --git a/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java 
b/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java index 6767d703ce..d8b54d5672 100644 --- a/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java +++ b/icu4j/main/classes/translit/src/com/ibm/icu/text/UppercaseTransliterator.java @@ -7,6 +7,7 @@ package com.ibm.icu.text; import com.ibm.icu.impl.UCaseProps; +import com.ibm.icu.lang.UCharacter; import com.ibm.icu.util.ULocale; /** @@ -105,4 +106,24 @@ class UppercaseTransliterator extends Transliterator { } offsets.start = offsets.limit; } + + // NOTE: normally this would be static, but because the results vary by locale.... + SourceTargetUtility sourceTargetUtility = null; + + /* (non-Javadoc) + * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet) + */ + @Override + public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) { + synchronized (this) { + if (sourceTargetUtility == null) { + sourceTargetUtility = new SourceTargetUtility(new Transform() { + public String transform(String source) { + return UCharacter.toUpperCase(locale, source); + } + }); + } + } + sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet); + } } diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java index 87ea37c616..8095696f0a 100644 --- a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java +++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java @@ -13,13 +13,18 @@ import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Locale; +import java.util.Map.Entry; import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.dev.test.TestUtil; +import com.ibm.icu.dev.test.util.UnicodeMap; import 
com.ibm.icu.impl.Utility; import com.ibm.icu.impl.UtilityExtensions; +import com.ibm.icu.lang.CharSequences; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.CanonicalIterator; +import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.Replaceable; import com.ibm.icu.text.ReplaceableString; import com.ibm.icu.text.StringTransform; @@ -28,6 +33,7 @@ import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeFilter; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.text.Normalizer2.Mode; import com.ibm.icu.util.CaseInsensitiveString; import com.ibm.icu.util.ULocale; @@ -480,6 +486,16 @@ public class TransliteratorTest extends TestFmwk { * Do some basic tests of filtering. */ public void TestFiltering() { + + Transliterator tempTrans = Transliterator.createFromRules("temp", "x > y; x{a} > b; ", Transliterator.FORWARD); + tempTrans.setFilter(new UnicodeSet("[a]")); + String tempResult = tempTrans.transform("xa"); + assertEquals("context should not be filtered ", "xb", tempResult); + + tempTrans = Transliterator.createFromRules("temp", "::[a]; x > y; x{a} > b; ", Transliterator.FORWARD); + tempResult = tempTrans.transform("xa"); + assertEquals("context should not be filtered ", "xb", tempResult); + Transliterator hex = Transliterator.getInstance("Any-Hex"); hex.setFilter(new UnicodeFilter() { public boolean contains(int c) { @@ -2997,6 +3013,358 @@ public class TransliteratorTest extends TestFmwk { } } + public void TestSourceTargetSet2() { + + + Normalizer2 nfkd = Normalizer2.getInstance(null, "NFKC", Mode.DECOMPOSE); + Normalizer2 nfc = Normalizer2.getInstance(null, "NFC", Mode.COMPOSE); + Normalizer2 nfd = Normalizer2.getInstance(null, "NFC", Mode.DECOMPOSE); + // UnicodeSet nfkdSource = new UnicodeSet(); + // UnicodeSet nfkdTarget = new UnicodeSet(); + // for (int i = 0; i <= 0x10FFFF; ++i) { + // if (nfkd.isInert(i)) { + // continue; + // } + // nfkdSource.add(i); + 
// String t = nfkd.getDecomposition(i); + // if (t != null) { + // nfkdTarget.addAll(t); + // } else { + // nfkdTarget.add(i); + // } + // } + // nfkdSource.freeze(); + // nfkdTarget.freeze(); + // logln("NFKD Source: " + nfkdSource.toPattern(false)); + // logln("NFKD Target: " + nfkdTarget.toPattern(false)); + + UnicodeMap leadToTrail = new UnicodeMap(); + UnicodeMap leadToSources = new UnicodeMap(); + UnicodeSet nonStarters = new UnicodeSet("[:^ccc=0:]").freeze(); + CanonicalIterator can = new CanonicalIterator(""); + + UnicodeSet disorderedMarks = new UnicodeSet(); + + for (int i = 0; i <= 0x10FFFF; ++i) { + String s = nfd.getDecomposition(i); + if (s == null) { + continue; + } + + can.setSource(s); + for (String t = can.next(); t != null; t = can.next()) { + disorderedMarks.add(t); + } + + // if s has two code points, (or more), add the lead/trail information + int first = s.codePointAt(0); + int firstCount = Character.charCount(first); + if (s.length() == firstCount) continue; + String trailString = s.substring(firstCount); + + // add all the trail characters + if (!nonStarters.containsSome(trailString)) { + continue; + } + UnicodeSet trailSet = leadToTrail.get(first); + if (trailSet == null) { + leadToTrail.put(first, trailSet = new UnicodeSet()); + } + trailSet.addAll(trailString); // add remaining trails + + // add the sources + UnicodeSet sourcesSet = leadToSources.get(first); + if (sourcesSet == null) { + leadToSources.put(first, sourcesSet = new UnicodeSet()); + } + sourcesSet.add(i); + } + + + for (Entry x : leadToSources.entrySet()) { + String lead = x.getKey(); + UnicodeSet sources = x.getValue(); + UnicodeSet trailSet = leadToTrail.get(lead); + for (String source : sources) { + for (String trail : trailSet) { + can.setSource(source + trail); + for (String t = can.next(); t != null; t = can.next()) { + if (t.endsWith(trail)) continue; + disorderedMarks.add(t); + } + } + } + } + + + for (String s : nonStarters) { + disorderedMarks.add("\u0345" + s); + 
disorderedMarks.add(s+"\u0323"); + String xx = nfc.normalize("Ǭ" + s); + if (!xx.startsWith("Ǭ")) { + logln("??"); + } + } + +// for (int i = 0; i <= 0x10FFFF; ++i) { +// String s = nfkd.getDecomposition(i); +// if (s != null) { +// disorderedMarks.add(s); +// disorderedMarks.add(nfc.normalize(s)); +// addDerivedStrings(nfc, disorderedMarks, s); +// } +// s = nfd.getDecomposition(i); +// if (s != null) { +// disorderedMarks.add(s); +// } +// if (!nfc.isInert(i)) { +// if (i == 0x00C0) { +// logln("À"); +// } +// can.setSource(s+"\u0334"); +// for (String t = can.next(); t != null; t = can.next()) { +// addDerivedStrings(nfc, disorderedMarks, t); +// } +// can.setSource(s+"\u0345"); +// for (String t = can.next(); t != null; t = can.next()) { +// addDerivedStrings(nfc, disorderedMarks, t); +// } +// can.setSource(s+"\u0323"); +// for (String t = can.next(); t != null; t = can.next()) { +// addDerivedStrings(nfc, disorderedMarks, t); +// } +// } +// } + logln("Test cases: " + disorderedMarks.size()); + disorderedMarks.addAll(0,0x10FFFF).freeze(); + logln("isInert \u0104 " + nfc.isInert('\u0104')); + + Object[][] rules = { + {":: [:sc=COMMON:] any-name;", null}, + + {":: [:Greek:] hex-any/C;", null}, + {":: [:Greek:] any-hex/C;", null}, + + {":: [[:Mn:][:Me:]] remove;", null}, + {":: [[:Mn:][:Me:]] null;", null}, + + + {":: lower;", null}, + {":: upper;", null}, + {":: title;", null}, + {":: CaseFold;", null}, + + {":: NFD;", null}, + {":: NFC;", null}, + {":: NFKD;", null}, + {":: NFKC;", null}, + + {":: [[:Mn:][:Me:]] NFKD;", null}, + {":: Latin-Greek;", null}, + {":: [:Latin:] NFKD;", null}, + {":: NFKD;", null}, + {":: NFKD;\n" + + ":: [[:Mn:][:Me:]] remove;\n" + + ":: NFC;", null}, + }; + for (Object[] rulex : rules) { + String rule = (String) rulex[0]; + Transliterator trans = Transliterator.createFromRules("temp", rule, Transliterator.FORWARD); + UnicodeSet actualSource = trans.getSourceSet(); + UnicodeSet actualTarget = trans.getTargetSet(); + UnicodeSet 
empiricalSource = new UnicodeSet(); + UnicodeSet empiricalTarget = new UnicodeSet(); + String ruleDisplay = rule.replace("\n", "\t\t"); + UnicodeSet toTest = disorderedMarks; +// if (rulex[1] != null) { +// toTest = new UnicodeSet(disorderedMarks); +// toTest.addAll((UnicodeSet) rulex[1]); +// } + + String test = nfd.normalize("Ą"); + boolean DEBUG = true; + int count = 0; // for debugging + for (String s : toTest) { + if (s.equals(test)) { + logln(test); + } + String t = trans.transform(s); + if (!s.equals(t)) { + if (!isAtomic(s, t, trans)) { + isAtomic(s, t, trans); + continue; + } + + // only keep the part that changed; so skip the front and end. + // int start = findSharedStartLength(s,t); + // int end = findSharedEndLength(s,t); + // if (start != 0 || end != 0) { + // s = s.substring(start, s.length() - end); + // t = t.substring(start, t.length() - end); + // } + if (DEBUG) { + if (!actualSource.containsAll(s)) { + count++; + } + if (!actualTarget.containsAll(t)) { + count++; + } + } + addSourceTarget(s, empiricalSource, t, empiricalTarget); + } + } + assertEquals("getSource(" + ruleDisplay + ")", empiricalSource, actualSource, SetAssert.MISSING_OK); + assertEquals("getTarget(" + ruleDisplay + ")", empiricalTarget, actualTarget, SetAssert.MISSING_OK); + } + } + + private boolean isAtomic(String s, String t, Transliterator trans) { + for (int i = 1; i < s.length(); ++i) { + if (!CharSequences.onCharacterBoundary(s, i)) { + continue; + } + String q = trans.transform(s.substring(0,i)); + if (t.startsWith(q)) { + String r = trans.transform(s.substring(i)); + if (t.length() == q.length() + r.length() && t.endsWith(r)) { + return false; + } + } + } + return true; + // // make sure that every part is different + // if (s.codePointCount(0, s.length()) > 1) { + // int[] codePoints = It.codePoints(s); + // for (int k = 0; k < codePoints.length; ++k) { + // int pos = indexOf(t,codePoints[k]); + // if (pos >= 0) { + // int x; + // } + // } + // if (s.contains("À")) { + 
// logln("À"); + // } + // } + } + + private void addSourceTarget(String s, UnicodeSet expectedSource, String t, UnicodeSet expectedTarget) { + expectedSource.addAll(s); + if (t.length() > 0) { + expectedTarget.addAll(t); + } + } + + private void addDerivedStrings(Normalizer2 nfc, UnicodeSet disorderedMarks, String s) { + disorderedMarks.add(s); + for (int j = 1; j < s.length(); ++j) { + if (CharSequences.onCharacterBoundary(s, j)) { + String shorter = s.substring(0,j); + disorderedMarks.add(shorter); + disorderedMarks.add(nfc.normalize(shorter) + s.substring(j)); + } + } + } + + public void TestCharUtils() { + String[][] startTests = { + {"1", "a", "ab"}, + {"0", "a", "xb"}, + {"0", "\uD800", "\uD800\uDC01"}, + {"1", "\uD800a", "\uD800b"}, + {"0", "\uD800\uDC00", "\uD800\uDC01"}, + }; + for (String[] row : startTests) { + int actual = findSharedStartLength(row[1], row[2]); + assertEquals("findSharedStartLength(" + row[1] + "," + row[2] + ")", + Integer.parseInt(row[0]), + actual); + } + String[][] endTests = { + {"0", "\uDC00", "\uD801\uDC00"}, + {"1", "a", "ba"}, + {"0", "a", "bx"}, + {"1", "a\uDC00", "b\uDC00"}, + {"0", "\uD800\uDC00", "\uD801\uDC00"}, + }; + for (String[] row : endTests) { + int actual = findSharedEndLength(row[1], row[2]); + assertEquals("findSharedEndLength(" + row[1] + "," + row[2] + ")", + Integer.parseInt(row[0]), + actual); + } + } + + /** + * @param s + * @param t + * @return + */ + // TODO make generally available + private static int findSharedStartLength(CharSequence s, CharSequence t) { + int min = Math.min(s.length(), t.length()); + int i; + char sch, tch; + for (i = 0; i < min; ++i) { + sch = s.charAt(i); + tch = t.charAt(i); + if (sch != tch) { + break; + } + } + return CharSequences.onCharacterBoundary(s,i) && CharSequences.onCharacterBoundary(t,i) ? 
i : i - 1; + } + + /** + * @param s + * @param t + * @return + */ + // TODO make generally available + private static int findSharedEndLength(CharSequence s, CharSequence t) { + int slength = s.length(); + int tlength = t.length(); + int min = Math.min(slength, tlength); + int i; + char sch, tch; + // TODO can make the calculations slightly faster... Not sure if it is worth the complication, tho' + for (i = 0; i < min; ++i) { + sch = s.charAt(slength - i - 1); + tch = t.charAt(tlength - i - 1); + if (sch != tch) { + break; + } + } + return CharSequences.onCharacterBoundary(s,slength - i) && CharSequences.onCharacterBoundary(t,tlength - i) ? i : i - 1; + } + + enum SetAssert {EQUALS, MISSING_OK, EXTRA_OK} + + void assertEquals(String message, UnicodeSet empirical, UnicodeSet actual, SetAssert setAssert) { + boolean haveError = false; + if (!actual.containsAll(empirical)) { + UnicodeSet missing = new UnicodeSet(empirical).removeAll(actual); + errln(message + " \tgetXSet < empirical (" + missing.size() + "): " + toPattern(missing)); + haveError = true; + } + if (!empirical.containsAll(actual)) { + UnicodeSet extra = new UnicodeSet(actual).removeAll(empirical); + logln("WARNING: " + message + " \tgetXSet > empirical (" + extra.size() + "): " + toPattern(extra)); + haveError = true; + } + if (!haveError) { + logln("OK " + message + ' ' + toPattern(empirical)); + } + } + + private String toPattern(UnicodeSet missing) { + String result = missing.toPattern(false); + if (result.length() < 200) { + return result; + } + return result.substring(0, CharSequences.onCharacterBoundary(result, 200) ? 200 : 199) + "…"; + } + + /** * Test handling of rule whitespace, for both RBT and UnicodeSet. */ @@ -3741,7 +4109,7 @@ the ::BEGIN/::END stuff) Transliterator.createFromRules("gif", "\\", Transliterator.FORWARD); } catch(Exception e){ errln("TransliteratorParser.nextLine() was not suppose to return an " + - "exception for a rule of '\\'"); + "exception for a rule of '\\'"); } } }