ICU-10705 More substantive changes were needed. The code didn't get the CLDR data, and there were some problems with the algorithm. Added many more tests, and added a hack to get around the fact that the generated CLDR data is reordered (it needs to maintain the file order!)
X-SVN-Rev: 35193
This commit is contained in:
parent
f7100c3d6e
commit
2ccc9fb2bd
@ -1,6 +1,6 @@
|
||||
/*
|
||||
****************************************************************************************
|
||||
* Copyright (C) 2009-2013, Google, Inc.; International Business Machines Corporation *
|
||||
* Copyright (C) 2009-2014, Google, Inc.; International Business Machines Corporation *
|
||||
* and others. All Rights Reserved. *
|
||||
****************************************************************************************
|
||||
*/
|
||||
@ -11,9 +11,11 @@ import java.util.Iterator;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Map;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.ibm.icu.impl.ICUResourceBundle;
|
||||
import com.ibm.icu.impl.Row;
|
||||
import com.ibm.icu.impl.Row.R2;
|
||||
import com.ibm.icu.impl.Row.R3;
|
||||
@ -43,7 +45,10 @@ import com.ibm.icu.impl.Row.R3;
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public class LocaleMatcher {
|
||||
private static final boolean DEBUG = false;
|
||||
|
||||
private static boolean DEBUG = false;
|
||||
|
||||
private static final ULocale UNKNOWN_LOCALE = new ULocale("und");
|
||||
|
||||
/**
|
||||
* Threshold for falling back to the default (first) language. May make this
|
||||
@ -56,6 +61,11 @@ public class LocaleMatcher {
|
||||
*/
|
||||
private final ULocale defaultLanguage;
|
||||
|
||||
/**
|
||||
* The match threshold: if the best weight is below it, the default language is returned.
|
||||
*/
|
||||
private final double threshold;
|
||||
|
||||
/**
|
||||
* Create a new language matcher. The highest-weighted language is the
|
||||
* default. That means that if no other language matches closer than a given
|
||||
@ -89,12 +99,24 @@ public class LocaleMatcher {
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
public LocaleMatcher(LocalePriorityList languagePriorityList, LanguageMatcherData matcherData) {
|
||||
this(languagePriorityList, matcherData, DEFAULT_THRESHOLD);
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal testing function; may expose API later.
|
||||
* @param languagePriorityList LocalePriorityList to match
|
||||
* @param matcherData Internal matching data
|
||||
* @param threshold If the best weight is below this value, the default language is returned
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
public LocaleMatcher(LocalePriorityList languagePriorityList, LanguageMatcherData matcherData, double threshold) {
|
||||
this.matcherData = matcherData;
|
||||
for (final ULocale language : languagePriorityList) {
|
||||
add(language, languagePriorityList.getWeight(language));
|
||||
}
|
||||
Iterator<ULocale> it = languagePriorityList.iterator();
|
||||
defaultLanguage = it.hasNext() ? it.next() : null;
|
||||
this.threshold = threshold;
|
||||
}
|
||||
|
||||
|
||||
@ -136,7 +158,7 @@ public class LocaleMatcher {
|
||||
lang2 == null ? lang : lang2,
|
||||
script2 == null ? script : script2,
|
||||
region2 == null ? region : region2
|
||||
);
|
||||
);
|
||||
}
|
||||
return ulocale;
|
||||
}
|
||||
@ -159,7 +181,7 @@ public class LocaleMatcher {
|
||||
bestTableMatch = matchRow.get0();
|
||||
}
|
||||
}
|
||||
if (bestWeight < DEFAULT_THRESHOLD) {
|
||||
if (bestWeight < threshold) {
|
||||
bestTableMatch = defaultLanguage;
|
||||
}
|
||||
return bestTableMatch;
|
||||
@ -187,6 +209,14 @@ public class LocaleMatcher {
|
||||
return getBestMatchInternal(ulocale).get0();
|
||||
}
|
||||
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
public ULocale getBestMatch(ULocale... ulocales) {
|
||||
return getBestMatch(LocalePriorityList.add(ulocales).build());
|
||||
}
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
* @stable ICU 4.4
|
||||
@ -194,7 +224,7 @@ public class LocaleMatcher {
|
||||
@Override
|
||||
public String toString() {
|
||||
return "{" + defaultLanguage + ", "
|
||||
+ maximizedLanguageToWeight + "}";
|
||||
+ maximizedLanguageToWeight + "}";
|
||||
}
|
||||
// ================= Privates =====================
|
||||
|
||||
@ -217,7 +247,7 @@ public class LocaleMatcher {
|
||||
R2<ULocale, Double> row = maximizedLanguageToWeight.get(tableKey);
|
||||
final double match = match(languageCode, maximized, tableKey, row.get0());
|
||||
if (DEBUG) {
|
||||
System.out.println("\t" + tableKey + ";\t" + row.toString() + ";\t" + match);
|
||||
System.out.println("\t" + tableKey + ";\t" + row.toString() + ";\t" + match + "\n");
|
||||
}
|
||||
final double weight = match * row.get1();
|
||||
if (weight > bestWeight) {
|
||||
@ -225,7 +255,7 @@ public class LocaleMatcher {
|
||||
bestTableMatch = tableKey;
|
||||
}
|
||||
}
|
||||
if (bestWeight < DEFAULT_THRESHOLD) {
|
||||
if (bestWeight < threshold) {
|
||||
bestTableMatch = defaultLanguage;
|
||||
}
|
||||
return Row.R2.of(bestTableMatch, bestWeight);
|
||||
@ -252,6 +282,16 @@ public class LocaleMatcher {
|
||||
*/
|
||||
// TODO(markdavis): update the above when CLDR 1.6 is final.
|
||||
private ULocale addLikelySubtags(ULocale languageCode) {
|
||||
// max("und") = "en_Latn_US", and since matching is based on maximized tags, the undefined
|
||||
// language would normally match English. But that would produce the counterintuitive results
|
||||
// that getBestMatch("und", LocaleMatcher("it,en")) would be "en", and
|
||||
// getBestMatch("en", LocaleMatcher("it,und")) would be "und".
|
||||
//
|
||||
// To avoid that, we change the matcher's definitions of max (AddLikelySubtagsWithDefaults)
|
||||
// so that max("und")="und", implemented by the check below. As a result, "und" and "en" no longer match each other.
|
||||
if (languageCode.equals(UNKNOWN_LOCALE)) {
|
||||
return UNKNOWN_LOCALE;
|
||||
}
|
||||
final ULocale result = ULocale.addLikelySubtags(languageCode);
|
||||
// should have method on getLikelySubtags for this
|
||||
if (result == null || result.equals(languageCode)) {
|
||||
@ -275,9 +315,9 @@ public class LocaleMatcher {
|
||||
private String region;
|
||||
private Level level;
|
||||
static Pattern pattern = Pattern.compile(
|
||||
"([a-zA-Z]{1,8}|\\*)" +
|
||||
"(?:-([a-zA-Z]{4}|\\*))?" +
|
||||
"(?:-([a-zA-Z]{2}|[0-9]{3}|\\*))?");
|
||||
"([a-z]{1,8}|\\*)"
|
||||
+ "(?:[_-]([A-Z][a-z]{3}|\\*))?"
|
||||
+ "(?:[_-]([A-Z]{2}|[0-9]{3}|\\*))?");
|
||||
|
||||
public LocalePatternMatcher(String toMatch) {
|
||||
Matcher matcher = pattern.matcher(toMatch);
|
||||
@ -341,16 +381,32 @@ public class LocaleMatcher {
|
||||
}
|
||||
}
|
||||
|
||||
enum Level {language, script, region}
|
||||
enum Level {
|
||||
language(0.99),
|
||||
script(0.2),
|
||||
region(0.04);
|
||||
|
||||
final double worst;
|
||||
|
||||
Level(double d) {
|
||||
worst = d;
|
||||
}
|
||||
}
|
||||
|
||||
private static class ScoreData implements Freezable<ScoreData> {
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final double maxUnequal_changeD_sameS = 0.5;
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final double maxUnequal_changeEqual = 0.75;
|
||||
LinkedHashSet<Row.R3<LocalePatternMatcher,LocalePatternMatcher,Double>> scores = new LinkedHashSet<R3<LocalePatternMatcher, LocalePatternMatcher, Double>>();
|
||||
final double worst;
|
||||
final Level level;
|
||||
|
||||
public ScoreData(Level level) {
|
||||
this.level = level;
|
||||
this.worst = (1-(level == Level.language ? 90 : level == Level.script ? 20 : 4))/100.0;
|
||||
}
|
||||
|
||||
void addDataToScores(String desired, String supported, R3<LocalePatternMatcher,LocalePatternMatcher,Double> data) {
|
||||
@ -385,10 +441,13 @@ public class LocaleMatcher {
|
||||
* else
|
||||
* rd = 0.25*StdRDiff // lines 2,5
|
||||
*/
|
||||
|
||||
// example: input en-GB, supported en en-GB
|
||||
// we want to have a closer match with en-GB (NOTE(review): the original comment was cut off here -- confirm)
|
||||
|
||||
boolean desiredChange = desiredRaw.equals(desiredMax);
|
||||
boolean supportedChange = supportedRaw.equals(supportedMax);
|
||||
double distance;
|
||||
double distance = 0;
|
||||
if (!desiredMax.equals(supportedMax)) {
|
||||
// Map<String, Set<R3<LocalePatternMatcher,LocalePatternMatcher,Double>>> lang_result = scores.get(desiredMax);
|
||||
// if (lang_result == null) {
|
||||
@ -401,42 +460,63 @@ public class LocaleMatcher {
|
||||
// } else {
|
||||
distance = getRawScore(dMax, sMax);
|
||||
// }
|
||||
if (desiredChange == supportedChange) {
|
||||
distance *= 0.75;
|
||||
} else if (desiredChange) {
|
||||
distance *= 0.5;
|
||||
}
|
||||
} else if (desiredChange == supportedChange) { // maxes are equal, changes are equal
|
||||
distance = 0;
|
||||
// if (desiredChange == supportedChange) {
|
||||
// distance *= maxUnequal_changeEqual;
|
||||
// if (DEBUG) {
|
||||
// System.out.println("\t\t\t" + level + " Distance (maxD≠maxS, changeD=changeS)\t" + distance);
|
||||
// }
|
||||
// } else if (desiredChange) {
|
||||
// distance *= maxUnequal_changeD_sameS;
|
||||
// if (DEBUG) {
|
||||
// System.out.println("\t\t\t" + level + " Distance (maxD≠maxS, changeD, !changeS)\t" + distance);
|
||||
// }
|
||||
// } else {
|
||||
// if (DEBUG) {
|
||||
// System.out.println("\t\t\t" + level + " Distance (maxD≠maxS, !changeD, changeS)\t" + distance);
|
||||
// }
|
||||
// }
|
||||
} else if (!desiredRaw.equals(supportedRaw)) { // maxes are equal, changes are equal
|
||||
distance += 0.001;
|
||||
// if (DEBUG) {
|
||||
// System.out.println("\t\t\t" + level + " Distance (maxD=maxS, changeD=changeS)\t" + distance);
|
||||
// }
|
||||
} else { // maxes are equal, changes are different
|
||||
distance = 0.25*worst;
|
||||
// distance = 0.25*level.worst;
|
||||
// if (DEBUG) {
|
||||
// System.out.println("\t\t\t" + level + " Distance (maxD=maxS, changeD≠changeS)\t" + distance);
|
||||
// }
|
||||
}
|
||||
return distance;
|
||||
}
|
||||
|
||||
private double getRawScore(ULocale desiredLocale, ULocale supportedLocale) {
|
||||
if (DEBUG) {
|
||||
System.out.println("\t\t\tRaw Score:\t" + desiredLocale + ";\t" + supportedLocale);
|
||||
System.out.println("\t\t\t" + level + " Raw Score:\t" + desiredLocale + ";\t" + supportedLocale);
|
||||
}
|
||||
for (R3<LocalePatternMatcher,LocalePatternMatcher,Double> datum : scores) { // : result
|
||||
if (datum.get0().matches(desiredLocale)
|
||||
&& datum.get1().matches(supportedLocale)) {
|
||||
if (DEBUG) {
|
||||
System.out.println("\t\t\tFOUND\t" + datum);
|
||||
System.out.println("\t\t\t\tFOUND\t" + datum);
|
||||
}
|
||||
return datum.get2();
|
||||
}
|
||||
}
|
||||
if (DEBUG) {
|
||||
System.out.println("\t\t\tNOTFOUND\t" + worst);
|
||||
System.out.println("\t\t\t\tNOTFOUND\t" + level.worst);
|
||||
}
|
||||
return worst;
|
||||
return level.worst;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return level + ", " + scores;
|
||||
StringBuilder result = new StringBuilder().append(level);
|
||||
for (R3<LocalePatternMatcher, LocalePatternMatcher, Double> score : scores) {
|
||||
result.append("\n\t\t").append(score);
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public ScoreData cloneAsThawed() {
|
||||
try {
|
||||
@ -478,6 +558,14 @@ public class LocaleMatcher {
|
||||
public LanguageMatcherData() {
|
||||
}
|
||||
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
public String toString() {
|
||||
return languageScores + "\n\t" + scriptScores + "\n\t" + regionScores;
|
||||
}
|
||||
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
@ -489,13 +577,16 @@ public class LocaleMatcher {
|
||||
diff += regionScores.getScore(a, aMax, a.getCountry(), aMax.getCountry(), b, bMax, b.getCountry(), bMax.getCountry());
|
||||
|
||||
if (!a.getVariant().equals(b.getVariant())) {
|
||||
diff += 1;
|
||||
diff += 0.01;
|
||||
}
|
||||
if (diff < 0.0d) {
|
||||
diff = 0.0d;
|
||||
} else if (diff > 1.0d) {
|
||||
diff = 1.0d;
|
||||
}
|
||||
if (DEBUG) {
|
||||
System.out.println("\t\t\tTotal Distance\t" + diff);
|
||||
}
|
||||
return 1.0 - diff;
|
||||
}
|
||||
|
||||
@ -551,7 +642,7 @@ public class LocaleMatcher {
|
||||
LocalePatternMatcher supportedMatcher = new LocalePatternMatcher(supported);
|
||||
Level supportedLen = supportedMatcher.getLevel();
|
||||
if (desiredLen != supportedLen) {
|
||||
throw new IllegalArgumentException();
|
||||
throw new IllegalArgumentException("Lengths unequal: " + desired + ", " + supported);
|
||||
}
|
||||
R3<LocalePatternMatcher,LocalePatternMatcher,Double> data = Row.of(desiredMatcher, supportedMatcher, score);
|
||||
R3<LocalePatternMatcher,LocalePatternMatcher,Double> data2 = oneway ? null : Row.of(supportedMatcher, desiredMatcher, score);
|
||||
@ -626,39 +717,150 @@ public class LocaleMatcher {
|
||||
|
||||
LanguageMatcherData matcherData;
|
||||
|
||||
private static LanguageMatcherData defaultWritten = new LanguageMatcherData()
|
||||
// TODO get data from CLDR
|
||||
.addDistance("no", "nb", 100, "The language no is normally taken as nb in content; we might alias this for lookup.")
|
||||
.addDistance("nn", "nb", 96)
|
||||
.addDistance("nn", "no", 96)
|
||||
.addDistance("da", "no", 90, "Danish and norwegian are reasonably close.")
|
||||
.addDistance("da", "nb", 90)
|
||||
.addDistance("hr", "br", 96, "Serbo-croatian variants are all very close.")
|
||||
.addDistance("sh", "br", 96)
|
||||
.addDistance("sr", "br", 96)
|
||||
.addDistance("sh", "hr", 96)
|
||||
.addDistance("sr", "hr", 96)
|
||||
.addDistance("sh", "sr", 96)
|
||||
.addDistance("sr-Latn", "sr-Cyrl", 90, "Most serbs can read either script.")
|
||||
.addDistance("*-Hans", "*-Hant", 85, true, "Readers of simplified can read traditional much better than reverse.")
|
||||
.addDistance("*-Hant", "*-Hans", 75, true)
|
||||
.addDistance("en-*-US", "en-*-CA", 98, "US is different than others, and Canadian is inbetween.")
|
||||
.addDistance("en-*-US", "en-*-*", 97)
|
||||
.addDistance("en-*-CA", "en-*-*", 98)
|
||||
.addDistance("en-*-*", "en-*-*", 99)
|
||||
.addDistance("es-*-ES", "es-*-ES", 100, "Latin American Spanishes are closer to each other. Approximate by having es-ES be further from everything else.")
|
||||
.addDistance("es-*-ES", "es-*-*", 93)
|
||||
.addDistance("*", "*", 1, "[Default value -- must be at end!] Normally there is no comprehension of different languages.")
|
||||
.addDistance("*-*", "*-*", 20, "[Default value -- must be at end!] Normally there is little comprehension of different scripts.")
|
||||
.addDistance("*-*-*", "*-*-*", 96, "[Default value -- must be at end!] Normally there are small differences across regions.")
|
||||
.freeze();
|
||||
private static final LanguageMatcherData defaultWritten;
|
||||
// = new LanguageMatcherData()
|
||||
// // TODO get data from CLDR
|
||||
// .addDistance("no", "nb", 100, "The language no is normally taken as nb in content; we might alias this for lookup.")
|
||||
// .addDistance("nn", "nb", 96)
|
||||
// .addDistance("nn", "no", 96)
|
||||
// .addDistance("da", "no", 90, "Danish and norwegian are reasonably close.")
|
||||
// .addDistance("da", "nb", 90)
|
||||
// .addDistance("hr", "br", 96, "Serbo-croatian variants are all very close.")
|
||||
// .addDistance("sh", "br", 96)
|
||||
// .addDistance("sr", "br", 96)
|
||||
// .addDistance("sh", "hr", 96)
|
||||
// .addDistance("sr", "hr", 96)
|
||||
// .addDistance("sh", "sr", 96)
|
||||
// .addDistance("sr-Latn", "sr-Cyrl", 90, "Most serbs can read either script.")
|
||||
// .addDistance("*-Hans", "*-Hant", 85, true, "Readers of simplified can read traditional much better than reverse.")
|
||||
// .addDistance("*-Hant", "*-Hans", 75, true)
|
||||
// .addDistance("en-*-US", "en-*-*", 97, "Non-US English variants are closer to each other (written). Make en-US be further from everything else.")
|
||||
// .addDistance("en-*-*", "en-*-*", 99)
|
||||
// .addDistance("es-*-ES", "es-*-*", 97, "Latin American Spanishes are closer to each other. Make es-ES be further from everything else.")
|
||||
// .addDistance("es-*-419", "es-*-*", 99, "Have es-MX, es-AR, etc be closer to es-419 than to each other")
|
||||
// .addDistance("es-*-*", "es-*-*", 97)
|
||||
// .addDistance("*", "*", 1, "[Default value -- must be at end!] Normally there is no comprehension of different languages.")
|
||||
// .addDistance("*-*", "*-*", 20, "[Default value -- must be at end!] Normally there is little comprehension of different scripts.")
|
||||
// .addDistance("*-*-*", "*-*-*", 96, "[Default value -- must be at end!] Normally there are small differences across regions.")
|
||||
// .freeze();
|
||||
|
||||
private static HashMap<String,String> canonicalMap = new HashMap<String, String>();
|
||||
|
||||
static class DataHack implements Comparable<DataHack>{
|
||||
final String source;
|
||||
final String target;
|
||||
int percent;
|
||||
public DataHack(String source, String target, int percent) {
|
||||
this.source = source;
|
||||
this.target = target.equals("de_CH") ? "de" : target; // hack to fix bad data
|
||||
this.percent = percent;
|
||||
}
|
||||
static final Pattern STAR_KEEP = Pattern.compile("([^_]+)(?:_[^_]+(?:_[^_]+)?)?");
|
||||
public int compareTo(DataHack other) {
|
||||
// this is just a one-time hack so we don't need to optimize
|
||||
int diff = getUnderbars(source) - getUnderbars(other.source);
|
||||
if (0 != diff) {
|
||||
return diff;
|
||||
}
|
||||
String thisSource = source.replace('*', 'þ'); // just something after Z
|
||||
String otherSource = other.source.replace('*', 'þ'); // just something after Z
|
||||
diff = thisSource.compareTo(otherSource);
|
||||
if (0 != diff) {
|
||||
return diff;
|
||||
}
|
||||
String thisTarget = target.replace('*', 'þ'); // just something after Z
|
||||
String otherTarget = other.target.replace('*', 'þ'); // just something after Z
|
||||
diff = thisTarget.compareTo(otherTarget);
|
||||
|
||||
// Matcher matcher = STAR_KEEP.matcher(source);
|
||||
// matcher.matches();
|
||||
// String first = matcher.group(0);
|
||||
// String second = matcher.group(1);
|
||||
// String third = matcher.group(2);
|
||||
// Matcher matcherB = STAR_KEEP.matcher(source);
|
||||
// String firstB = matcher.group(0);
|
||||
// String secondB = matcher.group(1);
|
||||
// String thirdB = matcher.group(2);
|
||||
//
|
||||
// int diff = onlyStars.length() - onlyStarsOther.length();
|
||||
|
||||
if (0 != diff) {
|
||||
return diff;
|
||||
}
|
||||
diff = source.compareTo(other.source);
|
||||
if (0 != diff) {
|
||||
return diff;
|
||||
}
|
||||
return target.compareTo(other.target);
|
||||
}
|
||||
/**
|
||||
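* Counts the underscore separators in a locale pattern, stopping at two.
|
||||
* @param source2 pattern such as "en" or "en_*_US"
|
||||
* @return 0, 1, or 2, according to how many '_' characters were found (at most two are counted)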
|
||||
*/
|
||||
private int getUnderbars(String source2) {
|
||||
int pos = source2.indexOf('_');
|
||||
if (pos < 0) {
|
||||
return 0;
|
||||
}
|
||||
pos = source2.indexOf('_',pos+1);
|
||||
return pos < 0 ? 1 : 2;
|
||||
}
|
||||
public String toString() {
|
||||
return source + ", " + target + " => " + percent;
|
||||
}
|
||||
}
|
||||
|
||||
static {
|
||||
// TODO get data from CLDR
|
||||
canonicalMap.put("iw", "he");
|
||||
canonicalMap.put("mo", "ro");
|
||||
canonicalMap.put("tl", "fil");
|
||||
|
||||
ICUResourceBundle suppData = getICUSupplementalData();
|
||||
ICUResourceBundle languageMatching = suppData.findTopLevel("languageMatching");
|
||||
ICUResourceBundle written = (ICUResourceBundle) languageMatching.get("written");
|
||||
defaultWritten = new LanguageMatcherData();
|
||||
// HACK
|
||||
// The data coming from ICU may be old, and badly ordered.
|
||||
TreeSet<DataHack> hack = new TreeSet<DataHack>();
|
||||
defaultWritten.addDistance("en_*_US", "en_*_*", 97);
|
||||
defaultWritten.addDistance("en_*_GB", "en_*_*", 98);
|
||||
defaultWritten.addDistance("es_*_ES", "es_*_*", 97);
|
||||
defaultWritten.addDistance("es_*_419", "es_*_*", 99);
|
||||
defaultWritten.addDistance("es_*_*", "es_*_*", 98);
|
||||
|
||||
for(UResourceBundleIterator iter = written.getIterator(); iter.hasNext();) {
|
||||
ICUResourceBundle item = (ICUResourceBundle) iter.next();
|
||||
/*
|
||||
"*_*_*",
|
||||
"*_*_*",
|
||||
"96",
|
||||
*/
|
||||
hack.add(new DataHack(item.getString(0), item.getString(1), Integer.parseInt(item.getString(2))));
|
||||
}
|
||||
for (DataHack dataHack : hack) {
|
||||
defaultWritten.addDistance(dataHack.source, dataHack.target, dataHack.percent);
|
||||
}
|
||||
defaultWritten.freeze();
|
||||
}
|
||||
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
public static ICUResourceBundle getICUSupplementalData() {
|
||||
ICUResourceBundle suppData = (ICUResourceBundle) UResourceBundle.getBundleInstance(
|
||||
ICUResourceBundle.ICU_BASE_NAME,
|
||||
"supplementalData",
|
||||
ICUResourceBundle.ICU_DATA_CLASS_LOADER);
|
||||
return suppData;
|
||||
}
|
||||
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
public static double match(ULocale a, ULocale b) {
|
||||
final LocaleMatcher matcher = new LocaleMatcher("");
|
||||
return matcher.match(a, matcher.addLikelySubtags(a), b, matcher.addLikelySubtags(b));
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010-2011, Google, Inc.; International Business Machines *
|
||||
* Copyright (C) 2010-2014, Google, Inc.; International Business Machines *
|
||||
* Corporation and others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -81,7 +81,7 @@ public class LocalePriorityList implements Iterable<ULocale> {
|
||||
* @return internal builder, for chaining
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
public static Builder add(ULocale languageCode) {
|
||||
public static Builder add(ULocale... languageCode) {
|
||||
return new Builder().add(languageCode);
|
||||
}
|
||||
|
||||
|
@ -1,12 +1,15 @@
|
||||
/*
|
||||
******************************************************************************************
|
||||
* Copyright (C) 2009-2010, Google, Inc.; International Business Machines Corporation and *
|
||||
* Copyright (C) 2009-2014, Google, Inc.; International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
******************************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.dev.test.util;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.util.LocaleMatcher;
|
||||
import com.ibm.icu.util.LocaleMatcher.LanguageMatcherData;
|
||||
@ -24,6 +27,41 @@ public class LocaleMatcherTest extends TestFmwk {
|
||||
new LocaleMatcherTest().run(args);
|
||||
}
|
||||
|
||||
public void testenGB() {
|
||||
final LocaleMatcher matcher = new LocaleMatcher("fr, en, en_GB, es_MX, es_419, es");
|
||||
assertEquals("en_GB", matcher.getBestMatch("en_NZ").toString());
|
||||
assertEquals("es", matcher.getBestMatch("es_ES").toString());
|
||||
assertEquals("es_419", matcher.getBestMatch("es_AR").toString());
|
||||
assertEquals("es_MX", matcher.getBestMatch("es_MX").toString());
|
||||
}
|
||||
|
||||
public void testFallbacks() {
|
||||
final LocaleMatcher matcher = new LocaleMatcher("en, hi");
|
||||
if (!logKnownIssue("10705", "Need new data from CLDR for languageMatching")) {
|
||||
assertEquals("hi", matcher.getBestMatch("sa").toString());
|
||||
}
|
||||
}
|
||||
|
||||
public void testOverrideData() {
|
||||
double threshold = 0.05;
|
||||
LanguageMatcherData localeMatcherData = new LanguageMatcherData()
|
||||
.addDistance("br", "fr", 10, true)
|
||||
.addDistance("es", "cy", 10, true)
|
||||
;
|
||||
logln(localeMatcherData.toString());
|
||||
|
||||
final LocaleMatcher matcher = new LocaleMatcher(
|
||||
LocalePriorityList
|
||||
.add(ULocale.ENGLISH)
|
||||
.add(ULocale.FRENCH)
|
||||
.add(ULocale.UK)
|
||||
.build(), localeMatcherData , threshold);
|
||||
logln(matcher.toString());
|
||||
|
||||
assertEquals(ULocale.FRENCH, matcher.getBestMatch(new ULocale("br")));
|
||||
assertEquals(ULocale.ENGLISH, matcher.getBestMatch(new ULocale("es"))); // one way
|
||||
}
|
||||
|
||||
public void testBasics() {
|
||||
final LocaleMatcher matcher = new LocaleMatcher(LocalePriorityList.add(ULocale.FRENCH).add(ULocale.UK)
|
||||
.add(ULocale.ENGLISH).build());
|
||||
@ -84,5 +122,224 @@ public class LocaleMatcherTest extends TestFmwk {
|
||||
private void assertEquals(Object expected, Object string) {
|
||||
assertEquals("", expected, string);
|
||||
}
|
||||
private void assertNull(Object bestMatch) {
|
||||
assertNull("", bestMatch);
|
||||
}
|
||||
|
||||
public void testEmpty() {
|
||||
final LocaleMatcher matcher = new LocaleMatcher("");
|
||||
assertNull(matcher.getBestMatch(ULocale.FRENCH));
|
||||
}
|
||||
|
||||
static final ULocale ENGLISH_CANADA = new ULocale("en_CA");
|
||||
|
||||
public void testMatch_exact() {
|
||||
assertEquals(1.0,
|
||||
LocaleMatcher.match(ENGLISH_CANADA, ENGLISH_CANADA));
|
||||
}
|
||||
|
||||
public void testMatch_none() {
|
||||
double match = LocaleMatcher.match(
|
||||
new ULocale("ar_MK"),
|
||||
ENGLISH_CANADA);
|
||||
assertTrue("Actual < 0: " + match, 0 <= match);
|
||||
assertTrue("Actual > 0.15 (~ language + script distance): " + match, 0.2 > match);
|
||||
}
|
||||
|
||||
public void testMatch_matchOnMazimized() {
|
||||
ULocale undTw = new ULocale("und_TW");
|
||||
ULocale zhHant = new ULocale("zh_Hant");
|
||||
double matchZh = LocaleMatcher.match(undTw, new ULocale("zh"));
|
||||
double matchZhHant = LocaleMatcher.match(undTw, zhHant);
|
||||
assertTrue("und_TW should be closer to zh_Hant (" + matchZhHant +
|
||||
") than to zh (" + matchZh + ")",
|
||||
matchZh < matchZhHant);
|
||||
double matchEnHantTw = LocaleMatcher.match(new ULocale("en_Hant_TW"),
|
||||
zhHant);
|
||||
assertTrue("zh_Hant should be closer to und_TW (" + matchZhHant +
|
||||
") than to en_Hant_TW (" + matchEnHantTw + ")",
|
||||
matchEnHantTw < matchZhHant);
|
||||
assertTrue("zh should be closer to und_TW (" + matchZh +
|
||||
") than to en_Hant_TW (" + matchEnHantTw + ")",
|
||||
matchEnHantTw < matchZh);
|
||||
}
|
||||
|
||||
public void testMatchGrandfatheredCode() {
|
||||
final LocaleMatcher matcher = new LocaleMatcher("fr, i_klingon, en_Latn_US");
|
||||
assertEquals("en_Latn_US", matcher.getBestMatch("en_GB_oed").toString());
|
||||
//assertEquals("tlh", matcher.getBestMatch("i_klingon").toString());
|
||||
}
|
||||
|
||||
public void testGetBestMatchForList_exactMatch() {
|
||||
final LocaleMatcher matcher = new LocaleMatcher("fr, en_GB, ja, es_ES, es_MX");
|
||||
assertEquals("ja", matcher.getBestMatch("ja, de").toString());
|
||||
}
|
||||
|
||||
public void testGetBestMatchForList_simpleVariantMatch() {
|
||||
final LocaleMatcher matcher = new LocaleMatcher("fr, en_GB, ja, es_ES, es_MX");
|
||||
// Intentionally avoiding a perfect match or two candidates for variant matches.
|
||||
assertEquals("en_GB", matcher.getBestMatch("de, en_US").toString());
|
||||
// Fall back.
|
||||
assertEquals("fr", matcher.getBestMatch("de, zh").toString());
|
||||
}
|
||||
|
||||
public void testGetBestMatchForList_matchOnMaximized() {
|
||||
final LocaleMatcher matcher = new LocaleMatcher("en, ja");
|
||||
//final LocaleMatcher matcher = new LocaleMatcher("fr, en, ja, es_ES, es_MX");
|
||||
// Check that if the preference is maximized already, it works as well.
|
||||
assertEquals("Match for ja_Jpan_JP (maximized already)",
|
||||
"ja", matcher.getBestMatch("ja_Jpan_JP, en-AU").toString());
|
||||
if (true) return;
|
||||
// ja_JP matches ja on likely subtags, and it's listed first,
|
||||
// thus it wins over the second preference en_US.
|
||||
assertEquals("Match for ja_JP, with likely region subtag",
|
||||
"ja", matcher.getBestMatch("ja_JP, en_US").toString());
|
||||
// Check that if the preference is maximized already, it works as well.
|
||||
assertEquals("Match for ja_Jpan_JP (maximized already)",
|
||||
"ja", matcher.getBestMatch("ja_Jpan_JP, en_US").toString());
|
||||
}
|
||||
|
||||
public void testGetBestMatchForList_noMatchOnMaximized() {
|
||||
// Regression test for http://b/5714572 .
|
||||
final LocaleMatcher matcher = new LocaleMatcher("en, de, fr, ja");
|
||||
// de maximizes to de_DE. Pick the exact match for the secondary language instead.
|
||||
assertEquals("fr", matcher.getBestMatch("de_CH, fr").toString());
|
||||
}
|
||||
|
||||
public void testBestMatchForTraditionalChinese() {
|
||||
// Scenario: An application that only supports Simplified Chinese (and some other languages),
|
||||
// but does not support Traditional Chinese. zh_Hans_CN could be replaced with zh_CN, zh, or
|
||||
// zh_Hans, it wouldn't make much of a difference.
|
||||
final LocaleMatcher matcher = new LocaleMatcher("fr, zh_Hans_CN, en_US");
|
||||
|
||||
// The script distance (simplified vs. traditional Han) is considered small enough
|
||||
// to be an acceptable match. The regional difference is considered almost insignificant.
|
||||
assertEquals("zh_Hans_CN", matcher.getBestMatch("zh_TW").toString());
|
||||
assertEquals("zh_Hans_CN", matcher.getBestMatch("zh_Hant").toString());
|
||||
|
||||
// For geopolitical reasons, you might want to avoid a zh_Hant -> zh_Hans match.
|
||||
// In this case, if zh_TW, zh_HK or a tag starting with zh_Hant is requested, you can
|
||||
// change your call to getBestMatch to include a 2nd language preference.
|
||||
// "en" is a better match since its distance to "en_US" is smaller than the distance
|
||||
// from "zh_TW" to "zh_CN" (script distance).
|
||||
assertEquals("en_US", matcher.getBestMatch("zh_TW, en").toString());
|
||||
assertEquals("en_US", matcher.getBestMatch("zh_Hant_CN, en").toString());
|
||||
assertEquals("zh_Hans_CN", matcher.getBestMatch("zh_Hans, en").toString());
|
||||
}
|
||||
|
||||
public void testUndefined() {
|
||||
// When the undefined language doesn't match anything in the list, getBestMatch returns
|
||||
// the default, as usual.
|
||||
LocaleMatcher matcher = new LocaleMatcher("it,fr");
|
||||
assertEquals("it", matcher.getBestMatch("und").toString());
|
||||
|
||||
// When it *does* occur in the list, BestMatch returns it, as expected.
|
||||
matcher = new LocaleMatcher("it,und");
|
||||
assertEquals("und", matcher.getBestMatch("und").toString());
|
||||
|
||||
// The unusual part:
|
||||
// max("und") = "en_Latn_US", and since matching is based on maximized tags, the undefined
|
||||
// language would normally match English. But that would produce the counterintuitive results
|
||||
// that getBestMatch("und", LocaleMatcher("it,en")) would be "en", and
|
||||
// getBestMatch("en", LocaleMatcher("it,und")) would be "und".
|
||||
//
|
||||
// To avoid that, we change the matcher's definitions of max (AddLikelySubtagsWithDefaults)
|
||||
// so that max("und")="und". That produces the following, more desirable results:
|
||||
matcher = new LocaleMatcher("it,en");
|
||||
assertEquals("it", matcher.getBestMatch("und").toString());
|
||||
matcher = new LocaleMatcher("it,und");
|
||||
assertEquals("it", matcher.getBestMatch("en").toString());
|
||||
}
|
||||
|
||||
// public void testGetBestMatch_emptyList() {
|
||||
// final LocaleMatcher matcher = new LocaleMatcher(
|
||||
// new LocalePriorityList(new HashMap()));
|
||||
// assertNull(matcher.getBestMatch(ULocale.ENGLISH));
|
||||
// }
|
||||
|
||||
public void testGetBestMatch_googlePseudoLocales() {
|
||||
// Google pseudo locales are primarily based on variant subtags.
|
||||
// See http://sites/intl_eng/pseudo_locales.
|
||||
// (See below for the region code based fall back options.)
|
||||
final LocaleMatcher matcher = new LocaleMatcher(
|
||||
"fr, pt");
|
||||
assertEquals("fr", matcher.getBestMatch("de").toString());
|
||||
assertEquals("fr", matcher.getBestMatch("en_US").toString());
|
||||
assertEquals("fr", matcher.getBestMatch("en").toString());
|
||||
assertEquals("pt", matcher.getBestMatch("pt_BR").toString());
|
||||
}
|
||||
|
||||
public void testGetBestMatch_regionDistance() {
|
||||
LocaleMatcher matcher = new LocaleMatcher("es_AR, es");
|
||||
assertEquals("es_AR", matcher.getBestMatch("es_MX").toString());
|
||||
|
||||
matcher = new LocaleMatcher("fr, en, en_CA");
|
||||
assertEquals("en_CA", matcher.getBestMatch("en_GB").toString());
|
||||
|
||||
matcher = new LocaleMatcher("de_AT, de_DE, de_CH");
|
||||
assertEquals("de_DE", matcher.getBestMatch("de").toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* If all the base languages are the same, then each sublocale matches itself most closely
|
||||
*/
|
||||
public void testExactMatches() {
|
||||
String lastBase = "";
|
||||
TreeSet<ULocale> sorted = new TreeSet();
|
||||
for (ULocale loc : ULocale.getAvailableLocales()) {
|
||||
String language = loc.getLanguage();
|
||||
if (!lastBase.equals(language)) {
|
||||
check(sorted);
|
||||
sorted.clear();
|
||||
lastBase = language;
|
||||
}
|
||||
sorted.add(loc);
|
||||
}
|
||||
check(sorted);
|
||||
}
|
||||
|
||||
private void check(Set<ULocale> sorted) {
|
||||
if (sorted.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
check2(sorted);
|
||||
ULocale first = sorted.iterator().next();
|
||||
ULocale max = ULocale.addLikelySubtags(first);
|
||||
sorted.add(max);
|
||||
check2(sorted);
|
||||
}
|
||||
/**
|
||||
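* Asserts that every locale in the set is its own best match when the set is used as the supported list.
|
||||
* @param sorted locales to build the matcher from and then look up one by one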
|
||||
*/
|
||||
private void check2(Set<ULocale> sorted) {
|
||||
// Build a matcher over exactly these locales, then look each one up.
|
||||
logln("Checking: " + sorted);
|
||||
LocaleMatcher matcher = new LocaleMatcher(
|
||||
LocalePriorityList.add(
|
||||
sorted.toArray(new ULocale[sorted.size()]))
|
||||
.build());
|
||||
for (ULocale loc : sorted) {
|
||||
String stringLoc = loc.toString();
|
||||
assertEquals(stringLoc, matcher.getBestMatch(stringLoc).toString());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// public void testComputeDistance_monkeyTest() {
|
||||
// RegionCode[] codes = RegionCode.values();
|
||||
// Random random = new Random();
|
||||
// for (int i = 0; i < 1000; ++i) {
|
||||
// RegionCode x = codes[random.nextInt(codes.length)];
|
||||
// RegionCode y = codes[random.nextInt(codes.length)];
|
||||
// double d = LocaleMatcher.getRegionDistance(x, y, null, null);
|
||||
// if (x == RegionCode.ZZ || y == RegionCode.ZZ) {
|
||||
// assertEquals(LocaleMatcher.REGION_DISTANCE, d);
|
||||
// } else if (x == y) {
|
||||
// assertEquals(0.0, d);
|
||||
// } else {
|
||||
// assertTrue(d > 0);
|
||||
// assertTrue(d <= LocaleMatcher.REGION_DISTANCE);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user