ICU-20092 Integrates new languagematcher enhancements into ICU4J.
This commit is contained in:
parent
03c6e86c6c
commit
c854dd0d54
@ -136,7 +136,9 @@ public class XLikelySubtags {
|
||||
// //new UnicodeRegex().compileBnf(pat)
|
||||
// );
|
||||
//
|
||||
// TODO: fix this to check for format. Not required, since this is only called internally, but safer for the future.
|
||||
// NOTE: Should we fix this to check for format?
|
||||
// ANSWER: Not required, since this is only called internally. Moreover, we deliberately
|
||||
// use invalid language tags ("x1", "x2", etc.) to represent pseudo-locales. See below.
|
||||
static LSR from(String languageIdentifier) {
|
||||
String[] parts = languageIdentifier.split("[-_]");
|
||||
if (parts.length < 1 || parts.length > 3) {
|
||||
@ -147,19 +149,64 @@ public class XLikelySubtags {
|
||||
String p3 = parts.length < 3 ? "" : parts[2];
|
||||
return p2.length() < 4 ? new LSR(lang, "", p2) : new LSR(lang, p2, p3);
|
||||
|
||||
// Matcher matcher = LANGUAGE_PATTERN.matcher(languageIdentifier);
|
||||
// if (!matcher.matches()) {
|
||||
// return new LSR(matcher.group(1), matcher.group(2), matcher.group(3));
|
||||
// }
|
||||
// System.out.println(RegexUtilities.showMismatch(matcher, languageIdentifier));
|
||||
// throw new ICUException("invalid language id");
|
||||
// Matcher matcher = LANGUAGE_PATTERN.matcher(languageIdentifier);
|
||||
// if (!matcher.matches()) {
|
||||
// return new LSR(matcher.group(1), matcher.group(2), matcher.group(3));
|
||||
// }
|
||||
// System.out.println(RegexUtilities.showMismatch(matcher, languageIdentifier));
|
||||
// throw new ICUException("invalid language id");
|
||||
}
|
||||
|
||||
private static final HashMap<ULocale, LSR> pseudoReplacements = new HashMap<ULocale, LSR>(11);
|
||||
|
||||
// Note: code in XLocaleDistance.java handles pseudo-regions XA, XB, and XC, making them
|
||||
// very distant from any other locale. Similarly, it establishes that any of the
|
||||
// invalid locales below ("x1", "x2", ..., "x7", and "x8-en") are very distant
|
||||
// from any other locale.
|
||||
static {
|
||||
String[][] source = {
|
||||
{"x-bork", "x1", "", ""},
|
||||
{"x-elmer", "x2", "", ""},
|
||||
{"x-hacker", "x3", "", ""},
|
||||
{"x-piglatin", "x4", "", ""},
|
||||
{"x-pirate", "x5", "", ""},
|
||||
{"en-XA", "x6", "", ""},
|
||||
{"en-PSACCENT", "x6", "", ""}, // Note: same as for ex-XA
|
||||
{"ar-XB", "x7", "", ""},
|
||||
{"ar-PSBIDI", "x7", "", ""}, // Note: same as for ar-XB
|
||||
{"en-XC", "x8", "en", ""}, // Note: language is stored in LSR.script field
|
||||
{"en-PSCRACK", "x8", "en", ""}, // Note: same as for en-XC
|
||||
};
|
||||
for (int i = 0; i < source.length; ++i) {
|
||||
pseudoReplacements.put(new ULocale(source[i][0]),
|
||||
new LSR(source[i][1], source[i][2], source[i][3]));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static LSR from(ULocale locale) {
|
||||
LSR replacement = pseudoReplacements.get(locale);
|
||||
if (replacement != null) {
|
||||
return replacement;
|
||||
}
|
||||
// Map *-*-*-PSCRACK to x8-***, same as for en-PSCRACK.
|
||||
if ("PSCRACK".equals(locale.getVariant())) {
|
||||
return new LSR(
|
||||
"x8", locale.getLanguage() + locale.getScript() + locale.getCountry(), "");
|
||||
}
|
||||
return new LSR(locale.getLanguage(), locale.getScript(), locale.getCountry());
|
||||
}
|
||||
|
||||
public static LSR fromMaximalized(ULocale locale) {
|
||||
LSR replacement = pseudoReplacements.get(locale);
|
||||
if (replacement != null) {
|
||||
return replacement;
|
||||
}
|
||||
// Map *-*-*-PSCRACK to x8-***, same as for en-PSCRACK.
|
||||
if ("PSCRACK".equals(locale.getVariant())) {
|
||||
return new LSR(
|
||||
"x8", locale.getLanguage() + locale.getScript() + locale.getCountry(), "");
|
||||
}
|
||||
return fromMaximalized(locale.getLanguage(), locale.getScript(), locale.getCountry());
|
||||
}
|
||||
|
||||
|
@ -46,6 +46,10 @@ public class XLocaleDistance {
|
||||
|
||||
public static final int ABOVE_THRESHOLD = 100;
|
||||
|
||||
// Activates debugging output to stderr with details of GetBestMatch.
|
||||
// Be sure to set this to false before checking this in for production!
|
||||
private static final boolean TRACE_DISTANCE = false;
|
||||
|
||||
@Deprecated
|
||||
public static final String ANY = "<EFBFBD>"; // matches any character. Uses value above any subtag.
|
||||
|
||||
@ -441,6 +445,10 @@ public class XLocaleDistance {
|
||||
|
||||
@Override
|
||||
public int getDistance(String desired, String supported, Output<DistanceTable> distanceTable, boolean starEquals) {
|
||||
if (TRACE_DISTANCE) {
|
||||
System.err.printf(" Entering getDistance: desired=%s supported=%s starEquals=%s\n",
|
||||
desired, supported, Boolean.toString(starEquals));
|
||||
}
|
||||
boolean star = false;
|
||||
Map<String, DistanceNode> sub2 = subtables.get(desired);
|
||||
if (sub2 == null) {
|
||||
@ -462,7 +470,11 @@ public class XLocaleDistance {
|
||||
if (distanceTable != null) {
|
||||
distanceTable.value = ((StringDistanceNode) value).distanceTable;
|
||||
}
|
||||
return starEquals && star && desired.equals(supported) ? 0 : value.distance;
|
||||
int result = starEquals && star && desired.equals(supported) ? 0 : value.distance;
|
||||
if (TRACE_DISTANCE) {
|
||||
System.err.printf(" Returning from getDistance: %d\n", result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public void copy(StringDistanceTable other) {
|
||||
@ -619,6 +631,7 @@ public class XLocaleDistance {
|
||||
buffer.append('\t').append('#').append(id).append('\n');
|
||||
} else {
|
||||
((StringDistanceTable)distanceTable).toString(abbreviate, indent+"\t\t\t", intern, buffer);
|
||||
buffer.append('\n');
|
||||
}
|
||||
} else {
|
||||
buffer.append('\n');
|
||||
@ -726,17 +739,31 @@ public class XLocaleDistance {
|
||||
* ULocales must be in canonical, addLikelySubtags format. Returns distance
|
||||
*/
|
||||
public int distanceRaw(LSR desired, LSR supported, int threshold, DistanceOption distanceOption) {
|
||||
return distanceRaw(desired.language, supported.language,
|
||||
if (TRACE_DISTANCE) {
|
||||
System.err.printf(" Entering distanceRaw: desired=%s supported=%s "
|
||||
+ "threshold=%d preferred=%s\n",
|
||||
desired, supported, threshold,
|
||||
distanceOption.name());
|
||||
}
|
||||
int result = distanceRaw(desired.language, supported.language,
|
||||
desired.script, supported.script,
|
||||
desired.region, supported.region,
|
||||
threshold, distanceOption);
|
||||
if (TRACE_DISTANCE) {
|
||||
System.err.printf(" Returning from distanceRaw: %d\n", result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public enum DistanceOption {NORMAL, SCRIPT_FIRST}
|
||||
public enum DistanceOption {REGION_FIRST, SCRIPT_FIRST}
|
||||
// NOTE: Replaced "NORMAL" with "REGION_FIRST". By default, scripts have greater weight
|
||||
// than regions, so they might be considered the "normal" case.
|
||||
|
||||
/**
|
||||
* Returns distance, from 0 to ABOVE_THRESHOLD.
|
||||
* ULocales must be in canonical, addLikelySubtags format. Returns distance
|
||||
* ULocales must be in canonical, addLikelySubtags format.
|
||||
* (Exception: internal calls may pass any strings. They do this for pseudo-locales.)
|
||||
* Returns distance.
|
||||
*/
|
||||
public int distanceRaw(
|
||||
String desiredLang, String supportedLang,
|
||||
@ -942,6 +969,28 @@ public class XLocaleDistance {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pseudo regions should match no other regions.
|
||||
// {"*-*-XA", "*-*-*", "0"},
|
||||
// {"*-*-XB", "*-*-*", "0"},
|
||||
// {"*-*-XC", "*-*-*", "0"},
|
||||
// {"x1-*-*", "*-*-*", "0"},
|
||||
// {"x2-*-*", "*-*-*", "0"},
|
||||
// ...
|
||||
// {"x8-*-*", "*-*-*", "0"},
|
||||
List<String> supported = Arrays.asList("*", "*", "*");
|
||||
for (String x : Arrays.asList("XA", "XB", "XC")) {
|
||||
List<String> desired = Arrays.asList("*", "*", x);
|
||||
add(defaultDistanceTable, desired, supported, 100);
|
||||
add(defaultDistanceTable, supported, desired, 100);
|
||||
}
|
||||
// See XLikelySubtags.java for the mapping of pseudo-locales to x1 ... x8.
|
||||
for (int i = 1; i <= 8; ++i) {
|
||||
List<String> desired = Arrays.asList("x" + String.valueOf(i), "*", "*");
|
||||
add(defaultDistanceTable, desired, supported, 100);
|
||||
add(defaultDistanceTable, supported, desired, 100);
|
||||
}
|
||||
|
||||
if (PRINT_OVERRIDES) {
|
||||
System.out.println("\t\t</languageMatches>");
|
||||
}
|
||||
|
@ -27,6 +27,9 @@ public class XLocaleMatcher {
|
||||
private static final LSR UND = new LSR("und","","");
|
||||
private static final ULocale UND_LOCALE = new ULocale("und");
|
||||
|
||||
// Activates debugging output to stderr with details of GetBestMatch.
|
||||
private static final boolean TRACE_MATCHER = false;
|
||||
|
||||
// normally the default values, but can be set via constructor
|
||||
|
||||
private final XLocaleDistance localeDistance;
|
||||
@ -60,7 +63,9 @@ public class XLocaleMatcher {
|
||||
return this;
|
||||
}
|
||||
public Builder setSupportedLocales(Set<ULocale> languagePriorityList) {
|
||||
this.supportedLanguagesList = languagePriorityList;
|
||||
Set<ULocale> temp = new LinkedHashSet<ULocale>(); // maintain order
|
||||
temp.addAll(languagePriorityList);
|
||||
this.supportedLanguagesList = temp;
|
||||
return this;
|
||||
}
|
||||
|
||||
@ -114,6 +119,22 @@ public class XLocaleMatcher {
|
||||
public XLocaleMatcher build() {
|
||||
return new XLocaleMatcher(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder s = new StringBuilder().append("{XLocaleMatcher.Builder");
|
||||
if (!supportedLanguagesList.isEmpty()) {
|
||||
s.append(" supported={").append(supportedLanguagesList.toString()).append("}");
|
||||
}
|
||||
if (defaultLanguage != null) {
|
||||
s.append(" default=").append(defaultLanguage.toString());
|
||||
}
|
||||
if (thresholdDistance >= 0) {
|
||||
s.append(String.format(" thresholdDistance=%d", thresholdDistance));
|
||||
}
|
||||
s.append(" preference=").append(distanceOption.name());
|
||||
return s.append("}").toString();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -176,7 +197,8 @@ public class XLocaleMatcher {
|
||||
private Multimap<LSR,ULocale> extractLsrMap(Set<ULocale> languagePriorityList, Set<LSR> priorities) {
|
||||
Multimap<LSR, ULocale> builder = LinkedHashMultimap.create();
|
||||
for (ULocale item : languagePriorityList) {
|
||||
final LSR max = item.equals(UND_LOCALE) ? UND : LSR.fromMaximalized(item);
|
||||
final LSR max = item.equals(UND_LOCALE) ? UND :
|
||||
LSR.fromMaximalized(item);
|
||||
builder.put(max, item);
|
||||
}
|
||||
if (builder.size() > 1 && priorities != null) {
|
||||
@ -255,46 +277,65 @@ public class XLocaleMatcher {
|
||||
ULocale bestDesiredLocale = null;
|
||||
Collection<ULocale> bestSupportedLocales = null;
|
||||
int delta = 0;
|
||||
mainLoop:
|
||||
for (final Entry<LSR, ULocale> desiredLsrAndLocale : desiredLSRs.entries()) {
|
||||
// quick check for exact match
|
||||
ULocale desiredLocale = desiredLsrAndLocale.getValue();
|
||||
LSR desiredLSR = desiredLsrAndLocale.getKey();
|
||||
if (delta < bestDistance) {
|
||||
if (exactSupportedLocales.contains(desiredLocale)) {
|
||||
if (outputBestDesired != null) {
|
||||
outputBestDesired.value = desiredLocale;
|
||||
}
|
||||
return desiredLocale;
|
||||
}
|
||||
// quick check for maximized locale
|
||||
Collection<ULocale> found = supportedLanguages.get(desiredLSR);
|
||||
if (found != null) {
|
||||
// if we find one in the set, return first (lowest). We already know the exact one isn't there.
|
||||
if (outputBestDesired != null) {
|
||||
outputBestDesired.value = desiredLocale;
|
||||
}
|
||||
return found.iterator().next();
|
||||
}
|
||||
mainLoop:
|
||||
for (final Entry<LSR, Set<ULocale>> desiredLsrAndLocales : desiredLSRs.asMap().entrySet()) {
|
||||
LSR desiredLSR = desiredLsrAndLocales.getKey();
|
||||
for (ULocale desiredLocale : desiredLsrAndLocales.getValue()) {
|
||||
// quick check for exact match
|
||||
if (delta < bestDistance) {
|
||||
if (exactSupportedLocales.contains(desiredLocale)) {
|
||||
if (outputBestDesired != null) {
|
||||
outputBestDesired.value = desiredLocale;
|
||||
}
|
||||
for (final Entry<LSR, Set<ULocale>> supportedLsrAndLocale : supportedLanguages.entrySet()) {
|
||||
int distance = delta + localeDistance.distanceRaw(desiredLSR, supportedLsrAndLocale.getKey(),
|
||||
thresholdDistance, distanceOption);
|
||||
if (distance < bestDistance) {
|
||||
bestDistance = distance;
|
||||
bestDesiredLocale = desiredLocale;
|
||||
bestSupportedLocales = supportedLsrAndLocale.getValue();
|
||||
if (distance == 0) {
|
||||
break mainLoop;
|
||||
}
|
||||
}
|
||||
if (TRACE_MATCHER) {
|
||||
System.err.printf(
|
||||
"Returning %s, which is an exact match for a supported language\n",
|
||||
desiredLocale);
|
||||
}
|
||||
return desiredLocale;
|
||||
}
|
||||
// quick check for maximized locale
|
||||
Collection<ULocale> found = supportedLanguages.get(desiredLSR);
|
||||
if (found != null) {
|
||||
// if we find one in the set, return first (lowest). We already know the exact one isn't
|
||||
// there.
|
||||
if (outputBestDesired != null) {
|
||||
outputBestDesired.value = desiredLocale;
|
||||
}
|
||||
delta += demotionPerAdditionalDesiredLocale;
|
||||
ULocale result = found.iterator().next();
|
||||
if (TRACE_MATCHER) {
|
||||
System.err.printf("Returning %s\n", result.toString());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
for (final Entry<LSR, Set<ULocale>> supportedLsrAndLocale : supportedLanguages.entrySet()) {
|
||||
int distance =
|
||||
delta
|
||||
+ localeDistance.distanceRaw(
|
||||
desiredLSR,
|
||||
supportedLsrAndLocale.getKey(),
|
||||
thresholdDistance,
|
||||
distanceOption);
|
||||
if (distance < bestDistance) {
|
||||
bestDistance = distance;
|
||||
bestDesiredLocale = desiredLocale;
|
||||
bestSupportedLocales = supportedLsrAndLocale.getValue();
|
||||
if (distance == 0) {
|
||||
break mainLoop;
|
||||
}
|
||||
}
|
||||
}
|
||||
delta += demotionPerAdditionalDesiredLocale;
|
||||
}
|
||||
}
|
||||
if (bestDistance >= thresholdDistance) {
|
||||
if (outputBestDesired != null) {
|
||||
outputBestDesired.value = null;
|
||||
}
|
||||
if (TRACE_MATCHER) {
|
||||
System.err.printf("Returning default %s\n", defaultLanguage.toString());
|
||||
}
|
||||
return defaultLanguage;
|
||||
}
|
||||
if (outputBestDesired != null) {
|
||||
@ -302,10 +343,18 @@ public class XLocaleMatcher {
|
||||
}
|
||||
// pick exact match if there is one
|
||||
if (bestSupportedLocales.contains(bestDesiredLocale)) {
|
||||
if (TRACE_MATCHER) {
|
||||
System.err.printf(
|
||||
"Returning %s which matches a supported language\n", bestDesiredLocale.toString());
|
||||
}
|
||||
return bestDesiredLocale;
|
||||
}
|
||||
// otherwise return first supported, combining variants and extensions from bestDesired
|
||||
return bestSupportedLocales.iterator().next();
|
||||
ULocale result = bestSupportedLocales.iterator().next();
|
||||
if (TRACE_MATCHER) {
|
||||
System.err.printf("Returning first supported language %s\n", result.toString());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -327,17 +376,24 @@ public class XLocaleMatcher {
|
||||
if (outputBestDesired != null) {
|
||||
outputBestDesired.value = desiredLocale;
|
||||
}
|
||||
if (TRACE_MATCHER) {
|
||||
System.err.printf("Exact match with a supported locale.\n");
|
||||
}
|
||||
return desiredLocale;
|
||||
}
|
||||
// quick check for maximized locale
|
||||
if (distanceOption == DistanceOption.NORMAL) {
|
||||
if (distanceOption == DistanceOption.REGION_FIRST) {
|
||||
Collection<ULocale> found = supportedLanguages.get(desiredLSR);
|
||||
if (found != null) {
|
||||
// if we find one in the set, return first (lowest). We already know the exact one isn't there.
|
||||
if (outputBestDesired != null) {
|
||||
outputBestDesired.value = desiredLocale;
|
||||
}
|
||||
return found.iterator().next();
|
||||
ULocale result = found.iterator().next();
|
||||
if (TRACE_MATCHER) {
|
||||
System.err.printf("Matches a maximized supported locale: %s\n", result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
for (final Entry<LSR, Set<ULocale>> supportedLsrAndLocale : supportedLanguages.entrySet()) {
|
||||
@ -356,6 +412,11 @@ public class XLocaleMatcher {
|
||||
if (outputBestDesired != null) {
|
||||
outputBestDesired.value = null;
|
||||
}
|
||||
if (TRACE_MATCHER) {
|
||||
System.err.printf(
|
||||
"Returning default %s because everything exceeded the threshold of %d.\n",
|
||||
defaultLanguage, thresholdDistance);
|
||||
}
|
||||
return defaultLanguage;
|
||||
}
|
||||
if (outputBestDesired != null) {
|
||||
@ -366,7 +427,11 @@ public class XLocaleMatcher {
|
||||
return bestDesiredLocale;
|
||||
}
|
||||
// otherwise return first supported, combining variants and extensions from bestDesired
|
||||
return bestSupportedLocales.iterator().next();
|
||||
ULocale result = bestSupportedLocales.iterator().next();
|
||||
if (TRACE_MATCHER) {
|
||||
System.err.printf("First in the list of supported locales: %s\n", result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Combine features of the desired locale into those of the supported, and return result. */
|
||||
|
@ -99,8 +99,8 @@ public class XLocaleDistanceTest extends TestFmwk {
|
||||
newLikelyTime += System.nanoTime()-temp;
|
||||
|
||||
temp = System.nanoTime();
|
||||
int dist1 = localeMatcher.distanceRaw(desiredLSR, supportedLSR, 1000, DistanceOption.NORMAL);
|
||||
int dist2 = localeMatcher.distanceRaw(supportedLSR, desiredLSR, 1000, DistanceOption.NORMAL);
|
||||
int dist1 = localeMatcher.distanceRaw(desiredLSR, supportedLSR, 1000, DistanceOption.REGION_FIRST);
|
||||
int dist2 = localeMatcher.distanceRaw(supportedLSR, desiredLSR, 1000, DistanceOption.REGION_FIRST);
|
||||
newTimeMinusLikely += System.nanoTime()-temp;
|
||||
}
|
||||
}
|
||||
@ -178,7 +178,7 @@ public class XLocaleDistanceTest extends TestFmwk {
|
||||
class MyTestFileHandler extends DataDrivenTestHelper {
|
||||
final XLocaleDistance distance = XLocaleDistance.getDefault();
|
||||
Output<ULocale> bestDesired = new Output<ULocale>();
|
||||
private DistanceOption distanceOption = DistanceOption.NORMAL;
|
||||
private DistanceOption distanceOption = DistanceOption.REGION_FIRST;
|
||||
private Integer threshold = distance.getDefaultScriptDistance();
|
||||
|
||||
@Override
|
||||
|
@ -282,7 +282,7 @@ public class XLocaleMatcherTest extends TestFmwk {
|
||||
class MyTestFileHandler extends DataDrivenTestHelper {
|
||||
|
||||
Output<ULocale> bestDesired = new Output<ULocale>();
|
||||
DistanceOption distanceOption = DistanceOption.NORMAL;
|
||||
DistanceOption distanceOption = DistanceOption.REGION_FIRST;
|
||||
int threshold = -1;
|
||||
|
||||
@Override
|
||||
@ -305,8 +305,7 @@ public class XLocaleMatcherTest extends TestFmwk {
|
||||
if (breakpoint) {
|
||||
breakpoint = false; // put debugger breakpoint here to break at @debug in test file
|
||||
}
|
||||
|
||||
XLocaleMatcher matcher = threshold < 0 && distanceOption == DistanceOption.NORMAL
|
||||
XLocaleMatcher matcher = threshold < 0 && distanceOption == DistanceOption.REGION_FIRST
|
||||
? newXLocaleMatcher(supportedList)
|
||||
: newXLocaleMatcher(supportedList, threshold, distanceOption);
|
||||
commentBase = "(" + lineNumber + ") " + commentBase;
|
||||
|
@ -334,8 +334,8 @@ und, no ; nn-BE-fonipa ; no ; no-BE-fonipa
|
||||
und, en-GB-u-sd-gbsct ; en-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin ; en-GB-u-sd-gbsct ; en-GB-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin
|
||||
|
||||
en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr-PSCRACK ; fr-PSCRACK
|
||||
en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr ; fr-PSCRACK
|
||||
en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; de-CH ; de-PSCRACK
|
||||
en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr ; en-PSCRACK # was: fr-PSCRACK
|
||||
en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; de-CH ; en-PSCRACK # was: de-PSCRACK
|
||||
|
||||
##################################################
|
||||
# testClusters
|
||||
|
Loading…
Reference in New Issue
Block a user