ICU-20092 Integrates new languagematcher enhancements into ICU4J.

This commit is contained in:
Norbert Runge 2018-08-23 13:31:30 -07:00 committed by Shane Carr
parent 03c6e86c6c
commit c854dd0d54
No known key found for this signature in database
GPG Key ID: FCED3B24AAB18B5C
6 changed files with 219 additions and 59 deletions

View File

@ -136,7 +136,9 @@ public class XLikelySubtags {
// //new UnicodeRegex().compileBnf(pat)
// );
//
// TODO: fix this to check for format. Not required, since this is only called internally, but safer for the future.
// NOTE: Should we fix this to check for format?
// ANSWER: Not required, since this is only called internally. Moreover, we deliberately
// use invalid language tags ("x1", "x2", etc.) to represent pseudo-locales. See below.
static LSR from(String languageIdentifier) {
String[] parts = languageIdentifier.split("[-_]");
if (parts.length < 1 || parts.length > 3) {
@ -147,19 +149,64 @@ public class XLikelySubtags {
String p3 = parts.length < 3 ? "" : parts[2];
return p2.length() < 4 ? new LSR(lang, "", p2) : new LSR(lang, p2, p3);
// Matcher matcher = LANGUAGE_PATTERN.matcher(languageIdentifier);
// if (!matcher.matches()) {
// return new LSR(matcher.group(1), matcher.group(2), matcher.group(3));
// }
// System.out.println(RegexUtilities.showMismatch(matcher, languageIdentifier));
// throw new ICUException("invalid language id");
// Matcher matcher = LANGUAGE_PATTERN.matcher(languageIdentifier);
// if (!matcher.matches()) {
// return new LSR(matcher.group(1), matcher.group(2), matcher.group(3));
// }
// System.out.println(RegexUtilities.showMismatch(matcher, languageIdentifier));
// throw new ICUException("invalid language id");
}
// Stand-in LSRs for pseudo-locales, keyed by the pseudo-locale they replace.
private static final HashMap<ULocale, LSR> pseudoReplacements = new HashMap<ULocale, LSR>(11);
// Note: code in XLocaleDistance.java handles the pseudo-regions XA, XB, and XC, making them
// very distant from any other locale. Similarly, it establishes that any of the
// invalid locales below ("x1", "x2", ..., "x7", and "x8-en") are very distant
// from any other locale.
static {
    // Each row is {pseudo-locale ID, LSR language, LSR script, LSR region}.
    final String[][] rows = {
        {"x-bork", "x1", "", ""},
        {"x-elmer", "x2", "", ""},
        {"x-hacker", "x3", "", ""},
        {"x-piglatin", "x4", "", ""},
        {"x-pirate", "x5", "", ""},
        {"en-XA", "x6", "", ""},
        {"en-PSACCENT", "x6", "", ""}, // Note: same as for en-XA
        {"ar-XB", "x7", "", ""},
        {"ar-PSBIDI", "x7", "", ""}, // Note: same as for ar-XB
        {"en-XC", "x8", "en", ""}, // Note: language is stored in LSR.script field
        {"en-PSCRACK", "x8", "en", ""}, // Note: same as for en-XC
    };
    for (final String[] row : rows) {
        final ULocale pseudoLocale = new ULocale(row[0]);
        final LSR standIn = new LSR(row[1], row[2], row[3]);
        pseudoReplacements.put(pseudoLocale, standIn);
    }
}
/**
 * Builds an LSR directly from the language, script, and region subtags of the given locale.
 * Pseudo-locales are not decomposed; they are replaced by their stand-in LSR
 * (see {@code pseudoLsrOrNull}).
 *
 * @param locale the locale to convert; must not be null
 * @return the stand-in LSR for a pseudo-locale, otherwise an LSR of the locale's own subtags
 */
public static LSR from(ULocale locale) {
    LSR pseudo = pseudoLsrOrNull(locale);
    if (pseudo != null) {
        return pseudo;
    }
    return new LSR(locale.getLanguage(), locale.getScript(), locale.getCountry());
}

/**
 * Same as {@link #from(ULocale)} for pseudo-locales; for every other locale the subtags are
 * handed to {@code fromMaximalized(String, String, String)}.
 *
 * @param locale the locale to convert; must not be null
 * @return the stand-in LSR for a pseudo-locale, otherwise the result of the three-string overload
 */
public static LSR fromMaximalized(ULocale locale) {
    LSR pseudo = pseudoLsrOrNull(locale);
    if (pseudo != null) {
        return pseudo;
    }
    return fromMaximalized(locale.getLanguage(), locale.getScript(), locale.getCountry());
}

/**
 * Returns the stand-in LSR for a pseudo-locale, or null if the locale is not a pseudo-locale.
 * Shared by from(ULocale) and fromMaximalized(ULocale) so both apply identical rules.
 */
private static LSR pseudoLsrOrNull(ULocale locale) {
    // Exact matches from the replacement table take precedence.
    LSR replacement = pseudoReplacements.get(locale);
    if (replacement != null) {
        return replacement;
    }
    // Map *-*-*-PSCRACK to x8-***, same as for en-PSCRACK: the original language, script,
    // and region are concatenated and stored in the LSR script field.
    if ("PSCRACK".equals(locale.getVariant())) {
        return new LSR(
                "x8", locale.getLanguage() + locale.getScript() + locale.getCountry(), "");
    }
    return null;
}

View File

@ -46,6 +46,10 @@ public class XLocaleDistance {
public static final int ABOVE_THRESHOLD = 100;
// Activates debugging output to stderr with details of GetBestMatch.
// Be sure to set this to false before checking this in for production!
private static final boolean TRACE_DISTANCE = false;
@Deprecated
public static final String ANY = "\uFFFD"; // matches any character. Uses value above any subtag.
@ -441,6 +445,10 @@ public class XLocaleDistance {
@Override
public int getDistance(String desired, String supported, Output<DistanceTable> distanceTable, boolean starEquals) {
if (TRACE_DISTANCE) {
System.err.printf(" Entering getDistance: desired=%s supported=%s starEquals=%s\n",
desired, supported, Boolean.toString(starEquals));
}
boolean star = false;
Map<String, DistanceNode> sub2 = subtables.get(desired);
if (sub2 == null) {
@ -462,7 +470,11 @@ public class XLocaleDistance {
if (distanceTable != null) {
distanceTable.value = ((StringDistanceNode) value).distanceTable;
}
return starEquals && star && desired.equals(supported) ? 0 : value.distance;
int result = starEquals && star && desired.equals(supported) ? 0 : value.distance;
if (TRACE_DISTANCE) {
System.err.printf(" Returning from getDistance: %d\n", result);
}
return result;
}
public void copy(StringDistanceTable other) {
@ -619,6 +631,7 @@ public class XLocaleDistance {
buffer.append('\t').append('#').append(id).append('\n');
} else {
((StringDistanceTable)distanceTable).toString(abbreviate, indent+"\t\t\t", intern, buffer);
buffer.append('\n');
}
} else {
buffer.append('\n');
@ -726,17 +739,31 @@ public class XLocaleDistance {
* ULocales must be in canonical, addLikelySubtags format. Returns distance
*/
public int distanceRaw(LSR desired, LSR supported, int threshold, DistanceOption distanceOption) {
return distanceRaw(desired.language, supported.language,
if (TRACE_DISTANCE) {
System.err.printf(" Entering distanceRaw: desired=%s supported=%s "
+ "threshold=%d preferred=%s\n",
desired, supported, threshold,
distanceOption.name());
}
int result = distanceRaw(desired.language, supported.language,
desired.script, supported.script,
desired.region, supported.region,
threshold, distanceOption);
if (TRACE_DISTANCE) {
System.err.printf(" Returning from distanceRaw: %d\n", result);
}
return result;
}
public enum DistanceOption {NORMAL, SCRIPT_FIRST}
public enum DistanceOption {REGION_FIRST, SCRIPT_FIRST}
// NOTE: Replaced "NORMAL" with "REGION_FIRST". By default, scripts have greater weight
// than regions, so they might be considered the "normal" case.
/**
* Returns distance, from 0 to ABOVE_THRESHOLD.
* ULocales must be in canonical, addLikelySubtags format. Returns distance
* ULocales must be in canonical, addLikelySubtags format.
* (Exception: internal calls may pass any strings. They do this for pseudo-locales.)
* Returns distance.
*/
public int distanceRaw(
String desiredLang, String supportedLang,
@ -942,6 +969,28 @@ public class XLocaleDistance {
}
}
}
// Pseudo regions should match no other regions.
// {"*-*-XA", "*-*-*", "0"},
// {"*-*-XB", "*-*-*", "0"},
// {"*-*-XC", "*-*-*", "0"},
// {"x1-*-*", "*-*-*", "0"},
// {"x2-*-*", "*-*-*", "0"},
// ...
// {"x8-*-*", "*-*-*", "0"},
List<String> supported = Arrays.asList("*", "*", "*");
for (String x : Arrays.asList("XA", "XB", "XC")) {
List<String> desired = Arrays.asList("*", "*", x);
add(defaultDistanceTable, desired, supported, 100);
add(defaultDistanceTable, supported, desired, 100);
}
// See XLikelySubtags.java for the mapping of pseudo-locales to x1 ... x8.
for (int i = 1; i <= 8; ++i) {
List<String> desired = Arrays.asList("x" + String.valueOf(i), "*", "*");
add(defaultDistanceTable, desired, supported, 100);
add(defaultDistanceTable, supported, desired, 100);
}
if (PRINT_OVERRIDES) {
System.out.println("\t\t</languageMatches>");
}

View File

@ -27,6 +27,9 @@ public class XLocaleMatcher {
private static final LSR UND = new LSR("und","","");
private static final ULocale UND_LOCALE = new ULocale("und");
// Activates debugging output to stderr with details of GetBestMatch.
private static final boolean TRACE_MATCHER = false;
// normally the default values, but can be set via constructor
private final XLocaleDistance localeDistance;
@ -60,7 +63,9 @@ public class XLocaleMatcher {
return this;
}
public Builder setSupportedLocales(Set<ULocale> languagePriorityList) {
this.supportedLanguagesList = languagePriorityList;
Set<ULocale> temp = new LinkedHashSet<ULocale>(); // maintain order
temp.addAll(languagePriorityList);
this.supportedLanguagesList = temp;
return this;
}
@ -114,6 +119,22 @@ public class XLocaleMatcher {
public XLocaleMatcher build() {
return new XLocaleMatcher(this);
}
/**
 * Returns a debugging summary of this builder's settings.
 * Settings that are unset (null, empty, or a negative threshold) are omitted.
 */
@Override
public String toString() {
    StringBuilder s = new StringBuilder().append("{XLocaleMatcher.Builder");
    // The supported list may not have been set yet; guard against null the same way
    // defaultLanguage is guarded, so that toString() never throws.
    if (supportedLanguagesList != null && !supportedLanguagesList.isEmpty()) {
        s.append(" supported={").append(supportedLanguagesList.toString()).append("}");
    }
    if (defaultLanguage != null) {
        s.append(" default=").append(defaultLanguage.toString());
    }
    if (thresholdDistance >= 0) {
        // Appended directly instead of via String.format("%d"), which would render the
        // number with the default locale's digits.
        s.append(" thresholdDistance=").append(thresholdDistance);
    }
    // The preference may also be unset; omit it rather than dereferencing null.
    if (distanceOption != null) {
        s.append(" preference=").append(distanceOption.name());
    }
    return s.append("}").toString();
}
}
/**
@ -176,7 +197,8 @@ public class XLocaleMatcher {
private Multimap<LSR,ULocale> extractLsrMap(Set<ULocale> languagePriorityList, Set<LSR> priorities) {
Multimap<LSR, ULocale> builder = LinkedHashMultimap.create();
for (ULocale item : languagePriorityList) {
final LSR max = item.equals(UND_LOCALE) ? UND : LSR.fromMaximalized(item);
final LSR max = item.equals(UND_LOCALE) ? UND :
LSR.fromMaximalized(item);
builder.put(max, item);
}
if (builder.size() > 1 && priorities != null) {
@ -255,46 +277,65 @@ public class XLocaleMatcher {
ULocale bestDesiredLocale = null;
Collection<ULocale> bestSupportedLocales = null;
int delta = 0;
mainLoop:
for (final Entry<LSR, ULocale> desiredLsrAndLocale : desiredLSRs.entries()) {
// quick check for exact match
ULocale desiredLocale = desiredLsrAndLocale.getValue();
LSR desiredLSR = desiredLsrAndLocale.getKey();
if (delta < bestDistance) {
if (exactSupportedLocales.contains(desiredLocale)) {
if (outputBestDesired != null) {
outputBestDesired.value = desiredLocale;
}
return desiredLocale;
}
// quick check for maximized locale
Collection<ULocale> found = supportedLanguages.get(desiredLSR);
if (found != null) {
// if we find one in the set, return first (lowest). We already know the exact one isn't there.
if (outputBestDesired != null) {
outputBestDesired.value = desiredLocale;
}
return found.iterator().next();
}
mainLoop:
for (final Entry<LSR, Set<ULocale>> desiredLsrAndLocales : desiredLSRs.asMap().entrySet()) {
LSR desiredLSR = desiredLsrAndLocales.getKey();
for (ULocale desiredLocale : desiredLsrAndLocales.getValue()) {
// quick check for exact match
if (delta < bestDistance) {
if (exactSupportedLocales.contains(desiredLocale)) {
if (outputBestDesired != null) {
outputBestDesired.value = desiredLocale;
}
for (final Entry<LSR, Set<ULocale>> supportedLsrAndLocale : supportedLanguages.entrySet()) {
int distance = delta + localeDistance.distanceRaw(desiredLSR, supportedLsrAndLocale.getKey(),
thresholdDistance, distanceOption);
if (distance < bestDistance) {
bestDistance = distance;
bestDesiredLocale = desiredLocale;
bestSupportedLocales = supportedLsrAndLocale.getValue();
if (distance == 0) {
break mainLoop;
}
}
if (TRACE_MATCHER) {
System.err.printf(
"Returning %s, which is an exact match for a supported language\n",
desiredLocale);
}
return desiredLocale;
}
// quick check for maximized locale
Collection<ULocale> found = supportedLanguages.get(desiredLSR);
if (found != null) {
// if we find one in the set, return first (lowest). We already know the exact one isn't
// there.
if (outputBestDesired != null) {
outputBestDesired.value = desiredLocale;
}
delta += demotionPerAdditionalDesiredLocale;
ULocale result = found.iterator().next();
if (TRACE_MATCHER) {
System.err.printf("Returning %s\n", result.toString());
}
return result;
}
}
for (final Entry<LSR, Set<ULocale>> supportedLsrAndLocale : supportedLanguages.entrySet()) {
int distance =
delta
+ localeDistance.distanceRaw(
desiredLSR,
supportedLsrAndLocale.getKey(),
thresholdDistance,
distanceOption);
if (distance < bestDistance) {
bestDistance = distance;
bestDesiredLocale = desiredLocale;
bestSupportedLocales = supportedLsrAndLocale.getValue();
if (distance == 0) {
break mainLoop;
}
}
}
delta += demotionPerAdditionalDesiredLocale;
}
}
if (bestDistance >= thresholdDistance) {
if (outputBestDesired != null) {
outputBestDesired.value = null;
}
if (TRACE_MATCHER) {
System.err.printf("Returning default %s\n", defaultLanguage.toString());
}
return defaultLanguage;
}
if (outputBestDesired != null) {
@ -302,10 +343,18 @@ public class XLocaleMatcher {
}
// pick exact match if there is one
if (bestSupportedLocales.contains(bestDesiredLocale)) {
if (TRACE_MATCHER) {
System.err.printf(
"Returning %s which matches a supported language\n", bestDesiredLocale.toString());
}
return bestDesiredLocale;
}
// otherwise return first supported, combining variants and extensions from bestDesired
return bestSupportedLocales.iterator().next();
ULocale result = bestSupportedLocales.iterator().next();
if (TRACE_MATCHER) {
System.err.printf("Returning first supported language %s\n", result.toString());
}
return result;
}
/**
@ -327,17 +376,24 @@ public class XLocaleMatcher {
if (outputBestDesired != null) {
outputBestDesired.value = desiredLocale;
}
if (TRACE_MATCHER) {
System.err.printf("Exact match with a supported locale.\n");
}
return desiredLocale;
}
// quick check for maximized locale
if (distanceOption == DistanceOption.NORMAL) {
if (distanceOption == DistanceOption.REGION_FIRST) {
Collection<ULocale> found = supportedLanguages.get(desiredLSR);
if (found != null) {
// if we find one in the set, return first (lowest). We already know the exact one isn't there.
if (outputBestDesired != null) {
outputBestDesired.value = desiredLocale;
}
return found.iterator().next();
ULocale result = found.iterator().next();
if (TRACE_MATCHER) {
System.err.printf("Matches a maximized supported locale: %s\n", result);
}
return result;
}
}
for (final Entry<LSR, Set<ULocale>> supportedLsrAndLocale : supportedLanguages.entrySet()) {
@ -356,6 +412,11 @@ public class XLocaleMatcher {
if (outputBestDesired != null) {
outputBestDesired.value = null;
}
if (TRACE_MATCHER) {
System.err.printf(
"Returning default %s because everything exceeded the threshold of %d.\n",
defaultLanguage, thresholdDistance);
}
return defaultLanguage;
}
if (outputBestDesired != null) {
@ -366,7 +427,11 @@ public class XLocaleMatcher {
return bestDesiredLocale;
}
// otherwise return first supported, combining variants and extensions from bestDesired
return bestSupportedLocales.iterator().next();
ULocale result = bestSupportedLocales.iterator().next();
if (TRACE_MATCHER) {
System.err.printf("First in the list of supported locales: %s\n", result);
}
return result;
}
/** Combine features of the desired locale into those of the supported, and return result. */

View File

@ -99,8 +99,8 @@ public class XLocaleDistanceTest extends TestFmwk {
newLikelyTime += System.nanoTime()-temp;
temp = System.nanoTime();
int dist1 = localeMatcher.distanceRaw(desiredLSR, supportedLSR, 1000, DistanceOption.NORMAL);
int dist2 = localeMatcher.distanceRaw(supportedLSR, desiredLSR, 1000, DistanceOption.NORMAL);
int dist1 = localeMatcher.distanceRaw(desiredLSR, supportedLSR, 1000, DistanceOption.REGION_FIRST);
int dist2 = localeMatcher.distanceRaw(supportedLSR, desiredLSR, 1000, DistanceOption.REGION_FIRST);
newTimeMinusLikely += System.nanoTime()-temp;
}
}
@ -178,7 +178,7 @@ public class XLocaleDistanceTest extends TestFmwk {
class MyTestFileHandler extends DataDrivenTestHelper {
final XLocaleDistance distance = XLocaleDistance.getDefault();
Output<ULocale> bestDesired = new Output<ULocale>();
private DistanceOption distanceOption = DistanceOption.NORMAL;
private DistanceOption distanceOption = DistanceOption.REGION_FIRST;
private Integer threshold = distance.getDefaultScriptDistance();
@Override

View File

@ -282,7 +282,7 @@ public class XLocaleMatcherTest extends TestFmwk {
class MyTestFileHandler extends DataDrivenTestHelper {
Output<ULocale> bestDesired = new Output<ULocale>();
DistanceOption distanceOption = DistanceOption.NORMAL;
DistanceOption distanceOption = DistanceOption.REGION_FIRST;
int threshold = -1;
@Override
@ -305,8 +305,7 @@ public class XLocaleMatcherTest extends TestFmwk {
if (breakpoint) {
breakpoint = false; // put debugger breakpoint here to break at @debug in test file
}
XLocaleMatcher matcher = threshold < 0 && distanceOption == DistanceOption.NORMAL
XLocaleMatcher matcher = threshold < 0 && distanceOption == DistanceOption.REGION_FIRST
? newXLocaleMatcher(supportedList)
: newXLocaleMatcher(supportedList, threshold, distanceOption);
commentBase = "(" + lineNumber + ") " + commentBase;

View File

@ -334,8 +334,8 @@ und, no ; nn-BE-fonipa ; no ; no-BE-fonipa
und, en-GB-u-sd-gbsct ; en-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin ; en-GB-u-sd-gbsct ; en-GB-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin
en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr-PSCRACK ; fr-PSCRACK
en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr ; fr-PSCRACK
en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; de-CH ; de-PSCRACK
en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr ; en-PSCRACK # was: fr-PSCRACK
en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; de-CH ; en-PSCRACK # was: de-PSCRACK
##################################################
# testClusters
@ -384,4 +384,4 @@ und, en-GU, en-GB, en-IN ; en-VI ; en-GU
ru, fr ; zh, pl ; fr
ru, fr ; zh-Cyrl, pl ; ru
#hr, en-Cyrl; sr ; en-Cyrl
da, ru, hr; sr ; ru
da, ru, hr; sr ; ru