ICU-20917 LocaleMatcher: prefer a more-default locale

This commit is contained in:
Markus Scherer 2019-12-21 06:48:17 -08:00
parent 79fac50101
commit 60b567d6ab
8 changed files with 306 additions and 31 deletions

View File

@ -69,7 +69,7 @@ void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) {
errorCode = U_MISSING_RESOURCE_ERROR;
return;
}
gLocaleDistance = new LocaleDistance(data);
gLocaleDistance = new LocaleDistance(data, likely);
if (gLocaleDistance == nullptr) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
@ -83,7 +83,8 @@ const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) {
return gLocaleDistance;
}
LocaleDistance::LocaleDistance(const LocaleDistanceData &data) :
LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely) :
likelySubtags(likely),
trie(data.distanceTrieBytes),
regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions),
paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength),
@ -122,6 +123,8 @@ int32_t LocaleDistance::getBestIndexAndDistance(
uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0;
// Index of the supported LSR with the lowest distance.
int32_t bestIndex = -1;
// Cached lookup info from XLikelySubtags.compareLikely().
int32_t bestLikelyInfo = -1;
for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) {
const LSR &supported = *supportedLSRs[slIndex];
bool star = false;
@ -207,13 +210,29 @@ int32_t LocaleDistance::getBestIndexAndDistance(
// Distinguish between equivalent but originally unequal locales via an
// additional micro distance.
shiftedDistance |= (desired.flags ^ supported.flags);
}
if (shiftedDistance < shiftedThreshold) {
if (shiftedDistance == 0) {
return slIndex << INDEX_SHIFT;
if (shiftedDistance < shiftedThreshold) {
if (shiftedDistance == 0) {
return slIndex << INDEX_SHIFT;
}
bestIndex = slIndex;
shiftedThreshold = shiftedDistance;
bestLikelyInfo = -1;
}
} else {
if (shiftedDistance < shiftedThreshold) {
bestIndex = slIndex;
shiftedThreshold = shiftedDistance;
bestLikelyInfo = -1;
} else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
bestLikelyInfo = likelySubtags.compareLikely(
supported, *supportedLSRs[bestIndex], bestLikelyInfo);
if ((bestLikelyInfo & 1) != 0) {
// This supported locale matches as well as the previous best match,
// and neither matches perfectly,
// but this one is "more likely" (has more-default subtags).
bestIndex = slIndex;
}
}
bestIndex = slIndex;
shiftedThreshold = shiftedDistance;
}
}
return bestIndex >= 0 ?

View File

@ -82,7 +82,7 @@ private:
return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT;
}
LocaleDistance(const LocaleDistanceData &data);
LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely);
LocaleDistance(const LocaleDistance &other) = delete;
LocaleDistance &operator=(const LocaleDistance &other) = delete;
@ -110,6 +110,8 @@ private:
return defaultRegionDistance;
}
const XLikelySubtags &likelySubtags;
// The trie maps each dlang+slang+dscript+sscript+dregion+sregion
// (encoded in ASCII with bit 7 set on the last character of each subtag) to a distance.
// There is also a trie value for each subsequence of whole subtags.

View File

@ -557,6 +557,106 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha
return LSR(language, script, region, retainOldMask);
}
/**
 * Tests whether lsr is "more likely" (has more-default subtags) than other.
 *
 * Only LSRs with the same language are compared:
 * - If the scripts differ, lsr is more likely if its script is the likely script
 *   for the language alone.
 * - Otherwise, if the regions differ, lsr is more likely if its region is the
 *   likely region for its language+script.
 *
 * @param lsr        the candidate LSR
 * @param other      the LSR to compare against (typically the best match so far)
 * @param likelyInfo cached lookup info returned by a previous call,
 *                   or a negative value if there is none
 * @return the updated likelyInfo, with bit 0 set if lsr is "more likely" than other;
 *         negative (cache invalid, bit 0 clear) if the languages differ
 */
int32_t XLikelySubtags::compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const {
    // Layout of a non-negative likelyInfo:
    //   bit 0:   result of the comparison (1 = lsr is more likely)
    //   bit 1:   1 if the cached index was looked up for language+script
    //            (languages and scripts were equal, regions were compared),
    //            0 if it was looked up for the language only (scripts differed)
    //   bits 2+: cached index into lsrs[]
    if (uprv_strcmp(lsr.language, other.language) != 0) {
        // Negative with bits 0 and 1 clear: no usable cache, lsr not better than other.
        // Same bit pattern as 0xfffffffc, without narrowing an unsigned literal to int32_t.
        return -4;
    }
    if (uprv_strcmp(lsr.script, other.script) != 0) {
        int32_t index;
        if (likelyInfo >= 0 && (likelyInfo & 2) == 0) {
            // Reuse the cached language-only lookup.
            index = likelyInfo >> 2;
        } else {
            index = getLikelyIndex(lsr.language, "");
            likelyInfo = index << 2;
        }
        const LSR &likely = lsrs[index];
        if (uprv_strcmp(lsr.script, likely.script) == 0) {
            return likelyInfo | 1;
        } else {
            return likelyInfo & ~1;
        }
    }
    if (uprv_strcmp(lsr.region, other.region) != 0) {
        int32_t index;
        if (likelyInfo >= 0 && (likelyInfo & 2) != 0) {
            // Reuse the cached language+script lookup.
            index = likelyInfo >> 2;
        } else {
            // Look up the likely region for language+script.
            // (The second argument of getLikelyIndex() is a script; passing the region
            // would never match and silently fall back to the language's default script.)
            index = getLikelyIndex(lsr.language, lsr.script);
            likelyInfo = (index << 2) | 2;
        }
        const LSR &likely = lsrs[index];
        if (uprv_strcmp(lsr.region, likely.region) == 0) {
            return likelyInfo | 1;
        } else {
            return likelyInfo & ~1;
        }
    }
    return likelyInfo & ~1;  // all subtags equal: lsr not better than other
}
// Subset of maximize().
// Walks the likely-subtags trie for the given language and script and returns
// the trie value found; compareLikely() uses that value as an index into lsrs[].
// Pass "" for script to look up by language alone.
// The return value is asserted to be positive and less than lsrsLength.
int32_t XLikelySubtags::getLikelyIndex(const char *language, const char *script) const {
// Treat the "unknown" subtags as empty strings for the lookup.
if (uprv_strcmp(language, "und") == 0) {
language = "";
}
if (uprv_strcmp(script, "Zzzz") == 0) {
script = "";
}
BytesTrie iter(trie);
uint64_t state;
int32_t value;
// Small optimization: Array lookup for first language letter.
int32_t c0;
if (0 <= (c0 = uprv_lowerOrdinal(language[0])) && c0 <= 25 &&
language[1] != 0 && // language.length() >= 2
(state = trieFirstLetterStates[c0]) != 0) {
// Resume from the precomputed state after the first letter; match from index 1.
value = trieNext(iter.resetToState64(state), language, 1);
} else {
value = trieNext(iter, language, 0);
}
// A negative value is handled as "language not found":
// continue from the "und" state, and state=0 records that the language was unknown.
if (value >= 0) {
state = iter.getState64();
} else {
iter.resetToState64(trieUndState); // "und" ("*")
state = 0;
}
if (value > 0) {
// Intermediate or final value from just language.
// For SKIP_SCRIPT, the script lookup is bypassed and value=0 forces the
// final ""-lookup below. (Presumably the trie has no script level for this
// language -- confirm against the trie builder.)
if (value == SKIP_SCRIPT) {
value = 0;
}
} else {
value = trieNext(iter, script, 0);
if (value >= 0) {
state = iter.getState64();
} else {
// Script not found under this language.
if (state == 0) {
iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**")
} else {
// Known language: go back to the state after the language and
// look up "" in place of the script (presumably the "*" entry).
iter.resetToState64(state);
value = trieNext(iter, "", 0);
U_ASSERT(value >= 0);
state = iter.getState64();
}
}
}
if (value > 0) {
// Final value from just language or language+script.
} else {
// No final value yet: one more ""-lookup (presumably the "*" region) must yield it.
value = trieNext(iter, "", 0);
U_ASSERT(value > 0);
}
U_ASSERT(value < lsrsLength);
return value;
}
int32_t XLikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) {
UStringTrieResult result;
uint8_t c;

View File

@ -85,6 +85,18 @@ public:
// VisibleForTesting
LSR makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const;
/**
* Tests whether lsr is "more likely" than other.
* For example, fr-Latn-FR is more likely than fr-Latn-CH because
* FR is the default region for fr-Latn.
*
* The likelyInfo caches lookup information between calls.
* The return value is an updated likelyInfo value,
* with bit 0 set if lsr is "more likely".
* The initial value of likelyInfo must be negative.
*/
int32_t compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const;
// TODO(ICU-20777): Switch Locale/uloc_ likely-subtags API from the old code
// in loclikely.cpp to this new code, including activating this
// minimizeSubtags() function. The LocaleMatcher does not minimize.
@ -111,6 +123,8 @@ private:
*/
LSR maximize(const char *language, const char *script, const char *region) const;
int32_t getLikelyIndex(const char *language, const char *script) const;
static int32_t trieNext(BytesTrie &iter, const char *s, int32_t i);
UResourceBundle *langInfoBundle;

View File

@ -733,7 +733,7 @@ ja >> fr
@favor=script
en-GB >> en-GB
en-US >> en
fr >> en-GB
fr >> en
ja >> fr
** test: testEmptyWithDefault
@ -761,8 +761,8 @@ en-GB >> en-GB
en-US >> en
fr-FR >> fr
ja-JP >> fr
zu >> en
# For a language that doesn't match anything, return the default.
zu >> en-GB
zxx >> fr
@favor=script
@ -770,7 +770,7 @@ en-GB >> en-GB
en-US >> en
fr-FR >> fr
ja-JP >> fr
zu >> en-GB
zu >> en
zxx >> en
** test: TestExactMatch
@ -1322,7 +1322,7 @@ en >> en-US
@favor=script
und >> und
ja >> und
fr-CA >> en-GB
fr-CA >> en-US
en-AU >> en-GB
en-BZ >> en-GB
en-CA >> en-GB
@ -1359,8 +1359,8 @@ fr >> und
@supported=en-GB, en-US, en, en-AU
und >> und
ja >> und
fr-CA >> en-GB
fr >> en-GB
fr-CA >> en-US
fr >> en-US
@supported=en-AU, ja, ca
fr >> en-AU
@supported=pl, ja, ca
@ -1901,7 +1901,7 @@ fr-FR >> fr # Parent match is chosen.
fr-FR >> fr-CA # Sibling match is chosen.
@supported=fr-CA, fr-FR
fr >> fr-FR # Inferred region match is chosen.
fr-SN >> fr-CA
fr-SN >> fr-FR
@supported=en, fr-FR
fr >> fr-FR # Child match is chosen.
@supported=de, en, it
@ -1931,7 +1931,7 @@ fr-FR >> fr
fr-FR >> fr-CA
@supported=fr-CA, fr-FR
fr >> fr-FR
fr-SN >> fr-CA
fr-SN >> fr-FR
@supported=en, fr-FR
fr >> fr-FR
@supported=de, en, it
@ -1951,3 +1951,10 @@ ru >> uk
zh-CN >> zh-TW
@supported=ja
ru >> und
** test: favor a more-default locale among equally imperfect matches
@supported=fr-CA, fr-CH, fr-FR, fr-GB
fr-SN >> fr-FR
@supported=sr-Latn, sr-Cyrl, sr-Grek
@threshold=60
sr-Thai >> sr-Cyrl

View File

@ -255,6 +255,8 @@ public class LocaleDistance {
long desLangState = desLangDistance >= 0 && supportedLSRs.length > 1 ? iter.getState64() : 0;
// Index of the supported LSR with the lowest distance.
int bestIndex = -1;
// Cached lookup info from XLikelySubtags.compareLikely().
int bestLikelyInfo = -1;
for (int slIndex = 0; slIndex < supportedLSRs.length; ++slIndex) {
LSR supported = supportedLSRs[slIndex];
boolean star = false;
@ -340,13 +342,29 @@ public class LocaleDistance {
// Distinguish between equivalent but originally unequal locales via an
// additional micro distance.
shiftedDistance |= (desired.flags ^ supported.flags);
}
if (shiftedDistance < shiftedThreshold) {
if (shiftedDistance == 0) {
return slIndex << INDEX_SHIFT;
if (shiftedDistance < shiftedThreshold) {
if (shiftedDistance == 0) {
return slIndex << INDEX_SHIFT;
}
bestIndex = slIndex;
shiftedThreshold = shiftedDistance;
bestLikelyInfo = -1;
}
} else {
if (shiftedDistance < shiftedThreshold) {
bestIndex = slIndex;
shiftedThreshold = shiftedDistance;
bestLikelyInfo = -1;
} else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
bestLikelyInfo = XLikelySubtags.INSTANCE.compareLikely(
supported, supportedLSRs[bestIndex], bestLikelyInfo);
if ((bestLikelyInfo & 1) != 0) {
// This supported locale matches as well as the previous best match,
// and neither matches perfectly,
// but this one is "more likely" (has more-default subtags).
bestIndex = slIndex;
}
}
bestIndex = slIndex;
shiftedThreshold = shiftedDistance;
}
}
return bestIndex >= 0 ?

View File

@ -367,6 +367,114 @@ public final class XLikelySubtags {
return new LSR(language, script, region, retainOldMask);
}
/**
 * Tests whether lsr is "more likely" than other.
 * For example, fr-Latn-FR is more likely than fr-Latn-CH because
 * FR is the default region for fr-Latn.
 *
 * <p>Only LSRs with the same language are compared.
 * If the scripts differ, lsr is more likely if its script is the likely script
 * for the language alone. Otherwise, if the regions differ, lsr is more likely
 * if its region is the likely region for its language+script.
 *
 * <p>The likelyInfo caches lookup information between calls.
 * The return value is an updated likelyInfo value,
 * with bit 0 set if lsr is "more likely".
 * The initial value of likelyInfo must be negative.
 *
 * @param lsr the candidate LSR
 * @param other the LSR to compare against (typically the best match so far)
 * @param likelyInfo value returned by a previous call, or a negative value
 * @return the updated likelyInfo; negative (with bit 0 clear) if the languages differ
 */
int compareLikely(LSR lsr, LSR other, int likelyInfo) {
    // Layout of a non-negative likelyInfo:
    //   bit 0:   result of the comparison (1 = lsr is more likely)
    //   bit 1:   1 if the cached index was looked up for language+script
    //            (languages and scripts were equal, regions were compared),
    //            0 if it was looked up for the language only (scripts differed)
    //   bits 2+: cached index into lsrs[]
    if (!lsr.language.equals(other.language)) {
        return 0xfffffffc;  // == -4: negative, no usable cache, lsr not better than other
    }
    if (!lsr.script.equals(other.script)) {
        int index;
        if (likelyInfo >= 0 && (likelyInfo & 2) == 0) {
            // Reuse the cached language-only lookup.
            index = likelyInfo >> 2;
        } else {
            index = getLikelyIndex(lsr.language, "");
            likelyInfo = index << 2;
        }
        LSR likely = lsrs[index];
        if (lsr.script.equals(likely.script)) {
            return likelyInfo | 1;
        } else {
            return likelyInfo & ~1;
        }
    }
    if (!lsr.region.equals(other.region)) {
        int index;
        if (likelyInfo >= 0 && (likelyInfo & 2) != 0) {
            // Reuse the cached language+script lookup.
            index = likelyInfo >> 2;
        } else {
            // Look up the likely region for language+script.
            // (The second argument of getLikelyIndex() is a script; passing the region
            // would never match and silently fall back to the language's default script.)
            index = getLikelyIndex(lsr.language, lsr.script);
            likelyInfo = (index << 2) | 2;
        }
        LSR likely = lsrs[index];
        if (lsr.region.equals(likely.region)) {
            return likelyInfo | 1;
        } else {
            return likelyInfo & ~1;
        }
    }
    return likelyInfo & ~1;  // all subtags equal: lsr not better than other
}
// Subset of maximize().
// Walks the likely-subtags trie for the given language and script and returns
// the trie value found; compareLikely() uses that value as an index into lsrs[].
// Pass "" for script to look up by language alone.
// The return value is asserted to be positive.
private int getLikelyIndex(String language, String script) {
// Treat the "unknown" subtags as empty strings for the lookup.
if (language.equals("und")) {
language = "";
}
if (script.equals("Zzzz")) {
script = "";
}
BytesTrie iter = new BytesTrie(trie);
long state;
int value;
// Small optimization: Array lookup for first language letter.
int c0;
if (language.length() >= 2 && 0 <= (c0 = language.charAt(0) - 'a') && c0 <= 25 &&
(state = trieFirstLetterStates[c0]) != 0) {
// Resume from the precomputed state after the first letter; match from index 1.
value = trieNext(iter.resetToState64(state), language, 1);
} else {
value = trieNext(iter, language, 0);
}
// A negative value is handled as "language not found":
// continue from the "und" state, and state=0 records that the language was unknown.
if (value >= 0) {
state = iter.getState64();
} else {
iter.resetToState64(trieUndState); // "und" ("*")
state = 0;
}
if (value > 0) {
// Intermediate or final value from just language.
// For SKIP_SCRIPT, the script lookup is bypassed and value=0 forces the
// final ""-lookup below. (Presumably the trie has no script level for this
// language -- confirm against the trie builder.)
if (value == SKIP_SCRIPT) {
value = 0;
}
} else {
value = trieNext(iter, script, 0);
if (value >= 0) {
state = iter.getState64();
} else {
// Script not found under this language.
if (state == 0) {
iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**")
} else {
// Known language: go back to the state after the language and
// look up "" in place of the script (presumably the "*" entry).
iter.resetToState64(state);
value = trieNext(iter, "", 0);
assert value >= 0;
state = iter.getState64();
}
}
}
if (value > 0) {
// Final value from just language or language+script.
} else {
// No final value yet: one more ""-lookup (presumably the "*" region) must yield it.
value = trieNext(iter, "", 0);
assert value > 0;
}
return value;
}
private static final int trieNext(BytesTrie iter, String s, int i) {
BytesTrie.Result result;
if (s.isEmpty()) {

View File

@ -733,7 +733,7 @@ ja >> fr
@favor=script
en-GB >> en-GB
en-US >> en
fr >> en-GB
fr >> en
ja >> fr
** test: testEmptyWithDefault
@ -761,8 +761,8 @@ en-GB >> en-GB
en-US >> en
fr-FR >> fr
ja-JP >> fr
zu >> en
# For a language that doesn't match anything, return the default.
zu >> en-GB
zxx >> fr
@favor=script
@ -770,7 +770,7 @@ en-GB >> en-GB
en-US >> en
fr-FR >> fr
ja-JP >> fr
zu >> en-GB
zu >> en
zxx >> en
** test: TestExactMatch
@ -1322,7 +1322,7 @@ en >> en-US
@favor=script
und >> und
ja >> und
fr-CA >> en-GB
fr-CA >> en-US
en-AU >> en-GB
en-BZ >> en-GB
en-CA >> en-GB
@ -1359,8 +1359,8 @@ fr >> und
@supported=en-GB, en-US, en, en-AU
und >> und
ja >> und
fr-CA >> en-GB
fr >> en-GB
fr-CA >> en-US
fr >> en-US
@supported=en-AU, ja, ca
fr >> en-AU
@supported=pl, ja, ca
@ -1901,7 +1901,7 @@ fr-FR >> fr # Parent match is chosen.
fr-FR >> fr-CA # Sibling match is chosen.
@supported=fr-CA, fr-FR
fr >> fr-FR # Inferred region match is chosen.
fr-SN >> fr-CA
fr-SN >> fr-FR
@supported=en, fr-FR
fr >> fr-FR # Child match is chosen.
@supported=de, en, it
@ -1931,7 +1931,7 @@ fr-FR >> fr
fr-FR >> fr-CA
@supported=fr-CA, fr-FR
fr >> fr-FR
fr-SN >> fr-CA
fr-SN >> fr-FR
@supported=en, fr-FR
fr >> fr-FR
@supported=de, en, it
@ -1951,3 +1951,10 @@ ru >> uk
zh-CN >> zh-TW
@supported=ja
ru >> und
** test: favor a more-default locale among equally imperfect matches
@supported=fr-CA, fr-CH, fr-FR, fr-GB
fr-SN >> fr-FR
@supported=sr-Latn, sr-Cyrl, sr-Grek
@threshold=60
sr-Thai >> sr-Cyrl