ICU-20917 LocaleMatcher: prefer a more-default locale
This commit is contained in:
parent
79fac50101
commit
60b567d6ab
@ -69,7 +69,7 @@ void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) {
|
||||
errorCode = U_MISSING_RESOURCE_ERROR;
|
||||
return;
|
||||
}
|
||||
gLocaleDistance = new LocaleDistance(data);
|
||||
gLocaleDistance = new LocaleDistance(data, likely);
|
||||
if (gLocaleDistance == nullptr) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
@ -83,7 +83,8 @@ const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) {
|
||||
return gLocaleDistance;
|
||||
}
|
||||
|
||||
LocaleDistance::LocaleDistance(const LocaleDistanceData &data) :
|
||||
LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely) :
|
||||
likelySubtags(likely),
|
||||
trie(data.distanceTrieBytes),
|
||||
regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions),
|
||||
paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength),
|
||||
@ -122,6 +123,8 @@ int32_t LocaleDistance::getBestIndexAndDistance(
|
||||
uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0;
|
||||
// Index of the supported LSR with the lowest distance.
|
||||
int32_t bestIndex = -1;
|
||||
// Cached lookup info from XLikelySubtags.compareLikely().
|
||||
int32_t bestLikelyInfo = -1;
|
||||
for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) {
|
||||
const LSR &supported = *supportedLSRs[slIndex];
|
||||
bool star = false;
|
||||
@ -207,13 +210,29 @@ int32_t LocaleDistance::getBestIndexAndDistance(
|
||||
// Distinguish between equivalent but originally unequal locales via an
|
||||
// additional micro distance.
|
||||
shiftedDistance |= (desired.flags ^ supported.flags);
|
||||
}
|
||||
if (shiftedDistance < shiftedThreshold) {
|
||||
if (shiftedDistance == 0) {
|
||||
return slIndex << INDEX_SHIFT;
|
||||
if (shiftedDistance < shiftedThreshold) {
|
||||
if (shiftedDistance == 0) {
|
||||
return slIndex << INDEX_SHIFT;
|
||||
}
|
||||
bestIndex = slIndex;
|
||||
shiftedThreshold = shiftedDistance;
|
||||
bestLikelyInfo = -1;
|
||||
}
|
||||
} else {
|
||||
if (shiftedDistance < shiftedThreshold) {
|
||||
bestIndex = slIndex;
|
||||
shiftedThreshold = shiftedDistance;
|
||||
bestLikelyInfo = -1;
|
||||
} else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
|
||||
bestLikelyInfo = likelySubtags.compareLikely(
|
||||
supported, *supportedLSRs[bestIndex], bestLikelyInfo);
|
||||
if ((bestLikelyInfo & 1) != 0) {
|
||||
// This supported locale matches as well as the previous best match,
|
||||
// and neither matches perfectly,
|
||||
// but this one is "more likely" (has more-default subtags).
|
||||
bestIndex = slIndex;
|
||||
}
|
||||
}
|
||||
bestIndex = slIndex;
|
||||
shiftedThreshold = shiftedDistance;
|
||||
}
|
||||
}
|
||||
return bestIndex >= 0 ?
|
||||
|
@ -82,7 +82,7 @@ private:
|
||||
return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT;
|
||||
}
|
||||
|
||||
LocaleDistance(const LocaleDistanceData &data);
|
||||
LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely);
|
||||
LocaleDistance(const LocaleDistance &other) = delete;
|
||||
LocaleDistance &operator=(const LocaleDistance &other) = delete;
|
||||
|
||||
@ -110,6 +110,8 @@ private:
|
||||
return defaultRegionDistance;
|
||||
}
|
||||
|
||||
const XLikelySubtags &likelySubtags;
|
||||
|
||||
// The trie maps each dlang+slang+dscript+sscript+dregion+sregion
|
||||
// (encoded in ASCII with bit 7 set on the last character of each subtag) to a distance.
|
||||
// There is also a trie value for each subsequence of whole subtags.
|
||||
|
@ -557,6 +557,106 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha
|
||||
return LSR(language, script, region, retainOldMask);
|
||||
}
|
||||
|
||||
/**
 * Tests whether lsr is "more likely" (has more-default subtags) than other.
 *
 * Only LSRs with the same language are compared. If the scripts differ,
 * lsr is "more likely" when its script is the likely script for the language.
 * If the scripts are equal but the regions differ, lsr is "more likely" when
 * its region is the likely region for its language and script.
 *
 * @param lsr        the candidate
 * @param other      the current best match that the candidate is compared against
 * @param likelyInfo cached lookup information from a previous call; pass a negative
 *                   value when there is no cached information
 * @return the updated likelyInfo, with bit 0 set if lsr is "more likely" than other
 */
int32_t XLikelySubtags::compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const {
    // Layout of a non-negative likelyInfo:
    //   bit 0:      result bit, set if lsr is "more likely"
    //   bit 1:      set if the cached index was looked up for a region comparison
    //               (equal language and script); clear if the scripts differed
    //   bits 2..31: cached index into lsrs[]
    if (uprv_strcmp(lsr.language, other.language) != 0) {
        // Same bit pattern as 0xfffffffc: negative (no cached index), bit 0 clear,
        // that is, lsr is not better than other.
        return -4;
    }
    if (uprv_strcmp(lsr.script, other.script) != 0) {
        int32_t index;
        if (likelyInfo >= 0 && (likelyInfo & 2) == 0) {
            // Reuse the index cached by a previous script comparison.
            index = likelyInfo >> 2;
        } else {
            // Look up the likely subtags for just the language.
            index = getLikelyIndex(lsr.language, "");
            likelyInfo = index << 2;
        }
        const LSR &likely = lsrs[index];
        if (uprv_strcmp(lsr.script, likely.script) == 0) {
            return likelyInfo | 1;
        } else {
            return likelyInfo & ~1;
        }
    }
    if (uprv_strcmp(lsr.region, other.region) != 0) {
        int32_t index;
        if (likelyInfo >= 0 && (likelyInfo & 2) != 0) {
            // Reuse the index cached by a previous region comparison.
            index = likelyInfo >> 2;
        } else {
            // Look up the likely subtags for language+script.
            // The second argument of getLikelyIndex() is a script, not a region:
            // passing the region would never match a script in the trie and would
            // yield the likely region of the language's default script instead.
            index = getLikelyIndex(lsr.language, lsr.script);
            likelyInfo = (index << 2) | 2;
        }
        const LSR &likely = lsrs[index];
        if (uprv_strcmp(lsr.region, likely.region) == 0) {
            return likelyInfo | 1;
        } else {
            return likelyInfo & ~1;
        }
    }
    return likelyInfo & ~1;  // identical subtags: lsr not better than other
}
|
||||
|
||||
// Subset of maximize().
|
||||
int32_t XLikelySubtags::getLikelyIndex(const char *language, const char *script) const {
|
||||
if (uprv_strcmp(language, "und") == 0) {
|
||||
language = "";
|
||||
}
|
||||
if (uprv_strcmp(script, "Zzzz") == 0) {
|
||||
script = "";
|
||||
}
|
||||
|
||||
BytesTrie iter(trie);
|
||||
uint64_t state;
|
||||
int32_t value;
|
||||
// Small optimization: Array lookup for first language letter.
|
||||
int32_t c0;
|
||||
if (0 <= (c0 = uprv_lowerOrdinal(language[0])) && c0 <= 25 &&
|
||||
language[1] != 0 && // language.length() >= 2
|
||||
(state = trieFirstLetterStates[c0]) != 0) {
|
||||
value = trieNext(iter.resetToState64(state), language, 1);
|
||||
} else {
|
||||
value = trieNext(iter, language, 0);
|
||||
}
|
||||
if (value >= 0) {
|
||||
state = iter.getState64();
|
||||
} else {
|
||||
iter.resetToState64(trieUndState); // "und" ("*")
|
||||
state = 0;
|
||||
}
|
||||
|
||||
if (value > 0) {
|
||||
// Intermediate or final value from just language.
|
||||
if (value == SKIP_SCRIPT) {
|
||||
value = 0;
|
||||
}
|
||||
} else {
|
||||
value = trieNext(iter, script, 0);
|
||||
if (value >= 0) {
|
||||
state = iter.getState64();
|
||||
} else {
|
||||
if (state == 0) {
|
||||
iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**")
|
||||
} else {
|
||||
iter.resetToState64(state);
|
||||
value = trieNext(iter, "", 0);
|
||||
U_ASSERT(value >= 0);
|
||||
state = iter.getState64();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (value > 0) {
|
||||
// Final value from just language or language+script.
|
||||
} else {
|
||||
value = trieNext(iter, "", 0);
|
||||
U_ASSERT(value > 0);
|
||||
}
|
||||
U_ASSERT(value < lsrsLength);
|
||||
return value;
|
||||
}
|
||||
|
||||
int32_t XLikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) {
|
||||
UStringTrieResult result;
|
||||
uint8_t c;
|
||||
|
@ -85,6 +85,18 @@ public:
|
||||
// VisibleForTesting
|
||||
LSR makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const;
|
||||
|
||||
/**
|
||||
* Tests whether lsr is "more likely" than other.
|
||||
* For example, fr-Latn-FR is more likely than fr-Latn-CH because
|
||||
* FR is the default region for fr-Latn.
|
||||
*
|
||||
* The likelyInfo caches lookup information between calls.
|
||||
* The return value is an updated likelyInfo value,
|
||||
* with bit 0 set if lsr is "more likely".
|
||||
* The initial value of likelyInfo must be negative.
|
||||
*/
|
||||
int32_t compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const;
|
||||
|
||||
// TODO(ICU-20777): Switch Locale/uloc_ likely-subtags API from the old code
|
||||
// in loclikely.cpp to this new code, including activating this
|
||||
// minimizeSubtags() function. The LocaleMatcher does not minimize.
|
||||
@ -111,6 +123,8 @@ private:
|
||||
*/
|
||||
LSR maximize(const char *language, const char *script, const char *region) const;
|
||||
|
||||
int32_t getLikelyIndex(const char *language, const char *script) const;
|
||||
|
||||
static int32_t trieNext(BytesTrie &iter, const char *s, int32_t i);
|
||||
|
||||
UResourceBundle *langInfoBundle;
|
||||
|
23
icu4c/source/test/testdata/localeMatcherTest.txt
vendored
23
icu4c/source/test/testdata/localeMatcherTest.txt
vendored
@ -733,7 +733,7 @@ ja >> fr
|
||||
@favor=script
|
||||
en-GB >> en-GB
|
||||
en-US >> en
|
||||
fr >> en-GB
|
||||
fr >> en
|
||||
ja >> fr
|
||||
|
||||
** test: testEmptyWithDefault
|
||||
@ -761,8 +761,8 @@ en-GB >> en-GB
|
||||
en-US >> en
|
||||
fr-FR >> fr
|
||||
ja-JP >> fr
|
||||
zu >> en
|
||||
# For a language that doesn't match anything, return the default.
|
||||
zu >> en-GB
|
||||
zxx >> fr
|
||||
|
||||
@favor=script
|
||||
@ -770,7 +770,7 @@ en-GB >> en-GB
|
||||
en-US >> en
|
||||
fr-FR >> fr
|
||||
ja-JP >> fr
|
||||
zu >> en-GB
|
||||
zu >> en
|
||||
zxx >> en
|
||||
|
||||
** test: TestExactMatch
|
||||
@ -1322,7 +1322,7 @@ en >> en-US
|
||||
@favor=script
|
||||
und >> und
|
||||
ja >> und
|
||||
fr-CA >> en-GB
|
||||
fr-CA >> en-US
|
||||
en-AU >> en-GB
|
||||
en-BZ >> en-GB
|
||||
en-CA >> en-GB
|
||||
@ -1359,8 +1359,8 @@ fr >> und
|
||||
@supported=en-GB, en-US, en, en-AU
|
||||
und >> und
|
||||
ja >> und
|
||||
fr-CA >> en-GB
|
||||
fr >> en-GB
|
||||
fr-CA >> en-US
|
||||
fr >> en-US
|
||||
@supported=en-AU, ja, ca
|
||||
fr >> en-AU
|
||||
@supported=pl, ja, ca
|
||||
@ -1901,7 +1901,7 @@ fr-FR >> fr # Parent match is chosen.
|
||||
fr-FR >> fr-CA # Sibling match is chosen.
|
||||
@supported=fr-CA, fr-FR
|
||||
fr >> fr-FR # Inferred region match is chosen.
|
||||
fr-SN >> fr-CA
|
||||
fr-SN >> fr-FR
|
||||
@supported=en, fr-FR
|
||||
fr >> fr-FR # Child match is chosen.
|
||||
@supported=de, en, it
|
||||
@ -1931,7 +1931,7 @@ fr-FR >> fr
|
||||
fr-FR >> fr-CA
|
||||
@supported=fr-CA, fr-FR
|
||||
fr >> fr-FR
|
||||
fr-SN >> fr-CA
|
||||
fr-SN >> fr-FR
|
||||
@supported=en, fr-FR
|
||||
fr >> fr-FR
|
||||
@supported=de, en, it
|
||||
@ -1951,3 +1951,10 @@ ru >> uk
|
||||
zh-CN >> zh-TW
|
||||
@supported=ja
|
||||
ru >> und
|
||||
|
||||
** test: favor a more-default locale among equally imperfect matches
|
||||
@supported=fr-CA, fr-CH, fr-FR, fr-GB
|
||||
fr-SN >> fr-FR
|
||||
@supported=sr-Latn, sr-Cyrl, sr-Grek
|
||||
@threshold=60
|
||||
sr-Thai >> sr-Cyrl
|
||||
|
@ -255,6 +255,8 @@ public class LocaleDistance {
|
||||
long desLangState = desLangDistance >= 0 && supportedLSRs.length > 1 ? iter.getState64() : 0;
|
||||
// Index of the supported LSR with the lowest distance.
|
||||
int bestIndex = -1;
|
||||
// Cached lookup info from XLikelySubtags.compareLikely().
|
||||
int bestLikelyInfo = -1;
|
||||
for (int slIndex = 0; slIndex < supportedLSRs.length; ++slIndex) {
|
||||
LSR supported = supportedLSRs[slIndex];
|
||||
boolean star = false;
|
||||
@ -340,13 +342,29 @@ public class LocaleDistance {
|
||||
// Distinguish between equivalent but originally unequal locales via an
|
||||
// additional micro distance.
|
||||
shiftedDistance |= (desired.flags ^ supported.flags);
|
||||
}
|
||||
if (shiftedDistance < shiftedThreshold) {
|
||||
if (shiftedDistance == 0) {
|
||||
return slIndex << INDEX_SHIFT;
|
||||
if (shiftedDistance < shiftedThreshold) {
|
||||
if (shiftedDistance == 0) {
|
||||
return slIndex << INDEX_SHIFT;
|
||||
}
|
||||
bestIndex = slIndex;
|
||||
shiftedThreshold = shiftedDistance;
|
||||
bestLikelyInfo = -1;
|
||||
}
|
||||
} else {
|
||||
if (shiftedDistance < shiftedThreshold) {
|
||||
bestIndex = slIndex;
|
||||
shiftedThreshold = shiftedDistance;
|
||||
bestLikelyInfo = -1;
|
||||
} else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
|
||||
bestLikelyInfo = XLikelySubtags.INSTANCE.compareLikely(
|
||||
supported, supportedLSRs[bestIndex], bestLikelyInfo);
|
||||
if ((bestLikelyInfo & 1) != 0) {
|
||||
// This supported locale matches as well as the previous best match,
|
||||
// and neither matches perfectly,
|
||||
// but this one is "more likely" (has more-default subtags).
|
||||
bestIndex = slIndex;
|
||||
}
|
||||
}
|
||||
bestIndex = slIndex;
|
||||
shiftedThreshold = shiftedDistance;
|
||||
}
|
||||
}
|
||||
return bestIndex >= 0 ?
|
||||
|
@ -367,6 +367,114 @@ public final class XLikelySubtags {
|
||||
return new LSR(language, script, region, retainOldMask);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests whether lsr is "more likely" than other.
|
||||
* For example, fr-Latn-FR is more likely than fr-Latn-CH because
|
||||
* FR is the default region for fr-Latn.
|
||||
*
|
||||
* <p>The likelyInfo caches lookup information between calls.
|
||||
* The return value is an updated likelyInfo value,
|
||||
* with bit 0 set if lsr is "more likely".
|
||||
* The initial value of likelyInfo must be negative.
|
||||
*/
|
||||
int compareLikely(LSR lsr, LSR other, int likelyInfo) {
|
||||
// If likelyInfo >= 0:
|
||||
// likelyInfo bit 1 is set if the previous comparison with lsr
|
||||
// was for equal language and script.
|
||||
// Otherwise the scripts differed.
|
||||
if (!lsr.language.equals(other.language)) {
|
||||
return 0xfffffffc; // negative, lsr not better than other
|
||||
}
|
||||
if (!lsr.script.equals(other.script)) {
|
||||
int index;
|
||||
if (likelyInfo >= 0 && (likelyInfo & 2) == 0) {
|
||||
index = likelyInfo >> 2;
|
||||
} else {
|
||||
index = getLikelyIndex(lsr.language, "");
|
||||
likelyInfo = index << 2;
|
||||
}
|
||||
LSR likely = lsrs[index];
|
||||
if (lsr.script.equals(likely.script)) {
|
||||
return likelyInfo | 1;
|
||||
} else {
|
||||
return likelyInfo & ~1;
|
||||
}
|
||||
}
|
||||
if (!lsr.region.equals(other.region)) {
|
||||
int index;
|
||||
if (likelyInfo >= 0 && (likelyInfo & 2) != 0) {
|
||||
index = likelyInfo >> 2;
|
||||
} else {
|
||||
index = getLikelyIndex(lsr.language, lsr.region);
|
||||
likelyInfo = (index << 2) | 2;
|
||||
}
|
||||
LSR likely = lsrs[index];
|
||||
if (lsr.region.equals(likely.region)) {
|
||||
return likelyInfo | 1;
|
||||
} else {
|
||||
return likelyInfo & ~1;
|
||||
}
|
||||
}
|
||||
return likelyInfo & ~1; // lsr not better than other
|
||||
}
|
||||
|
||||
// Subset of maximize().
|
||||
private int getLikelyIndex(String language, String script) {
|
||||
if (language.equals("und")) {
|
||||
language = "";
|
||||
}
|
||||
if (script.equals("Zzzz")) {
|
||||
script = "";
|
||||
}
|
||||
|
||||
BytesTrie iter = new BytesTrie(trie);
|
||||
long state;
|
||||
int value;
|
||||
// Small optimization: Array lookup for first language letter.
|
||||
int c0;
|
||||
if (language.length() >= 2 && 0 <= (c0 = language.charAt(0) - 'a') && c0 <= 25 &&
|
||||
(state = trieFirstLetterStates[c0]) != 0) {
|
||||
value = trieNext(iter.resetToState64(state), language, 1);
|
||||
} else {
|
||||
value = trieNext(iter, language, 0);
|
||||
}
|
||||
if (value >= 0) {
|
||||
state = iter.getState64();
|
||||
} else {
|
||||
iter.resetToState64(trieUndState); // "und" ("*")
|
||||
state = 0;
|
||||
}
|
||||
|
||||
if (value > 0) {
|
||||
// Intermediate or final value from just language.
|
||||
if (value == SKIP_SCRIPT) {
|
||||
value = 0;
|
||||
}
|
||||
} else {
|
||||
value = trieNext(iter, script, 0);
|
||||
if (value >= 0) {
|
||||
state = iter.getState64();
|
||||
} else {
|
||||
if (state == 0) {
|
||||
iter.resetToState64(trieUndZzzzState); // "und-Zzzz" ("**")
|
||||
} else {
|
||||
iter.resetToState64(state);
|
||||
value = trieNext(iter, "", 0);
|
||||
assert value >= 0;
|
||||
state = iter.getState64();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (value > 0) {
|
||||
// Final value from just language or language+script.
|
||||
} else {
|
||||
value = trieNext(iter, "", 0);
|
||||
assert value > 0;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
private static final int trieNext(BytesTrie iter, String s, int i) {
|
||||
BytesTrie.Result result;
|
||||
if (s.isEmpty()) {
|
||||
|
@ -733,7 +733,7 @@ ja >> fr
|
||||
@favor=script
|
||||
en-GB >> en-GB
|
||||
en-US >> en
|
||||
fr >> en-GB
|
||||
fr >> en
|
||||
ja >> fr
|
||||
|
||||
** test: testEmptyWithDefault
|
||||
@ -761,8 +761,8 @@ en-GB >> en-GB
|
||||
en-US >> en
|
||||
fr-FR >> fr
|
||||
ja-JP >> fr
|
||||
zu >> en
|
||||
# For a language that doesn't match anything, return the default.
|
||||
zu >> en-GB
|
||||
zxx >> fr
|
||||
|
||||
@favor=script
|
||||
@ -770,7 +770,7 @@ en-GB >> en-GB
|
||||
en-US >> en
|
||||
fr-FR >> fr
|
||||
ja-JP >> fr
|
||||
zu >> en-GB
|
||||
zu >> en
|
||||
zxx >> en
|
||||
|
||||
** test: TestExactMatch
|
||||
@ -1322,7 +1322,7 @@ en >> en-US
|
||||
@favor=script
|
||||
und >> und
|
||||
ja >> und
|
||||
fr-CA >> en-GB
|
||||
fr-CA >> en-US
|
||||
en-AU >> en-GB
|
||||
en-BZ >> en-GB
|
||||
en-CA >> en-GB
|
||||
@ -1359,8 +1359,8 @@ fr >> und
|
||||
@supported=en-GB, en-US, en, en-AU
|
||||
und >> und
|
||||
ja >> und
|
||||
fr-CA >> en-GB
|
||||
fr >> en-GB
|
||||
fr-CA >> en-US
|
||||
fr >> en-US
|
||||
@supported=en-AU, ja, ca
|
||||
fr >> en-AU
|
||||
@supported=pl, ja, ca
|
||||
@ -1901,7 +1901,7 @@ fr-FR >> fr # Parent match is chosen.
|
||||
fr-FR >> fr-CA # Sibling match is chosen.
|
||||
@supported=fr-CA, fr-FR
|
||||
fr >> fr-FR # Inferred region match is chosen.
|
||||
fr-SN >> fr-CA
|
||||
fr-SN >> fr-FR
|
||||
@supported=en, fr-FR
|
||||
fr >> fr-FR # Child match is chosen.
|
||||
@supported=de, en, it
|
||||
@ -1931,7 +1931,7 @@ fr-FR >> fr
|
||||
fr-FR >> fr-CA
|
||||
@supported=fr-CA, fr-FR
|
||||
fr >> fr-FR
|
||||
fr-SN >> fr-CA
|
||||
fr-SN >> fr-FR
|
||||
@supported=en, fr-FR
|
||||
fr >> fr-FR
|
||||
@supported=de, en, it
|
||||
@ -1951,3 +1951,10 @@ ru >> uk
|
||||
zh-CN >> zh-TW
|
||||
@supported=ja
|
||||
ru >> und
|
||||
|
||||
** test: favor a more-default locale among equally imperfect matches
|
||||
@supported=fr-CA, fr-CH, fr-FR, fr-GB
|
||||
fr-SN >> fr-FR
|
||||
@supported=sr-Latn, sr-Cyrl, sr-Grek
|
||||
@threshold=60
|
||||
sr-Thai >> sr-Cyrl
|
||||
|
Loading…
Reference in New Issue
Block a user