ICU-20917 LocaleMatcher: prefer a more-default locale

2019-12-21 06:48:17 -08:00 · 2019-12-21 06:48:17 -08:00 · 60b567d6ab
commit 60b567d6ab
parent 79fac50101
8 changed files with 306 additions and 31 deletions
--- a/icu4c/source/common/locdistance.cpp
+++ b/icu4c/source/common/locdistance.cpp
@ -69,7 +69,7 @@ void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) {
        errorCode = U_MISSING_RESOURCE_ERROR;
        return;
    }
-    gLocaleDistance = new LocaleDistance(data);
+    gLocaleDistance = new LocaleDistance(data, likely);
    if (gLocaleDistance == nullptr) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
@ -83,7 +83,8 @@ const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) {
    return gLocaleDistance;
 }

-LocaleDistance::LocaleDistance(const LocaleDistanceData &data) :
+LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely) :
+        likelySubtags(likely),
        trie(data.distanceTrieBytes),
        regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions),
        paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength),
@ -122,6 +123,8 @@ int32_t LocaleDistance::getBestIndexAndDistance(
    uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0;
    // Index of the supported LSR with the lowest distance.
    int32_t bestIndex = -1;
+    // Cached lookup info from XLikelySubtags.compareLikely().
+    int32_t bestLikelyInfo = -1;
    for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) {
        const LSR &supported = *supportedLSRs[slIndex];
        bool star = false;
@ -207,13 +210,29 @@ int32_t LocaleDistance::getBestIndexAndDistance(
            // Distinguish between equivalent but originally unequal locales via an
            // additional micro distance.
            shiftedDistance |= (desired.flags ^ supported.flags);
-        }
-        if (shiftedDistance < shiftedThreshold) {
-            if (shiftedDistance == 0) {
-                return slIndex << INDEX_SHIFT;
+            if (shiftedDistance < shiftedThreshold) {
+                if (shiftedDistance == 0) {
+                    return slIndex << INDEX_SHIFT;
+                }
+                bestIndex = slIndex;
+                shiftedThreshold = shiftedDistance;
+                bestLikelyInfo = -1;
+            }
+        } else {
+            if (shiftedDistance < shiftedThreshold) {
+                bestIndex = slIndex;
+                shiftedThreshold = shiftedDistance;
+                bestLikelyInfo = -1;
+            } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
+                bestLikelyInfo = likelySubtags.compareLikely(
+                        supported, *supportedLSRs[bestIndex], bestLikelyInfo);
+                if ((bestLikelyInfo & 1) != 0) {
+                    // This supported locale matches as well as the previous best match,
+                    // and neither matches perfectly,
+                    // but this one is "more likely" (has more-default subtags).
+                    bestIndex = slIndex;
+                }
            }
-            bestIndex = slIndex;
-            shiftedThreshold = shiftedDistance;
        }
    }
    return bestIndex >= 0 ?
--- a/icu4c/source/common/locdistance.h
+++ b/icu4c/source/common/locdistance.h
@ -82,7 +82,7 @@ private:
        return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT;
    }

-    LocaleDistance(const LocaleDistanceData &data);
+    LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely);
    LocaleDistance(const LocaleDistance &other) = delete;
    LocaleDistance &operator=(const LocaleDistance &other) = delete;

@ -110,6 +110,8 @@ private:
        return defaultRegionDistance;
    }

+    const XLikelySubtags &likelySubtags;
+
    // The trie maps each dlang+slang+dscript+sscript+dregion+sregion
    // (encoded in ASCII with bit 7 set on the last character of each subtag) to a distance.
    // There is also a trie value for each subsequence of whole subtags.
--- a/icu4c/source/common/loclikelysubtags.cpp
+++ b/icu4c/source/common/loclikelysubtags.cpp
@ -557,6 +557,106 @@ LSR XLikelySubtags::maximize(const char *language, const char *script, const cha
    return LSR(language, script, region, retainOldMask);
 }

+/**
+ * Tests whether lsr is "more likely" (has more-default subtags) than other.
+ *
+ * likelyInfo caches the likely-subtags lookup between calls;
+ * callers pass a negative value when nothing is cached yet.
+ * Layout of a non-negative likelyInfo:
+ *   bit 0:   result flag, set if lsr is "more likely" than other
+ *   bit 1:   set if the cached index came from a language+script lookup
+ *            (used to compare regions), clear if it came from a
+ *            language-only lookup (used to compare scripts)
+ *   bits 2+: index into lsrs[] of the cached likely LSR
+ *
+ * @param lsr the candidate
+ * @param other the current best candidate
+ * @param likelyInfo cached info from the previous call, or negative
+ * @return the updated likelyInfo, with bit 0 set if and only if lsr is "more likely"
+ */
+int32_t XLikelySubtags::compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const {
+    if (uprv_strcmp(lsr.language, other.language) != 0) {
+        // Different languages cannot be ranked against each other.
+        // Return a negative value (drops the cache) with bits 0 and 1 clear:
+        // lsr is not better than other.
+        return -4;
+    }
+    if (uprv_strcmp(lsr.script, other.script) != 0) {
+        // Same language, different scripts: prefer the default script of the language.
+        int32_t index;
+        if (likelyInfo >= 0 && (likelyInfo & 2) == 0) {
+            // Reuse the cached language-only lookup.
+            index = likelyInfo >> 2;
+        } else {
+            index = getLikelyIndex(lsr.language, "");
+            likelyInfo = index << 2;
+        }
+        const LSR &likely = lsrs[index];
+        if (uprv_strcmp(lsr.script, likely.script) == 0) {
+            return likelyInfo | 1;
+        } else {
+            return likelyInfo & ~1;
+        }
+    }
+    if (uprv_strcmp(lsr.region, other.region) != 0) {
+        // Same language and script, different regions:
+        // prefer the default region of the language+script.
+        int32_t index;
+        if (likelyInfo >= 0 && (likelyInfo & 2) != 0) {
+            // Reuse the cached language+script lookup.
+            index = likelyInfo >> 2;
+        } else {
+            // The second argument of getLikelyIndex() is a script, not a region.
+            index = getLikelyIndex(lsr.language, lsr.script);
+            likelyInfo = (index << 2) | 2;
+        }
+        const LSR &likely = lsrs[index];
+        if (uprv_strcmp(lsr.region, likely.region) == 0) {
+            return likelyInfo | 1;
+        } else {
+            return likelyInfo & ~1;
+        }
+    }
+    // Language, script and region are all equal: lsr is not better than other.
+    return likelyInfo & ~1;
+}
+
+// Subset of maximize(): walks the likely-subtags trie for language and script
+// (with an empty region) and returns the resulting trie value,
+// which is used as an index into lsrs[].
+int32_t XLikelySubtags::getLikelyIndex(const char *language, const char *script) const {
+    // "und" and "Zzzz" are looked up as empty subtags.
+    if (uprv_strcmp(language, "und") == 0) { language = ""; }
+    if (uprv_strcmp(script, "Zzzz") == 0) { script = ""; }
+
+    BytesTrie iter(trie);
+    uint64_t savedState;
+    int32_t result;
+
+    // Language step.
+    // Small optimization: jump straight to the precomputed trie state
+    // for the first letter when the language has at least two characters.
+    int32_t firstLetter;
+    if (0 <= (firstLetter = uprv_lowerOrdinal(language[0])) && firstLetter <= 25 &&
+            language[1] != 0 &&
+            (savedState = trieFirstLetterStates[firstLetter]) != 0) {
+        result = trieNext(iter.resetToState64(savedState), language, 1);
+    } else {
+        result = trieNext(iter, language, 0);
+    }
+    if (result < 0) {
+        // Language not found: continue from "und" ("*").
+        iter.resetToState64(trieUndState);
+        savedState = 0;
+    } else {
+        savedState = iter.getState64();
+    }
+
+    // Script step, skipped when the language alone already yielded a value.
+    if (result <= 0) {
+        result = trieNext(iter, script, 0);
+        if (result >= 0) {
+            savedState = iter.getState64();
+        } else if (savedState == 0) {
+            // Neither language nor script found: continue from "und-Zzzz" ("**").
+            iter.resetToState64(trieUndZzzzState);
+        } else {
+            // Script not found: go back to the language and use the empty script.
+            iter.resetToState64(savedState);
+            result = trieNext(iter, "", 0);
+            U_ASSERT(result >= 0);
+            savedState = iter.getState64();
+        }
+    } else if (result == SKIP_SCRIPT) {
+        // Intermediate value from just the language: not final yet.
+        result = 0;
+    }
+
+    // Region step: needed only if there is no final value yet.
+    if (result <= 0) {
+        result = trieNext(iter, "", 0);
+        U_ASSERT(result > 0);
+    }
+    U_ASSERT(result < lsrsLength);
+    return result;
+}
+
 int32_t XLikelySubtags::trieNext(BytesTrie &iter, const char *s, int32_t i) {
    UStringTrieResult result;
    uint8_t c;
--- a/icu4c/source/common/loclikelysubtags.h
+++ b/icu4c/source/common/loclikelysubtags.h
@ -85,6 +85,18 @@ public:
    // VisibleForTesting
    LSR makeMaximizedLsrFrom(const Locale &locale, UErrorCode &errorCode) const;

+    /**
+     * Tests whether lsr is "more likely" than other.
+     * For example, fr-Latn-FR is more likely than fr-Latn-CH because
+     * FR is the default region for fr-Latn.
+     *
+     * The likelyInfo caches lookup information between calls.
+     * The return value is an updated likelyInfo value,
+     * with bit 0 set if lsr is "more likely".
+     * The initial value of likelyInfo must be negative.
+     */
+    int32_t compareLikely(const LSR &lsr, const LSR &other, int32_t likelyInfo) const;
+
    // TODO(ICU-20777): Switch Locale/uloc_ likely-subtags API from the old code
    // in loclikely.cpp to this new code, including activating this
    // minimizeSubtags() function. The LocaleMatcher does not minimize.
@ -111,6 +123,8 @@ private:
     */
    LSR maximize(const char *language, const char *script, const char *region) const;

+    int32_t getLikelyIndex(const char *language, const char *script) const;
+
    static int32_t trieNext(BytesTrie &iter, const char *s, int32_t i);

    UResourceBundle *langInfoBundle;
--- a/icu4c/source/test/testdata/localeMatcherTest.txt
+++ b/icu4c/source/test/testdata/localeMatcherTest.txt
@ -733,7 +733,7 @@ ja >> fr
@favor=script
 en-GB >> en-GB
 en-US >> en
-fr >> en-GB
+fr >> en
 ja >> fr

 ** test: testEmptyWithDefault
@ -761,8 +761,8 @@ en-GB >> en-GB
 en-US >> en
 fr-FR >> fr
 ja-JP >> fr
+zu >> en
 # For a language that doesn't match anything, return the default.
-zu >> en-GB
 zxx >> fr

@favor=script
@ -770,7 +770,7 @@ en-GB >> en-GB
 en-US >> en
 fr-FR >> fr
 ja-JP >> fr
-zu >> en-GB
+zu >> en
 zxx >> en

 ** test: TestExactMatch
@ -1322,7 +1322,7 @@ en >> en-US
@favor=script
 und >> und
 ja >> und
-fr-CA >> en-GB
+fr-CA >> en-US
 en-AU >> en-GB
 en-BZ >> en-GB
 en-CA >> en-GB
@ -1359,8 +1359,8 @@ fr >> und
@supported=en-GB, en-US, en, en-AU
 und >> und
 ja >> und
-fr-CA >> en-GB
-fr >> en-GB
+fr-CA >> en-US
+fr >> en-US
@supported=en-AU, ja, ca
 fr >> en-AU
@supported=pl, ja, ca
@ -1901,7 +1901,7 @@ fr-FR >> fr # Parent match is chosen.
 fr-FR >> fr-CA # Sibling match is chosen.
@supported=fr-CA, fr-FR
 fr >> fr-FR # Inferred region match is chosen.
-fr-SN >> fr-CA
+fr-SN >> fr-FR
@supported=en, fr-FR
 fr >> fr-FR # Child match is chosen.
@supported=de, en, it
@ -1931,7 +1931,7 @@ fr-FR >> fr
 fr-FR >> fr-CA
@supported=fr-CA, fr-FR
 fr >> fr-FR
-fr-SN >> fr-CA
+fr-SN >> fr-FR
@supported=en, fr-FR
 fr >> fr-FR
@supported=de, en, it
@ -1951,3 +1951,10 @@ ru >> uk
 zh-CN >> zh-TW
@supported=ja
 ru >> und
+
+** test: favor a more-default locale among equally imperfect matches
+@supported=fr-CA, fr-CH, fr-FR, fr-GB
+fr-SN >> fr-FR
+@supported=sr-Latn, sr-Cyrl, sr-Grek
+@threshold=60
+sr-Thai >> sr-Cyrl
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LocaleDistance.java
@ -255,6 +255,8 @@ public class LocaleDistance {
        long desLangState = desLangDistance >= 0 && supportedLSRs.length > 1 ? iter.getState64() : 0;
        // Index of the supported LSR with the lowest distance.
        int bestIndex = -1;
+        // Cached lookup info from XLikelySubtags.compareLikely().
+        int bestLikelyInfo = -1;
        for (int slIndex = 0; slIndex < supportedLSRs.length; ++slIndex) {
            LSR supported = supportedLSRs[slIndex];
            boolean star = false;
@ -340,13 +342,29 @@ public class LocaleDistance {
                // Distinguish between equivalent but originally unequal locales via an
                // additional micro distance.
                shiftedDistance |= (desired.flags ^ supported.flags);
-            }
-            if (shiftedDistance < shiftedThreshold) {
-                if (shiftedDistance == 0) {
-                    return slIndex << INDEX_SHIFT;
+                if (shiftedDistance < shiftedThreshold) {
+                    if (shiftedDistance == 0) {
+                        return slIndex << INDEX_SHIFT;
+                    }
+                    bestIndex = slIndex;
+                    shiftedThreshold = shiftedDistance;
+                    bestLikelyInfo = -1;
+                }
+            } else {
+                if (shiftedDistance < shiftedThreshold) {
+                    bestIndex = slIndex;
+                    shiftedThreshold = shiftedDistance;
+                    bestLikelyInfo = -1;
+                } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
+                    bestLikelyInfo = XLikelySubtags.INSTANCE.compareLikely(
+                            supported, supportedLSRs[bestIndex], bestLikelyInfo);
+                    if ((bestLikelyInfo & 1) != 0) {
+                        // This supported locale matches as well as the previous best match,
+                        // and neither matches perfectly,
+                        // but this one is "more likely" (has more-default subtags).
+                        bestIndex = slIndex;
+                    }
                }
-                bestIndex = slIndex;
-                shiftedThreshold = shiftedDistance;
            }
        }
        return bestIndex >= 0 ?
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/XLikelySubtags.java
@ -367,6 +367,114 @@ public final class XLikelySubtags {
        return new LSR(language, script, region, retainOldMask);
    }

+    /**
+     * Tests whether lsr is "more likely" than other.
+     * For example, fr-Latn-FR is more likely than fr-Latn-CH because
+     * FR is the default region for fr-Latn.
+     *
+     * <p>The likelyInfo caches lookup information between calls.
+     * The return value is an updated likelyInfo value,
+     * with bit 0 set if lsr is "more likely".
+     * The initial value of likelyInfo must be negative.
+     *
+     * <p>Layout of a non-negative likelyInfo:
+     * bit 1 is set if the cached index came from a language+script lookup
+     * (used to compare regions) and clear if it came from a language-only lookup
+     * (used to compare scripts); bits 2 and up hold the index into lsrs.
+     *
+     * @param lsr the candidate
+     * @param other the current best candidate
+     * @param likelyInfo cached info from the previous call, or negative
+     * @return the updated likelyInfo, with bit 0 set if and only if lsr is "more likely"
+     */
+    int compareLikely(LSR lsr, LSR other, int likelyInfo) {
+        if (!lsr.language.equals(other.language)) {
+            // Different languages cannot be ranked against each other.
+            // Return a negative value (drops the cache) with bits 0 and 1 clear:
+            // lsr is not better than other.
+            return -4;
+        }
+        if (!lsr.script.equals(other.script)) {
+            // Same language, different scripts: prefer the default script of the language.
+            int index;
+            if (likelyInfo >= 0 && (likelyInfo & 2) == 0) {
+                // Reuse the cached language-only lookup.
+                index = likelyInfo >> 2;
+            } else {
+                index = getLikelyIndex(lsr.language, "");
+                likelyInfo = index << 2;
+            }
+            LSR likely = lsrs[index];
+            if (lsr.script.equals(likely.script)) {
+                return likelyInfo | 1;
+            } else {
+                return likelyInfo & ~1;
+            }
+        }
+        if (!lsr.region.equals(other.region)) {
+            // Same language and script, different regions:
+            // prefer the default region of the language+script.
+            int index;
+            if (likelyInfo >= 0 && (likelyInfo & 2) != 0) {
+                // Reuse the cached language+script lookup.
+                index = likelyInfo >> 2;
+            } else {
+                // The second argument of getLikelyIndex() is a script, not a region.
+                index = getLikelyIndex(lsr.language, lsr.script);
+                likelyInfo = (index << 2) | 2;
+            }
+            LSR likely = lsrs[index];
+            if (lsr.region.equals(likely.region)) {
+                return likelyInfo | 1;
+            } else {
+                return likelyInfo & ~1;
+            }
+        }
+        // Language, script and region are all equal: lsr is not better than other.
+        return likelyInfo & ~1;
+    }
+
+    // Subset of maximize(): walks the likely-subtags trie for language and script
+    // (with an empty region) and returns the resulting trie value,
+    // which is used as an index into lsrs.
+    private int getLikelyIndex(String language, String script) {
+        // "und" and "Zzzz" are looked up as empty subtags.
+        if (language.equals("und")) { language = ""; }
+        if (script.equals("Zzzz")) { script = ""; }
+
+        BytesTrie iter = new BytesTrie(trie);
+        long savedState;
+        int result;
+
+        // Language step.
+        // Small optimization: jump straight to the precomputed trie state
+        // for the first letter when the language has at least two characters.
+        int firstLetter;
+        if (language.length() >= 2 &&
+                0 <= (firstLetter = language.charAt(0) - 'a') && firstLetter <= 25 &&
+                (savedState = trieFirstLetterStates[firstLetter]) != 0) {
+            result = trieNext(iter.resetToState64(savedState), language, 1);
+        } else {
+            result = trieNext(iter, language, 0);
+        }
+        if (result < 0) {
+            // Language not found: continue from "und" ("*").
+            iter.resetToState64(trieUndState);
+            savedState = 0;
+        } else {
+            savedState = iter.getState64();
+        }
+
+        // Script step, skipped when the language alone already yielded a value.
+        if (result <= 0) {
+            result = trieNext(iter, script, 0);
+            if (result >= 0) {
+                savedState = iter.getState64();
+            } else if (savedState == 0) {
+                // Neither language nor script found: continue from "und-Zzzz" ("**").
+                iter.resetToState64(trieUndZzzzState);
+            } else {
+                // Script not found: go back to the language and use the empty script.
+                iter.resetToState64(savedState);
+                result = trieNext(iter, "", 0);
+                assert result >= 0;
+                savedState = iter.getState64();
+            }
+        } else if (result == SKIP_SCRIPT) {
+            // Intermediate value from just the language: not final yet.
+            result = 0;
+        }
+
+        // Region step: needed only if there is no final value yet.
+        if (result <= 0) {
+            result = trieNext(iter, "", 0);
+            assert result > 0;
+        }
+        return result;
+    }
+
    private static final int trieNext(BytesTrie iter, String s, int i) {
        BytesTrie.Result result;
        if (s.isEmpty()) {
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/data/localeMatcherTest.txt
@ -733,7 +733,7 @@ ja >> fr
@favor=script
 en-GB >> en-GB
 en-US >> en
-fr >> en-GB
+fr >> en
 ja >> fr

 ** test: testEmptyWithDefault
@ -761,8 +761,8 @@ en-GB >> en-GB
 en-US >> en
 fr-FR >> fr
 ja-JP >> fr
+zu >> en
 # For a language that doesn't match anything, return the default.
-zu >> en-GB
 zxx >> fr

@favor=script
@ -770,7 +770,7 @@ en-GB >> en-GB
 en-US >> en
 fr-FR >> fr
 ja-JP >> fr
-zu >> en-GB
+zu >> en
 zxx >> en

 ** test: TestExactMatch
@ -1322,7 +1322,7 @@ en >> en-US
@favor=script
 und >> und
 ja >> und
-fr-CA >> en-GB
+fr-CA >> en-US
 en-AU >> en-GB
 en-BZ >> en-GB
 en-CA >> en-GB
@ -1359,8 +1359,8 @@ fr >> und
@supported=en-GB, en-US, en, en-AU
 und >> und
 ja >> und
-fr-CA >> en-GB
-fr >> en-GB
+fr-CA >> en-US
+fr >> en-US
@supported=en-AU, ja, ca
 fr >> en-AU
@supported=pl, ja, ca
@ -1901,7 +1901,7 @@ fr-FR >> fr # Parent match is chosen.
 fr-FR >> fr-CA # Sibling match is chosen.
@supported=fr-CA, fr-FR
 fr >> fr-FR # Inferred region match is chosen.
-fr-SN >> fr-CA
+fr-SN >> fr-FR
@supported=en, fr-FR
 fr >> fr-FR # Child match is chosen.
@supported=de, en, it
@ -1931,7 +1931,7 @@ fr-FR >> fr
 fr-FR >> fr-CA
@supported=fr-CA, fr-FR
 fr >> fr-FR
-fr-SN >> fr-CA
+fr-SN >> fr-FR
@supported=en, fr-FR
 fr >> fr-FR
@supported=de, en, it
@ -1951,3 +1951,10 @@ ru >> uk
 zh-CN >> zh-TW
@supported=ja
 ru >> und
+
+** test: favor a more-default locale among equally imperfect matches
+@supported=fr-CA, fr-CH, fr-FR, fr-GB
+fr-SN >> fr-FR
+@supported=sr-Latn, sr-Cyrl, sr-Grek
+@threshold=60
+sr-Thai >> sr-Cyrl