ICU-20936 add LocaleMatcher.Builder.setDirection(with-one-way vs. only-two-way)

This commit is contained in:
Markus Scherer 2020-03-09 16:57:23 -07:00
parent 547030b8da
commit 72cd937620
8 changed files with 240 additions and 43 deletions

View File

@ -131,7 +131,8 @@ LocaleMatcher::Builder::Builder(LocaleMatcher::Builder &&src) U_NOEXCEPT :
thresholdDistance_(src.thresholdDistance_),
demotion_(src.demotion_),
defaultLocale_(src.defaultLocale_),
favor_(src.favor_) {
favor_(src.favor_),
direction_(src.direction_) {
src.supportedLocales_ = nullptr;
src.defaultLocale_ = nullptr;
}
@ -150,6 +151,7 @@ LocaleMatcher::Builder &LocaleMatcher::Builder::operator=(LocaleMatcher::Builder
demotion_ = src.demotion_;
defaultLocale_ = src.defaultLocale_;
favor_ = src.favor_;
direction_ = src.direction_;
src.supportedLocales_ = nullptr;
src.defaultLocale_ = nullptr;
@ -332,6 +334,7 @@ LocaleMatcher::LocaleMatcher(const Builder &builder, UErrorCode &errorCode) :
thresholdDistance(builder.thresholdDistance_),
demotionPerDesiredLocale(0),
favorSubtag(builder.favor_),
direction(builder.direction_),
supportedLocales(nullptr), lsrs(nullptr), supportedLocalesLength(0),
supportedLsrToIndex(nullptr),
supportedLSRs(nullptr), supportedIndexes(nullptr), supportedLSRsLength(0),
@ -649,7 +652,8 @@ int32_t LocaleMatcher::getBestSuppIndex(LSR desiredLSR, LocaleLsrIterator *remai
}
}
int32_t bestIndexAndDistance = localeDistance.getBestIndexAndDistance(
desiredLSR, supportedLSRs, supportedLSRsLength, bestShiftedDistance, favorSubtag);
desiredLSR, supportedLSRs, supportedLSRsLength,
bestShiftedDistance, favorSubtag, direction);
if (bestIndexAndDistance >= 0) {
bestShiftedDistance = LocaleDistance::getShiftedDistance(bestIndexAndDistance);
if (remainingIter != nullptr) {
@ -683,7 +687,7 @@ double LocaleMatcher::internalMatch(const Locale &desired, const Locale &support
int32_t indexAndDistance = localeDistance.getBestIndexAndDistance(
getMaximalLsrOrUnd(likelySubtags, desired, errorCode),
&pSuppLSR, 1,
LocaleDistance::shiftDistance(thresholdDistance), favorSubtag);
LocaleDistance::shiftDistance(thresholdDistance), favorSubtag, direction);
double distance = LocaleDistance::getDistanceDouble(indexAndDistance);
return (100.0 - distance) / 100.0;
}

View File

@ -102,14 +102,15 @@ LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const XLikelySubt
LSR enGB("en", "Latn", "GB", LSR::EXPLICIT_LSR);
const LSR *p_enGB = &enGB;
int32_t indexAndDistance = getBestIndexAndDistance(en, &p_enGB, 1,
shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE);
shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE, ULOCMATCH_DIRECTION_WITH_ONE_WAY);
defaultDemotionPerDesiredLocale = getDistanceFloor(indexAndDistance);
}
int32_t LocaleDistance::getBestIndexAndDistance(
const LSR &desired,
const LSR **supportedLSRs, int32_t supportedLSRsLength,
int32_t shiftedThreshold, ULocMatchFavorSubtag favorSubtag) const {
int32_t shiftedThreshold,
ULocMatchFavorSubtag favorSubtag, ULocMatchDirection direction) const {
// Round up the shifted threshold (if fraction bits are not 0)
// for comparison with un-shifted distances until we need fraction bits.
// (If we simply shifted non-zero fraction bits away, then we might ignore a language
@ -211,26 +212,38 @@ int32_t LocaleDistance::getBestIndexAndDistance(
// additional micro distance.
shiftedDistance |= (desired.flags ^ supported.flags);
if (shiftedDistance < shiftedThreshold) {
if (shiftedDistance == 0) {
return slIndex << INDEX_SHIFT;
if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
// Is there also a match when we swap desired/supported?
isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
if (shiftedDistance == 0) {
return slIndex << INDEX_SHIFT;
}
bestIndex = slIndex;
shiftedThreshold = shiftedDistance;
bestLikelyInfo = -1;
}
bestIndex = slIndex;
shiftedThreshold = shiftedDistance;
bestLikelyInfo = -1;
}
} else {
if (shiftedDistance < shiftedThreshold) {
bestIndex = slIndex;
shiftedThreshold = shiftedDistance;
bestLikelyInfo = -1;
} else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
bestLikelyInfo = likelySubtags.compareLikely(
supported, *supportedLSRs[bestIndex], bestLikelyInfo);
if ((bestLikelyInfo & 1) != 0) {
// This supported locale matches as well as the previous best match,
// and neither matches perfectly,
// but this one is "more likely" (has more-default subtags).
if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
// Is there also a match when we swap desired/supported?
isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
bestIndex = slIndex;
shiftedThreshold = shiftedDistance;
bestLikelyInfo = -1;
}
} else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
// Is there also a match when we swap desired/supported?
isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
bestLikelyInfo = likelySubtags.compareLikely(
supported, *supportedLSRs[bestIndex], bestLikelyInfo);
if ((bestLikelyInfo & 1) != 0) {
// This supported locale matches as well as the previous best match,
// and neither matches perfectly,
// but this one is "more likely" (has more-default subtags).
bestIndex = slIndex;
}
}
}
}

View File

@ -55,7 +55,8 @@ public:
int32_t getBestIndexAndDistance(const LSR &desired,
const LSR **supportedLSRs, int32_t supportedLSRsLength,
int32_t shiftedThreshold,
ULocMatchFavorSubtag favorSubtag) const;
ULocMatchFavorSubtag favorSubtag,
ULocMatchDirection direction) const;
UBool isParadigmLSR(const LSR &lsr) const;
@ -88,6 +89,14 @@ private:
static void initLocaleDistance(UErrorCode &errorCode);
/**
 * Tests a single desired/supported pair against the threshold.
 * Always evaluates the pair with ULOCMATCH_DIRECTION_WITH_ONE_WAY,
 * so this check never asks for a further swapped-pair check.
 *
 * @return true if getBestIndexAndDistance() yields a non-negative result
 *         for this one supported LSR
 */
UBool isMatch(const LSR &desired, const LSR &supported,
              int32_t shiftedThreshold, ULocMatchFavorSubtag favorSubtag) const {
    // getBestIndexAndDistance() takes an array of LSR pointers; wrap the one we have.
    const LSR *onlySupported = &supported;
    int32_t indexAndDistance = getBestIndexAndDistance(
        desired, &onlySupported, 1,
        shiftedThreshold, favorSubtag, ULOCMATCH_DIRECTION_WITH_ONE_WAY);
    return indexAndDistance >= 0;
}
static int32_t getDesSuppScriptDistance(BytesTrie &iter, uint64_t startState,
const char *desired, const char *supported);

View File

@ -25,7 +25,7 @@
/**
* Builder option for whether the language subtag or the script subtag is most important.
*
* @see Builder#setFavorSubtag(FavorSubtag)
* @see Builder#setFavorSubtag(ULocMatchFavorSubtag)
* @draft ICU 65
*/
enum ULocMatchFavorSubtag {
@ -51,7 +51,7 @@ typedef enum ULocMatchFavorSubtag ULocMatchFavorSubtag;
* Builder option for whether all desired locales are treated equally or
* earlier ones are preferred.
*
* @see Builder#setDemotionPerDesiredLocale(Demotion)
* @see Builder#setDemotionPerDesiredLocale(ULocMatchDemotion)
* @draft ICU 65
*/
enum ULocMatchDemotion {
@ -93,6 +93,42 @@ enum ULocMatchDemotion {
typedef enum ULocMatchDemotion ULocMatchDemotion;
#endif
/**
* Builder option for whether to include or ignore one-way (fallback) match data.
* The LocaleMatcher uses CLDR languageMatch data which includes fallback (oneway=true) entries.
* Sometimes it is desirable to ignore those.
*
* <p>For example, consider a web application with the UI in a given language,
* with a link to another, related web app.
* The link should include the UI language, and the target server may also use
* the client's Accept-Language header data.
* The target server has its own list of supported languages.
* One may want to favor UI language consistency, that is,
* if there is a decent match for the original UI language, we want to use it,
* but not if it is merely a fallback.
*
* @see Builder#setDirection(ULocMatchDirection)
* @draft ICU 67
*/
enum ULocMatchDirection {
/**
* Locale matching includes one-way matches such as Breton→French. (default)
*
* @draft ICU 67
*/
ULOCMATCH_DIRECTION_WITH_ONE_WAY,
/**
* Locale matching limited to two-way matches including e.g. Danish↔Norwegian
* but ignoring one-way matches.
* In this mode a supported locale is accepted only if it also matches
* when the desired and supported locales are swapped.
*
* @draft ICU 67
*/
ULOCMATCH_DIRECTION_ONLY_TWO_WAY
};
#ifndef U_IN_DOXYGEN
typedef enum ULocMatchDirection ULocMatchDirection;
#endif
struct UHashtable;
U_NAMESPACE_BEGIN
@ -412,6 +448,21 @@ public:
*/
Builder &setDemotionPerDesiredLocale(ULocMatchDemotion demotion);
/**
 * Chooses whether matching includes one-way (fallback) match data or ignores it.
 * One-way matches are included unless this option is changed.
 * The new value is stored only while this Builder's error code indicates success.
 *
 * @param direction the match direction to set.
 * @return this Builder object
 * @draft ICU 67
 */
Builder &setDirection(ULocMatchDirection direction) {
    if (!U_SUCCESS(errorCode_)) {
        // An earlier setter failed: leave the option untouched.
        return *this;
    }
    direction_ = direction;
    return *this;
}
/**
* Sets the UErrorCode if an error occurred while setting parameters.
* Preserves older error codes in the outErrorCode.
@ -451,6 +502,7 @@ public:
ULocMatchDemotion demotion_ = ULOCMATCH_DEMOTION_REGION;
Locale *defaultLocale_ = nullptr;
ULocMatchFavorSubtag favor_ = ULOCMATCH_FAVOR_LANGUAGE;
ULocMatchDirection direction_ = ULOCMATCH_DIRECTION_WITH_ONE_WAY;
};
// FYI No public LocaleMatcher constructors in C++; use the Builder.
@ -583,6 +635,7 @@ private:
int32_t thresholdDistance;
int32_t demotionPerDesiredLocale;
ULocMatchFavorSubtag favorSubtag;
ULocMatchDirection direction;
// These are in input order.
const Locale ** supportedLocales;

View File

@ -58,6 +58,7 @@ public:
void testSupportedDefault();
void testUnsupportedDefault();
void testDemotion();
void testDirection();
void testMatch();
void testResolvedLocale();
void testDataDriven();
@ -81,6 +82,7 @@ void LocaleMatcherTest::runIndexedTest(int32_t index, UBool exec, const char *&n
TESTCASE_AUTO(testSupportedDefault);
TESTCASE_AUTO(testUnsupportedDefault);
TESTCASE_AUTO(testDemotion);
TESTCASE_AUTO(testDirection);
TESTCASE_AUTO(testMatch);
TESTCASE_AUTO(testResolvedLocale);
TESTCASE_AUTO(testDataDriven);
@ -322,6 +324,31 @@ void LocaleMatcherTest::testDemotion() {
}
}
void LocaleMatcherTest::testDirection() {
IcuTestErrorCode errorCode(*this, "testDirection");
Locale supported[] = { "ar", "nn" };
Locale desired[] = { "arz-EG", "nb-DK" };
LocaleMatcher::Builder builder;
builder.setSupportedLocales(ARRAY_RANGE(supported));
{
// arz is a close one-way match to ar, and the region matches.
// (Egyptian Arabic vs. Arabic)
LocaleMatcher withOneWay = builder.build(errorCode);
Locale::RangeIterator<Locale *> desiredIter(ARRAY_RANGE(desired));
assertEquals("with one-way", "ar",
locString(withOneWay.getBestMatch(desiredIter, errorCode)));
}
{
// nb is a less close two-way match to nn, and the regions differ.
// (Norwegian Bokmal vs. Nynorsk)
LocaleMatcher onlyTwoWay =
builder.setDirection(ULOCMATCH_DIRECTION_ONLY_TWO_WAY).build(errorCode);
Locale::RangeIterator<Locale *> desiredIter(ARRAY_RANGE(desired));
assertEquals("only two-way", "nn",
locString(onlyTwoWay.getBestMatch(desiredIter, errorCode)));
}
}
void LocaleMatcherTest::testMatch() {
IcuTestErrorCode errorCode(*this, "testMatch");
LocaleMatcher matcher = LocaleMatcher::Builder().build(errorCode);

View File

@ -15,6 +15,7 @@ import com.ibm.icu.impl.ICUData;
import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.impl.UResource;
import com.ibm.icu.util.BytesTrie;
import com.ibm.icu.util.LocaleMatcher;
import com.ibm.icu.util.LocaleMatcher.FavorSubtag;
import com.ibm.icu.util.ULocale;
@ -211,7 +212,7 @@ public class LocaleDistance {
LSR en = new LSR("en", "Latn", "US", LSR.EXPLICIT_LSR);
LSR enGB = new LSR("en", "Latn", "GB", LSR.EXPLICIT_LSR);
int indexAndDistance = getBestIndexAndDistance(en, new LSR[] { enGB }, 1,
shiftDistance(50), FavorSubtag.LANGUAGE);
shiftDistance(50), FavorSubtag.LANGUAGE, LocaleMatcher.Direction.WITH_ONE_WAY);
defaultDemotionPerDesiredLocale = getDistanceFloor(indexAndDistance);
if (DEBUG_OUTPUT) {
@ -229,7 +230,7 @@ public class LocaleDistance {
LSR supportedLSR = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(supported);
LSR desiredLSR = XLikelySubtags.INSTANCE.makeMaximizedLsrFrom(desired);
int indexAndDistance = getBestIndexAndDistance(desiredLSR, new LSR[] { supportedLSR }, 1,
shiftDistance(threshold), favorSubtag);
shiftDistance(threshold), favorSubtag, LocaleMatcher.Direction.WITH_ONE_WAY);
return getDistanceFloor(indexAndDistance);
}
@ -242,7 +243,7 @@ public class LocaleDistance {
* and its distance (0..ABOVE_THRESHOLD) in the low bits.
*/
public int getBestIndexAndDistance(LSR desired, LSR[] supportedLSRs, int supportedLSRsLength,
int shiftedThreshold, FavorSubtag favorSubtag) {
int shiftedThreshold, FavorSubtag favorSubtag, LocaleMatcher.Direction direction) {
// Round up the shifted threshold (if fraction bits are not 0)
// for comparison with un-shifted distances until we need fraction bits.
// (If we simply shifted non-zero fraction bits away, then we might ignore a language
@ -344,26 +345,38 @@ public class LocaleDistance {
// additional micro distance.
shiftedDistance |= (desired.flags ^ supported.flags);
if (shiftedDistance < shiftedThreshold) {
if (shiftedDistance == 0) {
return slIndex << INDEX_SHIFT;
if (direction != LocaleMatcher.Direction.ONLY_TWO_WAY ||
// Is there also a match when we swap desired/supported?
isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
if (shiftedDistance == 0) {
return slIndex << INDEX_SHIFT;
}
bestIndex = slIndex;
shiftedThreshold = shiftedDistance;
bestLikelyInfo = -1;
}
bestIndex = slIndex;
shiftedThreshold = shiftedDistance;
bestLikelyInfo = -1;
}
} else {
if (shiftedDistance < shiftedThreshold) {
bestIndex = slIndex;
shiftedThreshold = shiftedDistance;
bestLikelyInfo = -1;
} else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
bestLikelyInfo = XLikelySubtags.INSTANCE.compareLikely(
supported, supportedLSRs[bestIndex], bestLikelyInfo);
if ((bestLikelyInfo & 1) != 0) {
// This supported locale matches as well as the previous best match,
// and neither matches perfectly,
// but this one is "more likely" (has more-default subtags).
if (direction != LocaleMatcher.Direction.ONLY_TWO_WAY ||
// Is there also a match when we swap desired/supported?
isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
bestIndex = slIndex;
shiftedThreshold = shiftedDistance;
bestLikelyInfo = -1;
}
} else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
if (direction != LocaleMatcher.Direction.ONLY_TWO_WAY ||
// Is there also a match when we swap desired/supported?
isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
bestLikelyInfo = XLikelySubtags.INSTANCE.compareLikely(
supported, supportedLSRs[bestIndex], bestLikelyInfo);
if ((bestLikelyInfo & 1) != 0) {
// This supported locale matches as well as the previous best match,
// and neither matches perfectly,
// but this one is "more likely" (has more-default subtags).
bestIndex = slIndex;
}
}
}
}
@ -373,6 +386,13 @@ public class LocaleDistance {
INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD);
}
/**
 * Tests a single desired/supported pair against the threshold.
 *
 * <p>The pair is always evaluated with {@link LocaleMatcher.Direction#WITH_ONE_WAY}:
 * this method is what getBestIndexAndDistance() calls for the swapped-pair check
 * in ONLY_TWO_WAY mode, so it must not request another swapped check itself.
 *
 * @param desired the LSR treated as the desired locale for this check
 * @param supported the LSR treated as the single supported locale
 * @param shiftedThreshold the shifted distance threshold passed through unchanged
 * @param favorSubtag the favor-subtag option passed through unchanged
 * @return true if getBestIndexAndDistance() returns a non-negative value for the pair
 */
private boolean isMatch(LSR desired, LSR supported,
        int shiftedThreshold, FavorSubtag favorSubtag) {
    // Pass the direction explicitly instead of null, consistent with the
    // other single-pair callers in this class.
    return getBestIndexAndDistance(
            desired, new LSR[] { supported }, 1,
            shiftedThreshold, favorSubtag, LocaleMatcher.Direction.WITH_ONE_WAY) >= 0;
}
private static final int getDesSuppScriptDistance(BytesTrie iter, long startState,
String desired, String supported) {
// Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.

View File

@ -155,6 +155,42 @@ public final class LocaleMatcher {
REGION
}
/**
* Builder option for whether to include or ignore one-way (fallback) match data.
* The LocaleMatcher uses CLDR languageMatch data which includes fallback (oneway=true) entries.
* Sometimes it is desirable to ignore those.
*
* <p>For example, consider a web application with the UI in a given language,
* with a link to another, related web app.
* The link should include the UI language, and the target server may also use
* the client's Accept-Language header data.
* The target server has its own list of supported languages.
* One may want to favor UI language consistency, that is,
* if there is a decent match for the original UI language, we want to use it,
* but not if it is merely a fallback.
*
* @see LocaleMatcher.Builder#setDirection(Direction)
* @draft ICU 67
* @provisional This API might change or be removed in a future release.
*/
public enum Direction {
/**
* Locale matching includes one-way matches such as Breton→French. (default)
*
* @draft ICU 67
* @provisional This API might change or be removed in a future release.
*/
WITH_ONE_WAY,
/**
* Locale matching limited to two-way matches including e.g. Danish↔Norwegian
* but ignoring one-way matches.
* In this mode a supported locale is accepted only if it also matches
* when the desired and supported locales are swapped.
*
* @draft ICU 67
* @provisional This API might change or be removed in a future release.
*/
ONLY_TWO_WAY
}
/**
* Data for the best-matching pair of a desired and a supported locale.
*
@ -319,6 +355,7 @@ public final class LocaleMatcher {
private final int thresholdDistance;
private final int demotionPerDesiredLocale;
private final FavorSubtag favorSubtag;
private final Direction direction;
// These are in input order.
private final ULocale[] supportedULocales;
@ -346,6 +383,7 @@ public final class LocaleMatcher {
private Demotion demotion;
private ULocale defaultLocale;
private FavorSubtag favor;
private Direction direction;
private Builder() {}
@ -483,6 +521,20 @@ public final class LocaleMatcher {
return this;
}
/**
 * Chooses whether matching includes one-way (fallback) match data or ignores it.
 * One-way matches are included unless {@link Direction#ONLY_TWO_WAY} is set.
 * The value is stored as given and handed to the LocaleMatcher when it is built.
 *
 * @param direction the match direction to set.
 * @return this Builder object
 * @draft ICU 67
 * @provisional This API might change or be removed in a future release.
 */
public Builder setDirection(Direction direction) {
    this.direction = direction;
    return this;
}
/**
* <i>Internal only!</i>
*
@ -661,6 +713,7 @@ public final class LocaleMatcher {
builder.demotion == Demotion.NONE ? 0 :
LocaleDistance.INSTANCE.getDefaultDemotionPerDesiredLocale(); // null or REGION
favorSubtag = builder.favor;
direction = builder.direction;
if (TRACE_MATCHER) {
System.err.printf("new LocaleMatcher: %s\n", toString());
}
@ -945,7 +998,7 @@ public final class LocaleMatcher {
}
int bestIndexAndDistance = LocaleDistance.INSTANCE.getBestIndexAndDistance(
desiredLSR, supportedLSRs, supportedLSRsLength,
bestShiftedDistance, favorSubtag);
bestShiftedDistance, favorSubtag, direction);
if (bestIndexAndDistance >= 0) {
bestShiftedDistance = LocaleDistance.getShiftedDistance(bestIndexAndDistance);
if (remainingIter != null) { remainingIter.rememberCurrent(desiredIndex); }
@ -998,7 +1051,7 @@ public final class LocaleMatcher {
int indexAndDistance = LocaleDistance.INSTANCE.getBestIndexAndDistance(
getMaximalLsrOrUnd(desired),
new LSR[] { getMaximalLsrOrUnd(supported) }, 1,
LocaleDistance.shiftDistance(thresholdDistance), favorSubtag);
LocaleDistance.shiftDistance(thresholdDistance), favorSubtag, direction);
double distance = LocaleDistance.getDistanceDouble(indexAndDistance);
if (TRACE_MATCHER) {
System.err.printf("LocaleMatcher distance(desired=%s, supported=%s)=%g\n",
@ -1044,6 +1097,9 @@ public final class LocaleMatcher {
if (favorSubtag != null) {
s.append(" favor=").append(favorSubtag);
}
if (direction != null) {
s.append(" direction=").append(direction);
}
if (thresholdDistance >= 0) {
s.append(String.format(" threshold=%d", thresholdDistance));
}

View File

@ -638,6 +638,21 @@ public class LocaleMatcherTest extends TestFmwk {
assertEquals("region demotion", ULocale.FRENCH, regionDemotion.getBestMatch(desired));
}
/**
 * Checks that the direction option changes which supported locale wins:
 * the default setting accepts the one-way arz->ar match, while
 * ONLY_TWO_WAY is expected to pick nn for nb instead.
 */
@Test
public void testDirection() {
    LocaleMatcher.Builder builder =
            LocaleMatcher.builder().setSupportedLocales("ar, nn");
    List<ULocale> desiredLocales =
            Arrays.asList(new ULocale("arz-EG"), new ULocale("nb-DK"));

    // Default direction: Egyptian Arabic (arz) is a close one-way match
    // to Arabic (ar), and the region matches.
    LocaleMatcher withOneWay = builder.build();
    assertEquals("with one-way", "ar", withOneWay.getBestMatch(desiredLocales).toString());

    // Two-way only: Norwegian Bokmal (nb) is a less close two-way match
    // to Nynorsk (nn), and the regions differ.
    builder.setDirection(LocaleMatcher.Direction.ONLY_TWO_WAY);
    LocaleMatcher onlyTwoWay = builder.build();
    assertEquals("only two-way", "nn", onlyTwoWay.getBestMatch(desiredLocales).toString());
}
@Test
public void testCanonicalize() {
LocaleMatcher matcher = LocaleMatcher.builder().build();