Implement binary search in QLocale's likely sub-tag lookup

Follow through on a comment from 2012: sort the likely subtag array
(in the CLDR update script) and use bsearch to find entries in it.

This simplifies QLocaleXmlReader.likelyMap() slightly, moving the
detection of last entry to LocaleDataWriter.likelySubtags(), but
requires collecting all likely sub-tag mapping pairs (rather than just
passing them through from read to write via generators) in order to
sort them.

Change-Id: Ieb6875ccde1ddbd475ae68c0766a666ec32b7005
Reviewed-by: Mårten Nordheim <marten.nordheim@qt.io>
This commit is contained in:
Edward Welbourne 2020-10-12 13:12:48 +02:00
parent 246ba8ca61
commit a9e4bf7eef
4 changed files with 503 additions and 467 deletions

View File

@ -201,20 +201,42 @@ QLatin1String QLocalePrivate::countryToCode(QLocale::Country country)
return QLatin1String(reinterpret_cast<const char*>(c), c[2] == 0 ? 2 : 3);
}
// Comparison callback, in the form bsearch() expects, for QLocaleId keys.
//
// Both arguments are read as pointing to a QLocaleId. Entries are ordered by
// language_id, then country_id, then script_id; within each field an id of 0
// sorts after every non-zero id.
//
// This ordering must match the key LocaleDataWriter.likelySubtags() sorts by
// when it generates the table, see
// qtbase/util/locale_database/qlocalexml2cpp.py
//
// Returns a negative, zero or positive value according as *lhs sorts before,
// the same as or after *rhs.
static int cmpLikelySubtag(const void *lhs, const void *rhs)
{
    const auto rank = [](int id) {
        // The ids are passed as ushort values, so this is bigger than any of
        // them; mapping 0 to it makes 0 sort last.
        const int afterAll = 0x10000;
        return id ? id : afterAll;
    };
    const auto *const left = static_cast<const QLocaleId *>(lhs);
    const auto *const right = static_cast<const QLocaleId *>(rhs);

    // Later fields only get a say while all earlier ones tie.
    int order = rank(left->language_id) - rank(right->language_id);
    if (order == 0)
        order = rank(left->country_id) - rank(right->country_id);
    if (order == 0)
        order = rank(left->script_id) - rank(right->script_id);
    return order;
}
// http://www.unicode.org/reports/tr35/#Likely_Subtags
//
// Looks localeId up among the keys of likely_subtags and, on a match,
// overwrites it with the entry that key maps to.
//
// Returns true if a match was found (localeId has been replaced), false
// otherwise (localeId is left unchanged).
//
// The lookup is a binary search using cmpLikelySubtag(), so the keys of
// likely_subtags must be sorted in the order that function defines.
static bool addLikelySubtags(QLocaleId &localeId)
{
    // Array is overtly of QLocaleId but to be interpreted as of pairs, mapping
    // each even entry to the following odd entry. So search only the even
    // entries for a match and return the matching odd entry, if found.
    static_assert(std::size(likely_subtags) % 2 == 0);
    // Treat each pair as one element of size 2 * sizeof(QLocaleId); the
    // comparison only ever reads the first QLocaleId of the pair, i.e. the key.
    const auto *p = static_cast<const QLocaleId *>(
        bsearch(&localeId,
                likely_subtags, std::size(likely_subtags) / 2, 2 * sizeof(QLocaleId),
                cmpLikelySubtag));
    if (!p)
        return false;
    // A hit must be a key (even index) inside the table:
    Q_ASSERT(p >= likely_subtags && p < likely_subtags + std::size(likely_subtags));
    Q_ASSERT((p - likely_subtags) % 2 == 0);
    localeId = p[1];
    return true;
}
QLocaleId QLocaleId::withLikelySubtagsAdded() const
{

File diff suppressed because it is too large Load Diff

View File

@ -183,12 +183,11 @@ class QLocaleXmlReader (object):
def ids(t):
return tuple(x[0] for x in t)
for i, pair in enumerate(self.__likely, 1):
for pair in self.__likely:
have = self.__fromNames(pair[0])
give = self.__fromNames(pair[1])
yield ('_'.join(tag(have)), ids(have),
'_'.join(tag(give)), ids(give),
i == len(self.__likely))
'_'.join(tag(give)), ids(give))
def defaultMap(self):
"""Map language and script to their default country by ID.

View File

@ -163,11 +163,26 @@ class LocaleSourceEditor (SourceFileEditor):
class LocaleDataWriter (LocaleSourceEditor):
def likelySubtags(self, likely):
    """Write the likely_subtags array, sorted for binary search.

    Parameter likely is an iterable (a generator is fine) of
    4-tuples (had, have, got, give). Each of have and give is a
    triple of numeric ids, written out as a QLocaleId initializer;
    had and got are only used in the trailing comment on each
    line. Every entry produces one line mapping have to give; all
    but the last are followed by a comma."""
    # First sort likely, so that we can use binary search in C++
    # code. Although the entries are (lang, script, region), sort
    # as (lang, region, script) and sort 0 after all non-zero
    # values. This ensures that, when several mappings partially
    # match a requested locale, the one we should prefer to use
    # appears first.
    huge = 0x10000 # > any ushort; all tag values are ushort
    def keyLikely(entry):
        have = entry[1] # Numeric id triple
        return have[0] or huge, have[2] or huge, have[1] or huge # language, region, script
    # Collect the entries in a list: we need them all to sort
    # them, and need to know how many there are to spot the last.
    likely = sorted(likely, key=keyLikely)

    self.writer.write('static const QLocaleId likely_subtags[] = {\n')
    for i, (had, have, got, give) in enumerate(likely, 1):
        self.writer.write(' {{ {:3d}, {:3d}, {:3d} }}'.format(*have))
        self.writer.write(', {{ {:3d}, {:3d}, {:3d} }}'.format(*give))
        # No comma after the final entry:
        self.writer.write(' ' if i == len(likely) else ',')
        self.writer.write(' // {} -> {}\n'.format(had, got))
    self.writer.write('};\n\n')