a9aa206b7b
This includes byte array, string, char, unicode, locale, collation and regular expressions. Change-Id: I8b125fa52c8c513eb57a0f1298b91910e5a0d786 Reviewed-by: Volker Hilsheimer <volker.hilsheimer@qt.io>
695 lines
28 KiB
Python
Executable File
695 lines
28 KiB
Python
Executable File
#!/usr/bin/env python2
|
|
#############################################################################
|
|
##
|
|
## Copyright (C) 2017 The Qt Company Ltd.
|
|
## Contact: https://www.qt.io/licensing/
|
|
##
|
|
## This file is part of the test suite of the Qt Toolkit.
|
|
##
|
|
## $QT_BEGIN_LICENSE:GPL-EXCEPT$
|
|
## Commercial License Usage
|
|
## Licensees holding valid commercial Qt licenses may use this file in
|
|
## accordance with the commercial license agreement provided with the
|
|
## Software or, alternatively, in accordance with the terms contained in
|
|
## a written agreement between you and The Qt Company. For licensing terms
|
|
## and conditions see https://www.qt.io/terms-conditions. For further
|
|
## information use the contact form at https://www.qt.io/contact-us.
|
|
##
|
|
## GNU General Public License Usage
|
|
## Alternatively, this file may be used under the terms of the GNU
|
|
## General Public License version 3 as published by the Free Software
|
|
## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
|
|
## included in the packaging of this file. Please review the following
|
|
## information to ensure the GNU General Public License requirements will
|
|
## be met: https://www.gnu.org/licenses/gpl-3.0.html.
|
|
##
|
|
## $QT_END_LICENSE$
|
|
##
|
|
#############################################################################
|
|
"""Convert CLDR data to qLocaleXML
|
|
|
|
The CLDR data can be downloaded from CLDR_, which has a sub-directory
|
|
for each version; you need the ``core.zip`` file for your version of
|
|
choice (typically the latest). This script has had updates to cope up
|
|
to v35; for later versions, we may need adaptations. Unpack the
|
|
downloaded ``core.zip`` and check it has a common/main/ sub-directory:
|
|
pass the path of that sub-directory to this script as its single
|
|
command-line argument. Save its standard output (but not error) to a
|
|
file for later processing by ``./qlocalexml2cpp.py``
|
|
|
|
When you update the CLDR data, be sure to also update
|
|
src/corelib/text/qt_attribution.json's entry for unicode-cldr. Check
|
|
this script's output for unknown language, country or script messages;
|
|
if any can be resolved, use their entry in common/main/en.xml to
|
|
append new entries to enumdata.py's lists and update documentation in
|
|
src/corelib/text/qlocale.qdoc, adding the new entries in alphabetic
|
|
order.
|
|
|
|
While updating the locale data, check also for updates to MS-Win's
|
|
time zone names; see cldr2qtimezone.py for details.
|
|
|
|
.. _CLDR: ftp://unicode.org/Public/cldr/
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import re
|
|
import textwrap
|
|
|
|
import enumdata
|
|
import xpathlite
|
|
from xpathlite import DraftResolution, findAlias, findEntry, findTagsInFile
|
|
from dateconverter import convert_date
|
|
from localexml import Locale
|
|
|
|
# Module-level short name for xpathlite's _findEntryInFile; callers
# below pass it an attribute= keyword and index the result with [0].
findEntryInFile = xpathlite._findEntryInFile
|
|
def wrappedwarn(prefix, tokens):
    """Write a wrapped warning to standard error.

    The message is prefix followed by the comma-joined tokens, wrapped
    to 80 columns with continuation lines indented by one space.
    Returns whatever sys.stderr.write() returns.
    """
    message = prefix + ', '.join(tokens)
    lines = textwrap.wrap(message, width=80, subsequent_indent=' ')
    return sys.stderr.write('\n'.join(lines) + '\n')
|
|
|
|
def parse_number_format(patterns, data):
    """Convert CLDR currency number patterns to Qt-style templates.

    This is a very limited parsing of the number format, for currency
    only.  patterns is a ';'-separated string of patterns; data must
    have 'minus' and 'plus' entries giving the signs to substitute.
    Returns a list with one converted string per pattern, in which %1
    stands for the number and %2 for the currency symbol.
    """
    converted = []
    for pattern in patterns.split(';'):
        # Treat 0 like #, drop the grouping and decimal separators,
        # then collapse each run of # to a single one:
        digits = pattern.replace('0', '#').replace(',', '').replace('.', '')
        text = re.sub('#+', '#', digits)
        text = text.replace('#', "%1")
        # according to http://www.unicode.org/reports/tr35/#Number_Format_Patterns
        # there can be doubled or tripled currency sign, however none of the
        # locales use that.
        text = text.replace(u'\xa4', "%2")
        # A doubled quote is a literal quote; single quotes just delimit:
        text = text.replace("''", "###").replace("'", '').replace("###", "'")
        text = text.replace('-', data['minus'])
        text = text.replace('+', data['plus'])
        converted.append(text)
    return converted
|
|
|
|
def raiseUnknownCode(code, form, cache={}):
    """Report an unknown code, suggesting a name for it when possible.

    A language, script or country code is unknown when enumdata.py
    doesn't know it; if main/en.xml maps the code to a name, though,
    support for it could be added, so the name is included in the
    error message.

    Parameters: code is the unknown code; form is 'language', 'script'
    or 'country', selecting which kind of code it is.  Pass nothing
    else: the third parameter is the cache that ensures main/en.xml is
    only read once, so supplying it would deprive you of that cache.

    Always raises xpathlite.Error.

    Depends on the global cldr_dir having been set before the call;
    see the tail of this file.
    """
    if not cache:
        en_xml = os.path.join(cldr_dir, 'en.xml')
        cache.update(xpathlite.codeMapsFromFile(en_xml))

    parts = ['unknown %s code "%s"' % (form, code)]
    suggestion = cache[form].get(code)
    if suggestion:
        parts.append(' - could use "%s"' % suggestion)
    raise xpathlite.Error(''.join(parts))
|
|
|
|
def parse_list_pattern_part_format(pattern):
    """Convert a list pattern part's {0}, {1}, {2} to %1, %2, %3.

    This is a very limited parsing of the format, for list pattern
    parts only.
    """
    for index in range(3):
        pattern = pattern.replace("{%d}" % index, "%%%d" % (index + 1))
    return pattern
|
|
|
|
def unit_quantifiers(find, path, stem, suffix, known,
                     # Stop at exa/exbi: 16 exbi = 2^{64} < zetta =
                     # 1000^7 < zebi = 2^{70}, the next quantifiers up:
                     si_quantifiers = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa')):
    """Generate the quantified byte-unit names.

    The CLDR data only go up to terabytes but we want everything up to
    exabytes.  Where a localized form starts with the SI quantifier's
    initial, the remainder is taken to be the localized 'B' (French
    has 'octet' for 'byte' and uses ko, Mo, Go, To; from those we can
    extrapolate Po and Eo).

    Call this first with suffix = 'B', for the SI quantifiers, and
    then with suffix = 'iB', for the IEC ones.  The list known must be
    empty before the first call, which records in it what it learned
    about the localized unit; the second call consumes that.

    find is called as find(path, key) in the first case and as
    find(path, key, fallback) in the second; stem is a %-template into
    which the quantifier is substituted to make the key.
    """
    if suffix != 'B': # second call, re-using the first call's known
        assert suffix == 'iB'
        if known:
            last = known.pop()
            if all(last == seen for seen in known):
                suffix = 'i' + last
        for quantifier in si_quantifiers:
            # Those don't (yet, v31) exist in CLDR, so we always fall
            # back to the capital initial plus suffix:
            yield find(path, stem % quantifier[:2], quantifier[0].upper() + suffix)
        return

    # First call, known = []
    unit = suffix
    for quantifier in si_quantifiers:
        found = find(path, stem % quantifier)
        # kB for kilobyte, in contrast with KiB for IEC:
        initial = quantifier[0] if quantifier == 'kilo' else quantifier[0].upper()
        if not found:
            found = initial + unit
        elif found.startswith(initial):
            remainder = found[1:]
            unit = remainder if all(remainder == seen for seen in known) else suffix
            known.append(remainder)
        yield found
|
|
|
|
def generateLocaleInfo(path):
    """Generate locale info for the CLDR file at path.

    Returns {} unless path ends in ".xml".  Raises xpathlite.Error if
    findAlias() reports an alias for the file.  Otherwise, reads the
    language, script, territory and variant codes from the file's
    identity element and returns what _generateLocaleInfo() makes of
    them.
    """
    if not path.endswith(".xml"):
        return {}

    # skip legacy/compatibility ones
    alias = findAlias(path)
    if alias:
        raise xpathlite.Error('alias to "%s"' % alias)

    codes = [findEntryInFile(path, 'identity/' + tag, attribute="type")[0]
             for tag in ('language', 'script', 'territory', 'variant')]
    return _generateLocaleInfo(path, *codes)
|
|
|
|
def getNumberSystems(cache={}):
    """Cached look-up of number system information.

    Pass no arguments (the parameter is the cache).  Returns a mapping
    from number system names to the attributes of each system, as a
    mapping; the sample entry has keys u'digits', u'type' and u'id'.

    Relies on the global cldr_dir being set before the first call.
    """
    if cache:
        return cache

    source = os.path.join(cldr_dir, '..', 'supplemental', 'numberingSystems.xml')
    for ns in findTagsInFile(source, 'numberingSystems'):
        # ns has form: [u'numberingSystem', [(u'digits', u'0123456789'), (u'type', u'numeric'), (u'id', u'latn')]]
        system = dict(ns[1])
        ident = system[u'id']
        zero = system[u'digits'][0] if u'digits' in system else None
        if zero is not None and ord(zero) > 0xffff:
            # FIXME, QTBUG-69324: make this redundant:
            # omit number system if zero doesn't fit in single-char16 UTF-16 :-(
            sys.stderr.write('skipping number system "%s" [can\'t represent its zero, U+%X]\n'
                             % (ident, ord(zero)))
            continue
        cache[ident] = system
    return cache
|
|
|
|
def _generateLocaleInfo(path, language_code, script_code, country_code, variant_code=""):
    """Gather the data for one locale from its CLDR file.

    path is the CLDR file to read; the remaining parameters are the
    codes identifying the locale.

    Returns {} if path does not end in ".xml", if language_code is
    'root' or if country_code is empty.  Otherwise returns a Locale
    built from the dict of fields collected below.

    Raises xpathlite.Error if variant_code is non-empty; unknown
    language, script or country codes are passed to raiseUnknownCode(),
    which raises the same.  Calls to findEntry() that are not wrapped
    in a try block let its xpathlite.Error propagate to the caller
    (presumably raised when the entry is missing - verify against
    xpathlite).
    """
    if not path.endswith(".xml"):
        return {}

    if language_code == 'root':
        # just skip it
        return {}

    # we do not support variants
    # ### actually there is only one locale with variant: en_US_POSIX
    # does anybody care about it at all?
    if variant_code:
        raise xpathlite.Error('we do not support variants ("%s")' % variant_code)

    language_id = enumdata.languageCodeToId(language_code)
    if language_id <= 0:
        raiseUnknownCode(language_code, 'language')

    # Note: only -1 is rejected for scripts, where language and country
    # reject any id <= 0.
    script_id = enumdata.scriptCodeToId(script_code)
    if script_id == -1:
        raiseUnknownCode(script_code, 'script')

    # we should handle fully qualified names with the territory
    if not country_code:
        return {}
    country_id = enumdata.countryCodeToId(country_code)
    if country_id <= 0:
        raiseUnknownCode(country_code, 'country')

    # So we say we accept only those values that have "contributed" or
    # "approved" resolution. see http://www.unicode.org/cldr/process.html
    # But we only respect the resolution for new data for backward
    # compatibility.
    draft = DraftResolution.contributed

    result = dict(
        language=enumdata.language_list[language_id][0],
        language_code=language_code, language_id=language_id,
        script=enumdata.script_list[script_id][0],
        script_code=script_code, script_id=script_id,
        country=enumdata.country_list[country_id][0],
        country_code=country_code, country_id=country_id,
        variant_code=variant_code)

    (dir_name, file_name) = os.path.split(path)
    # Look up tag in supplementalData.xml, found relative to path's directory:
    def from_supplement(tag,
                        path=os.path.join(dir_name, '..', 'supplemental',
                                          'supplementalData.xml')):
        return findTagsInFile(path, tag)
    currencies = from_supplement('currencyData/region[iso3166=%s]' % country_code)
    # Defaults, used when no suitable currency is found:
    result['currencyIsoCode'] = ''
    result['currencyDigits'] = 2
    result['currencyRounding'] = 1
    if currencies:
        for e in currencies:
            if e[0] == 'currency':
                # Skip entries marked tender="false" and those with a
                # 'to' attribute; use the first entry that remains.
                t = [x[1] == 'false' for x in e[1] if x[0] == 'tender']
                if t and t[0]:
                    pass
                elif not any(x[0] == 'to' for x in e[1]):
                    result['currencyIsoCode'] = (x[1] for x in e[1] if x[0] == 'iso4217').next()
                    break
        if result['currencyIsoCode']:
            t = from_supplement("currencyData/fractions/info[iso4217=%s]"
                                % result['currencyIsoCode'])
            if t and t[0][0] == 'info':
                result['currencyDigits'] = (int(x[1]) for x in t[0][1] if x[0] == 'digits').next()
                result['currencyRounding'] = (int(x[1]) for x in t[0][1] if x[0] == 'rounding').next()
    # Stays None if the file has no defaultNumberingSystem entry:
    numbering_system = None
    try:
        numbering_system = findEntry(path, "numbers/defaultNumberingSystem")
    except xpathlite.Error:
        pass
    # findEntry(), but returning value instead of raising xpathlite.Error:
    def findEntryDef(path, xpath, value=''):
        try:
            return findEntry(path, xpath)
        except xpathlite.Error:
            return value
    # Look up xpath, preferring the variant for numbering_system (if
    # any) over the unqualified entry:
    def get_number_in_system(path, xpath, numbering_system):
        if numbering_system:
            try:
                return findEntry(path, xpath + "[numberSystem=" + numbering_system + "]")
            except xpathlite.Error:
                # in CLDR 1.9 number system was refactored for numbers (but not for currency)
                # so if previous findEntry doesn't work we should try this:
                try:
                    return findEntry(path, xpath.replace("/symbols/", "/symbols[numberSystem=" + numbering_system + "]/"))
                except xpathlite.Error:
                    # fallback to default
                    pass
        return findEntry(path, xpath)

    result['decimal'] = get_number_in_system(path, "numbers/symbols/decimal", numbering_system)
    result['group'] = get_number_in_system(path, "numbers/symbols/group", numbering_system)
    result['list'] = get_number_in_system(path, "numbers/symbols/list", numbering_system)
    result['percent'] = get_number_in_system(path, "numbers/symbols/percentSign", numbering_system)
    # Prefer the first digit of the numbering system's digits; on any
    # failure (e.g. the system is absent from getNumberSystems()), warn
    # and fall back to the file's nativeZeroDigit entry:
    try:
        result['zero'] = getNumberSystems()[numbering_system][u"digits"][0]
    except Exception as e:
        sys.stderr.write("Native zero detection problem: %s\n" % repr(e))
        result['zero'] = get_number_in_system(path, "numbers/symbols/nativeZeroDigit", numbering_system)
    result['minus'] = get_number_in_system(path, "numbers/symbols/minusSign", numbering_system)
    result['plus'] = get_number_in_system(path, "numbers/symbols/plusSign", numbering_system)
    result['exp'] = get_number_in_system(path, "numbers/symbols/exponential", numbering_system).lower()
    result['quotationStart'] = findEntry(path, "delimiters/quotationStart")
    result['quotationEnd'] = findEntry(path, "delimiters/quotationEnd")
    result['alternateQuotationStart'] = findEntry(path, "delimiters/alternateQuotationStart")
    result['alternateQuotationEnd'] = findEntry(path, "delimiters/alternateQuotationEnd")
    result['listPatternPartStart'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[start]"))
    result['listPatternPartMiddle'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[middle]"))
    result['listPatternPartEnd'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[end]"))
    result['listPatternPartTwo'] = parse_list_pattern_part_format(findEntry(path, "listPatterns/listPattern/listPatternPart[2]"))
    # Only the am and pm look-ups pass the draft resolution:
    result['am'] = findEntry(path, "dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[am]", draft)
    result['pm'] = findEntry(path, "dates/calendars/calendar[gregorian]/dayPeriods/dayPeriodContext[format]/dayPeriodWidth[wide]/dayPeriod[pm]", draft)
    result['longDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[full]/dateFormat/pattern"))
    result['shortDateFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/dateFormats/dateFormatLength[short]/dateFormat/pattern"))
    result['longTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[full]/timeFormat/pattern"))
    result['shortTimeFormat'] = convert_date(findEntry(path, "dates/calendars/calendar[gregorian]/timeFormats/timeFormatLength[short]/timeFormat/pattern"))

    # Take the language's name from the most specific entry available:
    # language_script_country, language_script, language_country, then
    # plain language.
    endonym = None
    if country_code and script_code:
        endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s_%s]" % (language_code, script_code, country_code))
    if not endonym and script_code:
        endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, script_code))
    if not endonym and country_code:
        endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s_%s]" % (language_code, country_code))
    if not endonym:
        endonym = findEntryDef(path, "localeDisplayNames/languages/language[type=%s]" % (language_code))
    result['language_endonym'] = endonym
    result['country_endonym'] = findEntryDef(path, "localeDisplayNames/territories/territory[type=%s]" % (country_code))

    # The pattern may have a second, ';'-separated, part, which is used
    # as the negative format:
    currency_format = get_number_in_system(path, "numbers/currencyFormats/currencyFormatLength/currencyFormat/pattern", numbering_system)
    currency_format = parse_number_format(currency_format, result)
    result['currencyFormat'] = currency_format[0]
    result['currencyNegativeFormat'] = ''
    if len(currency_format) > 1:
        result['currencyNegativeFormat'] = currency_format[1]

    result['currencySymbol'] = ''
    result['currencyDisplayName'] = ''
    if result['currencyIsoCode']:
        result['currencySymbol'] = findEntryDef(path, "numbers/currencies/currency[%s]/symbol" % result['currencyIsoCode'])
        # Seven ';'-terminated fields: the plain display name, then one
        # per count category (empty where the file has no entry):
        result['currencyDisplayName'] = ';'.join(
            findEntryDef(path, 'numbers/currencies/currency[' + result['currencyIsoCode']
                         + ']/displayName' + tail)
            for tail in ['',] + [
                '[count=%s]' % x for x in ('zero', 'one', 'two', 'few', 'many', 'other')
                ]) + ';'

    def findUnitDef(path, stem, fallback=''):
        # The displayName for a quantified unit in en.xml is kByte
        # instead of kB (etc.), so prefer any unitPattern provided:
        for count in ('many', 'few', 'two', 'other', 'zero', 'one'):
            try:
                ans = findEntry(path, stem + 'unitPattern[count=%s]' % count)
            except xpathlite.Error:
                continue

            # TODO: exploit count-handling, instead of discarding placeholders
            if ans.startswith('{0}'):
                ans = ans[3:].lstrip()
            if ans:
                return ans

        return findEntryDef(path, stem + 'displayName', fallback)

    # First without quantifier, then quantified each way:
    result['byte_unit'] = findEntryDef(
        path, 'units/unitLength[type=long]/unit[type=digital-byte]/displayName',
        'bytes')
    stem = 'units/unitLength[type=short]/unit[type=digital-%sbyte]/'
    known = [] # cases where we *do* have a given version:
    result['byte_si_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem, 'B', known))
    # IEC 60027-2
    # http://physics.nist.gov/cuu/Units/binary.html
    result['byte_iec_quantified'] = ';'.join(unit_quantifiers(findUnitDef, path, stem % '%sbi', 'iB', known))

    # Used for month and day data:
    # Each entry is (result key prefix, CLDR context, CLDR width).
    namings = (
        ('standaloneLong', 'stand-alone', 'wide'),
        ('standaloneShort', 'stand-alone', 'abbreviated'),
        ('standaloneNarrow', 'stand-alone', 'narrow'),
        ('long', 'format', 'wide'),
        ('short', 'format', 'abbreviated'),
        ('narrow', 'format', 'narrow'),
        )

    # Month data:
    for cal in ('gregorian',): # We shall want to add to this
        stem = 'dates/calendars/calendar[' + cal + ']/months/'
        for (key, mode, size) in namings:
            prop = 'monthContext[' + mode + ']/monthWidth[' + size + ']/'
            # Twelve ';'-terminated names, months 1 through 12:
            result[key + 'Months'] = ';'.join(
                findEntry(path, stem + prop + "month[%d]" % i)
                for i in range(1, 13)) + ';'

    # Day data (for Gregorian, at least):
    stem = 'dates/calendars/calendar[gregorian]/days/'
    days = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')
    for (key, mode, size) in namings:
        prop = 'dayContext[' + mode + ']/dayWidth[' + size + ']/day'
        # Seven ';'-terminated names, starting from Sunday:
        result[key + 'Days'] = ';'.join(
            findEntry(path, stem + prop + '[' + day + ']')
            for day in days) + ';'

    return Locale(result)
|
|
|
|
def addEscapes(s):
    """Return s with each non-ASCII character replaced by an escape.

    Characters whose code is below 128 are kept as they are; any
    other is replaced by a backslash, an x and the code in lower-case
    hexadecimal, zero-padded to at least two digits.
    """
    return ''.join(c if ord(c) < 128 else "\\x%02x" % ord(c) for c in s)
|
|
|
|
def unicodeStr(s):
    """Return s as <size> and <data> elements.

    The size is the length of s encoded as UTF-8; the data is that
    encoding with its non-ASCII content escaped by addEscapes().
    """
    encoded = s.encode('utf-8')
    return "<size>%d</size><data>%s</data>" % (len(encoded), addEscapes(encoded))
|
|
|
|
def usage():
    """Report how to invoke this script and exit with failure status.

    The message goes to standard error, not standard output: this
    script's standard output is meant to be saved as the generated
    XML, so must not be polluted by diagnostics.  Exits with status 1
    so that callers can tell the run failed.
    """
    sys.stderr.write("Usage: cldr2qlocalexml.py <path-to-cldr-main>\n")
    sys.exit(1)
|
|
|
|
def integrateWeekData(filePath):
    """Set each known locale's week data from the file at filePath.

    Returns {} without doing anything unless filePath ends in ".xml".
    Otherwise, reads the weekData firstDay, weekendStart and
    weekendEnd entries for each day of the week and sets the
    firstDayOfWeek, weekendStart and weekendEnd attributes of every
    locale in the global locale_database, using the entry for the
    locale's country_code or, when it has none, the one for "001".
    """
    if not filePath.endswith(".xml"):
        return {}

    week_days = ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')

    def by_country(tag):
        # Map each territory listed for tag on some day to that day;
        # if listed for several days, the last of them wins.
        mapping = {}
        for day in week_days:
            found = findEntryInFile(filePath, 'weekData/%s[day=%s]' % (tag, day),
                                    attribute='territories')
            for territory in found[0].split():
                mapping[territory] = day
        return mapping

    first_day = by_country('firstDay')
    weekend_start = by_country('weekendStart')
    weekend_end = by_country('weekendEnd')

    for locale in locale_database.values():
        code = locale.country_code
        locale.firstDayOfWeek = first_day[code] if code in first_day else first_day["001"]
        locale.weekendStart = weekend_start[code] if code in weekend_start else weekend_start["001"]
        locale.weekendEnd = weekend_end[code] if code in weekend_end else weekend_end["001"]
|
|
|
|
def splitLocale(name):
    """Split name into (language, script, territory) triple as generator.

    Ignores any trailing fields (with a warning, written to standard
    error), leaves script (a capitalised four-letter token) or
    territory (either a number or an all-uppercase token) empty if
    unspecified, returns a single-entry generator if name is a single
    tag (i.e. contains no underscores). Always yields 1 or 3 values,
    never 2.

    The generator ends by returning, never by letting StopIteration
    escape from a call inside it: PEP 479 turns such an escape into a
    RuntimeError.  The next() built-in is used instead of the
    iterator's .next() method, which only exists in Python 2."""
    tags = iter(name.split('_'))
    yield next(tags) # Language; split() always supplies at least one tag

    tag = next(tags, None)
    if tag is None:
        return # Single-tag name: just the language

    # Script is always four letters, always capitalised:
    if len(tag) == 4 and tag[0].isupper() and tag[1:].islower():
        yield tag
        tag = next(tags, '')
    else:
        yield ''

    # Territory is upper-case or numeric:
    if tag and (tag.isupper() or tag.isdigit()):
        yield tag
        tag = ''
    else:
        yield ''

    # Whatever is left - an unrecognised tag or any that follow - is ignored:
    cruft = ([tag] if tag else []) + list(tags)
    if cruft:
        sys.stderr.write('Ignoring unparsed cruft %s in %s\n' % ('_'.join(cruft), name))
|
|
|
|
# Exactly one command-line argument is expected, naming a directory;
# usage() exits the script when that is not what we were given.
if len(sys.argv) != 2 or not os.path.isdir(sys.argv[1]):
    usage()

cldr_dir = sys.argv[1]
cldr_files = os.listdir(cldr_dir)

# Maps (language_id, script_id, country_id, variant_code) to its locale:
locale_database = dict()
|
|
|
|
# Collect the names in the locales attributes of the metadata/defaultContent
# entries of supplementalMetadata.xml;
# see http://www.unicode.org/reports/tr35/tr35-info.html#Default_Content
defaultContent_locales = [
    name
    for ns in findTagsInFile(os.path.join(cldr_dir, '..', 'supplemental',
                                          'supplementalMetadata.xml'),
                             'metadata/defaultContent')
    for data in ns[1:][0] if data[0] == u"locales"
    for name in data[1].split()]

# Generate locale info for each default-content locale, from the file
# whose name is the locale's name with .xml appended:
skips = []
for locale_name in defaultContent_locales:
    try:
        language_code, script_code, country_code = splitLocale(locale_name)
    except ValueError:
        # splitLocale() did not yield exactly three values
        sys.stderr.write('skipping defaultContent locale "%s" [neither two nor three tags]\n'
                         % locale_name)
        continue

    if not script_code and not country_code:
        sys.stderr.write('skipping defaultContent locale "%s" [second tag is neither script nor territory]\n'
                         % locale_name)
        continue

    try:
        info = _generateLocaleInfo(cldr_dir + "/" + locale_name + ".xml",
                                   language_code, script_code, country_code)
    except xpathlite.Error as e:
        sys.stderr.write('skipping defaultContent locale "%s" (%s)\n' % (locale_name, str(e)))
        continue

    if not info:
        skips.append(locale_name)
        continue

    locale_database[(info.language_id, info.script_id, info.country_id, info.variant_code)] = info

if skips:
    wrappedwarn('skipping defaultContent locales [no locale info generated]: ', skips)
# Start afresh for the next round of skips:
skips = []
|
|
|
|
# Generate locale info from each file in the CLDR directory; entries
# here replace any with the same key that are already in the database.
for file_name in cldr_files:
    try:
        info = generateLocaleInfo(cldr_dir + "/" + file_name)
    except xpathlite.Error as e:
        sys.stderr.write('skipping file "%s" (%s)\n' % (file_name, str(e)))
        continue

    if not info:
        skips.append(file_name)
        continue

    locale_database[(info.language_id, info.script_id, info.country_id, info.variant_code)] = info

if skips:
    wrappedwarn('skipping files [no locale info generated]: ', skips)
|
|
|
|
integrateWeekData(cldr_dir+"/../supplemental/supplementalData.xml")
# sorted() gives a list whatever dict.keys() returns:
locale_keys = sorted(locale_database.keys())

# The CLDR version is the quoted value on the ldml.dtd line that
# declares cldrVersion as #FIXED; if several lines match, the last
# one wins.  The with statement ensures the file is closed.
cldr_version = 'unknown'
with open(cldr_dir+"/../dtd/ldml.dtd", "r") as ldml:
    for line in ldml:
        if 'version cldrVersion CDATA #FIXED' in line:
            cldr_version = line.split('"')[1]
|
|
|
|
# Open the document, then list each enumdata table - language, script
# and country - with the name, id and code of each of its entries.
print("<localeDatabase>")
print(" <version>" + cldr_version + "</version>")
for tag, table in (('language', enumdata.language_list),
                   ('script', enumdata.script_list),
                   ('country', enumdata.country_list)):
    print(" <" + tag + "List>")
    for ident in table:
        entry = table[ident]
        print(" <" + tag + ">")
        print(" <name>" + entry[0] + "</name>")
        print(" <id>" + str(ident) + "</id>")
        print(" <code>" + entry[1] + "</code>")
        print(" </" + tag + ">")
    print(" </" + tag + "List>")
|
|
|
|
def _parseLocale(l):
    """Convert a locale name to a (language, script, country) name triple.

    The name l is split by splitLocale().  Each code it supplies is
    mapped to its enumdata name; a missing script or country, or the
    language code "und", is mapped to the matching one of
    "AnyLanguage", "AnyScript" and "AnyCountry".

    Raises xpathlite.Error if l is exactly "und" or if any code
    present is unknown to enumdata.
    """
    if l == "und":
        raise xpathlite.Error("we are treating unknown locale like C")

    # splitLocale() yields either just the language or all three codes:
    codes = list(splitLocale(l))
    language_code = codes[0]
    if len(codes) == 3:
        script_code, country_code = codes[1:]
    else:
        script_code = country_code = ''

    language, script, country = "AnyLanguage", "AnyScript", "AnyCountry"

    if language_code != "und":
        language_id = enumdata.languageCodeToId(language_code)
        if language_id == -1:
            raise xpathlite.Error('unknown language code "%s"' % language_code)
        language = enumdata.language_list[language_id][0]

    if script_code:
        script_id = enumdata.scriptCodeToId(script_code)
        if script_id == -1:
            raise xpathlite.Error('unknown script code "%s"' % script_code)
        script = enumdata.script_list[script_id][0]

    if country_code:
        country_id = enumdata.countryCodeToId(country_code)
        if country_id == -1:
            raise xpathlite.Error('unknown country code "%s"' % country_code)
        country = enumdata.country_list[country_id][0]

    return (language, script, country)
|
|
|
|
# Emit the likely sub-tag mappings, then the locale data themselves,
# and close the document.
skips = []
print(" <likelySubtags>")
for ns in findTagsInFile(cldr_dir + "/../supplemental/likelySubtags.xml", "likelySubtags"):
    # ns looks like this: [u'likelySubtag', [(u'from', u'aa'), (u'to', u'aa_Latn_ET')]]
    tmp = {}
    for data in ns[1:][0]:
        tmp[data[0]] = data[1]

    try:
        from_language, from_script, from_country = _parseLocale(tmp[u"from"])
        to_language, to_script, to_country = _parseLocale(tmp[u"to"])
    except xpathlite.Error as e:
        # Unknown languages whose target merely extends the source are
        # collected for one summary warning; anything else is reported
        # individually.
        if (str(e) == 'unknown language code "%s"' % tmp[u'from']
            and tmp[u'to'].startswith(tmp[u'from'])):
            skips.append(tmp[u'to'])
        else:
            sys.stderr.write('skipping likelySubtag "%s" -> "%s" (%s)\n' % (tmp[u"from"], tmp[u"to"], str(e)))
        continue

    # substitute according to http://www.unicode.org/reports/tr35/#Likely_Subtags
    if to_country == "AnyCountry" and from_country != to_country:
        to_country = from_country
    if to_script == "AnyScript" and from_script != to_script:
        to_script = from_script

    print(" <likelySubtag>")
    for side, (language, script, country) in (
            ('from', (from_language, from_script, from_country)),
            ('to', (to_language, to_script, to_country))):
        print(" <" + side + ">")
        print(" <language>" + language + "</language>")
        print(" <script>" + script + "</script>")
        print(" <country>" + country + "</country>")
        print(" </" + side + ">")
    print(" </likelySubtag>")
print(" </likelySubtags>")
if skips:
    wrappedwarn('skipping likelySubtags (for unknown language codes): ', skips)

print(" <localeList>")
# The C locale comes first, then the rest in the order of their keys:
Locale.C().toXml()
for key in locale_keys:
    locale_database[key].toXml()
print(" </localeList>")
print("</localeDatabase>")
|