qt5base-lts/util/locale_database/qlocalexml2cpp.py

581 lines
22 KiB
Python
Raw Normal View History

#!/usr/bin/env python2
#############################################################################
##
## Copyright (C) 2020 The Qt Company Ltd.
## Contact: https://www.qt.io/licensing/
##
## This file is part of the test suite of the Qt Toolkit.
##
## $QT_BEGIN_LICENSE:GPL-EXCEPT$
## Commercial License Usage
## Licensees holding valid commercial Qt licenses may use this file in
## accordance with the commercial license agreement provided with the
## Software or, alternatively, in accordance with the terms contained in
## a written agreement between you and The Qt Company. For licensing terms
## and conditions see https://www.qt.io/terms-conditions. For further
## information use the contact form at https://www.qt.io/contact-us.
##
## GNU General Public License Usage
## Alternatively, this file may be used under the terms of the GNU
## General Public License version 3 as published by the Free Software
## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
## included in the packaging of this file. Please review the following
## information to ensure the GNU General Public License requirements will
## be met: https://www.gnu.org/licenses/gpl-3.0.html.
##
## $QT_END_LICENSE$
##
#############################################################################
"""Script to generate C++ code from CLDR data in qLocaleXML form
See ``cldr2qlocalexml.py`` for how to generate the qLocaleXML data itself.
Pass the output file from that as first parameter to this script; pass
the root of the qtbase check-out as second parameter.
"""
import os
import datetime
from qlocalexml import QLocaleXmlReader
from xml.dom import minidom
from localetools import unicode2hex, wrap_list, Error, Transcriber, SourceFileEditor
def compareLocaleKeys(key1, key2):
    """Three-way comparison of two locale keys, usable as a sort ``cmp``.

    Each key is indexed as (language, script, country); the entries are
    subtracted from one another, so they must be numeric.  The attribute
    ``compareLocaleKeys.default_map`` has to be assigned before the first
    call; it maps {(language, script): country} by ID.

    Ordering: by language first; within a language, the entry whose
    country is the default for its (language, script) pair is placed
    ahead of the other; otherwise by country when the scripts agree,
    else by script.

    Returns a negative number, zero or a positive number.
    """
    if key1 == key2:
        return 0

    # First sort by language:
    language_gap = key1[0] - key2[0]
    if key1[0] != key2[0]:
        return language_gap

    # maps {(language, script): country} by ID
    defaults = compareLocaleKeys.default_map

    first_pair = key1[:2]
    if first_pair in defaults:
        preferred = defaults[first_pair]
        if key1[2] == preferred:
            return -1
        if key2[2] == preferred:
            return 1

    if key1[1] == key2[1]:
        return key1[2] - key2[2]

    second_pair = key2[:2]
    if second_pair in defaults:
        preferred = defaults[second_pair]
        if key2[2] == preferred:
            return 1
        if key1[2] == preferred:
            return -1

    return key1[1] - key2[1]
class StringDataToken:
    """An (index, length) pair locating one entry inside a StringData table.

    Both numbers must fit in an unsigned 16-bit value; Error is raised
    from the constructor otherwise.
    """

    def __init__(self, index, length):
        fits = index <= 0xFFFF and length <= 0xFFFF
        if not fits:
            raise Error("Position exceeds ushort range: %d,%d " % (index, length))
        self.index, self.length = index, length

    def __str__(self):
        # Rendered as the "index,length" text that gets pasted into the
        # generated tables.
        pair = (self.index, self.length)
        return " %d,%d " % pair
class StringData:
    """A named, de-duplicated table of string data for the generated C++.

    ``data`` accumulates whatever ``unicode2hex()`` produces for each new
    string; ``hash`` remembers the StringDataToken handed out for every
    string already stored, so repeats share one entry.
    """

    def __init__(self, name):
        self.name = name
        self.data = []
        self.hash = {}

    def append(self, s):
        """Store ``s`` (unless already present) and return its token.

        Raises Error when the start offset or the encoded size does not
        fit the uint16 limits checked below.
        """
        try:
            return self.hash[s]
        except KeyError:
            pass

        encoded = unicode2hex(s)
        start = len(self.data)
        if start > 0xffff:
            raise Error('Data index {} is too big for uint16!'.format(start))
        count = len(encoded)
        if count >= 0xffff:
            raise Error('Data is too big ({}) for uint16 size!'.format(count))

        try:
            token = StringDataToken(start, count)
        except Error as e:
            # Say which string was being stored, then let the error go on.
            e.message += '(on data "{}")'.format(s)
            raise

        self.hash[s] = token
        self.data.extend(encoded)
        return token

    def write(self, fd):
        """Emit the table to ``fd`` as a ``static const ushort`` array."""
        opening = "\nstatic const ushort %s[] = {\n" % self.name
        fd.write(opening)
        fd.write(wrap_list(self.data))
        fd.write("\n};\n")
def currencyIsoCodeData(s):
    """Return a C++ brace initializer holding the character codes of ``s``.

    A false value (empty string, None, 0) gives the all-zero
    initializer "{0,0,0}".
    """
    if not s:
        return "{0,0,0}"
    code_points = [str(ord(char)) for char in s]
    return '{' + ",".join(code_points) + '}'
class LocaleSourceEditor (SourceFileEditor):
    """SourceFileEditor that starts its output with a "generated" banner.

    The banner is a C comment recording today's date and the CLDR
    version passed to the constructor.
    """
    __upinit = SourceFileEditor.__init__

    def __init__(self, path, temp, version):
        self.__upinit(path, temp)
        banner = """
/*
This part of the file was generated on {} from the
Common Locale Data Repository v{}
http://www.unicode.org/cldr/
Do not edit this section: instead regenerate it using
cldr2qlocalexml.py and qlocalexml2cpp.py on updated (or
edited) CLDR data; see qtbase/util/locale_database/.
*/
""".format(datetime.date.today(), version)
        self.writer.write(banner)
class LocaleDataWriter (LocaleSourceEditor):
    """Emits the generated C++ locale tables through ``self.writer``.

    Every public method writes one C++ array definition (localeData()
    also writes the string tables its rows refer to).
    """
    def likelySubtags(self, likely):
        """Write the ``likely_subtags`` array of QLocaleId pairs.

        ``likely`` yields 5-tuples (had, have, got, give, last): ``have``
        and ``give`` each unpack to three integers forming one brace
        initializer; ``had`` and ``got`` only appear in the trailing
        comment; a true ``last`` replaces the separating comma with a
        space.
        """
        self.writer.write('static const QLocaleId likely_subtags[] = {\n')
        for had, have, got, give, last in likely:
            self.writer.write(' {{ {:3d}, {:3d}, {:3d} }}'.format(*have))
            self.writer.write(', {{ {:3d}, {:3d}, {:3d} }}'.format(*give))
            self.writer.write(' ' if last else ',')
            self.writer.write(' // {} -> {}\n'.format(had, got))
        self.writer.write('};\n\n')

    def localeIndex(self, indices):
        """Write the ``locale_index`` array, one line per entry of ``indices``.

        Each entry unpacks to an integer followed by a value used only
        in the line's comment; a final 0 terminates the array.
        """
        self.writer.write('static const quint16 locale_index[] = {\n')
        for pair in indices:
            self.writer.write('{:6d}, // {}\n'.format(*pair))
        self.writer.write(' 0 // trailing 0\n')
        self.writer.write('};\n\n')

    def localeData(self, locales, names):
        """Write the ``locale_data`` array and the string tables it indexes.

        ``names`` gives the keys, in output order; each key is indexed
        as key[0], key[1], key[2] for the three leading ID columns and
        looked up in ``locales`` for the object supplying the remaining
        columns.  Textual fields are stored in StringData tables; the
        row holds the token that append() returns for each of them.  The
        order of the append() calls determines the layout of those
        tables, so do not reorder the arguments below.

        May raise Error from StringData.append().
        """
        list_pattern_part_data = StringData('list_pattern_part_data')
        date_format_data = StringData('date_format_data')
        time_format_data = StringData('time_format_data')
        days_data = StringData('days_data')
        am_data = StringData('am_data')
        pm_data = StringData('pm_data')
        byte_unit_data = StringData('byte_unit_data')
        currency_symbol_data = StringData('currency_symbol_data')
        currency_display_name_data = StringData('currency_display_name_data')
        currency_format_data = StringData('currency_format_data')
        endonyms_data = StringData('endonyms_data')

        # Locale data
        self.writer.write('static const QLocaleData locale_data[] = {\n')
        # Table headings: keep each label centred in its field, matching formatLine:
        # NOTE(review): several labels below are narrower than the widths
        # stated in the comments beside them - confirm the padding
        # against formatLine before relying on column alignment.
        self.writer.write(' // '
                          # Width 6 + comma
                          ' lang ' # IDs
                          'script '
                          ' terr '
                          ' dec ' # Numeric punctuation
                          ' group '
                          ' list ' # Delimiter for *numeric* lists
                          ' prcnt ' # Arithmetic symbols
                          ' zero '
                          ' minus '
                          ' plus '
                          ' exp '
                          # Width 8 + comma - to make space for these wide labels !
                          ' quotOpn ' # Quotation marks
                          ' quotEnd '
                          'altQtOpn '
                          'altQtEnd '
                          # Width 11 + comma
                          ' lpStart ' # List pattern
                          ' lpMid '
                          ' lpEnd '
                          ' lpTwo '
                          ' sDtFmt ' # Date format
                          ' lDtFmt '
                          ' sTmFmt ' # Time format
                          ' lTmFmt '
                          ' ssDays ' # Days
                          ' slDays '
                          ' snDays '
                          ' sDays '
                          ' lDays '
                          ' nDays '
                          ' am ' # am/pm indicators
                          ' pm '
                          # Width 8 + comma
                          ' byte '
                          ' siQuant '
                          'iecQuant '
                          # Width 8+4 + comma
                          ' currISO '
                          # Width 11 + comma
                          ' currSym ' # Currency formatting
                          ' currDsply '
                          ' currFmt '
                          ' currFmtNeg '
                          ' endoLang ' # Name of language in itself, and of country
                          ' endoCntry '
                          # Width 6 + comma
                          'curDgt ' # Currency number representation
                          'curRnd '
                          'dow1st ' # First day of week
                          ' wknd+ ' # Week-end start/end days
                          ' wknd-'
                          # No trailing space on last entry (be sure to
                          # pad before adding anything after it).
                          '\n')

        # One row of the table; the placeholders are positional, so the
        # arguments passed below must stay in exactly this order.
        formatLine = ''.join((
            ' {{ ',
            # Locale-identifier
            '{:6d},' * 3,
            # Numeric formats, list delimiter
            '{:6d},' * 8,
            # Quotation marks
            '{:8d},' * 4,
            # List patterns, date/time formats, month/day names, am/pm
            '{:>11s},' * 16,
            # SI/IEC byte-unit abbreviations
            '{:>8s},' * 3,
            # Currency ISO code
            ' {:>10s}, ',
            # Currency and endonyms
            '{:>11s},' * 6,
            # Currency formatting
            '{:6d},{:6d}',
            # Day of week and week-end
            ',{:6d}' * 3,
            ' }}')).format
        for key in names:
            locale = locales[key]
            self.writer.write(formatLine(
                key[0], key[1], key[2],
                locale.decimal,
                locale.group,
                locale.listDelim,
                locale.percent,
                locale.zero,
                locale.minus,
                locale.plus,
                locale.exp,
                locale.quotationStart,
                locale.quotationEnd,
                locale.alternateQuotationStart,
                locale.alternateQuotationEnd,
                list_pattern_part_data.append(locale.listPatternPartStart),
                list_pattern_part_data.append(locale.listPatternPartMiddle),
                list_pattern_part_data.append(locale.listPatternPartEnd),
                list_pattern_part_data.append(locale.listPatternPartTwo),
                date_format_data.append(locale.shortDateFormat),
                date_format_data.append(locale.longDateFormat),
                time_format_data.append(locale.shortTimeFormat),
                time_format_data.append(locale.longTimeFormat),
                days_data.append(locale.standaloneShortDays),
                days_data.append(locale.standaloneLongDays),
                days_data.append(locale.standaloneNarrowDays),
                days_data.append(locale.shortDays),
                days_data.append(locale.longDays),
                days_data.append(locale.narrowDays),
                am_data.append(locale.am),
                pm_data.append(locale.pm),
                byte_unit_data.append(locale.byte_unit),
                byte_unit_data.append(locale.byte_si_quantified),
                byte_unit_data.append(locale.byte_iec_quantified),
                currencyIsoCodeData(locale.currencyIsoCode),
                currency_symbol_data.append(locale.currencySymbol),
                currency_display_name_data.append(locale.currencyDisplayName),
                currency_format_data.append(locale.currencyFormat),
                currency_format_data.append(locale.currencyNegativeFormat),
                endonyms_data.append(locale.languageEndonym),
                endonyms_data.append(locale.countryEndonym),
                locale.currencyDigits,
                locale.currencyRounding, # unused (QTBUG-81343)
                locale.firstDayOfWeek,
                locale.weekendStart,
                locale.weekendEnd)
                              + ', // {}/{}/{}\n'.format(
                                  locale.language, locale.script, locale.country))
        # Terminating row: the tuple sizes mirror the placeholder counts
        # used to build formatLine above.
        self.writer.write(formatLine(*( # All zeros, matching the format:
            (0,) * (3 + 8 + 4) + ('0,0',) * (16 + 3)
            + (currencyIsoCodeData(0),)
            + ('0,0',) * 6 + (0,) * (2 + 3) ))
                          + ' // trailing zeros\n')
        self.writer.write('};\n')

        # StringData tables:
        for data in (list_pattern_part_data, date_format_data,
                     time_format_data, days_data,
                     byte_unit_data, am_data, pm_data, currency_symbol_data,
                     currency_display_name_data, currency_format_data,
                     endonyms_data):
            data.write(self.writer)

    @staticmethod
    def __writeNameData(out, book, form):
        """Write ``<form>_name_list`` and the matching ``<form>_name_index``.

        ``out`` is called with each piece of text to emit.  ``book`` is
        iterated twice with items(); value[0] of each entry is its name
        and the entry with key 0 is skipped, the literal "Default" entry
        standing at the head of the list in its place.
        """
        out('static const char {}_name_list[] =\n'.format(form))
        out('"Default\\0"\n')
        for key, value in book.items():
            if key == 0:
                continue
            out('"' + value[0] + '\\0"\n')
        out(';\n\n')

        out('static const quint16 {}_name_index[] = {{\n'.format(form))
        out(' 0, // Any{}\n'.format(form.capitalize()))
        # Offset of the first real name: "Default" plus its terminating
        # NUL occupy the first 8 characters of the name list.
        index = 8
        for key, value in book.items():
            if key == 0:
                continue
            name = value[0]
            out('{:6d}, // {}\n'.format(index, name))
            index += len(name) + 1 # the name and its terminating NUL
        out('};\n\n')

    @staticmethod
    def __writeCodeList(out, book, form, width):
        """Write ``<form>_code_list``, one fixed-width code per entry.

        value[1] of each ``book`` entry is the code and value[0] the
        name shown in the comment.  Codes shorter than ``width`` are
        padded with C ``\\0`` escapes; longer ones are written as they
        are.
        """
        out('static const unsigned char {}_code_list[] =\n'.format(form))
        for key, value in book.items():
            code = value[1]
            code += r'\0' * max(width - len(code), 0)
            out('"{}" // {}\n'.format(code, value[0]))
        out(';\n\n')

    def languageNames(self, languages):
        """Write the language name list and its index."""
        self.__writeNameData(self.writer.write, languages, 'language')

    def scriptNames(self, scripts):
        """Write the script name list and its index."""
        self.__writeNameData(self.writer.write, scripts, 'script')

    def countryNames(self, countries):
        """Write the country name list and its index."""
        self.__writeNameData(self.writer.write, countries, 'country')

    # TODO: unify these next three into the previous three; kept
    # separate for now to verify we're not changing data.
    def languageCodes(self, languages):
        """Write the language code list, padded to width 3."""
        self.__writeCodeList(self.writer.write, languages, 'language', 3)

    def scriptCodes(self, scripts):
        """Write the script code list, padded to width 4."""
        self.__writeCodeList(self.writer.write, scripts, 'script', 4)

    def countryCodes(self, countries): # TODO: unify with countryNames()
        """Write the country code list, padded to width 3."""
        self.__writeCodeList(self.writer.write, countries, 'country', 3)
class CalendarDataWriter (LocaleSourceEditor):
    """Emits the per-calendar month-name tables through ``self.writer``."""

    # Formatter for one table row: three integer IDs followed by six
    # brace-wrapped month-name tokens.
    formatCalendar = (
        ' {{'
        + '{:6d}'
        + ',{:6d}' * 2
        + ',{{{:>5s}}}' * 6
        + '}}, ').format

    def write(self, calendar, locales, names):
        """Write the ``locale_data`` rows for ``calendar``, then months_data.

        ``names`` lists the keys in output order; each key supplies the
        three ID columns and selects the entry of ``locales`` whose
        month-name attributes are indexed by ``calendar``.  May raise
        Error from StringData.append().
        """
        month_table = StringData('months_data')
        emit = self.writer.write

        emit('static const QCalendarLocale locale_data[] = {\n')
        emit(' // '
             # IDs, width 7 (6 + comma)
             ' lang '
             ' script'
             ' terr '
             # Month-name start-end pairs, width 8 (5 plus '{},'):
             ' sShort '
             ' sLong '
             ' sNarrow'
             ' short '
             ' long '
             ' narrow'
             # No trailing space on last; be sure
             # to pad before adding later entries.
             '\n')

        for key in names:
            locale = locales[key]
            # Gather the columns one at a time, in table order, so the
            # strings enter months_data in the same sequence as the row.
            columns = [key[0], key[1], key[2]]
            columns.append(month_table.append(locale.standaloneShortMonths[calendar]))
            columns.append(month_table.append(locale.standaloneLongMonths[calendar]))
            columns.append(month_table.append(locale.standaloneNarrowMonths[calendar]))
            columns.append(month_table.append(locale.shortMonths[calendar]))
            columns.append(month_table.append(locale.longMonths[calendar]))
            columns.append(month_table.append(locale.narrowMonths[calendar]))
            row = self.formatCalendar(*columns)
            remark = '// {}/{}/{}\n '.format(locale.language, locale.script, locale.country)
            emit(row + remark)

        blank_row = (0,) * 3 + ('0,0',) * 6
        emit(self.formatCalendar(*blank_row) + '// trailing zeros\n')
        emit('};\n')
        month_table.write(self.writer)
class LocaleHeaderWriter (SourceFileEditor):
    """Emits the Language, Script and Country enums through ``self.writer``.

    ``dupes`` (kept privately) is consulted with ``in`` to find member
    names that clash; see __enum() for how each kind of clash is handled.
    """
    __upinit = SourceFileEditor.__init__

    def __init__(self, path, temp, dupes):
        self.__upinit(path, temp)
        self.__dupes = dupes

    def languages(self, languages):
        """Write the Language enum, followed by a blank line."""
        self.__enum('Language', languages, self.__language)
        self.writer.write('\n')

    def countries(self, countries):
        """Write the Country enum."""
        self.__enum('Country', countries, self.__country)

    def scripts(self, scripts):
        """Write the Script enum, followed by a blank line."""
        self.__enum('Script', scripts, self.__script)
        self.writer.write('\n')

    # Implementation details
    # The double-underscore aliases are name-mangled by the class body,
    # which is what makes them reachable as self.__language etc. above.
    from enumdata import (language_aliases as __language,
                          country_aliases as __country,
                          script_aliases as __script)

    def __enum(self, name, book, alias):
        """Write ``enum <name>``: the members of ``book``, then the aliases.

        value[0] of each ``book`` entry, with its white space removed,
        is the member name and the entry's key is its value.  The enum
        closes with a ``Last<name>`` member naming the final member
        written.  Raises Error if a Script member name clashes.
        """
        assert book
        emit = self.writer.write
        clashes = self.__dupes
        for_scripts = name == 'Script'

        emit(' enum {} {{\n'.format(name))
        for key, value in book.items():
            member = value[0]
            if not for_scripts:
                member = ''.join(member.split())
                if member in clashes:
                    member = member + name
            else:
                # Don't .capitalize() as some names are already camel-case (see enumdata.py):
                words = member.split()
                member = ''.join(word[0].upper() + word[1:] for word in words)
                if not member.endswith('Script'):
                    member += 'Script'
                if member in clashes:
                    raise Error('The script name "{}" is messy'.format(member))
            emit(' {} = {},\n'.format(member, key))

        alias_entries = ['{} = {}'.format(alias_name, target)
                         for alias_name, target in sorted(alias.items())]
        emit('\n '
             + ',\n '.join(alias_entries)
             + ',\n\n Last{} = {}\n }};\n'.format(name, member))
def usage(name, err, message = ''):
    """Write the command-line synopsis for program ``name`` to ``err``.

    A non-empty ``message`` is added after a blank line.
    """
    synopsis = "Usage: {} path/to/qlocale.xml root/of/qtbase\n".format(name)
    err.write(synopsis) # TODO: elaborate
    if not message:
        return
    err.write('\n' + message + '\n')
def main(args, out, err):
    """Regenerate Qt's locale sources from a qLocaleXML file.

    ``args`` is the command line as a list: program name, path to the
    qLocaleXML file, root of the qtbase check-out.  The list is consumed
    (its entries are popped).  ``out`` is not used at present; every
    diagnostic is written to ``err``.

    Rewrites, under the qtbase root, src/corelib/text/qlocale_data_p.h,
    one src/corelib/time/q<stem>calendar_data_p.h per known calendar,
    src/corelib/text/qlocale.h and src/corelib/text/qlocale.qdoc.

    Returns 0 on success and 1 after reporting any failure.
    """
    # TODO: Make calendars a command-line parameter
    # map { CLDR name: Qt file name }
    calendars = {'gregorian': 'roman', 'persian': 'jalali', 'islamic': 'hijri',} # 'hebrew': 'hebrew',

    name = args.pop(0)
    if len(args) != 2:
        usage(name, err, 'I expect two arguments')
        return 1

    qlocalexml = args.pop(0)
    qtsrcdir = args.pop(0)

    if not (os.path.isdir(qtsrcdir)
            and all(os.path.isfile(os.path.join(qtsrcdir, 'src', 'corelib', 'text', leaf))
                    for leaf in ('qlocale_data_p.h', 'qlocale.h', 'qlocale.qdoc'))):
        usage(name, err, 'Missing expected files under qtbase source root ' + qtsrcdir)
        return 1

    reader = QLocaleXmlReader(qlocalexml)
    # Send the reader's messages to the stream we were given.  The sys
    # module is only imported when this file runs as a script, so naming
    # sys.stderr here would break callers that import and call main().
    locale_map = dict(reader.loadLocaleMap(calendars, err.write))

    locale_keys = locale_map.keys()
    # The comparison function reads its default-country map from this
    # attribute, so it must be set before sorting.
    compareLocaleKeys.default_map = dict(reader.defaultMap())
    locale_keys.sort(compareLocaleKeys)

    try:
        writer = LocaleDataWriter(os.path.join(qtsrcdir, 'src', 'corelib', 'text',
                                               'qlocale_data_p.h'),
                                  qtsrcdir, reader.cldrVersion)
    except IOError as e:
        err.write('Failed to open files to transcribe locale data: ' + (e.message or e.args[1]))
        return 1

    try:
        writer.likelySubtags(reader.likelyMap())
        writer.localeIndex(reader.languageIndices(tuple(k[0] for k in locale_map)))
        writer.localeData(locale_map, locale_keys)
        writer.writer.write('\n')
        writer.languageNames(reader.languages)
        writer.scriptNames(reader.scripts)
        writer.countryNames(reader.countries)
        # TODO: merge the next three into the previous three
        writer.languageCodes(reader.languages)
        writer.scriptCodes(reader.scripts)
        writer.countryCodes(reader.countries)
    except Error as e:
        writer.cleanup()
        err.write('\nError updating locale data: ' + e.message + '\n')
        return 1

    writer.close()

    # Generate calendar data
    for calendar, stem in calendars.items():
        try:
            writer = CalendarDataWriter(os.path.join(qtsrcdir, 'src', 'corelib', 'time',
                                                     'q{}calendar_data_p.h'.format(stem)),
                                        qtsrcdir, reader.cldrVersion)
        except IOError as e:
            err.write('Failed to open files to transcribe ' + calendar
                      + ' data ' + (e.message or e.args[1]))
            return 1

        try:
            writer.write(calendar, locale_map, locale_keys)
        except Error as e:
            writer.cleanup()
            err.write('\nError updating ' + calendar + ' locale data: ' + e.message + '\n')
            return 1

        writer.close()

    # qlocale.h
    try:
        writer = LocaleHeaderWriter(os.path.join(qtsrcdir, 'src', 'corelib', 'text', 'qlocale.h'),
                                    qtsrcdir, reader.dupes)
    except IOError as e:
        err.write('Failed to open files to transcribe qlocale.h: ' + (e.message or e.args[1]))
        return 1

    try:
        writer.languages(reader.languages)
        writer.scripts(reader.scripts)
        writer.countries(reader.countries)
    except Error as e:
        writer.cleanup()
        err.write('\nError updating qlocale.h: ' + e.message + '\n')
        return 1

    writer.close()

    # qlocale.qdoc
    try:
        writer = Transcriber(os.path.join(qtsrcdir, 'src', 'corelib', 'text', 'qlocale.qdoc'),
                             qtsrcdir)
    except IOError as e:
        err.write('Failed to open files to transcribe qlocale.qdoc: ' + (e.message or e.args[1]))
        return 1

    # Copy the file through unchanged, except for the one line that
    # states the CLDR version, which is rewritten with the new version.
    DOCSTRING = " QLocale's data is based on Common Locale Data Repository "
    try:
        for line in writer.reader:
            if DOCSTRING in line:
                writer.writer.write(DOCSTRING + 'v' + reader.cldrVersion + '.\n')
            else:
                writer.writer.write(line)
    except Error as e:
        writer.cleanup()
        err.write('\nError updating qlocale.qdoc: ' + e.message + '\n')
        return 1

    writer.close()
    return 0
if __name__ == "__main__":
    # Script entry point: hand the command line and the standard streams
    # to main() and exit with the status it returns.
    import sys
    status = main(sys.argv, sys.stdout, sys.stderr)
    sys.exit(status)