qt5base-lts/util/locale_database/qlocalexml2cpp.py

909 lines
34 KiB
Python
Raw Normal View History

#!/usr/bin/env python2
#############################################################################
##
## Copyright (C) 2020 The Qt Company Ltd.
## Contact: https://www.qt.io/licensing/
##
## This file is part of the test suite of the Qt Toolkit.
##
## $QT_BEGIN_LICENSE:GPL-EXCEPT$
## Commercial License Usage
## Licensees holding valid commercial Qt licenses may use this file in
## accordance with the commercial license agreement provided with the
## Software or, alternatively, in accordance with the terms contained in
## a written agreement between you and The Qt Company. For licensing terms
## and conditions see https://www.qt.io/terms-conditions. For further
## information use the contact form at https://www.qt.io/contact-us.
##
## GNU General Public License Usage
## Alternatively, this file may be used under the terms of the GNU
## General Public License version 3 as published by the Free Software
## Foundation with exceptions as appearing in the file LICENSE.GPL3-EXCEPT
## included in the packaging of this file. Please review the following
## information to ensure the GNU General Public License requirements will
## be met: https://www.gnu.org/licenses/gpl-3.0.html.
##
## $QT_END_LICENSE$
##
#############################################################################
"""Script to generate C++ code from CLDR data in qLocaleXML form
See ``cldr2qlocalexml.py`` for how to generate the qLocaleXML data itself.
Pass the output file from that as first parameter to this script; pass
the root of the qtbase check-out as second parameter.
"""
import os
import sys
import tempfile
import datetime
import xml.dom.minidom
from enumdata import language_aliases, country_aliases, script_aliases
from qlocalexml import Locale
# TODO: Make calendars a command-line parameter
# map { CLDR name: Qt file name }
calendars = {'gregorian': 'roman', 'persian': 'jalali', 'islamic': 'hijri',} # 'hebrew': 'hebrew',
generated_template = """
/*
This part of the file was generated on %s from the
Common Locale Data Repository v%s
http://www.unicode.org/cldr/
Do not edit this section: instead regenerate it using
cldr2qlocalexml.py and qlocalexml2cpp.py on updated (or
edited) CLDR data; see qtbase/util/locale_database/.
*/
"""
class Error:
def __init__(self, msg):
self.msg = msg
def __str__(self):
return self.msg
def wrap_list(lst):
def split(lst, size):
while lst:
head, lst = lst[:size], lst[size:]
yield head
return ",\n".join(", ".join(x) for x in split(lst, 20))
def isNodeNamed(elt, name, TYPE=xml.dom.minidom.Node.ELEMENT_NODE):
return elt.nodeType == TYPE and elt.nodeName == name
def firstChildElt(parent, name):
child = parent.firstChild
while child:
if isNodeNamed(child, name):
return child
child = child.nextSibling
raise Error('No %s child found' % name)
def eachEltInGroup(parent, group, key):
try:
element = firstChildElt(parent, group).firstChild
except Error:
element = None
while element:
if isNodeNamed(element, key):
yield element
element = element.nextSibling
def eltWords(elt):
child = elt.firstChild
while child:
if child.nodeType == elt.TEXT_NODE:
yield child.nodeValue
child = child.nextSibling
def firstChildText(elt, key):
return ' '.join(eltWords(firstChildElt(elt, key)))
def loadMap(doc, category):
return dict((int(firstChildText(element, 'id')),
(firstChildText(element, 'name'),
firstChildText(element, 'code')))
for element in eachEltInGroup(doc.documentElement,
category + 'List', category))
def loadLikelySubtagsMap(doc):
def triplet(element, keys=('language', 'script', 'country')):
return tuple(firstChildText(element, key) for key in keys)
return dict((i, {'from': triplet(firstChildElt(elt, "from")),
'to': triplet(firstChildElt(elt, "to"))})
for i, elt in enumerate(eachEltInGroup(doc.documentElement,
'likelySubtags', 'likelySubtag')))
def fixedScriptName(name, dupes):
# Don't .capitalize() as some names are already camel-case (see enumdata.py):
name = ''.join(word[0].upper() + word[1:] for word in name.split())
if name[-6:] != "Script":
name = name + "Script"
if name in dupes:
sys.stderr.write("\n\n\nERROR: The script name '%s' is messy" % name)
sys.exit(1)
return name
def fixedCountryName(name, dupes):
if name in dupes:
return name.replace(" ", "") + "Country"
return name.replace(" ", "")
def fixedLanguageName(name, dupes):
if name in dupes:
return name.replace(" ", "") + "Language"
return name.replace(" ", "")
def findDupes(country_map, language_map):
country_set = set(v[0] for a, v in country_map.iteritems())
language_set = set(v[0] for a, v in language_map.iteritems())
return country_set & language_set
def languageNameToId(name, language_map):
for key in language_map.keys():
if language_map[key][0] == name:
return key
return -1
def scriptNameToId(name, script_map):
for key in script_map.keys():
if script_map[key][0] == name:
return key
return -1
def countryNameToId(name, country_map):
for key in country_map.keys():
if country_map[key][0] == name:
return key
return -1
def loadLocaleMap(doc, language_map, script_map, country_map, likely_subtags_map):
result = {}
for locale_elt in eachEltInGroup(doc.documentElement, "localeList", "locale"):
locale = Locale.fromXmlData(lambda k: firstChildText(locale_elt, k), calendars.keys())
language_id = languageNameToId(locale.language, language_map)
if language_id == -1:
sys.stderr.write("Cannot find a language id for '%s'\n" % locale.language)
script_id = scriptNameToId(locale.script, script_map)
if script_id == -1:
sys.stderr.write("Cannot find a script id for '%s'\n" % locale.script)
country_id = countryNameToId(locale.country, country_map)
if country_id == -1:
sys.stderr.write("Cannot find a country id for '%s'\n" % locale.country)
if language_id != 1: # C
if country_id == 0:
sys.stderr.write("loadLocaleMap: No country id for '%s'\n" % locale.language)
if script_id == 0:
# find default script for a given language and country (see http://www.unicode.org/reports/tr35/#Likely_Subtags)
for key in likely_subtags_map.keys():
tmp = likely_subtags_map[key]
if tmp["from"][0] == locale.language and tmp["from"][1] == "AnyScript" and tmp["from"][2] == locale.country:
locale.script = tmp["to"][1]
script_id = scriptNameToId(locale.script, script_map)
break
if script_id == 0 and country_id != 0:
# try with no country
for key in likely_subtags_map.keys():
tmp = likely_subtags_map[key]
if tmp["from"][0] == locale.language and tmp["from"][1] == "AnyScript" and tmp["from"][2] == "AnyCountry":
locale.script = tmp["to"][1]
script_id = scriptNameToId(locale.script, script_map)
break
result[(language_id, script_id, country_id)] = locale
return result
def compareLocaleKeys(key1, key2):
if key1 == key2:
return 0
if key1[0] == key2[0]:
l1 = compareLocaleKeys.locale_map[key1]
l2 = compareLocaleKeys.locale_map[key2]
if (l1.language, l1.script) in compareLocaleKeys.default_map.keys():
default = compareLocaleKeys.default_map[(l1.language, l1.script)]
if l1.country == default:
return -1
if l2.country == default:
return 1
if key1[1] != key2[1]:
if (l2.language, l2.script) in compareLocaleKeys.default_map.keys():
default = compareLocaleKeys.default_map[(l2.language, l2.script)]
if l2.country == default:
return 1
if l1.country == default:
return -1
if key1[1] != key2[1]:
return key1[1] - key2[1]
else:
return key1[0] - key2[0]
return key1[2] - key2[2]
def languageCount(language_id, locale_map):
result = 0
for key in locale_map.keys():
if key[0] == language_id:
result += 1
return result
def unicode2hex(s):
lst = []
for x in s:
v = ord(x)
if v > 0xFFFF:
# make a surrogate pair
# copied from qchar.h
high = (v >> 10) + 0xd7c0
low = (v % 0x400 + 0xdc00)
lst.append(hex(high))
lst.append(hex(low))
else:
lst.append(hex(v))
return lst
class StringDataToken:
def __init__(self, index, length, bits):
if index > 0xffff:
print "\n\n\n#error Data index is too big!", index
raise ValueError("Start-index (%d) exceeds the uint16 range!" % index)
if length >= (1 << bits):
print "\n\n\n#error Range length is too big!", length
raise ValueError("Data size (%d) exceeds the %d-bit range!" % (length, bits))
self.index = index
self.length = length
class StringData:
def __init__(self, name):
self.data = []
self.hash = {}
self.name = name
self.text = '' # Used in quick-search for matches in data
def append(self, s, bits=8):
try:
token = self.hash[s]
except KeyError:
token = self.__store(s, bits)
self.hash[s] = token
return token
def __store(self, s, bits):
"""Add string s to known data.
Seeks to avoid duplication, where possible.
For example, short-forms may be prefixes of long-forms.
"""
if not s:
return StringDataToken(0, 0, bits)
ucs2 = unicode2hex(s)
try:
index = self.text.index(s) - 1
matched = 0
while matched < len(ucs2):
index, matched = self.data.index(ucs2[0], index + 1), 1
if index + len(ucs2) >= len(self.data):
raise ValueError # not found after all !
while matched < len(ucs2) and self.data[index + matched] == ucs2[matched]:
matched += 1
except ValueError:
index = len(self.data)
self.data += ucs2
self.text += s
assert index >= 0
try:
return StringDataToken(index, len(ucs2), bits)
except ValueError as e:
e.args += (self.name, s)
raise
def write(self, fd):
if len(self.data) > 0xffff:
raise ValueError("Data is too big for quint16 index to its end!" % len(self.data),
self.name)
fd.write("\nstatic const char16_t %s[] = {\n" % self.name)
fd.write(wrap_list(self.data))
fd.write("\n};\n")
def escapedString(s):
result = ""
i = 0
while i < len(s):
if s[i] == '"':
result += '\\"'
i += 1
else:
result += s[i]
i += 1
s = result
line = ""
need_escape = False
result = ""
for c in s:
if ord(c) < 128 and not (need_escape and ord('a') <= ord(c.lower()) <= ord('f')):
line += c
need_escape = False
else:
line += "\\x%02x" % (ord(c))
need_escape = True
if len(line) > 80:
result = result + "\n" + '"' + line + '"'
line = ""
line += "\\0"
result = result + "\n" + '"' + line + '"'
if result[0] == "\n":
result = result[1:]
return result
def printEscapedString(s):
print escapedString(s)
def currencyIsoCodeData(s):
if s:
return '{' + ",".join(str(ord(x)) for x in s) + '}'
return "{0,0,0}"
def usage():
print "Usage: qlocalexml2cpp.py <path-to-locale.xml> <path-to-qtbase-src-tree>"
sys.exit(1)
GENERATED_BLOCK_START = "// GENERATED PART STARTS HERE\n"
GENERATED_BLOCK_END = "// GENERATED PART ENDS HERE\n"
def main():
if len(sys.argv) != 3:
usage()
qlocalexml = sys.argv[1]
qtsrcdir = sys.argv[2]
if not (os.path.isdir(qtsrcdir)
and all(os.path.isfile(os.path.join(qtsrcdir, 'src', 'corelib', 'text', leaf))
for leaf in ('qlocale_data_p.h', 'qlocale.h', 'qlocale.qdoc'))):
usage()
(data_temp_file, data_temp_file_path) = tempfile.mkstemp("qlocale_data_p.h", dir=qtsrcdir)
data_temp_file = os.fdopen(data_temp_file, "w")
qlocaledata_file = open(qtsrcdir + "/src/corelib/text/qlocale_data_p.h", "r")
s = qlocaledata_file.readline()
while s and s != GENERATED_BLOCK_START:
data_temp_file.write(s)
s = qlocaledata_file.readline()
data_temp_file.write(GENERATED_BLOCK_START)
doc = xml.dom.minidom.parse(qlocalexml)
language_map = loadMap(doc, 'language')
script_map = loadMap(doc, 'script')
country_map = loadMap(doc, 'country')
likely_subtags_map = loadLikelySubtagsMap(doc)
default_map = {}
for key in likely_subtags_map.keys():
tmp = likely_subtags_map[key]
if tmp["from"][1] == "AnyScript" and tmp["from"][2] == "AnyCountry" and tmp["to"][2] != "AnyCountry":
default_map[(tmp["to"][0], tmp["to"][1])] = tmp["to"][2]
locale_map = loadLocaleMap(doc, language_map, script_map, country_map, likely_subtags_map)
dupes = findDupes(language_map, country_map)
cldr_version = firstChildText(doc.documentElement, "version")
data_temp_file.write(generated_template % (datetime.date.today(), cldr_version))
# Likely subtags map
data_temp_file.write("static const QLocaleId likely_subtags[] = {\n")
index = 0
for key in likely_subtags_map.keys():
tmp = likely_subtags_map[key]
from_language = languageNameToId(tmp["from"][0], language_map)
from_script = scriptNameToId(tmp["from"][1], script_map)
from_country = countryNameToId(tmp["from"][2], country_map)
to_language = languageNameToId(tmp["to"][0], language_map)
to_script = scriptNameToId(tmp["to"][1], script_map)
to_country = countryNameToId(tmp["to"][2], country_map)
cmnt_from = ""
if from_language != 0:
cmnt_from = cmnt_from + language_map[from_language][1]
else:
cmnt_from = cmnt_from + "und"
if from_script != 0:
if cmnt_from:
cmnt_from = cmnt_from + "_"
cmnt_from = cmnt_from + script_map[from_script][1]
if from_country != 0:
if cmnt_from:
cmnt_from = cmnt_from + "_"
cmnt_from = cmnt_from + country_map[from_country][1]
cmnt_to = ""
if to_language != 0:
cmnt_to = cmnt_to + language_map[to_language][1]
else:
cmnt_to = cmnt_to + "und"
if to_script != 0:
if cmnt_to:
cmnt_to = cmnt_to + "_"
cmnt_to = cmnt_to + script_map[to_script][1]
if to_country != 0:
if cmnt_to:
cmnt_to = cmnt_to + "_"
cmnt_to = cmnt_to + country_map[to_country][1]
data_temp_file.write(" ")
data_temp_file.write("{ %3d, %3d, %3d }, { %3d, %3d, %3d }" %
(from_language, from_script, from_country, to_language, to_script, to_country))
index += 1
if index != len(likely_subtags_map):
data_temp_file.write(",")
else:
data_temp_file.write(" ")
data_temp_file.write(" // %s -> %s\n" % (cmnt_from, cmnt_to))
data_temp_file.write("};\n")
data_temp_file.write("\n")
# Locale index
data_temp_file.write("static const quint16 locale_index[] = {\n")
index = 0
for key in language_map.keys():
i = 0
count = languageCount(key, locale_map)
if count > 0:
i = index
index += count
data_temp_file.write("%6d, // %s\n" % (i, language_map[key][0]))
data_temp_file.write(" 0 // trailing 0\n")
data_temp_file.write("};\n\n")
list_pattern_part_data = StringData('list_pattern_part_data')
Allow surrogate pairs for various "single character" locale data Extract the character in its proper unicode form and encode it in a new single_character_data table of locale data. Record each entry as the range within that table that encodes it. Also added an assertion in the generator script to check that the digits CLDR gives us are a contiguous sequence in increasing order, as has been assumed by the C++ code for some time. Lots of number-formatting code now has to take account of how wide the digits are. This leaves nowhere for updateSystemPrivate() to record values read from sys_locale->query(), so we must always consult that function when accessing these members of the systemData() object. Various internal users of these single-character fields need the system-or-CLDR value rather than the raw CLDR value, so move QLocalePrivate's methods to supply them down to QLocaleData and ensure they check for system values, where appropriate first. This allows us to finally support the Chakma language and script, for whose number system UTF-16 needs surrogate pairs. Costs 10.8 kB in added data, much of it due to adding two new locales that need surrogates to represent digits. [ChangeLog][QtCore][QLocale] Various QLocale methods that returned single QChar values now return QString values to accommodate those locales which need a surrogate pair to represent the (single character) return value. Fixes: QTBUG-69324 Fixes: QTBUG-81053 Change-Id: I481722d6f5ee266164f09031679a851dfa6e7839 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
2020-01-13 14:46:13 +00:00
single_character_data = StringData('single_character_data')
date_format_data = StringData('date_format_data')
time_format_data = StringData('time_format_data')
days_data = StringData('days_data')
am_data = StringData('am_data')
pm_data = StringData('pm_data')
byte_unit_data = StringData('byte_unit_data')
currency_symbol_data = StringData('currency_symbol_data')
currency_display_name_data = StringData('currency_display_name_data')
currency_format_data = StringData('currency_format_data')
endonyms_data = StringData('endonyms_data')
# Locale data
data_temp_file.write("static const QLocaleData locale_data[] = {\n")
# Table headings: keep each label centred in its field, matching line_format:
data_temp_file.write(' // '
# Width 6 + comma:
+ ' lang ' # IDs
+ 'script '
+ ' terr '
# Range entries (all start-indices, then all sizes):
# Width 5 + comma:
+ 'lStrt ' # List pattern
+ 'lpMid '
+ 'lpEnd '
+ 'lPair '
Allow surrogate pairs for various "single character" locale data Extract the character in its proper unicode form and encode it in a new single_character_data table of locale data. Record each entry as the range within that table that encodes it. Also added an assertion in the generator script to check that the digits CLDR gives us are a contiguous sequence in increasing order, as has been assumed by the C++ code for some time. Lots of number-formatting code now has to take account of how wide the digits are. This leaves nowhere for updateSystemPrivate() to record values read from sys_locale->query(), so we must always consult that function when accessing these members of the systemData() object. Various internal users of these single-character fields need the system-or-CLDR value rather than the raw CLDR value, so move QLocalePrivate's methods to supply them down to QLocaleData and ensure they check for system values, where appropriate first. This allows us to finally support the Chakma language and script, for whose number system UTF-16 needs surrogate pairs. Costs 10.8 kB in added data, much of it due to adding two new locales that need surrogates to represent digits. [ChangeLog][QtCore][QLocale] Various QLocale methods that returned single QChar values now return QString values to accommodate those locales which need a surrogate pair to represent the (single character) return value. Fixes: QTBUG-69324 Fixes: QTBUG-81053 Change-Id: I481722d6f5ee266164f09031679a851dfa6e7839 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
2020-01-13 14:46:13 +00:00
+ 'lDelm ' # List delimiter
# Representing numbers:
+ ' dec '
+ 'group '
+ 'prcnt '
+ ' zero '
+ 'minus '
+ 'plus '
+ ' exp '
# Quotation marks
+ 'qtOpn '
+ 'qtEnd '
+ 'altQO '
+ 'altQE '
+ 'lDFmt ' # Date format
+ 'sDFmt '
+ 'lTFmt ' # Time format
+ 'sTFmt '
+ 'slDay ' # Day names
+ 'lDays '
+ 'ssDys '
+ 'sDays '
+ 'snDay '
+ 'nDays '
+ ' am ' # am/pm indicators
+ ' pm '
+ ' byte '
+ 'siQnt '
+ 'iecQn '
+ 'crSym ' # Currency formatting:
+ 'crDsp '
+ 'crFmt '
+ 'crFNg '
+ 'ntLng ' # Name of language in itself, and of territory:
+ 'ntTer '
# Width 3 + comma for each size; no header
Allow surrogate pairs for various "single character" locale data Extract the character in its proper unicode form and encode it in a new single_character_data table of locale data. Record each entry as the range within that table that encodes it. Also added an assertion in the generator script to check that the digits CLDR gives us are a contiguous sequence in increasing order, as has been assumed by the C++ code for some time. Lots of number-formatting code now has to take account of how wide the digits are. This leaves nowhere for updateSystemPrivate() to record values read from sys_locale->query(), so we must always consult that function when accessing these members of the systemData() object. Various internal users of these single-character fields need the system-or-CLDR value rather than the raw CLDR value, so move QLocalePrivate's methods to supply them down to QLocaleData and ensure they check for system values, where appropriate first. This allows us to finally support the Chakma language and script, for whose number system UTF-16 needs surrogate pairs. Costs 10.8 kB in added data, much of it due to adding two new locales that need surrogates to represent digits. [ChangeLog][QtCore][QLocale] Various QLocale methods that returned single QChar values now return QString values to accommodate those locales which need a surrogate pair to represent the (single character) return value. Fixes: QTBUG-69324 Fixes: QTBUG-81053 Change-Id: I481722d6f5ee266164f09031679a851dfa6e7839 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
2020-01-13 14:46:13 +00:00
+ ' ' * 37
# Strays (char array, bit-fields):
# Width 8+4 + comma
+ ' currISO '
# Width 6 + comma:
+ 'curDgt ' # Currency digits
+ 'curRnd ' # Currencty rounding (unused: QTBUG-81343)
+ 'dow1st ' # First day of week
+ ' wknd+ ' # Week-end start/end days:
+ ' wknd-'
# No trailing space on last entry (be sure to
# pad before adding anything after it).
+ '\n')
locale_keys = locale_map.keys()
compareLocaleKeys.default_map = default_map
compareLocaleKeys.locale_map = locale_map
locale_keys.sort(compareLocaleKeys)
line_format = (' { '
# Locale-identifier:
+ '%6d,' * 3
Allow surrogate pairs for various "single character" locale data Extract the character in its proper unicode form and encode it in a new single_character_data table of locale data. Record each entry as the range within that table that encodes it. Also added an assertion in the generator script to check that the digits CLDR gives us are a contiguous sequence in increasing order, as has been assumed by the C++ code for some time. Lots of number-formatting code now has to take account of how wide the digits are. This leaves nowhere for updateSystemPrivate() to record values read from sys_locale->query(), so we must always consult that function when accessing these members of the systemData() object. Various internal users of these single-character fields need the system-or-CLDR value rather than the raw CLDR value, so move QLocalePrivate's methods to supply them down to QLocaleData and ensure they check for system values, where appropriate first. This allows us to finally support the Chakma language and script, for whose number system UTF-16 needs surrogate pairs. Costs 10.8 kB in added data, much of it due to adding two new locales that need surrogates to represent digits. [ChangeLog][QtCore][QLocale] Various QLocale methods that returned single QChar values now return QString values to accommodate those locales which need a surrogate pair to represent the (single character) return value. Fixes: QTBUG-69324 Fixes: QTBUG-81053 Change-Id: I481722d6f5ee266164f09031679a851dfa6e7839 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
2020-01-13 14:46:13 +00:00
# Offsets for starts of ranges:
+ '%5d,' * 37
# Sizes for the same:
Allow surrogate pairs for various "single character" locale data Extract the character in its proper unicode form and encode it in a new single_character_data table of locale data. Record each entry as the range within that table that encodes it. Also added an assertion in the generator script to check that the digits CLDR gives us are a contiguous sequence in increasing order, as has been assumed by the C++ code for some time. Lots of number-formatting code now has to take account of how wide the digits are. This leaves nowhere for updateSystemPrivate() to record values read from sys_locale->query(), so we must always consult that function when accessing these members of the systemData() object. Various internal users of these single-character fields need the system-or-CLDR value rather than the raw CLDR value, so move QLocalePrivate's methods to supply them down to QLocaleData and ensure they check for system values, where appropriate first. This allows us to finally support the Chakma language and script, for whose number system UTF-16 needs surrogate pairs. Costs 10.8 kB in added data, much of it due to adding two new locales that need surrogates to represent digits. [ChangeLog][QtCore][QLocale] Various QLocale methods that returned single QChar values now return QString values to accommodate those locales which need a surrogate pair to represent the (single character) return value. Fixes: QTBUG-69324 Fixes: QTBUG-81053 Change-Id: I481722d6f5ee266164f09031679a851dfa6e7839 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
2020-01-13 14:46:13 +00:00
+ '%3d,' * 37
# Currency ISO code:
+ ' %10s, '
# Currency formatting:
+ '%6d,%6d'
# Day of week and week-end:
+ ',%6d' * 3
+ ' }')
for key in locale_keys:
l = locale_map[key]
# Sequence of StringDataToken:
Allow surrogate pairs for various "single character" locale data Extract the character in its proper unicode form and encode it in a new single_character_data table of locale data. Record each entry as the range within that table that encodes it. Also added an assertion in the generator script to check that the digits CLDR gives us are a contiguous sequence in increasing order, as has been assumed by the C++ code for some time. Lots of number-formatting code now has to take account of how wide the digits are. This leaves nowhere for updateSystemPrivate() to record values read from sys_locale->query(), so we must always consult that function when accessing these members of the systemData() object. Various internal users of these single-character fields need the system-or-CLDR value rather than the raw CLDR value, so move QLocalePrivate's methods to supply them down to QLocaleData and ensure they check for system values, where appropriate first. This allows us to finally support the Chakma language and script, for whose number system UTF-16 needs surrogate pairs. Costs 10.8 kB in added data, much of it due to adding two new locales that need surrogates to represent digits. [ChangeLog][QtCore][QLocale] Various QLocale methods that returned single QChar values now return QString values to accommodate those locales which need a surrogate pair to represent the (single character) return value. Fixes: QTBUG-69324 Fixes: QTBUG-81053 Change-Id: I481722d6f5ee266164f09031679a851dfa6e7839 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
2020-01-13 14:46:13 +00:00
ranges = (tuple(list_pattern_part_data.append(p) for p in # 5 entries:
(l.listPatternPartStart, l.listPatternPartMiddle,
Allow surrogate pairs for various "single character" locale data Extract the character in its proper unicode form and encode it in a new single_character_data table of locale data. Record each entry as the range within that table that encodes it. Also added an assertion in the generator script to check that the digits CLDR gives us are a contiguous sequence in increasing order, as has been assumed by the C++ code for some time. Lots of number-formatting code now has to take account of how wide the digits are. This leaves nowhere for updateSystemPrivate() to record values read from sys_locale->query(), so we must always consult that function when accessing these members of the systemData() object. Various internal users of these single-character fields need the system-or-CLDR value rather than the raw CLDR value, so move QLocalePrivate's methods to supply them down to QLocaleData and ensure they check for system values, where appropriate first. This allows us to finally support the Chakma language and script, for whose number system UTF-16 needs surrogate pairs. Costs 10.8 kB in added data, much of it due to adding two new locales that need surrogates to represent digits. [ChangeLog][QtCore][QLocale] Various QLocale methods that returned single QChar values now return QString values to accommodate those locales which need a surrogate pair to represent the (single character) return value. Fixes: QTBUG-69324 Fixes: QTBUG-81053 Change-Id: I481722d6f5ee266164f09031679a851dfa6e7839 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
2020-01-13 14:46:13 +00:00
l.listPatternPartEnd, l.listPatternPartTwo, l.listDelim)) +
tuple(single_character_data.append(p) for p in # 11 entries
(l.decimal, l.group, l.percent, l.zero, l.minus, l.plus, l.exp,
l.quotationStart, l.quotationEnd,
l.alternateQuotationStart, l.alternateQuotationEnd)) +
tuple (date_format_data.append(f) for f in # 2 entries:
(l.longDateFormat, l.shortDateFormat)) +
tuple(time_format_data.append(f) for f in # 2 entries:
(l.longTimeFormat, l.shortTimeFormat)) +
tuple(days_data.append(d) for d in # 6 entries:
(l.standaloneLongDays, l.longDays,
l.standaloneShortDays, l.shortDays,
l.standaloneNarrowDays, l.narrowDays)) +
(am_data.append(l.am), pm_data.append(l.pm)) + # 2 entries:
tuple(byte_unit_data.append(b) for b in # 3 entries:
(l.byte_unit, l.byte_si_quantified, l.byte_iec_quantified)) +
(currency_symbol_data.append(l.currencySymbol),
currency_display_name_data.append(l.currencyDisplayName),
currency_format_data.append(l.currencyFormat),
currency_format_data.append(l.currencyNegativeFormat),
endonyms_data.append(l.languageEndonym),
endonyms_data.append(l.countryEndonym)) # 6 entries
Allow surrogate pairs for various "single character" locale data Extract the character in its proper unicode form and encode it in a new single_character_data table of locale data. Record each entry as the range within that table that encodes it. Also added an assertion in the generator script to check that the digits CLDR gives us are a contiguous sequence in increasing order, as has been assumed by the C++ code for some time. Lots of number-formatting code now has to take account of how wide the digits are. This leaves nowhere for updateSystemPrivate() to record values read from sys_locale->query(), so we must always consult that function when accessing these members of the systemData() object. Various internal users of these single-character fields need the system-or-CLDR value rather than the raw CLDR value, so move QLocalePrivate's methods to supply them down to QLocaleData and ensure they check for system values, where appropriate first. This allows us to finally support the Chakma language and script, for whose number system UTF-16 needs surrogate pairs. Costs 10.8 kB in added data, much of it due to adding two new locales that need surrogates to represent digits. [ChangeLog][QtCore][QLocale] Various QLocale methods that returned single QChar values now return QString values to accommodate those locales which need a surrogate pair to represent the (single character) return value. Fixes: QTBUG-69324 Fixes: QTBUG-81053 Change-Id: I481722d6f5ee266164f09031679a851dfa6e7839 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
2020-01-13 14:46:13 +00:00
) # Total: 37 entries
assert len(ranges) == 37
data_temp_file.write(line_format
Allow surrogate pairs for various "single character" locale data Extract the character in its proper unicode form and encode it in a new single_character_data table of locale data. Record each entry as the range within that table that encodes it. Also added an assertion in the generator script to check that the digits CLDR gives us are a contiguous sequence in increasing order, as has been assumed by the C++ code for some time. Lots of number-formatting code now has to take account of how wide the digits are. This leaves nowhere for updateSystemPrivate() to record values read from sys_locale->query(), so we must always consult that function when accessing these members of the systemData() object. Various internal users of these single-character fields need the system-or-CLDR value rather than the raw CLDR value, so move QLocalePrivate's methods to supply them down to QLocaleData and ensure they check for system values, where appropriate first. This allows us to finally support the Chakma language and script, for whose number system UTF-16 needs surrogate pairs. Costs 10.8 kB in added data, much of it due to adding two new locales that need surrogates to represent digits. [ChangeLog][QtCore][QLocale] Various QLocale methods that returned single QChar values now return QString values to accommodate those locales which need a surrogate pair to represent the (single character) return value. Fixes: QTBUG-69324 Fixes: QTBUG-81053 Change-Id: I481722d6f5ee266164f09031679a851dfa6e7839 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
2020-01-13 14:46:13 +00:00
% ((key[0], key[1], key[2]) +
tuple(r.index for r in ranges) +
tuple(r.length for r in ranges) +
(currencyIsoCodeData(l.currencyIsoCode),
l.currencyDigits,
l.currencyRounding, # unused (QTBUG-81343)
l.firstDayOfWeek,
l.weekendStart,
l.weekendEnd))
+ ", // %s/%s/%s\n" % (l.language, l.script, l.country))
data_temp_file.write(line_format # All zeros, matching the format:
Allow surrogate pairs for various "single character" locale data Extract the character in its proper unicode form and encode it in a new single_character_data table of locale data. Record each entry as the range within that table that encodes it. Also added an assertion in the generator script to check that the digits CLDR gives us are a contiguous sequence in increasing order, as has been assumed by the C++ code for some time. Lots of number-formatting code now has to take account of how wide the digits are. This leaves nowhere for updateSystemPrivate() to record values read from sys_locale->query(), so we must always consult that function when accessing these members of the systemData() object. Various internal users of these single-character fields need the system-or-CLDR value rather than the raw CLDR value, so move QLocalePrivate's methods to supply them down to QLocaleData and ensure they check for system values, where appropriate first. This allows us to finally support the Chakma language and script, for whose number system UTF-16 needs surrogate pairs. Costs 10.8 kB in added data, much of it due to adding two new locales that need surrogates to represent digits. [ChangeLog][QtCore][QLocale] Various QLocale methods that returned single QChar values now return QString values to accommodate those locales which need a surrogate pair to represent the (single character) return value. Fixes: QTBUG-69324 Fixes: QTBUG-81053 Change-Id: I481722d6f5ee266164f09031679a851dfa6e7839 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
2020-01-13 14:46:13 +00:00
% ( (0,) * 3 + (0,) * 37 * 2
+ (currencyIsoCodeData(0),)
+ (0,) * 2
+ (0,) * 3)
+ " // trailing zeros\n")
data_temp_file.write("};\n")
# StringData tables:
Allow surrogate pairs for various "single character" locale data Extract the character in its proper unicode form and encode it in a new single_character_data table of locale data. Record each entry as the range within that table that encodes it. Also added an assertion in the generator script to check that the digits CLDR gives us are a contiguous sequence in increasing order, as has been assumed by the C++ code for some time. Lots of number-formatting code now has to take account of how wide the digits are. This leaves nowhere for updateSystemPrivate() to record values read from sys_locale->query(), so we must always consult that function when accessing these members of the systemData() object. Various internal users of these single-character fields need the system-or-CLDR value rather than the raw CLDR value, so move QLocalePrivate's methods to supply them down to QLocaleData and ensure they check for system values, where appropriate first. This allows us to finally support the Chakma language and script, for whose number system UTF-16 needs surrogate pairs. Costs 10.8 kB in added data, much of it due to adding two new locales that need surrogates to represent digits. [ChangeLog][QtCore][QLocale] Various QLocale methods that returned single QChar values now return QString values to accommodate those locales which need a surrogate pair to represent the (single character) return value. Fixes: QTBUG-69324 Fixes: QTBUG-81053 Change-Id: I481722d6f5ee266164f09031679a851dfa6e7839 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
2020-01-13 14:46:13 +00:00
for data in (list_pattern_part_data, single_character_data,
date_format_data, time_format_data, days_data,
byte_unit_data, am_data, pm_data, currency_symbol_data,
currency_display_name_data, currency_format_data,
endonyms_data):
data.write(data_temp_file)
data_temp_file.write("\n")
# Language name list
data_temp_file.write("static const char language_name_list[] =\n")
data_temp_file.write('"Default\\0"\n')
for key in language_map.keys():
if key == 0:
continue
data_temp_file.write('"' + language_map[key][0] + '\\0"\n')
data_temp_file.write(";\n")
data_temp_file.write("\n")
# Language name index
data_temp_file.write("static const quint16 language_name_index[] = {\n")
data_temp_file.write(" 0, // AnyLanguage\n")
index = 8
for key in language_map.keys():
if key == 0:
continue
language = language_map[key][0]
data_temp_file.write("%6d, // %s\n" % (index, language))
index += len(language) + 1
data_temp_file.write("};\n")
data_temp_file.write("\n")
# Script name list
data_temp_file.write("static const char script_name_list[] =\n")
data_temp_file.write('"Default\\0"\n')
for key in script_map.keys():
if key == 0:
continue
data_temp_file.write('"' + script_map[key][0] + '\\0"\n')
data_temp_file.write(";\n")
data_temp_file.write("\n")
# Script name index
data_temp_file.write("static const quint16 script_name_index[] = {\n")
data_temp_file.write(" 0, // AnyScript\n")
index = 8
for key in script_map.keys():
if key == 0:
continue
script = script_map[key][0]
data_temp_file.write("%6d, // %s\n" % (index, script))
index += len(script) + 1
data_temp_file.write("};\n")
data_temp_file.write("\n")
# Country name list
data_temp_file.write("static const char country_name_list[] =\n")
data_temp_file.write('"Default\\0"\n')
for key in country_map.keys():
if key == 0:
continue
data_temp_file.write('"' + country_map[key][0] + '\\0"\n')
data_temp_file.write(";\n")
data_temp_file.write("\n")
# Country name index
data_temp_file.write("static const quint16 country_name_index[] = {\n")
data_temp_file.write(" 0, // AnyCountry\n")
index = 8
for key in country_map.keys():
if key == 0:
continue
country = country_map[key][0]
data_temp_file.write("%6d, // %s\n" % (index, country))
index += len(country) + 1
data_temp_file.write("};\n")
data_temp_file.write("\n")
# Language code list
data_temp_file.write("static const unsigned char language_code_list[] =\n")
for key in language_map.keys():
code = language_map[key][1]
if len(code) == 2:
code += r"\0"
data_temp_file.write('"%2s" // %s\n' % (code, language_map[key][0]))
data_temp_file.write(";\n")
data_temp_file.write("\n")
# Script code list
data_temp_file.write("static const unsigned char script_code_list[] =\n")
for key in script_map.keys():
code = script_map[key][1]
for i in range(4 - len(code)):
code += "\\0"
data_temp_file.write('"%2s" // %s\n' % (code, script_map[key][0]))
data_temp_file.write(";\n")
# Country code list
data_temp_file.write("static const unsigned char country_code_list[] =\n")
for key in country_map.keys():
code = country_map[key][1]
if len(code) == 2:
code += "\\0"
data_temp_file.write('"%2s" // %s\n' % (code, country_map[key][0]))
data_temp_file.write(";\n")
data_temp_file.write("\n")
data_temp_file.write(GENERATED_BLOCK_END)
s = qlocaledata_file.readline()
# skip until end of the old block
while s and s != GENERATED_BLOCK_END:
s = qlocaledata_file.readline()
s = qlocaledata_file.readline()
while s:
data_temp_file.write(s)
s = qlocaledata_file.readline()
data_temp_file.close()
qlocaledata_file.close()
os.remove(qtsrcdir + "/src/corelib/text/qlocale_data_p.h")
os.rename(data_temp_file_path, qtsrcdir + "/src/corelib/text/qlocale_data_p.h")
# Generate calendar data
calendar_format = ' {%6d,%6d,%6d' + ',%5d' * 6 + ',%3d' * 6 + ' },'
for calendar, stem in calendars.items():
months_data = StringData('months_data')
calendar_data_file = "q%scalendar_data_p.h" % stem
calendar_template_file = open(os.path.join(qtsrcdir, 'src', 'corelib', 'time',
calendar_data_file), "r")
(calendar_temp_file, calendar_temp_file_path) = tempfile.mkstemp(calendar_data_file, dir=qtsrcdir)
calendar_temp_file = os.fdopen(calendar_temp_file, "w")
s = calendar_template_file.readline()
while s and s != GENERATED_BLOCK_START:
calendar_temp_file.write(s)
s = calendar_template_file.readline()
calendar_temp_file.write(GENERATED_BLOCK_START)
calendar_temp_file.write(generated_template % (datetime.date.today(), cldr_version))
calendar_temp_file.write("static const QCalendarLocale locale_data[] = {\n")
calendar_temp_file.write(' // '
# IDs, width 7 (6 + comma)
+ ' lang '
+ ' script'
+ ' terr '
# Month-name start-indices, width 6 (5 + comma):
+ 'sLng '
+ 'long '
+ 'sSrt '
+ 'shrt '
+ 'sNrw '
+ 'naro '
# No individual headers for the sizes.
+ 'Sizes...'
+ '\n')
for key in locale_keys:
l = locale_map[key]
# Sequence of StringDataToken:
try:
# Twelve long month names can add up to more than 256 (e.g. kde_TZ: 264)
ranges = (tuple(months_data.append(m[calendar], 16) for m in
(l.standaloneLongMonths, l.longMonths)) +
tuple(months_data.append(m[calendar]) for m in
(l.standaloneShortMonths, l.shortMonths,
l.standaloneNarrowMonths, l.narrowMonths)))
except ValueError as e:
e.args += (l.language, l.script, l.country, stem)
raise
calendar_temp_file.write(
calendar_format
% ((key[0], key[1], key[2]) +
tuple(r.index for r in ranges) +
tuple(r.length for r in ranges))
+ "// %s/%s/%s\n" % (l.language, l.script, l.country))
calendar_temp_file.write(calendar_format % ( (0,) * (3 + 6 * 2) )
+ '// trailing zeros\n')
calendar_temp_file.write("};\n")
months_data.write(calendar_temp_file)
s = calendar_template_file.readline()
while s and s != GENERATED_BLOCK_END:
s = calendar_template_file.readline()
while s:
calendar_temp_file.write(s)
s = calendar_template_file.readline()
os.rename(calendar_temp_file_path,
os.path.join(qtsrcdir, 'src', 'corelib', 'time', calendar_data_file))
# qlocale.h
(qlocaleh_temp_file, qlocaleh_temp_file_path) = tempfile.mkstemp("qlocale.h", dir=qtsrcdir)
qlocaleh_temp_file = os.fdopen(qlocaleh_temp_file, "w")
qlocaleh_file = open(qtsrcdir + "/src/corelib/text/qlocale.h", "r")
s = qlocaleh_file.readline()
while s and s != GENERATED_BLOCK_START:
qlocaleh_temp_file.write(s)
s = qlocaleh_file.readline()
qlocaleh_temp_file.write(GENERATED_BLOCK_START)
qlocaleh_temp_file.write("// see qlocale_data_p.h for more info on generated data\n")
# Language enum
qlocaleh_temp_file.write(" enum Language {\n")
language = None
for key, value in language_map.items():
language = fixedLanguageName(value[0], dupes)
qlocaleh_temp_file.write(" " + language + " = " + str(key) + ",\n")
qlocaleh_temp_file.write("\n " +
",\n ".join('%s = %s' % pair
for pair in sorted(language_aliases.items())) +
",\n")
qlocaleh_temp_file.write("\n")
qlocaleh_temp_file.write(" LastLanguage = " + language + "\n")
qlocaleh_temp_file.write(" };\n\n")
# Script enum
qlocaleh_temp_file.write(" enum Script {\n")
script = None
for key, value in script_map.items():
script = fixedScriptName(value[0], dupes)
qlocaleh_temp_file.write(" " + script + " = " + str(key) + ",\n")
qlocaleh_temp_file.write("\n " +
",\n ".join('%s = %s' % pair
for pair in sorted(script_aliases.items())) +
",\n")
qlocaleh_temp_file.write("\n")
qlocaleh_temp_file.write(" LastScript = " + script + "\n")
qlocaleh_temp_file.write(" };\n\n")
# Country enum
qlocaleh_temp_file.write(" enum Country {\n")
country = None
for key, value in country_map.items():
country = fixedCountryName(value[0], dupes)
qlocaleh_temp_file.write(" " + country + " = " + str(key) + ",\n")
qlocaleh_temp_file.write("\n " +
",\n ".join('%s = %s' % pair
for pair in sorted(country_aliases.items())) +
",\n")
qlocaleh_temp_file.write("\n")
qlocaleh_temp_file.write(" LastCountry = " + country + "\n")
qlocaleh_temp_file.write(" };\n")
qlocaleh_temp_file.write(GENERATED_BLOCK_END)
s = qlocaleh_file.readline()
# skip until end of the old block
while s and s != GENERATED_BLOCK_END:
s = qlocaleh_file.readline()
s = qlocaleh_file.readline()
while s:
qlocaleh_temp_file.write(s)
s = qlocaleh_file.readline()
qlocaleh_temp_file.close()
qlocaleh_file.close()
os.remove(qtsrcdir + "/src/corelib/text/qlocale.h")
os.rename(qlocaleh_temp_file_path, qtsrcdir + "/src/corelib/text/qlocale.h")
# qlocale.qdoc
(qlocaleqdoc_temp_file, qlocaleqdoc_temp_file_path) = tempfile.mkstemp("qlocale.qdoc", dir=qtsrcdir)
qlocaleqdoc_temp_file = os.fdopen(qlocaleqdoc_temp_file, "w")
qlocaleqdoc_file = open(qtsrcdir + "/src/corelib/text/qlocale.qdoc", "r")
s = qlocaleqdoc_file.readline()
DOCSTRING = " QLocale's data is based on Common Locale Data Repository "
while s:
if DOCSTRING in s:
qlocaleqdoc_temp_file.write(DOCSTRING + "v" + cldr_version + ".\n")
else:
qlocaleqdoc_temp_file.write(s)
s = qlocaleqdoc_file.readline()
qlocaleqdoc_temp_file.close()
qlocaleqdoc_file.close()
os.remove(qtsrcdir + "/src/corelib/text/qlocale.qdoc")
os.rename(qlocaleqdoc_temp_file_path, qtsrcdir + "/src/corelib/text/qlocale.qdoc")
if __name__ == "__main__":
main()