Move enum-name-munging from LocaleHeaderWriter to QLocaleXmlReader

The former needed the latter's .dupes to do the job, so can now just
take a method as a tool to do the job instead, letting .dupes become
private. In the process refine the munging to free enumdata.py from
having to capitalize each word in its names. This will, in due course,
let us use more natural forms in various comments. This causes no
change to generated data.

Update enumdata.py's introduction doc, mainly to reflect this but also
fixing the out-of-date names (old *_list have long been *_map) and
adding some details to other paragraphs.

Task-number: QTBUG-94460
Change-Id: If195b2e94a53a495fc4f1f216bed07a910439fa7
Reviewed-by: Ievgenii Meshcheriakov <ievgenii.meshcheriakov@qt.io>
This commit is contained in:
Edward Welbourne 2023-08-01 12:03:18 +02:00
parent e212b3633c
commit 743ceb7cc2
3 changed files with 46 additions and 26 deletions

View File

@ -6,14 +6,18 @@
# can find a name (taken always from en.xml) that could potentially be # can find a name (taken always from en.xml) that could potentially be
# used. There is no point adding a mapping for such a code unless the # used. There is no point adding a mapping for such a code unless the
# CLDR's common/main/ contains an XML file for at least one locale # CLDR's common/main/ contains an XML file for at least one locale
# that exerciss it. # that exercises it (and little point absent substantial data).
# Each *_list reflects the current values of its enums in qlocale.h; # Each *_map reflects the current values of its enums in qlocale.h; if
# if new xml language files are available in CLDR, these languages and # new xml language files are available in CLDR, these languages and
# territories need to be *appended* to this list (for compatibility # territories need to be *appended* to this list (for compatibility
# between versions). Include any spaces present in names (scripts # between versions). Include any spaces and dashes present in names
# shall squish them out for the enum entries) in *_list, but use the # (they'll be squished out for the enum entries) in *_map, but
# squished forms of names in the *_aliases mappings. # use the squished forms of names in the *_aliases mappings. The
# squishing also turns the first letter of each word into a capital so
# you can safely preserve the case of en.xml's name; but omit (or
# replace with space) any punctuation aside from dashes and map any
# accented letters to their un-accented plain ASCII.
# For a new major version (and only then), we can change the # For a new major version (and only then), we can change the
# numbering, so re-sort each list into alphabetic order (e.g. using # numbering, so re-sort each list into alphabetic order (e.g. using
@ -21,10 +25,10 @@
# are offset with a blank line, below. After doing that, regenerate # are offset with a blank line, below. After doing that, regenerate
# locale data as usual; this will cause a binary-incompatible change. # locale data as usual; this will cause a binary-incompatible change.
# Note on "macrolanguage" comments: see "ISO 639 macrolanguage" on # Note on "macrolanguage" comments: see QTBUG-107781 and "ISO 639
# Wikipedia. A "macrolanguage" is (loosely-speaking) a group of # macrolanguage" on Wikipedia. A "macrolanguage" is (loosely-speaking)
# languages so closely related to one another that they could also be # a group of languages so closely related to one another that they
# regarded as divergent dialects of the macrolanguage. # could also be regarded as divergent dialects of the macrolanguage.
language_map = { language_map = {
0: ("AnyLanguage", " "), 0: ("AnyLanguage", " "),

View File

@ -114,7 +114,7 @@ class QLocaleXmlReader (object):
self.__textByName = dict((v[1], (v[0], v[2])) for v in scripts) self.__textByName = dict((v[1], (v[0], v[2])) for v in scripts)
self.__landByName = dict((v[1], (v[0], v[2])) for v in territories) self.__landByName = dict((v[1], (v[0], v[2])) for v in territories)
# Other properties: # Other properties:
self.dupes = set(v[1] for v in languages) & set(v[1] for v in territories) self.__dupes = set(v[1] for v in languages) & set(v[1] for v in territories)
self.cldrVersion = self.__firstChildText(self.root, "version") self.cldrVersion = self.__firstChildText(self.root, "version")
def loadLocaleMap(self, calendars, grumble = lambda text: None): def loadLocaleMap(self, calendars, grumble = lambda text: None):
@ -184,6 +184,32 @@ class QLocaleXmlReader (object):
self.__textByName[give[1]][0]), self.__textByName[give[1]][0]),
self.__landByName[give[2]][0]) self.__landByName[give[2]][0])
def enumify(self, name, suffix):
    """Stick together the parts of an enumdata.py name.

    Names given in enumdata.py include spaces and hyphens that we
    can't include in an identifier, such as the name of a member
    of an enum type. Removing those would lose the word
    boundaries, so make sure each word starts with a capital (but
    don't simply capitalize() as some names contain words,
    e.g. McDonald, that have later capitals in them).

    We also need to resolve duplication between languages and
    territories (by adding a suffix to each) and add Script to the
    ends of script-names that don't already end in it.

    Parameter name is the enumdata.py form of the name; suffix is
    the text to append when disambiguation is needed. A suffix of
    'Script' selects the script-name handling; any other suffix
    selects the language/territory handling.

    Returns the identifier to use as the enum member's name.
    Raises Error if the name cannot be disambiguated cleanly."""
    # Hyphens separate words just as spaces do; split() discards
    # the (possibly repeated) white-space, so no word is empty.
    words = name.replace('-', ' ').split()
    # Only touch each word's first letter, preserving any later
    # capitals (so McDonald stays McDonald):
    name = ''.join(word[0].upper() + word[1:] for word in words)

    if suffix != 'Script':
        if name not in self.__dupes:
            return name
        # Raise, rather than assert, so that the check survives
        # python -O and matches the script branch's error handling:
        if name.endswith(suffix):
            raise Error(f'The name "{name}" is duplicated '
                        f'but already ends in "{suffix}"')
        return name + suffix

    if not name.endswith(suffix):
        name += suffix
    if name in self.__dupes:
        raise Error(f'The script name "{name}" is messy')
    return name
# Implementation details: # Implementation details:
def __loadMap(self, category): def __loadMap(self, category):
kid = self.__firstChildText kid = self.__firstChildText

View File

@ -456,9 +456,9 @@ class CalendarDataWriter (LocaleSourceEditor):
months_data.write(self.writer) months_data.write(self.writer)
class LocaleHeaderWriter (SourceFileEditor): class LocaleHeaderWriter (SourceFileEditor):
def __init__(self, path, temp, dupes): def __init__(self, path, temp, enumify):
super().__init__(path, temp) super().__init__(path, temp)
self.__dupes = dupes self.__enumify = enumify
def languages(self, languages): def languages(self, languages):
self.__enum('Language', languages, self.__language) self.__enum('Language', languages, self.__language)
@ -483,20 +483,10 @@ class LocaleHeaderWriter (SourceFileEditor):
if suffix is None: if suffix is None:
suffix = name suffix = name
out, dupes = self.writer.write, self.__dupes out, enumify = self.writer.write, self.__enumify
out(f' enum {name} : ushort {{\n') out(f' enum {name} : ushort {{\n')
for key, value in book.items(): for key, value in book.items():
member = value[0].replace('-', ' ') member = enumify(value[0], suffix)
if name == 'Script':
# Don't .capitalize() as some names are already camel-case (see enumdata.py):
member = ''.join(word[0].upper() + word[1:] for word in member.split())
if not member.endswith('Script'):
member += 'Script'
if member in dupes:
raise Error(f'The script name "{member}" is messy')
else:
member = ''.join(member.split())
member = member + suffix if member in dupes else member
out(f' {member} = {key},\n') out(f' {member} = {key},\n')
out('\n ' out('\n '
@ -581,7 +571,7 @@ def main(out, err):
# qlocale.h # qlocale.h
try: try:
with LocaleHeaderWriter(qtsrcdir.joinpath('src/corelib/text/qlocale.h'), with LocaleHeaderWriter(qtsrcdir.joinpath('src/corelib/text/qlocale.h'),
qtsrcdir, reader.dupes) as writer: qtsrcdir, reader.enumify) as writer:
writer.languages(reader.languages) writer.languages(reader.languages)
writer.scripts(reader.scripts) writer.scripts(reader.scripts)
writer.territories(reader.territories) writer.territories(reader.territories)