Move enum-name-munging from LocaleHeaderWriter to QLocaleXmlReader

The former needed the latter's .dupes to do the job, so can now just take a method as a tool to do the job instead, letting .dupes become private. In the process refine the munging to free enumdata.py from having to capitalize each word in its names. This will, in due course, let us use more natural forms in various comments. This causes no change to generated data. Update enumdata.py's introduction doc, mainly to reflect this but also fixing the out-of-date names (old *_list have long been *_map) and adding some details to other paragraphs. Task-number: QTBUG-94460 Change-Id: If195b2e94a53a495fc4f1f216bed07a910439fa7 Reviewed-by: Ievgenii Meshcheriakov <ievgenii.meshcheriakov@qt.io>
2023-08-01 12:03:18 +02:00 · 2023-08-01 12:03:18 +02:00 · 743ceb7cc2
commit 743ceb7cc2
parent e212b3633c
3 changed files with 46 additions and 26 deletions
--- a/util/locale_database/enumdata.py
+++ b/util/locale_database/enumdata.py
@ -6,14 +6,18 @@
 # can find a name (taken always from en.xml) that could potentially be
 # used. There is no point adding a mapping for such a code unless the
 # CLDR's common/main/ contains an XML file for at least one locale
-# that exerciss it.
+# that exercises it (and little point absent substantial data).

-# Each *_list reflects the current values of its enums in qlocale.h;
-# if new xml language files are available in CLDR, these languages and
+# Each *_map reflects the current values of its enums in qlocale.h; if
+# new xml language files are available in CLDR, these languages and
 # territories need to be *appended* to this list (for compatibility
-# between versions).  Include any spaces present in names (scripts
-# shall squish them out for the enum entries) in *_list, but use the
-# squished forms of names in the *_aliases mappings.
+# between versions). Include any spaces and dashes present in names
+# (they'll be squished out for the enum entries) in *_map, but
+# use the squished forms of names in the *_aliases mappings. The
+# squishing also turns the first letter of each word into a capital so
+# you can safely preserve the case of en.xml's name; but omit (or
+# replace with space) any punctuation aside from dashes and map any
+# accented letters to their un-accented plain ASCII.

 # For a new major version (and only then), we can change the
 # numbering, so re-sort each list into alphabetic order (e.g. using
@ -21,10 +25,10 @@
 # are offset with a blank line, below. After doing that, regenerate
 # locale data as usual; this will cause a binary-incompatible change.

-# Note on "macrolanguage" comments: see "ISO 639 macrolanguage" on
-# Wikipedia. A "macrolanguage" is (loosely-speaking) a group of
-# languages so closely related to one another that they could also be
-# regarded as divergent dialects of the macrolanguage.
+# Note on "macrolanguage" comments: see QTBUG-107781 and "ISO 639
+# macrolanguage" on Wikipedia. A "macrolanguage" is (loosely-speaking)
+# a group of languages so closely related to one another that they
+# could also be regarded as divergent dialects of the macrolanguage.

 language_map = {
      0: ("AnyLanguage",                 "  "),
--- a/util/locale_database/qlocalexml.py
+++ b/util/locale_database/qlocalexml.py
@ -114,7 +114,7 @@ class QLocaleXmlReader (object):
        self.__textByName = dict((v[1], (v[0], v[2])) for v in scripts)
        self.__landByName = dict((v[1], (v[0], v[2])) for v in territories)
        # Other properties:
-        self.dupes = set(v[1] for v in languages) & set(v[1] for v in territories)
+        self.__dupes = set(v[1] for v in languages) & set(v[1] for v in territories)
        self.cldrVersion = self.__firstChildText(self.root, "version")

    def loadLocaleMap(self, calendars, grumble = lambda text: None):
@ -184,6 +184,32 @@ class QLocaleXmlReader (object):
                        self.__textByName[give[1]][0]),
                       self.__landByName[give[2]][0])

+    def enumify(self, name, suffix):
+        """Stick together the parts of an enumdata.py name.
+
+        Names given in enumdata.py include spaces and hyphens that we
+        can't include in an identifier, such as the name of a member
+        of an enum type. Removing those would lose the word
+        boundaries, so make sure each word starts with a capital (but
+        don't simply capitalize() as some names contain words,
+        e.g. McDonald, that have later capitals in them).
+
+        We also need to resolve duplication between languages and
+        territories (by adding a suffix to each) and add Script to the
+        ends of script-names that don't already end in it."""
+        name = name.replace('-', ' ')
+        # Don't .capitalize() as McDonald is already camel-case (see enumdata.py):
+        name = ''.join(word[0].upper() + word[1:] for word in name.split())
+        if suffix != 'Script':
+            assert not(name in self.__dupes and name.endswith(suffix))
+            return name + suffix if name in self.__dupes else name
+
+        if not name.endswith(suffix):
+            name += suffix
+        if name in self.__dupes:
+            raise Error(f'The script name "{name}" is messy')
+        return name
+
    # Implementation details:
    def __loadMap(self, category):
        kid = self.__firstChildText
--- a/util/locale_database/qlocalexml2cpp.py
+++ b/util/locale_database/qlocalexml2cpp.py
@ -456,9 +456,9 @@ class CalendarDataWriter (LocaleSourceEditor):
        months_data.write(self.writer)

 class LocaleHeaderWriter (SourceFileEditor):
-    def __init__(self, path, temp, dupes):
+    def __init__(self, path, temp, enumify):
        super().__init__(path, temp)
-        self.__dupes = dupes
+        self.__enumify = enumify

    def languages(self, languages):
        self.__enum('Language', languages, self.__language)
@ -483,20 +483,10 @@ class LocaleHeaderWriter (SourceFileEditor):
        if suffix is None:
            suffix = name

-        out, dupes = self.writer.write, self.__dupes
+        out, enumify = self.writer.write, self.__enumify
        out(f'    enum {name} : ushort {{\n')
        for key, value in book.items():
-            member = value[0].replace('-', ' ')
-            if name == 'Script':
-                # Don't .capitalize() as some names are already camel-case (see enumdata.py):
-                member = ''.join(word[0].upper() + word[1:] for word in member.split())
-                if not member.endswith('Script'):
-                    member += 'Script'
-                if member in dupes:
-                    raise Error(f'The script name "{member}" is messy')
-            else:
-                member = ''.join(member.split())
-                member = member + suffix if member in dupes else member
+            member = enumify(value[0], suffix)
            out(f'        {member} = {key},\n')

        out('\n        '
@ -581,7 +571,7 @@ def main(out, err):
    # qlocale.h
    try:
        with LocaleHeaderWriter(qtsrcdir.joinpath('src/corelib/text/qlocale.h'),
-                                qtsrcdir, reader.dupes) as writer:
+                                qtsrcdir, reader.enumify) as writer:
            writer.languages(reader.languages)
            writer.scripts(reader.scripts)
            writer.territories(reader.territories)