Break clashing-names test function out of CldrAccess.__checkEnum()

Moving it makes it easier to document what it's up to and why, while
leaving __checkEnum() easier to read; and I'm going to need it
elsewhere anyway. This makes no difference to generated data.

Task-number: QTBUG-94460
Change-Id: I684375bc926d5d54928fbf5b5e08978528aef487
Reviewed-by: Ievgenii Meshcheriakov <ievgenii.meshcheriakov@qt.io>
This commit is contained in:
Edward Welbourne 2023-08-01 11:48:37 +02:00
parent 4f686b7b78
commit e212b3633c
2 changed files with 40 additions and 17 deletions

View File

@ -16,6 +16,7 @@ from weakref import WeakValueDictionary as CacheDict
from pathlib import Path
from ldml import Error, Node, XmlScanner, Supplement, LocaleScanner
from localetools import names_clash
from qlocalexml import Locale
class CldrReader (object):
@ -353,10 +354,7 @@ class CldrAccess (object):
language, script, territory, variant)
@staticmethod
def __checkEnum(given, proper, scraps,
remap = { 'å': 'a', 'ã': 'a', 'ç': 'c', 'é': 'e', 'í': 'i', 'ü': 'u'},
prefix = { 'St.': 'Saint', 'U.S.': 'United States' },
skip = '\u02bc'):
def __checkEnum(given, proper, scraps):
# Each is a { code: full name } mapping
for code, name in given.items():
try: right = proper[code]
@ -366,19 +364,9 @@ class CldrAccess (object):
if code not in scraps:
yield name, f'[Found no CLDR name for code {code}]'
continue
if name == right: continue
ok = right.replace('&', 'And')
for k, v in prefix.items():
if ok.startswith(k + ' '):
ok = v + ok[len(k):]
while '(' in ok:
try: f, t = ok.index('('), ok.index(')')
except ValueError: break
ok = ok[:f].rstrip() + ' ' + ok[t:].lstrip()
if ''.join(ch for ch in name.lower() if not ch.isspace()) in ''.join(
remap.get(ch, ch) for ch in ok.lower() if ch.isalpha() and ch not in skip):
continue
yield name, ok
cleaned = names_clash(right, name)
if cleaned:
yield name, cleaned
def checkEnumData(self, grumble):
scraps = set()

View File

@ -48,6 +48,41 @@ def wrap_list(lst, perline=20):
yield head
return ",\n".join(", ".join(x) for x in split(lst, perline))
def names_clash(cldr, enum):
    """Report whether CLDR's name differs too much from enumdata.py's

    First argument, cldr, is the name CLDR gives for some language,
    script or territory; second, enum, is the name enumdata.py gives
    for it. If these are enough alike, returns None; otherwise, the
    string that results from adapting cldr to be more like how
    enumdata.py would express it. That string is non-empty unless
    cldr is itself empty.

    The adaptation replaces '&' with 'And', expands a leading 'St.'
    or 'U.S.' and chops out any parenthesised part. The names are
    deemed alike when enum, lower-cased and with its spacing removed,
    appears within the letters of the adapted name, lower-cased and
    with certain accented letters replaced by unaccented ones."""
    if cldr == enum:
        return None

    # Some common substitutions:
    cldr = cldr.replace('&', 'And')
    prefix = { 'St.': 'Saint', 'U.S.': 'United States' }
    for k, v in prefix.items():
        if cldr.startswith(k + ' '):
            cldr = v + cldr[len(k):]

    # Chop out any parenthesised part, e.g. (Burma):
    while '(' in cldr:
        try:
            f, t = cldr.index('('), cldr.rindex(')')
        except ValueError:
            break # Unmatched open-parenthesis: leave the rest alone.
        if t < f:
            # The last ')' precedes the first '(', so nothing is
            # enclosed; chopping would reproduce the same text and
            # this loop would never terminate.
            break
        cldr = cldr[:f].rstrip() + ' ' + cldr[t + 1:].lstrip()

    # Various accented letters:
    remap = { 'å': 'a', 'ã': 'a', 'ç': 'c', 'é': 'e', 'í': 'i', 'ô': 'o', 'ü': 'u'}
    skip = '\u02bc' # Punctuation for which .isalpha() is true.
    # Let cldr match (ignoring non-letters and case) any substring as enum:
    if ''.join(enum.lower().split()) in ''.join(
            remap.get(ch, ch) for ch in cldr.lower() if ch.isalpha() and ch not in skip):
        return None
    return cldr
@contextmanager
def AtomicRenameTemporaryFile(originalLocation: Path, *, prefix: str, dir: Path):