ICU-20693 Reworking Ant structure to better explain and reflect 'tailorings'

This commit is contained in:
David Beaumont 2019-11-05 21:09:51 +01:00 committed by David Beaumont
parent 6c41b4b24d
commit 43826cccba
5 changed files with 164 additions and 106 deletions

View File

@ -196,104 +196,116 @@
<!-- The following elements configure directories in which a subset of the available
locale IDs should be generated. Unlike the main <localeIds> element, these
filters must specify all locale IDs in full (but since they mostly select base
languages, this isn't a big deal). -->
languages, this isn't a big deal).
As well as allowing some data directories to have a subset of available data (via
the <localeIds> element) there are also mechanisms for controlling aliasing and
the locale parent relation which allows the sharing of some ICU data in cases
where it would otherwise need to be copied. The two mechanisms are:
1: inheritLanguageSubtag: Used to rewrite the parent of a locale ID from "root" to
its language subtag (e.g. "zh_Hant" has a natural parent of "root", but to allow
some base language data to be shared it can be made to have a parent of "zh").
2: forcedAlias: Used to add aliases for specific directories in order to affect the
ICU behaviour in special cases.
Between them these mechanisms are known as "tailorings" of the affected locales. -->
<!-- TODO: Explain why these special cases are needed/different. -->
<directoryFilter dir="coll">
root,
<!-- Collation data is large, but also more sharable than other data, which is why there
are a number of aliases and parent remappings for this directory. -->
<directory dir="coll" inheritLanguageSubtag="bs_Cyrl, sr_Latn, zh_Hant">
<!-- These aliases are to avoid needing to copy and maintain the same collation data for
"zh" and "yue". The maximized version of "yue_Hans" is "yue_Hans_CN" (vs
"zh_Hans_CN"), and for "yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the
aliases are effectively just rewriting the base language. -->
<forcedAlias source="yue_Hans" target="zh_Hans"/>
<forcedAlias source="yue" target="zh_Hant"/>
<!-- TODO: Find out and document this properly. -->
<forcedAlias source="sr_ME" target="sr_Cyrl_ME"/>
// A-B
af, am, ars, ar, as, az, be, bg, bn, bo, bs_Cyrl, bs,
<localeIds>
root,
// C-F
ca, ceb, chr, cs, cy, da, de_AT, de, dsb, dz, ee, el, en,
en_US_POSIX, en_US, eo, es, et, fa_AF, fa, fil, fi, fo, fr_CA, fr,
// A-B
af, am, ars, ar, as, az, be, bg, bn, bo, bs_Cyrl, bs,
// G-J
ga, gl, gu, ha, haw, he, hi, hr, hsb, hu, hy,
id_ID, id, ig, in, in_ID, is, it, iw_IL, iw, ja,
// C-F
ca, ceb, chr, cs, cy, da, de_AT, de, dsb, dz, ee, el, en,
en_US_POSIX, en_US, eo, es, et, fa_AF, fa, fil, fi, fo, fr_CA, fr,
// K-P
ka, kk, kl, km, kn, kok, ko, ku, ky, lb, lkt, ln, lo, lt, lv,
mk, ml, mn, mo, mr, ms, mt, my, nb, ne, nl, nn, no_NO, no,
om, or, pa_IN, pa, pa_Guru, pl, ps, pt,
// G-J
ga, gl, gu, ha, haw, he, hi, hr, hsb, hu, hy,
id_ID, id, ig, in, in_ID, is, it, iw_IL, iw, ja,
// R-T
ro, ru, se, sh_BA, sh_CS, sh, sh_YU, si, sk, sl, smn, sq,
sr_BA, sr_Cyrl_ME, sr_Latn, sr_ME, sr_RS, sr, sv, sw,
ta, te, th, tk, to, tr,
// K-P
ka, kk, kl, km, kn, kok, ko, ku, ky, lb, lkt, ln, lo, lt, lv,
mk, ml, mn, mo, mr, ms, mt, my, nb, ne, nl, nn, no_NO, no,
om, or, pa_IN, pa, pa_Guru, pl, ps, pt,
// U-Z
ug, uk, ur, uz, vi, wae, wo, xh, yi, yo, yue_CN, yue_Hans,
yue, zh_CN, zh_Hant, zh_HK, zh_MO, zh_SG, zh_TW, zh, zu
</directoryFilter>
// R-T
ro, ru, se, sh_BA, sh_CS, sh, sh_YU, si, sk, sl, smn, sq,
sr_BA, sr_Cyrl_ME, sr_Latn, sr_ME, sr_RS, sr, sv, sw,
ta, te, th, tk, to, tr,
<directoryFilter dir="rbnf">
root,
// U-Z
ug, uk, ur, uz, vi, wae, wo, xh, yi, yo, yue_CN, yue_Hans,
yue, zh_CN, zh_Hant, zh_HK, zh_MO, zh_SG, zh_TW, zh, zu
</localeIds>
</directory>
// A-E
af, ak, am, ars, ar, az, be, bg, bs, ca, ccp, chr, cs, cy,
da, de_CH, de, ee, el, en_001, en_IN, en, eo, es_419, es_DO,
es_GT, es_HN, es_MX, es_NI, es_PA, es_PR, es_SV, es, es_US, et,
<directory dir="rbnf">
<!-- It is not at all clear why this is being done. It's certainly not exactly the
same as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with
different data than "yue", so this alias is not just rewriting the base
language. -->
<!-- TODO: Find out and document this properly. -->
<forcedAlias source="zh_Hant_HK" target="yue"/>
// F-P
fa_AF, fa, ff, fil, fi, fo, fr_BE, fr_CH, fr, ga, he, hi, hr,
hu, hy, id, in, is, it, iw, ja, ka, kl, km, ko, ky, lb,
lo, lrc, lt, lv, mk, ms, mt, my, nb, nl, nn, no, pl, pt_PT, pt,
<localeIds>
root,
// Q-Z
qu, ro, ru, se, sh, sk, sl, sq, sr_Latn, sr, sv, sw, ta, th, tr,
uk, vi, yue_Hans, yue, zh_Hant_HK, zh_Hant, zh_HK, zh_MO, zh_TW, zh
</directoryFilter>
// A-E
af, ak, am, ars, ar, az, be, bg, bs, ca, ccp, chr, cs, cy,
da, de_CH, de, ee, el, en_001, en_IN, en, eo, es_419, es_DO,
es_GT, es_HN, es_MX, es_NI, es_PA, es_PR, es_SV, es, es_US, et,
<directoryFilter dir="brkitr">
root,
de, el, en, en_US_POSIX, en_US, es, fr, it, ja, pt, ru, zh_Hant, zh
</directoryFilter>
// F-P
fa_AF, fa, ff, fil, fi, fo, fr_BE, fr_CH, fr, ga, he, hi, hr,
hu, hy, id, in, is, it, iw, ja, ka, kl, km, ko, ky, lb,
lo, lrc, lt, lv, mk, ms, mt, my, nb, nl, nn, no, pl, pt_PT, pt,
<!-- The following elements configure some very special case locale alias behaviour,
mainly to support situations where the natural alias relationship is not wanted
for a particular type of data. -->
// Q-Z
qu, ro, ru, se, sh, sk, sl, sq, sr_Latn, sr, sv, sw, ta, th, tr,
uk, vi, yue_Hans, yue, zh_Hant_HK, zh_Hant, zh_HK, zh_MO, zh_TW, zh
</localeIds>
</directory>
<directory dir="brkitr" inheritLanguageSubtag="zh_Hant">
<localeIds>
root,
de, el, en, en_US_POSIX, en_US, es, fr, it, ja, pt, ru, zh_Hant, zh
</localeIds>
</directory>
<!-- GLOBAL ALIASES -->
<!-- Some spoken languages (e.g. "ars") inherit all their data from a written language
(e.g. "ar_SA"). However CLDR doesn't currently support a way to represent that
relationship. Unlike deprecated languages for which an alias can be inferred from
the "languageAlias" element, there's no way in CLDR to represent the fact that we
want "ars" (a non-deprecated language) to inherit the data of "ar_SA".
the "languageAlias" CLDR data, there's no way in CLDR to represent the fact that
we want "ars" (a non-deprecated language) to inherit the data of "ar_SA".
This alias is the first example of potentially many cases where ICU needs to
generate an alias in order to affect "sideways inheritence" for spoken languages,
and at some stage it should be supported properly in the CLDR data. -->
generate an alias in order to effect "sideways inheritance" for spoken languages,
and at some stage it should probably be supported properly in the CLDR data. -->
<forcedAlias source="ars" target="ar_SA"/>
<!-- A legacy global alias (note that "no_NO_NY" is not even structurally valid). -->
<forcedAlias source="no_NO_NY" target="nn_NO"/>
<!-- PER-DIRECTORY ALIASES (these are really special cases) -->
<!-- It is not at all clear why this is being done (we expect "sr_Latn_ME" normally). -->
<!-- TODO: Find out and document this properly. -->
<forcedAlias dir="coll" source="sr_ME" target="sr_Cyrl_ME"/>
<!-- This alias is to avoid needing to copy and maintain the same "zh" data for "yue".
The maximized versions of "yue_Hans" is "yue_Hans_CN" (vs "zh_Hans_CN"), and for
"yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the aliases are effectively just
rewriting the base language.
This is similar to the case for "ars"/"ar_SA" but it is not done globally, since
CLDR data does exist for "yue" and "yue_Hans" which is NOT the same as "zh_Hant"
and "zh_Hans"/"zh". This mapping is a bit more of a "hack" for the purposes of
reducing data duplication in ICU. -->
<forcedAlias dir="coll" source="yue_Hans" target="zh_Hans"/>
<forcedAlias dir="coll" source="yue" target="zh_Hant"/>
<!-- It is not at all clear why this is being done. It's certainly not exactly the same
as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with different
data than "yue", so this alias is not just rewriting the base language. -->
<!-- TODO: Find out and document this properly. -->
<forcedAlias dir="rbnf" source="zh_Hant_HK" target="yue"/>
<!-- ALTERNATE VALUES -->
<!-- The following elements configure alternate values for some special case paths.
The target path will only be replaced if both it, and the source path, exist in

View File

@ -45,6 +45,7 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
private boolean emitReport = false;
private final SetMultimap<IcuLocaleDir, String> localeIdsMap = TreeMultimap.create();
private final Table<IcuLocaleDir, String, String> forcedAliases = TreeBasedTable.create();
private final Table<IcuLocaleDir, String, String> forcedParents = TreeBasedTable.create();
/**
* Sets the output directory in which the ICU data directories and files will go. This is
@ -95,6 +96,11 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
return this;
}
/**
* Registers a forced parent for a locale ID within a single ICU data directory. The
* mapping is stored per directory, so the same locale ID may have different forced
* parents in different directories.
*
* @param dir the ICU locale directory the forced parent applies to.
* @param localeId the locale ID whose parent is being set.
* @param parent the locale ID to record as the parent of {@code localeId}.
* @return this builder.
*/
public Builder addForcedParent(IcuLocaleDir dir, String localeId, String parent) {
forcedParents.put(dir, localeId, parent);
return this;
}
/** Returns a converter config from the current builder state. */
public LdmlConverterConfig build() {
return new IcuConverterConfig(this);
@ -109,6 +115,7 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
private final ImmutableSet<String> allLocaleIds;
private final ImmutableSetMultimap<IcuLocaleDir, String> localeIdsMap;
private final ImmutableTable<IcuLocaleDir, String, String> forcedAliases;
private final ImmutableTable<IcuLocaleDir, String, String> forcedParents;
private IcuConverterConfig(Builder builder) {
this.outputDir = checkNotNull(builder.outputDir);
@ -128,6 +135,7 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
this.allLocaleIds = ImmutableSet.copyOf(builder.localeIdsMap.values());
this.localeIdsMap = ImmutableSetMultimap.copyOf(builder.localeIdsMap);
this.forcedAliases = ImmutableTable.copyOf(builder.forcedAliases);
this.forcedParents = ImmutableTable.copyOf(builder.forcedParents);
}
public static Builder builder() {
@ -164,6 +172,11 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
return forcedAliases.row(dir);
}
// Returns the row of the immutable forced-parent table for the given directory, i.e. a map
// from locale ID to the forced parent ID recorded for that directory.
@Override
public ImmutableMap<String, String> getForcedParents(IcuLocaleDir dir) {
return forcedParents.row(dir);
}
@Override public ImmutableSet<String> getAllLocaleIds() {
return allLocaleIds;
}

View File

@ -323,11 +323,16 @@ public final class LdmlConverter {
// The split data can still be empty for this directory, but that's expected (it
// might only be written because it has an explicit parent added below).
splitPaths.get(dir).forEach(p -> splitData.add(p, icuData.get(p)));
// If we add an explicit parent locale, it forces the data to be written.
parent.ifPresent(p -> {
splitData.add(RB_PARENT, p);
graphMetadata.get(dir).addParent(id, p);
});
// If we add an explicit parent locale, it forces the data to be written. This is
// where we check for forced overrides of the parent relationship (which is a per
// directory thing).
parent
.map(p -> config.getForcedParents(dir).getOrDefault(id, p))
.ifPresent(p -> {
splitData.add(RB_PARENT, p);
graphMetadata.get(dir).addParent(id, p);
});
if (!splitData.getPaths().isEmpty() || isBaseLanguage || dir.includeEmpty()) {
splitData.setVersion(CldrDataSupplier.getCldrVersionString());
@ -384,7 +389,7 @@ public final class LdmlConverter {
Map<String, String> aliasMap = new LinkedHashMap<>();
for (String id : localeIds) {
if (forcedAliases.keySet().contains(id)) {
if (forcedAliases.containsKey(id)) {
// Forced aliases will be added later and don't need to be processed here. This
// is especially necessary if the ID is not structurally valid (e.g. "no_NO_NY")
// since that cannot be processed by the code below.

View File

@ -96,13 +96,22 @@ public interface LdmlConverterConfig {
Set<String> getTargetLocaleIds(IcuLocaleDir dir);
/**
* Return a map of locale IDs which specifies aliases which are applied to the given
* directory in contradiction to the natural alias or parent ID which would otherwise
* be generated. This is a mechanism for restructuring the parent chain and linking
* locales together in non-standard and unexpected ways.
* Returns a map of locale IDs which specifies aliases which are applied to the given directory
* in contradiction to the natural alias which would otherwise be generated. This mechanism
* allows for restructuring locale relationships on a per directory basis for special-case
* behaviour (such as sharing data which would otherwise need to be copied).
*/
Map<String, String> getForcedAliases(IcuLocaleDir dir);
/**
* Returns a map from locale ID to parent locale ID, specifying parents which are applied to
* the given directory in contradiction to the natural parent which would otherwise be
* generated. This mechanism allows for restructuring locale relationships on a per directory
* basis for special-case behaviour (such as sharing data which would otherwise need to be
* copied).
*/
// TODO: Combine this and the forced aliases into a single mechanism at this level.
Map<String, String> getForcedParents(IcuLocaleDir dir);
/**
* Whether to emit a summary report for debug purposes after conversion is complete.
*/

View File

@ -65,9 +65,10 @@ public final class ConvertIcuDataTask extends Task {
private Path cldrPath;
private CldrDraftStatus minimumDraftStatus;
// Set of default locale ID specifiers (wildcard IDs which are expanded).
private ImmutableSet<String> localeIdSpec;
private LocaleIds localeIds = null;
// Per directory overrides (fully specified locale IDs).
private final SetMultimap<IcuLocaleDir, String> perDirectoryIds = HashMultimap.create();
private final SetMultimap<IcuLocaleDir, String> inheritLanguageSubtag = HashMultimap.create();
private final IcuConverterConfig.Builder config = IcuConverterConfig.builder();
// Don't try and resolve actual paths until inside the execute method.
private final List<AltPath> altPaths = new ArrayList<>();
@ -136,9 +137,11 @@ public final class ConvertIcuDataTask extends Task {
}
}
public static final class DirectoryFilter extends Task {
public static final class Directory extends Task {
private IcuLocaleDir dir;
private ImmutableSet<String> ids;
private ImmutableSet<String> inheritLanguageSubtag = ImmutableSet.of();
private final List<ForcedAlias> forcedAliases = new ArrayList<>();
private LocaleIds localeIds = null;
@SuppressWarnings("unused")
public void setDir(String directory) {
@ -146,27 +149,33 @@ public final class ConvertIcuDataTask extends Task {
}
@SuppressWarnings("unused")
public void addText(String localeIds) {
this.ids = parseLocaleIds(localeIds);
public void setInheritLanguageSubtag(String localeIds) {
this.inheritLanguageSubtag = parseLocaleIds(localeIds);
}
// Collects a <forcedAlias> element nested inside this <directory> element. The collected
// aliases are later registered against this directory only (unlike aliases declared outside
// any <directory>, which are applied to all directories).
@SuppressWarnings("unused")
public void addConfiguredForcedAlias(ForcedAlias alias) {
forcedAliases.add(alias);
}
/**
* Records the {@code <localeIds>} element nested inside this {@code <directory>} element.
* At most one such element is accepted; adding a second one is rejected via
* {@code checkBuild}.
*
* @param localeIds the configured {@code <localeIds>} element.
*/
@SuppressWarnings("unused")
public void addConfiguredLocaleIds(LocaleIds localeIds) {
// Message typo fixed: "more that one" -> "more than one".
checkBuild(this.localeIds == null,
"Cannot add more than one <localeIds> element for <directory>: %s", dir);
this.localeIds = localeIds;
}
@Override
public void init() throws BuildException {
checkBuild(dir != null, "Directory must be specified");
checkBuild(!ids.isEmpty(), "Locale IDs must be specified");
checkBuild(dir != null, "Directory attribute 'dir' must be specified");
checkBuild(localeIds != null, "<localeIds> must be specified for <directory>: %s", dir);
}
}
public static final class ForcedAlias extends Task {
private Optional<IcuLocaleDir> dir = Optional.empty();
private String source = "";
private String target = "";
@SuppressWarnings("unused")
public void setDir(String directory) {
this.dir = resolveDir(directory);
}
@SuppressWarnings("unused")
public void setSource(String source) {
this.source = whitespace().trimFrom(source);
@ -184,7 +193,6 @@ public final class ConvertIcuDataTask extends Task {
}
}
public static final class AltPath extends Task {
private String source = "";
private String target = "";
@ -214,23 +222,22 @@ public final class ConvertIcuDataTask extends Task {
@SuppressWarnings("unused")
public void addConfiguredLocaleIds(LocaleIds localeIds) {
checkBuild(this.localeIdSpec == null, "Cannot add more that one <localeIds> element");
this.localeIdSpec = localeIds.ids;
checkBuild(this.localeIds == null, "Cannot add more that one <localeIds> element");
this.localeIds = localeIds;
}
@SuppressWarnings("unused")
public void addConfiguredDirectoryFilter(DirectoryFilter filter) {
perDirectoryIds.putAll(filter.dir, filter.ids);
public void addConfiguredDirectory(Directory filter) {
perDirectoryIds.putAll(filter.dir, filter.localeIds.ids);
inheritLanguageSubtag.putAll(filter.dir, filter.inheritLanguageSubtag);
filter.forcedAliases.forEach(a -> config.addForcedAlias(filter.dir, a.source, a.target));
}
// Aliases on the outside are applied to all directories.
@SuppressWarnings("unused")
public void addConfiguredForcedAlias(ForcedAlias alias) {
if (alias.dir.isPresent()) {
config.addForcedAlias(alias.dir.get(), alias.source, alias.target);
} else {
for (IcuLocaleDir dir : IcuLocaleDir.values()) {
config.addForcedAlias(dir, alias.source, alias.target);
}
for (IcuLocaleDir dir : IcuLocaleDir.values()) {
config.addForcedAlias(dir, alias.source, alias.target);
}
}
@ -244,6 +251,8 @@ public final class ConvertIcuDataTask extends Task {
@SuppressWarnings("unused")
public void execute() throws BuildException {
checkBuild(localeIds != null, "<localeIds> must be specified");
CldrDataSupplier src = CldrDataSupplier
.forCldrFilesIn(cldrPath)
.withDraftStatusAtLeast(minimumDraftStatus);
@ -260,10 +269,20 @@ public final class ConvertIcuDataTask extends Task {
SupplementalData supplementalData = SupplementalData.create(src);
ImmutableSet<String> defaultTargetIds =
LocaleIdResolver.expandTargetIds(this.localeIdSpec, supplementalData);
LocaleIdResolver.expandTargetIds(this.localeIds.ids, supplementalData);
for (IcuLocaleDir dir : IcuLocaleDir.values()) {
Iterable<String> ids = perDirectoryIds.asMap().getOrDefault(dir, defaultTargetIds);
config.addLocaleIds(dir, Iterables.filter(ids, idFilter::test));
// We should only have locale IDs like "zh_Hant" here (language + script) and only
// those which would naturally inherit to "root"
inheritLanguageSubtag.get(dir).forEach(id -> {
checkArgument(id.matches("[a-z]{2}_[A-Z][a-z]{3}"),
"Invalid locale ID for inheritLanguageSubtag (expect '<lang>_<Script>'): ", id);
checkArgument(supplementalData.getParent(id).equals("root"),
"Invalid locale ID for inheritLanguageSubtag (parent must be 'root'): ", id);
config.addForcedParent(dir, id, id.substring(0, 2));
});
}
config.setMinimumDraftStatus(minimumDraftStatus);
LdmlConverter.convert(src, supplementalData, config.build());