ICU-20693 Adding Ant support for configuration of ICU data builds.

See #771
This commit is contained in:
David Beaumont 2019-08-28 11:14:31 +00:00 committed by pedberg-icu
parent b702db31df
commit cf4ce53541
7 changed files with 587 additions and 273 deletions

View File

@ -6,36 +6,20 @@
Basic instructions for running the LdmlConverter via Maven
==========================================================
Note that these instructions do not currently support configuration of the converter for things
such as limiting the set of files produced. That is supported in code and could be easily added
to the binary, or encapsulated via an Ant task, but currently it is not directly supported.
See the IcuConverterConfig class for the API by which this can be supported.
Important directories
---------------------
<CLDR_DIR> = The root directory of the CLDR release.
<ICU_DIR> = The root directory of the ICU release (probably a parent directory of where
this README file is located). This is an optional property and defaults to
the parent directory of the release from which it is run.
<DTD_CACHE> = The temporary cache directory in which DTD files are downloaded (this is the
same directory as would be used when running tools from the CLDR project).
Note that the need to specify this directory is scheduled to be removed after
ICU release 65.
<OUT_DIR> = The output directory into which ICU data files should be written.
Generating all ICU data
-----------------------
$ mvn exec:java \
-DCLDR_DIR='<CLDR_DIR>' \
-DCLDR_DTD_CACHE='<DTD_CACHE>' \
-Dexec.args='<OUT_DIR>'
First edit the Ant build file (build-icu-data.xml) to set the properties and locale IDs you need, then run:
$ CLDR_DIR=<CLDR_DIR> ant -f build-icu-data.xml
Running unit tests

View File

@ -0,0 +1,311 @@
<!-- © 2019 and later: Unicode, Inc. and others.
License & terms of use: http://www.unicode.org/copyright.html -->
<!--================================================================================
To build ICU data files:
1: Determine the CLDR base directory and set the CLDR_DIR environment variable.
2: Determine the flags required (see the list of properties below).
3: Run: ant -f build-icu-data.xml -D<flag-name>=<flag-value>...
================================================================================-->
<!-- TODO: Add things like copying of a template directory and deleting previous files
(perhaps always generate into a temporary directory and copy back to avoid having
inconsistent state when the conversion is cancelled). -->
<project name="Convert" default="convert" basedir=".">
<!-- Initialize the properties which were not already set on the command line. Each
<property> below only supplies a default; a value given as -D<name>=<value> when
invoking Ant takes precedence over it. -->
<target name="init-args">
<!-- Expose the process environment as properties prefixed with "env." so that
CLDR_DIR can be read as env.CLDR_DIR below. -->
<property environment="env"/>
<!-- Stop immediately, with an explanatory message, if CLDR_DIR is not set in the
environment. -->
<condition property="hascldrdir" >
<isset property="env.CLDR_DIR" />
</condition>
<fail unless="hascldrdir"
message="Please set the CLDR_DIR environment variable to the top level CLDR source dir (containing 'common')."/>
<!-- The base directory of the CLDR release from which CLDR data is obtained. For
legacy reasons, this must also match an environment variable called CLDR_DIR,
which is read here, so it is best to set this via the environment variable for
now. -->
<!-- TODO: Update this when the CLDR_DIR environment variable is no longer needed. -->
<property name="cldrDir" value="${env.CLDR_DIR}"/>
<!-- The output directory into which to write the converted ICU data. By default
this will overwrite (without deletion) the ICU data files in this ICU release,
so it is recommended that for testing, it be set to another value. -->
<property name="outDir" value="${basedir}/../../../icu4c/source/data/"/>
<!-- The directory in which the additional ICU XML data is stored. -->
<property name="specialsDir" value="${basedir}/../../../icu4c/source/data/xml"/>
<!-- The minimum draft status for CLDR data to be used in the conversion. See
CldrDraftStatus for more details. -->
<property name="minDraftStatus" value="contributed"/>
<!-- Whether to emit a debug report containing some possibly useful information after
the conversion has finished. -->
<!-- TODO: Currently this isn't hugely useful, so find out what people want. -->
<property name="emitReport" value="false"/>
<!-- List of output "types" to be generated (e.g. "rbnf,plurals,locales"); an empty
list means "build everything".
Note that the grouping of types is based on the legacy converter behaviour and
is not always directly associated with an output directory (e.g. "locales"
produces locale data for curr/, lang/, main/, region/, unit/, zone/ but NOT
coll/, brkitr/ or rbnf/).
You can also specify by DTD type (e.g. dtdBcp47, dtdSupplemental or dtdLdml)
which is still not quite directly associated with output directories either,
since some supplemental data is also written to the curr/ directory.
See LdmlConverter.OutputType for the full list of valid types. -->
<!-- TODO: Find out what people actually want here and switch to that. -->
<!-- NOTE(review): the <convert> element in the "convert" target of this file does not
reference ${outputTypes}, so setting this property currently appears to have no
effect. Confirm whether the task accepts an "outputTypes" attribute and, if so,
pass this property through. -->
<property name="outputTypes" value=""/>
</target>
<!-- Build a standalone JAR which is called by Ant (and which avoids needing to mess
about making Ant know the Maven class-path). Running "mvn compile" is sufficient
because the maven-assembly-plugin execution in pom.xml is bound to the compile phase
and produces the "jar-with-dependencies" artifact loaded by the "convert" target. -->
<target name="prepare-jar" depends="init-args">
    <!-- failonerror="true" stops the Ant build when Maven exits with a failure code.
    Without it a broken Maven build is silently ignored and the "convert" target
    then runs against a missing or stale JAR. -->
    <exec executable="mvn" searchpath="true" failonerror="true">
        <arg value="compile"/>
    </exec>
</target>
<!-- Do the actual CLDR data conversion, based on the command line arguments, built in
default properties and the configuration in the "<convert>" element below. -->
<target name="convert" depends="init-args, prepare-jar">
<taskdef name="convert" classname="org.unicode.icu.tool.cldrtoicu.ant.ConvertIcuDataTask">
<classpath>
<pathelement path="target/cldr-to-icu-1.0-SNAPSHOT-jar-with-dependencies.jar"/>
</classpath>
</taskdef>
<convert cldrDir="${cldrDir}" outputDir="${outDir}" specialsDir="${specialsDir}"
minimalDraftStatus="${minDraftStatus}" emitReport="${emitReport}">
<!-- It is not at all clear why this is being done (we expect "sr_Latn_ME" normally).
TODO: Find out and document this properly. -->
<forcedAlias dir="coll" source="sr_ME" target="sr_Cyrl_ME"/>
<!-- This appears to be a hack to avoid needing to copy and maintain the same "zh"
data for "yue". The files for "yue" in this directory should be empty otherwise.
The maximized version of "yue_Hans" is "yue_Hans_CN" (vs "zh_Hans_CN"), and for
"yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the aliases are effectively just
rewriting the base language. -->
<forcedAlias dir="coll" source="yue_Hans" target="zh_Hans"/>
<forcedAlias dir="coll" source="yue" target="zh_Hant"/>
<!-- It is not at all clear why this is being done. It's certainly not exactly the same
as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with different
data than "yue", so this alias is not just rewriting the base language.
TODO: Find out and document this properly. -->
<forcedAlias dir="rbnf" source="zh_Hant_HK" target="yue"/>
<!-- The primary set of locale IDs to be generated. Other, directory specific, sets exist
and do not have to be subsets of this. Some of these IDs are aliases, so XML files
may not exist for all of them. -->
<!-- TODO: Add locale ID inference to reduce this list considerably. -->
<localeIds dirs="curr,lang,locales,region,unit,zone">
root,
// A
af, af_NA, af_ZA, agq, agq_CM, ak, ak_GH, am, am_ET, ar, ar_001,
ar_AE, ar_BH, ar_DJ, ar_DZ, ar_EG, ar_EH, ar_ER, ar_IL, ar_IQ,
ar_JO, ar_KM, ar_KW, ar_LB, ar_LY, ar_MA, ar_MR, ar_OM, ar_PS,
ar_QA, ar_SA, ar_SD, ar_SO, ar_SS, ar_SY, ar_TD, ar_TN, ar_YE, ars,
as, as_IN, asa, asa_TZ, ast, ast_ES, az, az_AZ, az_Cyrl, az_Cyrl_AZ,
az_Latn, az_Latn_AZ,
// B
bas, bas_CM, be, be_BY, bem, bem_ZM, bez, bez_TZ, bg, bg_BG, bm,
bm_ML, bn, bn_BD, bn_IN, bo, bo_CN, bo_IN, br, br_FR, brx, brx_IN,
bs, bs_Cyrl, bs_Cyrl_BA, bs_Latn, bs_Latn_BA, bs_BA,
// C
ca, ca_AD, ca_ES, ca_FR, ca_IT, ccp, ccp_BD, ccp_IN, ce, ce_RU,
ceb, ceb_PH, cgg, cgg_UG, chr, chr_US, ckb, ckb_IQ, ckb_IR, cs,
cs_CZ, cy, cy_GB,
// D
da, da_DK, da_GL, dav, dav_KE, de, de_AT, de_BE, de_CH, de_DE,
de_IT, de_LI, de_LU, dje, dje_NE, dsb, dsb_DE, dua, dua_CM, dyo,
dyo_SN, dz, dz_BT,
// E
ebu, ebu_KE, ee, ee_GH, ee_TG, el, el_CY, el_GR, en, en_001,
en_150, en_AE, en_AG, en_AI, en_AS, en_AT, en_AU, en_BB, en_BE,
en_BI, en_BM, en_BS, en_BW, en_BZ, en_CA, en_CC, en_CH, en_CK,
en_CM, en_CX, en_CY, en_DE, en_DG, en_DK, en_DM, en_ER, en_FI,
en_FJ, en_FK, en_FM, en_GB, en_GD, en_GG, en_GH, en_GI, en_GM,
en_GU, en_GY, en_HK, en_IE, en_IL, en_IM, en_IN, en_IO, en_JE,
en_JM, en_KE, en_KI, en_KN, en_KY, en_LC, en_LR, en_LS, en_MG,
en_MH, en_MO, en_MP, en_MS, en_MT, en_MU, en_MW, en_MY, en_NA,
en_NF, en_NG, en_NH, en_NL, en_NR, en_NU, en_NZ, en_PG, en_PH,
en_PK, en_PN, en_PR, en_PW, en_RH, en_RW, en_SB, en_SC, en_SD,
en_SE, en_SG, en_SH, en_SI, en_SL, en_SS, en_SX, en_SZ, en_TC,
en_TK, en_TO, en_TT, en_TV, en_TZ, en_UG, en_UM, en_US, en_US_POSIX,
en_VC, en_VG, en_VI, en_VU, en_WS, en_ZA, en_ZM, en_ZW, eo,
eo_001, es, es_419, es_AR, es_BO, es_BR, es_BZ, es_CL, es_CO,
es_CR, es_CU, es_DO, es_EA, es_EC, es_ES, es_GQ, es_GT, es_HN,
es_IC, es_MX, es_NI, es_PA, es_PE, es_PH, es_PR, es_PY, es_SV,
es_US, es_UY, es_VE, et, et_EE, eu, eu_ES, ewo, ewo_CM,
// F
fa, fa_AF, fa_IR, ff, ff_CM, ff_GN, ff_Latn, ff_Latn_BF, ff_Latn_CM,
ff_Latn_GH, ff_Latn_GM, ff_Latn_GN, ff_Latn_GW, ff_Latn_LR, ff_Latn_MR,
ff_Latn_NE, ff_Latn_NG, ff_Latn_SL, ff_Latn_SN, ff_MR, ff_SN, fi,
fi_FI, fil, fil_PH, fo, fo_DK, fo_FO, fr, fr_BE, fr_BF, fr_BI,
fr_BJ, fr_BL, fr_CA, fr_CD, fr_CF, fr_CG, fr_CH, fr_CI, fr_CM,
fr_DJ, fr_DZ, fr_FR, fr_GA, fr_GF, fr_GN, fr_GP, fr_GQ, fr_HT,
fr_KM, fr_LU, fr_MA, fr_MC, fr_MF, fr_MG, fr_ML, fr_MQ, fr_MR,
fr_MU, fr_NC, fr_NE, fr_PF, fr_PM, fr_RE, fr_RW, fr_SC, fr_SN,
fr_SY, fr_TD, fr_TG, fr_TN, fr_VU, fr_WF, fr_YT, fur, fur_IT,
fy, fy_NL,
// G
ga, ga_IE, gd, gd_GB, gl, gl_ES, gsw, gsw_CH, gsw_FR, gsw_LI,
gu, gu_IN, guz, guz_KE, gv, gv_IM,
// H
ha, ha_GH, ha_NE, ha_NG, haw, haw_US, he, he_IL, hi, hi_IN,
hr, hr_BA, hr_HR, hsb, hsb_DE, hu, hu_HU, hy, hy_AM,
// I
ia, ia_001, id, id_ID, ig, ig_NG, ii, ii_CN, in, in_ID, is,
is_IS, it, it_CH, it_IT, it_SM, it_VA, iw, iw_IL,
// J
ja, ja_JP, ja_JP_TRADITIONAL, jgo, jgo_CM, jmc, jmc_TZ, jv, jv_ID,
// K
ka, ka_GE, kab, kab_DZ, kam, kam_KE, kde, kde_TZ, kea, kea_CV,
khq, khq_ML, ki, ki_KE, kk, kk_KZ, kkj, kkj_CM, kl, kl_GL, kln,
kln_KE, km, km_KH, kn, kn_IN, ko, ko_KP, ko_KR, kok, kok_IN,
ks, ks_IN, ksb, ksb_TZ, ksf, ksf_CM, ksh, ksh_DE, ku, ku_TR,
kw, kw_GB, ky, ky_KG,
// L
lag, lag_TZ, lb, lb_LU, lg, lg_UG, lkt, lkt_US, ln, ln_AO,
ln_CD, ln_CF, ln_CG, lo, lo_LA, lrc, lrc_IQ, lrc_IR, lt, lt_LT,
lu, lu_CD, luo, luo_KE, luy, luy_KE, lv, lv_LV,
// M
mas, mas_KE, mas_TZ, mer, mer_KE, mfe, mfe_MU, mg, mg_MG, mgh,
mgh_MZ, mgo, mgo_CM, mi, mi_NZ, mk, mk_MK, ml, ml_IN, mn,
mn_MN, mo, mr, mr_IN, ms, ms_BN, ms_MY, ms_SG, mt, mt_MT, mua,
mua_CM, my, my_MM, mzn, mzn_IR,
// N
naq, naq_NA, nb, nb_NO, nb_SJ, nd, nd_ZW, nds, nds_DE, nds_NL,
ne, ne_IN, ne_NP, nl, nl_AW, nl_BE, nl_BQ, nl_CW, nl_NL, nl_SR,
nl_SX, nmg, nmg_CM, nn, nn_NO, nnh, nnh_CM, no, no_NO, no_NO_NY,
nus, nus_SS, nyn, nyn_UG,
// O
om, om_ET, om_KE, or, or_IN, os, os_GE, os_RU,
// P
pa, pa_Arab, pa_Arab_PK, pa_Guru, pa_Guru_IN, pa_IN, pa_PK, pl,
pl_PL, ps, ps_AF, ps_PK, pt, pt_AO, pt_BR, pt_CH, pt_CV, pt_GQ,
pt_GW, pt_LU, pt_MO, pt_MZ, pt_PT, pt_ST, pt_TL,
// Q
qu, qu_BO, qu_EC, qu_PE,
// R
rm, rm_CH, rn, rn_BI, ro, ro_MD, ro_RO, rof, rof_TZ, ru,
ru_BY, ru_KG, ru_KZ, ru_MD, ru_RU, ru_UA, rw, rw_RW, rwk, rwk_TZ,
// S
sah, sah_RU, saq, saq_KE, sbp, sbp_TZ, sd, sd_PK, se, se_FI,
se_NO, se_SE, seh, seh_MZ, ses, ses_ML, sg, sg_CF, sh, sh_BA,
sh_CS, sh_YU, shi, shi_Latn, shi_Latn_MA, shi_Tfng, shi_Tfng_MA,
shi_MA, si, si_LK, sk, sk_SK, sl, sl_SI, smn, smn_FI, sn, sn_ZW,
so, so_DJ, so_ET, so_KE, so_SO, sq, sq_AL, sq_MK, sq_XK, sr,
sr_Cyrl, sr_Cyrl_BA, sr_Cyrl_ME, sr_Cyrl_RS, sr_Cyrl_CS, sr_Cyrl_XK,
sr_Cyrl_YU, sr_Latn, sr_Latn_BA, sr_Latn_ME, sr_Latn_RS, sr_Latn_CS,
sr_Latn_XK, sr_Latn_YU, sr_BA, sr_ME, sr_RS, sr_CS, sr_XK, sr_YU,
sv, sv_AX, sv_FI, sv_SE, sw, sw_CD, sw_KE, sw_TZ, sw_UG,
// T
ta, ta_IN, ta_LK, ta_MY, ta_SG, te, te_IN, teo, teo_KE, teo_UG,
tg, tg_TJ, th, th_TH, th_TH_TRADITIONAL, ti, ti_ER, ti_ET, tk,
tk_TM, tl, tl_PH, to, to_TO, tr, tr_CY, tr_TR, tt, tt_RU,
twq, twq_NE, tzm, tzm_MA,
// U
ug, ug_CN, uk, uk_UA, ur, ur_IN, ur_PK, uz, uz_AF, uz_Arab,
uz_Arab_AF, uz_Cyrl, uz_Cyrl_UZ, uz_Latn, uz_Latn_UZ, uz_UZ,
// V
vai, vai_Latn, vai_Latn_LR, vai_LR, vai_Vaii, vai_Vaii_LR, vi,
vi_VN, vun, vun_TZ,
// W
wae, wae_CH, wo, wo_SN,
// X
xh, xh_ZA, xog, xog_UG,
// Y
yav, yav_CM, yi, yi_001, yo, yo_BJ, yo_NG, yue, yue_CN, yue_HK,
yue_Hans, yue_Hans_CN, yue_Hant, yue_Hant_HK,
// Z
zgh, zgh_MA, zh, zh_Hans, zh_Hans_CN, zh_Hans_HK, zh_Hans_MO,
zh_Hans_SG, zh_Hant, zh_Hant_HK, zh_Hant_MO, zh_Hant_TW, zh_CN,
zh_HK, zh_MO, zh_SG, zh_TW, zu, zu_ZA
</localeIds>
<!-- TODO: Explain why these special cases are needed/different. -->
<localeIds dirs="coll">
root,
// A-B
af, am, ars, ar, as, az, be, bg, bn, bo, bs_Cyrl, bs,
// C-F
ca, ceb, chr, cs, cy, da, de_AT, de, dsb, dz, ee, el, en,
en_US_POSIX, en_US, eo, es, et, fa_AF, fa, fil, fi, fo, fr_CA, fr,
// G-J
ga, gl, gu, ha, haw, he, hi, hr, hsb, hu, hy,
id_ID, id, ig, in, in_ID, is, it, iw_IL, iw, ja,
// K-P
ka, kk, kl, km, kn, kok, ko, ku, ky, lb, lkt, ln, lo, lt, lv,
mk, ml, mn, mo, mr, ms, mt, my, nb, ne, nl, nn, no_NO, no,
om, or, pa_IN, pa, pa_Guru, pl, ps, pt,
// R-T
ro, ru, se, sh_BA, sh_CS, sh, sh_YU, si, sk, sl, smn, sq,
sr_BA, sr_Cyrl_ME, sr_Latn, sr_ME, sr_RS, sr, sv, sw,
ta, te, th, tk, to, tr,
// U-Z
ug, uk, ur, uz, vi, wae, wo, xh, yi, yo, yue_CN, yue_Hans,
yue, zh_CN, zh_Hant, zh_HK, zh_MO, zh_SG, zh_TW, zh, zu
</localeIds>
<localeIds dirs="rbnf">
root,
// A-E
af, ak, am, ars, ar, az, be, bg, bs, ca, ccp, chr, cs, cy,
da, de_CH, de, ee, el, en_001, en_IN, en, eo, es_419, es_DO,
es_GT, es_HN, es_MX, es_NI, es_PA, es_PR, es_SV, es, es_US, et,
// F-P
fa_AF, fa, ff, fil, fi, fo, fr_BE, fr_CH, fr, ga, he, hi, hr,
hu, hy, id, in, is, it, iw, ja, ka, kl, km, ko, ky, lb,
lo, lrc, lt, lv, mk, ms, mt, my, nb, nl, nn, no, pl, pt_PT, pt,
// Q-Z
qu, ro, ru, se, sh, sk, sl, sq, sr_Latn, sr, sv, sw, ta, th, tr,
uk, vi, yue_Hans, yue, zh_Hant_HK, zh_Hant, zh_HK, zh_MO, zh_TW, zh
</localeIds>
<localeIds dirs="brkitr">
root,
de, el, en, en_US_POSIX, en_US, es, fr, it, ja, pt, ru, zh_Hant, zh
</localeIds>
</convert>
</target>
</project>

View File

@ -8,6 +8,10 @@
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<groupId>org.unicode.icu</groupId>
<artifactId>cldr-to-icu</artifactId>
<version>1.0-SNAPSHOT</version>
@ -25,8 +29,11 @@
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.6.0</version>
<configuration>
<mainClass>org.unicode.icu.tool.cldrtoicu.LdmlConverter</mainClass>
<mainClass>
org.unicode.icu.tool.cldrtoicu.LdmlConverter
</mainClass>
<systemProperties>
<property>
<key>ICU_DIR</key>
@ -35,10 +42,35 @@
</systemProperties>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.1.1</version>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>single</goal>
</goals>
<configuration>
<archive>
<manifest>
<mainClass>
org.unicode.icu.tool.cldrtoicu.LdmlConverter
</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
<!-- This is where the snapshots of the CLDR API and additional auxilliary jars are held. -->
<!-- This is where the snapshots of the CLDR API and additional auxiliary JAR files are held. -->
<repositories>
<repository>
<id>local-maven-repo</id>
@ -79,5 +111,10 @@
<version>1.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.ant</groupId>
<artifactId>ant</artifactId>
<version>1.10.6</version>
</dependency>
</dependencies>
</project>
</project>

View File

@ -14,11 +14,16 @@ import java.util.Optional;
import java.util.Set;
import org.unicode.cldr.api.CldrDraftStatus;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSetMultimap;
import com.google.common.collect.ImmutableTable;
import com.google.common.collect.SetMultimap;
import com.google.common.collect.Table;
import com.google.common.collect.TreeBasedTable;
import com.google.common.collect.TreeMultimap;
/**
* The converter config intended to generate the standard ICU data files. This used to be something
* that was configured by text files such as "icu-locale-deprecates.xml" and "icu-config.
@ -34,15 +39,18 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
.map(d -> Paths.get(d).toAbsolutePath());
/** The builder with which to specify configuration for the {@link LdmlConverter}. */
@SuppressWarnings("UnusedReturnValue")
public static final class Builder {
private Path cldrDir = DEFAULT_CLDR_DIR.orElse(null);
private Path outputDir =
DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data")).orElse(null);
private Path specialsDir =
DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data/xml")).orElse(null);;
DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data/xml")).orElse(null);
private ImmutableSet<OutputType> outputTypes = OutputType.ALL;
private CldrDraftStatus minimalDraftStatus = CldrDraftStatus.CONTRIBUTED;
private boolean emitReport = false;
private final SetMultimap<IcuLocaleDir, String> localeIdsMap = TreeMultimap.create();
private final Table<IcuLocaleDir, String, String> forcedAliases = TreeBasedTable.create();
/**
* Sets the CLDR base directory from which to load all CLDR data. This is optional if the
@ -98,6 +106,16 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
return this;
}
/**
 * Adds the given locale IDs to the set of IDs registered for the specified ICU locale
 * directory. IDs accumulate across calls and are held in a set-valued multimap, so adding
 * the same ID to the same directory more than once has no additional effect.
 *
 * @param dir the ICU locale directory the IDs are registered for.
 * @param localeIds the locale IDs to add for {@code dir}.
 * @return this builder, to allow call chaining.
 */
public Builder addLocaleIds(IcuLocaleDir dir, Iterable<String> localeIds) {
localeIdsMap.putAll(dir, localeIds);
return this;
}
/**
 * Registers a forced alias from {@code source} to {@code target} for the specified ICU
 * locale directory. The mapping is keyed by directory and source, so a later call with the
 * same directory and source replaces the previously registered target.
 *
 * @param dir the ICU locale directory the alias applies to.
 * @param source the locale ID being aliased.
 * @param target the locale ID that {@code source} is mapped to.
 * @return this builder, to allow call chaining.
 */
public Builder addForcedAlias(IcuLocaleDir dir, String source, String target) {
forcedAliases.put(dir, source, target);
return this;
}
/** Returns a converter config from the current builder state. */
public LdmlConverterConfig build() {
return new IcuConverterConfig(this);
@ -110,6 +128,8 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
private final ImmutableSet<OutputType> outputTypes;
private final CldrDraftStatus minimalDraftStatus;
private final boolean emitReport;
private final ImmutableSetMultimap<IcuLocaleDir, String> localeIdsMap;
private final ImmutableTable<IcuLocaleDir, String, String> forcedAliases;
private IcuConverterConfig(Builder builder) {
this.cldrDir = checkNotNull(builder.cldrDir,
@ -135,247 +155,50 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
Arrays.asList(OutputType.values()));
this.minimalDraftStatus = builder.minimalDraftStatus;
this.emitReport = builder.emitReport;
this.localeIdsMap = ImmutableSetMultimap.copyOf(builder.localeIdsMap);
this.forcedAliases = ImmutableTable.copyOf(builder.forcedAliases);
}
public static Builder builder() {
return new Builder();
}
@Override public Path getCldrDirectory() {
@Override
public Path getCldrDirectory() {
return cldrDir;
}
@Override public Path getOutputDir() {
@Override
public Path getOutputDir() {
return outputDir;
}
@Override public Set<OutputType> getOutputTypes() {
@Override
public Set<OutputType> getOutputTypes() {
return outputTypes;
}
@Override public CldrDraftStatus getMinimumDraftStatus() {
@Override
public CldrDraftStatus getMinimumDraftStatus() {
return minimalDraftStatus;
}
@Override public Path getSpecialsDir() {
@Override
public Path getSpecialsDir() {
return specialsDir;
}
@Override public boolean emitReport() {
@Override
public boolean emitReport() {
return emitReport;
}
// Currently hard-coded "hacks" which could be encoded via the builder if wanted.
@Override public Map<String, String> getForcedAliases(IcuLocaleDir dir) {
switch (dir) {
case COLL:
return ImmutableMap.<String, String>builder()
// It is not at all clear why this is being done (we expect "sr_Latn_ME" normally).
// TODO: Find out and document this properly.
.put("sr_ME", "sr_Cyrl_ME")
// This appears to be a hack to avoid needing to copy and maintain the same "zh"
// data for "yue". The files for "yue" in this directory should be empty otherwise.
//
// The maximized versions of "yue_Hans" is "yue_Hans_CN" (vs "zh_Hans_CN"), and for
// "yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the aliases are effectively just
// rewriting the base language.
.put("yue_Hans", "zh_Hans")
.put("yue", "zh_Hant")
.build();
case RBNF:
// It is not at all clear why this is being done. It's certainly not exactly the same
// as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with different
// data than "yue", so this alias is not just rewriting the base language.
// TODO: Find out and document this properly.
return ImmutableMap.of("zh_Hant_HK", "yue");
default:
return ImmutableMap.of();
}
@Override
public Map<String, String> getForcedAliases(IcuLocaleDir dir) {
return forcedAliases.row(dir);
}
// This set of locale files in each directory denotes the supported/available locales for that
// API. In most cases, it's the same set, but a few directories support only a subset of IDs.
@Override public ImmutableSet<String> getTargetLocaleIds(IcuLocaleDir dir) {
switch (dir) {
case COLL:
return COLL_LOCALE_IDS;
case BRKITR:
return BRKITR_LOCALE_IDS;
case RBNF:
return RBNF_LOCALE_IDS;
default:
return ICU_LOCALE_IDS;
}
return localeIdsMap.get(dir);
}
// The primary set of locale IDs to be generated. Other, directory specific, sets should be
// subsets of this. Some of these ID are aliases, so XML files may not exist for all of them.
//
// This was further modified (in order to better match the set of generated ICU files) by:
// * Removing "es_003" (which just seems to be ignored in current code)
// * Adding: "en_NH", "sr_XK", "yue_CN", "yue_HK" (deprecated locale IDs in the manual config)
// * Adding: "no_NO_NY" (a not even structurally valid ID that exists for very legacy reasons)
private static final ImmutableSet<String> ICU_LOCALE_IDS = ImmutableSet.of(
"root",
// A
"af", "af_NA", "af_ZA", "agq", "agq_CM", "ak", "ak_GH", "am", "am_ET", "ar", "ar_001",
"ar_AE", "ar_BH", "ar_DJ", "ar_DZ", "ar_EG", "ar_EH", "ar_ER", "ar_IL", "ar_IQ",
"ar_JO", "ar_KM", "ar_KW", "ar_LB", "ar_LY", "ar_MA", "ar_MR", "ar_OM", "ar_PS",
"ar_QA", "ar_SA", "ar_SD", "ar_SO", "ar_SS", "ar_SY", "ar_TD", "ar_TN", "ar_YE", "ars",
"as", "as_IN", "asa", "asa_TZ", "ast", "ast_ES", "az", "az_AZ", "az_Cyrl", "az_Cyrl_AZ",
"az_Latn", "az_Latn_AZ",
// B
"bas", "bas_CM", "be", "be_BY", "bem", "bem_ZM", "bez", "bez_TZ", "bg", "bg_BG", "bm",
"bm_ML", "bn", "bn_BD", "bn_IN", "bo", "bo_CN", "bo_IN", "br", "br_FR", "brx", "brx_IN",
"bs", "bs_Cyrl", "bs_Cyrl_BA", "bs_Latn", "bs_Latn_BA", "bs_BA",
// C
"ca", "ca_AD", "ca_ES", "ca_FR", "ca_IT", "ccp", "ccp_BD", "ccp_IN", "ce", "ce_RU",
"ceb", "ceb_PH", "cgg", "cgg_UG", "chr", "chr_US", "ckb", "ckb_IQ", "ckb_IR", "cs",
"cs_CZ", "cy", "cy_GB",
// D
"da", "da_DK", "da_GL", "dav", "dav_KE", "de", "de_AT", "de_BE", "de_CH", "de_DE",
"de_IT", "de_LI", "de_LU", "dje", "dje_NE", "dsb", "dsb_DE", "dua", "dua_CM", "dyo",
"dyo_SN", "dz", "dz_BT",
// E
"ebu", "ebu_KE", "ee", "ee_GH", "ee_TG", "el", "el_CY", "el_GR", "en", "en_001",
"en_150", "en_AE", "en_AG", "en_AI", "en_AS", "en_AT", "en_AU", "en_BB", "en_BE",
"en_BI", "en_BM", "en_BS", "en_BW", "en_BZ", "en_CA", "en_CC", "en_CH", "en_CK",
"en_CM", "en_CX", "en_CY", "en_DE", "en_DG", "en_DK", "en_DM", "en_ER", "en_FI",
"en_FJ", "en_FK", "en_FM", "en_GB", "en_GD", "en_GG", "en_GH", "en_GI", "en_GM",
"en_GU", "en_GY", "en_HK", "en_IE", "en_IL", "en_IM", "en_IN", "en_IO", "en_JE",
"en_JM", "en_KE", "en_KI", "en_KN", "en_KY", "en_LC", "en_LR", "en_LS", "en_MG",
"en_MH", "en_MO", "en_MP", "en_MS", "en_MT", "en_MU", "en_MW", "en_MY", "en_NA",
"en_NF", "en_NG", "en_NH", "en_NL", "en_NR", "en_NU", "en_NZ", "en_PG", "en_PH",
"en_PK", "en_PN", "en_PR", "en_PW", "en_RH", "en_RW", "en_SB", "en_SC", "en_SD",
"en_SE", "en_SG", "en_SH", "en_SI", "en_SL", "en_SS", "en_SX", "en_SZ", "en_TC",
"en_TK", "en_TO", "en_TT", "en_TV", "en_TZ", "en_UG", "en_UM", "en_US", "en_US_POSIX",
"en_VC", "en_VG", "en_VI", "en_VU", "en_WS", "en_ZA", "en_ZM", "en_ZW", "eo",
"eo_001", "es", "es_419", "es_AR", "es_BO", "es_BR", "es_BZ", "es_CL", "es_CO",
"es_CR", "es_CU", "es_DO", "es_EA", "es_EC", "es_ES", "es_GQ", "es_GT", "es_HN",
"es_IC", "es_MX", "es_NI", "es_PA", "es_PE", "es_PH", "es_PR", "es_PY", "es_SV",
"es_US", "es_UY", "es_VE", "et", "et_EE", "eu", "eu_ES", "ewo", "ewo_CM",
// F
"fa", "fa_AF", "fa_IR", "ff", "ff_CM", "ff_GN", "ff_Latn", "ff_Latn_BF", "ff_Latn_CM",
"ff_Latn_GH", "ff_Latn_GM", "ff_Latn_GN", "ff_Latn_GW", "ff_Latn_LR", "ff_Latn_MR",
"ff_Latn_NE", "ff_Latn_NG", "ff_Latn_SL", "ff_Latn_SN", "ff_MR", "ff_SN", "fi",
"fi_FI", "fil", "fil_PH", "fo", "fo_DK", "fo_FO", "fr", "fr_BE", "fr_BF", "fr_BI",
"fr_BJ", "fr_BL", "fr_CA", "fr_CD", "fr_CF", "fr_CG", "fr_CH", "fr_CI", "fr_CM",
"fr_DJ", "fr_DZ", "fr_FR", "fr_GA", "fr_GF", "fr_GN", "fr_GP", "fr_GQ", "fr_HT",
"fr_KM", "fr_LU", "fr_MA", "fr_MC", "fr_MF", "fr_MG", "fr_ML", "fr_MQ", "fr_MR",
"fr_MU", "fr_NC", "fr_NE", "fr_PF", "fr_PM", "fr_RE", "fr_RW", "fr_SC", "fr_SN",
"fr_SY", "fr_TD", "fr_TG", "fr_TN", "fr_VU", "fr_WF", "fr_YT", "fur", "fur_IT",
"fy", "fy_NL",
// G
"ga", "ga_IE", "gd", "gd_GB", "gl", "gl_ES", "gsw", "gsw_CH", "gsw_FR", "gsw_LI",
"gu", "gu_IN", "guz", "guz_KE", "gv", "gv_IM",
// H
"ha", "ha_GH", "ha_NE", "ha_NG", "haw", "haw_US", "he", "he_IL", "hi", "hi_IN",
"hr", "hr_BA", "hr_HR", "hsb", "hsb_DE", "hu", "hu_HU", "hy", "hy_AM",
// I
"ia", "ia_001", "id", "id_ID", "ig", "ig_NG", "ii", "ii_CN", "in", "in_ID", "is",
"is_IS", "it", "it_CH", "it_IT", "it_SM", "it_VA", "iw", "iw_IL",
// J
"ja", "ja_JP", "ja_JP_TRADITIONAL", "jgo", "jgo_CM", "jmc", "jmc_TZ", "jv", "jv_ID",
// K
"ka", "ka_GE", "kab", "kab_DZ", "kam", "kam_KE", "kde", "kde_TZ", "kea", "kea_CV",
"khq", "khq_ML", "ki", "ki_KE", "kk", "kk_KZ", "kkj", "kkj_CM", "kl", "kl_GL", "kln",
"kln_KE", "km", "km_KH", "kn", "kn_IN", "ko", "ko_KP", "ko_KR", "kok", "kok_IN",
"ks", "ks_IN", "ksb", "ksb_TZ", "ksf", "ksf_CM", "ksh", "ksh_DE", "ku", "ku_TR",
"kw", "kw_GB", "ky", "ky_KG",
// L
"lag", "lag_TZ", "lb", "lb_LU", "lg", "lg_UG", "lkt", "lkt_US", "ln", "ln_AO",
"ln_CD", "ln_CF", "ln_CG", "lo", "lo_LA", "lrc", "lrc_IQ", "lrc_IR", "lt", "lt_LT",
"lu", "lu_CD", "luo", "luo_KE", "luy", "luy_KE", "lv", "lv_LV",
// M
"mas", "mas_KE", "mas_TZ", "mer", "mer_KE", "mfe", "mfe_MU", "mg", "mg_MG", "mgh",
"mgh_MZ", "mgo", "mgo_CM", "mi", "mi_NZ", "mk", "mk_MK", "ml", "ml_IN", "mn",
"mn_MN", "mo", "mr", "mr_IN", "ms", "ms_BN", "ms_MY", "ms_SG", "mt", "mt_MT", "mua",
"mua_CM", "my", "my_MM", "mzn", "mzn_IR",
// N
"naq", "naq_NA", "nb", "nb_NO", "nb_SJ", "nd", "nd_ZW", "nds", "nds_DE", "nds_NL",
"ne", "ne_IN", "ne_NP", "nl", "nl_AW", "nl_BE", "nl_BQ", "nl_CW", "nl_NL", "nl_SR",
"nl_SX", "nmg", "nmg_CM", "nn", "nn_NO", "nnh", "nnh_CM", "no", "no_NO", "no_NO_NY",
"nus", "nus_SS", "nyn", "nyn_UG",
// O
"om", "om_ET", "om_KE", "or", "or_IN", "os", "os_GE", "os_RU",
// P
"pa", "pa_Arab", "pa_Arab_PK", "pa_Guru", "pa_Guru_IN", "pa_IN", "pa_PK", "pl",
"pl_PL", "ps", "ps_AF", "ps_PK", "pt", "pt_AO", "pt_BR", "pt_CH", "pt_CV", "pt_GQ",
"pt_GW", "pt_LU", "pt_MO", "pt_MZ", "pt_PT", "pt_ST", "pt_TL",
// Q
"qu", "qu_BO", "qu_EC", "qu_PE",
// R
"rm", "rm_CH", "rn", "rn_BI", "ro", "ro_MD", "ro_RO", "rof", "rof_TZ", "ru",
"ru_BY", "ru_KG", "ru_KZ", "ru_MD", "ru_RU", "ru_UA", "rw", "rw_RW", "rwk", "rwk_TZ",
// S
"sah", "sah_RU", "saq", "saq_KE", "sbp", "sbp_TZ", "sd", "sd_PK", "se", "se_FI",
"se_NO", "se_SE", "seh", "seh_MZ", "ses", "ses_ML", "sg", "sg_CF", "sh", "sh_BA",
"sh_CS", "sh_YU", "shi", "shi_Latn", "shi_Latn_MA", "shi_Tfng", "shi_Tfng_MA",
"shi_MA", "si", "si_LK", "sk", "sk_SK", "sl", "sl_SI", "smn", "smn_FI", "sn",
"sn_ZW", "so", "so_DJ", "so_ET", "so_KE", "so_SO", "sq", "sq_AL", "sq_MK", "sq_XK",
"sr", "sr_Cyrl", "sr_Cyrl_BA", "sr_Cyrl_ME", "sr_Cyrl_RS", "sr_Cyrl_CS", "sr_Cyrl_XK",
"sr_Cyrl_YU", "sr_Latn", "sr_Latn_BA", "sr_Latn_ME", "sr_Latn_RS", "sr_Latn_CS",
"sr_Latn_XK", "sr_Latn_YU", "sr_BA", "sr_ME", "sr_RS", "sr_CS", "sr_XK", "sr_YU",
"sv", "sv_AX", "sv_FI", "sv_SE", "sw", "sw_CD", "sw_KE", "sw_TZ", "sw_UG",
// T
"ta", "ta_IN", "ta_LK", "ta_MY", "ta_SG", "te", "te_IN", "teo", "teo_KE", "teo_UG",
"tg", "tg_TJ", "th", "th_TH", "th_TH_TRADITIONAL", "ti", "ti_ER", "ti_ET", "tk",
"tk_TM", "tl", "tl_PH", "to", "to_TO", "tr", "tr_CY", "tr_TR", "tt", "tt_RU",
"twq", "twq_NE", "tzm", "tzm_MA",
// U
"ug", "ug_CN", "uk", "uk_UA", "ur", "ur_IN", "ur_PK", "uz", "uz_AF", "uz_Arab",
"uz_Arab_AF", "uz_Cyrl", "uz_Cyrl_UZ", "uz_Latn", "uz_Latn_UZ", "uz_UZ",
// V
"vai", "vai_Latn", "vai_Latn_LR", "vai_LR", "vai_Vaii", "vai_Vaii_LR", "vi",
"vi_VN", "vun", "vun_TZ",
// W
"wae", "wae_CH", "wo", "wo_SN",
// X
"xh", "xh_ZA", "xog", "xog_UG",
// Y
"yav", "yav_CM", "yi", "yi_001", "yo", "yo_BJ", "yo_NG", "yue", "yue_CN", "yue_HK",
"yue_Hans", "yue_Hans_CN", "yue_Hant", "yue_Hant_HK",
// Z
"zgh", "zgh_MA", "zh", "zh_Hans", "zh_Hans_CN", "zh_Hans_HK", "zh_Hans_MO",
"zh_Hans_SG", "zh_Hant", "zh_Hant_HK", "zh_Hant_MO", "zh_Hant_TW", "zh_CN",
"zh_HK", "zh_MO", "zh_SG", "zh_TW", "zu", "zu_ZA");
private static final ImmutableSet<String> COLL_LOCALE_IDS = ImmutableSet.of(
"root",
// A-B
"af", "am", "ars", "ar", "as", "az", "be", "bg", "bn", "bo", "bs_Cyrl", "bs",
// C-F
"ca", "ceb", "chr", "cs", "cy", "da", "de_AT", "de", "dsb", "dz", "ee", "el", "en",
"en_US_POSIX", "en_US", "eo", "es", "et", "fa_AF", "fa", "fil", "fi", "fo", "fr_CA", "fr",
// G-J
"ga", "gl", "gu", "ha", "haw", "he", "hi", "hr", "hsb", "hu", "hy",
"id_ID", "id", "ig", "in", "in_ID", "is", "it", "iw_IL", "iw", "ja",
// K-P
"ka", "kk", "kl", "km", "kn", "kok", "ko", "ku", "ky", "lb", "lkt", "ln", "lo", "lt", "lv",
"mk", "ml", "mn", "mo", "mr", "ms", "mt", "my", "nb", "ne", "nl", "nn", "no_NO", "no",
"om", "or", "pa_IN", "pa", "pa_Guru", "pl", "ps", "pt",
// R-T
"ro", "ru", "se", "sh_BA", "sh_CS", "sh", "sh_YU", "si", "sk", "sl", "smn", "sq",
"sr_BA", "sr_Cyrl_ME", "sr_Latn", "sr_ME", "sr_RS", "sr", "sv", "sw",
"ta", "te", "th", "tk", "to", "tr",
// U-Z
"ug", "uk", "ur", "uz", "vi", "wae", "wo", "xh", "yi", "yo", "yue_CN", "yue_Hans",
"yue", "zh_CN", "zh_Hant", "zh_HK", "zh_MO", "zh_SG", "zh_TW", "zh", "zu");
private static final ImmutableSet<String> BRKITR_LOCALE_IDS = ImmutableSet.of(
"root", "de", "el", "en", "en_US_POSIX", "en_US", "es", "fr", "it", "ja", "pt", "ru",
"zh_Hant", "zh");
private static final ImmutableSet<String> RBNF_LOCALE_IDS = ImmutableSet.of(
"root", "af", "ak", "am", "ars", "ar", "az", "be", "bg", "bs", "ca", "ccp", "chr", "cs",
"cy", "da", "de_CH", "de", "ee", "el", "en_001", "en_IN", "en", "eo", "es_419", "es_DO",
"es_GT", "es_HN", "es_MX", "es_NI", "es_PA", "es_PR", "es_SV", "es", "es_US", "et",
"fa_AF", "fa", "ff", "fil", "fi", "fo", "fr_BE", "fr_CH", "fr", "ga", "he", "hi", "hr",
"hu", "hy", "id", "in", "is", "it", "iw", "ja", "ka", "kl", "km", "ko", "ky", "lb",
"lo", "lrc", "lt", "lv", "mk", "ms", "mt", "my", "nb", "nl", "nn", "no", "pl", "pt_PT",
"pt", "qu", "ro", "ru", "se", "sh", "sk", "sl", "sq", "sr_Latn", "sr", "sv",
"sw", "ta", "th", "tr", "uk", "vi", "yue_Hans", "yue", "zh_Hant_HK", "zh_Hant", "zh_HK",
"zh_MO", "zh_TW", "zh");
}

View File

@ -4,6 +4,10 @@ package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static java.util.stream.Collectors.toList;
import static org.unicode.cldr.api.CldrDataType.BCP47;
import static org.unicode.cldr.api.CldrDataType.LDML;
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.BRKITR;
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.COLL;
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.CURR;
@ -13,10 +17,6 @@ import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.RB
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.REGION;
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.UNIT;
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.ZONE;
import static java.util.stream.Collectors.toList;
import static org.unicode.cldr.api.CldrDataType.BCP47;
import static org.unicode.cldr.api.CldrDataType.LDML;
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
import java.io.IOException;
import java.io.InputStream;
@ -42,17 +42,6 @@ import java.util.stream.Stream;
import org.unicode.cldr.api.CldrData;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataType;
import com.google.common.base.CharMatcher;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.LinkedListMultimap;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.SetMultimap;
import com.google.common.collect.Sets;
import com.google.common.io.CharStreams;
import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir;
import org.unicode.icu.tool.cldrtoicu.mapper.Bcp47Mapper;
import org.unicode.icu.tool.cldrtoicu.mapper.BreakIteratorMapper;
@ -66,6 +55,17 @@ import org.unicode.icu.tool.cldrtoicu.mapper.SupplementalMapper;
import org.unicode.icu.tool.cldrtoicu.mapper.TransformsMapper;
import org.unicode.icu.tool.cldrtoicu.regex.RegexTransformer;
import com.google.common.base.CharMatcher;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.LinkedListMultimap;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.SetMultimap;
import com.google.common.collect.Sets;
import com.google.common.io.CharStreams;
/**
* The main converter tool for CLDR to ICU data. To run this tool, you need to supply a suitable
* {@link LdmlConverterConfig} instance. There is a simple {@code main()} method available in this
@ -168,7 +168,7 @@ public final class LdmlConverter {
DAY_PERIODS(
SUPPLEMENTAL,
c -> c.processDayPeriods("misc")),
LdmlConverter::processDayPeriods),
GENDER_LIST(
SUPPLEMENTAL,
c -> c.processSupplemental("genderList", GENDER_LIST_PATHS, "misc", false)),
@ -192,19 +192,19 @@ public final class LdmlConverter {
c -> c.processSupplemental("numberingSystems", NUMBERING_SYSTEMS_PATHS, "misc", false)),
PLURALS(
SUPPLEMENTAL,
c -> c.processPlurals("misc")),
LdmlConverter::processPlurals),
PLURAL_RANGES(
SUPPLEMENTAL,
c -> c.processPluralRanges("misc")),
LdmlConverter::processPluralRanges),
WINDOWS_ZONES(
SUPPLEMENTAL,
c -> c.processSupplemental("windowsZones", WINDOWS_ZONES_PATHS, "misc", false)),
TRANSFORMS(
SUPPLEMENTAL,
c -> c.processTransforms("translit")),
LdmlConverter::processTransforms),
KEY_TYPE_DATA(
BCP47,
c -> c.processKeyTypeData("misc")),
LdmlConverter::processKeyTypeData),
// Batching by type.
DTD_LDML(LDML, c -> c.processAll(LDML)),
@ -231,7 +231,8 @@ public final class LdmlConverter {
}
}
private static void convert(LdmlConverterConfig config) {
/** Converts CLDR data according to the given configuration. */
public static void convert(LdmlConverterConfig config) {
CldrDataSupplier src = CldrDataSupplier
.forCldrFilesIn(config.getCldrDirectory())
.withDraftStatusAtLeast(config.getMinimumDraftStatus());
@ -480,24 +481,24 @@ public final class LdmlConverter {
return idx == -1 ? segment : segment.substring(0, idx);
}
private void processDayPeriods(String dir) {
write(DayPeriodsMapper.process(src), dir);
private void processDayPeriods() {
write(DayPeriodsMapper.process(src), "misc");
}
private void processPlurals(String dir) {
write(PluralsMapper.process(src), dir);
private void processPlurals() {
write(PluralsMapper.process(src), "misc");
}
private void processPluralRanges(String dir) {
write(PluralRangesMapper.process(src), dir);
private void processPluralRanges() {
write(PluralRangesMapper.process(src), "misc");
}
private void processKeyTypeData(String dir) {
Bcp47Mapper.process(src).forEach(d -> write(d, dir));
private void processKeyTypeData() {
Bcp47Mapper.process(src).forEach(d -> write(d, "misc"));
}
private void processTransforms(String dir) {
Path transformDir = createDirectory(config.getOutputDir().resolve(dir));
private void processTransforms() {
Path transformDir = createDirectory(config.getOutputDir().resolve("translit"));
write(TransformsMapper.process(src, transformDir), transformDir);
}

View File

@ -7,9 +7,9 @@ import java.util.Map;
import java.util.Set;
import org.unicode.cldr.api.CldrDraftStatus;
import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
import com.google.common.base.Ascii;
import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
/** API for configuring the LDML converter. */
public interface LdmlConverterConfig {

View File

@ -0,0 +1,158 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.ant;
import static com.google.common.base.CharMatcher.inRange;
import static com.google.common.base.CharMatcher.is;
import static com.google.common.base.CharMatcher.whitespace;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static java.util.stream.Collectors.joining;
import java.nio.file.Path;
import java.util.Arrays;
import org.apache.tools.ant.BuildException;
import org.apache.tools.ant.Task;
import org.unicode.cldr.api.CldrDraftStatus;
import org.unicode.icu.tool.cldrtoicu.IcuConverterConfig;
import org.unicode.icu.tool.cldrtoicu.LdmlConverter;
import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir;
import com.google.common.base.Ascii;
import com.google.common.base.CaseFormat;
import com.google.common.base.CharMatcher;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;
// Note: Auto-magical Ant methods are listed as "unused" by IDEs, unless the warning is suppressed.
/**
 * Ant task which collects configuration for the {@link LdmlConverter} and runs it.
 *
 * <p>Attribute setters and nested element "adders" in this class are invoked reflectively by Ant
 * while the build file is processed; each one simply forwards its (validated) value to an
 * {@link IcuConverterConfig.Builder}. The conversion itself is only started by {@link #execute()}.
 *
 * <p>Enumeration values given in the build file may be written in any of the forms
 * {@code UPPER_UNDERSCORE}, {@code lower_underscore}, {@code lowerCamel} or {@code UpperCamel};
 * see {@link #resolve(Class, String)}.
 */
public final class ConvertIcuDataTask extends Task {
    // Splits comma or newline separated lists, trimming whitespace and dropping empty elements.
    private static final Splitter LIST_SPLITTER =
        Splitter.on(CharMatcher.anyOf(",\n")).trimResults(whitespace()).omitEmptyStrings();

    // Character classes used to detect which case format an enumeration name was written in.
    private static final CharMatcher DIGIT_OR_UNDERSCORE = inRange('0', '9').or(is('_'));
    private static final CharMatcher UPPER_UNDERSCORE = inRange('A', 'Z').or(DIGIT_OR_UNDERSCORE);
    private static final CharMatcher LOWER_UNDERSCORE = inRange('a', 'z').or(DIGIT_OR_UNDERSCORE);
    private static final CharMatcher VALID_ENUM_CHAR = LOWER_UNDERSCORE.or(UPPER_UNDERSCORE);

    private final IcuConverterConfig.Builder config = IcuConverterConfig.builder();

    /** Sets the directory into which converted data is written. */
    @SuppressWarnings("unused")
    public void setOutputDir(Path path) {
        config.setOutputDir(path);
    }

    /** Sets the CLDR directory from which source data is read. */
    @SuppressWarnings("unused")
    public void setCldrDir(Path path) {
        config.setCldrDir(path);
    }

    /**
     * Sets the minimal draft status, given as the name of a {@link CldrDraftStatus} value.
     *
     * @throws IllegalArgumentException if the name does not resolve to a known value.
     */
    @SuppressWarnings("unused")
    public void setMinimalDraftStatus(String status) {
        config.setMinimalDraftStatus(resolve(CldrDraftStatus.class, status));
    }

    /**
     * Sets the output types to be generated, given as a comma or newline separated list of
     * {@link LdmlConverter.OutputType} names.
     *
     * @throws IllegalArgumentException if any name does not resolve to a known value.
     */
    @SuppressWarnings("unused")
    public void setOutputTypes(String types) {
        config.setOutputTypes(
            LIST_SPLITTER
                .splitToList(types).stream()
                .map(s -> resolve(LdmlConverter.OutputType.class, s))
                .collect(toImmutableList()));
    }

    /** Sets the "specials" directory passed through to the converter configuration. */
    @SuppressWarnings("unused")
    public void setSpecialsDir(Path path) {
        config.setSpecialsDir(path);
    }

    /** Sets the "emit report" flag passed through to the converter configuration. */
    @SuppressWarnings("unused")
    public void setEmitReport(boolean emit) {
        config.setEmitReport(emit);
    }

    /**
     * Nested element associating a list of locale IDs (the element's text) with one or more
     * ICU locale directories (the {@code dirs} attribute).
     */
    public static final class LocaleIds extends Task {
        private ImmutableList<IcuLocaleDir> dirs = ImmutableList.of();
        private ImmutableList<String> ids = ImmutableList.of();

        /**
         * Sets the directories to which the locale IDs apply, given as a comma or newline
         * separated list of {@link IcuLocaleDir} names.
         */
        @SuppressWarnings("unused")
        public void setDirs(String directories) {
            this.dirs = LIST_SPLITTER.splitToList(directories).stream()
                .map(s -> resolve(IcuLocaleDir.class, s))
                .collect(toImmutableList());
        }

        /**
         * Sets the locale IDs from the element's text: a comma or newline separated list in which
         * {@code //} starts a comment that runs to the end of the line.
         */
        @SuppressWarnings("unused")
        public void addText(String localeIds) {
            // Strip '//' style end-of-line comments first. Only the comment itself is removed (the
            // line terminator is kept) so that the elements either side of it are never joined,
            // and so that a comment on the final line, which has no trailing newline, is also
            // removed rather than being parsed as a locale ID.
            localeIds = localeIds.replaceAll("//[^\n]*", "");
            this.ids = ImmutableList.copyOf(LIST_SPLITTER.splitToList(localeIds));
        }
    }

    /** Nested element describing a forced alias from one locale ID to another in a directory. */
    public static final class ForcedAlias extends Task {
        private IcuLocaleDir dir;
        private String source;
        private String target;

        /** Sets the directory of the alias, given as the name of an {@link IcuLocaleDir} value. */
        @SuppressWarnings("unused")
        public void setDir(String directory) {
            this.dir = resolve(IcuLocaleDir.class, directory);
        }

        /** Sets the source locale ID of the alias. */
        @SuppressWarnings("unused")
        public void setSource(String source) {
            this.source = checkNotNull(source);
        }

        /** Sets the target locale ID of the alias. */
        @SuppressWarnings("unused")
        public void setTarget(String target) {
            this.target = checkNotNull(target);
        }
    }

    /** Adds the locale IDs of a fully configured nested element to each of its directories. */
    @SuppressWarnings("unused")
    public void addConfiguredLocaleIds(LocaleIds localeIds) {
        localeIds.dirs.forEach(d -> config.addLocaleIds(d, localeIds.ids));
    }

    /**
     * Adds the forced alias described by a fully configured nested element.
     *
     * @throws NullPointerException if any of the element's attributes was not set.
     */
    @SuppressWarnings("unused")
    public void addConfiguredForcedAlias(ForcedAlias alias) {
        // The setters are only called for attributes which are present, so report which one is
        // missing here rather than passing an unexplained null on to the builder.
        checkNotNull(alias.dir, "forced alias is missing its 'dir' attribute");
        checkNotNull(alias.source, "forced alias is missing its 'source' attribute");
        checkNotNull(alias.target, "forced alias is missing its 'target' attribute");
        config.addForcedAlias(alias.dir, alias.source, alias.target);
    }

    /** Builds the accumulated configuration and runs the converter with it. */
    @Override
    public void execute() throws BuildException {
        LdmlConverter.convert(config.build());
    }

    /**
     * Resolves a user supplied name to a value of the given enum class.
     *
     * <p>The name may be written as {@code UPPER_UNDERSCORE}, {@code lower_underscore},
     * {@code lowerCamel} or {@code UpperCamel}; it is converted to {@code UPPER_UNDERSCORE} before
     * being looked up. Mixed case names containing an underscore are rejected.
     *
     * @param enumClass the enum type in which to look up the value.
     * @param name the non-empty name, consisting only of ASCII letters, digits and underscores.
     * @return the matching enum value.
     * @throws IllegalArgumentException if the name is malformed or does not match any value.
     */
    private static <T extends Enum<T>> T resolve(Class<T> enumClass, String name) {
        checkArgument(!name.isEmpty(), "enumeration name cannot be empty");
        checkArgument(VALID_ENUM_CHAR.matchesAllOf(name),
            "invalid enumeration name '%s'; expected only ASCII letters, digits or '_'", name);
        CaseFormat format;
        if (UPPER_UNDERSCORE.matchesAllOf(name)) {
            format = CaseFormat.UPPER_UNDERSCORE;
        } else if (LOWER_UNDERSCORE.matchesAllOf(name)) {
            format = CaseFormat.LOWER_UNDERSCORE;
        } else {
            // Mixed case with '_' is not permitted.
            checkArgument(!name.contains("_"),
                "invalid enumeration name '%s'; mixed case with underscore not allowed", name);
            format =
                Ascii.isLowerCase(name.charAt(0)) ? CaseFormat.LOWER_CAMEL : CaseFormat.UPPER_CAMEL;
        }
        try {
            return Enum.valueOf(enumClass, format.to(CaseFormat.UPPER_UNDERSCORE, name));
        } catch (IllegalArgumentException e) {
            String validNames =
                Arrays.stream(enumClass.getEnumConstants())
                    .map(Object::toString)
                    .collect(joining(", "));
            // Keep the original exception as the cause so no diagnostic information is lost.
            throw new IllegalArgumentException(
                "invalid enumeration name " + name + "; expected one of: " + validNames, e);
        }
    }
}