ICU-20693 Adding Ant support for configuration of ICU data builds.
See #771
This commit is contained in:
parent
b702db31df
commit
cf4ce53541
@ -6,36 +6,20 @@
|
||||
Basic instructions for running the LdmlConverter via Maven
|
||||
==========================================================
|
||||
|
||||
Note that these instructions do not currently support configuration of the converter for things
|
||||
such as limiting the set of files produced. That is supported in code and could be easily added
|
||||
to the binary, or encapsulated via an Ant task, but currently it is not directly supported.
|
||||
See the IcuConverterConfig class for the API by which this can be supported.
|
||||
|
||||
|
||||
Important directories
|
||||
---------------------
|
||||
|
||||
<CLDR_DIR> = The root directory of the CLDR release.
|
||||
|
||||
<ICU_DIR> = The root directory of the ICU release (probably a parent directory of where
|
||||
this README file is located). This is an optional property and defaults to
|
||||
the parent directory of the release from which it is run.
|
||||
|
||||
<DTD_CACHE> = The temporary cache directory in which DTD files are downloaded (this is the
|
||||
same directory as would be used when running tools from the CLDR project).
|
||||
Note that the need to specify this directory is scheduled to be removed after
|
||||
ICU release 65.
|
||||
|
||||
<OUT_DIR> = The output directory into which ICU data files should be written.
|
||||
|
||||
|
||||
Generating all ICU data
|
||||
-----------------------
|
||||
|
||||
$ mvn exec:java \
|
||||
-DCLDR_DIR='<CLDR_DIR>' \
|
||||
-DCLDR_DTD_CACHE='<DTD_CACHE>' \
|
||||
-Dexec.args='<OUT_DIR>'
|
||||
First edit the Ant build file (build-icu-data.xml) to set the configuration you require, then run:
|
||||
|
||||
$ CLDR_DIR=<CLDR_DIR> ant -f build-icu-data.xml
|
||||
|
||||
|
||||
Running unit tests
|
||||
|
311
tools/cldr/cldr-to-icu/build-icu-data.xml
Normal file
311
tools/cldr/cldr-to-icu/build-icu-data.xml
Normal file
@ -0,0 +1,311 @@
|
||||
<!-- © 2019 and later: Unicode, Inc. and others.
|
||||
License & terms of use: http://www.unicode.org/copyright.html -->
|
||||
|
||||
<!--================================================================================
|
||||
To build ICU data files:
|
||||
1: Determine the CLDR base directory and set the CLDR_DIR environment variable.
|
||||
2: Determine the flags required (see the list of properties below).
|
||||
3: Run: ant -f build-icu-data.xml -D<flag-name>=<flag-value>...
|
||||
================================================================================-->
|
||||
<!-- TODO: Add things like copying of a template directory and deleting previous files
|
||||
(perhaps always generate into a temporary directory and copy back to avoid having
|
||||
inconsistent state when the conversion is cancelled). -->
|
||||
<project name="Convert" default="convert" basedir=".">
|
||||
<!-- Initialize the properties which were not already set on the command line. -->
|
||||
<target name="init-args">
|
||||
<property environment="env"/>
|
||||
<condition property="hascldrdir" >
|
||||
<isset property="env.CLDR_DIR" />
|
||||
</condition>
|
||||
<fail unless="hascldrdir"
|
||||
message="Please set the CLDR_DIR environment variable to the top level CLDR source dir (containing 'common')."/>
|
||||
|
||||
<!-- The base directory of the CLDR release from which CLDR data is obtained. For
|
||||
legacy reasons, this must also match an environment variable called CLDR_DIR,
|
||||
which is read here, so it is best to set this via the environment variable for
|
||||
now. -->
|
||||
<!-- TODO: Update this when the CLDR_DIR environment variable is no longer needed. -->
|
||||
<property name="cldrDir" value="${env.CLDR_DIR}"/>
|
||||
|
||||
<!-- The output directory into which to write the converted ICU data. By default
|
||||
this will overwrite (without deletion) the ICU data files in this ICU release,
|
||||
so it is recommended that for testing, it be set to another value. -->
|
||||
<property name="outDir" value="${basedir}/../../../icu4c/source/data/"/>
|
||||
|
||||
<!-- The directory in which the additional ICU XML data is stored. -->
|
||||
<property name="specialsDir" value="${basedir}/../../../icu4c/source/data/xml"/>
|
||||
|
||||
<!-- The minimum draft status for CLDR data to be used in the conversion. See
|
||||
CldrDraftStatus for more details. -->
|
||||
<property name="minDraftStatus" value="contributed"/>
|
||||
|
||||
<!-- Whether to emit a debug report containing some possibly useful information after
|
||||
the conversion has finished. -->
|
||||
<!-- TODO: Currently this isn't hugely useful, so find out what people want. -->
|
||||
<property name="emitReport" value="false"/>
|
||||
|
||||
<!-- List of output "types" to be generated (e.g. "rbnf,plurals,locales"); an empty
|
||||
list means "build everything".
|
||||
|
||||
Note that the grouping of types is based on the legacy converter behaviour and
|
||||
is not always directly associated with an output directory (e.g. "locales"
|
||||
produces locale data for curr/, lang/, main/, region/, unit/, zone/ but NOT
|
||||
coll/, brkitr/ or rbnf/).
|
||||
|
||||
You can also specify by DTD type (e.g. dtdBcp47, dtdSupplemental or dtdLdml)
|
||||
which is still not quite directly associated with output directories either,
|
||||
since some supplemental data is also written to the curr/ directory.
|
||||
|
||||
See LdmlConverter.OutputType for the full list of valid types. -->
|
||||
<!-- TODO: Find out what people actually want here and switch to that. -->
|
||||
<property name="outputTypes" value=""/>
|
||||
</target>
|
||||
|
||||
<!-- Build a standalone JAR which is called by Ant (and which avoids needing to mess
|
||||
about making Ant know the Maven class-path). -->
|
||||
<target name="prepare-jar" depends="init-args">
|
||||
<exec executable="mvn" searchpath="true">
|
||||
<arg value="compile"/>
|
||||
</exec>
|
||||
</target>
|
||||
|
||||
<!-- Do the actual CLDR data conversion, based on the command line arguments, built in
|
||||
default properties and the configuration in the "<convert>" element below. -->
|
||||
<target name="convert" depends="init-args, prepare-jar">
|
||||
<taskdef name="convert" classname="org.unicode.icu.tool.cldrtoicu.ant.ConvertIcuDataTask">
|
||||
<classpath>
|
||||
<pathelement path="target/cldr-to-icu-1.0-SNAPSHOT-jar-with-dependencies.jar"/>
|
||||
</classpath>
|
||||
</taskdef>
|
||||
<convert cldrDir="${cldrDir}" outputDir="${outDir}" specialsDir="${specialsDir}"
|
||||
minimalDraftStatus="${minDraftStatus}" emitReport="${emitReport}">
|
||||
|
||||
<!-- It is not at all clear why this is being done (we expect "sr_Latn_ME" normally).
|
||||
TODO: Find out and document this properly. -->
|
||||
<forcedAlias dir="coll" source="sr_ME" target="sr_Cyrl_ME"/>
|
||||
|
||||
<!-- This appears to be a hack to avoid needing to copy and maintain the same "zh"
|
||||
data for "yue". The files for "yue" in this directory should be empty otherwise.
|
||||
The maximized version of "yue_Hans" is "yue_Hans_CN" (vs "zh_Hans_CN"), and for
|
||||
"yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the aliases are effectively just
|
||||
rewriting the base language. -->
|
||||
<forcedAlias dir="coll" source="yue_Hans" target="zh_Hans"/>
|
||||
<forcedAlias dir="coll" source="yue" target="zh_Hant"/>
|
||||
|
||||
<!-- It is not at all clear why this is being done. It's certainly not exactly the same
|
||||
as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with different
|
||||
data than "yue", so this alias is not just rewriting the base language.
|
||||
TODO: Find out and document this properly. -->
|
||||
<forcedAlias dir="rbnf" source="zh_Hant_HK" target="yue"/>
|
||||
|
||||
<!-- The primary set of locale IDs to be generated. Other, directory specific, sets exist
|
||||
and do not have to be subsets of this. Some of these IDs are aliases, so XML files
|
||||
may not exist for all of them. -->
|
||||
<!-- TODO: Add locale ID inference to reduce this list considerably. -->
|
||||
<localeIds dirs="curr,lang,locales,region,unit,zone">
|
||||
root,
|
||||
|
||||
// A
|
||||
af, af_NA, af_ZA, agq, agq_CM, ak, ak_GH, am, am_ET, ar, ar_001,
|
||||
ar_AE, ar_BH, ar_DJ, ar_DZ, ar_EG, ar_EH, ar_ER, ar_IL, ar_IQ,
|
||||
ar_JO, ar_KM, ar_KW, ar_LB, ar_LY, ar_MA, ar_MR, ar_OM, ar_PS,
|
||||
ar_QA, ar_SA, ar_SD, ar_SO, ar_SS, ar_SY, ar_TD, ar_TN, ar_YE, ars,
|
||||
as, as_IN, asa, asa_TZ, ast, ast_ES, az, az_AZ, az_Cyrl, az_Cyrl_AZ,
|
||||
az_Latn, az_Latn_AZ,
|
||||
|
||||
// B
|
||||
bas, bas_CM, be, be_BY, bem, bem_ZM, bez, bez_TZ, bg, bg_BG, bm,
|
||||
bm_ML, bn, bn_BD, bn_IN, bo, bo_CN, bo_IN, br, br_FR, brx, brx_IN,
|
||||
bs, bs_Cyrl, bs_Cyrl_BA, bs_Latn, bs_Latn_BA, bs_BA,
|
||||
|
||||
// C
|
||||
ca, ca_AD, ca_ES, ca_FR, ca_IT, ccp, ccp_BD, ccp_IN, ce, ce_RU,
|
||||
ceb, ceb_PH, cgg, cgg_UG, chr, chr_US, ckb, ckb_IQ, ckb_IR, cs,
|
||||
cs_CZ, cy, cy_GB,
|
||||
|
||||
// D
|
||||
da, da_DK, da_GL, dav, dav_KE, de, de_AT, de_BE, de_CH, de_DE,
|
||||
de_IT, de_LI, de_LU, dje, dje_NE, dsb, dsb_DE, dua, dua_CM, dyo,
|
||||
dyo_SN, dz, dz_BT,
|
||||
|
||||
// E
|
||||
ebu, ebu_KE, ee, ee_GH, ee_TG, el, el_CY, el_GR, en, en_001,
|
||||
en_150, en_AE, en_AG, en_AI, en_AS, en_AT, en_AU, en_BB, en_BE,
|
||||
en_BI, en_BM, en_BS, en_BW, en_BZ, en_CA, en_CC, en_CH, en_CK,
|
||||
en_CM, en_CX, en_CY, en_DE, en_DG, en_DK, en_DM, en_ER, en_FI,
|
||||
en_FJ, en_FK, en_FM, en_GB, en_GD, en_GG, en_GH, en_GI, en_GM,
|
||||
en_GU, en_GY, en_HK, en_IE, en_IL, en_IM, en_IN, en_IO, en_JE,
|
||||
en_JM, en_KE, en_KI, en_KN, en_KY, en_LC, en_LR, en_LS, en_MG,
|
||||
en_MH, en_MO, en_MP, en_MS, en_MT, en_MU, en_MW, en_MY, en_NA,
|
||||
en_NF, en_NG, en_NH, en_NL, en_NR, en_NU, en_NZ, en_PG, en_PH,
|
||||
en_PK, en_PN, en_PR, en_PW, en_RH, en_RW, en_SB, en_SC, en_SD,
|
||||
en_SE, en_SG, en_SH, en_SI, en_SL, en_SS, en_SX, en_SZ, en_TC,
|
||||
en_TK, en_TO, en_TT, en_TV, en_TZ, en_UG, en_UM, en_US, en_US_POSIX,
|
||||
en_VC, en_VG, en_VI, en_VU, en_WS, en_ZA, en_ZM, en_ZW, eo,
|
||||
eo_001, es, es_419, es_AR, es_BO, es_BR, es_BZ, es_CL, es_CO,
|
||||
es_CR, es_CU, es_DO, es_EA, es_EC, es_ES, es_GQ, es_GT, es_HN,
|
||||
es_IC, es_MX, es_NI, es_PA, es_PE, es_PH, es_PR, es_PY, es_SV,
|
||||
es_US, es_UY, es_VE, et, et_EE, eu, eu_ES, ewo, ewo_CM,
|
||||
|
||||
// F
|
||||
fa, fa_AF, fa_IR, ff, ff_CM, ff_GN, ff_Latn, ff_Latn_BF, ff_Latn_CM,
|
||||
ff_Latn_GH, ff_Latn_GM, ff_Latn_GN, ff_Latn_GW, ff_Latn_LR, ff_Latn_MR,
|
||||
ff_Latn_NE, ff_Latn_NG, ff_Latn_SL, ff_Latn_SN, ff_MR, ff_SN, fi,
|
||||
fi_FI, fil, fil_PH, fo, fo_DK, fo_FO, fr, fr_BE, fr_BF, fr_BI,
|
||||
fr_BJ, fr_BL, fr_CA, fr_CD, fr_CF, fr_CG, fr_CH, fr_CI, fr_CM,
|
||||
fr_DJ, fr_DZ, fr_FR, fr_GA, fr_GF, fr_GN, fr_GP, fr_GQ, fr_HT,
|
||||
fr_KM, fr_LU, fr_MA, fr_MC, fr_MF, fr_MG, fr_ML, fr_MQ, fr_MR,
|
||||
fr_MU, fr_NC, fr_NE, fr_PF, fr_PM, fr_RE, fr_RW, fr_SC, fr_SN,
|
||||
fr_SY, fr_TD, fr_TG, fr_TN, fr_VU, fr_WF, fr_YT, fur, fur_IT,
|
||||
fy, fy_NL,
|
||||
|
||||
// G
|
||||
ga, ga_IE, gd, gd_GB, gl, gl_ES, gsw, gsw_CH, gsw_FR, gsw_LI,
|
||||
gu, gu_IN, guz, guz_KE, gv, gv_IM,
|
||||
|
||||
// H
|
||||
ha, ha_GH, ha_NE, ha_NG, haw, haw_US, he, he_IL, hi, hi_IN,
|
||||
hr, hr_BA, hr_HR, hsb, hsb_DE, hu, hu_HU, hy, hy_AM,
|
||||
|
||||
// I
|
||||
ia, ia_001, id, id_ID, ig, ig_NG, ii, ii_CN, in, in_ID, is,
|
||||
is_IS, it, it_CH, it_IT, it_SM, it_VA, iw, iw_IL,
|
||||
|
||||
// J
|
||||
ja, ja_JP, ja_JP_TRADITIONAL, jgo, jgo_CM, jmc, jmc_TZ, jv, jv_ID,
|
||||
|
||||
// K
|
||||
ka, ka_GE, kab, kab_DZ, kam, kam_KE, kde, kde_TZ, kea, kea_CV,
|
||||
khq, khq_ML, ki, ki_KE, kk, kk_KZ, kkj, kkj_CM, kl, kl_GL, kln,
|
||||
kln_KE, km, km_KH, kn, kn_IN, ko, ko_KP, ko_KR, kok, kok_IN,
|
||||
ks, ks_IN, ksb, ksb_TZ, ksf, ksf_CM, ksh, ksh_DE, ku, ku_TR,
|
||||
kw, kw_GB, ky, ky_KG,
|
||||
|
||||
// L
|
||||
lag, lag_TZ, lb, lb_LU, lg, lg_UG, lkt, lkt_US, ln, ln_AO,
|
||||
ln_CD, ln_CF, ln_CG, lo, lo_LA, lrc, lrc_IQ, lrc_IR, lt, lt_LT,
|
||||
lu, lu_CD, luo, luo_KE, luy, luy_KE, lv, lv_LV,
|
||||
|
||||
// M
|
||||
mas, mas_KE, mas_TZ, mer, mer_KE, mfe, mfe_MU, mg, mg_MG, mgh,
|
||||
mgh_MZ, mgo, mgo_CM, mi, mi_NZ, mk, mk_MK, ml, ml_IN, mn,
|
||||
mn_MN, mo, mr, mr_IN, ms, ms_BN, ms_MY, ms_SG, mt, mt_MT, mua,
|
||||
mua_CM, my, my_MM, mzn, mzn_IR,
|
||||
|
||||
// N
|
||||
naq, naq_NA, nb, nb_NO, nb_SJ, nd, nd_ZW, nds, nds_DE, nds_NL,
|
||||
ne, ne_IN, ne_NP, nl, nl_AW, nl_BE, nl_BQ, nl_CW, nl_NL, nl_SR,
|
||||
nl_SX, nmg, nmg_CM, nn, nn_NO, nnh, nnh_CM, no, no_NO, no_NO_NY,
|
||||
nus, nus_SS, nyn, nyn_UG,
|
||||
|
||||
// O
|
||||
om, om_ET, om_KE, or, or_IN, os, os_GE, os_RU,
|
||||
|
||||
// P
|
||||
pa, pa_Arab, pa_Arab_PK, pa_Guru, pa_Guru_IN, pa_IN, pa_PK, pl,
|
||||
pl_PL, ps, ps_AF, ps_PK, pt, pt_AO, pt_BR, pt_CH, pt_CV, pt_GQ,
|
||||
pt_GW, pt_LU, pt_MO, pt_MZ, pt_PT, pt_ST, pt_TL,
|
||||
|
||||
// Q
|
||||
qu, qu_BO, qu_EC, qu_PE,
|
||||
|
||||
// R
|
||||
rm, rm_CH, rn, rn_BI, ro, ro_MD, ro_RO, rof, rof_TZ, ru,
|
||||
ru_BY, ru_KG, ru_KZ, ru_MD, ru_RU, ru_UA, rw, rw_RW, rwk, rwk_TZ,
|
||||
|
||||
// S
|
||||
sah, sah_RU, saq, saq_KE, sbp, sbp_TZ, sd, sd_PK, se, se_FI,
|
||||
se_NO, se_SE, seh, seh_MZ, ses, ses_ML, sg, sg_CF, sh, sh_BA,
|
||||
sh_CS, sh_YU, shi, shi_Latn, shi_Latn_MA, shi_Tfng, shi_Tfng_MA,
|
||||
shi_MA, si, si_LK, sk, sk_SK, sl, sl_SI, smn, smn_FI, sn, sn_ZW,
|
||||
so, so_DJ, so_ET, so_KE, so_SO, sq, sq_AL, sq_MK, sq_XK, sr,
|
||||
sr_Cyrl, sr_Cyrl_BA, sr_Cyrl_ME, sr_Cyrl_RS, sr_Cyrl_CS, sr_Cyrl_XK,
|
||||
sr_Cyrl_YU, sr_Latn, sr_Latn_BA, sr_Latn_ME, sr_Latn_RS, sr_Latn_CS,
|
||||
sr_Latn_XK, sr_Latn_YU, sr_BA, sr_ME, sr_RS, sr_CS, sr_XK, sr_YU,
|
||||
sv, sv_AX, sv_FI, sv_SE, sw, sw_CD, sw_KE, sw_TZ, sw_UG,
|
||||
|
||||
// T
|
||||
ta, ta_IN, ta_LK, ta_MY, ta_SG, te, te_IN, teo, teo_KE, teo_UG,
|
||||
tg, tg_TJ, th, th_TH, th_TH_TRADITIONAL, ti, ti_ER, ti_ET, tk,
|
||||
tk_TM, tl, tl_PH, to, to_TO, tr, tr_CY, tr_TR, tt, tt_RU,
|
||||
twq, twq_NE, tzm, tzm_MA,
|
||||
|
||||
// U
|
||||
ug, ug_CN, uk, uk_UA, ur, ur_IN, ur_PK, uz, uz_AF, uz_Arab,
|
||||
uz_Arab_AF, uz_Cyrl, uz_Cyrl_UZ, uz_Latn, uz_Latn_UZ, uz_UZ,
|
||||
|
||||
// V
|
||||
vai, vai_Latn, vai_Latn_LR, vai_LR, vai_Vaii, vai_Vaii_LR, vi,
|
||||
vi_VN, vun, vun_TZ,
|
||||
|
||||
// W
|
||||
wae, wae_CH, wo, wo_SN,
|
||||
|
||||
// X
|
||||
xh, xh_ZA, xog, xog_UG,
|
||||
|
||||
// Y
|
||||
yav, yav_CM, yi, yi_001, yo, yo_BJ, yo_NG, yue, yue_CN, yue_HK,
|
||||
yue_Hans, yue_Hans_CN, yue_Hant, yue_Hant_HK,
|
||||
|
||||
// Z
|
||||
zgh, zgh_MA, zh, zh_Hans, zh_Hans_CN, zh_Hans_HK, zh_Hans_MO,
|
||||
zh_Hans_SG, zh_Hant, zh_Hant_HK, zh_Hant_MO, zh_Hant_TW, zh_CN,
|
||||
zh_HK, zh_MO, zh_SG, zh_TW, zu, zu_ZA
|
||||
</localeIds>
|
||||
|
||||
<!-- TODO: Explain why these special cases are needed/different. -->
|
||||
<localeIds dirs="coll">
|
||||
root,
|
||||
|
||||
// A-B
|
||||
af, am, ars, ar, as, az, be, bg, bn, bo, bs_Cyrl, bs,
|
||||
|
||||
// C-F
|
||||
ca, ceb, chr, cs, cy, da, de_AT, de, dsb, dz, ee, el, en,
|
||||
en_US_POSIX, en_US, eo, es, et, fa_AF, fa, fil, fi, fo, fr_CA, fr,
|
||||
|
||||
// G-J
|
||||
ga, gl, gu, ha, haw, he, hi, hr, hsb, hu, hy,
|
||||
id_ID, id, ig, in, in_ID, is, it, iw_IL, iw, ja,
|
||||
|
||||
// K-P
|
||||
ka, kk, kl, km, kn, kok, ko, ku, ky, lb, lkt, ln, lo, lt, lv,
|
||||
mk, ml, mn, mo, mr, ms, mt, my, nb, ne, nl, nn, no_NO, no,
|
||||
om, or, pa_IN, pa, pa_Guru, pl, ps, pt,
|
||||
|
||||
// R-T
|
||||
ro, ru, se, sh_BA, sh_CS, sh, sh_YU, si, sk, sl, smn, sq,
|
||||
sr_BA, sr_Cyrl_ME, sr_Latn, sr_ME, sr_RS, sr, sv, sw,
|
||||
ta, te, th, tk, to, tr,
|
||||
|
||||
// U-Z
|
||||
ug, uk, ur, uz, vi, wae, wo, xh, yi, yo, yue_CN, yue_Hans,
|
||||
yue, zh_CN, zh_Hant, zh_HK, zh_MO, zh_SG, zh_TW, zh, zu
|
||||
</localeIds>
|
||||
|
||||
<localeIds dirs="rbnf">
|
||||
root,
|
||||
|
||||
// A-E
|
||||
af, ak, am, ars, ar, az, be, bg, bs, ca, ccp, chr, cs, cy,
|
||||
da, de_CH, de, ee, el, en_001, en_IN, en, eo, es_419, es_DO,
|
||||
es_GT, es_HN, es_MX, es_NI, es_PA, es_PR, es_SV, es, es_US, et,
|
||||
|
||||
// F-P
|
||||
fa_AF, fa, ff, fil, fi, fo, fr_BE, fr_CH, fr, ga, he, hi, hr,
|
||||
hu, hy, id, in, is, it, iw, ja, ka, kl, km, ko, ky, lb,
|
||||
lo, lrc, lt, lv, mk, ms, mt, my, nb, nl, nn, no, pl, pt_PT, pt,
|
||||
|
||||
// Q-Z
|
||||
qu, ro, ru, se, sh, sk, sl, sq, sr_Latn, sr, sv, sw, ta, th, tr,
|
||||
uk, vi, yue_Hans, yue, zh_Hant_HK, zh_Hant, zh_HK, zh_MO, zh_TW, zh
|
||||
</localeIds>
|
||||
|
||||
<localeIds dirs="brkitr">
|
||||
root,
|
||||
de, el, en, en_US_POSIX, en_US, es, fr, it, ja, pt, ru, zh_Hant, zh
|
||||
</localeIds>
|
||||
</convert>
|
||||
</target>
|
||||
</project>
|
@ -8,6 +8,10 @@
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
</properties>
|
||||
|
||||
<groupId>org.unicode.icu</groupId>
|
||||
<artifactId>cldr-to-icu</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
@ -25,8 +29,11 @@
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>exec-maven-plugin</artifactId>
|
||||
<version>1.6.0</version>
|
||||
<configuration>
|
||||
<mainClass>org.unicode.icu.tool.cldrtoicu.LdmlConverter</mainClass>
|
||||
<mainClass>
|
||||
org.unicode.icu.tool.cldrtoicu.LdmlConverter
|
||||
</mainClass>
|
||||
<systemProperties>
|
||||
<property>
|
||||
<key>ICU_DIR</key>
|
||||
@ -35,10 +42,35 @@
|
||||
</systemProperties>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-assembly-plugin</artifactId>
|
||||
<version>3.1.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>compile</phase>
|
||||
<goals>
|
||||
<goal>single</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<archive>
|
||||
<manifest>
|
||||
<mainClass>
|
||||
org.unicode.icu.tool.cldrtoicu.LdmlConverter
|
||||
</mainClass>
|
||||
</manifest>
|
||||
</archive>
|
||||
<descriptorRefs>
|
||||
<descriptorRef>jar-with-dependencies</descriptorRef>
|
||||
</descriptorRefs>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
<!-- This is where the snapshots of the CLDR API and additional auxilliary jars are held. -->
|
||||
<!-- This is where the snapshots of the CLDR API and additional auxiliary JAR files are held. -->
|
||||
<repositories>
|
||||
<repository>
|
||||
<id>local-maven-repo</id>
|
||||
@ -79,5 +111,10 @@
|
||||
<version>1.0</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.ant</groupId>
|
||||
<artifactId>ant</artifactId>
|
||||
<version>1.10.6</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
@ -14,11 +14,16 @@ import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import org.unicode.cldr.api.CldrDraftStatus;
|
||||
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import com.google.common.collect.ImmutableSet;
|
||||
import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
|
||||
|
||||
import com.google.common.collect.ImmutableSet;
|
||||
import com.google.common.collect.ImmutableSetMultimap;
|
||||
import com.google.common.collect.ImmutableTable;
|
||||
import com.google.common.collect.SetMultimap;
|
||||
import com.google.common.collect.Table;
|
||||
import com.google.common.collect.TreeBasedTable;
|
||||
import com.google.common.collect.TreeMultimap;
|
||||
|
||||
/**
|
||||
* The converter config intended to generate the standard ICU data files. This used to be something
|
||||
* that was configured by text files such as "icu-locale-deprecates.xml" and "icu-config.
|
||||
@ -34,15 +39,18 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
|
||||
.map(d -> Paths.get(d).toAbsolutePath());
|
||||
|
||||
/** The builder with which to specify configuration for the {@link LdmlConverter}. */
|
||||
@SuppressWarnings("UnusedReturnValue")
|
||||
public static final class Builder {
|
||||
private Path cldrDir = DEFAULT_CLDR_DIR.orElse(null);
|
||||
private Path outputDir =
|
||||
DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data")).orElse(null);
|
||||
private Path specialsDir =
|
||||
DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data/xml")).orElse(null);;
|
||||
DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data/xml")).orElse(null);
|
||||
private ImmutableSet<OutputType> outputTypes = OutputType.ALL;
|
||||
private CldrDraftStatus minimalDraftStatus = CldrDraftStatus.CONTRIBUTED;
|
||||
private boolean emitReport = false;
|
||||
private final SetMultimap<IcuLocaleDir, String> localeIdsMap = TreeMultimap.create();
|
||||
private final Table<IcuLocaleDir, String, String> forcedAliases = TreeBasedTable.create();
|
||||
|
||||
/**
|
||||
* Sets the CLDR base directory from which to load all CLDR data. This is optional if the
|
||||
@ -98,6 +106,16 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addLocaleIds(IcuLocaleDir dir, Iterable<String> localeIds) {
|
||||
localeIdsMap.putAll(dir, localeIds);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addForcedAlias(IcuLocaleDir dir, String source, String target) {
|
||||
forcedAliases.put(dir, source, target);
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Returns a converter config from the current builder state. */
|
||||
public LdmlConverterConfig build() {
|
||||
return new IcuConverterConfig(this);
|
||||
@ -110,6 +128,8 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
|
||||
private final ImmutableSet<OutputType> outputTypes;
|
||||
private final CldrDraftStatus minimalDraftStatus;
|
||||
private final boolean emitReport;
|
||||
private final ImmutableSetMultimap<IcuLocaleDir, String> localeIdsMap;
|
||||
private final ImmutableTable<IcuLocaleDir, String, String> forcedAliases;
|
||||
|
||||
private IcuConverterConfig(Builder builder) {
|
||||
this.cldrDir = checkNotNull(builder.cldrDir,
|
||||
@ -135,247 +155,50 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
|
||||
Arrays.asList(OutputType.values()));
|
||||
this.minimalDraftStatus = builder.minimalDraftStatus;
|
||||
this.emitReport = builder.emitReport;
|
||||
this.localeIdsMap = ImmutableSetMultimap.copyOf(builder.localeIdsMap);
|
||||
this.forcedAliases = ImmutableTable.copyOf(builder.forcedAliases);
|
||||
}
|
||||
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
@Override public Path getCldrDirectory() {
|
||||
@Override
|
||||
public Path getCldrDirectory() {
|
||||
return cldrDir;
|
||||
}
|
||||
|
||||
@Override public Path getOutputDir() {
|
||||
@Override
|
||||
public Path getOutputDir() {
|
||||
return outputDir;
|
||||
}
|
||||
|
||||
@Override public Set<OutputType> getOutputTypes() {
|
||||
@Override
|
||||
public Set<OutputType> getOutputTypes() {
|
||||
return outputTypes;
|
||||
}
|
||||
|
||||
@Override public CldrDraftStatus getMinimumDraftStatus() {
|
||||
@Override
|
||||
public CldrDraftStatus getMinimumDraftStatus() {
|
||||
return minimalDraftStatus;
|
||||
}
|
||||
|
||||
@Override public Path getSpecialsDir() {
|
||||
@Override
|
||||
public Path getSpecialsDir() {
|
||||
return specialsDir;
|
||||
}
|
||||
|
||||
@Override public boolean emitReport() {
|
||||
@Override
|
||||
public boolean emitReport() {
|
||||
return emitReport;
|
||||
}
|
||||
|
||||
// Currently hard-coded "hacks" which could be encoded via the builder if wanted.
|
||||
|
||||
@Override public Map<String, String> getForcedAliases(IcuLocaleDir dir) {
|
||||
switch (dir) {
|
||||
case COLL:
|
||||
return ImmutableMap.<String, String>builder()
|
||||
// It is not at all clear why this is being done (we expect "sr_Latn_ME" normally).
|
||||
// TODO: Find out and document this properly.
|
||||
.put("sr_ME", "sr_Cyrl_ME")
|
||||
|
||||
// This appears to be a hack to avoid needing to copy and maintain the same "zh"
|
||||
// data for "yue". The files for "yue" in this directory should be empty otherwise.
|
||||
//
|
||||
// The maximized versions of "yue_Hans" is "yue_Hans_CN" (vs "zh_Hans_CN"), and for
|
||||
// "yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the aliases are effectively just
|
||||
// rewriting the base language.
|
||||
.put("yue_Hans", "zh_Hans")
|
||||
.put("yue", "zh_Hant")
|
||||
.build();
|
||||
case RBNF:
|
||||
// It is not at all clear why this is being done. It's certainly not exactly the same
|
||||
// as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with different
|
||||
// data than "yue", so this alias is not just rewriting the base language.
|
||||
// TODO: Find out and document this properly.
|
||||
return ImmutableMap.of("zh_Hant_HK", "yue");
|
||||
default:
|
||||
return ImmutableMap.of();
|
||||
}
|
||||
@Override
|
||||
public Map<String, String> getForcedAliases(IcuLocaleDir dir) {
|
||||
return forcedAliases.row(dir);
|
||||
}
|
||||
|
||||
// This set of locale files in each directory denotes the supported/available locales for that
|
||||
// API. In most cases, it's the same set, but a few directories support only a subset of IDs.
|
||||
@Override public ImmutableSet<String> getTargetLocaleIds(IcuLocaleDir dir) {
|
||||
switch (dir) {
|
||||
case COLL:
|
||||
return COLL_LOCALE_IDS;
|
||||
case BRKITR:
|
||||
return BRKITR_LOCALE_IDS;
|
||||
case RBNF:
|
||||
return RBNF_LOCALE_IDS;
|
||||
default:
|
||||
return ICU_LOCALE_IDS;
|
||||
return localeIdsMap.get(dir);
|
||||
}
|
||||
}
|
||||
|
||||
// The primary set of locale IDs to be generated. Other, directory specific, sets should be
|
||||
// subsets of this. Some of these IDs are aliases, so XML files may not exist for all of them.
|
||||
//
|
||||
// This was further modified (in order to better match the set of generated ICU files) by:
|
||||
// * Removing "es_003" (which just seems to be ignored in current code)
|
||||
// * Adding: "en_NH", "sr_XK", "yue_CN", "yue_HK" (deprecated locale IDs in the manual config)
|
||||
// * Adding: "no_NO_NY" (a not even structurally valid ID that exists for very legacy reasons)
|
||||
private static final ImmutableSet<String> ICU_LOCALE_IDS = ImmutableSet.of(
|
||||
"root",
|
||||
// A
|
||||
"af", "af_NA", "af_ZA", "agq", "agq_CM", "ak", "ak_GH", "am", "am_ET", "ar", "ar_001",
|
||||
"ar_AE", "ar_BH", "ar_DJ", "ar_DZ", "ar_EG", "ar_EH", "ar_ER", "ar_IL", "ar_IQ",
|
||||
"ar_JO", "ar_KM", "ar_KW", "ar_LB", "ar_LY", "ar_MA", "ar_MR", "ar_OM", "ar_PS",
|
||||
"ar_QA", "ar_SA", "ar_SD", "ar_SO", "ar_SS", "ar_SY", "ar_TD", "ar_TN", "ar_YE", "ars",
|
||||
"as", "as_IN", "asa", "asa_TZ", "ast", "ast_ES", "az", "az_AZ", "az_Cyrl", "az_Cyrl_AZ",
|
||||
"az_Latn", "az_Latn_AZ",
|
||||
// B
|
||||
"bas", "bas_CM", "be", "be_BY", "bem", "bem_ZM", "bez", "bez_TZ", "bg", "bg_BG", "bm",
|
||||
"bm_ML", "bn", "bn_BD", "bn_IN", "bo", "bo_CN", "bo_IN", "br", "br_FR", "brx", "brx_IN",
|
||||
"bs", "bs_Cyrl", "bs_Cyrl_BA", "bs_Latn", "bs_Latn_BA", "bs_BA",
|
||||
// C
|
||||
"ca", "ca_AD", "ca_ES", "ca_FR", "ca_IT", "ccp", "ccp_BD", "ccp_IN", "ce", "ce_RU",
|
||||
"ceb", "ceb_PH", "cgg", "cgg_UG", "chr", "chr_US", "ckb", "ckb_IQ", "ckb_IR", "cs",
|
||||
"cs_CZ", "cy", "cy_GB",
|
||||
// D
|
||||
"da", "da_DK", "da_GL", "dav", "dav_KE", "de", "de_AT", "de_BE", "de_CH", "de_DE",
|
||||
"de_IT", "de_LI", "de_LU", "dje", "dje_NE", "dsb", "dsb_DE", "dua", "dua_CM", "dyo",
|
||||
"dyo_SN", "dz", "dz_BT",
|
||||
// E
|
||||
"ebu", "ebu_KE", "ee", "ee_GH", "ee_TG", "el", "el_CY", "el_GR", "en", "en_001",
|
||||
"en_150", "en_AE", "en_AG", "en_AI", "en_AS", "en_AT", "en_AU", "en_BB", "en_BE",
|
||||
"en_BI", "en_BM", "en_BS", "en_BW", "en_BZ", "en_CA", "en_CC", "en_CH", "en_CK",
|
||||
"en_CM", "en_CX", "en_CY", "en_DE", "en_DG", "en_DK", "en_DM", "en_ER", "en_FI",
|
||||
"en_FJ", "en_FK", "en_FM", "en_GB", "en_GD", "en_GG", "en_GH", "en_GI", "en_GM",
|
||||
"en_GU", "en_GY", "en_HK", "en_IE", "en_IL", "en_IM", "en_IN", "en_IO", "en_JE",
|
||||
"en_JM", "en_KE", "en_KI", "en_KN", "en_KY", "en_LC", "en_LR", "en_LS", "en_MG",
|
||||
"en_MH", "en_MO", "en_MP", "en_MS", "en_MT", "en_MU", "en_MW", "en_MY", "en_NA",
|
||||
"en_NF", "en_NG", "en_NH", "en_NL", "en_NR", "en_NU", "en_NZ", "en_PG", "en_PH",
|
||||
"en_PK", "en_PN", "en_PR", "en_PW", "en_RH", "en_RW", "en_SB", "en_SC", "en_SD",
|
||||
"en_SE", "en_SG", "en_SH", "en_SI", "en_SL", "en_SS", "en_SX", "en_SZ", "en_TC",
|
||||
"en_TK", "en_TO", "en_TT", "en_TV", "en_TZ", "en_UG", "en_UM", "en_US", "en_US_POSIX",
|
||||
"en_VC", "en_VG", "en_VI", "en_VU", "en_WS", "en_ZA", "en_ZM", "en_ZW", "eo",
|
||||
"eo_001", "es", "es_419", "es_AR", "es_BO", "es_BR", "es_BZ", "es_CL", "es_CO",
|
||||
"es_CR", "es_CU", "es_DO", "es_EA", "es_EC", "es_ES", "es_GQ", "es_GT", "es_HN",
|
||||
"es_IC", "es_MX", "es_NI", "es_PA", "es_PE", "es_PH", "es_PR", "es_PY", "es_SV",
|
||||
"es_US", "es_UY", "es_VE", "et", "et_EE", "eu", "eu_ES", "ewo", "ewo_CM",
|
||||
// F
|
||||
"fa", "fa_AF", "fa_IR", "ff", "ff_CM", "ff_GN", "ff_Latn", "ff_Latn_BF", "ff_Latn_CM",
|
||||
"ff_Latn_GH", "ff_Latn_GM", "ff_Latn_GN", "ff_Latn_GW", "ff_Latn_LR", "ff_Latn_MR",
|
||||
"ff_Latn_NE", "ff_Latn_NG", "ff_Latn_SL", "ff_Latn_SN", "ff_MR", "ff_SN", "fi",
|
||||
"fi_FI", "fil", "fil_PH", "fo", "fo_DK", "fo_FO", "fr", "fr_BE", "fr_BF", "fr_BI",
|
||||
"fr_BJ", "fr_BL", "fr_CA", "fr_CD", "fr_CF", "fr_CG", "fr_CH", "fr_CI", "fr_CM",
|
||||
"fr_DJ", "fr_DZ", "fr_FR", "fr_GA", "fr_GF", "fr_GN", "fr_GP", "fr_GQ", "fr_HT",
|
||||
"fr_KM", "fr_LU", "fr_MA", "fr_MC", "fr_MF", "fr_MG", "fr_ML", "fr_MQ", "fr_MR",
|
||||
"fr_MU", "fr_NC", "fr_NE", "fr_PF", "fr_PM", "fr_RE", "fr_RW", "fr_SC", "fr_SN",
|
||||
"fr_SY", "fr_TD", "fr_TG", "fr_TN", "fr_VU", "fr_WF", "fr_YT", "fur", "fur_IT",
|
||||
"fy", "fy_NL",
|
||||
// G
|
||||
"ga", "ga_IE", "gd", "gd_GB", "gl", "gl_ES", "gsw", "gsw_CH", "gsw_FR", "gsw_LI",
|
||||
"gu", "gu_IN", "guz", "guz_KE", "gv", "gv_IM",
|
||||
// H
|
||||
"ha", "ha_GH", "ha_NE", "ha_NG", "haw", "haw_US", "he", "he_IL", "hi", "hi_IN",
|
||||
"hr", "hr_BA", "hr_HR", "hsb", "hsb_DE", "hu", "hu_HU", "hy", "hy_AM",
|
||||
// I
|
||||
"ia", "ia_001", "id", "id_ID", "ig", "ig_NG", "ii", "ii_CN", "in", "in_ID", "is",
|
||||
"is_IS", "it", "it_CH", "it_IT", "it_SM", "it_VA", "iw", "iw_IL",
|
||||
// J
|
||||
"ja", "ja_JP", "ja_JP_TRADITIONAL", "jgo", "jgo_CM", "jmc", "jmc_TZ", "jv", "jv_ID",
|
||||
// K
|
||||
"ka", "ka_GE", "kab", "kab_DZ", "kam", "kam_KE", "kde", "kde_TZ", "kea", "kea_CV",
|
||||
"khq", "khq_ML", "ki", "ki_KE", "kk", "kk_KZ", "kkj", "kkj_CM", "kl", "kl_GL", "kln",
|
||||
"kln_KE", "km", "km_KH", "kn", "kn_IN", "ko", "ko_KP", "ko_KR", "kok", "kok_IN",
|
||||
"ks", "ks_IN", "ksb", "ksb_TZ", "ksf", "ksf_CM", "ksh", "ksh_DE", "ku", "ku_TR",
|
||||
"kw", "kw_GB", "ky", "ky_KG",
|
||||
// L
|
||||
"lag", "lag_TZ", "lb", "lb_LU", "lg", "lg_UG", "lkt", "lkt_US", "ln", "ln_AO",
|
||||
"ln_CD", "ln_CF", "ln_CG", "lo", "lo_LA", "lrc", "lrc_IQ", "lrc_IR", "lt", "lt_LT",
|
||||
"lu", "lu_CD", "luo", "luo_KE", "luy", "luy_KE", "lv", "lv_LV",
|
||||
// M
|
||||
"mas", "mas_KE", "mas_TZ", "mer", "mer_KE", "mfe", "mfe_MU", "mg", "mg_MG", "mgh",
|
||||
"mgh_MZ", "mgo", "mgo_CM", "mi", "mi_NZ", "mk", "mk_MK", "ml", "ml_IN", "mn",
|
||||
"mn_MN", "mo", "mr", "mr_IN", "ms", "ms_BN", "ms_MY", "ms_SG", "mt", "mt_MT", "mua",
|
||||
"mua_CM", "my", "my_MM", "mzn", "mzn_IR",
|
||||
// N
|
||||
"naq", "naq_NA", "nb", "nb_NO", "nb_SJ", "nd", "nd_ZW", "nds", "nds_DE", "nds_NL",
|
||||
"ne", "ne_IN", "ne_NP", "nl", "nl_AW", "nl_BE", "nl_BQ", "nl_CW", "nl_NL", "nl_SR",
|
||||
"nl_SX", "nmg", "nmg_CM", "nn", "nn_NO", "nnh", "nnh_CM", "no", "no_NO", "no_NO_NY",
|
||||
"nus", "nus_SS", "nyn", "nyn_UG",
|
||||
// O
|
||||
"om", "om_ET", "om_KE", "or", "or_IN", "os", "os_GE", "os_RU",
|
||||
// P
|
||||
"pa", "pa_Arab", "pa_Arab_PK", "pa_Guru", "pa_Guru_IN", "pa_IN", "pa_PK", "pl",
|
||||
"pl_PL", "ps", "ps_AF", "ps_PK", "pt", "pt_AO", "pt_BR", "pt_CH", "pt_CV", "pt_GQ",
|
||||
"pt_GW", "pt_LU", "pt_MO", "pt_MZ", "pt_PT", "pt_ST", "pt_TL",
|
||||
// Q
|
||||
"qu", "qu_BO", "qu_EC", "qu_PE",
|
||||
// R
|
||||
"rm", "rm_CH", "rn", "rn_BI", "ro", "ro_MD", "ro_RO", "rof", "rof_TZ", "ru",
|
||||
"ru_BY", "ru_KG", "ru_KZ", "ru_MD", "ru_RU", "ru_UA", "rw", "rw_RW", "rwk", "rwk_TZ",
|
||||
// S
|
||||
"sah", "sah_RU", "saq", "saq_KE", "sbp", "sbp_TZ", "sd", "sd_PK", "se", "se_FI",
|
||||
"se_NO", "se_SE", "seh", "seh_MZ", "ses", "ses_ML", "sg", "sg_CF", "sh", "sh_BA",
|
||||
"sh_CS", "sh_YU", "shi", "shi_Latn", "shi_Latn_MA", "shi_Tfng", "shi_Tfng_MA",
|
||||
"shi_MA", "si", "si_LK", "sk", "sk_SK", "sl", "sl_SI", "smn", "smn_FI", "sn",
|
||||
"sn_ZW", "so", "so_DJ", "so_ET", "so_KE", "so_SO", "sq", "sq_AL", "sq_MK", "sq_XK",
|
||||
"sr", "sr_Cyrl", "sr_Cyrl_BA", "sr_Cyrl_ME", "sr_Cyrl_RS", "sr_Cyrl_CS", "sr_Cyrl_XK",
|
||||
"sr_Cyrl_YU", "sr_Latn", "sr_Latn_BA", "sr_Latn_ME", "sr_Latn_RS", "sr_Latn_CS",
|
||||
"sr_Latn_XK", "sr_Latn_YU", "sr_BA", "sr_ME", "sr_RS", "sr_CS", "sr_XK", "sr_YU",
|
||||
"sv", "sv_AX", "sv_FI", "sv_SE", "sw", "sw_CD", "sw_KE", "sw_TZ", "sw_UG",
|
||||
// T
|
||||
"ta", "ta_IN", "ta_LK", "ta_MY", "ta_SG", "te", "te_IN", "teo", "teo_KE", "teo_UG",
|
||||
"tg", "tg_TJ", "th", "th_TH", "th_TH_TRADITIONAL", "ti", "ti_ER", "ti_ET", "tk",
|
||||
"tk_TM", "tl", "tl_PH", "to", "to_TO", "tr", "tr_CY", "tr_TR", "tt", "tt_RU",
|
||||
"twq", "twq_NE", "tzm", "tzm_MA",
|
||||
// U
|
||||
"ug", "ug_CN", "uk", "uk_UA", "ur", "ur_IN", "ur_PK", "uz", "uz_AF", "uz_Arab",
|
||||
"uz_Arab_AF", "uz_Cyrl", "uz_Cyrl_UZ", "uz_Latn", "uz_Latn_UZ", "uz_UZ",
|
||||
// V
|
||||
"vai", "vai_Latn", "vai_Latn_LR", "vai_LR", "vai_Vaii", "vai_Vaii_LR", "vi",
|
||||
"vi_VN", "vun", "vun_TZ",
|
||||
// W
|
||||
"wae", "wae_CH", "wo", "wo_SN",
|
||||
// X
|
||||
"xh", "xh_ZA", "xog", "xog_UG",
|
||||
// Y
|
||||
"yav", "yav_CM", "yi", "yi_001", "yo", "yo_BJ", "yo_NG", "yue", "yue_CN", "yue_HK",
|
||||
"yue_Hans", "yue_Hans_CN", "yue_Hant", "yue_Hant_HK",
|
||||
// Z
|
||||
"zgh", "zgh_MA", "zh", "zh_Hans", "zh_Hans_CN", "zh_Hans_HK", "zh_Hans_MO",
|
||||
"zh_Hans_SG", "zh_Hant", "zh_Hant_HK", "zh_Hant_MO", "zh_Hant_TW", "zh_CN",
|
||||
"zh_HK", "zh_MO", "zh_SG", "zh_TW", "zu", "zu_ZA");
|
||||
|
||||
private static final ImmutableSet<String> COLL_LOCALE_IDS = ImmutableSet.of(
|
||||
"root",
|
||||
// A-B
|
||||
"af", "am", "ars", "ar", "as", "az", "be", "bg", "bn", "bo", "bs_Cyrl", "bs",
|
||||
// C-F
|
||||
"ca", "ceb", "chr", "cs", "cy", "da", "de_AT", "de", "dsb", "dz", "ee", "el", "en",
|
||||
"en_US_POSIX", "en_US", "eo", "es", "et", "fa_AF", "fa", "fil", "fi", "fo", "fr_CA", "fr",
|
||||
// G-J
|
||||
"ga", "gl", "gu", "ha", "haw", "he", "hi", "hr", "hsb", "hu", "hy",
|
||||
"id_ID", "id", "ig", "in", "in_ID", "is", "it", "iw_IL", "iw", "ja",
|
||||
// K-P
|
||||
"ka", "kk", "kl", "km", "kn", "kok", "ko", "ku", "ky", "lb", "lkt", "ln", "lo", "lt", "lv",
|
||||
"mk", "ml", "mn", "mo", "mr", "ms", "mt", "my", "nb", "ne", "nl", "nn", "no_NO", "no",
|
||||
"om", "or", "pa_IN", "pa", "pa_Guru", "pl", "ps", "pt",
|
||||
// R-T
|
||||
"ro", "ru", "se", "sh_BA", "sh_CS", "sh", "sh_YU", "si", "sk", "sl", "smn", "sq",
|
||||
"sr_BA", "sr_Cyrl_ME", "sr_Latn", "sr_ME", "sr_RS", "sr", "sv", "sw",
|
||||
"ta", "te", "th", "tk", "to", "tr",
|
||||
// U-Z
|
||||
"ug", "uk", "ur", "uz", "vi", "wae", "wo", "xh", "yi", "yo", "yue_CN", "yue_Hans",
|
||||
"yue", "zh_CN", "zh_Hant", "zh_HK", "zh_MO", "zh_SG", "zh_TW", "zh", "zu");
|
||||
|
||||
private static final ImmutableSet<String> BRKITR_LOCALE_IDS = ImmutableSet.of(
|
||||
"root", "de", "el", "en", "en_US_POSIX", "en_US", "es", "fr", "it", "ja", "pt", "ru",
|
||||
"zh_Hant", "zh");
|
||||
|
||||
private static final ImmutableSet<String> RBNF_LOCALE_IDS = ImmutableSet.of(
|
||||
"root", "af", "ak", "am", "ars", "ar", "az", "be", "bg", "bs", "ca", "ccp", "chr", "cs",
|
||||
"cy", "da", "de_CH", "de", "ee", "el", "en_001", "en_IN", "en", "eo", "es_419", "es_DO",
|
||||
"es_GT", "es_HN", "es_MX", "es_NI", "es_PA", "es_PR", "es_SV", "es", "es_US", "et",
|
||||
"fa_AF", "fa", "ff", "fil", "fi", "fo", "fr_BE", "fr_CH", "fr", "ga", "he", "hi", "hr",
|
||||
"hu", "hy", "id", "in", "is", "it", "iw", "ja", "ka", "kl", "km", "ko", "ky", "lb",
|
||||
"lo", "lrc", "lt", "lv", "mk", "ms", "mt", "my", "nb", "nl", "nn", "no", "pl", "pt_PT",
|
||||
"pt", "qu", "ro", "ru", "se", "sh", "sk", "sl", "sq", "sr_Latn", "sr", "sv",
|
||||
"sw", "ta", "th", "tr", "uk", "vi", "yue_Hans", "yue", "zh_Hant_HK", "zh_Hant", "zh_HK",
|
||||
"zh_MO", "zh_TW", "zh");
|
||||
}
|
||||
|
@ -4,6 +4,10 @@ package org.unicode.icu.tool.cldrtoicu;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
import static java.util.stream.Collectors.toList;
|
||||
import static org.unicode.cldr.api.CldrDataType.BCP47;
|
||||
import static org.unicode.cldr.api.CldrDataType.LDML;
|
||||
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
|
||||
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.BRKITR;
|
||||
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.COLL;
|
||||
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.CURR;
|
||||
@ -13,10 +17,6 @@ import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.RB
|
||||
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.REGION;
|
||||
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.UNIT;
|
||||
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.ZONE;
|
||||
import static java.util.stream.Collectors.toList;
|
||||
import static org.unicode.cldr.api.CldrDataType.BCP47;
|
||||
import static org.unicode.cldr.api.CldrDataType.LDML;
|
||||
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
@ -42,17 +42,6 @@ import java.util.stream.Stream;
|
||||
import org.unicode.cldr.api.CldrData;
|
||||
import org.unicode.cldr.api.CldrDataSupplier;
|
||||
import org.unicode.cldr.api.CldrDataType;
|
||||
|
||||
import com.google.common.base.CharMatcher;
|
||||
import com.google.common.collect.HashMultimap;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import com.google.common.collect.ImmutableSet;
|
||||
import com.google.common.collect.LinkedListMultimap;
|
||||
import com.google.common.collect.ListMultimap;
|
||||
import com.google.common.collect.SetMultimap;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.google.common.io.CharStreams;
|
||||
import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir;
|
||||
import org.unicode.icu.tool.cldrtoicu.mapper.Bcp47Mapper;
|
||||
import org.unicode.icu.tool.cldrtoicu.mapper.BreakIteratorMapper;
|
||||
@ -66,6 +55,17 @@ import org.unicode.icu.tool.cldrtoicu.mapper.SupplementalMapper;
|
||||
import org.unicode.icu.tool.cldrtoicu.mapper.TransformsMapper;
|
||||
import org.unicode.icu.tool.cldrtoicu.regex.RegexTransformer;
|
||||
|
||||
import com.google.common.base.CharMatcher;
|
||||
import com.google.common.collect.HashMultimap;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import com.google.common.collect.ImmutableSet;
|
||||
import com.google.common.collect.LinkedListMultimap;
|
||||
import com.google.common.collect.ListMultimap;
|
||||
import com.google.common.collect.SetMultimap;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.google.common.io.CharStreams;
|
||||
|
||||
/**
|
||||
* The main converter tool for CLDR to ICU data. To run this tool, you need to supply a suitable
|
||||
* {@link LdmlConverterConfig} instance. There is a simple {@code main()} method available in this
|
||||
@ -168,7 +168,7 @@ public final class LdmlConverter {
|
||||
|
||||
DAY_PERIODS(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processDayPeriods("misc")),
|
||||
LdmlConverter::processDayPeriods),
|
||||
GENDER_LIST(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processSupplemental("genderList", GENDER_LIST_PATHS, "misc", false)),
|
||||
@ -192,19 +192,19 @@ public final class LdmlConverter {
|
||||
c -> c.processSupplemental("numberingSystems", NUMBERING_SYSTEMS_PATHS, "misc", false)),
|
||||
PLURALS(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processPlurals("misc")),
|
||||
LdmlConverter::processPlurals),
|
||||
PLURAL_RANGES(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processPluralRanges("misc")),
|
||||
LdmlConverter::processPluralRanges),
|
||||
WINDOWS_ZONES(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processSupplemental("windowsZones", WINDOWS_ZONES_PATHS, "misc", false)),
|
||||
TRANSFORMS(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processTransforms("translit")),
|
||||
LdmlConverter::processTransforms),
|
||||
KEY_TYPE_DATA(
|
||||
BCP47,
|
||||
c -> c.processKeyTypeData("misc")),
|
||||
LdmlConverter::processKeyTypeData),
|
||||
|
||||
// Batching by type.
|
||||
DTD_LDML(LDML, c -> c.processAll(LDML)),
|
||||
@ -231,7 +231,8 @@ public final class LdmlConverter {
|
||||
}
|
||||
}
|
||||
|
||||
private static void convert(LdmlConverterConfig config) {
|
||||
/** Converts CLDR data according to the given configuration. */
|
||||
public static void convert(LdmlConverterConfig config) {
|
||||
CldrDataSupplier src = CldrDataSupplier
|
||||
.forCldrFilesIn(config.getCldrDirectory())
|
||||
.withDraftStatusAtLeast(config.getMinimumDraftStatus());
|
||||
@ -480,24 +481,24 @@ public final class LdmlConverter {
|
||||
return idx == -1 ? segment : segment.substring(0, idx);
|
||||
}
|
||||
|
||||
private void processDayPeriods(String dir) {
|
||||
write(DayPeriodsMapper.process(src), dir);
|
||||
private void processDayPeriods() {
|
||||
write(DayPeriodsMapper.process(src), "misc");
|
||||
}
|
||||
|
||||
private void processPlurals(String dir) {
|
||||
write(PluralsMapper.process(src), dir);
|
||||
private void processPlurals() {
|
||||
write(PluralsMapper.process(src), "misc");
|
||||
}
|
||||
|
||||
private void processPluralRanges(String dir) {
|
||||
write(PluralRangesMapper.process(src), dir);
|
||||
private void processPluralRanges() {
|
||||
write(PluralRangesMapper.process(src), "misc");
|
||||
}
|
||||
|
||||
private void processKeyTypeData(String dir) {
|
||||
Bcp47Mapper.process(src).forEach(d -> write(d, dir));
|
||||
private void processKeyTypeData() {
|
||||
Bcp47Mapper.process(src).forEach(d -> write(d, "misc"));
|
||||
}
|
||||
|
||||
private void processTransforms(String dir) {
|
||||
Path transformDir = createDirectory(config.getOutputDir().resolve(dir));
|
||||
private void processTransforms() {
|
||||
Path transformDir = createDirectory(config.getOutputDir().resolve("translit"));
|
||||
write(TransformsMapper.process(src, transformDir), transformDir);
|
||||
}
|
||||
|
||||
|
@ -7,9 +7,9 @@ import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.unicode.cldr.api.CldrDraftStatus;
|
||||
import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
|
||||
|
||||
import com.google.common.base.Ascii;
|
||||
import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
|
||||
|
||||
/** API for configuring the LDML converter. */
|
||||
public interface LdmlConverterConfig {
|
||||
|
@ -0,0 +1,158 @@
|
||||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.ant;
|
||||
|
||||
import static com.google.common.base.CharMatcher.inRange;
|
||||
import static com.google.common.base.CharMatcher.is;
|
||||
import static com.google.common.base.CharMatcher.whitespace;
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
import static com.google.common.collect.ImmutableList.toImmutableList;
|
||||
import static java.util.stream.Collectors.joining;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.tools.ant.BuildException;
|
||||
import org.apache.tools.ant.Task;
|
||||
import org.unicode.cldr.api.CldrDraftStatus;
|
||||
import org.unicode.icu.tool.cldrtoicu.IcuConverterConfig;
|
||||
import org.unicode.icu.tool.cldrtoicu.LdmlConverter;
|
||||
import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir;
|
||||
|
||||
import com.google.common.base.Ascii;
|
||||
import com.google.common.base.CaseFormat;
|
||||
import com.google.common.base.CharMatcher;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
|
||||
// Note: Auto-magical Ant methods are listed as "unused" by IDEs, unless the warning is suppressed.
|
||||
public final class ConvertIcuDataTask extends Task {
|
||||
private static final Splitter LIST_SPLITTER =
|
||||
Splitter.on(CharMatcher.anyOf(",\n")).trimResults(whitespace()).omitEmptyStrings();
|
||||
|
||||
private static final CharMatcher DIGIT_OR_UNDERSCORE = inRange('0', '9').or(is('_'));
|
||||
private static final CharMatcher UPPER_UNDERSCORE = inRange('A', 'Z').or(DIGIT_OR_UNDERSCORE);
|
||||
private static final CharMatcher LOWER_UNDERSCORE = inRange('a', 'z').or(DIGIT_OR_UNDERSCORE);
|
||||
private static final CharMatcher VALID_ENUM_CHAR = LOWER_UNDERSCORE.or(UPPER_UNDERSCORE);
|
||||
|
||||
private final IcuConverterConfig.Builder config = IcuConverterConfig.builder();
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public void setOutputDir(Path path) {
|
||||
config.setOutputDir(path);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public void setCldrDir(Path path) {
|
||||
config.setCldrDir(path);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public void setMinimalDraftStatus(String status) {
|
||||
config.setMinimalDraftStatus(resolve(CldrDraftStatus.class, status));
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public void setOutputTypes(String types) {
|
||||
config.setOutputTypes(
|
||||
LIST_SPLITTER
|
||||
.splitToList(types).stream()
|
||||
.map(s -> resolve(LdmlConverter.OutputType.class, s))
|
||||
.collect(toImmutableList()));
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public void setSpecialsDir(Path path) {
|
||||
config.setSpecialsDir(path);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public void setEmitReport(boolean emit) {
|
||||
config.setEmitReport(emit);
|
||||
}
|
||||
|
||||
public static final class LocaleIds extends Task {
|
||||
private ImmutableList<IcuLocaleDir> dirs = ImmutableList.of();
|
||||
private ImmutableList<String> ids = ImmutableList.of();
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public void setDirs(String directories) {
|
||||
this.dirs = LIST_SPLITTER.splitToList(directories).stream()
|
||||
.map(s -> resolve(IcuLocaleDir.class, s))
|
||||
.collect(toImmutableList());
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public void addText(String localeIds) {
|
||||
// Need to filter out '//' style end-of-line comments first (replace with \n to avoid
|
||||
// inadvertantly joining two elements.
|
||||
localeIds = localeIds.replaceAll("//[^\n]*\n", "\n");
|
||||
this.ids = ImmutableList.copyOf(LIST_SPLITTER.splitToList(localeIds));
|
||||
}
|
||||
}
|
||||
|
||||
public static final class ForcedAlias extends Task {
|
||||
private IcuLocaleDir dir;
|
||||
private String source;
|
||||
private String target;
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public void setDir(String directory) {
|
||||
this.dir = resolve(IcuLocaleDir.class, directory);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public void setSource(String source) {
|
||||
this.source = checkNotNull(source);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public void setTarget(String target) {
|
||||
this.target = checkNotNull(target);
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public void addConfiguredLocaleIds(LocaleIds localeIds) {
|
||||
localeIds.dirs.forEach(d -> config.addLocaleIds(d, localeIds.ids));
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public void addConfiguredForcedAlias(ForcedAlias alias) {
|
||||
config.addForcedAlias(alias.dir, alias.source, alias.target);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
public void execute() throws BuildException {
|
||||
LdmlConverter.convert(config.build());
|
||||
}
|
||||
|
||||
private static <T extends Enum<T>> T resolve(Class<T> enumClass, String name) {
|
||||
checkArgument(!name.isEmpty(), "enumeration name cannot be empty");
|
||||
checkArgument(VALID_ENUM_CHAR.matchesAllOf(name),
|
||||
"invalid enumeration name '%s'; expected only ASCII letters or '_'", name);
|
||||
CaseFormat format;
|
||||
if (UPPER_UNDERSCORE.matchesAllOf(name)) {
|
||||
format = CaseFormat.UPPER_UNDERSCORE;
|
||||
} else if (LOWER_UNDERSCORE.matchesAllOf(name)) {
|
||||
format = CaseFormat.LOWER_UNDERSCORE;
|
||||
} else {
|
||||
// Mixed case with '_' is not permitted.
|
||||
checkArgument(!name.contains("_"),
|
||||
"invalid enumeration name '%s'; mixed case with underscore not allowed: %s", name);
|
||||
format =
|
||||
Ascii.isLowerCase(name.charAt(0)) ? CaseFormat.LOWER_CAMEL : CaseFormat.UPPER_CAMEL;
|
||||
}
|
||||
try {
|
||||
return Enum.valueOf(enumClass, format.to(CaseFormat.UPPER_UNDERSCORE, name));
|
||||
} catch (IllegalArgumentException e) {
|
||||
String validNames =
|
||||
Arrays.stream(enumClass.getEnumConstants())
|
||||
.map(Object::toString)
|
||||
.collect(joining(", "));
|
||||
throw new IllegalArgumentException(
|
||||
"invalid enumeration name " + name + "; expected one of; " + validNames);
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user