<!-- © 2019 and later: Unicode, Inc. and others.
     License & terms of use: http://www.unicode.org/copyright.html -->

<!--================================================================================
    To build ICU data files:
      1: Determine the CLDR base directory and set the CLDR_DIR environment variable.
      2: Determine the flags required (see the list of properties below).
      3: Run: ant -f build-icu-data.xml -D<flag-name>=<flag-value>...
    ================================================================================-->

<!-- TODO: Add things like copying of a template directory and deleting previous files
     (perhaps always generate into a temporary directory and copy back to avoid having
     inconsistent state when the conversion is cancelled). -->

<project name="Convert" default="convert" basedir=".">

    <!-- Initialize the properties which were not already set on the command line.
         Every property below is only a default: a value passed via -D<name>=<value>
         is defined first and is therefore the one that gets used. -->
    <target name="init-args">
        <!-- Expose environment variables as "env.*" properties and refuse to continue
             unless CLDR_DIR is set. -->
        <property environment="env"/>
        <condition property="hascldrdir">
            <isset property="env.CLDR_DIR"/>
        </condition>
        <fail unless="hascldrdir"
              message="Please set the CLDR_DIR environment variable to the top level CLDR source dir (containing 'common')."/>

        <!-- The base directory of the CLDR release from which CLDR data is obtained. For
             legacy reasons, this must also match an environment variable called CLDR_DIR,
             which is read here, so it is best to set this via the environment variable for
             now. -->
        <!-- TODO: Update this when the CLDR_DIR environment variable is no longer needed. -->
        <property name="cldrDir" value="${env.CLDR_DIR}"/>

        <!-- The output directory into which to write the converted ICU data. By default
             this will overwrite (without deletion) the ICU data files in this ICU release,
             so it is recommended that for testing, it be set to another value. -->
        <property name="outDir" value="${basedir}/../../../icu4c/source/data/"/>

        <!-- The directory in which the additional ICU XML data is stored. -->
        <property name="specialsDir" value="${basedir}/../../../icu4c/source/data/xml"/>

        <!-- The minimum draft status for CLDR data to be used in the conversion. See
             CldrDraftStatus for more details. -->
        <property name="minDraftStatus" value="contributed"/>

        <!-- Whether to emit a debug report containing some possibly useful information after
             the conversion has finished. -->
        <!-- TODO: Currently this isn't hugely useful, so find out what people want. -->
        <property name="emitReport" value="false"/>

        <!-- List of output "types" to be generated (e.g. "rbnf,plurals,locales"); an empty
             list means "build everything".

             Note that the grouping of types is based on the legacy converter behaviour and
             is not always directly associated with an output directory (e.g. "locales"
             produces locale data for curr/, lang/, main/, region/, unit/, zone/ but NOT
             coll/, brkitr/ or rbnf/).

             You can also specify by DTD type (e.g. dtdBcp47, dtdSupplemental or dtdLdml)
             which is still not quite directly associated with output directories either,
             since some supplemental data is also written to the curr/ directory.

             See LdmlConverter.OutputType for the full list of valid types. -->
        <!-- TODO: Find out what people actually want here and switch to that. -->
        <property name="outputTypes" value=""/>
    </target>

    <!-- Build a standalone JAR which is called by Ant (and which avoids needing to mess
         about making Ant know the Maven class-path). -->
    <target name="prepare-jar" depends="init-args">
        <!-- Stop the build if Maven reports an error; otherwise the conversion could
             silently run against a stale JAR, or fail later with a confusing class
             loading error when the task is defined. -->
        <exec executable="mvn" searchpath="true" failonerror="true">
            <arg value="compile"/>
        </exec>
    </target>

    <!-- Do the actual CLDR data conversion, based on the command line arguments, built in
         default properties and the configuration in the "<convert>" element below. -->
    <target name="convert" depends="init-args, prepare-jar">
        <taskdef name="convert" classname="org.unicode.icu.tool.cldrtoicu.ant.ConvertIcuDataTask">
            <classpath>
                <pathelement path="target/cldr-to-icu-1.0-SNAPSHOT-jar-with-dependencies.jar"/>
            </classpath>
        </taskdef>
        <convert cldrDir="${cldrDir}" outputDir="${outDir}" specialsDir="${specialsDir}"
                 outputTypes="${outputTypes}" minimalDraftStatus="${minDraftStatus}" emitReport="${emitReport}">

            <!-- The primary set of locale IDs to be generated by default. The IDs in this list are
                 automatically expanded to include default scripts and all available regions. The
                 rules are:

                 1) Base languages are expanded to include default scripts (e.g. "en" -> "en_Latn").
                 2) All region and variant subtags are added for any base language or language+script
                    (e.g. "en" -> "en_GB" or "shi_Latn" -> "shi_Latn_MA").

                 If a non-default script is desired it should be listed explicitly (e.g. "sr_Latn").

                 Locale IDs with deprecated subtags (which become aliases) must still be listed in
                 full (e.g. "en_RH" or "sr_Latn_YU").
            -->
            <localeIds>
                // A
                af, agq, agq_CM, ak, am, ar, ars, as, asa, asa_TZ, ast, ast_ES, az, az_AZ, az_Cyrl

                // B
                bas, bas_CM, be, bem, bem_ZM, bez, bez_TZ, bg, bm, bn, bo, br, brx, brx_IN, bs, bs_BA
                bs_Cyrl

                // C
                ca, ccp, ccp_BD, ccp_IN, ce, ceb, ceb_PH, cgg, cgg_UG, chr, chr_US, ckb, ckb_IQ, ckb_IR, cs, cy

                // D
                da, dav, dav_KE, de, dje, dje_NE, dsb, dsb_DE, dua, dua_CM, dyo, dyo_SN, dz

                // E
                ebu, ebu_KE, ee, el, en, en_NH, en_RH, eo, es, et, eu, ewo, ewo_CM

                // F
                fa, ff, ff_CM, ff_GN, ff_MR, ff_SN, fi, fil, fil_PH, fo, fr, fur, fur_IT, fy

                // G
                ga, gd, gl, gsw, gsw_CH, gsw_FR, gsw_LI, gu, guz, guz_KE, gv

                // H
                ha, haw, haw_US, he, hi, hr, hsb, hsb_DE, hu, hy

                // I
                ia, id, ig, ii, in, in_ID, is, it, iw, iw_IL

                // J
                ja, jgo, jgo_CM, jmc, jmc_TZ, jv

                // K
                ka, kab, kab_DZ, kam, kam_KE, kde, kde_TZ, kea, kea_CV, khq, khq_ML, ki, kk, kkj, kkj_CM, kl
                kln, kln_KE, km, kn, ko, kok, kok_IN, ks, ksb, ksb_TZ, ksf, ksf_CM, ksh, ksh_DE, ku, kw
                ky

                // L
                lag, lag_TZ, lb, lg, lkt, lkt_US, ln, lo, lrc, lrc_IQ, lrc_IR, lt, lu, luo, luo_KE, luy
                luy_KE, lv

                // M
                mas, mas_KE, mas_TZ, mer, mer_KE, mfe, mfe_MU, mg, mgh, mgh_MZ, mgo, mgo_CM, mi, mk, ml, mn
                mo, mr, ms, mt, mua, mua_CM, my, mzn, mzn_IR

                // N
                naq, naq_NA, nb, nd, nds, nds_DE, nds_NL, ne, nl, nmg, nmg_CM, nn, nnh, nnh_CM, no, no_NO
                no_NO_NY, nus, nus_SS, nyn, nyn_UG

                // O
                om, or, os

                // P
                pa, pa_Arab, pa_IN, pa_PK, pl, ps, pt

                // Q
                qu

                // R
                rm, rn, ro, rof, rof_TZ, ru, rw, rwk, rwk_TZ

                // S
                sah, sah_RU, saq, saq_KE, sbp, sbp_TZ, sd, se, seh, seh_MZ, ses, ses_ML, sg, sh, sh_BA, sh_CS
                sh_YU, shi, shi_Latn, shi_Latn_MA, shi_MA, shi_Tfng, shi_Tfng_MA, si, sk, sl, smn, smn_FI, sn, so, sq, sr
                sr_BA, sr_CS, sr_Cyrl_CS, sr_Cyrl_YU, sr_Latn, sr_Latn_CS, sr_Latn_YU, sr_ME, sr_RS, sr_XK, sr_YU, sv, sw

                // T
                ta, te, teo, teo_KE, teo_UG, tg, th, ti, tk, tl, tl_PH, to, tr, tt, twq, twq_NE
                tzm, tzm_MA

                // U
                ug, uk, ur, uz, uz_AF, uz_Arab, uz_Cyrl, uz_UZ

                // V
                vai, vai_LR, vai_Latn, vai_Latn_LR, vai_Vaii, vai_Vaii_LR, vi, vun, vun_TZ

                // W
                wae, wae_CH, wo

                // X
                xh, xog, xog_UG

                // Y
                yav, yav_CM, yi, yo, yue, yue_CN, yue_HK, yue_Hans, yue_Hans_CN, yue_Hant, yue_Hant_HK

                // Z
                zgh, zgh_MA, zh, zh_CN, zh_HK, zh_Hant, zh_MO, zh_SG, zh_TW, zu
            </localeIds>

            <!-- The following elements configure directories in which a subset of the available
                 locale IDs should be generated. Unlike the main <localeIds> element, these
                 filters must specify all locale IDs in full (but since they mostly select base
                 languages, this isn't a big deal). -->
            <!-- TODO: Explain why these special cases are needed/different. -->

            <directoryFilter dir="coll">
                root,

                // A-B
                af, am, ars, ar, as, az, be, bg, bn, bo, bs_Cyrl, bs,

                // C-F
                ca, ceb, chr, cs, cy, da, de_AT, de, dsb, dz, ee, el, en,
                en_US_POSIX, en_US, eo, es, et, fa_AF, fa, fil, fi, fo, fr_CA, fr,

                // G-J
                ga, gl, gu, ha, haw, he, hi, hr, hsb, hu, hy,
                id_ID, id, ig, in, in_ID, is, it, iw_IL, iw, ja,

                // K-P
                ka, kk, kl, km, kn, kok, ko, ku, ky, lb, lkt, ln, lo, lt, lv,
                mk, ml, mn, mo, mr, ms, mt, my, nb, ne, nl, nn, no_NO, no,
                om, or, pa_IN, pa, pa_Guru, pl, ps, pt,

                // R-T
                ro, ru, se, sh_BA, sh_CS, sh, sh_YU, si, sk, sl, smn, sq,
                sr_BA, sr_Cyrl_ME, sr_Latn, sr_ME, sr_RS, sr, sv, sw,
                ta, te, th, tk, to, tr,

                // U-Z
                ug, uk, ur, uz, vi, wae, wo, xh, yi, yo, yue_CN, yue_Hans,
                yue, zh_CN, zh_Hant, zh_HK, zh_MO, zh_SG, zh_TW, zh, zu
            </directoryFilter>

            <directoryFilter dir="rbnf">
                root,

                // A-E
                af, ak, am, ars, ar, az, be, bg, bs, ca, ccp, chr, cs, cy,
                da, de_CH, de, ee, el, en_001, en_IN, en, eo, es_419, es_DO,
                es_GT, es_HN, es_MX, es_NI, es_PA, es_PR, es_SV, es, es_US, et,

                // F-P
                fa_AF, fa, ff, fil, fi, fo, fr_BE, fr_CH, fr, ga, he, hi, hr,
                hu, hy, id, in, is, it, iw, ja, ka, kl, km, ko, ky, lb,
                lo, lrc, lt, lv, mk, ms, mt, my, nb, nl, nn, no, pl, pt_PT, pt,

                // Q-Z
                qu, ro, ru, se, sh, sk, sl, sq, sr_Latn, sr, sv, sw, ta, th, tr,
                uk, vi, yue_Hans, yue, zh_Hant_HK, zh_Hant, zh_HK, zh_MO, zh_TW, zh
            </directoryFilter>

            <directoryFilter dir="brkitr">
                root,
                de, el, en, en_US_POSIX, en_US, es, fr, it, ja, pt, ru, zh_Hant, zh
            </directoryFilter>

            <!-- The following elements configure some very special case locale alias behaviour,
                 mainly to support situations where the natural alias relationship is not wanted
                 for a particular type of data. -->

            <!-- GLOBAL ALIASES -->

            <!-- Some spoken languages (e.g. "ars") inherit all their data from a written language
                 (e.g. "ar_SA"). However CLDR doesn't currently support a way to represent that
                 relationship. Unlike deprecated languages for which an alias can be inferred from
                 the "languageAlias" element, there's no way in CLDR to represent the fact that we
                 want "ars" (a non-deprecated language) to inherit the data of "ar_SA".

                 This alias is the first example of potentially many cases where ICU needs to
                 generate an alias in order to affect "sideways inheritance" for spoken languages,
                 and at some stage it should be supported properly in the CLDR data. -->
            <forcedAlias source="ars" target="ar_SA"/>

            <!-- A legacy global alias (note that "no_NO_NY" is not even structurally valid). -->
            <forcedAlias source="no_NO_NY" target="nn_NO"/>

            <!-- PER-DIRECTORY ALIASES (these are really special cases) -->

            <!-- It is not at all clear why this is being done (we expect "sr_Latn_ME" normally). -->
            <!-- TODO: Find out and document this properly. -->
            <forcedAlias dir="coll" source="sr_ME" target="sr_Cyrl_ME"/>

            <!-- This alias is to avoid needing to copy and maintain the same "zh" data for "yue".
                 The maximized version of "yue_Hans" is "yue_Hans_CN" (vs "zh_Hans_CN"), and for
                 "yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the aliases are effectively just
                 rewriting the base language.

                 This is similar to the case for "ars"/"ar_SA" but it is not done globally, since
                 CLDR data does exist for "yue" and "yue_Hans" which is NOT the same as "zh_Hant"
                 and "zh_Hans"/"zh". This mapping is a bit more of a "hack" for the purposes of
                 reducing data duplication in ICU. -->
            <forcedAlias dir="coll" source="yue_Hans" target="zh_Hans"/>
            <forcedAlias dir="coll" source="yue" target="zh_Hant"/>

            <!-- It is not at all clear why this is being done. It's certainly not exactly the same
                 as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with different
                 data than "yue", so this alias is not just rewriting the base language. -->
            <!-- TODO: Find out and document this properly. -->
            <forcedAlias dir="rbnf" source="zh_Hant_HK" target="yue"/>
        </convert>
    </target>
</project>
|