<!-- scuffed-code/tools/cldr/cldr-to-icu/build-icu-data.xml -->

<!-- © 2019 and later: Unicode, Inc. and others.
     License & terms of use: http://www.unicode.org/copyright.html -->

<!--================================================================================
    To build ICU data files:
    1: Determine the CLDR base directory and set the CLDR_DIR environment variable.
    2: Determine the flags required (see the list of properties below).
    3: Run: ant -f build-icu-data.xml -D<flag-name>=<flag-value>...
    ================================================================================-->
<!-- TODO: Add things like copying of a template directory and deleting previous files
     (perhaps always generate into a temporary directory and copy back to avoid having
      inconsistent state when the conversion is cancelled). -->
<project name="Convert" default="convert" basedir=".">
    <!-- Initialize the properties which were not already set on the command line. -->
    <target name="init-args">
        <!-- Validates the environment and then defines the default value of every property
             used by the conversion. Ant properties are immutable once set, so any value
             passed on the command line via -D takes precedence over the defaults below. -->

        <!-- Expose environment variables as "env.*" properties and require CLDR_DIR. -->
        <property environment="env"/>
        <condition property="hascldrdir" >
            <isset property="env.CLDR_DIR" />
        </condition>
        <fail unless="hascldrdir"
              message="Please set the CLDR_DIR environment variable to the top level CLDR source dir (containing 'common')."/>

        <!-- Fail fast if CLDR_DIR is set but does not point at a CLDR source tree, rather
             than discovering the problem only after the Maven build has been run. -->
        <fail message="The CLDR_DIR environment variable ('${env.CLDR_DIR}') must point to the top level CLDR source dir (containing 'common'), but no 'common' directory was found there.">
            <condition>
                <not>
                    <available file="${env.CLDR_DIR}/common" type="dir"/>
                </not>
            </condition>
        </fail>

        <!-- The base directory of the CLDR release from which CLDR data is obtained. For
             legacy reasons, this must also match an environment variable called CLDR_DIR,
             which is read here, so it is best to set this via the environment variable for
             now. -->
        <!-- TODO: Update this when the CLDR_DIR environment variable is no longer needed. -->
        <property name="cldrDir" value="${env.CLDR_DIR}"/>

        <!-- The output directory into which to write the converted ICU data. By default
             this will overwrite (without deletion) the ICU data files in this ICU release,
             so it is recommended that for testing, it be set to another value.  -->
        <property name="outDir" value="${basedir}/../../../icu4c/source/data/"/>

        <!-- The directory in which the additional ICU XML data is stored. -->
        <property name="specialsDir" value="${basedir}/../../../icu4c/source/data/xml"/>

        <!-- The minimum draft status for CLDR data to be used in the conversion. See
             CldrDraftStatus for more details. -->
        <property name="minDraftStatus" value="contributed"/>

        <!-- Whether to emit a debug report containing some possibly useful information after
             the conversion has finished. -->
        <!-- TODO: Currently this isn't hugely useful, so find out what people want. -->
        <property name="emitReport" value="false"/>

        <!-- List of output "types" to be generated (e.g. "rbnf,plurals,locales"); an empty
             list means "build everything".

             Note that the grouping of types is based on the legacy converter behaviour and
             is not always directly associated with an output directory (e.g. "locales"
             produces locale data for curr/, lang/, main/, region/, unit/, zone/ but NOT
             coll/, brkitr/ or rbnf/).

             You can also specify by DTD type (e.g. dtdBcp47, dtdSupplemental or dtdLdml)
             which is still not quite directly associated with output directories either,
             since some supplemental data is also written to the curr/ directory.

             See LdmlConverter.OutputType for the full list of valid types. -->
        <!-- TODO: Find out what people actually want here and switch to that. -->
        <property name="outputTypes" value=""/>
    </target>

    <!-- Build a standalone JAR which is called by Ant (and which avoids needing to mess
         about making Ant know the Maven class-path). -->
    <target name="prepare-jar" depends="init-args">
        <!-- Runs "mvn compile" (located via the PATH) in the project base directory.
             NOTE(review): the "convert" target loads
             target/cldr-to-icu-1.0-SNAPSHOT-jar-with-dependencies.jar, so the Maven build is
             presumably configured to assemble that JAR during the compile phase; confirm
             against pom.xml. -->
        <!-- failonerror stops the Ant build when Maven exits with a non-zero status, so a
             broken Maven build cannot silently fall through to a missing or stale JAR. -->
        <exec executable="mvn" searchpath="true" failonerror="true">
            <arg value="compile"/>
        </exec>
    </target>

    <!-- Do the actual CLDR data conversion, based on the command line arguments, built in
         default properties and the configuration in the "<convert>" element below. -->
    <target name="convert" depends="init-args, prepare-jar">
        <!-- Default target of this project. It runs after "init-args" (property defaults)
             and "prepare-jar" (Maven build), registers the custom "convert" Ant task from
             the built JAR and then invokes it with the configuration given below. -->

        <!-- Register the <convert> task. The JAR path is relative, so Ant resolves it
             against the project base directory. -->
        <taskdef name="convert" classname="org.unicode.icu.tool.cldrtoicu.ant.ConvertIcuDataTask">
            <classpath>
                <pathelement path="target/cldr-to-icu-1.0-SNAPSHOT-jar-with-dependencies.jar"/>
            </classpath>
        </taskdef>

        <!-- Every attribute below is fed from a property defined in "init-args" (or
             overridden on the command line). -->
        <convert cldrDir="${cldrDir}" outputDir="${outDir}" specialsDir="${specialsDir}"
                 outputTypes="${outputTypes}" minimalDraftStatus="${minDraftStatus}" emitReport="${emitReport}">

            <!-- The primary set of locale IDs to be generated by default. The IDs in this list are
                 automatically expanded to include default scripts and all available regions. The
                 rules are:

                 1) Base languages are expanded to include default scripts (e.g. "en" -> "en_Latn").
                 2) All region and variant subtags are added for any base language or language+script
                    (e.g. "en" -> "en_GB" or "shi_Latn" -> "shi_Latn_MA").

                 If a non-default script is desired it should be listed explicitly (e.g. "sr_Latn").

                 Locale IDs with deprecated subtags (which become aliases) must still be listed in
                 full (e.g. "en_RH" or "sr_Latn_YU").

                 NOTE(review): the "// X" lines inside the list are part of the element text, not
                 XML comments; presumably the task strips them as line comments and accepts both
                 commas and newlines as separators. Confirm against ConvertIcuDataTask before
                 changing the formatting of any list in this target.
            -->
            <localeIds>
                // A
                af, agq, agq_CM, ak, am, ar, ars, as, asa, asa_TZ, ast, ast_ES, az, az_AZ, az_Cyrl

                // B
                bas, bas_CM, be, bem, bem_ZM, bez, bez_TZ, bg, bm, bn, bo, br, brx, brx_IN, bs, bs_BA
                bs_Cyrl

                // C
                ca, ccp, ccp_BD, ccp_IN, ce, ceb, ceb_PH, cgg, cgg_UG, chr, chr_US, ckb, ckb_IQ, ckb_IR, cs, cy

                // D
                da, dav, dav_KE, de, dje, dje_NE, dsb, dsb_DE, dua, dua_CM, dyo, dyo_SN, dz

                // E
                ebu, ebu_KE, ee, el, en, en_NH, en_RH, eo, es, et, eu, ewo, ewo_CM

                // F
                fa, ff, ff_CM, ff_GN, ff_MR, ff_SN, fi, fil, fil_PH, fo, fr, fur, fur_IT, fy

                // G
                ga, gd, gl, gsw, gsw_CH, gsw_FR, gsw_LI, gu, guz, guz_KE, gv

                // H
                ha, haw, haw_US, he, hi, hr, hsb, hsb_DE, hu, hy

                // I
                ia, id, ig, ii, in, in_ID, is, it, iw, iw_IL

                // J
                ja, jgo, jgo_CM, jmc, jmc_TZ, jv

                // K
                ka, kab, kab_DZ, kam, kam_KE, kde, kde_TZ, kea, kea_CV, khq, khq_ML, ki, kk, kkj, kkj_CM, kl
                kln, kln_KE, km, kn, ko, kok, kok_IN, ks, ksb, ksb_TZ, ksf, ksf_CM, ksh, ksh_DE, ku, kw
                ky

                // L
                lag, lag_TZ, lb, lg, lkt, lkt_US, ln, lo, lrc, lrc_IQ, lrc_IR, lt, lu, luo, luo_KE, luy
                luy_KE, lv

                // M
                mas, mas_KE, mas_TZ, mer, mer_KE, mfe, mfe_MU, mg, mgh, mgh_MZ, mgo, mgo_CM, mi, mk, ml, mn
                mo, mr, ms, mt, mua, mua_CM, my, mzn, mzn_IR

                // N
                naq, naq_NA, nb, nd, nds, nds_DE, nds_NL, ne, nl, nmg, nmg_CM, nn, nnh, nnh_CM, no, no_NO
                no_NO_NY, nus, nus_SS, nyn, nyn_UG

                // O
                om, or, os

                // P
                pa, pa_Arab, pa_IN, pa_PK, pl, ps, pt

                // Q
                qu

                // R
                rm, rn, ro, rof, rof_TZ, ru, rw, rwk, rwk_TZ

                // S
                sah, sah_RU, saq, saq_KE, sbp, sbp_TZ, sd, se, seh, seh_MZ, ses, ses_ML, sg, sh, sh_BA, sh_CS
                sh_YU, shi, shi_Latn, shi_Latn_MA, shi_MA, shi_Tfng, shi_Tfng_MA, si, sk, sl, smn, smn_FI, sn, so, sq, sr
                sr_BA, sr_CS, sr_Cyrl_CS, sr_Cyrl_YU, sr_Latn, sr_Latn_CS, sr_Latn_YU, sr_ME, sr_RS, sr_XK, sr_YU, sv, sw

                // T
                ta, te, teo, teo_KE, teo_UG, tg, th, ti, tk, tl, tl_PH, to, tr, tt, twq, twq_NE
                tzm, tzm_MA

                // U
                ug, uk, ur, uz, uz_AF, uz_Arab, uz_Cyrl, uz_UZ

                // V
                vai, vai_LR, vai_Latn, vai_Latn_LR, vai_Vaii, vai_Vaii_LR, vi, vun, vun_TZ

                // W
                wae, wae_CH, wo

                // X
                xh, xog, xog_UG

                // Y
                yav, yav_CM, yi, yo, yue, yue_CN, yue_HK, yue_Hans, yue_Hans_CN, yue_Hant, yue_Hant_HK

                // Z
                zgh, zgh_MA, zh, zh_CN, zh_HK, zh_Hant, zh_MO, zh_SG, zh_TW, zu
            </localeIds>

            <!-- The following elements configure directories in which a subset of the available
                 locale IDs should be generated. Unlike the main <localeIds> element, these
                 filters must specify all locale IDs in full (but since they mostly select base
                 languages, this isn't a big deal). -->
            <!-- TODO: Explain why these special cases are needed/different. -->

            <directoryFilter dir="coll">
                root,

                // A-B
                af, am, ars, ar, as, az, be, bg, bn, bo, bs_Cyrl, bs,

                // C-F
                ca, ceb, chr, cs, cy, da, de_AT, de, dsb, dz, ee, el, en,
                en_US_POSIX, en_US, eo, es, et, fa_AF, fa, fil, fi, fo, fr_CA, fr,

                // G-J
                ga, gl, gu, ha, haw, he, hi, hr, hsb, hu, hy,
                id_ID, id, ig, in, in_ID, is, it, iw_IL, iw, ja,

                // K-P
                ka, kk, kl, km, kn, kok, ko, ku, ky, lb, lkt, ln, lo, lt, lv,
                mk, ml, mn, mo, mr, ms, mt, my, nb, ne, nl, nn, no_NO, no,
                om, or, pa_IN, pa, pa_Guru, pl, ps, pt,

                // R-T
                ro, ru, se, sh_BA, sh_CS, sh, sh_YU, si, sk, sl, smn, sq,
                sr_BA, sr_Cyrl_ME, sr_Latn, sr_ME, sr_RS, sr, sv, sw,
                ta, te, th, tk, to, tr,

                // U-Z
                ug, uk, ur, uz, vi, wae, wo, xh, yi, yo, yue_CN, yue_Hans,
                yue, zh_CN, zh_Hant, zh_HK, zh_MO, zh_SG, zh_TW, zh, zu
            </directoryFilter>

            <directoryFilter dir="rbnf">
                root,

                // A-E
                af, ak, am, ars, ar, az, be, bg, bs, ca, ccp, chr, cs, cy,
                da, de_CH, de, ee, el, en_001, en_IN, en, eo, es_419, es_DO,
                es_GT, es_HN, es_MX, es_NI, es_PA, es_PR, es_SV, es, es_US, et,

                // F-P
                fa_AF, fa, ff, fil, fi, fo, fr_BE, fr_CH, fr, ga, he, hi, hr,
                hu, hy, id, in, is, it, iw, ja, ka, kl, km, ko, ky, lb,
                lo, lrc, lt, lv, mk, ms, mt, my, nb, nl, nn, no, pl, pt_PT, pt,

                // Q-Z
                qu, ro, ru, se, sh, sk, sl, sq, sr_Latn, sr, sv, sw, ta, th, tr,
                uk, vi, yue_Hans, yue, zh_Hant_HK, zh_Hant, zh_HK, zh_MO, zh_TW, zh
            </directoryFilter>

            <directoryFilter dir="brkitr">
                root,
                de, el, en, en_US_POSIX, en_US, es, fr, it, ja, pt, ru, zh_Hant, zh
            </directoryFilter>

            <!-- The following elements configure some very special case locale alias behaviour,
                 mainly to support situations where the natural alias relationship is not wanted
                 for a particular type of data. -->

            <!-- GLOBAL ALIASES (no "dir" attribute) -->

            <!-- Some spoken languages (e.g. "ars") inherit all their data from a written language
                 (e.g. "ar_SA"). However CLDR doesn't currently support a way to represent that
                 relationship. Unlike deprecated languages for which an alias can be inferred from
                 the "languageAlias" element, there's no way in CLDR to represent the fact that we
                 want "ars" (a non-deprecated language) to inherit the data of "ar_SA".

                 This alias is the first example of potentially many cases where ICU needs to
                 generate an alias in order to effect "sideways inheritance" for spoken languages,
                 and at some stage it should be supported properly in the CLDR data. -->
            <forcedAlias source="ars" target="ar_SA"/>

            <!-- A legacy global alias (note that "no_NO_NY" is not even structurally valid). -->
            <forcedAlias source="no_NO_NY" target="nn_NO"/>

            <!-- PER-DIRECTORY ALIASES (these are really special cases; each one names the
                 affected output directory via its "dir" attribute) -->

            <!-- It is not at all clear why this is being done (we expect "sr_Latn_ME" normally). -->
            <!-- TODO: Find out and document this properly. -->
            <forcedAlias dir="coll" source="sr_ME" target="sr_Cyrl_ME"/>

            <!-- These aliases avoid needing to copy and maintain the same "zh" data for "yue".
                 The maximized version of "yue_Hans" is "yue_Hans_CN" (vs "zh_Hans_CN"), and for
                 "yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the aliases are effectively just
                 rewriting the base language.

                 This is similar to the case for "ars"/"ar_SA" but it is not done globally, since
                 CLDR data does exist for "yue" and "yue_Hans" which is NOT the same as "zh_Hant"
                 and "zh_Hans"/"zh". This mapping is a bit more of a "hack" for the purposes of
                 reducing data duplication in ICU. -->
            <forcedAlias dir="coll" source="yue_Hans" target="zh_Hans"/>
            <forcedAlias dir="coll" source="yue" target="zh_Hant"/>

            <!-- It is not at all clear why this is being done. It's certainly not exactly the same
                 as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with different
                 data than "yue", so this alias is not just rewriting the base language. -->
            <!-- TODO: Find out and document this properly. -->
            <forcedAlias dir="rbnf" source="zh_Hant_HK" target="yue"/>
        </convert>
    </target>
</project>