<!-- icu/tools/cldr/cldr-to-icu/config.xml -->

<!-- © 2019 and later: Unicode, Inc. and others.
     License & terms of use: http://www.unicode.org/copyright.html -->

<config>
    <convert>

        <!-- The primary set of locale IDs to be generated by default. The IDs in this list are
             automatically expanded to include default scripts and all available regions. The
             rules are:

             1) Base languages are expanded to include default scripts (e.g. "en" -> "en_Latn").
             2) All region and variant subtags are added for any base language or language+script
                (e.g. "en" -> "en_GB" or "shi_Latn" -> "shi_Latn_MA").

             If a non-default script is desired it should be listed explicitly (e.g. "sr_Latn").

             Locale IDs with deprecated subtags (which become aliases) must still be listed in
             full (e.g. "en_RH" or "sr_Latn_YU").

             Lines starting with "//" inside the list are alphabetical section markers, not
             locale IDs. IDs on the same line are separated by commas; no line in this list ends
             with a comma, so a line break presumably also terminates an ID (NOTE(review):
             confirm against the tool's list parser before re-wrapping lines).
        -->
        <localeIds>
            // A
            af, agq, ak, am, ar, ars, as, asa, ast, az, az_AZ, az_Cyrl

            // B
            bas, be, bem, bez, bg, bgc, bho, blo, bm, bn, bo, br, brx, bs, bs_BA, bs_Cyrl

            // C
            ca, ccp, ce, ceb, cgg, chr, ckb, cs, csw, cv, cy

            // D
            da, dav, de, dje, doi, dsb, dua, dyo, dz

            // E
            ebu, ee, el, en, en_NH, en_RH, eo, es, et, eu, ewo

            // F
            fa, ff, ff_Adlm, ff_CM, ff_GN, ff_MR, ff_SN, fi, fil, fo, fr, fur, fy

            // G
            ga, gaa, gd, gl, gsw, gu, guz, gv

            // H
            ha, haw, he, hi, hi_Latn, hr, hsb, hu, hy

            // I
            ia, id, ie, ig, ii, in, in_ID, is, it, iw, iw_IL

            // J
            ja, jgo, jmc, jv

            // K
            ka, kab, kam, kde, kea, kgp, khq, ki, kk, kkj, kl, kln, km, kn, ko, kok, kok_Latn, ks
            ks_Deva, ks_IN, ksb, ksf, ksh, ku, kw, kxv, kxv_Deva, kxv_IN, kxv_Orya, kxv_Telu, ky

            // L
            lag, lb, lg, lij, lkt, lmo, ln, lo, lrc, lt, lu, luo, luy, lv

            // M
            mai, mas, mer, mfe, mg, mgh, mgo, mi, mk, ml, mn, mni, mni_IN, mo, mr, ms
            mt, mua, my, mzn

            // N
            naq, nb, nd, nds, ne, nl, nmg, nn, nnh, no, no_NO, no_NO_NY, nqo, nso, nus, nyn

            // O
            oc, om, or, os

            // P
            pa, pa_Arab, pa_IN, pa_PK, pcm, pl, prg, ps, pt

            // Q
            qu

            // R
            raj, rm, rn, ro, rof, ru, rw, rwk

            // S
            sa, sah, saq, sat, sat_IN, sbp, sc, sd, sd_Deva, sd_IN, sd_PK, se, seh, ses, sg, sh, sh_BA, sh_CS, sh_YU
            shi, shi_Latn, shi_MA, si, sk, sl, smn, sn, so, sq, sr, sr_BA, sr_CS, sr_Cyrl_CS, sr_Cyrl_YU, sr_Latn
            sr_Latn_CS, sr_Latn_YU, sr_ME, sr_RS, sr_XK, sr_YU, st, su, su_ID, sv, sw, syr, szl

            // T
            ta, te, teo, tg, th, ti, tk, tl, tl_PH, tn, to, tok, tr, tt, twq, tzm

            // U
            ug, uk, ur, uz, uz_AF, uz_Arab, uz_Cyrl, uz_UZ

            // V
            vai, vai_LR, vai_Latn, vec, vi, vmw, vun

            // W
            wae, wo

            // X
            xh, xnr, xog

            // Y
            yav, yi, yo, yrl, yue, yue_CN, yue_HK, yue_Hans

            // Z
            za, zgh, zh, zh_CN, zh_HK, zh_Hant, zh_MO, zh_SG, zh_TW, zu
        </localeIds>

        <!-- The following elements configure directories in which a subset of the available
             locale IDs should be generated. Unlike the main <localeIds> element, these
             filters must specify all locale IDs in full (but since they mostly select base
             languages, this isn't a big deal).

             As well as allowing some data directories to have a subset of available data (via
             the <localeIds> element) there are also mechanisms for controlling aliasing and
             the locale parent relation which allows the sharing of some ICU data in cases
             where it would otherwise need to be copied. The two mechanisms are:

             1: inheritLanguageSubtag: Used to rewrite the parent of a locale ID from "root" to
                its language subtag (e.g. "zh_Hant" has a natural parent of "root", but to allow
                some base language data to be shared it can be made to have a parent of "zh").

             2: forcedAlias: Used to add aliases for specific directories in order to affect the
                ICU behaviour in special cases.

             Between them these mechanisms are known as "tailorings" of the affected locales. -->
        <!-- TODO: Explain why these special cases are needed/different. -->

        <!-- Collation data is large, but also more sharable than other data, which is why there
             are a number of aliases and parent remappings for this directory. -->
        <directory dir="coll" inheritLanguageSubtag="bs_Cyrl, sr_Latn, zh_Hant">
            <!-- The inheritLanguageSubtag list on this element re-parents "bs_Cyrl", "sr_Latn"
                 and "zh_Hant" from "root" to their base language ("bs", "sr" and "zh"). -->

            <!-- These aliases are to avoid needing to copy and maintain the same collation data
                 for "zh" and "yue". The maximized version of "yue_Hans" is "yue_Hans_CN" (vs
                 "zh_Hans_CN"), and for "yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the
                 aliases are effectively just rewriting the base language. -->
            <forcedAlias source="yue" target="zh_Hant"/>
            <forcedAlias source="yue_Hant" target="zh_Hant"/>
            <forcedAlias source="yue_CN" target="zh_Hans"/>
            <forcedAlias source="yue_Hans" target="zh_Hans"/>
            <forcedAlias source="yue_Hans_CN" target="zh_Hans"/>
            <!-- TODO: Find out and document this properly. -->
            <forcedAlias source="sr_ME" target="sr_Cyrl_ME"/>

            <localeIds>
                root,

                // A-B
                af, am, ars, ar, as, az, be, bg, bn, bo, br, bs_Cyrl, bs,

                // C-F
                ca, ceb, chr, cs, cy, da, de_AT, de, dsb, dz, ee, el, en,
                en_US_POSIX, en_US, eo, es, et, fa_AF, fa, ff_Adlm, ff, fil, fi, fo, fr_CA, fr, fy,

                // G-J
                ga, gl, gu, ha, haw, he, hi, hr, hsb, hu, hy,
                id_ID, id, ig, in, in_ID, is, it, iw_IL, iw, ja,

                // K-P
                ka, kk, kl, km, kn, kok, ko, ku, ky, lb, lij, lkt, ln, lo, lt, lv,
                mk, ml, mn, mo, mr, ms, mt, my, nb, nb_NO, ne, nl, nn, no, no_NO, nso,
                om, or, pa_IN, pa, pa_Guru, pl, ps, pt,

                // R-T
                ro, ru, sa, se, sh_BA, sh_CS, sh, sh_YU, si, sk, sl, smn, sq,
                sr_BA, sr_Cyrl_ME, sr_Latn, sr_ME, sr_RS, sr, st, sv, sw,
                ta, te, th, tk, tn, to, tr,

                // U-Z
                ug, uk, ur, uz, vi, wae, wo, xh, yi, yo, yue_CN, yue_Hans_CN, yue_Hans,
                yue_Hant, yue, zh_CN, zh_Hans, zh_Hant, zh_HK, zh_MO, zh_SG, zh_TW, zh, zu
            </localeIds>
        </directory>

        <directory dir="rbnf">
            <!-- It is not at all clear why this is being done. It's certainly not exactly the
                 same as the "yue" aliases in the "coll" directory above, since (a) the alias is
                 reversed (a "zh" locale is aliased to "yue" rather than the other way round) and
                 (b) "zh_Hant" does exist, with different data than "yue", so this alias is not
                 just rewriting the base language. -->
            <!-- TODO: Find out and document this properly. -->
            <forcedAlias source="zh_Hant_HK" target="yue"/>

            <localeIds>
                root,

                // A-E
                af, ak, am, ars, ar, az, be, bg, bs, ca, ccp, chr, cs, cy,
                da, de_CH, de, ee, el, en_001, en_IN, en, eo, es_419, es_DO,
                es_GT, es_HN, es_MX, es_NI, es_PA, es_PR, es_SV, es, es_US, et,

                // F-P
                fa_AF, fa, ff, fil, fi, fo, fr_BE, fr_CH, fr, ga, he, hi, hr,
                hu, hy, id, in, is, it, iw, ja, ka, kk, kl, km, ko, ky, lb,
                lo, lrc, lt, lv, mk, ms, mt, my, nb, ne, nl, nn, no, pl, pt_PT, pt,

                // Q-Z
                qu, ro, ru, se, sh, sk, sl, sq, sr_Latn, sr, su, sv, sw, ta, th, tr,
                uk, vec, vi, yue_Hans, yue, zh_Hant_HK, zh_Hant, zh_HK, zh_MO, zh_TW, zh
            </localeIds>
        </directory>

        <!-- Break iterator data is only generated for the small set of locales listed here. As
             in the "coll" directory, "zh_Hant" is re-parented from "root" to "zh" via
             inheritLanguageSubtag. -->
        <directory dir="brkitr" inheritLanguageSubtag="zh_Hant">
            <localeIds>
                root,
                de, el, en, en_US_POSIX, en_US, es, fi, fr, it, ja, ko, pt, ru, sv, zh_Hant, zh
            </localeIds>
        </directory>

        <!-- GLOBAL ALIASES -->

        <!-- Some spoken languages (e.g. "ars") inherit all their data from a written language
             (e.g. "ar_SA"). However CLDR doesn't currently support a way to represent that
             relationship. Unlike deprecated languages for which an alias can be inferred from
             the "languageAlias" CLDR data, there's no way in CLDR to represent the fact that
             we want "ars" (a non-deprecated language) to inherit the data of "ar_SA".

             This alias is the first example of potentially many cases where ICU needs to
             generate an alias in order to effect "sideways inheritance" for spoken languages,
             and at some stage it should probably be supported properly in the CLDR data. -->
        <forcedAlias source="ars" target="ar_SA"/>

        <!-- A legacy global alias (note that "no_NO_NY" is not even structurally valid). -->
        <forcedAlias source="no_NO_NY" target="nn_NO"/>

        <!-- This one is a bit silly: it exists only to generate a stub for "no_NO", which is
             not in CLDR. Without it, listing "no_NO" in localeIds would generate empty
             "no_Latn" and "no_Latn_NO" and then alias "no_NO" to "no_Latn_NO". -->
        <forcedAlias source="no_NO" target="no"/>

        <!-- ALTERNATE VALUES -->

        <!-- The following elements configure alternate values for some special case paths.
             The target path will only be replaced if both it, and the source path, exist in
             the CLDR data (paths will not be modified if only the source path exists).

             Since the paths must represent the same semantic type of data, they must be in the
             same "namespace" (same element names) and must not contain value attributes. Thus
             they can only differ by distinguishing attributes (either added or modified).

             This feature is typically used to select alternate translations (e.g. short forms)
             for certain paths. -->
        <!-- <altPath target="//path/to/value[@attr='foo']"
                      source="//path/to/value[@attr='bar']"
                      locales="xx,yy_ZZ"/> -->
    </convert>

    <!-- If a directory is listed here, then every file in it is assumed to be automatically
         generated by the conversion tool, unless it is explicitly listed in a <retain> element.
         The tool then checks every file to determine if it has the expected header present,
         indicating that it was automatically generated, before deleting it.

         If unexpected files are found, the "clean" task will fail without deleting anything
         (unless 'forceDelete' is set to override this). Note that even if 'forceDelete' is set,
         the files listed explicitly below will never be deleted by this process.

         This two-step approach minimizes the risk that the conversion process will ever
         accidentally delete a manually maintained file.
         -->
    <outputDirectories root="${outDir}" forceDelete="${forceDelete}">
        <dir name="brkitr">
            <!-- NOTE(review): presumably hand-maintained break iterator inputs that the
                 conversion tool does not generate; confirm and document the reason. -->
            <retain path="adaboost"/>
            <retain path="dictionaries"/>
            <retain path="lstm"/>
            <retain path="rules"/>
        </dir>
        <dir name="coll">
            <!-- Legacy files whose file names aren't supported for automatic generation.
                 Simple to maintain manually and unlikely to ever change again. -->
            <retain path="de__PHONEBOOK.txt"/>
            <retain path="de_.txt"/>
            <retain path="es__TRADITIONAL.txt"/>
            <retain path="es_.txt"/>
        </dir>
        <dir name="curr"/>
        <dir name="lang"/>
        <dir name="locales"/>
        <dir name="misc">
            <!-- Machine generated files produced by different tools.
                 Possibly worth moving into the new LDML conversion tool one day. -->
            <retain path="currencyNumericCodes.txt"/>
            <retain path="zoneinfo64.txt"/>
            <!-- Project file (not ICU data), unlikely to ever be auto-generated. -->
            <retain path="icudata.rc"/>
            <!-- Small high-level metadata file, stable and easy to maintain manually. -->
            <retain path="icustd.txt"/>
        </dir>
        <dir name="rbnf"/>
        <dir name="region"/>
        <dir name="translit">
            <!-- Small, easy to maintain, special case top-level files. -->
            <retain path="en.txt"/>
            <retain path="el.txt"/>
        </dir>
        <dir name="unit"/>
        <dir name="zone">
            <!-- Manually edited to support TZ database name compatibility. -->
            <retain path="tzdbNames.txt"/>
        </dir>
    </outputDirectories>
</config>