ICU-20693 Refactoring for inferred IDs.

This commit is contained in:
David Beaumont 2019-09-06 23:34:17 +02:00 committed by David Beaumont
parent 142c90afcc
commit 7078e19070
12 changed files with 449 additions and 299 deletions

View file

@ -78,184 +78,117 @@
</classpath>
</taskdef>
<convert cldrDir="${cldrDir}" outputDir="${outDir}" specialsDir="${specialsDir}"
minimalDraftStatus="${minDraftStatus}" emitReport="${emitReport}">
outputTypes="${outputTypes}" minimalDraftStatus="${minDraftStatus}" emitReport="${emitReport}">
<!-- It is not at all clear why this is being done (we expect "sr_Latn_ME" normally).
TODO: Find out and document this properly. -->
<forcedAlias dir="coll" source="sr_ME" target="sr_Cyrl_ME"/>
<!-- The primary set of locale IDs to be generated by default. The IDs in this list are
automatically expanded to include default scripts and all available regions. The
rules are:
<!-- This appears to be a hack to avoid needing to copy and maintain the same "zh"
data for "yue". The files for "yue" in this directory should be empty otherwise.
The maximized version of "yue_Hans" is "yue_Hans_CN" (vs "zh_Hans_CN"), and for
"yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the aliases are effectively just
rewriting the base language. -->
<forcedAlias dir="coll" source="yue_Hans" target="zh_Hans"/>
<forcedAlias dir="coll" source="yue" target="zh_Hant"/>
1) Base languages are expanded to include default scripts (e.g. "en" -> "en_Latn").
2) All region and variant subtags are added for any base language or language+script
(e.g. "en" -> "en_GB" or "shi_Latn" -> "shi_Latn_MA").
<!-- It is not at all clear why this is being done. It's certainly not exactly the same
as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with different
data than "yue", so this alias is not just rewriting the base language.
TODO: Find out and document this properly. -->
<forcedAlias dir="rbnf" source="zh_Hant_HK" target="yue"/>
<!-- The primary set of locale IDs to be generated. Other, directory specific, sets exist
and do not have to be subsets of this. Some of these IDs are aliases, so XML files
may not exist for all of them. -->
<!-- TODO: Add locale ID inference to reduce this list considerably. -->
<localeIds dirs="curr,lang,locales,region,unit,zone">
root,
If a non-default script is desired it should be listed explicitly (e.g. "sr_Latn").
Locale IDs with deprecated subtags (which become aliases) must still be listed in
full (e.g. "en_RH" or "sr_Latn_YU").
-->
<localeIds>
// A
af, af_NA, af_ZA, agq, agq_CM, ak, ak_GH, am, am_ET, ar, ar_001,
ar_AE, ar_BH, ar_DJ, ar_DZ, ar_EG, ar_EH, ar_ER, ar_IL, ar_IQ,
ar_JO, ar_KM, ar_KW, ar_LB, ar_LY, ar_MA, ar_MR, ar_OM, ar_PS,
ar_QA, ar_SA, ar_SD, ar_SO, ar_SS, ar_SY, ar_TD, ar_TN, ar_YE, ars,
as, as_IN, asa, asa_TZ, ast, ast_ES, az, az_AZ, az_Cyrl, az_Cyrl_AZ,
az_Latn, az_Latn_AZ,
af, agq, agq_CM, ak, am, ar, ars, as, asa, asa_TZ, ast, ast_ES, az, az_AZ, az_Cyrl
// B
bas, bas_CM, be, be_BY, bem, bem_ZM, bez, bez_TZ, bg, bg_BG, bm,
bm_ML, bn, bn_BD, bn_IN, bo, bo_CN, bo_IN, br, br_FR, brx, brx_IN,
bs, bs_Cyrl, bs_Cyrl_BA, bs_Latn, bs_Latn_BA, bs_BA,
bas, bas_CM, be, bem, bem_ZM, bez, bez_TZ, bg, bm, bn, bo, br, brx, brx_IN, bs, bs_BA
bs_Cyrl
// C
ca, ca_AD, ca_ES, ca_FR, ca_IT, ccp, ccp_BD, ccp_IN, ce, ce_RU,
ceb, ceb_PH, cgg, cgg_UG, chr, chr_US, ckb, ckb_IQ, ckb_IR, cs,
cs_CZ, cy, cy_GB,
ca, ccp, ccp_BD, ccp_IN, ce, ceb, ceb_PH, cgg, cgg_UG, chr, chr_US, ckb, ckb_IQ, ckb_IR, cs, cy
// D
da, da_DK, da_GL, dav, dav_KE, de, de_AT, de_BE, de_CH, de_DE,
de_IT, de_LI, de_LU, dje, dje_NE, dsb, dsb_DE, dua, dua_CM, dyo,
dyo_SN, dz, dz_BT,
da, dav, dav_KE, de, dje, dje_NE, dsb, dsb_DE, dua, dua_CM, dyo, dyo_SN, dz
// E
ebu, ebu_KE, ee, ee_GH, ee_TG, el, el_CY, el_GR, en, en_001,
en_150, en_AE, en_AG, en_AI, en_AS, en_AT, en_AU, en_BB, en_BE,
en_BI, en_BM, en_BS, en_BW, en_BZ, en_CA, en_CC, en_CH, en_CK,
en_CM, en_CX, en_CY, en_DE, en_DG, en_DK, en_DM, en_ER, en_FI,
en_FJ, en_FK, en_FM, en_GB, en_GD, en_GG, en_GH, en_GI, en_GM,
en_GU, en_GY, en_HK, en_IE, en_IL, en_IM, en_IN, en_IO, en_JE,
en_JM, en_KE, en_KI, en_KN, en_KY, en_LC, en_LR, en_LS, en_MG,
en_MH, en_MO, en_MP, en_MS, en_MT, en_MU, en_MW, en_MY, en_NA,
en_NF, en_NG, en_NH, en_NL, en_NR, en_NU, en_NZ, en_PG, en_PH,
en_PK, en_PN, en_PR, en_PW, en_RH, en_RW, en_SB, en_SC, en_SD,
en_SE, en_SG, en_SH, en_SI, en_SL, en_SS, en_SX, en_SZ, en_TC,
en_TK, en_TO, en_TT, en_TV, en_TZ, en_UG, en_UM, en_US, en_US_POSIX,
en_VC, en_VG, en_VI, en_VU, en_WS, en_ZA, en_ZM, en_ZW, eo,
eo_001, es, es_419, es_AR, es_BO, es_BR, es_BZ, es_CL, es_CO,
es_CR, es_CU, es_DO, es_EA, es_EC, es_ES, es_GQ, es_GT, es_HN,
es_IC, es_MX, es_NI, es_PA, es_PE, es_PH, es_PR, es_PY, es_SV,
es_US, es_UY, es_VE, et, et_EE, eu, eu_ES, ewo, ewo_CM,
ebu, ebu_KE, ee, el, en, en_NH, en_RH, eo, es, et, eu, ewo, ewo_CM
// F
fa, fa_AF, fa_IR, ff, ff_CM, ff_GN, ff_Latn, ff_Latn_BF, ff_Latn_CM,
ff_Latn_GH, ff_Latn_GM, ff_Latn_GN, ff_Latn_GW, ff_Latn_LR, ff_Latn_MR,
ff_Latn_NE, ff_Latn_NG, ff_Latn_SL, ff_Latn_SN, ff_MR, ff_SN, fi,
fi_FI, fil, fil_PH, fo, fo_DK, fo_FO, fr, fr_BE, fr_BF, fr_BI,
fr_BJ, fr_BL, fr_CA, fr_CD, fr_CF, fr_CG, fr_CH, fr_CI, fr_CM,
fr_DJ, fr_DZ, fr_FR, fr_GA, fr_GF, fr_GN, fr_GP, fr_GQ, fr_HT,
fr_KM, fr_LU, fr_MA, fr_MC, fr_MF, fr_MG, fr_ML, fr_MQ, fr_MR,
fr_MU, fr_NC, fr_NE, fr_PF, fr_PM, fr_RE, fr_RW, fr_SC, fr_SN,
fr_SY, fr_TD, fr_TG, fr_TN, fr_VU, fr_WF, fr_YT, fur, fur_IT,
fy, fy_NL,
fa, ff, ff_CM, ff_GN, ff_MR, ff_SN, fi, fil, fil_PH, fo, fr, fur, fur_IT, fy
// G
ga, ga_IE, gd, gd_GB, gl, gl_ES, gsw, gsw_CH, gsw_FR, gsw_LI,
gu, gu_IN, guz, guz_KE, gv, gv_IM,
ga, gd, gl, gsw, gsw_CH, gsw_FR, gsw_LI, gu, guz, guz_KE, gv
// H
ha, ha_GH, ha_NE, ha_NG, haw, haw_US, he, he_IL, hi, hi_IN,
hr, hr_BA, hr_HR, hsb, hsb_DE, hu, hu_HU, hy, hy_AM,
ha, haw, haw_US, he, hi, hr, hsb, hsb_DE, hu, hy
// I
ia, ia_001, id, id_ID, ig, ig_NG, ii, ii_CN, in, in_ID, is,
is_IS, it, it_CH, it_IT, it_SM, it_VA, iw, iw_IL,
ia, id, ig, ii, in, in_ID, is, it, iw, iw_IL
// J
ja, ja_JP, ja_JP_TRADITIONAL, jgo, jgo_CM, jmc, jmc_TZ, jv, jv_ID,
ja, jgo, jgo_CM, jmc, jmc_TZ, jv
// K
ka, ka_GE, kab, kab_DZ, kam, kam_KE, kde, kde_TZ, kea, kea_CV,
khq, khq_ML, ki, ki_KE, kk, kk_KZ, kkj, kkj_CM, kl, kl_GL, kln,
kln_KE, km, km_KH, kn, kn_IN, ko, ko_KP, ko_KR, kok, kok_IN,
ks, ks_IN, ksb, ksb_TZ, ksf, ksf_CM, ksh, ksh_DE, ku, ku_TR,
kw, kw_GB, ky, ky_KG,
ka, kab, kab_DZ, kam, kam_KE, kde, kde_TZ, kea, kea_CV, khq, khq_ML, ki, kk, kkj, kkj_CM, kl
kln, kln_KE, km, kn, ko, kok, kok_IN, ks, ksb, ksb_TZ, ksf, ksf_CM, ksh, ksh_DE, ku, kw
ky
// L
lag, lag_TZ, lb, lb_LU, lg, lg_UG, lkt, lkt_US, ln, ln_AO,
ln_CD, ln_CF, ln_CG, lo, lo_LA, lrc, lrc_IQ, lrc_IR, lt, lt_LT,
lu, lu_CD, luo, luo_KE, luy, luy_KE, lv, lv_LV,
lag, lag_TZ, lb, lg, lkt, lkt_US, ln, lo, lrc, lrc_IQ, lrc_IR, lt, lu, luo, luo_KE, luy
luy_KE, lv
// M
mas, mas_KE, mas_TZ, mer, mer_KE, mfe, mfe_MU, mg, mg_MG, mgh,
mgh_MZ, mgo, mgo_CM, mi, mi_NZ, mk, mk_MK, ml, ml_IN, mn,
mn_MN, mo, mr, mr_IN, ms, ms_BN, ms_MY, ms_SG, mt, mt_MT, mua,
mua_CM, my, my_MM, mzn, mzn_IR,
mas, mas_KE, mas_TZ, mer, mer_KE, mfe, mfe_MU, mg, mgh, mgh_MZ, mgo, mgo_CM, mi, mk, ml, mn
mo, mr, ms, mt, mua, mua_CM, my, mzn, mzn_IR
// N
naq, naq_NA, nb, nb_NO, nb_SJ, nd, nd_ZW, nds, nds_DE, nds_NL,
ne, ne_IN, ne_NP, nl, nl_AW, nl_BE, nl_BQ, nl_CW, nl_NL, nl_SR,
nl_SX, nmg, nmg_CM, nn, nn_NO, nnh, nnh_CM, no, no_NO, no_NO_NY,
nus, nus_SS, nyn, nyn_UG,
naq, naq_NA, nb, nd, nds, nds_DE, nds_NL, ne, nl, nmg, nmg_CM, nn, nnh, nnh_CM, no, no_NO
no_NO_NY, nus, nus_SS, nyn, nyn_UG
// O
om, om_ET, om_KE, or, or_IN, os, os_GE, os_RU,
om, or, os
// P
pa, pa_Arab, pa_Arab_PK, pa_Guru, pa_Guru_IN, pa_IN, pa_PK, pl,
pl_PL, ps, ps_AF, ps_PK, pt, pt_AO, pt_BR, pt_CH, pt_CV, pt_GQ,
pt_GW, pt_LU, pt_MO, pt_MZ, pt_PT, pt_ST, pt_TL,
pa, pa_Arab, pa_IN, pa_PK, pl, ps, pt
// Q
qu, qu_BO, qu_EC, qu_PE,
qu
// R
rm, rm_CH, rn, rn_BI, ro, ro_MD, ro_RO, rof, rof_TZ, ru,
ru_BY, ru_KG, ru_KZ, ru_MD, ru_RU, ru_UA, rw, rw_RW, rwk, rwk_TZ,
rm, rn, ro, rof, rof_TZ, ru, rw, rwk, rwk_TZ
// S
sah, sah_RU, saq, saq_KE, sbp, sbp_TZ, sd, sd_PK, se, se_FI,
se_NO, se_SE, seh, seh_MZ, ses, ses_ML, sg, sg_CF, sh, sh_BA,
sh_CS, sh_YU, shi, shi_Latn, shi_Latn_MA, shi_Tfng, shi_Tfng_MA,
shi_MA, si, si_LK, sk, sk_SK, sl, sl_SI, smn, smn_FI, sn, sn_ZW,
so, so_DJ, so_ET, so_KE, so_SO, sq, sq_AL, sq_MK, sq_XK, sr,
sr_Cyrl, sr_Cyrl_BA, sr_Cyrl_ME, sr_Cyrl_RS, sr_Cyrl_CS, sr_Cyrl_XK,
sr_Cyrl_YU, sr_Latn, sr_Latn_BA, sr_Latn_ME, sr_Latn_RS, sr_Latn_CS,
sr_Latn_XK, sr_Latn_YU, sr_BA, sr_ME, sr_RS, sr_CS, sr_XK, sr_YU,
sv, sv_AX, sv_FI, sv_SE, sw, sw_CD, sw_KE, sw_TZ, sw_UG,
sah, sah_RU, saq, saq_KE, sbp, sbp_TZ, sd, se, seh, seh_MZ, ses, ses_ML, sg, sh, sh_BA, sh_CS
sh_YU, shi, shi_Latn, shi_Latn_MA, shi_MA, shi_Tfng, shi_Tfng_MA, si, sk, sl, smn, smn_FI, sn, so, sq, sr
sr_BA, sr_CS, sr_Cyrl_CS, sr_Cyrl_YU, sr_Latn, sr_Latn_CS, sr_Latn_YU, sr_ME, sr_RS, sr_XK, sr_YU, sv, sw
// T
ta, ta_IN, ta_LK, ta_MY, ta_SG, te, te_IN, teo, teo_KE, teo_UG,
tg, tg_TJ, th, th_TH, th_TH_TRADITIONAL, ti, ti_ER, ti_ET, tk,
tk_TM, tl, tl_PH, to, to_TO, tr, tr_CY, tr_TR, tt, tt_RU,
twq, twq_NE, tzm, tzm_MA,
ta, te, teo, teo_KE, teo_UG, tg, th, ti, tk, tl, tl_PH, to, tr, tt, twq, twq_NE
tzm, tzm_MA
// U
ug, ug_CN, uk, uk_UA, ur, ur_IN, ur_PK, uz, uz_AF, uz_Arab,
uz_Arab_AF, uz_Cyrl, uz_Cyrl_UZ, uz_Latn, uz_Latn_UZ, uz_UZ,
ug, uk, ur, uz, uz_AF, uz_Arab, uz_Cyrl, uz_UZ
// V
vai, vai_Latn, vai_Latn_LR, vai_LR, vai_Vaii, vai_Vaii_LR, vi,
vi_VN, vun, vun_TZ,
vai, vai_LR, vai_Latn, vai_Latn_LR, vai_Vaii, vai_Vaii_LR, vi, vun, vun_TZ
// W
wae, wae_CH, wo, wo_SN,
wae, wae_CH, wo
// X
xh, xh_ZA, xog, xog_UG,
xh, xog, xog_UG
// Y
yav, yav_CM, yi, yi_001, yo, yo_BJ, yo_NG, yue, yue_CN, yue_HK,
yue_Hans, yue_Hans_CN, yue_Hant, yue_Hant_HK,
yav, yav_CM, yi, yo, yue, yue_CN, yue_HK, yue_Hans, yue_Hans_CN, yue_Hant, yue_Hant_HK
// Z
zgh, zgh_MA, zh, zh_Hans, zh_Hans_CN, zh_Hans_HK, zh_Hans_MO,
zh_Hans_SG, zh_Hant, zh_Hant_HK, zh_Hant_MO, zh_Hant_TW, zh_CN,
zh_HK, zh_MO, zh_SG, zh_TW, zu, zu_ZA
zgh, zgh_MA, zh, zh_CN, zh_HK, zh_Hant, zh_MO, zh_SG, zh_TW, zu
</localeIds>
<!-- The following elements configure directories in which a subset of the available
locale IDs should be generated. Unlike the main <localeIds> element, these
filters must specify all locale IDs in full (but since they mostly select base
languages, this isn't a big deal). -->
<!-- TODO: Explain why these special cases are needed/different. -->
<localeIds dirs="coll">
<directoryFilter dir="coll">
root,
// A-B
@ -282,9 +215,9 @@
// U-Z
ug, uk, ur, uz, vi, wae, wo, xh, yi, yo, yue_CN, yue_Hans,
yue, zh_CN, zh_Hant, zh_HK, zh_MO, zh_SG, zh_TW, zh, zu
</localeIds>
</directoryFilter>
<localeIds dirs="rbnf">
<directoryFilter dir="rbnf">
root,
// A-E
@ -300,12 +233,56 @@
// Q-Z
qu, ro, ru, se, sh, sk, sl, sq, sr_Latn, sr, sv, sw, ta, th, tr,
uk, vi, yue_Hans, yue, zh_Hant_HK, zh_Hant, zh_HK, zh_MO, zh_TW, zh
</localeIds>
</directoryFilter>
<localeIds dirs="brkitr">
<directoryFilter dir="brkitr">
root,
de, el, en, en_US_POSIX, en_US, es, fr, it, ja, pt, ru, zh_Hant, zh
</localeIds>
</directoryFilter>
<!-- The following elements configure some very special case locale alias behaviour,
mainly to support situations where the natural alias relationship is not wanted
for a particular type of data. -->
<!-- GLOBAL ALIASES -->
<!-- Some spoken languages (e.g. "ars") inherit all their data from a written language
(e.g. "ar_SA"). However CLDR doesn't currently support a way to represent that
relationship. Unlike deprecated languages for which an alias can be inferred from
the "languageAlias" element, there's no way in CLDR to represent the fact that we
want "ars" (a non-deprecated language) to inherit the data of "ar_SA".
This alias is the first example of potentially many cases where ICU needs to
generate an alias in order to effect "sideways inheritance" for spoken languages,
and at some stage it should be supported properly in the CLDR data. -->
<forcedAlias source="ars" target="ar_SA"/>
<!-- A legacy global alias (note that "no_NO_NY" is not even structurally valid). -->
<forcedAlias source="no_NO_NY" target="nn_NO"/>
<!-- PER-DIRECTORY ALIASES (these are really special cases) -->
<!-- It is not at all clear why this is being done (we expect "sr_Latn_ME" normally). -->
<!-- TODO: Find out and document this properly. -->
<forcedAlias dir="coll" source="sr_ME" target="sr_Cyrl_ME"/>
<!-- This alias is to avoid needing to copy and maintain the same "zh" data for "yue".
The maximized version of "yue_Hans" is "yue_Hans_CN" (vs "zh_Hans_CN"), and for
"yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the aliases are effectively just
rewriting the base language.
This is similar to the case for "ars"/"ar_SA" but it is not done globally, since
CLDR data does exist for "yue" and "yue_Hans" which is NOT the same as "zh_Hant"
and "zh_Hans"/"zh". This mapping is a bit more of a "hack" for the purposes of
reducing data duplication in ICU. -->
<forcedAlias dir="coll" source="yue_Hans" target="zh_Hans"/>
<forcedAlias dir="coll" source="yue" target="zh_Hant"/>
<!-- It is not at all clear why this is being done. It's certainly not exactly the same
as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with different
data than "yue", so this alias is not just rewriting the base language. -->
<!-- TODO: Find out and document this properly. -->
<forcedAlias dir="rbnf" source="zh_Hant_HK" target="yue"/>
</convert>
</target>
</project>
</project>

View file

@ -9,13 +9,13 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import org.unicode.cldr.api.CldrDraftStatus;
import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSetMultimap;
import com.google.common.collect.ImmutableTable;
@ -29,11 +29,6 @@ import com.google.common.collect.TreeMultimap;
* that was configured by text files such as "icu-locale-deprecates.xml" and "icu-config.
*/
public final class IcuConverterConfig implements LdmlConverterConfig {
private static final Optional<Path> DEFAULT_CLDR_DIR =
Optional.ofNullable(System.getProperty("CLDR_DIR", null))
.map(d -> Paths.get(d).toAbsolutePath());
private static final Optional<Path> DEFAULT_ICU_DIR =
Optional.ofNullable(System.getProperty("ICU_DIR", null))
.map(d -> Paths.get(d).toAbsolutePath());
@ -41,26 +36,16 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
/** The builder with which to specify configuration for the {@link LdmlConverter}. */
@SuppressWarnings("UnusedReturnValue")
public static final class Builder {
private Path cldrDir = DEFAULT_CLDR_DIR.orElse(null);
private Path outputDir =
DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data")).orElse(null);
private Path specialsDir =
DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data/xml")).orElse(null);
private ImmutableSet<OutputType> outputTypes = OutputType.ALL;
private CldrDraftStatus minimalDraftStatus = CldrDraftStatus.CONTRIBUTED;
private CldrDraftStatus minimumDraftStatus = CldrDraftStatus.CONTRIBUTED;
private boolean emitReport = false;
private final SetMultimap<IcuLocaleDir, String> localeIdsMap = TreeMultimap.create();
private final Table<IcuLocaleDir, String, String> forcedAliases = TreeBasedTable.create();
/**
* Sets the CLDR base directory from which to load all CLDR data. This is optional if the
* {@code CLDR_DIR} system property is set, which will be used instead.
*/
public Builder setCldrDir(Path cldrDir) {
this.cldrDir = checkNotNull(cldrDir.toAbsolutePath());
return this;
}
/**
* Sets the output directory in which the ICU data directories and files will go. This is
* optional if the {@code ICU_DIR} system property is set, which will be used to generate
@ -91,14 +76,8 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
return this;
}
/**
* Sets the minimum draft status for CLDR data to be converted (paths below this status are
* ignored during conversion). This is optional and defaults to {@link
* CldrDraftStatus#CONTRIBUTED}.
*/
public Builder setMinimalDraftStatus(CldrDraftStatus minimalDraftStatus) {
this.minimalDraftStatus = checkNotNull(minimalDraftStatus);
return this;
public void setMinimumDraftStatus(CldrDraftStatus minimumDraftStatus) {
this.minimumDraftStatus = checkNotNull(minimumDraftStatus);
}
public Builder setEmitReport(boolean emitReport) {
@ -122,26 +101,16 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
}
}
private final Path cldrDir;
private final Path outputDir;
private final Path specialsDir;
private final ImmutableSet<OutputType> outputTypes;
private final CldrDraftStatus minimalDraftStatus;
private final CldrDraftStatus minimumDraftStatus;
private final boolean emitReport;
private final ImmutableSet<String> allLocaleIds;
private final ImmutableSetMultimap<IcuLocaleDir, String> localeIdsMap;
private final ImmutableTable<IcuLocaleDir, String, String> forcedAliases;
private IcuConverterConfig(Builder builder) {
this.cldrDir = checkNotNull(builder.cldrDir,
"must set a CLDR directory, or the CLDR_DIR system property");
if (DEFAULT_CLDR_DIR.isPresent() && !this.cldrDir.equals(DEFAULT_CLDR_DIR.get())) {
System.err.format(
"Warning: Specified CLDR base directory does not appear to match the"
+ " directory inferred by the 'CLDR_DIR' system property.\n"
+ "Specified: %s\n"
+ "Inferred: %s\n",
this.cldrDir, DEFAULT_CLDR_DIR.get());
}
this.outputDir = checkNotNull(builder.outputDir);
checkArgument(!Files.isRegularFile(outputDir),
"specified output directory if not a directory: %s", outputDir);
@ -153,8 +122,10 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
checkArgument(!this.outputTypes.isEmpty(),
"must specify at least one output type to be generated (possible values are: %s)",
Arrays.asList(OutputType.values()));
this.minimalDraftStatus = builder.minimalDraftStatus;
this.minimumDraftStatus = checkNotNull(builder.minimumDraftStatus);
this.emitReport = builder.emitReport;
// getAllLocaleIds() returns the union of all the specified IDs in the map.
this.allLocaleIds = ImmutableSet.copyOf(builder.localeIdsMap.values());
this.localeIdsMap = ImmutableSetMultimap.copyOf(builder.localeIdsMap);
this.forcedAliases = ImmutableTable.copyOf(builder.forcedAliases);
}
@ -163,11 +134,6 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
return new Builder();
}
@Override
public Path getCldrDirectory() {
return cldrDir;
}
@Override
public Path getOutputDir() {
return outputDir;
@ -179,13 +145,13 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
}
@Override
public CldrDraftStatus getMinimumDraftStatus() {
return minimalDraftStatus;
public Path getSpecialsDir() {
return specialsDir;
}
@Override
public Path getSpecialsDir() {
return specialsDir;
public CldrDraftStatus getMinimumDraftStatus() {
return minimumDraftStatus;
}
@Override
@ -194,10 +160,14 @@ public final class IcuConverterConfig implements LdmlConverterConfig {
}
@Override
public Map<String, String> getForcedAliases(IcuLocaleDir dir) {
public ImmutableMap<String, String> getForcedAliases(IcuLocaleDir dir) {
return forcedAliases.row(dir);
}
/**
 * Returns the complete set of locale IDs for this configuration. The set is computed once at
 * construction time as the union of all locale IDs specified in the builder's locale ID map.
 */
@Override public ImmutableSet<String> getAllLocaleIds() {
return allLocaleIds;
}
/** Returns the locale IDs associated with the given directory in the locale ID map. */
@Override public ImmutableSet<String> getTargetLocaleIds(IcuLocaleDir dir) {
return localeIdsMap.get(dir);
}

View file

@ -148,7 +148,8 @@ final class IcuTextWriter {
// TODO: Sort this out so there isn't a messy mix of comment styles in the data files.
private static void writeHeaderAndComments(
PrintWriter out, List<String> header, List<String> comments) {
header.forEach(out::println);
header.forEach(s -> out.println("// " + s));
if (!comments.isEmpty()) {
// TODO: Don't use /* */ block quotes, just use inline // quotes.
out.println(

View file

@ -27,7 +27,6 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@ -61,6 +60,7 @@ import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.LinkedListMultimap;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Maps;
import com.google.common.collect.SetMultimap;
import com.google.common.collect.Sets;
import com.google.common.io.CharStreams;
@ -112,21 +112,6 @@ public final class LdmlConverter {
private static final PathMatcher WINDOWS_ZONES_PATHS =
supplementalMatcher("windowsZones");
// Special IDs which are not supported via CLDR, but for which synthetic data is injected.
// The "TRADITIONAL" variants are here because their calendar differs from the non-variant
// locale. However CLDR cannot represent this currently because calendar defaults are in
// supplemental data (rather than locale data) and are keyed only on territory.
private static final ImmutableSet<String> PHANTOM_LOCALE_IDS =
ImmutableSet.of("ja_JP_TRADITIONAL", "th_TH_TRADITIONAL");
// Special alias mapping which exists in ICU even though "no_NO_NY" is simply not a
// structurally valid locale ID. This is injected manually when creating the alias map.
// This does mean that nobody can ever parse the _keys_ of the alias map, but so far there
// has been no need for that.
// TODO: Get "ars" into CLDR and remove this hack.
private static final Map<String, String> PHANTOM_ALIASES =
ImmutableMap.of("ars", "ar_SA", "no_NO_NY", "nn_NO");
private static PathMatcher supplementalMatcher(String... spec) {
checkArgument(spec.length > 0, "must supply at least one matcher spec");
if (spec.length == 1) {
@ -223,42 +208,35 @@ public final class LdmlConverter {
}
/** Converts CLDR data according to the given configuration. */
public static void convert(LdmlConverterConfig config) {
CldrDataSupplier src = CldrDataSupplier
.forCldrFilesIn(config.getCldrDirectory())
.withDraftStatusAtLeast(config.getMinimumDraftStatus());
new LdmlConverter(config, src).convertAll(config);
public static void convert(
CldrDataSupplier src, SupplementalData supplementalData, LdmlConverterConfig config) {
new LdmlConverter(src, supplementalData, config).convertAll();
}
// The configuration controlling conversion behaviour.
private final LdmlConverterConfig config;
// The supplier for all data to be converted.
private final CldrDataSupplier src;
// The set of available locale IDs.
// TODO: Make available IDs include specials files (or fail if specials are not available).
private final ImmutableSet<String> availableIds;
// Supplemental data available to mappers if needed.
private final SupplementalData supplementalData;
// The configuration controlling conversion behaviour.
private final LdmlConverterConfig config;
// The set of expanded target locale IDs.
// TODO: Make available IDs include specials files (or fail if specials are not available).
private final ImmutableSet<String> availableIds;
// Transformer for locale data.
private final PathValueTransformer localeTransformer;
// Transformer for supplemental data.
private final PathValueTransformer supplementalTransformer;
// Header string to go into every ICU data file.
private final ImmutableList<String> icuFileHeader;
// Header string to go into every ICU data and transliteration rule file (comment prefixes
// are not present and must be added by the code writing the file).
private final ImmutableList<String> fileHeader;
private LdmlConverter(LdmlConverterConfig config, CldrDataSupplier src) {
this.config = checkNotNull(config);
private LdmlConverter(
CldrDataSupplier src, SupplementalData supplementalData, LdmlConverterConfig config) {
this.src = checkNotNull(src);
this.supplementalData = SupplementalData.create(src.getDataForType(SUPPLEMENTAL));
// Sort the set of available locale IDs but add "root" at the front. This is the
// set of non-alias locale IDs to be processed.
Set<String> localeIds = new LinkedHashSet<>();
localeIds.add("root");
localeIds.addAll(
Sets.intersection(src.getAvailableLocaleIds(), config.getTargetLocaleIds(LOCALES)));
localeIds.addAll(PHANTOM_LOCALE_IDS);
this.availableIds = ImmutableSet.copyOf(localeIds);
this.supplementalData = checkNotNull(supplementalData);
this.config = checkNotNull(config);
this.availableIds = ImmutableSet.copyOf(
Sets.intersection(supplementalData.getAvailableLocaleIds(), config.getAllLocaleIds()));
// Load the remaining path value transformers.
this.supplementalTransformer =
RegexTransformer.fromConfigLines(readLinesFromResource("/ldml2icu_supplemental.txt"),
@ -270,10 +248,10 @@ public final class LdmlConverter {
this.localeTransformer =
RegexTransformer.fromConfigLines(readLinesFromResource("/ldml2icu_locale.txt"),
IcuFunctions.CONTEXT_TRANSFORM_INDEX_FN);
this.icuFileHeader = ImmutableList.copyOf(readLinesFromResource("/ldml2icu_header.txt"));
this.fileHeader = ImmutableList.copyOf(readLinesFromResource("/ldml2icu_header.txt"));
}
private void convertAll(LdmlConverterConfig config) {
private void convertAll() {
ListMultimap<CldrDataType, OutputType> groupByType = LinkedListMultimap.create();
for (OutputType t : config.getOutputTypes()) {
groupByType.put(t.getCldrType(), t);
@ -361,7 +339,7 @@ public final class LdmlConverter {
SetMultimap<IcuLocaleDir, String> writtenLocaleIds = HashMultimap.create();
Path baseDir = config.getOutputDir();
for (String id : config.getTargetLocaleIds(LOCALES)) {
for (String id : config.getAllLocaleIds()) {
// Skip "target" IDs that are aliases (they are handled later).
if (!availableIds.contains(id)) {
continue;
@ -429,13 +407,17 @@ public final class LdmlConverter {
// and must be manually mapped (e.g. legacy locale IDs which don't even parse).
// 4: It is a "super special" forced alias, which might replace existing aliases in
// some output directories.
// Even forced aliases only apply if they are in the set of locale IDs for the directory.
Map<String, String> forcedAliases =
Maps.filterKeys(config.getForcedAliases(dir), localeIds::contains);
Map<String, String> aliasMap = new LinkedHashMap<>();
for (String id : localeIds) {
if (PHANTOM_ALIASES.keySet().contains(id)) {
checkArgument(!availableIds.contains(id),
"phantom aliases should never be otherwise supported: %s\n"
+ "(maybe the phantom alias can now be removed?)", id);
aliasMap.put(id, PHANTOM_ALIASES.get(id));
if (forcedAliases.keySet().contains(id)) {
// Forced aliases will be added later and don't need to be processed here. This
// is especially necessary if the ID is not structurally valid (e.g. "no_NO_NY")
// since that cannot be processed by the code below.
continue;
}
String canonicalId = supplementalData.replaceDeprecatedTags(id);
@ -459,7 +441,7 @@ public final class LdmlConverter {
// Important that we overwrite entries which might already exist here, since we might have
// already calculated a "natural" alias for something that we want to force (and we should
// replace the existing target, since that affects how we determine empty files later).
aliasMap.putAll(config.getForcedAliases(dir));
aliasMap.putAll(forcedAliases);
return aliasMap;
}
@ -490,7 +472,7 @@ public final class LdmlConverter {
private void processTransforms() {
Path transformDir = createDirectory(config.getOutputDir().resolve("translit"));
write(TransformsMapper.process(src, transformDir), transformDir);
write(TransformsMapper.process(src, transformDir, fileHeader), transformDir);
}
private static final RbPath RB_CLDR_VERSION = RbPath.of("cldrVersion");
@ -533,7 +515,7 @@ public final class LdmlConverter {
private void write(IcuData icuData, Path dir) {
createDirectory(dir);
IcuTextWriter.writeToFile(icuData, dir, icuFileHeader);
IcuTextWriter.writeToFile(icuData, dir, fileHeader);
}
private Path createDirectory(Path dir) {

View file

@ -63,9 +63,6 @@ public interface LdmlConverterConfig {
*/
Set<OutputType> getOutputTypes();
/** Returns the root directory in which the CLDR release is located. */
Path getCldrDirectory();
/**
* Returns an additional "specials" directory containing additional ICU specific XML
* files depending on the given output type. This is where the converter finds any XML
@ -83,12 +80,19 @@ public interface LdmlConverterConfig {
CldrDraftStatus getMinimumDraftStatus();
/**
* Returns the set of locale IDs to be processed for the given directory.
* Returns the complete set of locale IDs which should be considered for processing for this
* configuration.
*
* <p>This set can contain IDs which have no ICU data associated with them if they are
* suitable aliases (e.g. they are deprecated versions of locale IDs for which data does
* <p>Note that this set can contain IDs which have no CLDR data associated with them if they
* are suitable aliases (e.g. they are deprecated versions of locale IDs for which data does
* exist).
*/
Set<String> getAllLocaleIds();
/**
* Returns the set of locale IDs to be processed for the given directory. This set must always
* be a subset of {@link #getAllLocaleIds()}.
*/
Set<String> getTargetLocaleIds(IcuLocaleDir dir);
/**

View file

@ -17,20 +17,24 @@ import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrData;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataType;
import com.google.common.base.Ascii;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableTable;
import com.google.common.collect.Sets;
import com.google.common.collect.Table;
/**
@ -43,6 +47,13 @@ import com.google.common.collect.Table;
*/
// TODO: This should be moved into the API and leverage some of the existing utility functions.
public final class SupplementalData {
// Special IDs which are not supported via CLDR, but for which synthetic data is injected.
// The "TRADITIONAL" variants are here because their calendar differs from the non-variant
// locale. However CLDR cannot represent this currently because calendar defaults are in
// supplemental data (rather than locale data) and are keyed only on territory.
private static final ImmutableSet<String> PHANTOM_LOCALE_IDS =
ImmutableSet.of("ja_JP_TRADITIONAL", "th_TH_TRADITIONAL");
private static final Pattern SCRIPT_SUBTAG = Pattern.compile("[A-Z][a-z]{3}");
private static final PathMatcher ALIAS =
@ -88,18 +99,18 @@ public final class SupplementalData {
}
/**
* Creates a supplemental data API instance from the given CLDR data.
* Creates a supplemental data API instance from the given CLDR data supplier.
*
* @param supplementalData the raw CLDR supplemental data instance.
* @param src the CLDR data supplier.
* @return the supplemental data API.
*/
public static SupplementalData create(CldrData supplementalData) {
public static SupplementalData create(CldrDataSupplier src) {
Table<Alias, String, String> aliasTable = HashBasedTable.create();
Map<String, String> parentLocaleMap = new HashMap<>();
Map<String, String> defaultCalendarMap = new HashMap<>();
Map<String, String> likelySubtagMap = new HashMap<>();
supplementalData.accept(
src.getDataForType(CldrDataType.SUPPLEMENTAL).accept(
ARBITRARY,
v -> {
if (ALIAS.matches(v.getPath())) {
@ -122,17 +133,9 @@ public final class SupplementalData {
}
});
// WARNING: The original mapper code determines the full set of deprecated territories and
// then removes the following hard-coded list without any explanation as to why. While this
// is presumably to "undeprecate" them for the purposes of the locale processing, there's
// no explanation of where this list comes from, and thus no way to maintain it.
//
// asList("062", "172", "200", "830", "AN", "CS", "QU")
// .forEach(t -> aliasTable.remove(Alias.TERRITORY, t));
// TODO: Understand and document what on Earth this is all about or delete this comment.
Set<String> availableIds = Sets.union(src.getAvailableLocaleIds(), PHANTOM_LOCALE_IDS);
return new SupplementalData(
aliasTable, parentLocaleMap, defaultCalendarMap, likelySubtagMap);
availableIds, aliasTable, parentLocaleMap, defaultCalendarMap, likelySubtagMap);
}
// A simple-as-possible, mutable, locale ID data "struct" to handle the IDs used during ICU
@ -256,22 +259,30 @@ public final class SupplementalData {
}
}
private final ImmutableSet<String> availableIds;
private final ImmutableTable<Alias, String, String> aliasTable;
private final ImmutableMap<String, String> parentLocaleMap;
private final ImmutableMap<String, String> defaultCalendarMap;
private final ImmutableMap<String, String> likelySubtagMap;
/**
 * Creates the API instance from the collected supplemental data. Every argument is copied into
 * an immutable collection, so later changes to the caller's collections are not reflected here.
 *
 * @param availableIds the locale IDs reported by {@link #getAvailableLocaleIds()}.
 * @param aliasTable the alias table, copied as given.
 * @param parentLocaleMap the parent locale map, copied as given.
 * @param defaultCalendarMap the default calendar map, copied as given.
 * @param likelySubtagMap the likely subtag map, copied as given.
 */
private SupplementalData(
    Set<String> availableIds,
    Table<Alias, String, String> aliasTable,
    Map<String, String> parentLocaleMap,
    Map<String, String> defaultCalendarMap,
    Map<String, String> likelySubtagMap) {
    // The snapshots are independent of each other, so they are simply grouped by kind.
    this.likelySubtagMap = ImmutableMap.copyOf(likelySubtagMap);
    this.defaultCalendarMap = ImmutableMap.copyOf(defaultCalendarMap);
    this.parentLocaleMap = ImmutableMap.copyOf(parentLocaleMap);
    this.aliasTable = ImmutableTable.copyOf(aliasTable);
    this.availableIds = ImmutableSet.copyOf(availableIds);
}
/**
 * Returns the immutable set of locale IDs with which this instance was created.
 *
 * @return the available locale IDs.
 */
public ImmutableSet<String> getAvailableLocaleIds() {
    return this.availableIds;
}
/**
* Returns the "maximized" form of a given locale ID, by adding likely subtags where possible.
*/

View file

@ -12,19 +12,26 @@ import static java.util.stream.Collectors.joining;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Optional;
import org.apache.tools.ant.BuildException;
import org.apache.tools.ant.Task;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDraftStatus;
import org.unicode.icu.tool.cldrtoicu.IcuConverterConfig;
import org.unicode.icu.tool.cldrtoicu.LdmlConverter;
import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir;
import org.unicode.icu.tool.cldrtoicu.SupplementalData;
import com.google.common.base.Ascii;
import com.google.common.base.CaseFormat;
import com.google.common.base.CharMatcher;
import com.google.common.base.Splitter;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.SetMultimap;
// Note: Auto-magical Ant methods are listed as "unused" by IDEs, unless the warning is suppressed.
public final class ConvertIcuDataTask extends Task {
@ -36,6 +43,12 @@ public final class ConvertIcuDataTask extends Task {
private static final CharMatcher LOWER_UNDERSCORE = inRange('a', 'z').or(DIGIT_OR_UNDERSCORE);
private static final CharMatcher VALID_ENUM_CHAR = LOWER_UNDERSCORE.or(UPPER_UNDERSCORE);
private Path cldrPath;
private CldrDraftStatus minimumDraftStatus;
// Set of default locale ID specifiers (wildcard IDs which are expanded).
private ImmutableSet<String> localeIdSpec;
// Per directory overrides (fully specified locale IDs).
private final SetMultimap<IcuLocaleDir, String> perDirectoryIds = HashMultimap.create();
private final IcuConverterConfig.Builder config = IcuConverterConfig.builder();
@SuppressWarnings("unused")
@ -45,21 +58,24 @@ public final class ConvertIcuDataTask extends Task {
@SuppressWarnings("unused")
public void setCldrDir(Path path) {
config.setCldrDir(path);
this.cldrPath = checkNotNull(path);
}
@SuppressWarnings("unused")
public void setMinimalDraftStatus(String status) {
config.setMinimalDraftStatus(resolve(CldrDraftStatus.class, status));
minimumDraftStatus = resolve(CldrDraftStatus.class, status);
}
@SuppressWarnings("unused")
public void setOutputTypes(String types) {
config.setOutputTypes(
ImmutableList<OutputType> typeList =
LIST_SPLITTER
.splitToList(types).stream()
.map(s -> resolve(LdmlConverter.OutputType.class, s))
.collect(toImmutableList()));
.map(s -> resolve(OutputType.class, s))
.collect(toImmutableList());
if (!typeList.isEmpty()) {
config.setOutputTypes(typeList);
}
}
@SuppressWarnings("unused")
@ -73,59 +89,118 @@ public final class ConvertIcuDataTask extends Task {
}
public static final class LocaleIds extends Task {
private ImmutableList<IcuLocaleDir> dirs = ImmutableList.of();
private ImmutableList<String> ids = ImmutableList.of();
@SuppressWarnings("unused")
public void setDirs(String directories) {
this.dirs = LIST_SPLITTER.splitToList(directories).stream()
.map(s -> resolve(IcuLocaleDir.class, s))
.collect(toImmutableList());
}
private ImmutableSet<String> ids;
@SuppressWarnings("unused")
public void addText(String localeIds) {
// Need to filter out '//' style end-of-line comments first (replace with \n to avoid
// inadvertantly joining two elements.
localeIds = localeIds.replaceAll("//[^\n]*\n", "\n");
this.ids = ImmutableList.copyOf(LIST_SPLITTER.splitToList(localeIds));
this.ids = parseLocaleIds(localeIds);
}
@Override
public void init() throws BuildException {
checkBuild(!ids.isEmpty(), "Locale IDs must be specified");
}
}
public static final class ForcedAlias extends Task {
public static final class DirectoryFilter extends Task {
private IcuLocaleDir dir;
private String source;
private String target;
private ImmutableSet<String> ids;
@SuppressWarnings("unused")
public void setDir(String directory) {
this.dir = resolve(IcuLocaleDir.class, directory);
}
@SuppressWarnings("unused")
public void addText(String localeIds) {
this.ids = parseLocaleIds(localeIds);
}
@Override
public void init() throws BuildException {
checkBuild(dir != null, "Directory must be specified");
checkBuild(!ids.isEmpty(), "Locale IDs must be specified");
}
}
public static final class ForcedAlias extends Task {
private Optional<IcuLocaleDir> dir = Optional.empty();
private String source = "";
private String target = "";
@SuppressWarnings("unused")
public void setDir(String directory) {
this.dir = resolveOpt(IcuLocaleDir.class, directory);
}
@SuppressWarnings("unused")
public void setSource(String source) {
this.source = checkNotNull(source);
this.source = whitespace().trimFrom(source);
}
@SuppressWarnings("unused")
public void setTarget(String target) {
this.target = checkNotNull(target);
this.target = whitespace().trimFrom(target);
}
@Override
public void init() throws BuildException {
checkBuild(!source.isEmpty(), "Alias source must not be empty");
checkBuild(!target.isEmpty(), "Alias target must not be empty");
}
}
@SuppressWarnings("unused")
public void addConfiguredLocaleIds(LocaleIds localeIds) {
localeIds.dirs.forEach(d -> config.addLocaleIds(d, localeIds.ids));
checkBuild(this.localeIdSpec == null, "Cannot add more that one <localeIds> element");
this.localeIdSpec = localeIds.ids;
}
// Records the fully specified locale IDs of a directory filter against its directory. IDs
// from several filters naming the same directory accumulate in the multimap.
// NOTE(review): presumably invoked reflectively by Ant for each nested element - confirm.
@SuppressWarnings("unused")
public void addConfiguredDirectoryFilter(DirectoryFilter filter) {
    IcuLocaleDir directory = filter.dir;
    ImmutableSet<String> directoryIds = filter.ids;
    perDirectoryIds.putAll(directory, directoryIds);
}
@SuppressWarnings("unused")
public void addConfiguredForcedAlias(ForcedAlias alias) {
config.addForcedAlias(alias.dir, alias.source, alias.target);
if (alias.dir.isPresent()) {
config.addForcedAlias(alias.dir.get(), alias.source, alias.target);
} else {
for (IcuLocaleDir dir : IcuLocaleDir.values()) {
config.addForcedAlias(dir, alias.source, alias.target);
}
}
}
@SuppressWarnings("unused")
public void execute() throws BuildException {
LdmlConverter.convert(config.build());
CldrDataSupplier src =
CldrDataSupplier.forCldrFilesIn(cldrPath).withDraftStatusAtLeast(minimumDraftStatus);
SupplementalData supplementalData = SupplementalData.create(src);
ImmutableSet<String> defaultTargetIds =
LocaleIdResolver.expandTargetIds(this.localeIdSpec, supplementalData);
for (IcuLocaleDir dir : IcuLocaleDir.values()) {
config.addLocaleIds(dir, perDirectoryIds.asMap().getOrDefault(dir, defaultTargetIds));
}
config.setMinimumDraftStatus(minimumDraftStatus);
LdmlConverter.convert(src, supplementalData, config.build());
}
/**
 * Fails the build with the given message unless the condition holds.
 *
 * @param condition the condition which must be true for the build to continue.
 * @param message the message of the exception thrown when the condition is false.
 * @throws BuildException if {@code condition} is false.
 */
private static void checkBuild(boolean condition, String message) {
    if (condition) {
        return;
    }
    throw new BuildException(message);
}
/**
 * Parses the text content of a locale ID element into a set of IDs.
 *
 * <p>Any {@code //} style end-of-line comments are removed before the remaining text is split
 * by {@code LIST_SPLITTER}.
 *
 * @param localeIds the raw element text, possibly containing {@code //} comments.
 * @return the IDs in the order produced by the splitter, without duplicates.
 */
private static ImmutableSet<String> parseLocaleIds(String localeIds) {
    // Need to filter out '//' style end-of-line comments first. Only the comment itself is
    // removed; the line terminator is left in place to avoid inadvertently joining two
    // elements. Not requiring a trailing '\n' in the pattern means a comment on the final
    // line (with no newline before the end of the text) is stripped as well.
    String withoutComments = localeIds.replaceAll("//[^\n]*", "");
    return ImmutableSet.copyOf(LIST_SPLITTER.splitToList(withoutComments));
}
/**
 * Resolves an optional enum attribute value.
 *
 * @param enumClass the enum type to resolve against.
 * @param name the attribute value; must not be null.
 * @return empty if {@code name} is the empty string, otherwise the result of
 *     {@code resolve(enumClass, name)}.
 */
private static <T extends Enum<T>> Optional<T> resolveOpt(Class<T> enumClass, String name) {
    if (name.isEmpty()) {
        return Optional.empty();
    }
    return Optional.of(resolve(enumClass, name));
}
private static <T extends Enum<T>> T resolve(Class<T> enumClass, String name) {

View file

@ -0,0 +1,123 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.ant;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.collect.ImmutableSet.toImmutableSet;
import java.util.Set;
import java.util.TreeSet;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import org.unicode.icu.tool.cldrtoicu.SupplementalData;
import com.google.common.base.Ascii;
import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Sets;
/**
 * Helper class to resolve ID configuration.
 *
 * <p>Given the locale ID specifications from the build configuration, this expands "wildcard"
 * IDs (a base language, optionally with a script) to every available locale ID which has that
 * wildcard as an ancestor according to {@link SupplementalData#getParent}, and rejects
 * configurations which explicitly list IDs that are already implied by a wildcard.
 */
final class LocaleIdResolver {
    /**
     * Returns the expanded set of target locale IDs based on the given ID specifications.
     *
     * @param idSpecs the locale ID specifications from the configuration.
     * @param supplementalData the supplemental data used to find available IDs, parent IDs and
     *     likely subtags.
     * @return the expanded IDs, with "root" first.
     * @throws IllegalStateException if the specification lists IDs which are already implied by
     *     a wildcard ID, or if the language subtag of a wildcard ID cannot be maximized.
     */
    public static ImmutableSet<String> expandTargetIds(
        Set<String> idSpecs, SupplementalData supplementalData) {
        return new LocaleIdResolver(supplementalData).resolve(idSpecs);
    }

    private final SupplementalData supplementalData;

    private LocaleIdResolver(SupplementalData supplementalData) {
        this.supplementalData = checkNotNull(supplementalData);
    }

    // ---- Code below here is to expand the incoming set of locale IDs ----

    // Matches IDs which can act as wildcards: a two or three letter language subtag, optionally
    // followed by a script subtag (e.g. "en", "yue", "en_Latn" or "shi_Latn"). Three letter
    // languages must be accepted here, otherwise IDs such as "shi_Latn" are never expanded.
    private static final Pattern WILDCARD_LOCALE =
        Pattern.compile("[a-z]{2,3}(?:_[A-Z][a-z]{3})?");

    // Expands the given specifications into the final, ordered set of target IDs.
    private ImmutableSet<String> resolve(Set<String> idSpecs) {
        ImmutableSet<String> allAvailableIds = supplementalData.getAvailableLocaleIds();

        // Get the minimized wildcard set, converting things like "en_Latn" --> "en".
        ImmutableSet<String> wildcardIds = idSpecs.stream()
            .filter(allAvailableIds::contains)
            .filter(id -> WILDCARD_LOCALE.matcher(id).matches())
            .map(this::removeDefaultScript)
            .collect(toImmutableSet());

        // Get the set of IDs which are implied by the wildcard IDs.
        Set<String> targetIds = new TreeSet<>();
        allAvailableIds.forEach(id -> addWildcardMatches(id, wildcardIds::contains, targetIds));

        // Get the IDs which don't need to be in the config (because they are implied).
        Set<String> redundant = Sets.intersection(idSpecs, targetIds);
        if (!redundant.isEmpty()) {
            reportRedundantIds(idSpecs, targetIds, redundant);
            throw new IllegalStateException("Non-canonical configuration");
        }

        // We return the set of IDs made up of:
        // 1: The original IDs specified by the configuration (and any parent IDs).
        // 2: IDs expanded from wildcard IDs (e.g. "en_Latn_GB" & "en_Latn" from "en").
        //    (this is what's already in targetIds).
        // 3: The "root" ID.
        idSpecs.forEach(id -> addRecursively(id, targetIds));
        return ImmutableSet.<String>builder().add("root").addAll(targetIds).build();
    }

    // Prints the redundant IDs, followed by the canonical ID list the configuration should use
    // instead (grouped by first character), to standard error.
    private static void reportRedundantIds(
        Set<String> idSpecs, Set<String> targetIds, Set<String> redundant) {
        System.err.println("Configuration lists redundant locale IDs");
        System.err.println("The following IDs should be removed from the configuration:");
        Iterables.partition(redundant, 16)
            .forEach(ids -> System.err.println(String.join(", ", ids)));

        // Note that the minimal configuration includes aliases.
        Set<String> minimalConfigIds = new TreeSet<>(Sets.difference(idSpecs, targetIds));
        minimalConfigIds.remove("root");

        ImmutableListMultimap<Character, String> idsByFirstChar =
            Multimaps.index(minimalConfigIds, s -> s.charAt(0));

        System.err.println("Canonical ID list is:");
        for (char c : idsByFirstChar.keySet()) {
            System.err.println("    // " + Ascii.toUpperCase(c));
            Iterables.partition(idsByFirstChar.get(c), 16)
                .forEach(ids -> System.err.println("    " + String.join(", ", ids)));
            System.err.println();
        }
        System.err.flush();
    }

    // Strips the script from a language+script ID when the maximized form of the language
    // starts with that ID (i.e. the script is the one implied by the language alone).
    // E.g. "xx_Fooo" --> "xx" --> "xx_Baar_YY" ==> "xx_Fooo"
    // E.g. "xx_Fooo" --> "xx" --> "xx_Fooo_YY" ==> "xx"
    private String removeDefaultScript(String id) {
        int separator = id.indexOf('_');
        if (separator != -1) {
            // The language subtag may be two or three letters long, so split on the separator
            // rather than assuming a fixed length.
            String lang = id.substring(0, separator);
            String maxId = supplementalData.maximize(lang)
                .orElseThrow(
                    () -> new IllegalStateException("cannot maximize language subtag: " + lang));
            if (maxId.startsWith(id)) {
                return lang;
            }
        }
        return id;
    }

    // Adds the given ID and each of its ancestors to the destination set, stopping at "root" or
    // at the first ID which was already present (its ancestors were added previously).
    private void addRecursively(String id, Set<String> dst) {
        while (!id.equals("root") && dst.add(id)) {
            id = supplementalData.getParent(id);
        }
    }

    // Adds the given ID to the destination set if any of its ancestors is a wildcard ID, and
    // returns whether it was added. Ancestors below the wildcard are added on the way.
    private boolean addWildcardMatches(
        String id, Predicate<String> isWildcard, Set<String> dst) {
        if (id.equals("root")) {
            return false;
        }
        String parentId = supplementalData.getParent(id);
        if (isWildcard.test(parentId) || addWildcardMatches(parentId, isWildcard, dst)) {
            dst.add(id);
            return true;
        }
        return false;
    }
}

View file

@ -15,6 +15,7 @@ import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
@ -30,6 +31,7 @@ import org.unicode.icu.tool.cldrtoicu.RbPath;
import org.unicode.icu.tool.cldrtoicu.RbValue;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.ibm.icu.text.Transliterator;
/**
@ -78,7 +80,9 @@ public final class TransformsMapper {
* @param ruleFileOutputDir the directory into which transliteration rule files will be written.
* @return the IcuData instance to be written to a file.
*/
public static IcuData process(CldrDataSupplier src, Path ruleFileOutputDir) {
public static IcuData process(
CldrDataSupplier src, Path ruleFileOutputDir, List<String> header) {
Function<Path, PrintWriter> fileWriterFn = p -> {
Path file = ruleFileOutputDir.resolve(p);
try {
@ -88,12 +92,14 @@ public final class TransformsMapper {
}
};
CldrData cldrData = src.getDataForType(SUPPLEMENTAL);
return process(cldrData, fileWriterFn);
return process(cldrData, fileWriterFn, header);
}
@VisibleForTesting // It's easier to supply a fake data instance than a fake supplier.
static IcuData process(CldrData cldrData, Function<Path, PrintWriter> fileWriterFn) {
RuleVisitor visitor = new RuleVisitor(fileWriterFn);
static IcuData process(
CldrData cldrData, Function<Path, PrintWriter> fileWriterFn, List<String> header) {
RuleVisitor visitor = new RuleVisitor(fileWriterFn, header);
cldrData.accept(DTD, visitor);
addSpecialCaseValues(visitor.icuData);
return visitor.icuData;
@ -102,9 +108,11 @@ public final class TransformsMapper {
private static class RuleVisitor implements ValueVisitor {
private final IcuData icuData = new IcuData("root", false);
private final Function<Path, PrintWriter> outFn;
private final ImmutableList<String> header;
RuleVisitor(Function<Path, PrintWriter> outFn) {
RuleVisitor(Function<Path, PrintWriter> outFn, List<String> header) {
this.outFn = checkNotNull(outFn);
this.header = ImmutableList.copyOf(header);
icuData.setFileComment("File: root.txt");
}
@ -124,8 +132,8 @@ public final class TransformsMapper {
private void writeDataFile(String filename, CldrValue value) {
try (PrintWriter out = outFn.apply(Paths.get(filename))) {
out.println("\uFEFF# © 2016 and later: Unicode, Inc. and others.");
out.println("# License & terms of use: http://www.unicode.org/copyright.html#License");
out.print("\uFEFF");
header.forEach(s -> out.println("# " + s));
out.println("#");
out.println("# File: " + filename);
out.println("# Generated from CLDR");

View file

@ -1,2 +1,2 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
© 2016 and later: Unicode, Inc. and others.
License & terms of use: http://www.unicode.org/copyright.html#License

View file

@ -5,8 +5,6 @@ package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.truth.Truth.assertThat;
import static com.google.common.truth.Truth.assertWithMessage;
import static com.google.common.truth.Truth8.assertThat;
import static java.util.Arrays.asList;
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
import static org.unicode.cldr.api.CldrValue.parseValue;
import java.nio.file.Path;
@ -25,6 +23,7 @@ import org.unicode.cldr.tool.LikelySubtags;
import org.unicode.cldr.util.LanguageTagCanonicalizer;
import org.unicode.cldr.util.LocaleIDParser;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.icu.tool.cldrtoicu.testing.FakeDataSupplier;
import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableSet;
@ -41,8 +40,7 @@ public class SupplementalDataTest {
@BeforeClass
public static void loadRegressionData() {
Path cldrRoot = Paths.get(System.getProperty("CLDR_DIR"));
regressionData = SupplementalData
.create(CldrDataSupplier.forCldrFilesIn(cldrRoot).getDataForType(SUPPLEMENTAL));
regressionData = SupplementalData.create(CldrDataSupplier.forCldrFilesIn(cldrRoot));
SupplementalDataInfo sdi =
SupplementalDataInfo.getInstance(cldrRoot.resolve("common/supplemental").toString());
likelySubtags = new LikelySubtags(sdi);
@ -348,6 +346,6 @@ public class SupplementalDataTest {
}
private static SupplementalData fakeSupplementalData(CldrValue... values) {
return SupplementalData.create(CldrDataSupplier.forValues(asList(values)));
return SupplementalData.create(new FakeDataSupplier().addSupplementalData(values));
}
}

View file

@ -21,7 +21,6 @@ import java.util.Arrays;
import java.util.Map;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.stream.Stream;
import org.junit.Test;
import org.junit.runner.RunWith;
@ -38,10 +37,14 @@ import com.google.common.collect.ImmutableList;
@RunWith(JUnit4.class)
public class TransformsMapperTest {
private static final ImmutableList<String> FILE_HEADER = ImmutableList.of(
"\uFEFF# © 2016 and later: Unicode, Inc. and others.",
"# License & terms of use: http://www.unicode.org/copyright.html#License",
"#");
private static final ImmutableList<String> HEADER_LINES = ImmutableList.of(
"First header line",
"Second header line");
private static final String FILE_HEADER =
"\uFEFF# First header line\n"
+ "# Second header line\n"
+ "#\n";
private static final int DEFAULT_PATH_COUNT = 7;
@ -64,7 +67,7 @@ public class TransformsMapperTest {
@Test
public void testDefaultContent() {
Map<String, String> fileMap = new TreeMap<>();
IcuData icuData = TransformsMapper.process(cldrData(), wrap(fileMap));
IcuData icuData = TransformsMapper.process(cldrData(), wrap(fileMap), HEADER_LINES);
assertThat(fileMap).isEmpty();
@ -88,7 +91,7 @@ public class TransformsMapperTest {
cldrData(oneWay("foo", "bar", FORWARD, null, INTERNAL, "first second third", ++idx));
Map<String, String> fileMap = new TreeMap<>();
IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap));
IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap), HEADER_LINES);
assertThat(icuData).getPaths().hasSize(DEFAULT_PATH_COUNT + 5);
assertThat(icuData).hasValuesFor("RuleBasedTransliteratorIDs/first/alias", "foo-bar");
@ -118,7 +121,7 @@ public class TransformsMapperTest {
cldrData(oneWay("foo", "bar", BACKWARD, "variant", EXTERNAL, "one two three", ++idx));
Map<String, String> fileMap = new TreeMap<>();
IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap));
IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap), HEADER_LINES);
assertThat(icuData).getPaths().hasSize(DEFAULT_PATH_COUNT + 5);
assertThat(icuData).hasValuesFor("RuleBasedTransliteratorIDs/one/alias", "bar-foo/variant");
@ -149,7 +152,7 @@ public class TransformsMapperTest {
both("foo", "bar", null, INTERNAL, "forward-alias", "backward-alias", ++idx));
Map<String, String> fileMap = new TreeMap<>();
IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap));
IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap), HEADER_LINES);
// 3 for each direction.
assertThat(icuData).getPaths().hasSize(DEFAULT_PATH_COUNT + 6);
@ -188,9 +191,7 @@ public class TransformsMapperTest {
private String headerPlusLines(String... lines) {
// For now the files always contain a blank line at the end (to match legacy behaviour) but
// this can, and probably should be changed.
return Stream
.concat(FILE_HEADER.stream(), Arrays.stream(lines))
.collect(joining("\n", "", "\n\n"));
return Arrays.stream(lines).collect(joining("\n", FILE_HEADER, "\n\n"));
}
private static CldrData cldrData(CldrValue... values) {