From 7078e19070f427679945bf501eed440334a5df1e Mon Sep 17 00:00:00 2001 From: David Beaumont Date: Fri, 6 Sep 2019 23:34:17 +0200 Subject: [PATCH] ICU-20693 Refactoring for inferred IDs. --- tools/cldr/cldr-to-icu/build-icu-data.xml | 229 ++++++++---------- .../tool/cldrtoicu/IcuConverterConfig.java | 66 ++--- .../icu/tool/cldrtoicu/IcuTextWriter.java | 3 +- .../icu/tool/cldrtoicu/LdmlConverter.java | 84 +++---- .../tool/cldrtoicu/LdmlConverterConfig.java | 16 +- .../icu/tool/cldrtoicu/SupplementalData.java | 41 ++-- .../cldrtoicu/ant/ConvertIcuDataTask.java | 127 ++++++++-- .../tool/cldrtoicu/ant/LocaleIdResolver.java | 123 ++++++++++ .../cldrtoicu/mapper/TransformsMapper.java | 22 +- .../src/main/resources/ldml2icu_header.txt | 4 +- .../tool/cldrtoicu/SupplementalDataTest.java | 8 +- .../mapper/TransformsMapperTest.java | 25 +- 12 files changed, 449 insertions(+), 299 deletions(-) create mode 100644 tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/LocaleIdResolver.java diff --git a/tools/cldr/cldr-to-icu/build-icu-data.xml b/tools/cldr/cldr-to-icu/build-icu-data.xml index 13d3936db0c..665d7150a9c 100644 --- a/tools/cldr/cldr-to-icu/build-icu-data.xml +++ b/tools/cldr/cldr-to-icu/build-icu-data.xml @@ -78,184 +78,117 @@ + outputTypes="${outputTypes}" minimalDraftStatus="${minDraftStatus}" emitReport="${emitReport}"> - - + - - + 1) Base languages are expanded to include default scripts (e.g. "en" -> "en_Latn"). + 2) All region and variant subtags are added for any base language or language+script + (e.g. "en" -> "en_GB" or "shi_Latn" -> "shi_Latn_MA"). - - - - - - - root, + If a non-default script is desired it should be listed explicitly (e.g. "sr_Latn"). + Locale IDs with deprecated subtags (which become aliases) must still be listed in + full (e.g. "en_RH" or "sr_Latn_YU"). 
+ --> + // A - af, af_NA, af_ZA, agq, agq_CM, ak, ak_GH, am, am_ET, ar, ar_001, - ar_AE, ar_BH, ar_DJ, ar_DZ, ar_EG, ar_EH, ar_ER, ar_IL, ar_IQ, - ar_JO, ar_KM, ar_KW, ar_LB, ar_LY, ar_MA, ar_MR, ar_OM, ar_PS, - ar_QA, ar_SA, ar_SD, ar_SO, ar_SS, ar_SY, ar_TD, ar_TN, ar_YE, ars, - as, as_IN, asa, asa_TZ, ast, ast_ES, az, az_AZ, az_Cyrl, az_Cyrl_AZ, - az_Latn, az_Latn_AZ, + af, agq, agq_CM, ak, am, ar, ars, as, asa, asa_TZ, ast, ast_ES, az, az_AZ, az_Cyrl // B - bas, bas_CM, be, be_BY, bem, bem_ZM, bez, bez_TZ, bg, bg_BG, bm, - bm_ML, bn, bn_BD, bn_IN, bo, bo_CN, bo_IN, br, br_FR, brx, brx_IN, - bs, bs_Cyrl, bs_Cyrl_BA, bs_Latn, bs_Latn_BA, bs_BA, + bas, bas_CM, be, bem, bem_ZM, bez, bez_TZ, bg, bm, bn, bo, br, brx, brx_IN, bs, bs_BA + bs_Cyrl // C - ca, ca_AD, ca_ES, ca_FR, ca_IT, ccp, ccp_BD, ccp_IN, ce, ce_RU, - ceb, ceb_PH, cgg, cgg_UG, chr, chr_US, ckb, ckb_IQ, ckb_IR, cs, - cs_CZ, cy, cy_GB, + ca, ccp, ccp_BD, ccp_IN, ce, ceb, ceb_PH, cgg, cgg_UG, chr, chr_US, ckb, ckb_IQ, ckb_IR, cs, cy // D - da, da_DK, da_GL, dav, dav_KE, de, de_AT, de_BE, de_CH, de_DE, - de_IT, de_LI, de_LU, dje, dje_NE, dsb, dsb_DE, dua, dua_CM, dyo, - dyo_SN, dz, dz_BT, + da, dav, dav_KE, de, dje, dje_NE, dsb, dsb_DE, dua, dua_CM, dyo, dyo_SN, dz // E - ebu, ebu_KE, ee, ee_GH, ee_TG, el, el_CY, el_GR, en, en_001, - en_150, en_AE, en_AG, en_AI, en_AS, en_AT, en_AU, en_BB, en_BE, - en_BI, en_BM, en_BS, en_BW, en_BZ, en_CA, en_CC, en_CH, en_CK, - en_CM, en_CX, en_CY, en_DE, en_DG, en_DK, en_DM, en_ER, en_FI, - en_FJ, en_FK, en_FM, en_GB, en_GD, en_GG, en_GH, en_GI, en_GM, - en_GU, en_GY, en_HK, en_IE, en_IL, en_IM, en_IN, en_IO, en_JE, - en_JM, en_KE, en_KI, en_KN, en_KY, en_LC, en_LR, en_LS, en_MG, - en_MH, en_MO, en_MP, en_MS, en_MT, en_MU, en_MW, en_MY, en_NA, - en_NF, en_NG, en_NH, en_NL, en_NR, en_NU, en_NZ, en_PG, en_PH, - en_PK, en_PN, en_PR, en_PW, en_RH, en_RW, en_SB, en_SC, en_SD, - en_SE, en_SG, en_SH, en_SI, en_SL, en_SS, en_SX, en_SZ, en_TC, - en_TK, en_TO, en_TT, en_TV, en_TZ, 
en_UG, en_UM, en_US, en_US_POSIX, - en_VC, en_VG, en_VI, en_VU, en_WS, en_ZA, en_ZM, en_ZW, eo, - eo_001, es, es_419, es_AR, es_BO, es_BR, es_BZ, es_CL, es_CO, - es_CR, es_CU, es_DO, es_EA, es_EC, es_ES, es_GQ, es_GT, es_HN, - es_IC, es_MX, es_NI, es_PA, es_PE, es_PH, es_PR, es_PY, es_SV, - es_US, es_UY, es_VE, et, et_EE, eu, eu_ES, ewo, ewo_CM, + ebu, ebu_KE, ee, el, en, en_NH, en_RH, eo, es, et, eu, ewo, ewo_CM // F - fa, fa_AF, fa_IR, ff, ff_CM, ff_GN, ff_Latn, ff_Latn_BF, ff_Latn_CM, - ff_Latn_GH, ff_Latn_GM, ff_Latn_GN, ff_Latn_GW, ff_Latn_LR, ff_Latn_MR, - ff_Latn_NE, ff_Latn_NG, ff_Latn_SL, ff_Latn_SN, ff_MR, ff_SN, fi, - fi_FI, fil, fil_PH, fo, fo_DK, fo_FO, fr, fr_BE, fr_BF, fr_BI, - fr_BJ, fr_BL, fr_CA, fr_CD, fr_CF, fr_CG, fr_CH, fr_CI, fr_CM, - fr_DJ, fr_DZ, fr_FR, fr_GA, fr_GF, fr_GN, fr_GP, fr_GQ, fr_HT, - fr_KM, fr_LU, fr_MA, fr_MC, fr_MF, fr_MG, fr_ML, fr_MQ, fr_MR, - fr_MU, fr_NC, fr_NE, fr_PF, fr_PM, fr_RE, fr_RW, fr_SC, fr_SN, - fr_SY, fr_TD, fr_TG, fr_TN, fr_VU, fr_WF, fr_YT, fur, fur_IT, - fy, fy_NL, + fa, ff, ff_CM, ff_GN, ff_MR, ff_SN, fi, fil, fil_PH, fo, fr, fur, fur_IT, fy // G - ga, ga_IE, gd, gd_GB, gl, gl_ES, gsw, gsw_CH, gsw_FR, gsw_LI, - gu, gu_IN, guz, guz_KE, gv, gv_IM, + ga, gd, gl, gsw, gsw_CH, gsw_FR, gsw_LI, gu, guz, guz_KE, gv // H - ha, ha_GH, ha_NE, ha_NG, haw, haw_US, he, he_IL, hi, hi_IN, - hr, hr_BA, hr_HR, hsb, hsb_DE, hu, hu_HU, hy, hy_AM, + ha, haw, haw_US, he, hi, hr, hsb, hsb_DE, hu, hy // I - ia, ia_001, id, id_ID, ig, ig_NG, ii, ii_CN, in, in_ID, is, - is_IS, it, it_CH, it_IT, it_SM, it_VA, iw, iw_IL, + ia, id, ig, ii, in, in_ID, is, it, iw, iw_IL // J - ja, ja_JP, ja_JP_TRADITIONAL, jgo, jgo_CM, jmc, jmc_TZ, jv, jv_ID, + ja, jgo, jgo_CM, jmc, jmc_TZ, jv // K - ka, ka_GE, kab, kab_DZ, kam, kam_KE, kde, kde_TZ, kea, kea_CV, - khq, khq_ML, ki, ki_KE, kk, kk_KZ, kkj, kkj_CM, kl, kl_GL, kln, - kln_KE, km, km_KH, kn, kn_IN, ko, ko_KP, ko_KR, kok, kok_IN, - ks, ks_IN, ksb, ksb_TZ, ksf, ksf_CM, ksh, ksh_DE, ku, ku_TR, - 
kw, kw_GB, ky, ky_KG, + ka, kab, kab_DZ, kam, kam_KE, kde, kde_TZ, kea, kea_CV, khq, khq_ML, ki, kk, kkj, kkj_CM, kl + kln, kln_KE, km, kn, ko, kok, kok_IN, ks, ksb, ksb_TZ, ksf, ksf_CM, ksh, ksh_DE, ku, kw + ky // L - lag, lag_TZ, lb, lb_LU, lg, lg_UG, lkt, lkt_US, ln, ln_AO, - ln_CD, ln_CF, ln_CG, lo, lo_LA, lrc, lrc_IQ, lrc_IR, lt, lt_LT, - lu, lu_CD, luo, luo_KE, luy, luy_KE, lv, lv_LV, + lag, lag_TZ, lb, lg, lkt, lkt_US, ln, lo, lrc, lrc_IQ, lrc_IR, lt, lu, luo, luo_KE, luy + luy_KE, lv // M - mas, mas_KE, mas_TZ, mer, mer_KE, mfe, mfe_MU, mg, mg_MG, mgh, - mgh_MZ, mgo, mgo_CM, mi, mi_NZ, mk, mk_MK, ml, ml_IN, mn, - mn_MN, mo, mr, mr_IN, ms, ms_BN, ms_MY, ms_SG, mt, mt_MT, mua, - mua_CM, my, my_MM, mzn, mzn_IR, + mas, mas_KE, mas_TZ, mer, mer_KE, mfe, mfe_MU, mg, mgh, mgh_MZ, mgo, mgo_CM, mi, mk, ml, mn + mo, mr, ms, mt, mua, mua_CM, my, mzn, mzn_IR // N - naq, naq_NA, nb, nb_NO, nb_SJ, nd, nd_ZW, nds, nds_DE, nds_NL, - ne, ne_IN, ne_NP, nl, nl_AW, nl_BE, nl_BQ, nl_CW, nl_NL, nl_SR, - nl_SX, nmg, nmg_CM, nn, nn_NO, nnh, nnh_CM, no, no_NO, no_NO_NY, - nus, nus_SS, nyn, nyn_UG, + naq, naq_NA, nb, nd, nds, nds_DE, nds_NL, ne, nl, nmg, nmg_CM, nn, nnh, nnh_CM, no, no_NO + no_NO_NY, nus, nus_SS, nyn, nyn_UG // O - om, om_ET, om_KE, or, or_IN, os, os_GE, os_RU, + om, or, os // P - pa, pa_Arab, pa_Arab_PK, pa_Guru, pa_Guru_IN, pa_IN, pa_PK, pl, - pl_PL, ps, ps_AF, ps_PK, pt, pt_AO, pt_BR, pt_CH, pt_CV, pt_GQ, - pt_GW, pt_LU, pt_MO, pt_MZ, pt_PT, pt_ST, pt_TL, + pa, pa_Arab, pa_IN, pa_PK, pl, ps, pt // Q - qu, qu_BO, qu_EC, qu_PE, + qu // R - rm, rm_CH, rn, rn_BI, ro, ro_MD, ro_RO, rof, rof_TZ, ru, - ru_BY, ru_KG, ru_KZ, ru_MD, ru_RU, ru_UA, rw, rw_RW, rwk, rwk_TZ, + rm, rn, ro, rof, rof_TZ, ru, rw, rwk, rwk_TZ // S - sah, sah_RU, saq, saq_KE, sbp, sbp_TZ, sd, sd_PK, se, se_FI, - se_NO, se_SE, seh, seh_MZ, ses, ses_ML, sg, sg_CF, sh, sh_BA, - sh_CS, sh_YU, shi, shi_Latn, shi_Latn_MA, shi_Tfng, shi_Tfng_MA, - shi_MA, si, si_LK, sk, sk_SK, sl, sl_SI, smn, smn_FI, sn, 
sn_ZW, - so, so_DJ, so_ET, so_KE, so_SO, sq, sq_AL, sq_MK, sq_XK, sr, - sr_Cyrl, sr_Cyrl_BA, sr_Cyrl_ME, sr_Cyrl_RS, sr_Cyrl_CS, sr_Cyrl_XK, - sr_Cyrl_YU, sr_Latn, sr_Latn_BA, sr_Latn_ME, sr_Latn_RS, sr_Latn_CS, - sr_Latn_XK, sr_Latn_YU, sr_BA, sr_ME, sr_RS, sr_CS, sr_XK, sr_YU, - sv, sv_AX, sv_FI, sv_SE, sw, sw_CD, sw_KE, sw_TZ, sw_UG, + sah, sah_RU, saq, saq_KE, sbp, sbp_TZ, sd, se, seh, seh_MZ, ses, ses_ML, sg, sh, sh_BA, sh_CS + sh_YU, shi, shi_Latn, shi_Latn_MA, shi_MA, shi_Tfng, shi_Tfng_MA, si, sk, sl, smn, smn_FI, sn, so, sq, sr + sr_BA, sr_CS, sr_Cyrl_CS, sr_Cyrl_YU, sr_Latn, sr_Latn_CS, sr_Latn_YU, sr_ME, sr_RS, sr_XK, sr_YU, sv, sw // T - ta, ta_IN, ta_LK, ta_MY, ta_SG, te, te_IN, teo, teo_KE, teo_UG, - tg, tg_TJ, th, th_TH, th_TH_TRADITIONAL, ti, ti_ER, ti_ET, tk, - tk_TM, tl, tl_PH, to, to_TO, tr, tr_CY, tr_TR, tt, tt_RU, - twq, twq_NE, tzm, tzm_MA, + ta, te, teo, teo_KE, teo_UG, tg, th, ti, tk, tl, tl_PH, to, tr, tt, twq, twq_NE + tzm, tzm_MA // U - ug, ug_CN, uk, uk_UA, ur, ur_IN, ur_PK, uz, uz_AF, uz_Arab, - uz_Arab_AF, uz_Cyrl, uz_Cyrl_UZ, uz_Latn, uz_Latn_UZ, uz_UZ, + ug, uk, ur, uz, uz_AF, uz_Arab, uz_Cyrl, uz_UZ // V - vai, vai_Latn, vai_Latn_LR, vai_LR, vai_Vaii, vai_Vaii_LR, vi, - vi_VN, vun, vun_TZ, + vai, vai_LR, vai_Latn, vai_Latn_LR, vai_Vaii, vai_Vaii_LR, vi, vun, vun_TZ // W - wae, wae_CH, wo, wo_SN, + wae, wae_CH, wo // X - xh, xh_ZA, xog, xog_UG, + xh, xog, xog_UG // Y - yav, yav_CM, yi, yi_001, yo, yo_BJ, yo_NG, yue, yue_CN, yue_HK, - yue_Hans, yue_Hans_CN, yue_Hant, yue_Hant_HK, + yav, yav_CM, yi, yo, yue, yue_CN, yue_HK, yue_Hans, yue_Hans_CN, yue_Hant, yue_Hant_HK // Z - zgh, zgh_MA, zh, zh_Hans, zh_Hans_CN, zh_Hans_HK, zh_Hans_MO, - zh_Hans_SG, zh_Hant, zh_Hant_HK, zh_Hant_MO, zh_Hant_TW, zh_CN, - zh_HK, zh_MO, zh_SG, zh_TW, zu, zu_ZA + zgh, zgh_MA, zh, zh_CN, zh_HK, zh_Hant, zh_MO, zh_SG, zh_TW, zu + - + + root, // A-B @@ -282,9 +215,9 @@ // U-Z ug, uk, ur, uz, vi, wae, wo, xh, yi, yo, yue_CN, yue_Hans, yue, zh_CN, zh_Hant, 
zh_HK, zh_MO, zh_SG, zh_TW, zh, zu - + - + root, // A-E @@ -300,12 +233,56 @@ // Q-Z qu, ro, ru, se, sh, sk, sl, sq, sr_Latn, sr, sv, sw, ta, th, tr, uk, vi, yue_Hans, yue, zh_Hant_HK, zh_Hant, zh_HK, zh_MO, zh_TW, zh - + - + root, de, el, en, en_US_POSIX, en_US, es, fr, it, ja, pt, ru, zh_Hant, zh - + + + + + + + + + + + + + + + + + + + + + + + + + - \ No newline at end of file + diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuConverterConfig.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuConverterConfig.java index 3ffa2d601cd..519b4f731e5 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuConverterConfig.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuConverterConfig.java @@ -9,13 +9,13 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; -import java.util.Map; import java.util.Optional; import java.util.Set; import org.unicode.cldr.api.CldrDraftStatus; import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType; +import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.ImmutableSetMultimap; import com.google.common.collect.ImmutableTable; @@ -29,11 +29,6 @@ import com.google.common.collect.TreeMultimap; * that was configured by text files such as "icu-locale-deprecates.xml" and "icu-config. 
*/ public final class IcuConverterConfig implements LdmlConverterConfig { - - private static final Optional DEFAULT_CLDR_DIR = - Optional.ofNullable(System.getProperty("CLDR_DIR", null)) - .map(d -> Paths.get(d).toAbsolutePath()); - private static final Optional DEFAULT_ICU_DIR = Optional.ofNullable(System.getProperty("ICU_DIR", null)) .map(d -> Paths.get(d).toAbsolutePath()); @@ -41,26 +36,16 @@ public final class IcuConverterConfig implements LdmlConverterConfig { /** The builder with which to specify configuration for the {@link LdmlConverter}. */ @SuppressWarnings("UnusedReturnValue") public static final class Builder { - private Path cldrDir = DEFAULT_CLDR_DIR.orElse(null); private Path outputDir = DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data")).orElse(null); private Path specialsDir = DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data/xml")).orElse(null); private ImmutableSet outputTypes = OutputType.ALL; - private CldrDraftStatus minimalDraftStatus = CldrDraftStatus.CONTRIBUTED; + private CldrDraftStatus minimumDraftStatus = CldrDraftStatus.CONTRIBUTED; private boolean emitReport = false; private final SetMultimap localeIdsMap = TreeMultimap.create(); private final Table forcedAliases = TreeBasedTable.create(); - /** - * Sets the CLDR base directory from which to load all CLDR data. This is optional if the - * {@code CLDR_DIR} environment variable is set, which will be used instead. - */ - public Builder setCldrDir(Path cldrDir) { - this.cldrDir = checkNotNull(cldrDir.toAbsolutePath()); - return this; - } - /** * Sets the output directory in which the ICU data directories and files will go. This is * optional if the {@code ICU_DIR} system property is set, which will be used to generate @@ -91,14 +76,8 @@ public final class IcuConverterConfig implements LdmlConverterConfig { return this; } - /** - * Sets the minimum draft status for CLDR data to be converted (paths below this status are - * ignored during conversion). 
This is optional and defaults to {@link - * CldrDraftStatus#CONTRIBUTED}. - */ - public Builder setMinimalDraftStatus(CldrDraftStatus minimalDraftStatus) { - this.minimalDraftStatus = checkNotNull(minimalDraftStatus); - return this; + public void setMinimumDraftStatus(CldrDraftStatus minimumDraftStatus) { + this.minimumDraftStatus = checkNotNull(minimumDraftStatus); } public Builder setEmitReport(boolean emitReport) { @@ -122,26 +101,16 @@ public final class IcuConverterConfig implements LdmlConverterConfig { } } - private final Path cldrDir; private final Path outputDir; private final Path specialsDir; private final ImmutableSet outputTypes; - private final CldrDraftStatus minimalDraftStatus; + private final CldrDraftStatus minimumDraftStatus; private final boolean emitReport; + private final ImmutableSet allLocaleIds; private final ImmutableSetMultimap localeIdsMap; private final ImmutableTable forcedAliases; private IcuConverterConfig(Builder builder) { - this.cldrDir = checkNotNull(builder.cldrDir, - "must set a CLDR directory, or the CLDR_DIR system property"); - if (DEFAULT_CLDR_DIR.isPresent() && !this.cldrDir.equals(DEFAULT_CLDR_DIR.get())) { - System.err.format( - "Warning: Specified CLDR base directory does not appear to match the" - + " directory inferred by the 'CLDR_DIR' system property.\n" - + "Specified: %s\n" - + "Inferred: %s\n", - this.cldrDir, DEFAULT_CLDR_DIR.get()); - } this.outputDir = checkNotNull(builder.outputDir); checkArgument(!Files.isRegularFile(outputDir), "specified output directory if not a directory: %s", outputDir); @@ -153,8 +122,10 @@ public final class IcuConverterConfig implements LdmlConverterConfig { checkArgument(!this.outputTypes.isEmpty(), "must specify at least one output type to be generated (possible values are: %s)", Arrays.asList(OutputType.values())); - this.minimalDraftStatus = builder.minimalDraftStatus; + this.minimumDraftStatus = checkNotNull(builder.minimumDraftStatus); this.emitReport = builder.emitReport; + // 
getAllLocaleIds() returns the union of all the specified IDs in the map. + this.allLocaleIds = ImmutableSet.copyOf(builder.localeIdsMap.values()); this.localeIdsMap = ImmutableSetMultimap.copyOf(builder.localeIdsMap); this.forcedAliases = ImmutableTable.copyOf(builder.forcedAliases); } @@ -163,11 +134,6 @@ public final class IcuConverterConfig implements LdmlConverterConfig { return new Builder(); } - @Override - public Path getCldrDirectory() { - return cldrDir; - } - @Override public Path getOutputDir() { return outputDir; @@ -179,13 +145,13 @@ public final class IcuConverterConfig implements LdmlConverterConfig { } @Override - public CldrDraftStatus getMinimumDraftStatus() { - return minimalDraftStatus; + public Path getSpecialsDir() { + return specialsDir; } @Override - public Path getSpecialsDir() { - return specialsDir; + public CldrDraftStatus getMinimumDraftStatus() { + return minimumDraftStatus; } @Override @@ -194,10 +160,14 @@ public final class IcuConverterConfig implements LdmlConverterConfig { } @Override - public Map getForcedAliases(IcuLocaleDir dir) { + public ImmutableMap getForcedAliases(IcuLocaleDir dir) { return forcedAliases.row(dir); } + @Override public ImmutableSet getAllLocaleIds() { + return allLocaleIds; + } + @Override public ImmutableSet getTargetLocaleIds(IcuLocaleDir dir) { return localeIdsMap.get(dir); } diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuTextWriter.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuTextWriter.java index 341731318b4..18763299282 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuTextWriter.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuTextWriter.java @@ -148,7 +148,8 @@ final class IcuTextWriter { // TODO: Sort this out so there isn't a messy mix of comment styles in the data files. 
private static void writeHeaderAndComments( PrintWriter out, List header, List comments) { - header.forEach(out::println); + + header.forEach(s -> out.println("// " + s)); if (!comments.isEmpty()) { // TODO: Don't use /* */ block quotes, just use inline // quotes. out.println( diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java index 4e37788c1c8..e8206de9e09 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java @@ -27,7 +27,6 @@ import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.LinkedHashMap; -import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Optional; @@ -61,6 +60,7 @@ import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.LinkedListMultimap; import com.google.common.collect.ListMultimap; +import com.google.common.collect.Maps; import com.google.common.collect.SetMultimap; import com.google.common.collect.Sets; import com.google.common.io.CharStreams; @@ -112,21 +112,6 @@ public final class LdmlConverter { private static final PathMatcher WINDOWS_ZONES_PATHS = supplementalMatcher("windowsZones"); - // Special IDs which are not supported via CLDR, but for which synthetic data is injected. - // The "TRADITIONAL" variants are here because their calendar differs from the non-variant - // locale. However CLDR cannot represent this currently because calendar defaults are in - // supplemental data (rather than locale data) and are keyed only on territory. 
- private static final ImmutableSet PHANTOM_LOCALE_IDS = - ImmutableSet.of("ja_JP_TRADITIONAL", "th_TH_TRADITIONAL"); - - // Special alias mapping which exists in ICU even though "no_NO_NY" is simply not a - // structurally valid locale ID. This is injected manually when creating the alias map. - // This does mean that nobody can ever parse the _keys_ of the alias map, but so far there - // has been no need for that. - // TODO: Get "ars" into CLDR and remove this hack. - private static final Map PHANTOM_ALIASES = - ImmutableMap.of("ars", "ar_SA", "no_NO_NY", "nn_NO"); - private static PathMatcher supplementalMatcher(String... spec) { checkArgument(spec.length > 0, "must supply at least one matcher spec"); if (spec.length == 1) { @@ -223,42 +208,35 @@ public final class LdmlConverter { } /** Converts CLDR data according to the given configuration. */ - public static void convert(LdmlConverterConfig config) { - CldrDataSupplier src = CldrDataSupplier - .forCldrFilesIn(config.getCldrDirectory()) - .withDraftStatusAtLeast(config.getMinimumDraftStatus()); - new LdmlConverter(config, src).convertAll(config); + public static void convert( + CldrDataSupplier src, SupplementalData supplementalData, LdmlConverterConfig config) { + new LdmlConverter(src, supplementalData, config).convertAll(); } - // The configuration controlling conversion behaviour. - private final LdmlConverterConfig config; // The supplier for all data to be converted. private final CldrDataSupplier src; - // The set of available locale IDs. - // TODO: Make available IDs include specials files (or fail if specials are not available). - private final ImmutableSet availableIds; // Supplemental data available to mappers if needed. private final SupplementalData supplementalData; + // The configuration controlling conversion behaviour. + private final LdmlConverterConfig config; + // The set of expanded target locale IDs. 
+ // TODO: Make available IDs include specials files (or fail if specials are not available). + private final ImmutableSet availableIds; // Transformer for locale data. private final PathValueTransformer localeTransformer; // Transformer for supplemental data. private final PathValueTransformer supplementalTransformer; - // Header string to go into every ICU data file. - private final ImmutableList icuFileHeader; + // Header string to go into every ICU data and transliteration rule file (comment prefixes + // are not present and must be added by the code writing the file). + private final ImmutableList fileHeader; - private LdmlConverter(LdmlConverterConfig config, CldrDataSupplier src) { - this.config = checkNotNull(config); + private LdmlConverter( + CldrDataSupplier src, SupplementalData supplementalData, LdmlConverterConfig config) { this.src = checkNotNull(src); - this.supplementalData = SupplementalData.create(src.getDataForType(SUPPLEMENTAL)); - // Sort the set of available locale IDs but add "root" at the front. This is the - // set of non-alias locale IDs to be processed. - Set localeIds = new LinkedHashSet<>(); - localeIds.add("root"); - localeIds.addAll( - Sets.intersection(src.getAvailableLocaleIds(), config.getTargetLocaleIds(LOCALES))); - localeIds.addAll(PHANTOM_LOCALE_IDS); - this.availableIds = ImmutableSet.copyOf(localeIds); - + this.supplementalData = checkNotNull(supplementalData); + this.config = checkNotNull(config); + this.availableIds = ImmutableSet.copyOf( + Sets.intersection(supplementalData.getAvailableLocaleIds(), config.getAllLocaleIds())); // Load the remaining path value transformers. 
this.supplementalTransformer = RegexTransformer.fromConfigLines(readLinesFromResource("/ldml2icu_supplemental.txt"), @@ -270,10 +248,10 @@ public final class LdmlConverter { this.localeTransformer = RegexTransformer.fromConfigLines(readLinesFromResource("/ldml2icu_locale.txt"), IcuFunctions.CONTEXT_TRANSFORM_INDEX_FN); - this.icuFileHeader = ImmutableList.copyOf(readLinesFromResource("/ldml2icu_header.txt")); + this.fileHeader = ImmutableList.copyOf(readLinesFromResource("/ldml2icu_header.txt")); } - private void convertAll(LdmlConverterConfig config) { + private void convertAll() { ListMultimap groupByType = LinkedListMultimap.create(); for (OutputType t : config.getOutputTypes()) { groupByType.put(t.getCldrType(), t); @@ -361,7 +339,7 @@ public final class LdmlConverter { SetMultimap writtenLocaleIds = HashMultimap.create(); Path baseDir = config.getOutputDir(); - for (String id : config.getTargetLocaleIds(LOCALES)) { + for (String id : config.getAllLocaleIds()) { // Skip "target" IDs that are aliases (they are handled later). if (!availableIds.contains(id)) { continue; @@ -429,13 +407,17 @@ public final class LdmlConverter { // and must be manually mapped (e.g. legacy locale IDs which don't even parse). // 4: It is a "super special" forced alias, which might replace existing aliases in // some output directories. + + // Even forced aliases only apply if they are in the set of locale IDs for the directory. + Map forcedAliases = + Maps.filterKeys(config.getForcedAliases(dir), localeIds::contains); + Map aliasMap = new LinkedHashMap<>(); for (String id : localeIds) { - if (PHANTOM_ALIASES.keySet().contains(id)) { - checkArgument(!availableIds.contains(id), - "phantom aliases should never be otherwise supported: %s\n" - + "(maybe the phantom alias can now be removed?)", id); - aliasMap.put(id, PHANTOM_ALIASES.get(id)); + if (forcedAliases.keySet().contains(id)) { + // Forced aliases will be added later and don't need to be processed here. 
This + // is especially necessary if the ID is not structurally valid (e.g. "no_NO_NY") + // since that cannot be processed by the code below. continue; } String canonicalId = supplementalData.replaceDeprecatedTags(id); @@ -459,7 +441,7 @@ public final class LdmlConverter { // Important that we overwrite entries which might already exist here, since we might have // already calculated a "natural" alias for something that we want to force (and we should // replace the existing target, since that affects how we determine empty files later). - aliasMap.putAll(config.getForcedAliases(dir)); + aliasMap.putAll(forcedAliases); return aliasMap; } @@ -490,7 +472,7 @@ public final class LdmlConverter { private void processTransforms() { Path transformDir = createDirectory(config.getOutputDir().resolve("translit")); - write(TransformsMapper.process(src, transformDir), transformDir); + write(TransformsMapper.process(src, transformDir, fileHeader), transformDir); } private static final RbPath RB_CLDR_VERSION = RbPath.of("cldrVersion"); @@ -533,7 +515,7 @@ public final class LdmlConverter { private void write(IcuData icuData, Path dir) { createDirectory(dir); - IcuTextWriter.writeToFile(icuData, dir, icuFileHeader); + IcuTextWriter.writeToFile(icuData, dir, fileHeader); } private Path createDirectory(Path dir) { diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverterConfig.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverterConfig.java index a62d518fa94..c42b11ae70a 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverterConfig.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverterConfig.java @@ -63,9 +63,6 @@ public interface LdmlConverterConfig { */ Set getOutputTypes(); - /** Returns the root directory in which the CLDR release is located. 
*/ - Path getCldrDirectory(); - /** * Returns an additional "specials" directory containing additional ICU specific XML * files depending on the given output type. This is where the converter finds any XML @@ -83,12 +80,19 @@ public interface LdmlConverterConfig { CldrDraftStatus getMinimumDraftStatus(); /** - * Returns the set of locale IDs to be processed for the given directory. + * Returns the complete set of locale IDs which should be considered for processing for this + * configuration. * - *

This set can contain IDs which have noICU data associated with them if they are - * suitable aliases (e.g. they are deprecated versions of locale IDs for which data does + *

Note that this set can contain IDs which have no CLDR data associated with them if they + * are suitable aliases (e.g. they are deprecated versions of locale IDs for which data does * exist). */ + Set getAllLocaleIds(); + + /** + * Returns the set of locale IDs to be processed for the given directory. This set must always + * be a subset of {@link #getAllLocaleIds()}. + */ Set getTargetLocaleIds(IcuLocaleDir dir); /** diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/SupplementalData.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/SupplementalData.java index b05d67359c2..1f0756802ff 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/SupplementalData.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/SupplementalData.java @@ -17,20 +17,24 @@ import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Optional; +import java.util.Set; import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Stream; import org.unicode.cldr.api.AttributeKey; -import org.unicode.cldr.api.CldrData; +import org.unicode.cldr.api.CldrDataSupplier; +import org.unicode.cldr.api.CldrDataType; import com.google.common.base.Ascii; import com.google.common.base.Splitter; import com.google.common.base.Strings; import com.google.common.collect.HashBasedTable; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; import com.google.common.collect.ImmutableTable; +import com.google.common.collect.Sets; import com.google.common.collect.Table; /** @@ -43,6 +47,13 @@ import com.google.common.collect.Table; */ // TODO: This should be moved into the API and leverage some of the existing utility functions. public final class SupplementalData { + // Special IDs which are not supported via CLDR, but for which synthetic data is injected. 
+ // The "TRADITIONAL" variants are here because their calendar differs from the non-variant + // locale. However CLDR cannot represent this currently because calendar defaults are in + // supplemental data (rather than locale data) and are keyed only on territory. + private static final ImmutableSet PHANTOM_LOCALE_IDS = + ImmutableSet.of("ja_JP_TRADITIONAL", "th_TH_TRADITIONAL"); + private static final Pattern SCRIPT_SUBTAG = Pattern.compile("[A-Z][a-z]{3}"); private static final PathMatcher ALIAS = @@ -88,18 +99,18 @@ public final class SupplementalData { } /** - * Creates a supplemental data API instance from the given CLDR data. + * Creates a supplemental data API instance from the given CLDR data supplier. * - * @param supplementalData the raw CLDR supplemental data instance. + * @param src the CLDR data supplier. * @return the supplemental data API. */ - public static SupplementalData create(CldrData supplementalData) { + public static SupplementalData create(CldrDataSupplier src) { Table aliasTable = HashBasedTable.create(); Map parentLocaleMap = new HashMap<>(); Map defaultCalendarMap = new HashMap<>(); Map likelySubtagMap = new HashMap<>(); - supplementalData.accept( + src.getDataForType(CldrDataType.SUPPLEMENTAL).accept( ARBITRARY, v -> { if (ALIAS.matches(v.getPath())) { @@ -122,17 +133,9 @@ public final class SupplementalData { } }); - // WARNING: The original mapper code determines the full set of deprecated territories and - // then removes the following hard-coded list without any explanation as to why. While this - // is presumably to "undeprecate" them for the purposes of the locale processing, there's - // no explanation of where this list comes from, and thus no way to maintain it. - // - // asList("062", "172", "200", "830", "AN", "CS", "QU") - // .forEach(t -> aliasTable.remove(Alias.TERRITORY, t)); - // TODO: Understand and document what on Earth this is all about or delete this comment. 
- + Set availableIds = Sets.union(src.getAvailableLocaleIds(), PHANTOM_LOCALE_IDS); return new SupplementalData( - aliasTable, parentLocaleMap, defaultCalendarMap, likelySubtagMap); + availableIds, aliasTable, parentLocaleMap, defaultCalendarMap, likelySubtagMap); } // A simple-as-possible, mutable, locale ID data "struct" to handle the IDs used during ICU @@ -256,22 +259,30 @@ public final class SupplementalData { } } + private final ImmutableSet availableIds; private final ImmutableTable aliasTable; private final ImmutableMap parentLocaleMap; private final ImmutableMap defaultCalendarMap; private final ImmutableMap likelySubtagMap; private SupplementalData( + Set availableIds, Table aliasTable, Map parentLocaleMap, Map defaultCalendarMap, Map likelySubtagMap) { + + this.availableIds = ImmutableSet.copyOf(availableIds); this.aliasTable = ImmutableTable.copyOf(aliasTable); this.parentLocaleMap = ImmutableMap.copyOf(parentLocaleMap); this.defaultCalendarMap = ImmutableMap.copyOf(defaultCalendarMap); this.likelySubtagMap = ImmutableMap.copyOf(likelySubtagMap); } + public ImmutableSet getAvailableLocaleIds() { + return availableIds; + } + /** * Returns the "maximized" form of a given locale ID, by adding likely subtags where possible. 
*/ diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/ConvertIcuDataTask.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/ConvertIcuDataTask.java index 01861d700f2..876393c03c9 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/ConvertIcuDataTask.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/ConvertIcuDataTask.java @@ -12,19 +12,26 @@ import static java.util.stream.Collectors.joining; import java.nio.file.Path; import java.util.Arrays; +import java.util.Optional; import org.apache.tools.ant.BuildException; import org.apache.tools.ant.Task; +import org.unicode.cldr.api.CldrDataSupplier; import org.unicode.cldr.api.CldrDraftStatus; import org.unicode.icu.tool.cldrtoicu.IcuConverterConfig; import org.unicode.icu.tool.cldrtoicu.LdmlConverter; +import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType; import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir; +import org.unicode.icu.tool.cldrtoicu.SupplementalData; import com.google.common.base.Ascii; import com.google.common.base.CaseFormat; import com.google.common.base.CharMatcher; import com.google.common.base.Splitter; +import com.google.common.collect.HashMultimap; import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.SetMultimap; // Note: Auto-magical Ant methods are listed as "unused" by IDEs, unless the warning is suppressed. public final class ConvertIcuDataTask extends Task { @@ -36,6 +43,12 @@ public final class ConvertIcuDataTask extends Task { private static final CharMatcher LOWER_UNDERSCORE = inRange('a', 'z').or(DIGIT_OR_UNDERSCORE); private static final CharMatcher VALID_ENUM_CHAR = LOWER_UNDERSCORE.or(UPPER_UNDERSCORE); + private Path cldrPath; + private CldrDraftStatus minimumDraftStatus; + // Set of default locale ID specifiers (wildcard IDs which are expanded). 
+ private ImmutableSet localeIdSpec; + // Per directory overrides (fully specified locale IDs). + private final SetMultimap perDirectoryIds = HashMultimap.create(); private final IcuConverterConfig.Builder config = IcuConverterConfig.builder(); @SuppressWarnings("unused") @@ -45,21 +58,24 @@ public final class ConvertIcuDataTask extends Task { @SuppressWarnings("unused") public void setCldrDir(Path path) { - config.setCldrDir(path); + this.cldrPath = checkNotNull(path); } @SuppressWarnings("unused") public void setMinimalDraftStatus(String status) { - config.setMinimalDraftStatus(resolve(CldrDraftStatus.class, status)); + minimumDraftStatus = resolve(CldrDraftStatus.class, status); } @SuppressWarnings("unused") public void setOutputTypes(String types) { - config.setOutputTypes( + ImmutableList typeList = LIST_SPLITTER .splitToList(types).stream() - .map(s -> resolve(LdmlConverter.OutputType.class, s)) - .collect(toImmutableList())); + .map(s -> resolve(OutputType.class, s)) + .collect(toImmutableList()); + if (!typeList.isEmpty()) { + config.setOutputTypes(typeList); + } } @SuppressWarnings("unused") @@ -73,59 +89,118 @@ public final class ConvertIcuDataTask extends Task { } public static final class LocaleIds extends Task { - private ImmutableList dirs = ImmutableList.of(); - private ImmutableList ids = ImmutableList.of(); - - @SuppressWarnings("unused") - public void setDirs(String directories) { - this.dirs = LIST_SPLITTER.splitToList(directories).stream() - .map(s -> resolve(IcuLocaleDir.class, s)) - .collect(toImmutableList()); - } + private ImmutableSet ids; @SuppressWarnings("unused") public void addText(String localeIds) { - // Need to filter out '//' style end-of-line comments first (replace with \n to avoid - // inadvertantly joining two elements. 
- localeIds = localeIds.replaceAll("//[^\n]*\n", "\n"); - this.ids = ImmutableList.copyOf(LIST_SPLITTER.splitToList(localeIds)); + this.ids = parseLocaleIds(localeIds); + } + + @Override + public void init() throws BuildException { + checkBuild(!ids.isEmpty(), "Locale IDs must be specified"); } } - public static final class ForcedAlias extends Task { + public static final class DirectoryFilter extends Task { private IcuLocaleDir dir; - private String source; - private String target; + private ImmutableSet ids; @SuppressWarnings("unused") public void setDir(String directory) { this.dir = resolve(IcuLocaleDir.class, directory); } + @SuppressWarnings("unused") + public void addText(String localeIds) { + this.ids = parseLocaleIds(localeIds); + } + + @Override + public void init() throws BuildException { + checkBuild(dir != null, "Directory must be specified"); + checkBuild(!ids.isEmpty(), "Locale IDs must be specified"); + } + } + + public static final class ForcedAlias extends Task { + private Optional dir = Optional.empty(); + private String source = ""; + private String target = ""; + + @SuppressWarnings("unused") + public void setDir(String directory) { + this.dir = resolveOpt(IcuLocaleDir.class, directory); + } + @SuppressWarnings("unused") public void setSource(String source) { - this.source = checkNotNull(source); + this.source = whitespace().trimFrom(source); } @SuppressWarnings("unused") public void setTarget(String target) { - this.target = checkNotNull(target); + this.target = whitespace().trimFrom(target); + } + + @Override + public void init() throws BuildException { + checkBuild(!source.isEmpty(), "Alias source must not be empty"); + checkBuild(!target.isEmpty(), "Alias target must not be empty"); } } @SuppressWarnings("unused") public void addConfiguredLocaleIds(LocaleIds localeIds) { - localeIds.dirs.forEach(d -> config.addLocaleIds(d, localeIds.ids)); + checkBuild(this.localeIdSpec == null, "Cannot add more that one element"); + this.localeIdSpec = 
localeIds.ids; + } + + @SuppressWarnings("unused") + public void addConfiguredDirectoryFilter(DirectoryFilter filter) { + perDirectoryIds.putAll(filter.dir, filter.ids); } @SuppressWarnings("unused") public void addConfiguredForcedAlias(ForcedAlias alias) { - config.addForcedAlias(alias.dir, alias.source, alias.target); + if (alias.dir.isPresent()) { + config.addForcedAlias(alias.dir.get(), alias.source, alias.target); + } else { + for (IcuLocaleDir dir : IcuLocaleDir.values()) { + config.addForcedAlias(dir, alias.source, alias.target); + } + } } @SuppressWarnings("unused") public void execute() throws BuildException { - LdmlConverter.convert(config.build()); + CldrDataSupplier src = + CldrDataSupplier.forCldrFilesIn(cldrPath).withDraftStatusAtLeast(minimumDraftStatus); + SupplementalData supplementalData = SupplementalData.create(src); + ImmutableSet defaultTargetIds = + LocaleIdResolver.expandTargetIds(this.localeIdSpec, supplementalData); + for (IcuLocaleDir dir : IcuLocaleDir.values()) { + config.addLocaleIds(dir, perDirectoryIds.asMap().getOrDefault(dir, defaultTargetIds)); + } + config.setMinimumDraftStatus(minimumDraftStatus); + LdmlConverter.convert(src, supplementalData, config.build()); + } + + private static void checkBuild(boolean condition, String message) { + if (!condition) { + throw new BuildException(message); + } + } + + private static ImmutableSet parseLocaleIds(String localeIds) { + // Need to filter out '//' style end-of-line comments first (replace with \n to avoid + // inadvertently joining two elements). + localeIds = localeIds.replaceAll("//[^\n]*\n", "\n"); + return ImmutableSet.copyOf(LIST_SPLITTER.splitToList(localeIds)); + } + + private static > Optional resolveOpt(Class enumClass, String name) { + return !name.isEmpty() ? 
Optional.of(resolve(enumClass, name)) : Optional.empty(); } private static > T resolve(Class enumClass, String name) { diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/LocaleIdResolver.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/LocaleIdResolver.java new file mode 100644 index 00000000000..601fa568599 --- /dev/null +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/LocaleIdResolver.java @@ -0,0 +1,123 @@ +// © 2019 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +// © 2019 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +package org.unicode.icu.tool.cldrtoicu.ant; + +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.collect.ImmutableSet.toImmutableSet; + +import java.util.Set; +import java.util.TreeSet; +import java.util.function.Predicate; +import java.util.regex.Pattern; + +import org.unicode.icu.tool.cldrtoicu.SupplementalData; + +import com.google.common.base.Ascii; +import com.google.common.collect.ImmutableListMultimap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Iterables; +import com.google.common.collect.Multimaps; +import com.google.common.collect.Sets; + +/** Helper class to resolve ID configuration. */ +final class LocaleIdResolver { + /** Returns the expanded set of target locale IDs based on the given ID specifications. 
*/ + public static ImmutableSet expandTargetIds( + Set idSpecs, SupplementalData supplementalData) { + return new LocaleIdResolver(supplementalData).resolve(idSpecs); + } + + private final SupplementalData supplementalData; + + private LocaleIdResolver(SupplementalData supplementalData) { + this.supplementalData = checkNotNull(supplementalData); + } + + // ---- Code below here is to expand the incoming set of locale IDs ---- + + private static final Pattern WILDCARD_LOCALE = Pattern.compile("[a-z]{2}(?:_[A-Z][a-z]{3})?"); + + private ImmutableSet resolve(Set idSpecs) { + ImmutableSet allAvailableIds = supplementalData.getAvailableLocaleIds(); + // Get the minimized wildcard set, converting things like "en_Latn" --> "en". + ImmutableSet wildcardIds = idSpecs.stream() + .filter(supplementalData.getAvailableLocaleIds()::contains) + .filter(id -> WILDCARD_LOCALE.matcher(id).matches()) + .map(this::removeDefaultScript) + .collect(toImmutableSet()); + + // Get the set of IDs which are implied by the wildcard IDs. + Set targetIds = new TreeSet<>(); + allAvailableIds.forEach(id -> addWildcardMatches(id, wildcardIds::contains, targetIds)); + + // Get the IDs which don't need to be in the config (because they are implied). + Set redundant = Sets.intersection(idSpecs, targetIds); + if (!redundant.isEmpty()) { + System.err.println("Configuration lists redundant locale IDs"); + System.err.println("The following IDs should be removed from the configuration:"); + Iterables.partition(redundant, 16) + .forEach(ids -> System.err.println(String.join(", ", ids))); + + // Note that the minimal configuration includes aliases. 
+ Set minimalConfigIds = new TreeSet<>(Sets.difference(idSpecs, targetIds)); + minimalConfigIds.remove("root"); + ImmutableListMultimap idsByFirstChar = + Multimaps.index(minimalConfigIds, s -> s.charAt(0)); + + System.err.println("Canonical ID list is:"); + for (char c: idsByFirstChar.keySet()) { + System.err.println(" // " + Ascii.toUpperCase(c)); + Iterables.partition(idsByFirstChar.get(c), 16) + .forEach(ids -> System.err.println(" " + String.join(", ", ids))); + System.err.println(); + } + System.err.flush(); + throw new IllegalStateException("Non-canonical configuration"); + } + + // We return the set of IDs made up of: + // 1: The original IDs specified by the configuration (and any parent IDs). + // 2: IDs expanded from wildcard IDs (e.g. "en_Latn_GB" & "en_Latn" from "en"). + // (this is what's already in targetIds). + // 3: The "root" ID. + idSpecs.forEach(id -> addRecursively(id, targetIds)); + return ImmutableSet.builder().add("root").addAll(targetIds).build(); + } + + // E.g. "xx_Fooo" --> "xx" --> "xx_Baar_YY" ==> "xx_Fooo" + // E.g. 
"xx_Fooo" --> "xx" --> "xx_Fooo_YY" ==> "xx" + private String removeDefaultScript(String id) { + if (id.contains("_")) { + String lang = id.substring(0, 2); + String maxId = supplementalData.maximize(lang) + .orElseThrow( + () -> new IllegalStateException("cannot maximize language subtag: " + lang)); + if (maxId.startsWith(id)) { + return lang; + } + } + return id; + } + + private void addRecursively(String id, Set dst) { + while (!id.equals("root") && dst.add(id)) { + id = supplementalData.getParent(id); + } + } + + private boolean addWildcardMatches( + String id, Predicate isWildcard, Set dst) { + if (id.equals("root")) { + return false; + } + String parentId = supplementalData.getParent(id); + if (isWildcard.test(parentId) || addWildcardMatches(parentId, isWildcard, dst)) { + dst.add(id); + return true; + } + return false; + } +} diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapper.java index 14c4d340bce..e0844e653dd 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapper.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapper.java @@ -15,6 +15,7 @@ import java.io.PrintWriter; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.List; import java.util.Optional; import java.util.function.Function; @@ -30,6 +31,7 @@ import org.unicode.icu.tool.cldrtoicu.RbPath; import org.unicode.icu.tool.cldrtoicu.RbValue; import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; import com.ibm.icu.text.Transliterator; /** @@ -78,7 +80,9 @@ public final class TransformsMapper { * @param ruleFileOutputDir the directory into which transliteration rule files will be written. * @return the IcuData instance to be written to a file. 
*/ - public static IcuData process(CldrDataSupplier src, Path ruleFileOutputDir) { + public static IcuData process( + CldrDataSupplier src, Path ruleFileOutputDir, List header) { + Function fileWriterFn = p -> { Path file = ruleFileOutputDir.resolve(p); try { @@ -88,12 +92,14 @@ public final class TransformsMapper { } }; CldrData cldrData = src.getDataForType(SUPPLEMENTAL); - return process(cldrData, fileWriterFn); + return process(cldrData, fileWriterFn, header); } @VisibleForTesting // It's easier to supply a fake data instance than a fake supplier. - static IcuData process(CldrData cldrData, Function fileWriterFn) { - RuleVisitor visitor = new RuleVisitor(fileWriterFn); + static IcuData process( + CldrData cldrData, Function fileWriterFn, List header) { + + RuleVisitor visitor = new RuleVisitor(fileWriterFn, header); cldrData.accept(DTD, visitor); addSpecialCaseValues(visitor.icuData); return visitor.icuData; @@ -102,9 +108,11 @@ public final class TransformsMapper { private static class RuleVisitor implements ValueVisitor { private final IcuData icuData = new IcuData("root", false); private final Function outFn; + private final ImmutableList header; - RuleVisitor(Function outFn) { + RuleVisitor(Function outFn, List header) { this.outFn = checkNotNull(outFn); + this.header = ImmutableList.copyOf(header); icuData.setFileComment("File: root.txt"); } @@ -124,8 +132,8 @@ public final class TransformsMapper { private void writeDataFile(String filename, CldrValue value) { try (PrintWriter out = outFn.apply(Paths.get(filename))) { - out.println("\uFEFF# © 2016 and later: Unicode, Inc. 
and others."); - out.println("# License & terms of use: http://www.unicode.org/copyright.html#License"); + out.print("\uFEFF"); + header.forEach(s -> out.println("# " + s)); out.println("#"); out.println("# File: " + filename); out.println("# Generated from CLDR"); diff --git a/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_header.txt b/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_header.txt index a9bd38f4299..70f6b838a4f 100644 --- a/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_header.txt +++ b/tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_header.txt @@ -1,2 +1,2 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html#License \ No newline at end of file +© 2016 and later: Unicode, Inc. and others. +License & terms of use: http://www.unicode.org/copyright.html#License diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/SupplementalDataTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/SupplementalDataTest.java index 17368f411d4..5ea8b4c4214 100644 --- a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/SupplementalDataTest.java +++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/SupplementalDataTest.java @@ -5,8 +5,6 @@ package org.unicode.icu.tool.cldrtoicu; import static com.google.common.truth.Truth.assertThat; import static com.google.common.truth.Truth.assertWithMessage; import static com.google.common.truth.Truth8.assertThat; -import static java.util.Arrays.asList; -import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL; import static org.unicode.cldr.api.CldrValue.parseValue; import java.nio.file.Path; @@ -25,6 +23,7 @@ import org.unicode.cldr.tool.LikelySubtags; import org.unicode.cldr.util.LanguageTagCanonicalizer; import org.unicode.cldr.util.LocaleIDParser; import org.unicode.cldr.util.SupplementalDataInfo; +import org.unicode.icu.tool.cldrtoicu.testing.FakeDataSupplier; 
import com.google.common.base.Joiner; import com.google.common.collect.ImmutableSet; @@ -41,8 +40,7 @@ public class SupplementalDataTest { @BeforeClass public static void loadRegressionData() { Path cldrRoot = Paths.get(System.getProperty("CLDR_DIR")); - regressionData = SupplementalData - .create(CldrDataSupplier.forCldrFilesIn(cldrRoot).getDataForType(SUPPLEMENTAL)); + regressionData = SupplementalData.create(CldrDataSupplier.forCldrFilesIn(cldrRoot)); SupplementalDataInfo sdi = SupplementalDataInfo.getInstance(cldrRoot.resolve("common/supplemental").toString()); likelySubtags = new LikelySubtags(sdi); @@ -348,6 +346,6 @@ public class SupplementalDataTest { } private static SupplementalData fakeSupplementalData(CldrValue... values) { - return SupplementalData.create(CldrDataSupplier.forValues(asList(values))); + return SupplementalData.create(new FakeDataSupplier().addSupplementalData(values)); } } diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapperTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapperTest.java index 7ba5ef918de..08c3049429a 100644 --- a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapperTest.java +++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapperTest.java @@ -21,7 +21,6 @@ import java.util.Arrays; import java.util.Map; import java.util.TreeMap; import java.util.function.Function; -import java.util.stream.Stream; import org.junit.Test; import org.junit.runner.RunWith; @@ -38,10 +37,14 @@ import com.google.common.collect.ImmutableList; @RunWith(JUnit4.class) public class TransformsMapperTest { - private static final ImmutableList FILE_HEADER = ImmutableList.of( - "\uFEFF# © 2016 and later: Unicode, Inc. 
and others.", - "# License & terms of use: http://www.unicode.org/copyright.html#License", - "#"); + private static final ImmutableList HEADER_LINES = ImmutableList.of( + "First header line", + "Second header line"); + + private static final String FILE_HEADER = + "\uFEFF# First header line\n" + + "# Second header line\n" + + "#\n"; private static final int DEFAULT_PATH_COUNT = 7; @@ -64,7 +67,7 @@ public class TransformsMapperTest { @Test public void testDefaultContent() { Map fileMap = new TreeMap<>(); - IcuData icuData = TransformsMapper.process(cldrData(), wrap(fileMap)); + IcuData icuData = TransformsMapper.process(cldrData(), wrap(fileMap), HEADER_LINES); assertThat(fileMap).isEmpty(); @@ -88,7 +91,7 @@ public class TransformsMapperTest { cldrData(oneWay("foo", "bar", FORWARD, null, INTERNAL, "first second third", ++idx)); Map fileMap = new TreeMap<>(); - IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap)); + IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap), HEADER_LINES); assertThat(icuData).getPaths().hasSize(DEFAULT_PATH_COUNT + 5); assertThat(icuData).hasValuesFor("RuleBasedTransliteratorIDs/first/alias", "foo-bar"); @@ -118,7 +121,7 @@ public class TransformsMapperTest { cldrData(oneWay("foo", "bar", BACKWARD, "variant", EXTERNAL, "one two three", ++idx)); Map fileMap = new TreeMap<>(); - IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap)); + IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap), HEADER_LINES); assertThat(icuData).getPaths().hasSize(DEFAULT_PATH_COUNT + 5); assertThat(icuData).hasValuesFor("RuleBasedTransliteratorIDs/one/alias", "bar-foo/variant"); @@ -149,7 +152,7 @@ public class TransformsMapperTest { both("foo", "bar", null, INTERNAL, "forward-alias", "backward-alias", ++idx)); Map fileMap = new TreeMap<>(); - IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap)); + IcuData icuData = TransformsMapper.process(cldrData, wrap(fileMap), HEADER_LINES); // 3 for 
each direction. assertThat(icuData).getPaths().hasSize(DEFAULT_PATH_COUNT + 6); @@ -188,9 +191,7 @@ public class TransformsMapperTest { private String headerPlusLines(String... lines) { // For now the files always contain a blank line at the end (to match legacy behaviour) but // this can, and probably should be changed. - return Stream - .concat(FILE_HEADER.stream(), Arrays.stream(lines)) - .collect(joining("\n", "", "\n\n")); + return Arrays.stream(lines).collect(joining("\n", FILE_HEADER, "\n\n")); } private static CldrData cldrData(CldrValue... values) {