ICU-20693 New LDML to ICU tooling.

See #721
This commit is contained in:
David Beaumont 2019-08-24 15:14:52 +00:00 committed by Markus Scherer
parent 97516f58b1
commit 2528d0bec1
46 changed files with 8684 additions and 0 deletions

7
tools/cldr/cldr-to-icu/.gitignore vendored Normal file
View file

@ -0,0 +1,7 @@
# Exclude the Maven local repository but keep the lib directory and the top-level readme.
/lib/**
!/lib/README.txt
# Ignore the default Maven target directory.
/target

View file

@ -0,0 +1,55 @@
*********************************************************************
*** © 2019 and later: Unicode, Inc. and others. ***
*** License & terms of use: http://www.unicode.org/copyright.html ***
*********************************************************************
Basic instructions for running the LdmlConverter via Maven
==========================================================
Note that these instructions do not currently support configuration of the converter for things
such as limiting the set of files produced. That is supported in code and could be easily added
to the binary, or encapsulated via an Ant task, but currently it is not directly supported.
See the IcuConverterConfig class for the API by which this can be supported.
Important directories
---------------------
<CLDR_DIR> = The root directory of the CLDR release.
<ICU_DIR> = The root directory of the ICU release (probably a parent directory of where
this README file is located). This is an optional property and defaults to
the parent directory of the release from which it is run.
<DTD_CACHE> = The temporary cache directory in which DTD files are downloaded (this is the
same directory as would be used when running tools from the CLDR project).
Note that the need to specify this directory is scheduled to be removed after
ICU release 65.
<OUT_DIR> = The output directory into which ICU data files should be written.
Generating all ICU data
-----------------------
$ mvn exec:java \
-DCLDR_DIR='<CLDR_DIR>' \
-DCLDR_DTD_CACHE='<DTD_CACHE>' \
-Dexec.args='<OUT_DIR>'
Running unit tests
------------------
$ mvn test \
-DCLDR_DIR='<CLDR_DIR>' \
-DCLDR_DTD_CACHE='<DTD_CACHE>'
Importing and running from an IDE
---------------------------------
This project should be easy to import into an IDE which supports Maven development, such
as IntelliJ or Eclipse. It uses a local Maven repository directory for the unpublished
CLDR libraries (which are included in the project), but otherwise gets all dependencies
via Maven's public repositories.

View file

@ -0,0 +1,61 @@
*********************************************************************
*** © 2019 and later: Unicode, Inc. and others. ***
*** License & terms of use: http://www.unicode.org/copyright.html ***
*********************************************************************
What is this directory and why is it empty?
-------------------------------------------
This is the root of a local Maven repository which needs to be populated before the
code in this project can be executed.
To do this, you need to have a local copy of the CLDR project configured on your
computer and be able to build the API jar file and copy an existing utility
jar file. In the examples below it is assumed that <CLDR_ROOT> references this CLDR
release.
Regenerating the CLDR API jar
-----------------------------
To regenerate the CLDR API jar you need to build the "jar" target using the Ant
build.xml file in the "tools/java" directory of the CLDR project:
$ cd <CLDR_ROOT>/tools/java
$ ant clean jar
This should result in the cldr.jar file being built into that directory, which can then
be installed as a Maven dependency as described below.
Updating local Maven repository
-------------------------------
To update the local Maven repository (e.g. to install the CLDR jar), run the following
from this directory (lib/):
$ mvn install:install-file \
-DgroupId=org.unicode.cldr \
-DartifactId=cldr-api \
-Dversion=0.1-SNAPSHOT \
-Dpackaging=jar \
-DgeneratePom=true \
-DlocalRepositoryPath=. \
-Dfile=<CLDR_ROOT>/tools/java/cldr.jar
And also (for the utility jar):
$ mvn install:install-file \
-DgroupId=com.ibm.icu \
-DartifactId=icu-utilities \
-Dversion=0.1-SNAPSHOT \
-Dpackaging=jar \
-DgeneratePom=true \
-DlocalRepositoryPath=. \
-Dfile=<CLDR_ROOT>/tools/java/libs/utilities.jar
And if you have updated one of these libraries, run:
$ mvn dependency:purge-local-repository -DsnapshotsOnly=true
If you choose to update the version number, then remember to update the root pom.xml.

View file

@ -0,0 +1,83 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- © 2019 and later: Unicode, Inc. and others.
License & terms of use: http://www.unicode.org/copyright.html
See README.txt for instructions on updating the local repository.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.unicode.icu</groupId>
<artifactId>cldr-to-icu</artifactId>
<version>1.0-SNAPSHOT</version>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5.1</version>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<configuration>
<mainClass>org.unicode.icu.tool.cldrtoicu.LdmlConverter</mainClass>
<systemProperties>
<property>
<key>ICU_DIR</key>
<value>${project.basedir}/../../..</value>
</property>
</systemProperties>
</configuration>
</plugin>
</plugins>
</build>
<!-- This is where the snapshots of the CLDR API and additional auxiliary jars are held. -->
<repositories>
<repository>
<id>local-maven-repo</id>
<url>file:///${project.basedir}/lib</url>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>org.unicode.cldr</groupId>
<artifactId>cldr-api</artifactId>
<version>0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu-utilities</artifactId>
<version>0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>64.2</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>27.1-jre</version>
</dependency>
<dependency>
<groupId>com.google.truth</groupId>
<artifactId>truth</artifactId>
<version>1.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.google.truth.extensions</groupId>
<artifactId>truth-java8-extension</artifactId>
<version>1.0</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>

View file

@ -0,0 +1,381 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import org.unicode.cldr.api.CldrDraftStatus;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
/**
* The converter config intended to generate the standard ICU data files. This used to be something
* that was configured by text files such as "icu-locale-deprecates.xml" and "icu-config.xml".
*/
public final class IcuConverterConfig implements LdmlConverterConfig {
// Default CLDR base directory, read once from the "CLDR_DIR" system property and converted to
// an absolute path. Empty if the property is not set.
private static final Optional<Path> DEFAULT_CLDR_DIR =
Optional.ofNullable(System.getProperty("CLDR_DIR", null))
.map(d -> Paths.get(d).toAbsolutePath());
// Default ICU release directory, read once from the "ICU_DIR" system property and converted to
// an absolute path. Empty if the property is not set. The builder derives its default output
// and "specials" directories from this value.
private static final Optional<Path> DEFAULT_ICU_DIR =
Optional.ofNullable(System.getProperty("ICU_DIR", null))
.map(d -> Paths.get(d).toAbsolutePath());
/** The builder with which to specify configuration for the {@link LdmlConverter}. */
public static final class Builder {
    // Defaults are derived from the CLDR_DIR and ICU_DIR system properties (null if unset, in
    // which case the corresponding setter must be called before build()).
    private Path cldrDir = DEFAULT_CLDR_DIR.orElse(null);
    private Path outputDir =
        DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data")).orElse(null);
    private Path specialsDir =
        DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data/xml")).orElse(null);
    private ImmutableSet<OutputType> outputTypes = OutputType.ALL;
    private CldrDraftStatus minimalDraftStatus = CldrDraftStatus.CONTRIBUTED;
    private boolean emitReport = false;

    /**
     * Sets the CLDR base directory from which to load all CLDR data. This is optional if the
     * {@code CLDR_DIR} system property is set, which will be used instead. The given path is
     * converted to an absolute path.
     *
     * @throws NullPointerException if {@code cldrDir} is null.
     */
    public Builder setCldrDir(Path cldrDir) {
        // Check for null before dereferencing so the check is actually meaningful.
        this.cldrDir = checkNotNull(cldrDir).toAbsolutePath();
        return this;
    }

    /**
     * Sets the output directory in which the ICU data directories and files will go. This is
     * optional if the {@code ICU_DIR} system property is set, which will be used to generate
     * the path instead (i.e. {@code "icu4c/source/data"} inside the ICU release directory).
     *
     * @throws NullPointerException if {@code outputDir} is null.
     */
    public Builder setOutputDir(Path outputDir) {
        this.outputDir = checkNotNull(outputDir);
        return this;
    }

    /**
     * Sets the "specials" directory containing additional ICU specific data to be processed.
     * This is optional if the {@code ICU_DIR} system property is set, which will be used to
     * generate the path instead (i.e. {@code "icu4c/source/data/xml"} inside the ICU release
     * directory).
     *
     * @throws NullPointerException if {@code specialsDir} is null.
     */
    public Builder setSpecialsDir(Path specialsDir) {
        this.specialsDir = checkNotNull(specialsDir);
        return this;
    }

    /**
     * Sets the output types which will be converted. This is optional and defaults to {@link
     * OutputType#ALL}. The given types are copied, so later changes to {@code types} have no
     * effect on this builder.
     */
    public Builder setOutputTypes(Iterable<OutputType> types) {
        this.outputTypes = ImmutableSet.copyOf(types);
        return this;
    }

    /**
     * Sets the minimum draft status for CLDR data to be converted (paths below this status are
     * ignored during conversion). This is optional and defaults to {@link
     * CldrDraftStatus#CONTRIBUTED}.
     *
     * @throws NullPointerException if {@code minimalDraftStatus} is null.
     */
    public Builder setMinimalDraftStatus(CldrDraftStatus minimalDraftStatus) {
        this.minimalDraftStatus = checkNotNull(minimalDraftStatus);
        return this;
    }

    /**
     * Sets the flag reported by {@link LdmlConverterConfig#emitReport()}. This is optional and
     * defaults to {@code false}.
     */
    // NOTE(review): presumably this controls whether the converter emits a summary report;
    // confirm against LdmlConverter.
    public Builder setEmitReport(boolean emitReport) {
        this.emitReport = emitReport;
        return this;
    }

    /**
     * Returns a converter config from the current builder state.
     *
     * @throws NullPointerException if a required directory has neither been set explicitly nor
     *     could be derived from a system property.
     * @throws IllegalArgumentException if the configured directories or output types are
     *     invalid.
     */
    public LdmlConverterConfig build() {
        return new IcuConverterConfig(this);
    }
}
private final Path cldrDir;
private final Path outputDir;
private final Path specialsDir;
private final ImmutableSet<OutputType> outputTypes;
private final CldrDraftStatus minimalDraftStatus;
private final boolean emitReport;
/**
 * Creates a config from the given builder, validating the configured values.
 *
 * @throws NullPointerException if the CLDR, output or specials directory is unset.
 * @throws IllegalArgumentException if the output path is a regular file, the specials
 *     directory does not exist, or no output types were specified.
 */
private IcuConverterConfig(Builder builder) {
    this.cldrDir = checkNotNull(builder.cldrDir,
        "must set a CLDR directory, or the CLDR_DIR system property");
    // An explicitly specified directory which differs from the system property is allowed, but
    // is suspicious enough to warrant a warning.
    if (DEFAULT_CLDR_DIR.isPresent() && !this.cldrDir.equals(DEFAULT_CLDR_DIR.get())) {
        System.err.format(
            "Warning: Specified CLDR base directory does not appear to match the"
                + " directory inferred by the 'CLDR_DIR' system property.\n"
                + "Specified: %s\n"
                + "Inferred: %s\n",
            this.cldrDir, DEFAULT_CLDR_DIR.get());
    }
    this.outputDir = checkNotNull(builder.outputDir,
        "must set an output directory, or the ICU_DIR system property");
    // The output directory need not exist yet, but it must not be an existing regular file.
    checkArgument(!Files.isRegularFile(outputDir),
        "specified output directory is not a directory: %s", outputDir);
    this.specialsDir = checkNotNull(builder.specialsDir,
        "must specify a 'specials' XML directory");
    checkArgument(Files.isDirectory(specialsDir),
        "specified specials directory does not exist: %s", specialsDir);
    this.outputTypes = builder.outputTypes;
    checkArgument(!this.outputTypes.isEmpty(),
        "must specify at least one output type to be generated (possible values are: %s)",
        Arrays.asList(OutputType.values()));
    this.minimalDraftStatus = builder.minimalDraftStatus;
    this.emitReport = builder.emitReport;
}
/**
 * Returns a new builder whose directory defaults are derived from the {@code CLDR_DIR} and
 * {@code ICU_DIR} system properties (where set).
 */
public static Builder builder() {
return new Builder();
}
// Simple accessors for the values validated and captured in the constructor.

/** Returns the CLDR base directory (never null). */
@Override public Path getCldrDirectory() {
return cldrDir;
}
/** Returns the directory into which ICU data is written (never null). */
@Override public Path getOutputDir() {
return outputDir;
}
/** Returns the (non-empty, immutable) set of output types to be generated. */
@Override public Set<OutputType> getOutputTypes() {
return outputTypes;
}
/** Returns the minimum draft status configured via the builder. */
@Override public CldrDraftStatus getMinimumDraftStatus() {
return minimalDraftStatus;
}
/** Returns the "specials" directory, which was verified to exist at construction time. */
@Override public Path getSpecialsDir() {
return specialsDir;
}
/** Returns the "emit report" flag configured via the builder. */
@Override public boolean emitReport() {
return emitReport;
}
// The forced aliases below are hard-coded "hacks" for now; they could be made configurable via
// the builder if that were ever wanted.
@Override public Map<String, String> getForcedAliases(IcuLocaleDir dir) {
    Map<String, String> aliases;
    switch (dir) {
    case COLL:
        aliases = ImmutableMap.of(
            // Nobody seems to know why this alias exists (normally "sr_Latn_ME" would be
            // expected here).
            // TODO: Find out and document this properly.
            "sr_ME", "sr_Cyrl_ME",
            // These look like a hack to avoid copying and maintaining the "zh" data for
            // "yue" (the "yue" files in this directory should otherwise be empty).
            //
            // "yue_Hans" maximizes to "yue_Hans_CN" (vs "zh_Hans_CN") and "yue" maximizes to
            // "yue_Hant_HK" (vs "zh_Hant_HK"), so in effect these aliases only rewrite the
            // base language.
            "yue_Hans", "zh_Hans",
            "yue", "zh_Hant");
        break;
    case RBNF:
        // The reason for this alias is also unclear. It is not the same situation as above,
        // since (a) the alias direction is reversed and (b) "zh_Hant" exists with different
        // data than "yue", so this is more than a rewrite of the base language.
        // TODO: Find out and document this properly.
        aliases = ImmutableMap.of("zh_Hant_HK", "yue");
        break;
    default:
        aliases = ImmutableMap.of();
        break;
    }
    return aliases;
}
// The locale IDs returned for a directory denote the supported/available locales for that API.
// Most directories share the primary set; a few support only a subset of those IDs.
@Override public ImmutableSet<String> getTargetLocaleIds(IcuLocaleDir dir) {
    // Start with the primary set and narrow it for the directories with their own subset.
    ImmutableSet<String> localeIds = ICU_LOCALE_IDS;
    switch (dir) {
    case COLL:
        localeIds = COLL_LOCALE_IDS;
        break;
    case BRKITR:
        localeIds = BRKITR_LOCALE_IDS;
        break;
    case RBNF:
        localeIds = RBNF_LOCALE_IDS;
        break;
    default:
        break;
    }
    return localeIds;
}
// The primary set of locale IDs to be generated. Other, directory specific, sets should be
// subsets of this. Some of these ID are aliases, so XML files may not exist for all of them.
//
// This was further modified (in order to better match the set of generated ICU files) by:
// * Removing "es_003" (which just seems to be ignored in current code)
// * Adding: "en_NH", "sr_XK", "yue_CN", "yue_HK" (deprecated locale IDs in the manual config)
// * Adding: "no_NO_NY" (a not even structurally valid ID that exists for very legacy reasons)
private static final ImmutableSet<String> ICU_LOCALE_IDS = ImmutableSet.of(
"root",
// A
"af", "af_NA", "af_ZA", "agq", "agq_CM", "ak", "ak_GH", "am", "am_ET", "ar", "ar_001",
"ar_AE", "ar_BH", "ar_DJ", "ar_DZ", "ar_EG", "ar_EH", "ar_ER", "ar_IL", "ar_IQ",
"ar_JO", "ar_KM", "ar_KW", "ar_LB", "ar_LY", "ar_MA", "ar_MR", "ar_OM", "ar_PS",
"ar_QA", "ar_SA", "ar_SD", "ar_SO", "ar_SS", "ar_SY", "ar_TD", "ar_TN", "ar_YE", "ars",
"as", "as_IN", "asa", "asa_TZ", "ast", "ast_ES", "az", "az_AZ", "az_Cyrl", "az_Cyrl_AZ",
"az_Latn", "az_Latn_AZ",
// B
"bas", "bas_CM", "be", "be_BY", "bem", "bem_ZM", "bez", "bez_TZ", "bg", "bg_BG", "bm",
"bm_ML", "bn", "bn_BD", "bn_IN", "bo", "bo_CN", "bo_IN", "br", "br_FR", "brx", "brx_IN",
"bs", "bs_Cyrl", "bs_Cyrl_BA", "bs_Latn", "bs_Latn_BA", "bs_BA",
// C
"ca", "ca_AD", "ca_ES", "ca_FR", "ca_IT", "ccp", "ccp_BD", "ccp_IN", "ce", "ce_RU",
"ceb", "ceb_PH", "cgg", "cgg_UG", "chr", "chr_US", "ckb", "ckb_IQ", "ckb_IR", "cs",
"cs_CZ", "cy", "cy_GB",
// D
"da", "da_DK", "da_GL", "dav", "dav_KE", "de", "de_AT", "de_BE", "de_CH", "de_DE",
"de_IT", "de_LI", "de_LU", "dje", "dje_NE", "dsb", "dsb_DE", "dua", "dua_CM", "dyo",
"dyo_SN", "dz", "dz_BT",
// E
"ebu", "ebu_KE", "ee", "ee_GH", "ee_TG", "el", "el_CY", "el_GR", "en", "en_001",
"en_150", "en_AE", "en_AG", "en_AI", "en_AS", "en_AT", "en_AU", "en_BB", "en_BE",
"en_BI", "en_BM", "en_BS", "en_BW", "en_BZ", "en_CA", "en_CC", "en_CH", "en_CK",
"en_CM", "en_CX", "en_CY", "en_DE", "en_DG", "en_DK", "en_DM", "en_ER", "en_FI",
"en_FJ", "en_FK", "en_FM", "en_GB", "en_GD", "en_GG", "en_GH", "en_GI", "en_GM",
"en_GU", "en_GY", "en_HK", "en_IE", "en_IL", "en_IM", "en_IN", "en_IO", "en_JE",
"en_JM", "en_KE", "en_KI", "en_KN", "en_KY", "en_LC", "en_LR", "en_LS", "en_MG",
"en_MH", "en_MO", "en_MP", "en_MS", "en_MT", "en_MU", "en_MW", "en_MY", "en_NA",
"en_NF", "en_NG", "en_NH", "en_NL", "en_NR", "en_NU", "en_NZ", "en_PG", "en_PH",
"en_PK", "en_PN", "en_PR", "en_PW", "en_RH", "en_RW", "en_SB", "en_SC", "en_SD",
"en_SE", "en_SG", "en_SH", "en_SI", "en_SL", "en_SS", "en_SX", "en_SZ", "en_TC",
"en_TK", "en_TO", "en_TT", "en_TV", "en_TZ", "en_UG", "en_UM", "en_US", "en_US_POSIX",
"en_VC", "en_VG", "en_VI", "en_VU", "en_WS", "en_ZA", "en_ZM", "en_ZW", "eo",
"eo_001", "es", "es_419", "es_AR", "es_BO", "es_BR", "es_BZ", "es_CL", "es_CO",
"es_CR", "es_CU", "es_DO", "es_EA", "es_EC", "es_ES", "es_GQ", "es_GT", "es_HN",
"es_IC", "es_MX", "es_NI", "es_PA", "es_PE", "es_PH", "es_PR", "es_PY", "es_SV",
"es_US", "es_UY", "es_VE", "et", "et_EE", "eu", "eu_ES", "ewo", "ewo_CM",
// F
"fa", "fa_AF", "fa_IR", "ff", "ff_CM", "ff_GN", "ff_Latn", "ff_Latn_BF", "ff_Latn_CM",
"ff_Latn_GH", "ff_Latn_GM", "ff_Latn_GN", "ff_Latn_GW", "ff_Latn_LR", "ff_Latn_MR",
"ff_Latn_NE", "ff_Latn_NG", "ff_Latn_SL", "ff_Latn_SN", "ff_MR", "ff_SN", "fi",
"fi_FI", "fil", "fil_PH", "fo", "fo_DK", "fo_FO", "fr", "fr_BE", "fr_BF", "fr_BI",
"fr_BJ", "fr_BL", "fr_CA", "fr_CD", "fr_CF", "fr_CG", "fr_CH", "fr_CI", "fr_CM",
"fr_DJ", "fr_DZ", "fr_FR", "fr_GA", "fr_GF", "fr_GN", "fr_GP", "fr_GQ", "fr_HT",
"fr_KM", "fr_LU", "fr_MA", "fr_MC", "fr_MF", "fr_MG", "fr_ML", "fr_MQ", "fr_MR",
"fr_MU", "fr_NC", "fr_NE", "fr_PF", "fr_PM", "fr_RE", "fr_RW", "fr_SC", "fr_SN",
"fr_SY", "fr_TD", "fr_TG", "fr_TN", "fr_VU", "fr_WF", "fr_YT", "fur", "fur_IT",
"fy", "fy_NL",
// G
"ga", "ga_IE", "gd", "gd_GB", "gl", "gl_ES", "gsw", "gsw_CH", "gsw_FR", "gsw_LI",
"gu", "gu_IN", "guz", "guz_KE", "gv", "gv_IM",
// H
"ha", "ha_GH", "ha_NE", "ha_NG", "haw", "haw_US", "he", "he_IL", "hi", "hi_IN",
"hr", "hr_BA", "hr_HR", "hsb", "hsb_DE", "hu", "hu_HU", "hy", "hy_AM",
// I
"ia", "ia_001", "id", "id_ID", "ig", "ig_NG", "ii", "ii_CN", "in", "in_ID", "is",
"is_IS", "it", "it_CH", "it_IT", "it_SM", "it_VA", "iw", "iw_IL",
// J
"ja", "ja_JP", "ja_JP_TRADITIONAL", "jgo", "jgo_CM", "jmc", "jmc_TZ", "jv", "jv_ID",
// K
"ka", "ka_GE", "kab", "kab_DZ", "kam", "kam_KE", "kde", "kde_TZ", "kea", "kea_CV",
"khq", "khq_ML", "ki", "ki_KE", "kk", "kk_KZ", "kkj", "kkj_CM", "kl", "kl_GL", "kln",
"kln_KE", "km", "km_KH", "kn", "kn_IN", "ko", "ko_KP", "ko_KR", "kok", "kok_IN",
"ks", "ks_IN", "ksb", "ksb_TZ", "ksf", "ksf_CM", "ksh", "ksh_DE", "ku", "ku_TR",
"kw", "kw_GB", "ky", "ky_KG",
// L
"lag", "lag_TZ", "lb", "lb_LU", "lg", "lg_UG", "lkt", "lkt_US", "ln", "ln_AO",
"ln_CD", "ln_CF", "ln_CG", "lo", "lo_LA", "lrc", "lrc_IQ", "lrc_IR", "lt", "lt_LT",
"lu", "lu_CD", "luo", "luo_KE", "luy", "luy_KE", "lv", "lv_LV",
// M
"mas", "mas_KE", "mas_TZ", "mer", "mer_KE", "mfe", "mfe_MU", "mg", "mg_MG", "mgh",
"mgh_MZ", "mgo", "mgo_CM", "mi", "mi_NZ", "mk", "mk_MK", "ml", "ml_IN", "mn",
"mn_MN", "mo", "mr", "mr_IN", "ms", "ms_BN", "ms_MY", "ms_SG", "mt", "mt_MT", "mua",
"mua_CM", "my", "my_MM", "mzn", "mzn_IR",
// N
"naq", "naq_NA", "nb", "nb_NO", "nb_SJ", "nd", "nd_ZW", "nds", "nds_DE", "nds_NL",
"ne", "ne_IN", "ne_NP", "nl", "nl_AW", "nl_BE", "nl_BQ", "nl_CW", "nl_NL", "nl_SR",
"nl_SX", "nmg", "nmg_CM", "nn", "nn_NO", "nnh", "nnh_CM", "no", "no_NO", "no_NO_NY",
"nus", "nus_SS", "nyn", "nyn_UG",
// O
"om", "om_ET", "om_KE", "or", "or_IN", "os", "os_GE", "os_RU",
// P
"pa", "pa_Arab", "pa_Arab_PK", "pa_Guru", "pa_Guru_IN", "pa_IN", "pa_PK", "pl",
"pl_PL", "ps", "ps_AF", "ps_PK", "pt", "pt_AO", "pt_BR", "pt_CH", "pt_CV", "pt_GQ",
"pt_GW", "pt_LU", "pt_MO", "pt_MZ", "pt_PT", "pt_ST", "pt_TL",
// Q
"qu", "qu_BO", "qu_EC", "qu_PE",
// R
"rm", "rm_CH", "rn", "rn_BI", "ro", "ro_MD", "ro_RO", "rof", "rof_TZ", "ru",
"ru_BY", "ru_KG", "ru_KZ", "ru_MD", "ru_RU", "ru_UA", "rw", "rw_RW", "rwk", "rwk_TZ",
// S
"sah", "sah_RU", "saq", "saq_KE", "sbp", "sbp_TZ", "sd", "sd_PK", "se", "se_FI",
"se_NO", "se_SE", "seh", "seh_MZ", "ses", "ses_ML", "sg", "sg_CF", "sh", "sh_BA",
"sh_CS", "sh_YU", "shi", "shi_Latn", "shi_Latn_MA", "shi_Tfng", "shi_Tfng_MA",
"shi_MA", "si", "si_LK", "sk", "sk_SK", "sl", "sl_SI", "smn", "smn_FI", "sn",
"sn_ZW", "so", "so_DJ", "so_ET", "so_KE", "so_SO", "sq", "sq_AL", "sq_MK", "sq_XK",
"sr", "sr_Cyrl", "sr_Cyrl_BA", "sr_Cyrl_ME", "sr_Cyrl_RS", "sr_Cyrl_CS", "sr_Cyrl_XK",
"sr_Cyrl_YU", "sr_Latn", "sr_Latn_BA", "sr_Latn_ME", "sr_Latn_RS", "sr_Latn_CS",
"sr_Latn_XK", "sr_Latn_YU", "sr_BA", "sr_ME", "sr_RS", "sr_CS", "sr_XK", "sr_YU",
"sv", "sv_AX", "sv_FI", "sv_SE", "sw", "sw_CD", "sw_KE", "sw_TZ", "sw_UG",
// T
"ta", "ta_IN", "ta_LK", "ta_MY", "ta_SG", "te", "te_IN", "teo", "teo_KE", "teo_UG",
"tg", "tg_TJ", "th", "th_TH", "th_TH_TRADITIONAL", "ti", "ti_ER", "ti_ET", "tk",
"tk_TM", "tl", "tl_PH", "to", "to_TO", "tr", "tr_CY", "tr_TR", "tt", "tt_RU",
"twq", "twq_NE", "tzm", "tzm_MA",
// U
"ug", "ug_CN", "uk", "uk_UA", "ur", "ur_IN", "ur_PK", "uz", "uz_AF", "uz_Arab",
"uz_Arab_AF", "uz_Cyrl", "uz_Cyrl_UZ", "uz_Latn", "uz_Latn_UZ", "uz_UZ",
// V
"vai", "vai_Latn", "vai_Latn_LR", "vai_LR", "vai_Vaii", "vai_Vaii_LR", "vi",
"vi_VN", "vun", "vun_TZ",
// W
"wae", "wae_CH", "wo", "wo_SN",
// X
"xh", "xh_ZA", "xog", "xog_UG",
// Y
"yav", "yav_CM", "yi", "yi_001", "yo", "yo_BJ", "yo_NG", "yue", "yue_CN", "yue_HK",
"yue_Hans", "yue_Hans_CN", "yue_Hant", "yue_Hant_HK",
// Z
"zgh", "zgh_MA", "zh", "zh_Hans", "zh_Hans_CN", "zh_Hans_HK", "zh_Hans_MO",
"zh_Hans_SG", "zh_Hant", "zh_Hant_HK", "zh_Hant_MO", "zh_Hant_TW", "zh_CN",
"zh_HK", "zh_MO", "zh_SG", "zh_TW", "zu", "zu_ZA");
// The locale IDs returned by getTargetLocaleIds() for the COLL directory.
private static final ImmutableSet<String> COLL_LOCALE_IDS = ImmutableSet.of(
"root",
// A-B
"af", "am", "ars", "ar", "as", "az", "be", "bg", "bn", "bo", "bs_Cyrl", "bs",
// C-F
"ca", "ceb", "chr", "cs", "cy", "da", "de_AT", "de", "dsb", "dz", "ee", "el", "en",
"en_US_POSIX", "en_US", "eo", "es", "et", "fa_AF", "fa", "fil", "fi", "fo", "fr_CA", "fr",
// G-J
"ga", "gl", "gu", "ha", "haw", "he", "hi", "hr", "hsb", "hu", "hy",
"id_ID", "id", "ig", "in", "in_ID", "is", "it", "iw_IL", "iw", "ja",
// K-P
"ka", "kk", "kl", "km", "kn", "kok", "ko", "ku", "ky", "lb", "lkt", "ln", "lo", "lt", "lv",
"mk", "ml", "mn", "mo", "mr", "ms", "mt", "my", "nb", "ne", "nl", "nn", "no_NO", "no",
"om", "or", "pa_IN", "pa", "pa_Guru", "pl", "ps", "pt",
// R-T
"ro", "ru", "se", "sh_BA", "sh_CS", "sh", "sh_YU", "si", "sk", "sl", "smn", "sq",
"sr_BA", "sr_Cyrl_ME", "sr_Latn", "sr_ME", "sr_RS", "sr", "sv", "sw",
"ta", "te", "th", "tk", "to", "tr",
// U-Z
"ug", "uk", "ur", "uz", "vi", "wae", "wo", "xh", "yi", "yo", "yue_CN", "yue_Hans",
"yue", "zh_CN", "zh_Hant", "zh_HK", "zh_MO", "zh_SG", "zh_TW", "zh", "zu");
// The locale IDs returned by getTargetLocaleIds() for the BRKITR directory.
private static final ImmutableSet<String> BRKITR_LOCALE_IDS = ImmutableSet.of(
"root", "de", "el", "en", "en_US_POSIX", "en_US", "es", "fr", "it", "ja", "pt", "ru",
"zh_Hant", "zh");
// The locale IDs returned by getTargetLocaleIds() for the RBNF directory.
private static final ImmutableSet<String> RBNF_LOCALE_IDS = ImmutableSet.of(
"root", "af", "ak", "am", "ars", "ar", "az", "be", "bg", "bs", "ca", "ccp", "chr", "cs",
"cy", "da", "de_CH", "de", "ee", "el", "en_001", "en_IN", "en", "eo", "es_419", "es_DO",
"es_GT", "es_HN", "es_MX", "es_NI", "es_PA", "es_PR", "es_SV", "es", "es_US", "et",
"fa_AF", "fa", "ff", "fil", "fi", "fo", "fr_BE", "fr_CH", "fr", "ga", "he", "hi", "hr",
"hu", "hy", "id", "in", "is", "it", "iw", "ja", "ka", "kl", "km", "ko", "ky", "lb",
"lo", "lrc", "lt", "lv", "mk", "ms", "mt", "my", "nb", "nl", "nn", "no", "pl", "pt_PT",
"pt", "qu", "ro", "ru", "se", "sh", "sk", "sl", "sq", "sr_Latn", "sr", "sv",
"sw", "ta", "th", "tr", "uk", "vi", "yue_Hans", "yue", "zh_Hant_HK", "zh_Hant", "zh_HK",
"zh_MO", "zh_TW", "zh");
}

View file

@ -0,0 +1,165 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.base.Preconditions.checkArgument;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.NavigableSet;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ListMultimap;
/**
* Mutable ICU data, represented as a mapping from resource bundle paths to a sequence of values.
*/
public final class IcuData {
    private static final RbPath RB_VERSION = RbPath.of("Version");
    // Matches "/foo/bar" or "/foo/bar[N]". Group 1 is the path and group 2 is the optional index
    // (group 2 is null when there is no "[N]" suffix).
    private static final Pattern ARRAY_INDEX = Pattern.compile("(/[^\\[]++)(?:\\[(\\d++)\\])?$");

    private final String name;
    private final boolean hasFallback;
    // The sorted set of all paths which have been added. Note that a path added with an empty
    // sequence of values is present here even though it has no entries in the multimap.
    private final NavigableSet<RbPath> paths = new TreeSet<>();
    private final ListMultimap<RbPath, RbValue> rbPathToValues = ArrayListMultimap.create();
    private ImmutableList<String> commentLines = ImmutableList.of();

    /**
     * IcuData constructor.
     *
     * @param name The name of the IcuData object, used as the name of the root node in the output file
     * @param hasFallback true if the output file has another ICU file as a fallback.
     */
    public IcuData(String name, boolean hasFallback) {
        this.hasFallback = hasFallback;
        this.name = name;
    }

    /** @return whether data should fallback on data in other ICU files. */
    public boolean hasFallback() {
        return hasFallback;
    }

    /**
     * @return the name of this ICU data instance. Used in the output filename, and in comments.
     */
    public String getName() {
        return name;
    }

    /** Sets additional comment lines for the top of the file. */
    public void setFileComment(String... commentLines) {
        setFileComment(Arrays.asList(commentLines));
    }

    /** Sets additional comment lines for the top of the file (the given lines are copied). */
    public void setFileComment(Iterable<String> commentLines) {
        this.commentLines = ImmutableList.copyOf(commentLines);
    }

    /** Returns the immutable list of comment lines (empty if none were set). */
    public List<String> getFileComment() {
        return commentLines;
    }

    /** Adds a singleton resource bundle value for a given path. */
    public void add(RbPath rbPath, String element) {
        add(rbPath, RbValue.of(element));
    }

    /** Adds a single resource bundle value for a given path. */
    public void add(RbPath rbPath, RbValue rbValue) {
        rbPathToValues.put(rbPath, rbValue);
        paths.add(rbPath);
    }

    /** Adds a sequence of resource bundle values for a given path. */
    public void add(RbPath rbPath, Iterable<RbValue> rbValues) {
        rbValues.forEach(v -> rbPathToValues.put(rbPath, v));
        paths.add(rbPath);
    }

    /** Replaces all resource bundle values for a given path with the specified singleton value. */
    public void replace(RbPath rbPath, String element) {
        replace(rbPath, RbValue.of(element));
    }

    /** Replaces all resource bundle values for a given path with the specified value. */
    public void replace(RbPath rbPath, RbValue rbValue) {
        rbPathToValues.removeAll(rbPath);
        add(rbPath, rbValue);
    }

    /** Adds the given version string as a value of the top-level "Version" path. */
    public void setVersion(String versionString) {
        add(RB_VERSION, versionString);
    }

    /**
     * Adds the values of the given transformation results to this instance. Grouped results are
     * added as a single value; ungrouped results are added as one value per result value, with
     * path references (e.g. "/foo/bar[N]") being resolved against the data already present,
     * except for paths whose last segment ends with ":alias" (which are added verbatim).
     */
    public void addResults(ListMultimap<RbPath, PathValueTransformer.Result> resultsByRbPath) {
        for (RbPath rbPath : resultsByRbPath.keySet()) {
            for (PathValueTransformer.Result r : resultsByRbPath.get(rbPath)) {
                if (r.isGrouped()) {
                    // Grouped results have all the values in a single value entry.
                    add(rbPath, RbValue.of(r.getValues()));
                } else {
                    if (rbPath.getSegment(rbPath.length() - 1).endsWith(":alias")) {
                        // Alias values are added as-is and are never treated as references.
                        r.getValues().forEach(v -> add(rbPath, RbValue.of(v)));
                    } else {
                        // Ungrouped results are one value per entry, but might be expanded into
                        // grouped results if they are a path referencing a grouped entry.
                        r.getValues().forEach(v -> add(rbPath, replacePathValues(v)));
                    }
                }
            }
        }
    }

    /**
     * Replaces an ungrouped CLDR value for the form "/foo/bar" or "/foo/bar[N]" which is assumed
     * to be a reference to an existing value in a resource bundle. Note that the referenced bundle
     * might be grouped (i.e. an array with more than one element).
     *
     * <p>Values which do not have this form are returned as a singleton value unchanged.
     *
     * @throws IllegalArgumentException if the referenced path is not present in this instance.
     * @throws IndexOutOfBoundsException if the referenced path has no value at the given index.
     */
    private RbValue replacePathValues(String value) {
        Matcher m = ARRAY_INDEX.matcher(value);
        if (!m.matches()) {
            return RbValue.of(value);
        }
        // The only constraint is that the "path" value starts with a leading '/', but parsing into
        // the RbPath ignores this. We must use "parse()" here, rather than RbPath.of(), since the
        // captured value contains '/' characters to represent path delimiters.
        RbPath replacePath = RbPath.parse(m.group(1));
        List<RbValue> replaceValues = get(replacePath);
        checkArgument(replaceValues != null, "Path %s is missing from IcuData", replacePath);
        // If no index is given (e.g. "/foo/bar") then treat it as index 0 (i.e. "/foo/bar[0]").
        // Note that groupCount() is a property of the pattern (always 2 here), so the presence
        // of the optional index must be determined by testing the captured group for null.
        String indexString = m.group(2);
        int replaceIndex = indexString != null ? Integer.parseInt(indexString) : 0;
        return replaceValues.get(replaceIndex);
    }

    /**
     * Returns the mutable list of values associated with the given path (or null if there are no
     * associated values).
     */
    public List<RbValue> get(RbPath rbPath) {
        return paths.contains(rbPath) ? rbPathToValues.get(rbPath) : null;
    }

    /** Returns an unmodifiable view of the set of paths in this instance. */
    public Set<RbPath> getPaths() {
        return Collections.unmodifiableSet(paths);
    }

    /** Returns whether the given path is present in this instance. */
    public boolean contains(RbPath rbPath) {
        return paths.contains(rbPath);
    }

    /** Returns whether there are any paths in this instance. */
    public boolean isEmpty() {
        return paths.isEmpty();
    }
}

View file

@ -0,0 +1,381 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.base.CharMatcher.whitespace;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkElementIndex;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.ImmutableList.toImmutableList;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import com.google.common.base.Joiner;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSetMultimap;
import com.google.common.collect.Iterables;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Multiset;
/**
* Helper tool to dump the resource bundle paths and values from an IcuData instance in a stable
* ordering, to allow easy comparison in cases where ICU ordering changes. This could easily be
* extended to be a more fully featured "diff" tool or a proper ICU data file parser.
*
* <p>This is a temporary debugging tool and should not be relied upon during any part of the data
* generation process.
*/
final class IcuDataDumper {
private static final Joiner LIST_JOINER = Joiner.on(',');
private static final RbPath VERSION = RbPath.of("Version");
/**
 * Dumps the resource bundle paths and values of one or more ICU data files to standard output.
 *
 * @param args the file or directory to be dumped, optionally followed by a regular expression
 *     which file names must match in full (only permitted when dumping a directory).
 * @throws IllegalArgumentException if the wrong number of arguments is given, or if a name
 *     pattern is given for a non-directory file.
 */
public static void main(String... args) throws IOException {
    Path fileOrDir;
    Optional<Pattern> name = Optional.empty();
    switch (args.length) {
    case 2:
        name = Optional.of(Pattern.compile(args[1]));
        // Deliberate fall-through: the first argument is always the file or directory.
    case 1:
        fileOrDir = Paths.get(args[0]);
        break;
    default:
        throw new IllegalArgumentException("Usage: <file-or-dir> [<name-pattern>]");
    }

    if (Files.isDirectory(fileOrDir)) {
        walkDirectory(fileOrDir, name);
    } else {
        checkArgument(!name.isPresent(),
            "cannot specify a name pattern for a non-directory file: %s", fileOrDir);
        IcuDataParser parser = new IcuDataParser(fileOrDir);
        parser.parse();
        dump(parser.icuData);
    }
}
private static void walkDirectory(Path fileOrDir, Optional<Pattern> name) throws IOException {
Predicate<Path> matchesName =
f -> name.map(n -> n.matcher(f.getFileName().toString()).matches()).orElse(true);
List<IcuDataParser> icuParsers;
try (Stream<Path> files = Files.walk(fileOrDir)) {
icuParsers = files
.filter(Files::isRegularFile)
.filter(matchesName)
.map(IcuDataParser::new)
.collect(toImmutableList());
}
ListMultimap<RbPath, RbValue> allPaths = ArrayListMultimap.create();
for (IcuDataParser p : icuParsers) {
p.parse();
for (RbPath k : p.icuData.keySet()) {
List<RbValue> values = p.icuData.get(k);
if (!allPaths.containsKey(k)) {
allPaths.putAll(k, values);
} else if (!VERSION.equals(k)) {
checkState(allPaths.get(k).equals(values), "inconsistent data for path: ", k);
}
}
}
dump(allPaths);
}
private static void dump(ListMultimap<RbPath, RbValue> allPaths) {
allPaths.keySet().stream()
.sorted()
.forEach(k -> System.out.println(k + " :: " + LIST_JOINER.join(allPaths.get(k))));
}
private static final class IcuDataParser {
// Path of file being parsed.
private final Path path;
// Comments in header (before data starts), without comment characters.
private final List<String> headerComment = new ArrayList<>();
// ICU data name (the name of the root element).
private String name = null;
// ICU data values.
private final ListMultimap<RbPath, RbValue> icuData = ArrayListMultimap.create();
// Current line number (1-indexed).
private int lineNumber = 0;
// The type of the previous line that was processed.
private LineType lastType = LineType.COMMENT;
// True when inside /* .. */ comments in the header.
private boolean inBlockComment = false;
// True when in the final top-level group at the end of parsing.
private boolean inFinalGroup = false;
// True when a partial (line wrapped) value has been read.
private boolean isLineContinuation = false;
// Current path while parsing (NOT including the root element).
private Deque<String> pathStack = new ArrayDeque<>();
// Current sequence of values for the path (as defined in the current path stack).
private List<String> currentValue = new ArrayList<>();
// Current partially read value of a multi-line value.
private String wrappedValue = "";
// Map of indices used to auto-generate names for anonymous path segments.
// TODO: Check if this is even needed and remove if not.
private Multiset<Integer> indices = HashMultiset.create();
IcuDataParser(Path path) {
this.path = checkNotNull(path);
}
public boolean parse() throws IOException {
List<String> lines = Files.readAllLines(path);
// Best approximation to a magic number be have (BOM plus inline comment). This stops
// use trying to parse the transliteration files, which are a different type.
if (!lines.get(0).startsWith("\uFEFF//")) {
return false;
}
lines.stream().map(whitespace()::trimFrom).forEach(this::processLineWithCheck);
// Sanity check for expected final state. Just checking the "lastType" should be enough
// to catch everything else (due to transition rules and how the code tidies up) but it
// seems prudent to sanity check everything just in case.
checkState(lastType == LineType.GROUP_END);
checkState(!inBlockComment);
checkState(name != null);
checkState(pathStack.isEmpty() && inFinalGroup);
checkState(wrappedValue.isEmpty() && currentValue.isEmpty());
return true;
}
void processLineWithCheck(String line) {
lineNumber++;
if (lineNumber == 1 && line.startsWith("\uFEFF")) {
line = line.substring(1);
}
try {
processLine(line);
} catch (RuntimeException e) {
throw new RuntimeException(
String.format("[%s:%s] %s (%s)", path, lineNumber, e.getMessage(), line),
e);
}
}
void processLine(String line) {
line = maybeTrimEndOfLineComment(line);
if (line.isEmpty()) {
return;
}
LineMatch match = LineType.match(line, inBlockComment);
checkState(match.getType().isValidTransitionFrom(lastType),
"invalid state transition: %s --//-> %s", lastType, match.getType());
boolean isEndOfWrappedValue = false;
switch (match.getType()) {
case COMMENT:
if (name != null) {
// Comments in data are ignored since they cannot be properly associated with
// paths or values in an IcuData instance (only legacy tooling emits these).
break;
}
if (line.startsWith("/*")) {
inBlockComment = true;
}
headerComment.add(match.get(0));
if (inBlockComment && line.contains("*/")) {
checkState(line.indexOf("*/") == line.length() - 2,
"unexpected end of comment block");
inBlockComment = false;
}
break;
case INLINE_VALUE:
icuData.put(
getPathFromStack().extendBy(getSegment(match.get(0))),
RbValue.of(unquote(match.get(1))));
break;
case GROUP_START:
checkState(currentValue.isEmpty());
if (name == null) {
name = match.get(0);
checkState(name != null, "cannot have anonymous top-level group");
} else {
pathStack.push(getSegment(match.get(0)));
}
wrappedValue = "";
isLineContinuation = false;
break;
case QUOTED_VALUE:
wrappedValue += unquote(match.get(0));
isLineContinuation = !line.endsWith(",");
if (!isLineContinuation) {
currentValue.add(wrappedValue);
wrappedValue = "";
}
break;
case VALUE:
checkState(!isLineContinuation, "unexpected unquoted value");
currentValue.add(match.get(0));
break;
case GROUP_END:
// Account for quoted values without trailing ',' just before group end.
if (isLineContinuation) {
currentValue.add(wrappedValue);
isLineContinuation = false;
}
// Emit the collection sequence of values for the current path as an RbValue.
if (!currentValue.isEmpty()) {
icuData.put(getPathFromStack(), RbValue.of(currentValue));
currentValue.clear();
}
// Annoyingly the name is outside the stack so the stack will empty before the last
// end group.
if (!pathStack.isEmpty()) {
pathStack.pop();
indices.setCount(pathStack.size(), 0);
} else {
checkState(!inFinalGroup, "unexpected group end");
inFinalGroup = true;
}
break;
case UNKNOWN:
throw new IllegalStateException("cannot parse line: " + match.get(0));
}
lastType = match.getType();
}
private RbPath getPathFromStack() {
if (pathStack.isEmpty()) {
return RbPath.empty();
}
List<String> segments = new ArrayList<>();
Iterables.addAll(segments, pathStack);
if (segments.get(0).matches("<[0-9]{4}>")) {
segments.remove(0);
}
return segments.isEmpty() ? RbPath.empty() : RbPath.of(Lists.reverse(segments));
}
private String getSegment(String segmentOrNull) {
if (segmentOrNull != null) {
return segmentOrNull;
}
int depth = pathStack.size();
int index = indices.count(depth);
indices.add(depth, 1);
return String.format("<%04d>", index);
}
private String maybeTrimEndOfLineComment(String line) {
// Once the name is set, we are past the header and into the data.
if (name != null) {
// Index to search for '//' from - must skip quoted values.
int startIdx = line.startsWith("\"") ? line.indexOf('"', 1) + 1 : 0;
int commentIdx = line.indexOf("//", startIdx);
if (commentIdx != -1) {
line = whitespace().trimTrailingFrom(line.substring(0, commentIdx));
}
}
return line;
}
private static String unquote(String s) {
if (s.startsWith("\"") && s.endsWith("\"")) {
return s.substring(1, s.length() - 1).replaceAll("\\\\([\"\\\\])", "$1");
}
checkState(!s.contains("\""), "invalid unquoted value: %s", s);
return s;
}
private static final class LineMatch {
private final LineType type;
private final Function<Integer, String> args;
LineMatch(LineType type, Function<Integer, String> args) {
this.type = checkNotNull(type);
this.args = checkNotNull(args);
}
String get(int n) {
return args.apply(n);
}
LineType getType() {
return type;
}
}
private enum LineType {
// Comment _start_ with any comment value captured.
COMMENT("(?://|/\\*)\\s*(.*)"),
// A combination of GROUP_START, VALUE and GROUP_END with whitespace.
INLINE_VALUE("(?:(.*\\S)\\s*)?\\{\\s*((?:\".*\")|(?:[^\"{}]*\\S))\\s*\\}"),
// Allows for empty segment names (anonymous arrays) which match 'null'.
GROUP_START("(?:(.*\\S)\\s*)?\\{"),
GROUP_END("\\}"),
QUOTED_VALUE("(\".*\"),?"),
VALUE("([^\"{}]+),?"),
UNKNOWN(".*");
// Table of allowed transitions expected during parsing.
// key=current state, values=set of permitted previous states
private static ImmutableSetMultimap<LineType, LineType> TRANSITIONS =
ImmutableSetMultimap.<LineType, LineType>builder()
.putAll(COMMENT, COMMENT)
.putAll(INLINE_VALUE, COMMENT, INLINE_VALUE, GROUP_START, GROUP_END)
.putAll(GROUP_START, COMMENT, GROUP_START, GROUP_END, INLINE_VALUE)
.putAll(VALUE, GROUP_START, VALUE, QUOTED_VALUE)
.putAll(QUOTED_VALUE, GROUP_START, VALUE, QUOTED_VALUE)
.putAll(GROUP_END, GROUP_END, INLINE_VALUE, VALUE, QUOTED_VALUE)
.build();
private final Pattern pattern;
LineType(String regex) {
this.pattern = Pattern.compile(regex);
}
boolean isValidTransitionFrom(LineType lastType) {
return TRANSITIONS.get(this).contains(lastType);
}
static LineMatch match(String line, boolean inBlockComment) {
// Block comments kinda suck and it'd be great if the ICU data only used '//' style
// comments (if would definitely simplify any parsers out there). Once the
// transition to the new transformation tools is complete, they can be changed to
// only emit '//' style comments.
if (inBlockComment) {
if (line.startsWith("*")) {
line = whitespace().trimLeadingFrom(line.substring(1));
}
return new LineMatch(COMMENT, ImmutableList.of(line)::get);
}
for (LineType type : TRANSITIONS.keySet()) {
// Regex groups start at 1, but we want the getter function to be zero-indexed.
Matcher m = type.pattern.matcher(line);
if (m.matches()) {
return new LineMatch(type, n -> {
checkElementIndex(n, m.groupCount());
return m.group(n + 1);
});
}
}
return new LineMatch(UNKNOWN, ImmutableList.of(line)::get);
}
}
}
}

View file

@ -0,0 +1,209 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static java.lang.Integer.parseInt;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.base.Ascii;
import com.google.common.base.CharMatcher;
import com.google.common.collect.ImmutableMap;
import org.unicode.icu.tool.cldrtoicu.regex.NamedFunction;
/**
 * The named functions used by the {@code RegexTransformer} for {@code ldml2icu_supplemental.txt}.
 */
final class IcuFunctions {
    /**
     * Converts an ISO date string to a space-separated pair of integer values representing the top
     * and bottom parts of a deconstructed millisecond epoch value (i.e. {@code
     * "<hi32bits> <low32bits>"}).
     *
     * <p>Note that the values are formatted as <em>signed</em> decimal values, so it's entirely
     * possible that the low bits value will appear as a negative number (the high bits won't
     * appear negative for many thousands of years).
     *
     * <ul>
     *   <li>args[0] = ISO date string (e.g. "2019-05-23")
     *   <li>args[1] = Date field type name (e.g. "from")
     * </ul>
     */
    static final NamedFunction DATE_FN =
        NamedFunction.create("date", 2, args -> {
            long millis =
                DateFieldType.toEnum(args.get(1)).toEpochMillis(LocalDate.parse(args.get(0)));
            // Strictly speaking the masking is redundant and could be removed.
            int hiBits = (int) ((millis >>> 32) & 0xFFFFFFFFL);
            int loBits = (int) (millis & 0xFFFFFFFFL);
            return hiBits + " " + loBits;
        });

    // TODO(dbeaumont): Improve this documentation (e.g. why is this being done, give examples?).
    /**
     * Inserts '%' into numberingSystems descriptions. The '%' is placed immediately after the
     * last '/' in the description, or at the start of it if there is no '/'.
     *
     * <ul>
     *   <li>args[0] = numbering system description (string)
     * </ul>
     */
    static final NamedFunction ALGORITHM_FN =
        NamedFunction.create("algorithm", 1, args -> {
            String value = args.get(0);
            int percentPos = value.lastIndexOf('/') + 1;
            return value.substring(0, percentPos) + '%' + value.substring(percentPos);
        });

    /**
     * Converts a number into a special integer that represents the number in normalized scientific
     * notation for ICU's RB parser.
     *
     * <p>Resultant integers are in the form "xxyyyyyy", where "xx" is the exponent offset by 50
     * and "yyyyyy" is the coefficient to 5 decimal places. Results may also have a leading '-' to
     * denote negative values. A value of zero is returned as just "0".
     *
     * <p>For example:
     * <pre>{@code
     * 14660000000000 -> 1.466E13    -> 63146600
     * 0.0001         -> 1E-4        -> 46100000
     * -123.456       -> -1.23456E2  -> -52123456
     * }</pre>
     *
     * <p>The additional exponent offset is applied directly to the calculated exponent and is used
     * to do things like converting percentages into their decimal representation (i.e. by passing
     * a value of "-2").
     *
     * <p>An {@code IllegalArgumentException} is thrown if the number is not finite, or if the
     * final offset exponent is outside the range 0 to 99.
     *
     * <ul>
     *   <li>args[0] = number to be converted (double)
     *   <li>args[1] = additional exponent offset (integer)
     * </ul>
     */
    static final NamedFunction EXP_FN =
        NamedFunction.create("exp", 2, args -> {
            double value = Double.parseDouble(args.get(0));
            // An infinite value would never leave the normalization loop below, and NaN has no
            // meaningful representation.
            checkArgument(!Double.isNaN(value) && !Double.isInfinite(value),
                "number must be finite: %s", args.get(0));
            if (value == 0) {
                return "0";
            }
            int exponent = 50;
            if (args.size() == 2) {
                exponent += Integer.parseInt(args.get(1));
            }
            String sign = value >= 0 ? "" : "-";
            value = Math.abs(value);
            // Normalize so that 1 <= value < 10.
            while (value >= 10) {
                value /= 10;
                exponent++;
            }
            while (value < 1) {
                value *= 10;
                exponent--;
            }
            long coefficient = Math.round(value * 100000);
            if (coefficient >= 1000000) {
                // Rounding carried into a seventh digit (e.g. 9.999999 rounds to 10.00000), so
                // renormalize to keep the coefficient at exactly six digits.
                coefficient /= 10;
                exponent++;
            }
            if (exponent < 0 || exponent > 99) {
                throw new IllegalArgumentException("Exponent out of bounds: " + exponent);
            }
            return sign + exponent + coefficient;
        });

    // Allow for single digit values in any part and negative year values.
    private static final Pattern YMD = Pattern.compile("(-?[0-9]+)-([0-9]{1,2})-([0-9]{1,2})");

    /**
     * Converts an ISO date string (i.e. "YYYY-MM-DD") into an ICU date string, which is
     * the same but with spaces instead of hyphens. Since functions are expanded before the
     * resulting value is split, this function will result in 3 separate values being created,
     * unless the function call is enclosed in quotes.
     *
     * <p>Note that for some cases (e.g. "eras") the year part can be negative (e.g. "-2165-1-1")
     * so this is not as simple as "split by hyphen".
     *
     * <ul>
     *   <li>args[0] = ISO date string (e.g. "2019-05-23" or "-2165-1-1")
     * </ul>
     */
    static final NamedFunction YMD_FN =
        NamedFunction.create("ymd", 1, args -> {
            Matcher m = YMD.matcher(args.get(0));
            checkArgument(m.matches(), "invalid year-month-day string: %s", args.get(0));
            // NOTE: Re-parsing is not optional since it removes leading zeros (needed for ICU).
            return String.format("%s %s %s",
                parseInt(m.group(1)), parseInt(m.group(2)), parseInt(m.group(3)));
        });

    // For transforming day-of-week identifiers. The CLDR identifier for Tuesday is "tue"; the
    // "tues" spelling is retained only so that anything relying on it continues to work.
    private static final ImmutableMap<String, String> WEEKDAY_MAP_ID =
        ImmutableMap.<String, String>builder()
            .put("sun", "1")
            .put("mon", "2")
            .put("tue", "3")
            .put("tues", "3")
            .put("wed", "4")
            .put("thu", "5")
            .put("fri", "6")
            .put("sat", "7")
            .build();

    /**
     * Converts a day-of-week identifier into its ordinal value (e.g. "sun" --> 1, "mon" --> 2 ...).
     * An {@code IllegalArgumentException} is thrown for an unknown identifier.
     *
     * <ul>
     *   <li>args[0] = day-of-week identifier (e.g. "sun")
     * </ul>
     */
    static final NamedFunction DAY_NUMBER_FN =
        NamedFunction.create("day_number", 1,
            args -> {
                String id = WEEKDAY_MAP_ID.get(args.get(0));
                checkArgument(id != null, "unknown weekday: %s", args.get(0));
                return id;
            });

    // For transform IDs in <contextTransform> elements.
    private static final ImmutableMap<String, String> TRANSFORM_ID_MAP =
        ImmutableMap.of("no-change", "0", "titlecase-firstword", "1");

    /**
     * Converts the transform type in the {@code <contextTransform>} element into its ICU index
     * (e.g. "titlecase-firstword" --> 1). An {@code IllegalArgumentException} is thrown for an
     * unknown transform type.
     *
     * <ul>
     *   <li>args[0] = transform type (e.g. "no-change")
     * </ul>
     */
    static final NamedFunction CONTEXT_TRANSFORM_INDEX_FN =
        NamedFunction.create("context_transform_index", 1,
            args -> {
                String id = TRANSFORM_ID_MAP.get(args.get(0));
                checkArgument(id != null, "unknown contextTransform: %s", args.get(0));
                return id;
            });

    // For DATE_FN only. Determines which instant within the given day is used: the start of the
    // day for "from" and the last millisecond of the day for "to".
    private enum DateFieldType {
        from(LocalDate::atStartOfDay),
        // Remember that atTime() takes nanoseconds, not micro or milli.
        to(d -> d.atTime(23, 59, 59, 999_000_000));

        private final Function<LocalDate, LocalDateTime> adjustFn;

        DateFieldType(Function<LocalDate, LocalDateTime> adjustFn) {
            this.adjustFn = adjustFn;
        }

        /** Returns the adjusted time of the given date, taken as UTC, in epoch milliseconds. */
        long toEpochMillis(LocalDate date) {
            return adjustFn.apply(date).toInstant(ZoneOffset.UTC).toEpochMilli();
        }

        /**
         * Returns the type for a field name, ignoring ASCII case and surrounding whitespace.
         * Both "from" and "start" map to {@code from}, and both "to" and "end" map to {@code to}.
         *
         * @throws IllegalArgumentException if the name is not one of the recognized values.
         */
        static DateFieldType toEnum(String value) {
            switch (Ascii.toLowerCase(CharMatcher.whitespace().trimFrom(value))) {
            case "from":
            case "start":
                return from;
            case "to":
            case "end":
                return to;
            default:
                throw new IllegalArgumentException(value + " is not a valid date field type");
            }
        }
    }

    private IcuFunctions() {}
}

View file

@ -0,0 +1,313 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.base.Preconditions.checkNotNull;
import static java.util.stream.Collectors.joining;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Writes an IcuData object to a text file. A lot of this class was copied directly from the
 * original {@code IcuTextWriter} in the CLDR project and has a number of very idiosyncratic
 * behaviours. The behaviour of this class is currently tuned to produce perfect parity with
 * the original conversion tools, but once migration of the tools is complete, it should
 * probably be revisited and tidied up.
 */
// TODO: Link to a definitive specification for the ICU data files and remove the hacks!
final class IcuTextWriter {
    private static final String INDENT = " ";
    // List of characters to escape in UnicodeSets
    // ('\' followed by any of '\', '[', ']', '{', '}', '-', '&', ':', '^', '=').
    private static final Pattern UNICODESET_ESCAPE =
        Pattern.compile("\\\\[\\\\\\[\\]\\{\\}\\-&:^=]");
    // Only escape \ and " from other strings.
    private static final Pattern STRING_ESCAPE = Pattern.compile("(?!')\\\\\\\\(?!')");
    private static final Pattern QUOTE_ESCAPE = Pattern.compile("\\\\?\"");

    /**
     * Write a file in ICU data format with the specified header. The file is created in the given
     * directory (which is created if necessary) and is named after the ICU data with a ".txt"
     * extension.
     *
     * @throws RuntimeException if the directory or file cannot be written.
     */
    static void writeToFile(IcuData icuData, Path outDir, List<String> header) {
        try {
            Files.createDirectories(outDir);
            try (Writer w = Files.newBufferedWriter(outDir.resolve(icuData.getName() + ".txt"));
                PrintWriter out = new PrintWriter(w)) {
                new IcuTextWriter(icuData).writeTo(out, header);
            }
        } catch (IOException e) {
            throw new RuntimeException("cannot write ICU data file: " + icuData.getName(), e);
        }
    }

    private final IcuData icuData;
    // Number of currently open '{' blocks (which is also the current indentation level).
    private int depth = 0;
    // True if the last value was written on the same line as the '{' that opened its path.
    private boolean valueWasInline = false;

    IcuTextWriter(IcuData icuData) {
        this.icuData = checkNotNull(icuData);
    }

    /**
     * Writes the BOM, header, file comments and all paths and values of the ICU data, and then
     * closes the writer.
     *
     * @throws IOException if the writer reported an error while the data was being written.
     */
    // TODO: Write a UTF-8 header (see https://unicode-org.atlassian.net/browse/ICU-10197).
    private void writeTo(PrintWriter out, List<String> header) throws IOException {
        out.write('\uFEFF');
        writeHeaderAndComments(out, header, icuData.getFileComment());

        // Write the ICU data to file. This takes the form:
        // ----
        // <name>{
        //     foo{
        //         bar{baz}
        //     }
        // }
        // ----
        // So it's like every RbPath has an implicit prefix of the IcuData name.
        String root = icuData.getName();
        if (!icuData.hasFallback()) {
            root += ":table(nofallback)";
        }
        // TODO: Replace with "open(root, out)" once happy with differences (it adds a blank line).
        out.print(root);
        out.print("{");
        depth++;

        RbPath lastPath = RbPath.empty();
        for (RbPath path : icuData.getPaths()) {
            // Close any blocks up to the common path length. Since paths are all distinct, the
            // common length should always be shorter than either path. We add 1 since we must also
            // account for the implicit root segment.
            int commonDepth = RbPath.getCommonPrefixLength(lastPath, path) + 1;
            // Before closing, the "cursor" is at the end of the last value written.
            closeLastPath(lastPath, commonDepth, out);
            // After opening the value will be ready for the next value to be written.
            openNextPath(path, out);
            valueWasInline = appendValues(icuData.getName(), path, icuData.get(path), out);
            lastPath = path;
        }
        closeLastPath(lastPath, 0, out);
        out.println();
        out.close();
        // A PrintWriter never throws on failure, it only records that an error occurred, so this
        // is the only way to avoid silently producing a truncated data file.
        if (out.checkError()) {
            throw new IOException("error while writing ICU data: " + icuData.getName());
        }
    }

    // Before: Cursor is at the end of the previous line.
    // After: Cursor is positioned immediately after the last closed '}'
    private void closeLastPath(RbPath lastPath, int minDepth, PrintWriter out) {
        if (valueWasInline) {
            // An inline value is closed on the same line as it was opened.
            depth--;
            out.print('}');
            valueWasInline = false;
        }
        while (depth > minDepth) {
            close(out);
        }
    }

    // Before: Cursor is at the end of the previous line.
    // After: Cursor is positioned immediately after the newly opened '{'
    private void openNextPath(RbPath path, PrintWriter out) {
        while (depth <= path.length()) {
            // The -1 is to adjust for the implicit root element which means indentation (depth)
            // no longer matches the index of the segment we are writing.
            open(path.getSegment(depth - 1), out);
        }
    }

    /** Starts a new indented line, writes the label (if it is not hidden) and opens a block. */
    private void open(String label, PrintWriter out) {
        newLineAndIndent(out);
        depth++;
        // This handles the "magic" pseudo indexing paths that are added by RegexTransformer.
        // These take the form of "<any-string>" and are used to ensure that path order can be
        // well defined even for anonymous lists of items. A label is only hidden if it both
        // starts with '<' and ends with '>'.
        if (!(label.startsWith("<") && label.endsWith(">"))) {
            out.print(label);
        }
        out.print('{');
    }

    /** Closes the innermost open block on a new line, indented to match where it was opened. */
    private void close(PrintWriter out) {
        depth--;
        newLineAndIndent(out);
        out.print('}');
    }

    private void newLineAndIndent(PrintWriter out) {
        out.println();
        for (int i = 0; i < depth; i++) {
            out.print(INDENT);
        }
    }

    // Currently the "header" uses '//' line comments but the comments are in a block.
    // TODO: Sort this out so there isn't a messy mix of comment styles in the data files.
    private static void writeHeaderAndComments(
        PrintWriter out, List<String> header, List<String> comments) {

        header.forEach(out::println);
        if (!comments.isEmpty()) {
            // TODO: Don't use /* */ block quotes, just use inline // quotes.
            out.println(
                comments.stream().collect(joining("\n * ", "/**\n * ", "\n */")));
        }
    }

    /**
     * Inserts padding and values between braces.
     *
     * @return true if a single element was written on the current line (i.e. without starting a
     *     new line), in which case the caller must close the block on the same line.
     */
    private boolean appendValues(
        String name, RbPath rbPath, List<RbValue> values, PrintWriter out) {

        RbValue onlyValue;
        boolean wasSingular = false;
        boolean quote = !rbPath.isIntPath();
        boolean isSequence = rbPath.endsWith(RB_SEQUENCE);
        if (values.size() == 1 && !mustBeArray(true, name, rbPath)) {
            onlyValue = values.get(0);
            if (onlyValue.size() == 1 && !mustBeArray(false, name, rbPath)) {
                // Value has a single element and is not being forced to be an array.
                String onlyElement = onlyValue.getElement(0);
                if (quote) {
                    onlyElement = quoteInside(onlyElement);
                }
                // The numbers below are simply tuned to match the line wrapping in the original
                // CLDR code. The behaviour it produces is sometimes strange (wrapping a line just
                // for a single character) and could definitely be improved.
                // TODO: Simplify this and add hysteresis to ensure less "jarring" line wrapping.
                int maxWidth = Math.max(68, 80 - Math.min(4, rbPath.length()) * INDENT.length());
                if (onlyElement.length() <= maxWidth) {
                    // Single element for path: don't add newlines.
                    printValue(out, onlyElement, quote);
                    wasSingular = true;
                } else {
                    // Element too long to fit in one line, so wrap.
                    // NOTE(review): this relies on goodBreak() always returning an index beyond
                    // 'i'; a run of maxWidth or more surrogates/backslashes looks like it would
                    // prevent that - confirm whether such values can occur.
                    int end;
                    for (int i = 0; i < onlyElement.length(); i = end) {
                        end = goodBreak(onlyElement, i + maxWidth);
                        String part = onlyElement.substring(i, end);
                        newLineAndIndent(out);
                        printValue(out, part, quote);
                    }
                }
            } else {
                // Only one array for the rbPath, so don't add an extra set of braces.
                printArray(onlyValue, quote, isSequence, out);
            }
        } else {
            for (RbValue value : values) {
                if (value.size() == 1) {
                    // Single-value array: print normally.
                    printArray(value, quote, isSequence, out);
                } else {
                    // Enclose this array in braces to separate it from other values.
                    open("", out);
                    printArray(value, quote, isSequence, out);
                    close(out);
                }
            }
        }
        return wasSingular;
    }

    private static final RbPath RB_SEQUENCE = RbPath.of("Sequence");
    private static final RbPath RB_RULES = RbPath.of("rules");
    private static final RbPath RB_LOCALE_SCRIPT = RbPath.of("LocaleScript");
    private static final RbPath RB_ERAS = RbPath.of("eras");
    private static final RbPath RB_NAMED = RbPath.of("named");
    private static final RbPath RB_CALENDAR_PREFERENCE_DATA = RbPath.of("calendarPreferenceData");
    private static final RbPath RB_METAZONE_INFO = RbPath.of("metazoneInfo");

    /**
     * Wrapper for a hack to determine if the given rb path should always present its values as an
     * array.
     *
     * @param topValues true to test whether the list of values for the path must be written as
     *     an array, false to test whether a value holding a single element must be.
     */
    // TODO: Verify this is still needed, and either make it less hacky, or delete it.
    private static boolean mustBeArray(boolean topValues, String name, RbPath rbPath) {
        if (topValues) {
            // matches "rules/setNN" (hence the mucking about with raw segments). The length is
            // checked first since a bare "rules" path has no second segment to inspect.
            return name.equals("pluralRanges")
                && rbPath.startsWith(RB_RULES)
                && rbPath.length() > 1
                && rbPath.getSegment(1).startsWith("set");
        }
        return rbPath.equals(RB_LOCALE_SCRIPT)
            || (rbPath.contains(RB_ERAS)
                && !rbPath.getSegment(rbPath.length() - 1).endsWith(":alias")
                && !rbPath.endsWith(RB_NAMED))
            || rbPath.startsWith(RB_CALENDAR_PREFERENCE_DATA)
            || rbPath.startsWith(RB_METAZONE_INFO);
    }

    /**
     * Writes each element of the value on its own indented line, followed by a ',' unless the
     * value is part of a sequence.
     */
    private void printArray(RbValue rbValue, boolean quote, boolean isSequence, PrintWriter out) {
        for (int n = 0; n < rbValue.size(); n++) {
            newLineAndIndent(out);
            printValue(out, quoteInside(rbValue.getElement(n)), quote);
            if (!isSequence) {
                out.print(",");
            }
        }
    }

    /** Writes the value, enclosed in double-quotes if requested. */
    private static void printValue(PrintWriter out, String value, boolean quote) {
        if (quote) {
            out.append('"').append(value).append('"');
        } else {
            out.append(value);
        }
    }

    // Can a string be broken here? If not, backup until we can.
    // TODO: Either don't bother line wrapping or look at making this use a line-break iterator.
    private static int goodBreak(String quoted, int end) {
        if (end > quoted.length()) {
            return quoted.length();
        }
        // Don't break escaped Unicode characters.
        // Need to handle both e.g. \u4E00 and \U00020000
        // Scans back over at most 9 characters of hex digits looking for the start of an escape
        // and, if one is found, breaks immediately before its backslash.
        for (int i = end - 1; i >= 0 && i > end - 10;) {
            char current = quoted.charAt(i--);
            if (!isAsciiHexDigit(current)) {
                if ((current == 'u' || current == 'U') && i >= 0 && i > end - 10
                    && quoted.charAt(i) == '\\') {
                    return i;
                }
                break;
            }
        }
        // Otherwise back up so that the break does not follow a backslash or a surrogate.
        while (end > 0) {
            char ch = quoted.charAt(end - 1);
            if (ch != '\\' && (ch < '\uD800' || ch > '\uDFFF')) {
                break;
            }
            --end;
        }
        return end;
    }

    // Returns whether the character is one of 0-9, A-F or a-f.
    private static boolean isAsciiHexDigit(char c) {
        return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
    }

    // Fix characters inside strings.
    private static String quoteInside(String item) {
        // Unicode-escape all quotes.
        item = QUOTE_ESCAPE.matcher(item).replaceAll("\\\\u0022");
        // Double up on backslashes, ignoring Unicode-escaped characters.
        // Items enclosed in '[' and ']' are escaped using the rules for UnicodeSets.
        Pattern pattern =
            item.startsWith("[") && item.endsWith("]") ? UNICODESET_ESCAPE : STRING_ESCAPE;
        Matcher matcher = pattern.matcher(item);

        if (!matcher.find()) {
            return item;
        }
        StringBuilder buffer = new StringBuilder();
        int start = 0;
        do {
            buffer.append(item, start, matcher.start());
            int punctuationChar = item.codePointAt(matcher.end() - 1);
            // Every match gains one extra backslash, or two if it ends with a backslash.
            buffer.append("\\");
            if (punctuationChar == '\\') {
                buffer.append('\\');
            }
            buffer.append(matcher.group());
            start = matcher.end();
        } while (matcher.find());
        buffer.append(item.substring(start));
        return buffer.toString();
    }
}

View file

@ -0,0 +1,618 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.BRKITR;
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.COLL;
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.CURR;
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.LANG;
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.LOCALES;
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.RBNF;
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.REGION;
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.UNIT;
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.ZONE;
import static java.util.stream.Collectors.toList;
import static org.unicode.cldr.api.CldrDataType.BCP47;
import static org.unicode.cldr.api.CldrDataType.LDML;
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.TreeSet;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.unicode.cldr.api.CldrData;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataType;
import com.google.common.base.CharMatcher;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.LinkedListMultimap;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.SetMultimap;
import com.google.common.collect.Sets;
import com.google.common.io.CharStreams;
import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir;
import org.unicode.icu.tool.cldrtoicu.mapper.Bcp47Mapper;
import org.unicode.icu.tool.cldrtoicu.mapper.BreakIteratorMapper;
import org.unicode.icu.tool.cldrtoicu.mapper.CollationMapper;
import org.unicode.icu.tool.cldrtoicu.mapper.DayPeriodsMapper;
import org.unicode.icu.tool.cldrtoicu.mapper.LocaleMapper;
import org.unicode.icu.tool.cldrtoicu.mapper.PluralRangesMapper;
import org.unicode.icu.tool.cldrtoicu.mapper.PluralsMapper;
import org.unicode.icu.tool.cldrtoicu.mapper.RbnfMapper;
import org.unicode.icu.tool.cldrtoicu.mapper.SupplementalMapper;
import org.unicode.icu.tool.cldrtoicu.mapper.TransformsMapper;
import org.unicode.icu.tool.cldrtoicu.regex.RegexTransformer;
/**
* The main converter tool for CLDR to ICU data. To run this tool, you need to supply a suitable
* {@link LdmlConverterConfig} instance. There is a simple {@code main()} method available in this
* class which can be invoked passing just the desired output directory and which relies on the
* presence of several system properties for the remainder of its parameters:
* <ul>
* <li>CLDR_DIR: The root of the CLDR release from which CLDR data is read.
* <li>ICU_DIR: The root of the ICU release from which additional "specials" XML data is read.
* <li>CLDR_DTD_CACHE: A temporary directory with the various DTDs cached (this is a legacy
* requirement from the underlying CLDR libraries and might go away one day).
* </ul>
*/
public final class LdmlConverter {
// TODO: Do all supplemental data in one go and split similarly to locale data (using RbPath).
// Each matcher below is built by supplementalMatcher(), which prefixes every given name with
// "supplementalData/" and, when several names are given, combines the results with anyOf().
private static final PathMatcher GENDER_LIST_PATHS =
supplementalMatcher("gender");
private static final PathMatcher LIKELY_SUBTAGS_PATHS =
supplementalMatcher("likelySubtags");
private static final PathMatcher METAZONE_PATHS =
supplementalMatcher("metaZones", "primaryZones");
private static final PathMatcher METADATA_PATHS =
supplementalMatcher("metadata");
// The general supplemental data, i.e. elements not covered by one of the other matchers here.
private static final PathMatcher SUPPLEMENTAL_DATA_PATHS =
supplementalMatcher(
"calendarData",
"calendarPreferenceData",
"codeMappings",
"codeMappingsCurrency",
"idValidity",
"languageData",
"languageMatching",
"measurementData",
"parentLocales",
"subdivisionContainment",
"territoryContainment",
"territoryInfo",
"timeData",
"unitPreferenceData",
"weekData",
"weekOfPreference");
private static final PathMatcher CURRENCY_DATA_PATHS =
supplementalMatcher("currencyData");
private static final PathMatcher NUMBERING_SYSTEMS_PATHS =
supplementalMatcher("numberingSystems");
private static final PathMatcher WINDOWS_ZONES_PATHS =
supplementalMatcher("windowsZones");
// Special IDs which are not supported via CLDR, but for which synthetic data is injected.
// The "TRADITIONAL" variants are here because their calendar differs from the non-variant
// locale. However CLDR cannot represent this currently because calendar defaults are in
// supplemental data (rather than locale data) and are keyed only on territory.
private static final ImmutableSet<String> PHANTOM_LOCALE_IDS =
ImmutableSet.of("ja_JP_TRADITIONAL", "th_TH_TRADITIONAL");
// Special alias mapping which exists in ICU even though "no_NO_NY" is simply not a
// structurally valid locale ID. This is injected manually when creating the alias map.
// This does mean that nobody can ever parse the _keys_ of the alias map, but so far there
// has been no need for that.
// TODO: Get "ars" into CLDR and remove this hack.
private static final Map<String, String> PHANTOM_ALIASES =
ImmutableMap.of("ars", "ar_SA", "no_NO_NY", "nn_NO");
/**
 * Builds a matcher for one or more elements directly below the {@code supplementalData} root.
 *
 * @param spec one or more path specs, each relative to {@code "supplementalData/"}.
 * @return a matcher for the single spec, or a composite matcher accepting any of the specs.
 * @throws IllegalArgumentException if no specs are given.
 */
private static PathMatcher supplementalMatcher(String... spec) {
    checkArgument(spec.length > 0, "must supply at least one matcher spec");
    PathMatcher[] matchers = new PathMatcher[spec.length];
    for (int i = 0; i < spec.length; i++) {
        matchers[i] = PathMatcher.of("supplementalData/" + spec[i]);
    }
    // A lone matcher is returned directly rather than being wrapped in a composite.
    return matchers.length == 1 ? matchers[0] : PathMatcher.anyOf(matchers);
}
// Resource bundle path under which an explicit parent locale is recorded.
private static final RbPath RB_PARENT = RbPath.of("%%Parent");
// The quotes below are only so we achieve parity with the manually written alias files.
// TODO: Remove unnecessary quotes once the migration to this code is complete.
private static final RbPath RB_ALIAS = RbPath.of("\"%%ALIAS\"");
// Special path for adding to empty files which only exist to complete the parent chain.
// TODO: Confirm that this has no meaningful effect and unify "empty" file contents.
private static final RbPath RB_EMPTY_ALIAS = RbPath.of("___");
/**
 * Provisional entry point until better config support exists.
 *
 * @param args the command line arguments; the first must be the output directory (any
 *     further arguments are ignored).
 * @throws IllegalArgumentException if no output directory is given.
 */
public static void main(String... args) {
    // Fail with a readable message rather than an ArrayIndexOutOfBoundsException.
    checkArgument(args.length > 0, "usage: LdmlConverter <output-directory>");
    convert(IcuConverterConfig.builder()
        .setOutputDir(Paths.get(args[0]))
        .setEmitReport(true)
        .build());
}
/**
* Output types defining specific subsets of the ICU data which can be converted separately.
* This closely mimics the original "NewLdml2IcuConverter" behaviour but could be simplified to
* hide what are essentially implementation specific data splits.
*
* <p>Each type carries the CLDR data type it is derived from and the conversion function to
* run against an {@code LdmlConverter} instance. For the supplemental types, the string passed
* to the conversion function is the output directory name, relative to the configured output
* directory.
*/
public enum OutputType {
LOCALES(LDML, LdmlConverter::processLocales),
BRKITR(LDML, LdmlConverter::processBrkitr),
COLL(LDML, LdmlConverter::processCollation),
RBNF(LDML, LdmlConverter::processRbnf),
DAY_PERIODS(
SUPPLEMENTAL,
c -> c.processDayPeriods("misc")),
GENDER_LIST(
SUPPLEMENTAL,
c -> c.processSupplemental("genderList", GENDER_LIST_PATHS, "misc", false)),
LIKELY_SUBTAGS(
SUPPLEMENTAL,
c -> c.processSupplemental("likelySubtags", LIKELY_SUBTAGS_PATHS, "misc", false)),
SUPPLEMENTAL_DATA(
SUPPLEMENTAL,
c -> c.processSupplemental("supplementalData", SUPPLEMENTAL_DATA_PATHS, "misc", true)),
// Note: uses the same "supplementalData" label as SUPPLEMENTAL_DATA, but a different directory.
CURRENCY_DATA(
SUPPLEMENTAL,
c -> c.processSupplemental("supplementalData", CURRENCY_DATA_PATHS, "curr", true)),
METADATA(
SUPPLEMENTAL,
c -> c.processSupplemental("metadata", METADATA_PATHS, "misc", false)),
META_ZONES(
SUPPLEMENTAL,
c -> c.processSupplemental("metaZones", METAZONE_PATHS, "misc", false)),
NUMBERING_SYSTEMS(
SUPPLEMENTAL,
c -> c.processSupplemental("numberingSystems", NUMBERING_SYSTEMS_PATHS, "misc", false)),
PLURALS(
SUPPLEMENTAL,
c -> c.processPlurals("misc")),
PLURAL_RANGES(
SUPPLEMENTAL,
c -> c.processPluralRanges("misc")),
WINDOWS_ZONES(
SUPPLEMENTAL,
c -> c.processSupplemental("windowsZones", WINDOWS_ZONES_PATHS, "misc", false)),
TRANSFORMS(
SUPPLEMENTAL,
c -> c.processTransforms("translit")),
KEY_TYPE_DATA(
BCP47,
c -> c.processKeyTypeData("misc")),
// Batching by type. Each "DTD_" type runs every non-"DTD_" type sharing its CLDR data type
// (see processAll()).
DTD_LDML(LDML, c -> c.processAll(LDML)),
DTD_SUPPLEMENTAL(SUPPLEMENTAL, c -> c.processAll(SUPPLEMENTAL)),
DTD_BCP47(BCP47, c -> c.processAll(BCP47));
/** The three batch types, which between them run every individual output type. */
public static final ImmutableSet<OutputType> ALL =
ImmutableSet.of(DTD_BCP47, DTD_SUPPLEMENTAL, DTD_LDML);
// The CLDR data type from which this output is derived (used to group conversions).
private final CldrDataType type;
// The conversion to run for this output type.
private final Consumer<LdmlConverter> converterFn;
OutputType(CldrDataType type, Consumer<LdmlConverter> converterFn) {
this.type = checkNotNull(type);
this.converterFn = checkNotNull(converterFn);
}
/** Runs the conversion for this output type using the given converter. */
void convert(LdmlConverter converter) {
converterFn.accept(converter);
}
/** Returns the CLDR data type from which this output type is derived. */
CldrDataType getCldrType() {
return type;
}
}
/**
 * Runs a complete conversion: creates a CLDR data supplier for the configured CLDR directory
 * and minimum draft status, then converts every output type requested by the configuration.
 */
private static void convert(LdmlConverterConfig config) {
    CldrDataSupplier allCldrFiles = CldrDataSupplier.forCldrFilesIn(config.getCldrDirectory());
    CldrDataSupplier filtered =
        allCldrFiles.withDraftStatusAtLeast(config.getMinimumDraftStatus());
    LdmlConverter converter = new LdmlConverter(config, filtered);
    converter.convertAll(config);
}
// The configuration controlling conversion behaviour.
private final LdmlConverterConfig config;
// The supplier for all data to be converted.
private final CldrDataSupplier src;
// The set of available (non-alias) locale IDs: "root", the target LOCALES IDs for which the
// supplier has data, and the phantom locale IDs (see the constructor).
// TODO: Make available IDs include specials files (or fail if specials are not available).
private final ImmutableSet<String> availableIds;
// Supplemental data available to mappers if needed.
private final SupplementalData supplementalData;
// Transformer for locale data (rules read from the "/ldml2icu_locale.txt" resource).
private final PathValueTransformer localeTransformer;
// Transformer for supplemental data (rules read from the "/ldml2icu_supplemental.txt" resource).
private final PathValueTransformer supplementalTransformer;
// Header lines to go into every ICU data file (read from the "/ldml2icu_header.txt" resource).
private final ImmutableList<String> icuFileHeader;
/**
 * Creates a converter for the given configuration and data source, loading the supplemental
 * data and the transformation rules and file header from class resources.
 */
private LdmlConverter(LdmlConverterConfig config, CldrDataSupplier src) {
    this.config = checkNotNull(config);
    this.src = checkNotNull(src);
    this.supplementalData = SupplementalData.create(src.getDataForType(SUPPLEMENTAL));

    // The set of non-alias locale IDs to be processed: "root" first, then every target ID
    // for which the supplier has data, then the phantom IDs (duplicates are dropped and
    // insertion order is kept).
    Set<String> supportedTargetIds =
        Sets.intersection(src.getAvailableLocaleIds(), config.getTargetLocaleIds(LOCALES));
    this.availableIds = ImmutableSet.<String>builder()
        .add("root")
        .addAll(supportedTargetIds)
        .addAll(PHANTOM_LOCALE_IDS)
        .build();

    // Load the remaining path value transformers.
    List<String> supplementalRules = readLinesFromResource("/ldml2icu_supplemental.txt");
    this.supplementalTransformer =
        RegexTransformer.fromConfigLines(
            supplementalRules,
            IcuFunctions.ALGORITHM_FN,
            IcuFunctions.DATE_FN,
            IcuFunctions.DAY_NUMBER_FN,
            IcuFunctions.EXP_FN,
            IcuFunctions.YMD_FN);
    List<String> localeRules = readLinesFromResource("/ldml2icu_locale.txt");
    this.localeTransformer =
        RegexTransformer.fromConfigLines(localeRules, IcuFunctions.CONTEXT_TRANSFORM_INDEX_FN);
    this.icuFileHeader = ImmutableList.copyOf(readLinesFromResource("/ldml2icu_header.txt"));
}
/**
 * Converts every output type requested by the configuration, grouped by CLDR data type (in
 * the order each data type is first encountered), and optionally prints a debug report.
 */
private void convertAll(LdmlConverterConfig config) {
    ListMultimap<CldrDataType, OutputType> typesByDataType = LinkedListMultimap.create();
    config.getOutputTypes().forEach(t -> typesByDataType.put(t.getCldrType(), t));

    for (CldrDataType dataType : typesByDataType.keySet()) {
        typesByDataType.get(dataType).forEach(t -> t.convert(this));
    }

    if (!config.emitReport()) {
        return;
    }
    System.out.println("Supplemental Data Transformer=" + supplementalTransformer);
    System.out.println("Locale Data Transformer=" + localeTransformer);
}
/**
 * Reads all lines of a class resource, decoding it as UTF-8.
 *
 * @param name the resource name, resolved relative to this class (e.g. "/ldml2icu_header.txt").
 * @return the lines of the resource, in order.
 * @throws NullPointerException if the resource does not exist.
 * @throws RuntimeException if the resource cannot be read.
 */
private static List<String> readLinesFromResource(String name) {
    try (InputStream in = LdmlConverter.class.getResourceAsStream(name)) {
        // getResourceAsStream() returns null for a missing resource; fail with a useful message
        // rather than an anonymous NullPointerException from the reader.
        checkNotNull(in, "missing resource: %s", name);
        // Decode explicitly as UTF-8 so the result does not depend on the platform default
        // charset (fully qualified to avoid needing a new import).
        return CharStreams.readLines(
            new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8));
    } catch (IOException e) {
        throw new RuntimeException("cannot read resource: " + name, e);
    }
}
/** Returns the transformer applied to locale data. */
private PathValueTransformer getLocaleTransformer() {
    return this.localeTransformer;
}
/** Returns the transformer applied to supplemental data. */
private PathValueTransformer getSupplementalTransformer() {
    return this.supplementalTransformer;
}
/**
 * Runs every individual (non-batch) output type derived from the given CLDR data type, in
 * enum declaration order.
 */
private void processAll(CldrDataType cldrType) {
    List<OutputType> selected = new ArrayList<>();
    for (OutputType candidate : OutputType.values()) {
        // The "DTD_" types are the batch types which call this method, so never recurse.
        boolean isBatchType = candidate.name().startsWith("DTD_");
        if (candidate.getCldrType().equals(cldrType) && !isBatchType) {
            selected.add(candidate);
        }
    }
    selected.forEach(t -> t.convert(this));
}
/**
 * Loads the ICU "specials" data for a locale from every regular file named
 * {@code <localeId>.xml} found anywhere below the configured specials directory.
 *
 * @return the specials data, or empty if no matching file exists.
 * @throws RuntimeException if the specials directory cannot be walked.
 */
private Optional<CldrData> loadSpecialsData(String localeId) {
    String fileName = localeId + ".xml";
    try (Stream<Path> tree = Files.walk(config.getSpecialsDir())) {
        Set<Path> matches = tree
            .filter(f -> Files.isRegularFile(f) && f.getFileName().toString().equals(fileName))
            .collect(Collectors.toSet());
        if (matches.isEmpty()) {
            return Optional.empty();
        }
        return Optional.of(
            CldrDataSupplier.forCldrFiles(LDML, config.getMinimumDraftStatus(), matches));
    } catch (IOException e) {
        throw new RuntimeException(
            "error processing specials directory: " + config.getSpecialsDir(), e);
    }
}
/** Converts general locale data, splitting it across the locale-derived output directories. */
private void processLocales() {
    // TODO: Pre-load specials files to avoid repeatedly re-loading them.
    Function<String, IcuData> localeMapper =
        id -> LocaleMapper.process(
            id, src, loadSpecialsData(id), getLocaleTransformer(), supplementalData);
    processAndSplitLocaleFiles(localeMapper, CURR, LANG, LOCALES, REGION, UNIT, ZONE);
}
/** Converts break-iterator data into the BRKITR output directory. */
private void processBrkitr() {
    Function<String, IcuData> brkitrMapper =
        id -> BreakIteratorMapper.process(id, src, loadSpecialsData(id));
    processAndSplitLocaleFiles(brkitrMapper, BRKITR);
}
/** Converts collation data into the COLL output directory. */
private void processCollation() {
    Function<String, IcuData> collationMapper =
        id -> CollationMapper.process(id, src, loadSpecialsData(id));
    processAndSplitLocaleFiles(collationMapper, COLL);
}
/** Converts rule-based number format data into the RBNF output directory. */
private void processRbnf() {
    Function<String, IcuData> rbnfMapper =
        id -> RbnfMapper.process(id, src, loadSpecialsData(id));
    processAndSplitLocaleFiles(rbnfMapper, RBNF);
}
/**
* Generates ICU data for each non-alias target locale and writes it, split by top-level
* resource bundle name, into the given output directories. It then writes the alias files
* and any empty files needed to complete the locale parent chain in each directory.
*
* @param icuFn the function producing the complete ICU data for a locale ID.
* @param splitDirs the output directories handled by this call; paths whose top-level name is
*     not in {@code LOCALE_SPLIT_INFO} are assigned to {@code LOCALES}.
*/
private void processAndSplitLocaleFiles(
Function<String, IcuData> icuFn, IcuLocaleDir... splitDirs) {
// Records, per directory, every locale ID for which a file has been written.
SetMultimap<IcuLocaleDir, String> writtenLocaleIds = HashMultimap.create();
Path baseDir = config.getOutputDir();
for (String id : config.getTargetLocaleIds(LOCALES)) {
// Skip "target" IDs that are aliases (they are handled later).
if (!availableIds.contains(id)) {
continue;
}
IcuData icuData = icuFn.apply(id);
// Group the paths of this locale by the directory their top-level element belongs to.
ListMultimap<IcuLocaleDir, RbPath> splitPaths = LinkedListMultimap.create();
for (RbPath p : icuData.getPaths()) {
String rootName = getBaseSegmentName(p.getSegment(0));
splitPaths.put(LOCALE_SPLIT_INFO.getOrDefault(rootName, LOCALES), p);
}
// We always write base languages (even if empty).
boolean isBaseLanguage = !id.contains("_");
// Run through all directories (not just the keySet() of the split path map) since we
// sometimes write empty files.
for (IcuLocaleDir dir : splitDirs) {
Set<String> targetIds = config.getTargetLocaleIds(dir);
if (!targetIds.contains(id)) {
// Data exists for a locale which is not a target of this directory: report it
// (the data is not written).
if (!splitPaths.get(dir).isEmpty()) {
System.out.format(
"target IDs for %s does not contain %s, but it has data: %s\n",
dir, id, splitPaths.get(dir));
}
continue;
}
Path outDir = baseDir.resolve(dir.getOutputDir());
IcuData splitData = new IcuData(icuData.getName(), icuData.hasFallback());
// The split data can still be empty for this directory, but that's expected.
splitPaths.get(dir).forEach(p -> splitData.add(p, icuData.get(p)));
// Adding a parent locale makes the data non-empty and forces it to be written.
supplementalData.getExplicitParentLocaleOf(splitData.getName())
.ifPresent(p -> splitData.add(RB_PARENT, p));
if (!splitData.isEmpty() || isBaseLanguage || dir.includeEmpty()) {
splitData.setVersion(CldrDataSupplier.getCldrVersionString());
write(splitData, outDir);
writtenLocaleIds.put(dir, id);
}
}
}
// Second pass: write the alias files and then the empty "gap" files for each directory.
for (IcuLocaleDir dir : splitDirs) {
Path outDir = baseDir.resolve(dir.getOutputDir());
Set<String> targetIds = config.getTargetLocaleIds(dir);
Map<String, String> aliasMap = getAliasMap(targetIds, dir);
aliasMap.forEach((s, t) -> {
// It's only important to record which alias files are written because of forced
// aliases, but since it's harmless otherwise, we just do it unconditionally.
// Normal alias files don't affect the empty file calculation, but forced ones can.
writtenLocaleIds.put(dir, s);
writeAliasFile(s, t, outDir);
});
calculateEmptyFiles(writtenLocaleIds.get(dir), aliasMap.values())
.forEach(id -> writeEmptyFile(id, outDir, aliasMap.values()));
}
}
/**
 * Calculates the alias map (alias ID to target ID) for the given target locale IDs of a
 * directory.
 *
 * <p>There are four reasons for treating a locale ID as an alias:
 * <ol>
 * <li>It contains deprecated subtags (e.g. "sr_YU", which should be "sr_Cyrl_RS").
 * <li>It has no CLDR data but is missing a script subtag.
 * <li>It is one of the special "phantom" aliases which cannot be represented normally and
 *     must be manually mapped (e.g. legacy locale IDs which don't even parse).
 * <li>It is a "super special" forced alias, which might replace existing aliases in some
 *     output directories.
 * </ol>
 *
 * @throws IllegalArgumentException if a phantom alias is also an available locale, or if an
 *     unsupported locale ID cannot be maximized to a different ID.
 */
private Map<String, String> getAliasMap(Set<String> localeIds, IcuLocaleDir dir) {
    Map<String, String> aliases = new LinkedHashMap<>();
    for (String localeId : localeIds) {
        if (PHANTOM_ALIASES.containsKey(localeId)) {
            checkArgument(!availableIds.contains(localeId),
                "phantom aliases should never be otherwise supported: %s\n"
                    + "(maybe the phantom alias can now be removed?)", localeId);
            aliases.put(localeId, PHANTOM_ALIASES.get(localeId));
        } else {
            String canonicalId = supplementalData.replaceDeprecatedTags(localeId);
            if (!canonicalId.equals(localeId)) {
                // If the canonical form of an ID differs from the requested ID, then this is
                // an alias, and just needs to point to the canonical ID.
                aliases.put(localeId, canonicalId);
            } else if (!availableIds.contains(localeId)) {
                // The ID is canonical but not supported, so maximize it and alias to that.
                // (A canonical and supported ID is not an alias and is simply skipped.)
                String maximizedId = supplementalData.maximize(localeId)
                    .orElseThrow(
                        () -> new IllegalArgumentException("unsupported locale ID: " + localeId));
                // We can't alias to ourselves and we shouldn't be here if the ID was already
                // maximal.
                checkArgument(
                    !maximizedId.equals(localeId), "unsupported maximized locale ID: %s", localeId);
                aliases.put(localeId, maximizedId);
            }
        }
    }
    // Important that we overwrite entries which might already exist here, since we might have
    // already calculated a "natural" alias for something that we want to force (and we should
    // replace the existing target, since that affects how we determine empty files later).
    aliases.putAll(config.getForcedAliases(dir));
    return aliases;
}
// The characters which introduce a variant ('%') or a type annotation (':') in a path element.
private static final CharMatcher PATH_MODIFIER = CharMatcher.anyOf(":%");

/**
 * Returns the plain name of a resource bundle path element. Elements can have variants (e.g.
 * "Currencies%narrow") or type annotations (e.g. "languages:intvector"), which are stripped
 * when considering the element name.
 */
private static String getBaseSegmentName(String segment) {
    int modifierStart = PATH_MODIFIER.indexIn(segment);
    if (modifierStart < 0) {
        return segment;
    }
    return segment.substring(0, modifierStart);
}
/** Converts day period data and writes it to the given directory (relative to the output). */
private void processDayPeriods(String dir) {
    IcuData dayPeriods = DayPeriodsMapper.process(src);
    write(dayPeriods, dir);
}
/** Converts plural rule data and writes it to the given directory (relative to the output). */
private void processPlurals(String dir) {
    IcuData plurals = PluralsMapper.process(src);
    write(plurals, dir);
}
/** Converts plural range data and writes it to the given directory (relative to the output). */
private void processPluralRanges(String dir) {
    IcuData pluralRanges = PluralRangesMapper.process(src);
    write(pluralRanges, dir);
}
/** Converts the BCP47 data, writing each resulting ICU data file to the given directory. */
private void processKeyTypeData(String dir) {
    Bcp47Mapper.process(src).forEach(keyTypeData -> {
        write(keyTypeData, dir);
    });
}
/**
 * Converts transliteration data. The output directory is created first because it is also
 * passed to the mapper, and the resulting ICU data is then written into that directory.
 */
private void processTransforms(String dir) {
    Path outputRoot = config.getOutputDir();
    Path transformDir = createDirectory(outputRoot.resolve(dir));
    IcuData transforms = TransformsMapper.process(src, transformDir);
    write(transforms, transformDir);
}
// Path of the synthetic CLDR version value added to some supplemental data files.
private static final RbPath RB_CLDR_VERSION = RbPath.of("cldrVersion");

/**
 * Converts the supplemental data selected by the given matcher and writes it to the given
 * directory (relative to the output directory).
 *
 * @param label the label passed to the supplemental mapper for the generated data.
 * @param paths the matcher selecting the CLDR paths to convert.
 * @param dir the output directory name.
 * @param addCldrVersion whether to add the CLDR version string to the data.
 */
private void processSupplemental(
    String label, PathMatcher paths, String dir, boolean addCldrVersion) {
    PathValueTransformer transformer = getSupplementalTransformer();
    IcuData supplemental = SupplementalMapper.process(src, transformer, label, paths);
    if (addCldrVersion) {
        // A hack for "supplementalData.txt" since the "cldrVersion" value doesn't come from
        // the supplemental data XML files.
        supplemental.add(RB_CLDR_VERSION, CldrDataSupplier.getCldrVersionString());
    }
    write(supplemental, dir);
}
/** Writes a file for {@code srcId} whose only content is an alias entry for {@code destId}. */
private void writeAliasFile(String srcId, String destId, Path dir) {
    IcuData aliasData = new IcuData(srcId, true);
    aliasData.add(RB_ALIAS, destId);
    write(aliasData, dir);
}
/**
 * Writes an otherwise empty file for a locale ID which must exist to complete the parent
 * chain. The content differs depending on whether the ID is itself the target of an alias.
 */
private void writeEmptyFile(String id, Path dir, Collection<String> aliasTargets) {
    IcuData emptyData = new IcuData(id, true);
    boolean isDirectAliasTarget = aliasTargets.contains(id);
    // TODO: Document the reason for this (i.e. why does it matter what goes into empty files?)
    if (!isDirectAliasTarget) {
        // These empty files only exist because the target of an alias has a parent locale
        // which is itself not in the set of written ICU files. An "indirect alias target".
        emptyData.setVersion(CldrDataSupplier.getCldrVersionString());
    } else {
        emptyData.setFileComment("generated alias target");
        emptyData.add(RB_EMPTY_ALIAS, "");
    }
    write(emptyData, dir);
}
/** Writes ICU data into the named directory, resolved against the configured output directory. */
private void write(IcuData icuData, String dir) {
    Path targetDir = config.getOutputDir().resolve(dir);
    write(icuData, targetDir);
}
/** Writes ICU data, with the standard file header, into the directory (created if needed). */
private void write(IcuData icuData, Path dir) {
    // createDirectory() returns the path it was given, once the directory exists.
    IcuTextWriter.writeToFile(icuData, createDirectory(dir), icuFileHeader);
}
/**
 * Ensures the given directory (and any missing parents) exists.
 *
 * @return the given path, unchanged, to allow call chaining.
 * @throws RuntimeException if the directory cannot be created.
 */
private Path createDirectory(Path dir) {
    try {
        // The value returned by createDirectories() is deliberately ignored so that callers
        // always get back exactly the path they passed in.
        Files.createDirectories(dir);
        return dir;
    } catch (IOException ioe) {
        throw new RuntimeException("cannot create directory: " + dir, ioe);
    }
}
/**
 * Calculates the locale IDs for which empty files must be generated.
 *
 * <p>The set of IDs to process is:
 * <ul>
 * <li>any file that was written
 * <li>any alias target (not written)
 * </ul>
 *
 * <p>From these we generate the complete "closure" under the "getParent()" function. This set
 * contains all files (written or not) which need to exist to complete the locale hierarchy.
 * Then we remove all the written files to just leave the ones that need to be generated.
 * This is a simple and robust approach that handles things like "gaps" in non-aliased locale
 * IDs, where an intermediate parent is not present.
 *
 * @return the sorted set of IDs which need a file but do not yet have one ("root" is never
 *     included).
 */
private ImmutableSet<String> calculateEmptyFiles(
    Set<String> writtenIds, Collection<String> aliasTargetIds) {
    Set<String> seeds = new HashSet<>(writtenIds);
    seeds.addAll(aliasTargetIds);
    // Be nice and sort the output (makes easier debugging).
    Set<String> closure = new TreeSet<>();
    for (String seed : seeds) {
        // Walk up the parent chain until "root" or an already visited ID is reached.
        for (String id = seed;
            !id.equals("root") && !closure.contains(id);
            id = supplementalData.getParent(id)) {
            closure.add(id);
        }
    }
    return ImmutableSet.copyOf(Sets.difference(closure, writtenIds));
}
// Maps the base name of a top-level resource bundle path element (with any variant or type
// annotation removed, see getBaseSegmentName()) to the output directory in which its data is
// written. Names which are not listed here are written to the LOCALES directory (see
// processAndSplitLocaleFiles()).
private static final ImmutableMap<String, IcuLocaleDir> LOCALE_SPLIT_INFO =
ImmutableMap.<String, IcuLocaleDir>builder()
// BRKITR
.put("boundaries", BRKITR)
.put("dictionaries", BRKITR)
.put("exceptions", BRKITR)
// COLL
.put("collations", COLL)
.put("depends", COLL)
.put("UCARules", COLL)
// CURR
.put("Currencies", CURR)
.put("CurrencyPlurals", CURR)
.put("CurrencyUnitPatterns", CURR)
.put("currencySpacing", CURR)
// LANG
.put("Keys", LANG)
.put("Languages", LANG)
.put("Scripts", LANG)
.put("Types", LANG)
.put("Variants", LANG)
.put("characterLabelPattern", LANG)
.put("codePatterns", LANG)
.put("localeDisplayPattern", LANG)
// RBNF
.put("RBNFRules", RBNF)
// REGION
.put("Countries", REGION)
// UNIT
.put("durationUnits", UNIT)
.put("units", UNIT)
.put("unitsShort", UNIT)
.put("unitsNarrow", UNIT)
// ZONE
.put("zoneStrings", ZONE)
.build();
}

View file

@ -0,0 +1,106 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import java.nio.file.Path;
import java.util.Map;
import java.util.Set;
import org.unicode.cldr.api.CldrDraftStatus;
import com.google.common.base.Ascii;
import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
/** API for configuring the LDML converter. */
public interface LdmlConverterConfig {
/** Output directories for ICU locale data (this is not used for supplemental data). */
enum IcuLocaleDir {
/** Data for the break-iterator library. */
BRKITR(true),
/** Data for the collations library. */
COLL(true),
/** Currency data. */
CURR(false),
/** Language data. */
LANG(false),
/** General locale data. */
LOCALES(true),
/** Rule-based number formatter data. */
RBNF(true),
/** Region data. */
REGION(false),
/** Measurement and units data. */
UNIT(false),
/** Timezone data. */
ZONE(false);
// The relative output directory name is the lower-cased name of the enum constant.
private final String dirName = Ascii.toLowerCase(name());
private final boolean includeEmpty;
IcuLocaleDir(boolean includeEmpty) {
this.includeEmpty = includeEmpty;
}
/** Returns the relative output directory name. */
String getOutputDir() {
return dirName;
}
/**
* Whether the directory is expected to contain empty data files (used to advertise
* the supported set of locales for the "service" provided by the data in that
* directory).
*/
// TODO: Document why directories differ in whether empty data files are included.
boolean includeEmpty() {
return includeEmpty;
}
}
/**
* Returns the set of output types to be converted. Use {@link OutputType#ALL} to convert
* everything.
*/
Set<OutputType> getOutputTypes();
/** Returns the root directory in which the CLDR release is located. */
Path getCldrDirectory();
/**
* Returns an additional "specials" directory containing additional ICU specific XML
* files. This is where the converter finds any XML files using the "icu:" namespace.
*/
Path getSpecialsDir();
/**
* Returns the root of the ICU output directory hierarchy into which ICU data files are
* written.
*/
Path getOutputDir();
/** Returns the minimal draft status for CLDR data to be converted. */
CldrDraftStatus getMinimumDraftStatus();
/**
* Returns the set of locale IDs to be processed for the given directory.
*
* <p>This set can contain IDs which have no ICU data associated with them if they are
* suitable aliases (e.g. they are deprecated versions of locale IDs for which data does
* exist).
*/
Set<String> getTargetLocaleIds(IcuLocaleDir dir);
/**
* Returns a map of locale IDs which specifies aliases which are applied to the given
* directory in contradiction to the natural alias or parent ID which would otherwise
* be generated. This is a mechanism for restructuring the parent chain and linking
* locales together in non-standard and unexpected ways.
*/
Map<String, String> getForcedAliases(IcuLocaleDir dir);
/**
* Whether to emit a summary report for debug purposes after conversion is complete.
*/
boolean emitReport();
}

View file

@ -0,0 +1,259 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkPositionIndex;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static org.unicode.cldr.api.AttributeKey.keyOf;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrPath;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
/**
* An immutable matcher for {@link CldrPath} instances. A path matcher specification looks like
* {@code "foo/*[@x="z"]/bar[@y=*]"}, where element names and attribute values can be wildcards.
*
* <p>Note that the path fragment represented by the specification does not include either leading
* or trailing {@code '/'}. This is because matching can occur at any point in a {@code CldrPath}.
* The choice of where to match in the path is governed by the match method used (e.g.
* {@link PathMatcher#matchesSuffixOf(CldrPath)}).
*/
public abstract class PathMatcher {
/**
 * Parses the path specification into a matcher.
 *
 * @throws IllegalArgumentException if the specification cannot be parsed.
 */
public static PathMatcher of(String pathSpec) {
    // Supported so far: "a", "a/b", "a/b[@x=*]"
    List<Predicate<CldrPath>> elementMatchers = parse(pathSpec);
    return new BasicMatcher(elementMatchers);
}
/**
 * Combines the given matchers into a single composite matcher which tests all the given
 * matchers in order and matches if any one of them does. A single matcher is returned as is.
 *
 * @throws IllegalArgumentException if no matchers are given.
 */
public static PathMatcher anyOf(PathMatcher... matchers) {
    checkArgument(matchers.length > 0, "must supply at least one matcher");
    return matchers.length == 1
        ? checkNotNull(matchers[0])
        : new CompositeMatcher(ImmutableList.copyOf(matchers));
}
/**
* Attempts a full match against a given path (i.e. the specification must account for
* every element of the path).
*/
public abstract boolean matches(CldrPath path);
/**
* Attempts a suffix match against a given path (i.e. the specification is matched against
* the last elements of the path).
*/
public abstract boolean matchesSuffixOf(CldrPath path);
/**
* Attempts a prefix match against a given path (i.e. the specification is matched against
* the first elements of the path).
*/
public abstract boolean matchesPrefixOf(CldrPath path);
// A matcher that simply combines a sequence of other matchers in order; it matches whenever
// any one of its component matchers does.
private static final class CompositeMatcher extends PathMatcher {
    private final ImmutableList<PathMatcher> matchers;

    private CompositeMatcher(ImmutableList<PathMatcher> matchers) {
        checkArgument(matchers.size() > 1);
        this.matchers = checkNotNull(matchers);
    }

    @Override
    public boolean matches(CldrPath path) {
        return matchers.stream().anyMatch(m -> m.matches(path));
    }

    @Override
    public boolean matchesSuffixOf(CldrPath path) {
        return matchers.stream().anyMatch(m -> m.matchesSuffixOf(path));
    }

    @Override
    public boolean matchesPrefixOf(CldrPath path) {
        return matchers.stream().anyMatch(m -> m.matchesPrefixOf(path));
    }
}
// A matcher for a single path specification, holding one predicate per path element.
private static final class BasicMatcher extends PathMatcher {
    private final ImmutableList<Predicate<CldrPath>> elementMatchers;

    private BasicMatcher(List<Predicate<CldrPath>> elementMatchers) {
        this.elementMatchers = ImmutableList.copyOf(elementMatchers);
    }

    @Override
    public boolean matches(CldrPath path) {
        if (elementMatchers.size() != path.getLength()) {
            return false;
        }
        return matchRegion(path, 0);
    }

    @Override
    public boolean matchesSuffixOf(CldrPath path) {
        int start = path.getLength() - elementMatchers.size();
        if (start < 0) {
            return false;
        }
        return matchRegion(path, start);
    }

    @Override
    public boolean matchesPrefixOf(CldrPath path) {
        if (path.getLength() < elementMatchers.size()) {
            return false;
        }
        return matchRegion(path, 0);
    }

    // Matches all element matchers against the path elements starting at the given offset.
    private boolean matchRegion(CldrPath path, int offset) {
        // offset is the path element corresponding to the "top most" element matcher, it
        // must be in the range 0 ... (path.length() - elementMatchers.size()).
        checkPositionIndex(offset, path.getLength() - elementMatchers.size());
        // First jump over the path parents until we find the element for the last matcher.
        int matchPathLength = offset + elementMatchers.size();
        CldrPath lastMatched = path;
        while (lastMatched.getLength() > matchPathLength) {
            lastMatched = lastMatched.getParent();
        }
        return matchForward(lastMatched, elementMatchers.size() - 1);
    }

    // Recurses up to the first matched element so that the predicates are tested in path
    // order (top-most element first) as the recursion unwinds.
    private boolean matchForward(CldrPath path, int matcherIndex) {
        if (matcherIndex >= 0) {
            return matchForward(path.getParent(), matcherIndex - 1)
                && elementMatchers.get(matcherIndex).test(path);
        }
        return true;
    }
}
// Make a new, non-interned, unique instance here which we can test by reference to
// determine if the argument is to be captured (needed as ImmutableMap prohibits null).
// DO NOT change this code to assign "*" as the value directly, it MUST be a new instance.
private static final String WILDCARD = new String("*");
// Matches the start of an element: group 1 is either the '*' wildcard or an element name, which
// must be followed by '/', '[' (the start of an attribute) or the end of the input.
private static final Pattern ELEMENT_START_REGEX =
Pattern.compile("(\\*|[-:\\w]+)(?:/|\\[|$)");
// Matches one attribute of the form [@name=*] or [@name="value"]: group 1 is the attribute name
// and group 2 is the quoted value (null if the value was the '*' wildcard).
private static final Pattern ATTRIBUTE_REGEX =
Pattern.compile("\\[@([-:\\w]+)=(?:\\*|\"([^\"]*)\")\\]");
// Parses a complete path specification into one predicate per path element, where:
// element := foo, foo[@bar="baz"], foo[@bar=*]
// pathspec := element{/element}*
private static List<Predicate<CldrPath>> parse(String pathSpec) {
    List<Predicate<CldrPath>> elementMatchers = new ArrayList<>();
    // At least one element is always parsed; a negative index means the end was reached.
    int next = parse(pathSpec, 0, elementMatchers);
    while (next >= 0) {
        next = parse(pathSpec, next, elementMatchers);
    }
    return elementMatchers;
}
// Parses the single element starting at the given position, adding its predicate to the given
// list. Returns the start index of the next element, or -1 if the end of the specification
// was reached. Throws IllegalArgumentException if the specification is malformed.
private static int parse(String pathSpec, int pos, List<Predicate<CldrPath>> specs) {
    Matcher m = ELEMENT_START_REGEX.matcher(pathSpec).region(pos, pathSpec.length());
    checkArgument(m.lookingAt(), "invalid path specification (index=%s): %s", pos, pathSpec);
    String name = m.group(1);
    Map<String, String> attributes = ImmutableMap.of();
    // Continue from the end of the name (not the end of the match, which includes the
    // terminating character).
    pos = m.end(1);
    if (pos < pathSpec.length() && pathSpec.charAt(pos) == '[') {
        // We have attributes to add.
        attributes = new LinkedHashMap<>();
        do {
            m = ATTRIBUTE_REGEX.matcher(pathSpec).region(pos, pathSpec.length());
            checkArgument(m.lookingAt(),
                "invalid path specification (index=%s): %s", pos, pathSpec);
            // Null if we matched the '*' wildcard.
            String value = m.group(2);
            attributes.put(m.group(1), value != null ? value : WILDCARD);
            pos = m.end();
        } while (pos < pathSpec.length() && pathSpec.charAt(pos) == '[');
    }
    // Wildcard matching is less efficient because attribute keys cannot be made in advance, so
    // since it's also very rare, we special case it.
    Predicate<CldrPath> matcher = name.equals(WILDCARD)
        ? new WildcardElementMatcher(attributes)::match
        : new ElementMatcher(name, attributes)::match;
    specs.add(matcher);
    if (pos == pathSpec.length()) {
        return -1;
    }
    // Anything other than a separator here (e.g. "a[@x=*]b") means the caller supplied a bad
    // specification, so report it as an argument error like the other parse failures.
    checkArgument(pathSpec.charAt(pos) == '/',
        "invalid path specification (index=%s): %s", pos, pathSpec);
    return pos + 1;
}
// Matcher for path elements like "foo[@bar=*]" where the name is known in advance.
private static final class ElementMatcher {
private final String name;
private final ImmutableMap<AttributeKey, String> attributes;
private ElementMatcher(String name, Map<String, String> attributes) {
this.name = checkNotNull(name);
this.attributes = attributes.entrySet().stream()
.collect(toImmutableMap(e -> keyOf(name, e.getKey()), Entry::getValue));
}
boolean match(CldrPath path) {
if (!path.getName().equals(name)) {
return false;
}
for (Entry<AttributeKey, String> e : attributes.entrySet()) {
String actual = path.get(e.getKey());
if (actual == null) {
return false;
}
String expected = e.getValue();
// DO NOT change this to use expected.equals(WILDCARD).
if (expected != WILDCARD && !expected.equals(actual)) {
return false;
}
}
return true;
}
}
// Matcher for path elements like "*[@bar=*]", where the name isn't known until match time.
private static final class WildcardElementMatcher {
    private final ImmutableMap<String, String> attributes;

    private WildcardElementMatcher(Map<String, String> attributes) {
        this.attributes = ImmutableMap.copyOf(attributes);
    }

    private boolean match(CldrPath path) {
        // The wildcard matcher never fails due to the element name but must create new key
        // instances every time matching occurs (because the key name is dynamic). Since this
        // is rare, it's worth making into a separate case.
        return attributes.entrySet().stream()
            .allMatch(a ->
                valueMatches(a.getValue(), path.get(keyOf(path.getName(), a.getKey()))));
    }

    // Tests an actual attribute value (null if absent) against the expected value.
    private static boolean valueMatches(String expected, String actual) {
        if (actual == null) {
            return false;
        }
        // DO NOT change this to use expected.equals(WILDCARD); the wildcard is detected by
        // reference so that a literal "*" value is still compared normally.
        return expected == WILDCARD || expected.equals(actual);
    }
}
}

View file

@ -0,0 +1,130 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.base.Preconditions.checkNotNull;
import java.util.function.Function;
import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.CldrValue;
import com.google.common.collect.ImmutableList;
/**
* API for transforming CLDR path/value pairs. Transformed results support grouping by their key
* and the ability to generate default "fallback" values to account for missing values in a group.
*
* <p>To transform some set of CLDR path/values:
* <ol>
* <li>Transform all desired path/value pairs into a set of matched results, discarding duplicates
* (see {@link #transform(CldrValue)}).
* <li>Group the results by key (e.g. into a {@code ListMultimap}).
* <li>For each group, add any fallback values which don't yet exist for that key (see
* {@link #getFallbackResultsFor(RbPath, DynamicVars)} and {@link Result#isFallbackFor(Result)}).
* <li>Sort elements within each group and flatten result values (see {@link Result#isGrouped()}).
* </ol>
*
* <p>For each unique key, this should yield a correctly ordered sequence of values (according to
* the semantics of the chosen transformer implementation).
*/
public abstract class PathValueTransformer {
/**
* A result either obtained by transforming a path/value pair, or as a potential fallback for
* some known key (see {@link PathValueTransformer#transform(CldrValue)} and
* {@link PathValueTransformer#getFallbackResultsFor(RbPath, DynamicVars)}).
*/
public static abstract class Result implements Comparable<Result> {
private final RbPath key;
/**
* Creates a result for the given key.
*
* @param key the (non-null) key of this result.
*/
protected Result(RbPath key) {
this.key = checkNotNull(key);
}
/**
* Returns the key of this result, used to group results and determine fallback values
* according to the semantics of the chosen transformer.
*/
public RbPath getKey() {
return key;
}
/**
* Returns whether the values in this result should be grouped or not. Un-grouped values
* should be considered as individual values in a sequence and might be joined with values
* from other results in the same group. Grouped values cannot be split and must appear
* as a single value.
*
* <p>For example for the ordered results:
* <pre>
* Result X = { key=K, values=[ "a", "b" ], grouped=false }
* Result Y = { key=K, values=[ "c", "d" ], grouped=false }
* Result Z = { key=K, values=[ "e" ], grouped=false }
* </pre>
* the values for key {@code K} are conceptually {@code [ "a", "b", "c", "d", "e" ]}.
*
* <p>However if result {@code Y} has {@code grouped=true} then there are now 4 values
* {@code [ "a", "b", ["c", "d"], "e" ]}, and if {@code X} is also grouped, then it is
* {@code [ ["a", "b"], ["c", "d"], "e" ]}, producing only 3 top-level values.
*/
public abstract boolean isGrouped();
/**
* Returns the transformed values of this result, which may or may not be grouped
* according to {@link #isGrouped()}.
*/
public abstract ImmutableList<String> getValues();
/**
* Returns whether this result is a fallback for the given existing matched result. A
* fallback result should only be used when it is not a fallback for any existing result.
*/
public abstract boolean isFallbackFor(Result r);
/** Debug only string representation. */
@Override
public final String toString() {
return String.format(
"Result{ key='%s', grouped=%s, values=%s }",
getKey(), isGrouped(), getValues());
}
}
/**
* A "typedef" for the function to do late binding of dynamic variables. This is used for edge
* cases where a %N variable in the rules config is bound to a CLDR path (e.g. "//foo/bar")
* which cannot be resolved until the rule is evaluated. Unfortunately the need to support late
* binding of variables incurs significant additional complexity in the code, despite being
* used in exactly one situation so far (the '%D' variable to represent the default numbering
* scheme.
*/
// TODO: Figure out how to get rid of all of this mess.
public interface DynamicVars extends Function<CldrPath, String> {}
/**
* Transforms a CLDR value into a sequence of results (empty if the value was not matched by
* any rule).
*
* @param cldrValue the value to transform.
* @return the transformed result(s).
*/
public abstract ImmutableList<Result> transform(CldrValue cldrValue);
/**
* Transforms a CLDR value into a sequence of results (empty if the value was not matched by
* any rule). The dynamic variable function provides any "late bound" CLDR path variables to be
* resolved from CLDR data during processing (e.g "%D=//ldml/numbers/defaultNumberingSystem").
*
* @param cldrValue the value to transform.
* @param varFn a function for resolving "late bound" variables.
* @return the transformed result(s).
*/
public abstract ImmutableList<Result> transform(CldrValue cldrValue, DynamicVars varFn);
/**
* Returns a possibly empty sequence of fallback results for a given key. A fallback result for
* a key should be used only if it is not a fallback for any other result with that key; see
* also {@link Result#isFallbackFor(Result)}.
*/
public abstract ImmutableList<Result> getFallbackResultsFor(RbPath key, DynamicVars varFn);
}

View file

@ -0,0 +1,232 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.base.CharMatcher.whitespace;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.ImmutableList.toImmutableList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Objects;
import java.util.function.Function;
import com.google.common.base.CharMatcher;
import com.google.common.base.Splitter;
import com.google.common.collect.Comparators;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
/**
* A resource bundle path, used to identify entries in ICU data.
*
* <p>Immutable and thread safe.
*/
public final class RbPath implements Comparable<RbPath> {
    private static final Splitter PATH_SPLITTER = Splitter.on('/').trimResults();

    // This defines ordering of paths in IcuData instances and thus the order in ICU data files.
    // If there's ever a reason to have a different "natural" order for paths, this Comparator
    // should be moved into the ICU file writer class(es).
    private static final Comparator<RbPath> ORDERING =
        Comparator.comparing(
            p -> p.segments,
            Comparators.lexicographical(Comparator.<String>naturalOrder()));

    // Matches the definition of invariant characters in "uinvchar.cpp". We can make this all much
    // faster if needed with a custom matcher (it's just a 128 way bit lookup via 2 longs).
    private static final CharMatcher INVARIANT_CHARS =
        CharMatcher.ascii().and(CharMatcher.anyOf("!#$@[\\]^`{|}~").negate());

    // Note that we must also prohibit double-quote from appearing anywhere other than surrounding
    // segment values. This is because some segment values can contain special ICU data characters
    // (e.g. ':') but must be treated as literals. There is no proper "escaping" mechanism in ICU
    // data for key values (since '\' is not an invariant, things like \\uxxxx are not possible).
    //
    // Ideally quoting would be done when the file is written, but that would require additional
    // complexity in RbPath, since suffixes like ":intvector" must not be quoted and must somehow
    // be distinguished from timezone "metazone" names which also contain ':'.
    private static final CharMatcher QUOTED_SEGMENT_CHARS =
        INVARIANT_CHARS
            .and(CharMatcher.javaIsoControl().negate())
            .and(CharMatcher.isNot('"'));
    private static final CharMatcher UNQUOTED_SEGMENT_CHARS =
        QUOTED_SEGMENT_CHARS.and(whitespace().negate());

    // Characters allowed in path segments which separate the "base name" from any suffix (e.g.
    // the base name of "Foo:intvector" is "Foo").
    private static final CharMatcher SEGMENT_SEPARATORS = CharMatcher.anyOf("%:");

    private static final RbPath EMPTY = new RbPath(ImmutableList.of());

    /** Returns the shared empty path (a path with no segments). */
    public static RbPath empty() {
        return EMPTY;
    }

    /**
     * Returns a path of the given segments.
     *
     * @throws IllegalArgumentException if any segment is empty or contains invalid characters.
     */
    public static RbPath of(String... segments) {
        return of(Arrays.asList(segments));
    }

    /**
     * Returns a path of the given segments.
     *
     * @throws IllegalArgumentException if any segment is empty or contains invalid characters.
     */
    public static RbPath of(Iterable<String> segments) {
        return new RbPath(segments);
    }

    /**
     * Parses a '/'-separated path string into a path. A single leading '/' is permitted and
     * ignored, and whitespace around each segment is trimmed.
     *
     * @param path the non-empty path string to parse.
     * @throws IllegalArgumentException if the string is empty, or any resulting segment is empty
     *     or contains invalid characters.
     */
    public static RbPath parse(String path) {
        checkArgument(!path.isEmpty(), "cannot parse an empty path string");
        // Allow leading '/', but don't allow empty segments anywhere else.
        if (path.startsWith("/")) {
            path = path.substring(1);
        }
        return new RbPath(PATH_SPLITTER.split(path));
    }

    /** Returns the number of leading segments which are equal in both paths. */
    static int getCommonPrefixLength(RbPath lhs, RbPath rhs) {
        int maxLength = Math.min(lhs.length(), rhs.length());
        int n = 0;
        while (n < maxLength && lhs.getSegment(n).equals(rhs.getSegment(n))) {
            n++;
        }
        return n;
    }

    private final ImmutableList<String> segments;
    // Cached since instances are immutable.
    private final int hashCode;

    private RbPath(Iterable<String> segments) {
        this.segments = ImmutableList.copyOf(segments);
        this.hashCode = Objects.hash(this.segments);
        for (String segment : this.segments) {
            checkArgument(!segment.isEmpty(),
                "empty path segments not permitted: %s", this.segments);
            // Either the label is quoted (e.g. "foo") or it is bare (e.g. foo) but it can only
            // contain double quotes at either end, or not at all. If the string is quoted, only
            // validate the content, and not the quotes themselves.
            switch (segment.charAt(0)) {
            case '<':
                // Allow anything in hidden labels, since they will be removed later and never
                // appear in the final ICU data.
                checkArgument(segment.endsWith(">"),
                    "mismatched quoting for hidden label: %s", segment);
                break;

            case '"':
                // The length check rejects a lone '"' (which both starts and ends with a quote)
                // before attempting to extract the quoted content below.
                checkArgument(segment.length() >= 2 && segment.endsWith("\""),
                    "mismatched quoting for segment: %s", segment);
                checkArgument(
                    QUOTED_SEGMENT_CHARS.matchesAllOf(
                        segment.substring(1, segment.length() - 1)),
                    "invalid character in quoted resource bundle path segment: %s", segment);
                break;

            default:
                checkArgument(
                    UNQUOTED_SEGMENT_CHARS.matchesAllOf(segment),
                    "invalid character in unquoted resource bundle path segment: %s", segment);
                break;
            }
        }
    }

    /** Returns the number of segments in this path (zero for the empty path). */
    public int length() {
        return segments.size();
    }

    /** Returns the Nth (zero-indexed) segment of this path. */
    public String getSegment(int n) {
        return segments.get(n);
    }

    /**
     * Returns the path formed by removing the last segment of this path.
     *
     * @throws IllegalStateException if this is the empty path.
     */
    public RbPath getParent() {
        checkState(length() > 0, "cannot get parent of the empty path");
        return length() > 1 ? new RbPath(segments.subList(0, length() - 1)) : EMPTY;
    }

    /** Returns whether the last segment of this path is a hidden label (starts with '<'). */
    public boolean isAnonymous() {
        return length() > 0 && segments.get(length() - 1).charAt(0) == '<';
    }

    /** Returns a new path consisting of this path followed by the given segments. */
    public RbPath extendBy(String... parts) {
        return new RbPath(Iterables.concat(segments, Arrays.asList(parts)));
    }

    /** Returns a new path consisting of this path followed by the segments of the given path. */
    public RbPath extendBy(RbPath suffix) {
        return new RbPath(Iterables.concat(segments, suffix.segments));
    }

    /**
     * Returns a new path in which each segment is replaced by the result of applying the given
     * function to it. The resulting segments are validated like any other path.
     */
    public RbPath mapSegments(Function<? super String, String> fn) {
        return new RbPath(segments.stream().map(fn).collect(toImmutableList()));
    }

    /**
     * Returns whether the first element of this path is prefixed by the given "base name".
     *
     * <p>Resource bundle paths relating to semantically similar data are typically grouped by the
     * same first path element. This is not as simple as just comparing the first element, as in
     * {@code path.startsWith(prefix)} however, since path elements can have suffixes, such as
     * {@code "Foo:alias"} or {@code "Foo%subtype"}.
     *
     * @param baseName the base name to test for.
     * @return true if the "base name" of the first path element is the given prefix.
     * @throws IllegalArgumentException if the base name is empty or contains a separator.
     */
    public boolean hasPrefix(String baseName) {
        checkArgument(!baseName.isEmpty() && SEGMENT_SEPARATORS.matchesNoneOf(baseName));
        if (length() == 0) {
            return false;
        }
        String firstElement = getSegment(0);
        // Slightly subtle (but safe) access to the separator character, since:
        // (!a.equals(b) && a.startsWith(b)) ==> a.length() > b.length().
        return firstElement.equals(baseName)
            || (firstElement.startsWith(baseName)
                && SEGMENT_SEPARATORS.matches(firstElement.charAt(baseName.length())));
    }

    /** Returns whether the leading segments of this path equal the segments of the prefix. */
    public boolean startsWith(RbPath prefix) {
        return prefix.length() <= length() && matchesSublist(prefix, 0);
    }

    /** Returns whether the trailing segments of this path equal the segments of the suffix. */
    public boolean endsWith(RbPath suffix) {
        return suffix.length() <= length() && matchesSublist(suffix, length() - suffix.length());
    }

    /** Returns whether the given path occurs as a contiguous run of segments in this path. */
    public boolean contains(RbPath path) {
        int maxOffset = length() - path.length();
        for (int i = 0; i <= maxOffset; i++) {
            if (matchesSublist(path, i)) {
                return true;
            }
        }
        return false;
    }

    // Assume length check has been done.
    private boolean matchesSublist(RbPath path, int offset) {
        for (int i = 0; i < path.length(); i++) {
            if (!path.getSegment(i).equals(getSegment(i + offset))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns whether the last segment of this path ends with ":int" or ":intvector". The empty
     * path has no last segment, so it is never an "int" path.
     */
    boolean isIntPath() {
        if (segments.isEmpty()) {
            return false;
        }
        String lastElement = segments.get(segments.size() - 1);
        return lastElement.endsWith(":int") || lastElement.endsWith(":intvector");
    }

    @Override public int compareTo(RbPath other) {
        return ORDERING.compare(this, other);
    }

    @Override public boolean equals(Object other) {
        return (other instanceof RbPath) && segments.equals(((RbPath) other).segments);
    }

    @Override public int hashCode() {
        return hashCode;
    }

    /** Returns the segments of this path joined by '/' (with no leading '/'). */
    @Override public String toString() {
        return String.join("/", segments);
    }
}

View file

@ -0,0 +1,58 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.base.Preconditions.checkArgument;
import java.util.Arrays;
import java.util.Objects;
import java.util.function.Function;
import com.google.common.collect.ImmutableList;
/**
* A resource bundle value containing a sequence of elements. This is a very thin wrapper over an
* immutable list, with a few additional constraints (e.g. cannot be empty).
*
* <p>Immutable and thread safe.
*/
public final class RbValue {
    // The non-empty, immutable sequence of elements making up this value.
    private final ImmutableList<String> elements;

    /** Creates a resource bundle value from the given elements (varargs form). */
    public static RbValue of(String... elements) {
        return new RbValue(Arrays.asList(elements));
    }

    /** Creates a resource bundle value from the given elements. */
    public static RbValue of(Iterable<String> elements) {
        return new RbValue(elements);
    }

    // Takes an immutable copy of the elements, then rejects empty values.
    private RbValue(Iterable<String> elements) {
        ImmutableList<String> copy = ImmutableList.copyOf(elements);
        this.elements = copy;
        checkArgument(!copy.isEmpty(), "Resource bundle values cannot be empty");
    }

    /** Returns how many elements this value holds (always at least one). */
    public int size() {
        return elements.size();
    }

    /** Returns the element at (zero-based) index {@code n}. */
    public String getElement(int n) {
        return elements.get(n);
    }

    @Override public int hashCode() {
        // The list is never null, so this equals Objects.hashCode(elements).
        return elements.hashCode();
    }

    @Override public boolean equals(Object obj) {
        if (!(obj instanceof RbValue)) {
            return false;
        }
        RbValue other = (RbValue) obj;
        return elements.equals(other.elements);
    }

    @Override public String toString() {
        return String.valueOf(elements);
    }
}

View file

@ -0,0 +1,593 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.base.CharMatcher.whitespace;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static java.util.function.Function.identity;
import static org.unicode.cldr.api.AttributeKey.keyOf;
import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrData;
import com.google.common.base.Ascii;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableTable;
import com.google.common.collect.Table;
/**
* Auxiliary APIs for processing locale IDs and other supplemental data needed by business logic
* in some mapper classes.
*
* <p>When a {@link SupplementalData} instance is used in a mapper class, it is imperative that it
* is built using the same underlying CLDR data. The only reason mapper classes do not create their
* own instances directly is the relative cost of processing all the supplemental data each time.
*/
// TODO: This should be moved into the API and leverage some of the existing utility functions.
public final class SupplementalData {
// Matches a script subtag in its capitalized "Xxxx" form (e.g. "Latn").
private static final Pattern SCRIPT_SUBTAG = Pattern.compile("[A-Z][a-z]{3}");
// Matcher for alias elements under "supplementalData/metadata/alias" with a "type" attribute.
private static final PathMatcher ALIAS =
PathMatcher.of("supplementalData/metadata/alias/*[@type=*]");
// Matcher and attribute keys for explicit parent locale mappings.
private static final PathMatcher PARENT_LOCALE =
PathMatcher.of("supplementalData/parentLocales/parentLocale[@parent=*]");
private static final AttributeKey PARENT = keyOf("parentLocale", "parent");
private static final AttributeKey LOCALES = keyOf("parentLocale", "locales");
// Matcher and attribute keys for the per-territory calendar preference data.
private static final PathMatcher CALENDER_PREFERENCE =
PathMatcher.of("supplementalData/calendarPreferenceData/calendarPreference[@territories=*]");
private static final AttributeKey CALENDER_TERRITORIES =
keyOf("calendarPreference", "territories");
private static final AttributeKey CALENDER_ORDERING =
keyOf("calendarPreference", "ordering");
// Matcher and attribute keys for the likely subtag ("from" --> "to") mappings.
private static final PathMatcher LIKELY_SUBTAGS =
PathMatcher.of("supplementalData/likelySubtags/likelySubtag[@from=*]");
private static final AttributeKey SUBTAG_FROM = keyOf("likelySubtag", "from");
private static final AttributeKey SUBTAG_TO = keyOf("likelySubtag", "to");
// Splits whitespace separated lists of values, ignoring empty elements.
private static final Splitter LIST_SPLITTER =
Splitter.on(whitespace()).omitEmptyStrings();
// Aliases come in three flavours. Note that the TERRITORY aliases map to a _list_ rather than
// a single value (it's structurally always a list, but only territory aliases have a need for
// more than one value).
private enum Alias {
    LANGUAGE, SCRIPT, TERRITORY;

    // The XML element name for this alias type (e.g. "languageAlias").
    private final String elementName = Ascii.toLowerCase(name()) + "Alias";
    // Keys for the "type" and "replacement" attributes of this alias' element.
    final AttributeKey typeKey = keyOf(elementName, "type");
    final AttributeKey replacementKey = keyOf(elementName, "replacement");

    // Lookup from element name to alias type. The enum constants (and therefore their
    // element names) are fully constructed before this static map is initialized.
    private static final ImmutableMap<String, Alias> TYPE_MAP =
        Stream.of(values()).collect(toImmutableMap(a -> a.elementName, identity()));

    /** Returns the alias type whose element has the given name, if there is one. */
    static Optional<Alias> forElementName(String name) {
        Alias alias = TYPE_MAP.get(name);
        return alias != null ? Optional.of(alias) : Optional.empty();
    }
}
/**
 * Builds a supplemental data API instance by visiting every value in the given CLDR data and
 * collecting alias, parent locale, calendar preference and likely subtag information.
 *
 * @param supplementalData the raw CLDR supplemental data instance.
 * @return the supplemental data API.
 */
static SupplementalData create(CldrData supplementalData) {
    Table<Alias, String, String> aliases = HashBasedTable.create();
    Map<String, String> parentLocales = new HashMap<>();
    Map<String, String> defaultCalendars = new HashMap<>();
    Map<String, String> likelySubtags = new HashMap<>();

    supplementalData.accept(ARBITRARY, value -> {
        if (ALIAS.matches(value.getPath())) {
            // The raw replacement attribute is stored as-is. For territory aliases this may be
            // a whitespace separated list (e.g. when countries break up), which is only split
            // when the alias is applied. Elements which are not one of the known alias types
            // are ignored.
            Optional<Alias> aliasType = Alias.forElementName(value.getPath().getName());
            if (aliasType.isPresent()) {
                Alias alias = aliasType.get();
                aliases.put(
                    alias,
                    alias.typeKey.valueFrom(value),
                    alias.replacementKey.valueFrom(value));
            }
            return;
        }
        if (PARENT_LOCALE.matches(value.getPath())) {
            // Every listed locale maps to the same explicit parent.
            String parent = PARENT.valueFrom(value);
            for (String locale : LOCALES.listOfValuesFrom(value)) {
                parentLocales.put(locale, parent);
            }
            return;
        }
        if (CALENDER_PREFERENCE.matches(value.getPath())) {
            // The first calendar in the ordering is the default for each listed territory.
            String calendar = CALENDER_ORDERING.listOfValuesFrom(value).get(0);
            for (String territory : CALENDER_TERRITORIES.listOfValuesFrom(value)) {
                defaultCalendars.put(territory, calendar);
            }
            return;
        }
        if (LIKELY_SUBTAGS.matches(value.getPath())) {
            likelySubtags.put(SUBTAG_FROM.valueFrom(value), SUBTAG_TO.valueFrom(value));
        }
    });

    // WARNING: The original mapper code determines the full set of deprecated territories and
    // then removes the following hard-coded list without any explanation as to why. While this
    // is presumably to "undeprecate" them for the purposes of the locale processing, there's
    // no explanation of where this list comes from, and thus no way to maintain it.
    //
    // asList("062", "172", "200", "830", "AN", "CS", "QU")
    //     .forEach(t -> aliasTable.remove(Alias.TERRITORY, t));
    // TODO: Understand and document what on Earth this is all about or delete this comment.
    return new SupplementalData(aliases, parentLocales, defaultCalendars, likelySubtags);
}
// A simple-as-possible, mutable, locale ID data "struct" to handle the IDs used during ICU
// data generation. Because this is mutable, it is thoroughly unsuitable for general use.
private static final class LocaleId {
    // From: https://unicode.org/reports/tr35/#Identifiers
    // Locale ID is:
    //   (<language>(_<script>)?|<script>)(_<region>)?(_<variant>)*
    //
    // However in CLDR data, there's always a language (even if it's "und"), and never more
    // than one variant, so this can be simplified to:
    //   <language>(_<script>)?(_<region>)?(_<variant>)?
    //
    // * Required language is lowercase 2 or 3 letter language ID (e.g. "en", "gsw").
    //   Note that the specification allows for languages 5-8 characters long, but in reality
    //   this has never occurred yet, so it's ignored in this code.
    //
    // * Script is 4-letter Xxxx script identifier (e.g. "Latn").
    //   The specification permits any casing for script subtags, but since all the data uses
    //   the capitalized "Xxxx" form, that's what this code expects.
    //
    // * Region is the uppercase 2-letter CLDR region code ("GB") or the 3-digit numeric
    //   identifier (e.g. "001").
    //
    // * Variants are a bit complex; either 5-8 length alphanumerics, or length 4 but starting
    //   with a digit (this avoids any ambiguity with script subtags). However because ICU
    //   violates this rule by using "TRADITIONAL" (11-letters) the length restriction is
    //   merely "5 or more letters".
    //
    // Finally, CLDR data only uses '_' as the separator, whereas the specification allows
    // for either '-' or '_'.
    //
    // The regex for unambiguously capturing the parts of a locale ID from the CLDR data is:
    private static final Pattern LOCALE_ID = Pattern.compile(
        "([a-z]{2,3})"
            + "(?:_([A-Z][a-z]{3}))?"
            + "(?:_([A-Z]{2}|[0-9]{3}))?"
            + "(?:_([a-zA-Z]{5,}|[0-9][a-zA-Z0-9]{3}))?");

    // Parses a complete locale ID string; unmatched optional subtags are left as null.
    static LocaleId parse(String localeId) {
        checkNotNull(localeId, "locale ID cannot be null");
        Matcher matcher = LOCALE_ID.matcher(localeId);
        checkArgument(matcher.matches(), "invalid locale ID: %s", localeId);
        LocaleId id = of(matcher.group(1), matcher.group(2), matcher.group(3));
        id.setVariant(matcher.group(4));
        return id;
    }

    // Creates a variant-less ID; script and region may be null (or empty) if absent.
    static LocaleId of(String language, String script, String region) {
        LocaleId id = new LocaleId();
        id.setLanguage(language);
        id.setScript(script);
        id.setRegion(region);
        return id;
    }

    // Only the language subtag is non-nullable.
    private String languageSubtag;
    private String scriptSubtag;
    private String regionSubtag;
    private String variantSubtag;

    String getLanguage() {
        return languageSubtag;
    }

    String getScript() {
        return scriptSubtag;
    }

    String getRegion() {
        return regionSubtag;
    }

    String getVariant() {
        return variantSubtag;
    }

    // Setters return "this" to permit chaining. Empty optional subtags are stored as null.
    LocaleId setLanguage(String languageSubtag) {
        checkNotNull(languageSubtag, "language subtag must not be null");
        checkArgument(!languageSubtag.isEmpty(), "language subtag must not be empty");
        this.languageSubtag = languageSubtag;
        return this;
    }

    LocaleId setScript(String scriptSubtag) {
        this.scriptSubtag = Strings.emptyToNull(scriptSubtag);
        return this;
    }

    LocaleId setRegion(String regionSubtag) {
        this.regionSubtag = Strings.emptyToNull(regionSubtag);
        return this;
    }

    LocaleId setVariant(String variantSubtag) {
        this.variantSubtag = Strings.emptyToNull(variantSubtag);
        return this;
    }

    // Joins the language with whichever optional subtags are present, separated by '_'.
    @Override public String toString() {
        StringBuilder id = new StringBuilder(languageSubtag);
        String[] optionalSubtags = { scriptSubtag, regionSubtag, variantSubtag };
        for (String subtag : optionalSubtags) {
            if (subtag != null) {
                id.append("_").append(subtag);
            }
        }
        return id.toString();
    }

    @Override public boolean equals(Object o) {
        if (o instanceof LocaleId) {
            LocaleId that = (LocaleId) o;
            return Objects.equals(languageSubtag, that.languageSubtag)
                && Objects.equals(scriptSubtag, that.scriptSubtag)
                && Objects.equals(regionSubtag, that.regionSubtag)
                && Objects.equals(variantSubtag, that.variantSubtag);
        }
        return false;
    }

    @Override public int hashCode() {
        return Objects.hash(languageSubtag, scriptSubtag, regionSubtag, variantSubtag);
    }
}
// Alias type --> deprecated subtag ("type" attribute) --> raw "replacement" attribute value.
private final ImmutableTable<Alias, String, String> aliasTable;
// Locale ID --> explicitly specified parent locale ID.
private final ImmutableMap<String, String> parentLocaleMap;
// Territory --> first calendar in that territory's preference ordering.
private final ImmutableMap<String, String> defaultCalendarMap;
// Likely subtag "from" value --> "to" value.
private final ImmutableMap<String, String> likelySubtagMap;
// Takes immutable copies of the given (mutable) collections; see create().
private SupplementalData(
Table<Alias, String, String> aliasTable,
Map<String, String> parentLocaleMap,
Map<String, String> defaultCalendarMap,
Map<String, String> likelySubtagMap) {
this.aliasTable = ImmutableTable.copyOf(aliasTable);
this.parentLocaleMap = ImmutableMap.copyOf(parentLocaleMap);
this.defaultCalendarMap = ImmutableMap.copyOf(defaultCalendarMap);
this.likelySubtagMap = ImmutableMap.copyOf(likelySubtagMap);
}
/**
 * Returns the "maximized" form of the given locale ID, obtained by filling in likely subtags.
 * The result is empty if likely subtags could not be added (which includes the "root" locale).
 */
public Optional<String> maximize(String localeId) {
    Optional<LocaleId> maximized = addLikelySubtags(localeId);
    return maximized.map(LocaleId::toString);
}
/**
* Returns the locale ID with any deprecated elements replaced. This is an
* implementation of the algorithm specified in
* <a href="http://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers">the LDML
* specification</a> but without any "minimizing" of the final result (as happens for
* canonicalization in the CLDR tools).
*
* <p>Replacement is applied to the region first, then the script and finally the language.
*
* @param localeId the locale ID to process ("root" is returned unchanged).
* @return the locale ID with deprecated subtags replaced.
* @throws IllegalArgumentException if the locale ID cannot be parsed, or if the alias data
*     contains an empty region list or a list of replacement scripts.
*/
public String replaceDeprecatedTags(String localeId) {
if (localeId.equals("root")) {
return localeId;
}
// Mutable ID which is updated in place by each of the steps below.
LocaleId id = LocaleId.parse(localeId);
// ---- LDML Specification ----
// If the region subtag matches the type attribute of a territoryAlias element in
// Supplemental Data, replace the region subtag with the replacement value, as follows:
//
// * If there is a single territory in the replacement, use it.
// * If there are multiple territories:
// * Look up the most likely territory for the base language code (and script, if there
// is one).
// * If that likely territory is in the list, use it.
// * Otherwise, use the first territory in the list.
// ----
// However there is a footnote that says:
// Formally, replacement of multiple territories uses Section 4.3 Likely Subtags.
// However, there are a small number of cases of multiple territories, so the mappings
// can be precomputed. This results in a faster lookup with a very small subset of the
// likely subtags data.
//
// Note that (contrary to the order implied by the LDML specification) this step is
// performed _before_ the language alias lookup. This is to allow IDs such as "sr_YU" to
// work, where "YU" should be replaced with "RS" and _then_ "sr_RS" is expanded to
// "sr_Cyrl_RS" by the language alias lookup. In the other order, you just get "sr_RS" out.
//
// TODO: Can we simplify this by just using "addLikelySubtags()" when region is missing?
if (id.getRegion() != null) {
String replacementRegions = aliasTable.get(Alias.TERRITORY, id.getRegion());
if (replacementRegions != null) {
List<String> regions = LIST_SPLITTER.splitToList(replacementRegions);
checkArgument(!regions.isEmpty(), "invalid empty region list for %s", localeId);
if (regions.size() == 1) {
id.setRegion(regions.get(0));
} else {
// Look up the likely ID for "<language>_<script>" first, falling back to just
// "<language>" if that has no likely subtags entry.
LocaleId key = LocaleId.of(id.getLanguage(), id.getScript(), null);
String likelyId = likelySubtagMap.get(key.toString());
if (likelyId == null) {
likelyId = likelySubtagMap.get(key.setScript(null).toString());
}
String likelyRegion =
likelyId != null ? LocaleId.parse(likelyId).getRegion() : null;
if (regions.contains(likelyRegion)) {
id.setRegion(likelyRegion);
} else {
id.setRegion(regions.get(0));
}
}
}
}
// While it's not mentioned in the LDML specification, there is data in the alias table for
// replacement scripts (currently it contains exactly one entry with one value). Because
// it's not clear if this is intended to only be single values or a list (and how to handle
// it if it were a list), there's a hard check to ensure it's only ever a single value.
if (id.getScript() != null) {
String replacementScript = aliasTable.get(Alias.SCRIPT, id.getScript());
if (replacementScript != null) {
checkArgument(whitespace().matchesNoneOf(replacementScript),
"unexpected list of replacement scripts: %s", replacementScript);
id.setScript(replacementScript);
}
}
// ---- LDML Specification ----
// If the language subtag matches the type attribute of a languageAlias element in
// Supplemental Data, replace the language subtag with the replacement value.
//
// If there are additional subtags in the replacement value, add them to the result, but
// only if there is no corresponding subtag already in the tag.
// ----
// Contrary to the precise wording of the specification, we don't just check the language
// subtag, since language aliases can contain script and even region information. Instead
// we check the alias table using the same order as defined in subtag maximizing:
//
// <language>_<script>_<region>
// <language>_<region>
// <language>_<script>
// <language>
//
// There is no need to check for "und" however since that's not aliased to anything, but
// since it shares the same code it's harmless to do.
resolveLocaleId(id, s -> aliasTable.get(Alias.LANGUAGE, s))
.ifPresent(resolvedId -> {
// The language is always replaced; other subtags are only filled in if missing.
id.setLanguage(checkNotNull(resolvedId.getLanguage(),
"missing language subtag in language alias: %s", resolvedId));
if (id.getScript() == null) {
id.setScript(resolvedId.getScript());
}
if (id.getRegion() == null) {
id.setRegion(resolvedId.getRegion());
}
if (id.getVariant() == null) {
id.setVariant(resolvedId.getVariant());
}
});
return id.toString();
}
/**
 * Returns a suitable default calendar for a given locale if it's different from the default
 * calendar inferred by the locale's parent.
 *
 * <p>Note that since the default calendar data is keyed from territory (region subtag) rather
 * than the complete locale ID, it is impossible to encode some real life cases (e.g. the fact
 * that "ja_JP_TRADITIONAL" has a different default calendar to "ja_JP"). This is currently
 * handled with hard-coded special casing, but should probably be data driven eventually.
 *
 * @param localeId the locale ID (or "root") for which to find the default calendar.
 * @return the default calendar; empty if the locale's territory has no calendar preference, or
 *     if it matches the calendar inferred from the locale's parent chain.
 * @throws IllegalStateException if the calendar data has no entry for the "001" territory.
 */
public Optional<String> getDefaultCalendar(String localeId) {
    Optional<String> calendar = getSpecialCaseCalendar(localeId);
    if (calendar.isPresent()) {
        return calendar;
    }
    String t = territoryOf(localeId);
    calendar = Optional.ofNullable(defaultCalendarMap.get(t));
    if (!calendar.isPresent()) {
        return Optional.empty();
    }
    // The map lookup returns null for a missing key, so test for that explicitly in order to
    // fail with the intended message (rather than a NullPointerException).
    String rootCalendar = defaultCalendarMap.get("001");
    checkState(rootCalendar != null && !rootCalendar.isEmpty(), "missing root calendar");
    if (localeId.equals("root")) {
        return Optional.of(rootCalendar);
    }
    // All locales reach "root" eventually, and that maps to territory "001" which
    // we already know has a value, so this loop *must* exit.
    String parentCalendar;
    do {
        localeId = getParent(localeId);
        String territory = territoryOf(localeId);
        parentCalendar = defaultCalendarMap.get(territory);
    } while (parentCalendar == null);
    // Only report the calendar if it differs from what the parent chain would provide.
    return parentCalendar.equals(calendar.get()) ? Optional.empty() : calendar;
}
// Works around the fact that CLDR data cannot express default calendars which depend on
// anything other than the territory. There are exactly two such cases at the moment, and that
// is unlikely to grow, so they are matched directly (against the maximized locale ID) in code.
private Optional<String> getSpecialCaseCalendar(String localeId) {
    String maximizedId = maximize(localeId).orElse(null);
    if ("ja_Jpan_JP_TRADITIONAL".equals(maximizedId)) {
        return Optional.of("japanese");
    }
    if ("th_Thai_TH_TRADITIONAL".equals(maximizedId)) {
        return Optional.of("buddhist");
    }
    return Optional.empty();
}
/**
 * Returns the parent of a non-root locale ID. This is not just simple truncation because:
 * <ul>
 *   <li>An explicit parent locale ID may be specified in the CLDR data, and always wins.
 *   <li>Truncating a script subtag which is not the likely script of the remaining ID makes
 *       the parent "root" (unless there was an explicit parent specified).
 * </ul>
 * Note that all valid locale ID parent "chains" must end up at "root" eventually.
 *
 * <p>For example (showing parent "chains"):
 * <ul>
 *   <li>{@code en_GB} --> {@code en_001} --> {@code en} --> {@code root}
 *   <li>{@code en_Cyrl_RU} --> {@code en_Cyrl} --> {@code root}
 * </ul>
 *
 * @throws IllegalStateException if the given locale ID is "root".
 * @throws IllegalArgumentException if a script subtag is being truncated and the remaining
 *     ID cannot be parsed as a locale ID.
 */
public String getParent(String localeId) {
    checkState(!localeId.equals("root"), "cannot ask for parent of 'root' locale");

    // An explicit parent locale set in the CLDR data takes precedence over everything else.
    String explicitParent = getExplicitParentLocaleOf(localeId).orElse(null);
    if (explicitParent != null) {
        return explicitParent;
    }

    // With no separator this is a base language ID (e.g. "en" or "fr"), whose parent is
    // always "root".
    int separatorIndex = localeId.lastIndexOf('_');
    if (separatorIndex < 0) {
        return "root";
    }

    String truncatedId = localeId.substring(0, separatorIndex);
    String removedPart = localeId.substring(separatorIndex + 1);

    // If a script is being removed and it is NOT the default script of what remains, the
    // parent is "root" rather than the truncated ID.
    boolean removingScript = SCRIPT_SUBTAG.matcher(removedPart).matches();
    if (removingScript && !removedPart.equals(scriptOf(truncatedId))) {
        return "root";
    }
    return truncatedId.isEmpty() ? "root" : truncatedId;
}
/**
 * Returns the parent explicitly specified for a locale ID in the CLDR data, if there is one.
 *
 * <p>Most locale IDs have no explicit parent, so this is usually empty. To obtain the "normal"
 * parent of a locale ID, use {@link #getParent(String)} instead.
 */
public Optional<String> getExplicitParentLocaleOf(String localeId) {
    String explicitParent = parentLocaleMap.get(localeId);
    return Optional.ofNullable(explicitParent);
}
// Returns the region of the maximized locale ID: "001" for "root", and "ZZ" if no region could
// be determined.
private String territoryOf(String localeId) {
    if (localeId.equals("root")) {
        return "001";
    }
    Optional<LocaleId> maximized = addLikelySubtags(localeId);
    return maximized.map(LocaleId::getRegion).orElse("ZZ");
}
// Returns the script of the maximized locale ID, or "Zzzz" if no script could be determined.
private String scriptOf(String localeId) {
    Optional<LocaleId> maximized = addLikelySubtags(localeId);
    return maximized.map(LocaleId::getScript).orElse("Zzzz");
}
// From: https://unicode.org/reports/tr35/#Likely_Subtags
//
// Add Likely Subtags
// ------------------
// Given a source locale X, to return a locale Y where the empty subtags have been filled in
// by the most likely subtags. A subtag is called empty if it is a missing script or region
// subtag, or it is a base language subtag with the value "und".
//
// Canonicalize
// ------------
// Make sure the input locale is in canonical form ...
// ...
// Remove the script code 'Zzzz' and the region code 'ZZ' if they occur.
//
// Note that this implementation does not need to handle "grandfathered" tags.
//
// The result is empty for "root", and whenever no likely subtags entry can be found.
private Optional<LocaleId> addLikelySubtags(String localeId) {
    if (localeId.equals("root")) {
        return Optional.empty();
    }
    LocaleId id = LocaleId.parse(localeId);

    // Per the specification, the "unknown" script and region codes count as missing.
    if ("Zzzz".equals(id.getScript())) {
        id.setScript(null);
    }
    if ("ZZ".equals(id.getRegion())) {
        id.setRegion(null);
    }

    // With a real language and both script and region present, nothing is "empty" and there
    // is nothing to add.
    boolean hasEmptySubtag =
        id.getLanguage().equals("und") || id.getScript() == null || id.getRegion() == null;
    if (!hasEmptySubtag) {
        return Optional.of(id);
    }

    Optional<LocaleId> likely = resolveLocaleId(id, likelySubtagMap::get);
    if (!likely.isPresent()) {
        return Optional.empty();
    }
    LocaleId subtags = likely.get();
    checkArgument(!subtags.getLanguage().equals("und"), "invalid subtags: %s", subtags);

    // Fill in only those subtags which are "empty" in the original ID.
    if (id.getLanguage().equals("und")) {
        id.setLanguage(subtags.getLanguage());
    }
    if (id.getScript() == null) {
        id.setScript(checkNotNull(subtags.getScript()));
    }
    if (id.getRegion() == null) {
        id.setRegion(checkNotNull(subtags.getRegion()));
    }
    // At this point the language is not "und" and both script and region are set.
    return Optional.of(id);
}
// Implements "Lookup" from: https://unicode.org/reports/tr35/#Likely_Subtags
//
// Each of the following is looked up in order, stopping on the first match:
//   <language>_<script>_<region>
//   <language>_<region>
//   <language>_<script>
//   <language>
//   "und"_<script>
//
// The given function maps the string form of a candidate ID to the string form of its
// replacement (or null if there is no mapping); the first non-null result is parsed and
// returned.
private Optional<LocaleId> resolveLocaleId(LocaleId id, Function<String, String> fn) {
    String lang = id.getLanguage();
    String script = id.getScript();
    String region = id.getRegion();

    Stream<LocaleId> languageCandidates = Stream.of(
        LocaleId.of(lang, script, region),
        LocaleId.of(lang, null, region),
        LocaleId.of(lang, script, null),
        LocaleId.of(lang, null, null));
    // Only add "und"_<script> if there's a script, otherwise you end up maximizing "und" on
    // its own ("en_Latn_US") which is not intended.
    Stream<LocaleId> scriptOnlyCandidate = (script == null)
        ? Stream.<LocaleId>empty()
        : Stream.of(LocaleId.of("und", script, null));

    return Stream.concat(languageCandidates, scriptOnlyCandidate)
        // Duplicate IDs are dropped (the first one encountered is kept).
        .distinct()
        .map(Object::toString)
        .map(fn)
        .filter(Objects::nonNull)
        .findFirst()
        .map(LocaleId::parse);
}
}

View file

@ -0,0 +1,246 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.mapper;
import static com.google.common.base.Ascii.toLowerCase;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static org.unicode.cldr.api.AttributeKey.keyOf;
import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
import static org.unicode.cldr.api.CldrDataType.BCP47;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import javax.annotation.Nullable;
import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrData.PrefixVisitor;
import org.unicode.cldr.api.CldrData.ValueVisitor;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.CldrValue;
import com.google.common.base.Ascii;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Sets;
import org.unicode.icu.tool.cldrtoicu.IcuData;
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
import org.unicode.icu.tool.cldrtoicu.RbPath;
/**
 * A mapper to collect BCP-47 data from {@link CldrDataType#BCP47 BCP47} data under paths
 * matching:
 * <pre>{@code
 *   //ldmlBCP47/keyword/key[@name=*]/type[@name=*]
 * }</pre>
 */
public final class Bcp47Mapper {
    // Other attributes (e.g. "alias") are value attributes and don't need to be matched here.
    private static final PathMatcher KEY = PathMatcher.of("ldmlBCP47/keyword/key[@name=*]");
    private static final AttributeKey KEY_NAME = keyOf("key", "name");
    private static final AttributeKey KEY_ALIAS = keyOf("key", "alias");
    private static final AttributeKey KEY_VALUE_TYPE = keyOf("key", "valueType");

    private static final PathMatcher TYPE = PathMatcher.of("type[@name=*]");
    private static final AttributeKey TYPE_NAME = keyOf("type", "name");
    private static final AttributeKey TYPE_ALIASES = keyOf("type", "alias");
    private static final AttributeKey PREFERRED_TYPE_NAME = keyOf("type", "preferred");

    // Deprecation of the data is not the same as deprecation of attributes themselves. This
    // deprecation relates to identifying data which exists, but is no longer the right way to
    // represent things (which means it can be important for clients to know about).
    private static final AttributeKey KEY_DEPRECATED = keyOf("key", "deprecated");
    private static final AttributeKey TYPE_DEPRECATED = keyOf("type", "deprecated");

    // Attributes that can be emitted under the /keyInfo or /typeInfo paths for auxiliary
    // information in the ICU data. If the value is equal to the declared default, it is ignored.
    // NOTE: The need for hard-coded default values is a hack because there's no nice way (yet)
    // to determine the default for implicit values via the DTD. Ideally this would be automatic
    // and the AttributeKey class would be able to have a method like "isDefault(String value)".
    private static final ImmutableMap<AttributeKey, String> INFO_ATTRIBUTES =
        ImmutableMap.of(KEY_VALUE_TYPE, "", KEY_DEPRECATED, "false", TYPE_DEPRECATED, "false");

    private static final RbPath RB_KEYMAP = RbPath.of("keyMap");
    private static final RbPath RB_TYPE_ALIAS = RbPath.of("typeAlias", "timezone:alias");
    private static final RbPath RB_MAP_ALIAS = RbPath.of("typeMap", "timezone:alias");
    private static final RbPath RB_BCP_ALIAS = RbPath.of("bcpTypeAlias", "tz:alias");

    /**
     * Processes data from the given supplier to generate Timezone and BCP-47 ICU data.
     *
     * @param src the CLDR data supplier to process.
     * @return A list of IcuData instances containing BCP-47 data to be written to files (the
     *     "keyTypeData" instance followed by the "timezoneTypes" instance).
     */
    public static ImmutableList<IcuData> process(CldrDataSupplier src) {
        Bcp47Visitor bcp47 = new Bcp47Visitor();
        src.getDataForType(BCP47).accept(ARBITRARY, bcp47);
        bcp47.addKeyMapValues();
        IcuData keyTypes = bcp47.keyTypeData.icuData;
        IcuData timezoneTypes = bcp47.tzData.icuData;
        return ImmutableList.of(keyTypes, timezoneTypes);
    }

    // Outer visitor which handles "key" paths by installing a sub-visitor to process each child
    // "type" element. Depending on the key name, values are stored in different IcuData
    // instances.
    private static final class Bcp47Visitor implements PrefixVisitor {
        private final ValueCollector tzData =
            new ValueCollector(new IcuData("timezoneTypes", false));
        private final ValueCollector keyTypeData =
            new ValueCollector(new IcuData("keyTypeData", false));

        // The current key name from the parent path element (set when a prefix is matched).
        @Nullable private String keyName = null;
        // A map collecting each key and values as they are visited.
        // TODO: Convert this to a Map<RbPath, String> which involves removing the '@' prefix hack.
        private Map<String, String> keyMap = new LinkedHashMap<>();

        @Override
        public void visitPrefixStart(CldrPath prefix, Context ctx) {
            if (!KEY.matches(prefix)) {
                return;
            }
            // The key name is stored in a field because the installed collector reads it
            // while visiting the child "type" values.
            keyName = Ascii.toLowerCase(KEY_NAME.valueFrom(prefix));
            // How the data is visited is the same for both timezone and other BCP-47 data,
            // it's just split into different data files, so a different collector instance is
            // installed according to where the data in this sub-hierarchy should end up.
            ValueCollector collector = keyName.equals("tz") ? tzData : keyTypeData;
            ctx.install(collector);
        }

        // Post processing to add additional captured attribute values and some special cases.
        private void addKeyMapValues() {
            IcuData keyData = keyTypeData.icuData;
            // Add all the keyMap values into the IcuData file.
            for (Entry<String, String> entry : keyMap.entrySet()) {
                String mapKey = entry.getKey();
                String mapValue = entry.getValue();
                if (mapKey.startsWith("@")) {
                    // Undoing the weird hack in addInfoAttributes(). This can be done better.
                    // We use "parse()" because these are full paths, and not single elements.
                    keyData.add(RbPath.parse(mapKey.substring(1)), mapValue);
                } else {
                    // An empty value indicates that the BCP47 key is same as the legacy key.
                    String bcpKey = mapKey.equals(mapValue) ? "" : mapKey;
                    keyData.add(RB_KEYMAP.extendBy(mapValue), bcpKey);
                }
            }
            // Add aliases for timezone data.
            keyData.add(RB_TYPE_ALIAS, "/ICUDATA/timezoneTypes/typeAlias/timezone");
            keyData.add(RB_MAP_ALIAS, "/ICUDATA/timezoneTypes/typeMap/timezone");
            keyData.add(RB_BCP_ALIAS, "/ICUDATA/timezoneTypes/bcpTypeAlias/tz");
        }

        // Visitor for the "type" values below a single "key" element. It reads the current key
        // name from, and records key mappings in, the enclosing Bcp47Visitor.
        private final class ValueCollector implements ValueVisitor {
            // Mutable ICU data collected into during visitation.
            private final IcuData icuData;

            ValueCollector(IcuData data) {
                this.icuData = checkNotNull(data);
            }

            @Override
            public void visit(CldrValue value) {
                checkArgument(TYPE.matchesSuffixOf(value.getPath()),
                    "unexpected child element: %s", value.getPath());
                String typeName = TYPE_NAME.valueFrom(value);

                // If a "preferred" type exists, the value is treated specially and added only
                // as an alias. Values with a preferred replacement are expected to always be
                // explicitly deprecated.
                Optional<String> prefName = PREFERRED_TYPE_NAME.optionalValueFrom(value);
                if (prefName.isPresent()) {
                    boolean isDeprecated = KEY_DEPRECATED.booleanValueFrom(value, false)
                        || TYPE_DEPRECATED.booleanValueFrom(value, false);
                    checkState(isDeprecated,
                        "unexpected 'preferred' attribute for non-deprecated value: %s", value);
                    icuData.add(RbPath.of("bcpTypeAlias", keyName, typeName), prefName.get());
                    return;
                }
                // Note: There are some deprecated values which don't have a preferred
                // replacement and these are processed below (in particular the fact that they
                // are deprecated needs to be emitted).

                // According to the old mapper code, it's an error not to have an alias, but
                // it's emitted via debug logging and not actually enforced.
                // TODO: Consider making this an error if possible.
                String keyAlias = toLowerCase(KEY_ALIAS.valueFrom(value, keyName));
                keyMap.put(keyName, keyAlias);

                RbPath typeMapPrefix = RbPath.of("typeMap", keyAlias);
                List<String> typeAliases = TYPE_ALIASES.listOfValuesFrom(value);
                if (typeAliases.isEmpty()) {
                    // Generate type map entry using empty value (an empty value indicates same
                    // type name is used for both BCP47 and legacy type).
                    icuData.add(typeMapPrefix.extendBy(typeName), "");
                } else {
                    String mainAlias = typeAliases.get(0);
                    icuData.add(typeMapPrefix.extendBy(quoteAlias(mainAlias)), typeName);
                    // Put additional aliases as secondary aliases referencing the main alias.
                    RbPath typeAliasPrefix = RbPath.of("typeAlias", keyAlias);
                    for (String alias : typeAliases.subList(1, typeAliases.size())) {
                        icuData.add(typeAliasPrefix.extendBy(quoteAlias(alias)), mainAlias);
                    }
                }
                addInfoAttributes(keyName, typeName, value.getValueAttributes());
            }

            // Add any additional attributes present to the attribute map. Note that this code
            // was copied from largely undocumented code, and the precise reasoning for why this
            // is needed or why it's done this way is not completely clear. It is very likely
            // that it can be simplified.
            //
            // The '@' symbol added here is just a magic token that gets stripped off again in
            // the addKeyMapValues() method, it appears to just be a way to distinguish keys
            // added via this method vs during the visit method. A better approach might just be
            // to have two maps.
            // TODO: Remove the use of '@' and simplify the logic for "info" attributes (infoMap?).
            private void addInfoAttributes(
                String keyName, String typeName, ImmutableMap<AttributeKey, String> attributes) {
                // Only emit deprecation for the "key" level, even if all types below that are
                // also marked as deprecated. Only do this for a subset of attributes
                // (INFO_ATTRIBUTES).
                Set<AttributeKey> infoKeys =
                    Sets.intersection(attributes.keySet(), INFO_ATTRIBUTES.keySet());
                for (AttributeKey a : infoKeys) {
                    String value = attributes.get(a);
                    // Only non-empty, non-default attribute values are recorded.
                    if (!value.isEmpty() && !INFO_ATTRIBUTES.get(a).equals(value)) {
                        // The ID for the xxxInfo paths in ICU is the path fragment at which the
                        // attribute exists. Since only complete paths are processed here, the
                        // ID is reconstructed based on the element name of the attribute being
                        // processed. This relies on explicit knowledge that the paths are
                        // "<key>" or "<key>/<type>". This all gets less messy if we switch to
                        // RbPath.
                        String element = a.getElementName();
                        String id = element.equals("key") ? keyName : keyName + "/" + typeName;
                        keyMap.put(
                            "@" + element + "Info/" + a.getAttributeName() + "/" + id, value);
                    }
                }
            }
        }

        /**
         * Escapes alias values containing '/' so they can appear in resource bundle paths. This
         * function replaces '/' with ':' and quotes the result (e.g. foo/bar -> "foo:bar").
         * Values without a '/' are returned unchanged.
         *
         * <p>This is needed for timezone "metazone" ID strings which are of the form 'Foo/Bar'
         * in the CLDR data.
         */
        // TODO: Switch to RbPath and do quoting automatically when ICU data is written out.
        private static String quoteAlias(String str) {
            if (!str.contains("/")) {
                return str;
            }
            return '"' + str.replace('/', ':') + '"';
        }
    }

    private Bcp47Mapper() {}
}

View file

@ -0,0 +1,147 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.mapper;
import static org.unicode.cldr.api.AttributeKey.keyOf;
import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
import java.util.Optional;
import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrData;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.CldrValue;
import com.google.common.escape.UnicodeEscaper;
import org.unicode.icu.tool.cldrtoicu.IcuData;
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
import org.unicode.icu.tool.cldrtoicu.RbPath;
/**
 * A mapper to collect break-iterator data from {@link CldrDataType#LDML LDML} data under
 * paths matching:
 * <pre>{@code
 *   //ldml/segmentations/segmentation/suppressions/suppression
 *   //ldml/special/icu:breakIteratorData/...
 * }</pre>
 */
// TODO: This class can almost certainly be replaced with a small RegexTransformer config.
public final class BreakIteratorMapper {
    // The "type" attribute is not required here, so cannot appear in the matcher.
    private static final PathMatcher SUPPRESSION =
        PathMatcher.of("ldml/segmentations/segmentation/suppressions/suppression");
    private static final AttributeKey SEGMENTATION_TYPE = keyOf("segmentation", "type");

    // Note: This could be done with an intermediate matcher for
    // "ldml/special/icu:breakIteratorData" but there are so few "special" values it's not worth it
    private static final PathMatcher BOUNDARIES =
        PathMatcher.of("ldml/special/icu:breakIteratorData/icu:boundaries/*");
    private static final PathMatcher DICTIONARY =
        PathMatcher.of("ldml/special/icu:breakIteratorData/icu:dictionaries/icu:dictionary");

    private static final AttributeKey DICTIONARY_DEP = keyOf("icu:dictionary", "icu:dependency");
    private static final AttributeKey DICTIONARY_TYPE = keyOf("icu:dictionary", "type");

    /**
     * Processes data from the given supplier to generate break-iterator data for a locale ID.
     *
     * @param localeId the locale ID to generate data for.
     * @param src the CLDR data supplier to process.
     * @param icuSpecialData additional ICU data (in the "icu:" namespace)
     * @return IcuData containing break-iterator data for the given locale ID.
     * @throws IllegalArgumentException if a boundary or dictionary element in the special data
     *     has no dependency attribute.
     */
    public static IcuData process(
        String localeId, CldrDataSupplier src, Optional<CldrData> icuSpecialData) {

        BreakIteratorMapper mapper = new BreakIteratorMapper(localeId);
        // The special "icu:" data (if any) is visited before the locale's own suppressions.
        icuSpecialData.ifPresent(s -> s.accept(ARBITRARY, mapper::addSpecials));
        src.getDataForLocale(localeId, UNRESOLVED).accept(DTD, mapper::addSuppression);
        return mapper.icuData;
    }

    // The per-locale ICU data being collected by this visitor.
    private final IcuData icuData;

    private BreakIteratorMapper(String localeId) {
        this.icuData = new IcuData(localeId, true);
    }

    // Adds the (escaped) value of a matching suppression to the "exceptions" array for its
    // segmentation type. Values on other paths are ignored.
    private void addSuppression(CldrValue v) {
        if (SUPPRESSION.matches(v.getPath())) {
            String type = SEGMENTATION_TYPE.valueFrom(v);
            // TODO: Understand and document why we escape values here, but not for collation data.
            icuData.add(
                RbPath.of("exceptions", type + ":array"),
                ESCAPE_NON_ASCII.escape(v.getValue()));
        }
    }

    // Adds the dependency of a matching boundary or dictionary element. Values on other paths
    // are ignored.
    private void addSpecials(CldrValue v) {
        CldrPath p = v.getPath();
        if (BOUNDARIES.matches(p)) {
            addDependency(
                getDependencyName(v),
                getBoundaryType(v),
                getBoundaryDependency(v));
        } else if (DICTIONARY.matches(p)) {
            addDependency(
                getDependencyName(v),
                DICTIONARY_TYPE.valueFrom(v),
                DICTIONARY_DEP.optionalValueFrom(v));
        }
    }

    // Adds a single dependency value. A missing dependency is an error, and the name and type of
    // the offending element are reported so that the bad input can be located.
    private void addDependency(String name, String type, Optional<String> dependency) {
        icuData.add(
            RbPath.of(name, type + ":process(dependency)"),
            dependency.orElseThrow(
                () -> new IllegalArgumentException(
                    String.format(
                        "missing dependency for break-iterator data '%s' of type '%s'",
                        name, type))));
    }

    // Must match the BOUNDARIES or DICTIONARY path.
    private static String getDependencyName(CldrValue value) {
        return stripXmlNamespace(value.getPath().getParent().getName());
    }

    // Must match the BOUNDARIES path. The type is the element name without its namespace,
    // suffixed with "_<alt>" if the element has an "alt" attribute.
    private static String getBoundaryType(CldrValue value) {
        String elementName = value.getPath().getName();
        String type = stripXmlNamespace(elementName);
        return keyOf(elementName, "alt")
            .optionalValueFrom(value).map(a -> type + "_" + a).orElse(type);
    }

    // Must match the BOUNDARIES path.
    private static Optional<String> getBoundaryDependency(CldrValue value) {
        return keyOf(value.getPath().getName(), "icu:dependency").optionalValueFrom(value);
    }

    // Strips the first prefix of the form "xxx:" from a string (a string without any ':' is
    // returned unchanged, since indexOf() yields -1).
    private static String stripXmlNamespace(String s) {
        return s.substring(s.indexOf(':') + 1);
    }

    /*
     * Convert characters outside the range U+0020 to U+007F to Unicode escapes, and convert
     * backslash to a double backslash. This class is super slow for non-ASCII escaping due to
     * using "String.format()", however there's < 100 values that need any escaping, so it's fine.
     */
    private static final UnicodeEscaper ESCAPE_NON_ASCII = new UnicodeEscaper() {
        private final char[] DOUBLE_BACKSLASH = "\\\\".toCharArray();

        @Override
        protected char[] escape(int cp) {
            // Returning null means "do not escape".
            if (0x0020 <= cp && cp <= 0x007F) {
                return cp == '\\' ? DOUBLE_BACKSLASH : null;
            } else if (cp <= 0xFFFF) {
                // Code points in the BMP use the 4-digit form.
                return String.format("\\u%04X", cp).toCharArray();
            }
            // Supplementary code points use the 8-digit form.
            return String.format("\\U%08X", cp).toCharArray();
        }
    };
}

View file

@ -0,0 +1,198 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.mapper;
import static com.google.common.base.Preconditions.checkArgument;
import static org.unicode.cldr.api.AttributeKey.keyOf;
import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
import java.util.Optional;
import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrData;
import org.unicode.cldr.api.CldrData.PrefixVisitor;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.CldrValue;
import com.google.common.base.CharMatcher;
import com.google.common.base.Splitter;
import org.unicode.icu.tool.cldrtoicu.IcuData;
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
import org.unicode.icu.tool.cldrtoicu.RbPath;
import org.unicode.icu.tool.cldrtoicu.RbValue;
/**
 * A mapper to collect collation data from {@link CldrDataType#LDML LDML} data via the paths:
 * <pre>{@code
 *   //ldml/collations/*
 *   //ldml/special/icu:UCARules
 *   //ldml/special/icu:depends
 * }</pre>
 */
public final class CollationMapper {
    private static final PathMatcher COLLATIONS = PathMatcher.of("ldml/collations");

    // Note that the 'type' attribute is optional, so cannot be in the path matcher.
    // However since the CLDR data never actually omits the value, it would be easy to change the
    // attribute metadata to stop it being an implicit attribute and then it could appear.
    private static final PathMatcher COLLATION_RULE = PathMatcher.of("collation/cr");
    private static final AttributeKey COLLATION_TYPE = keyOf("collation", "type");
    private static final AttributeKey COLLATION_RULE_ALT = keyOf("cr", "alt");

    private static final PathMatcher DEFAULT_COLLATION = PathMatcher.of("defaultCollation");

    private static final PathMatcher SPECIAL = PathMatcher.of("ldml/special");
    private static final AttributeKey SPECIAL_RULES = keyOf("icu:UCARules", "icu:uca_rules");
    private static final AttributeKey SPECIAL_DEP = keyOf("icu:depends", "icu:dependency");

    private static final RbPath RB_COLLATIONS_DEFAULT = RbPath.of("collations", "default");
    private static final RbPath RB_STANDARD_SEQUENCE =
        RbPath.of("collations", "standard", "Sequence");
    private static final RbPath RB_STANDARD_VERSION =
        RbPath.of("collations", "standard", "Version");

    private static final Splitter LINE_SPLITTER =
        Splitter.on('\n').trimResults().omitEmptyStrings();

    /**
     * Processes data from the given supplier to generate collation data for a locale ID.
     *
     * @param localeId the locale ID to generate data for.
     * @param src the CLDR data supplier to process.
     * @param icuSpecialData additional ICU data (in the "icu:" namespace)
     * @return IcuData containing collation data for the given locale ID.
     */
    public static IcuData process(
        String localeId, CldrDataSupplier src, Optional<CldrData> icuSpecialData) {

        CollationVisitor collationVisitor = new CollationVisitor(localeId);
        // The special "icu:" data (if any) is visited before the locale's own data.
        icuSpecialData.ifPresent(special -> special.accept(ARBITRARY, collationVisitor));
        src.getDataForLocale(localeId, UNRESOLVED).accept(ARBITRARY, collationVisitor);
        return collationVisitor.icuData;
    }

    // Visitor which installs value visitors for the collation and "special" sub-hierarchies.
    static final class CollationVisitor implements PrefixVisitor {
        private final IcuData icuData;

        CollationVisitor(String localeId) {
            this.icuData = new IcuData(localeId, true);
            // Super special hack case because the XML data is a bit broken for the root
            // collation data: there's an empty <collation> element that's a non-leaf element
            // and thus not visited, but an empty sequence should still be added to the output
            // data.
            if (localeId.equals("root")) {
                icuData.replace(RB_STANDARD_SEQUENCE, "");
                // TODO: Collation versioning probably needs to be improved.
                icuData.replace(RB_STANDARD_VERSION, CldrDataSupplier.getCldrVersionString());
            }
        }

        @Override
        public void visitPrefixStart(CldrPath prefix, Context ctx) {
            if (COLLATIONS.matchesPrefixOf(prefix)) {
                ctx.install(this::collectRules);
            } else if (SPECIAL.matchesPrefixOf(prefix)) {
                ctx.install(this::maybeAddSpecial);
            }
        }

        // Handles values below "ldml/collations": collation rules and the default collation.
        private void collectRules(CldrValue v) {
            CldrPath path = v.getPath();
            if (COLLATION_RULE.matchesSuffixOf(path)) {
                addCollationRule(v);
            } else if (DEFAULT_COLLATION.matchesSuffixOf(path)) {
                icuData.add(RB_COLLATIONS_DEFAULT, v.getValue());
            }
        }

        // Adds the "Sequence" and "Version" values for a single collation rule value.
        private void addCollationRule(CldrValue v) {
            String type = COLLATION_TYPE.valueFrom(v);
            RbPath sequencePath = RbPath.of("collations", type, "Sequence");

            // WARNING: This is almost certainly a bug, since while the "alt" attribute can have
            // the value "short" it can also have other values, and here any "alt" value is
            // treated as "short". This was carried over from the earlier mapper code, which has
            // the line;
            //   isShort = attr.getValue("alt") != null;
            boolean isShort = COLLATION_RULE_ALT.optionalValueFrom(v).isPresent();

            // Note that it's not clear why there's a check for "contains()" here. The code
            // from which this was derived is largely undocumented and this check could have
            // been overly defensive (perhaps a duplicate key should be an error?).
            if (!isShort && icuData.contains(sequencePath)) {
                return;
            }
            RbValue rules = RbValue.of(
                LINE_SPLITTER.splitToList(v.getValue()).stream()
                    .map(CollationMapper::removeComment)
                    .filter(line -> !line.isEmpty())::iterator);
            icuData.replace(sequencePath, rules);
            icuData.replace(
                RbPath.of("collations", type, "Version"),
                CldrDataSupplier.getCldrVersionString());
        }

        // This is a bit special since the attribute to add depends on the element being visited
        // (which is somewhat unusual in the transformation classes). Elements other than
        // "icu:UCARules" and "icu:depends" are ignored.
        private void maybeAddSpecial(CldrValue value) {
            String elementName = value.getPath().getName();
            AttributeKey key;
            if (elementName.equals("icu:UCARules")) {
                key = SPECIAL_RULES;
            } else if (elementName.equals("icu:depends")) {
                key = SPECIAL_DEP;
            } else {
                return;
            }
            // substring(4) just removes the "icu:" prefix (which is known to be present in the
            // element and attribute names of both keys).
            String element = key.getElementName().substring(4);
            String attribute = key.getAttributeName().substring(4);
            RbPath rbPath = RbPath.of(element + ":process(" + attribute + ")");
            icuData.add(rbPath, key.valueFrom(value));
        }
    }

    // Collation data can contain # to mark an end-of-line comment, but it can also contain data
    // with # in it. In the latter case it must be in a single-quoted string (e.g. 'x#y'). However
    // the precise semantics of the quoting rules are not particularly clear, so this method
    // assumes that:
    // * single quote (apostrophe) begins and ends quoting.
    // * outside a quoted section, all characters are literal.
    // * inside a quoted section, backslash '\' escapes any single character (e.g \a, \', \\)
    //
    // If a comment is found, it is removed along with any whitespace immediately before it;
    // otherwise the string is returned unchanged.
    private static String removeComment(String s) {
        int commentStart = findCommentStart(s);
        if (commentStart < 0) {
            return s;
        }
        return CharMatcher.whitespace().trimTrailingFrom(s.substring(0, commentStart));
    }

    // Returns the index of the first unquoted '#' in the string, or -1 if there is none (in
    // which case the quotes in the string must be balanced).
    private static int findCommentStart(String s) {
        boolean inQuotes = false;
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c == '\'') {
                inQuotes = !inQuotes;
            } else if (c == '\\') {
                // Inside quotes a backslash escapes (and thus skips) the next character.
                if (inQuotes) {
                    i++;
                }
            } else if (c == '#' && !inQuotes) {
                return i;
            }
            // Anything else is just consumed.
        }
        checkArgument(!inQuotes, "mismatched quotes in: %s", s);
        return -1;
    }

    private CollationMapper() {}
}

View file

@ -0,0 +1,98 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.mapper;
import static org.unicode.cldr.api.AttributeKey.keyOf;
import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
import java.util.Optional;
import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrData;
import org.unicode.cldr.api.CldrData.PrefixVisitor;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.CldrValue;
import org.unicode.icu.tool.cldrtoicu.IcuData;
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
import org.unicode.icu.tool.cldrtoicu.RbPath;
/**
 * A mapper to collect day-period data from {@link CldrDataType#SUPPLEMENTAL SUPPLEMENTAL}
 * data via the paths:
 * <pre>{@code
 *   //supplementalData/dayPeriodRuleSet/*
 * }</pre>
 */
public final class DayPeriodsMapper {
    private static final PathMatcher RULESET =
        PathMatcher.of("supplementalData/dayPeriodRuleSet");
    private static final AttributeKey RULESET_TYPE = keyOf("dayPeriodRuleSet", "type");

    private static final PathMatcher RULES = PathMatcher.of("dayPeriodRules[@locales=*]");
    private static final AttributeKey RULES_LOCALES = keyOf("dayPeriodRules", "locales");

    private static final PathMatcher RULE = PathMatcher.of("dayPeriodRule[@type=*]");
    private static final AttributeKey RULE_TYPE = keyOf("dayPeriodRule", "type");

    private static final RbPath RB_LOCALES = RbPath.of("locales");

    /**
     * Processes data from the given supplier to generate day-period ICU data.
     *
     * @param src the CLDR data supplier to process.
     * @return the IcuData instance to be written to a file.
     */
    public static IcuData process(CldrDataSupplier src) {
        RuleSetVisitor ruleSets = new RuleSetVisitor();
        CldrData supplemental = src.getDataForType(SUPPLEMENTAL);
        supplemental.accept(ARBITRARY, ruleSets);
        return ruleSets.icuData;
    }

    // Top-level visitor which installs a rule visitor for each rule set it encounters.
    private static final class RuleSetVisitor implements PrefixVisitor {
        // Mutable ICU data collected into during visitation.
        private final IcuData icuData = new IcuData("dayPeriods", false);
        // The number of the most recently started set of rules (shared by all rule sets).
        private int setNum = 0;

        @Override
        public void visitPrefixStart(CldrPath prefix, Context ctx) {
            if (!RULESET.matches(prefix)) {
                return;
            }
            Optional<String> type = RULESET_TYPE.optionalValueFrom(prefix);
            ctx.install(new RuleVisitor(type));
        }

        // Visitor for the rules of a single rule set.
        private final class RuleVisitor implements PrefixVisitor {
            private final RbPath localePrefix;

            private RuleVisitor(Optional<String> type) {
                // If there's a given type, add it to the prefix path.
                this.localePrefix =
                    type.isPresent() ? RbPath.of("locales_" + type.get()) : RB_LOCALES;
            }

            @Override
            public void visitPrefixStart(CldrPath prefix, Context ctx) {
                if (!RULES.matchesSuffixOf(prefix)) {
                    return;
                }
                // Sets are arbitrarily identified by the string "setNN".
                setNum++;
                String setName = "set" + setNum;
                for (String locale : RULES_LOCALES.listOfValuesFrom(prefix)) {
                    icuData.add(localePrefix.extendBy(locale), setName);
                }
                ctx.install(this::visitRule);
            }

            // Adds every value attribute of a rule under the path of the current set. This
            // relies on setNum still identifying the set in which the rule appears.
            private void visitRule(CldrValue value) {
                if (!RULE.matchesSuffixOf(value.getPath())) {
                    return;
                }
                RbPath rulePath = RbPath.of("rules", "set" + setNum, RULE_TYPE.valueFrom(value));
                value.getValueAttributes().forEach(
                    (attribute, attributeValue) ->
                        icuData.add(
                            rulePath.extendBy(attribute.getAttributeName()), attributeValue));
            }
        }
    }

    private DayPeriodsMapper() {}
}

View file

@ -0,0 +1,183 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.mapper;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.Ordering.natural;
import static org.unicode.cldr.api.AttributeKey.keyOf;
import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.RESOLVED;
import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrData;
import org.unicode.cldr.api.CldrData.ValueVisitor;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrValue;
import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.LinkedHashMultimap;
import com.google.common.collect.SetMultimap;
import org.unicode.icu.tool.cldrtoicu.IcuData;
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer;
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.DynamicVars;
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
import org.unicode.icu.tool.cldrtoicu.RbPath;
import org.unicode.icu.tool.cldrtoicu.RbValue;
import org.unicode.icu.tool.cldrtoicu.SupplementalData;
/**
* Generate locale {@link IcuData} by transforming {@link CldrDataType#LDML LDML} data using a
* {@link PathValueTransformer}.
*
* <p>This is currently driven by the {@code ldml2icu_locale.txt} configuration file via a
* {@code RegexTransformer}, but could use any {@link PathValueTransformer} implementation.
*/
public final class LocaleMapper {
// Match territory paths so we can skip processing deprecated territories.
private static final PathMatcher TERRITORY = PathMatcher.of(
"ldml/localeDisplayNames/territories/territory[@type=*]");
private static final AttributeKey TERRITORY_TYPE = keyOf("territory", "type");
// The default calendar (only set if different from inherited parent value).
private static final RbPath RB_CALENDAR = RbPath.of("calendar", "default");
/**
 * Processes data from the given supplier to generate general locale data for the given locale
 * ID.
 *
 * <p>The transformed results are added first, then the date-time patterns are post-processed,
 * and finally the default calendar (if supplied for this locale) is added.
 *
 * @param localeId the locale ID to generate data for.
 * @param src the CLDR data supplier to process.
 * @param icuSpecialData additional ICU data (in the "icu:" namespace)
 * @param transformer the transformer to match and transform each CLDR path/value pair.
 * @param supplementalData additional necessary data derived from
 *     {@link org.unicode.cldr.api.CldrDataType#SUPPLEMENTAL SUPPLEMENTAL} data.
 * @return IcuData containing locale data for the given locale ID.
 */
public static IcuData process(
    String localeId,
    CldrDataSupplier src,
    Optional<CldrData> icuSpecialData,
    PathValueTransformer transformer,
    SupplementalData supplementalData) {

    IcuData localeData = new IcuData(localeId, true);

    // Write out the results into the IcuData class, preserving result grouping and expanding
    // path references as necessary.
    ResultsCollector resultsCollector = new ResultsCollector(transformer);
    localeData.addResults(resultsCollector.collectResultsFor(localeId, src, icuSpecialData));

    doDateTimeHack(localeData);

    supplementalData.getDefaultCalendar(localeData.getName())
        .ifPresent(calendar -> localeData.add(RB_CALENDAR, calendar));
    return localeData;
}
// This is an awful hack for post-processing the date-time format patterns to inject a 13th
// pattern at index 8, which is just a duplicate of the "medium" date-time pattern. The reasons
// for this are lost in the midst of time, but essentially there's ICU library code that just
// expects the value at index 8 to be this "default" value, and reads the date-time values
// starting at index 9.
//
// Before the hack would be at index 10, since there are 3 groups:
// "time" -> "date" -> "date-time"
// with 4 patterns each:
// "full" -> "long" -> "medium" -> "short"
private static void doDateTimeHack(IcuData icuData) {
for (RbPath rbPath : icuData.getPaths()) {
if (rbPath.length() == 3
&& rbPath.getSegment(0).equals("calendar")
&& rbPath.getSegment(2).equals("DateTimePatterns")) {
// This cannot be null and should not be empty, since the path is in this data.
List<RbValue> valuesToHack = icuData.get(rbPath);
checkArgument(valuesToHack.size() == 12,
"unexpected number of date/time patterns for '%s': %s", rbPath, valuesToHack);
valuesToHack.add(8, valuesToHack.get(10));
}
}
}
private static final class ResultsCollector {
private final PathValueTransformer transformer;
private final Set<RbPath> validRbPaths = new HashSet<>();
// WARNING: TreeMultimap() is NOT suitable here, even though it would sort the values for
// each key. The reason is that result comparison is not "consistent with equals", and
// TreeMultimap uses the comparator to decide if two elements are equal (not the equals()
// method), and it does this even if using the add() method of the sorted set (this is in
// fact in violation of the stated behaviour of Set#add).
private final SetMultimap<RbPath, Result> resultsByRbPath = LinkedHashMultimap.create();
ResultsCollector(PathValueTransformer transformer) {
this.transformer = checkNotNull(transformer);
}
ImmutableListMultimap<RbPath, Result> collectResultsFor(
String localeId, CldrDataSupplier src, Optional<CldrData> icuSpecialData) {
CldrData unresolved = src.getDataForLocale(localeId, UNRESOLVED);
CldrData resolved = src.getDataForLocale(localeId, RESOLVED);
DynamicVars varFn = p -> {
CldrValue cldrValue = resolved.get(p);
return cldrValue != null ? cldrValue.getValue() : null;
};
collectPaths(unresolved, varFn);
collectResults(resolved, varFn);
icuSpecialData.ifPresent(s -> collectSpecials(s, varFn));
ImmutableListMultimap.Builder<RbPath, Result> out = ImmutableListMultimap.builder();
out.orderValuesBy(natural());
for (RbPath rbPath : resultsByRbPath.keySet()) {
Set<Result> existingResults = resultsByRbPath.get(rbPath);
out.putAll(rbPath, existingResults);
for (Result fallback : transformer.getFallbackResultsFor(rbPath, varFn)) {
if (existingResults.stream().noneMatch(fallback::isFallbackFor)) {
out.put(rbPath, fallback);
}
}
}
return out.build();
}
private void collectPaths(CldrData unresolved, DynamicVars varFn) {
ValueVisitor collectPaths =
v -> transformer.transform(v, varFn).forEach(this::collectResultPath);
unresolved.accept(DTD, collectPaths);
}
private void collectResultPath(Result result) {
RbPath rbPath = result.getKey();
validRbPaths.add(rbPath);
if (rbPath.isAnonymous()) {
RbPath parent = rbPath.getParent();
checkState(!parent.isAnonymous(),
"anonymous paths should not be nested: %s", rbPath);
validRbPaths.add(parent);
}
}
void collectResults(CldrData resolved, DynamicVars varFn) {
ValueVisitor collectResults =
v -> transformer.transform(v, varFn).stream()
.filter(r -> validRbPaths.contains(r.getKey()))
.forEach(r -> resultsByRbPath.put(r.getKey(), r));
resolved.accept(DTD, collectResults);
}
private void collectSpecials(CldrData cldrData, DynamicVars varFn) {
cldrData.accept(DTD, v ->
transformer.transform(v, varFn).forEach(r -> resultsByRbPath.put(r.getKey(), r)));
}
}
private LocaleMapper() {}
}

View file

@ -0,0 +1,88 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.mapper;
import static com.google.common.base.Preconditions.checkState;
import static org.unicode.cldr.api.AttributeKey.keyOf;
import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrData;
import org.unicode.cldr.api.CldrData.PrefixVisitor;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.CldrValue;
import org.unicode.icu.tool.cldrtoicu.IcuData;
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
import org.unicode.icu.tool.cldrtoicu.RbPath;
import org.unicode.icu.tool.cldrtoicu.RbValue;
/**
 * A mapper to collect plural range data from {@link CldrDataType#SUPPLEMENTAL SUPPLEMENTAL}
 * data via the paths:
 * <pre>{@code
 *   //supplementalData/plurals/pluralRanges[@locales=*]/...
 * }</pre>
 */
public final class PluralRangesMapper {
    private static final PathMatcher RANGES =
        PathMatcher.of("supplementalData/plurals/pluralRanges[@locales=*]");
    private static final AttributeKey RANGES_LOCALES = keyOf("pluralRanges", "locales");

    private static final PathMatcher RANGE = PathMatcher.of("pluralRange[@start=*][@end=*]");
    private static final AttributeKey RANGE_START = keyOf("pluralRange", "start");
    private static final AttributeKey RANGE_END = keyOf("pluralRange", "end");
    private static final AttributeKey RANGE_RESULT = keyOf("pluralRange", "result");

    private static final RbPath RB_RULES = RbPath.of("rules");
    private static final RbPath RB_LOCALES = RbPath.of("locales");

    /**
     * Processes data from the given supplier to generate plural-range ICU data.
     *
     * @param src the CLDR data supplier to process.
     * @return the IcuData instance to be written to a file.
     */
    public static IcuData process(CldrDataSupplier src) {
        PluralRangesVisitor visitor = new PluralRangesVisitor();
        CldrData data = src.getDataForType(SUPPLEMENTAL);
        data.accept(ARBITRARY, visitor);
        return visitor.icuData;
    }

    /** Visitor which assigns a "setNN" label to each plural range group it encounters. */
    private static final class PluralRangesVisitor implements PrefixVisitor {
        private final IcuData icuData = new IcuData("pluralRanges", false);
        // Index of the next rule set; incremented once per matched "pluralRanges" prefix.
        private int setIndex = 0;
        // Label of the rule set currently being visited (null until the first match).
        private String ruleLabel = null;

        @Override
        public void visitPrefixStart(CldrPath prefix, Context ctx) {
            if (RANGES.matches(prefix)) {
                // Format with Locale.ROOT: "%d" is localized by default, so on a JVM whose
                // default locale uses non-ASCII digits the label written into the ICU data
                // would otherwise not be "set00", "set01", ...
                ruleLabel = String.format(java.util.Locale.ROOT, "set%02d", setIndex++);
                // Map every locale listed on this element to the label of its rule set.
                RANGES_LOCALES.listOfValuesFrom(prefix)
                    .forEach(l -> icuData.add(RB_LOCALES.extendBy(l), ruleLabel));
                // Child values of this prefix are handled by visitRange().
                ctx.install(this::visitRange);
            }
        }

        // Adds one (start, end, result) rule to the rule set currently being visited.
        private void visitRange(CldrValue value) {
            checkState(RANGE.matchesSuffixOf(value.getPath()),
                "unexpected path: %s", value.getPath());
            // Note: "range:start" and "range:end" are optional attributes, but the CLDR DTD
            // specifies a default via comments. They should probably be changed to just have a
            // default in the DTD (and possibly converted to use an enum here).
            icuData.add(RB_RULES.extendBy(ruleLabel),
                RbValue.of(
                    RANGE_START.valueFrom(value, "all"),
                    RANGE_END.valueFrom(value, "all"),
                    RANGE_RESULT.valueFrom(value)));
        }
    }

    private PluralRangesMapper() {}
}

View file

@ -0,0 +1,150 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.mapper;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static org.unicode.cldr.api.AttributeKey.keyOf;
import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrData;
import org.unicode.cldr.api.CldrData.PrefixVisitor;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrPath;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import org.unicode.icu.tool.cldrtoicu.IcuData;
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
import org.unicode.icu.tool.cldrtoicu.RbPath;
/**
* A mapper to collect plural data from {@link CldrDataType#SUPPLEMENTAL SUPPLEMENTAL} data via
* the paths:
* <pre>{@code
* //supplementalData/plurals[@type=*]/pluralRules[@locales=*]/pluralRule[@count=*]
* }</pre>
*
* <p>Each distinct set of rules is written once under {@code rules/setN}, and every locale is
* mapped to the name of the set it uses (under {@code locales} for cardinal rules and
* {@code locales_ordinals} for ordinal rules).
*/
public final class PluralsMapper {
// Matchers and attribute keys for the three nested path elements named in the class comment.
private static final PathMatcher PLURALS = PathMatcher.of("supplementalData/plurals[@type=*]");
private static final AttributeKey PLURALS_TYPE = keyOf("plurals", "type");
private static final PathMatcher RULES = PathMatcher.of("pluralRules[@locales=*]");
private static final AttributeKey RULES_LOCALES = keyOf("pluralRules", "locales");
private static final PathMatcher RULE = PathMatcher.of("pluralRule[@count=*]");
private static final AttributeKey RULE_COUNT = keyOf("pluralRule", "count");
// Maps the plural type ("cardinal" or "ordinal") to the ICU path prefix under which the
// locale-to-set mappings for that type are written.
private static final ImmutableMap<String, RbPath> ICU_PREFIX_MAP =
ImmutableMap.of("cardinal", RbPath.of("locales"), "ordinal", RbPath.of("locales_ordinals"));
/**
* Processes data from the given supplier to generate plural ICU data.
*
* <p>The supplemental data is visited twice with the same visitor: once for "cardinal" rules
* and then once for "ordinal" rules.
*
* @param src the CLDR data supplier to process.
* @return the IcuData instance to be written to a file.
*/
public static IcuData process(CldrDataSupplier src) {
PluralsVisitor visitor = new PluralsVisitor();
CldrData data = src.getDataForType(SUPPLEMENTAL);
// Note: We explicitly reset the type to mimic the order of the existing code, since this
// affects the set indices we generate during processing. Ideally this would all be immune
// to ordering (or just enforce DTD ordering) but right now it's very dependent on
// mimicking the order of the existing code to get identical output.
data.accept(ARBITRARY, visitor.setType("cardinal"));
data.accept(ARBITRARY, visitor.setType("ordinal"));
return visitor.icuData;
}
/** Top-level visitor which handles only the "plurals" elements of the current type. */
private static final class PluralsVisitor implements PrefixVisitor {
// Mutable ICU data collected into during visitation.
// In a post XML-aware API, is recording the XML file names really a good idea?
private final IcuData icuData = new IcuData("plurals", false);
// Filter for the type we are processing now (this could be removed if we don't mind which
// order the types are processed, and switching to DTD ordering would make it stable).
private String type = null;
// The distinct rule sets seen so far (across both passes); the position of a rule set in
// this list is the number used in its "setN" name.
private final List<ImmutableMap<String, String>> previousRules = new ArrayList<>();
// Hack method to allow a single type to be processed at a time (the visitor would otherwise
// happily handle both types in a single pass). We can't do this as two different visitors
// (one for each type) because the current behaviour relies on carrying over the calculated
// set numbers from one pass to the next. Once migration is complete we should revisit this
// and allow this visitor to work in a single pass (probably with DTD order for stability).
PluralsVisitor setType(String type) {
this.type = checkNotNull(type);
return this;
}
// Installs a RulesVisitor for a "plurals" element whose type matches the current filter;
// elements of any other type are ignored in this pass.
@Override
public void visitPrefixStart(CldrPath prefix, Context ctx) {
if (PLURALS.matches(prefix)) {
// Note: "plurals:type" is an optional attribute but the CLDR DTD specifies a
// default via comments. It should probably be changed to just have a default in
// the DTD.
if (PLURALS_TYPE.valueFrom(prefix, "cardinal").equals(type)) {
// ICU_PREFIX_MAP only has entries for "cardinal" and "ordinal"; for any other type
// get() returns null and the RulesVisitor constructor's checkNotNull() throws.
ctx.install(new RulesVisitor(ICU_PREFIX_MAP.get(type)));
}
}
}
/** Collects the locales and rules of each "pluralRules" element and writes them out. */
private final class RulesVisitor implements PrefixVisitor {
// ICU path prefix for the locale-to-set mappings of the type being processed.
private final RbPath icuPrefix;
// Locales and rules (count -> rule) of the element being visited; both collections are
// reused and cleared at the end of visitPrefixEnd().
private final List<String> locales = new ArrayList<>();
private final Map<String, String> rules = new LinkedHashMap<>();
RulesVisitor(RbPath icuPrefix) {
this.icuPrefix = checkNotNull(icuPrefix);
}
// Records the locales of a "pluralRules" element and installs a value visitor which
// records each "pluralRule" value under its "count" attribute.
@Override
public void visitPrefixStart(CldrPath prefix, Context ctx) {
if (RULES.matchesSuffixOf(prefix)) {
Iterables.addAll(locales, RULES_LOCALES.listOfValuesFrom(prefix));
ctx.install(value -> {
if (RULE.matchesSuffixOf(value.getPath())) {
rules.put(RULE_COUNT.valueFrom(value), value.getValue());
}
});
}
}
// Writes out the collected rules (if not seen before) and the locale-to-set mappings, then
// resets the collected state.
// NOTE(review): presumably invoked once per "pluralRules" element after its values have
// been visited - confirm against the PrefixVisitor contract.
@Override
public void visitPrefixEnd(CldrPath prefix) {
checkState(!locales.isEmpty(), "missing locale data for plurals: %s", prefix);
// Note: The original mapper code "sort of" coped with empty rules, but it's not
// completely well behaved (or documented), so since this doesn't happen in the
// current CLDR data, I decided to just prohibit it in the new code. Support can
// easily be added in once the expected semantics are clear.
checkState(!rules.isEmpty(), "missing rule data for plurals: %s", prefix);
// Have we seen this set of rules before? If so, reuse the existing index. Note
// that an IDE might report this call as suspicious because the key is not yet an
// immutable map (saves creating immutable maps just to check for inclusion) but
// this is fine because collection equality is based only on contents, not
// collection type.
int idx = previousRules.indexOf(rules);
if (idx == -1) {
// A new rule set: its index is the next free position in previousRules.
int newIdx = previousRules.size();
rules.forEach((k, v) -> icuData.add(RbPath.of("rules", "set" + newIdx, k), v));
// Since "rules" is mutable and reused, we must take an immutable copy here.
previousRules.add(ImmutableMap.copyOf(rules));
idx = newIdx;
}
String setName = "set" + idx;
locales.forEach(locale -> icuData.add(icuPrefix.extendBy(locale), setName));
// Reset the per-element state ready for the next element.
rules.clear();
locales.clear();
}
}
}
private PluralsMapper() {}
}

View file

@ -0,0 +1,145 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.mapper;
import static org.unicode.cldr.api.AttributeKey.keyOf;
import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicBoolean;
import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrData;
import org.unicode.cldr.api.CldrData.PrefixVisitor;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrPath;
import com.google.common.escape.UnicodeEscaper;
import org.unicode.icu.tool.cldrtoicu.IcuData;
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
import org.unicode.icu.tool.cldrtoicu.RbPath;
/**
* A mapper to collect plural data from {@link CldrDataType#LDML LDML} data via the paths:
* <pre>{@code
* //ldml/rbnf/rulesetGrouping[@type=*]/ruleset[@type=*]
* }</pre>
*/
// TODO: This class can almost certainly be written using RegexTransformer and a small config.
public final class RbnfMapper {
private static final PathMatcher RULE_SET =
PathMatcher.of("ldml/rbnf/rulesetGrouping[@type=*]/ruleset[@type=*]");
private static final AttributeKey GROUPING_TYPE = keyOf("rulesetGrouping", "type");
private static final AttributeKey RULESET_TYPE = keyOf("ruleset", "type");
private static final PathMatcher RBNF_RULE = PathMatcher.of("rbnfrule");
private static final AttributeKey RBNF_VALUE = keyOf("rbnfrule", "value");
private static final AttributeKey RBNF_RADIX = keyOf("rbnfrule", "radix");
private static final AttributeKey RULESET_ACCESS = keyOf("ruleset", "access");
private static final RbPath RB_PARENT = RbPath.of("%%Parent");
// This is the ICU path prefix, below which everything generated by this visitor will go.
private static final RbPath RB_ROOT = RbPath.of("RBNFRules");
/**
* Processes data from the given supplier to generate RBNF data for a set of locale IDs.
*
* @param localeId the locale ID to generate data for.
* @param src the CLDR data supplier to process.
* @param icuSpecialData additional ICU data (in the "icu:" namespace)
* @return IcuData containing RBNF data for the given locale ID.
*/
public static IcuData process(
String localeId, CldrDataSupplier src, Optional<CldrData> icuSpecialData) {
// Using DTD order is essential here because the RBNF paths contain ordered elements,
// so we must ensure that they appear in sorted order (otherwise we'd have to do more
// work at this end to re-sort the results).
RulesetVisitor visitor = new RulesetVisitor(localeId);
icuSpecialData.ifPresent(s -> s.accept(DTD, visitor));
src.getDataForLocale(localeId, UNRESOLVED).accept(DTD, visitor);
return visitor.icuData;
}
static final class RulesetVisitor implements PrefixVisitor {
private final IcuData icuData;
private RulesetVisitor(String localeId) {
this.icuData = new IcuData(localeId, true);
}
@Override public void visitPrefixStart(CldrPath prefix, Context context) {
if (RULE_SET.matchesPrefixOf(prefix)) {
RbPath rbPath = RB_ROOT.extendBy(GROUPING_TYPE.valueFrom(prefix));
String rulesetType = RULESET_TYPE.valueFrom(prefix);
boolean isStrict = !"lenient-parse".equals(rulesetType);
// This is rather hacky because the access attribute lives on the parent path
// element, but we cannot use it until we visit the child values (because it's a
// value attribute and will not be in the prefix path. So we need to add the header
// only once, just before we start adding the values relating to the child
// elements, so we need a flag.
//
// This cannot be a boolean field since it must be "effectively final".
AtomicBoolean hasHeader = new AtomicBoolean(false);
context.install(
value -> {
if (RBNF_RULE.matchesSuffixOf(value.getPath())) {
if (!hasHeader.get()) {
boolean isPrivate =
RULESET_ACCESS.valueFrom(value, "public").equals("private");
icuData.add(rbPath, (isPrivate ? "%%" : "%") + rulesetType + ":");
hasHeader.set(true);
}
String rulePrefix = "";
if (isStrict) {
String basePrefix = RBNF_VALUE.valueFrom(value);
rulePrefix = RBNF_RADIX.optionalValueFrom(value)
.map(r -> basePrefix + "/" + r)
.orElse(basePrefix);
rulePrefix += ": ";
}
icuData.add(
rbPath,
rulePrefix + ESCAPE_RBNF_DATA.escape(value.getValue()));
}
});
}
}
/*
* Convert characters outside the range U+0020 to U+007F to Unicode escapes, and convert
* backslash to a double backslash. This class is super slow for non-ASCII escaping due to
* using "String.format()", however there's < 100 values that need any escaping, so it's
* fine.
*/
private static final UnicodeEscaper ESCAPE_RBNF_DATA = new UnicodeEscaper() {
private final char[] DOUBLE_BACKSLASH = "\\\\".toCharArray();
private final char[] LEFT_ANGLE = "<".toCharArray();
private final char[] RIGHT_ANGLE = ">".toCharArray();
@Override
protected char[] escape(int cp) {
// Returning null means "do not escape".
switch (cp) {
case '\\':
return DOUBLE_BACKSLASH;
case '←':
return LEFT_ANGLE;
case '→':
return RIGHT_ANGLE;
default:
if (0x0020 <= cp && cp <= 0x007F) {
return null;
} else if (cp <= 0xFFFF) {
return String.format("\\u%04X", cp).toCharArray();
}
return String.format("\\U%08X", cp).toCharArray();
}
}
};
}
}

View file

@ -0,0 +1,119 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.mapper;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.collect.Ordering.natural;
import static org.unicode.cldr.api.CldrData.PathOrder.NESTED_GROUPING;
import java.util.Set;
import org.unicode.cldr.api.CldrData;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrValue;
import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.LinkedHashMultimap;
import com.google.common.collect.SetMultimap;
import org.unicode.icu.tool.cldrtoicu.IcuData;
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer;
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
import org.unicode.icu.tool.cldrtoicu.RbPath;
/**
 * Generate supplemental {@link IcuData} by transforming {@link CldrDataType#SUPPLEMENTAL
 * SUPPLEMENTAL} data using a {@link PathValueTransformer}.
 *
 * <p>This is currently driven by the {@code ldml2icu_supplemental.txt} configuration file via a
 * {@code RegexTransformer}, but could use any {@link PathValueTransformer} implementation.
 */
public final class SupplementalMapper {
    // Placeholder path segment which is replaced by a counter to impose encounter order.
    private static final RbPath RB_FIFO = RbPath.of("<FIFO>");

    /**
     * Processes a subset of supplemental data from the given supplier.
     *
     * @param src the CLDR data supplier to process.
     * @param transformer the transformer to match and transform each CLDR path/value pair.
     * @param icuName the name for the generated IcuData.
     * @param includePaths a matcher to select the CLDR paths to be transformed.
     * @return An IcuData instance containing the specified subset of supplemental data with the
     *     given ICU name.
     */
    // TODO: Improve external data splitting and remove need for a PathMatcher here.
    public static IcuData process(
        CldrDataSupplier src, PathValueTransformer transformer, String icuName,
        PathMatcher includePaths) {

        ResultsCollector collector = new ResultsCollector(includePaths, transformer);
        // Write out the results into the IcuData class, preserving result grouping and expanding
        // path references as necessary.
        IcuData icuData = new IcuData(icuName, false);
        icuData.addResults(collector.getResults(src));
        return icuData;
    }

    /** Accumulates transformation results for the selected paths, keyed by bundle path. */
    private static final class ResultsCollector {
        private final PathMatcher pathMatcher;
        private final PathValueTransformer transformer;

        // WARNING: TreeMultimap() is NOT suitable here, even though it would sort the values for
        // each key. The reason is that result comparison is not "consistent with equals", and
        // TreeMultimap uses the comparator to decide if two elements are equal (not the equals()
        // method), and it does this even if using the add() method of the sorted set (this is in
        // fact in violation of the stated behaviour of Set#add).
        private final SetMultimap<RbPath, Result> resultsByRbPath = LinkedHashMultimap.create();

        // Incremented once per visited (matching) CLDR value; substituted for "<FIFO>" segments.
        private int fifoCounter = 0;

        ResultsCollector(PathMatcher pathMatcher, PathValueTransformer transformer) {
            this.pathMatcher = checkNotNull(pathMatcher);
            this.transformer = checkNotNull(transformer);
        }

        // Transforms one CLDR value (if its path is selected) and stores the results.
        private void visit(CldrValue value) {
            if (pathMatcher.matchesPrefixOf(value.getPath())) {
                for (Result r : transformer.transform(value)) {
                    RbPath rbPath = r.getKey();
                    if (rbPath.contains(RB_FIFO)) {
                        // The fifo counter needs to be formatted with leading zeros for sorting.
                        // Locale.ROOT is required because "%d" is localized by default, and a
                        // JVM whose default locale uses non-ASCII digits would otherwise
                        // produce labels which differ from (and sort differently to) "<0001>".
                        String fifoLabel =
                            String.format(java.util.Locale.ROOT, "<%04d>", fifoCounter);
                        rbPath = rbPath.mapSegments(s -> s.equals("<FIFO>") ? fifoLabel : s);
                    }
                    resultsByRbPath.put(rbPath, r);
                }
                // All results of one CLDR value share the same counter value.
                fifoCounter++;
            }
        }

        /**
         * Visits the supplemental data and returns the results per resource bundle path, with
         * each path's values in natural order and any unmatched fallback results added.
         */
        ImmutableListMultimap<RbPath, Result> getResults(CldrDataSupplier supplier) {
            // DTD and NESTED_GROUPING order differ because of how the magic <FIFO> label works (it
            // basically enforces "encounter order" onto things in unlabeled sequences, which matches
            // the old behaviour). If it wouldn't break anything, it might be worth moving to DTD order
            // to remove any lingering implicit dependencies on the CLDR data behaviour.
            CldrData supplementalData = supplier.getDataForType(CldrDataType.SUPPLEMENTAL);
            // Dynamic variables are looked up in the supplemental data; missing paths yield null.
            PathValueTransformer.DynamicVars varFn = p -> {
                CldrValue cldrValue = supplementalData.get(p);
                return cldrValue != null ? cldrValue.getValue() : null;
            };

            supplementalData.accept(NESTED_GROUPING, this::visit);

            ImmutableListMultimap.Builder<RbPath, Result> out = ImmutableListMultimap.builder();
            out.orderValuesBy(natural());
            for (RbPath rbPath : resultsByRbPath.keySet()) {
                Set<Result> existingResults = resultsByRbPath.get(rbPath);
                out.putAll(rbPath, existingResults);
                // Add each fallback that is not a fallback for any result we already have.
                for (Result fallback : transformer.getFallbackResultsFor(rbPath, varFn)) {
                    if (existingResults.stream().noneMatch(fallback::isFallbackFor)) {
                        out.put(rbPath, fallback);
                    }
                }
            }
            return out.build();
        }
    }

    private SupplementalMapper() {}
}

View file

@ -0,0 +1,183 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.mapper;
import static com.google.common.base.CharMatcher.whitespace;
import static com.google.common.base.Preconditions.checkNotNull;
import static java.nio.file.StandardOpenOption.CREATE;
import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING;
import static org.unicode.cldr.api.AttributeKey.keyOf;
import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Optional;
import java.util.function.Function;
import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrData.ValueVisitor;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrValue;
import org.unicode.icu.tool.cldrtoicu.IcuData;
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
import org.unicode.icu.tool.cldrtoicu.RbPath;
import org.unicode.icu.tool.cldrtoicu.RbValue;
import com.ibm.icu.text.Transliterator;
/**
* A mapper to collect transliteration data from {@link CldrDataType#SUPPLEMENTAL SUPPLEMENTAL}
* data via the paths:
* <pre>{@code
* //supplementalData/transforms/transform/tRule
* }</pre>
*
* <p>This mapper also writes out the transform rule files into a specified directory.
*/
public final class TransformsMapper {
private static final PathMatcher TRULE =
PathMatcher.of("supplementalData/transforms/transform/tRule");
private static final AttributeKey TRANSFORM_SOURCE = keyOf("transform", "source");
private static final AttributeKey TRANSFORM_TARGET = keyOf("transform", "target");
private static final AttributeKey TRANSFORM_DIRECTION = keyOf("transform", "direction");
private static final AttributeKey TRANSFORM_VARIANT = keyOf("transform", "variant");
private static final AttributeKey TRANSFORM_VISIBILITY = keyOf("transform", "visibility");
private static final AttributeKey TRANSFORM_ALIAS = keyOf("transform", "alias");
private static final AttributeKey TRANSFORM_BACKALIAS = keyOf("transform", "backwardAlias");
private static final RbPath RB_TRANSLITERATOR_IDS = RbPath.of("RuleBasedTransliteratorIDs");
// This decomposes some accented characters with accents in the "Mn" (Mark, non-spacing)
// Unicode range by representing the accents in the \u1234 hex form. For example, it converts:
// "ɪ̈" to "ɪ\u0308" and "ɯ̽" to "ɯ\u033D". This does not affect all accented character (e.g.
// ä) and the precise reason this is done was never clearly documented in the code from which
// this code was derived (but it seems necessary to generate the expected output in the
// transliteration rules).
//
// This is one of the only, apparently necessary direct dependencies on the icu4j library.
// TODO: Make this depend icu4j from this project rather than the older version from CLDR.
private static final Transliterator FIXUP = Transliterator.getInstance("[:Mn:]any-hex/java");
// Don't rename these enum constants, they need to match the data directly.
private enum Direction { forward, backward, both }
private enum Visibility { internal, external }
/**
* Processes data from the given supplier to generate transliteration ICU data, writing
* auxiliary transliteration rule files in the process. This is a potentially destructive call
* and will overwrite existing transformation rule files in the specified directory.
*
* @param src the CLDR data supplier to process.
* @param ruleFileOutputDir the directory into which transliteration rule files will be written.
* @return the IcuData instance to be written to a file.
*/
public static IcuData process(CldrDataSupplier src, Path ruleFileOutputDir) {
RuleVisitor visitor = new RuleVisitor(p -> {
Path file = ruleFileOutputDir.resolve(p);
try {
return new PrintWriter(Files.newBufferedWriter(file, CREATE, TRUNCATE_EXISTING));
} catch (IOException e) {
throw new RuntimeException("error opening file: " + file, e);
}
});
src.getDataForType(SUPPLEMENTAL).accept(DTD, visitor);
return visitor.icuData;
}
private static class RuleVisitor implements ValueVisitor {
private final IcuData icuData = new IcuData("root", false);
private final Function<Path, PrintWriter> outFn;
RuleVisitor(Function<Path, PrintWriter> outFn) {
this.outFn = checkNotNull(outFn);
icuData.setFileComment("File: root.txt");
// I have _no_ idea what any of this is about, I'm just trying to mimic the original
// (complex and undocumented) code in "ConvertTransforms.java".
icuData.add(RbPath.of("TransliteratorNamePattern"), "{0,choice,0#|1#{1}|2#{1}-{2}}");
// Note that this quoting of path segments is almost certainly unnecessary. It matches
// the old "ConvertTransforms" behaviour, but '%' is used elsewhere without quoting, so
// it seems very likely that it's not needed here.
// TODO: Once migration done, remove quotes here & check in RbPath for unwanted quotes.
icuData.add(RbPath.of("\"%Translit%Hex\""), "%Translit%Hex");
icuData.add(RbPath.of("\"%Translit%UnicodeName\""), "%Translit%UnicodeName");
icuData.add(RbPath.of("\"%Translit%UnicodeChar\""), "%Translit%UnicodeChar");
// Special case, where Latin is a no-op.
icuData.add(RbPath.of("TransliterateLATIN"), RbValue.of("", ""));
// Some hard-coded special case mappings.
icuData.add(
RB_TRANSLITERATOR_IDS.extendBy("Tone-Digit", "alias"),
"Pinyin-NumericPinyin");
icuData.add(
RB_TRANSLITERATOR_IDS.extendBy("Digit-Tone", "alias"),
"NumericPinyin-Pinyin");
}
@Override public void visit(CldrValue value) {
// The other possible element is "comment" but we currently ignore those.
if (TRULE.matches(value.getPath())) {
String source = getExpectedOptionalAttribute(value, TRANSFORM_SOURCE);
String target = getExpectedOptionalAttribute(value, TRANSFORM_TARGET);
Optional<String> variant = TRANSFORM_VARIANT.optionalValueFrom(value);
String baseFilename = source + "_" + target;
String filename =
variant.map(v -> baseFilename + "_" + v).orElse(baseFilename) + ".txt";
writeRootIndexEntry(value, source, target, variant, filename);
writeDataFile(filename, value);
}
}
private void writeDataFile(String filename, CldrValue value) {
try (PrintWriter out = outFn.apply(Paths.get(filename))) {
out.println("\uFEFF# © 2016 and later: Unicode, Inc. and others.");
out.println("# License & terms of use: http://www.unicode.org/copyright.html#License");
out.println("#");
out.println("# File: " + filename);
out.println("# Generated from CLDR");
out.println("#");
out.println();
out.println(FIXUP.transliterate(whitespace().trimFrom(value.getValue())));
out.println();
}
}
private void writeRootIndexEntry(
CldrValue value, String source, String target, Optional<String> variant, String filename) {
Visibility visibility = TRANSFORM_VISIBILITY.valueFrom(value, Visibility.class);
String status = visibility == Visibility.internal ? "internal" : "file";
Direction dir = TRANSFORM_DIRECTION.valueFrom(value, Direction.class);
if (dir != Direction.backward) {
String id = getId(source, target, variant);
TRANSFORM_ALIAS.listOfValuesFrom(value)
.forEach(a -> icuData.add(RB_TRANSLITERATOR_IDS.extendBy(a, "alias"), id));
RbPath rbPrefix = RB_TRANSLITERATOR_IDS.extendBy(id, status);
icuData.add(rbPrefix.extendBy("resource:process(transliterator)"), filename);
icuData.add(rbPrefix.extendBy("direction"), "FORWARD");
}
if (dir != Direction.forward) {
String id = getId(target, source, variant);
TRANSFORM_BACKALIAS.listOfValuesFrom(value)
.forEach(a -> icuData.add(RB_TRANSLITERATOR_IDS.extendBy(a, "alias"), id));
RbPath rbPrefix = RB_TRANSLITERATOR_IDS.extendBy(id, status);
icuData.add(rbPrefix.extendBy("resource:process(transliterator)"), filename);
icuData.add(rbPrefix.extendBy("direction"), "REVERSE");
}
}
}
// Builds a transliterator ID of the form "from-to", or "from-to/variant" when a variant
// is present.
private static String getId(String from, String to, Optional<String> variant) {
    StringBuilder id = new StringBuilder().append(from).append('-').append(to);
    variant.ifPresent(v -> id.append('/').append(v));
    return id.toString();
}
// Returns the value of an attribute which is declared optional but is required here.
// Throws IllegalArgumentException (naming the key and value) if the attribute is absent.
private static String getExpectedOptionalAttribute(CldrValue value, AttributeKey key) {
    Optional<String> attribute = key.optionalValueFrom(value);
    if (!attribute.isPresent()) {
        throw new IllegalArgumentException(
            String.format("missing data for %s in: %s", key, value));
    }
    return attribute.get();
}
}

View file

@ -0,0 +1,26 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.regex;
import com.google.common.base.Ascii;
/**
 * The instructions which may appear in a result specification (e.g. "values=..." or
 * "fallback=...").
 */
enum Instruction {
    /** Defines processing and transformation of CLDR values. */
    VALUES,

    /** Defines fallback values to be used if no result was matched in a resource bundle. */
    FALLBACK,

    /** Defines an xpath used to hack result equality to make deduplication work. */
    // TODO: Figure out how to remove this hack (probably by supporting partial matches).
    BASE_XPATH,

    /**
     * Defines whether result values should be appended one at a time to a resource bundle
     * (default) or grouped into a separate array.
     */
    GROUP;

    /**
     * Returns the instruction for its ID as it appears in the configuration file. The ID is
     * upper-cased (ASCII only) and must then equal the name of one of the constants above,
     * otherwise {@link IllegalArgumentException} is thrown.
     */
    static Instruction forId(String id) {
        String constantName = Ascii.toUpperCase(id);
        return valueOf(constantName);
    }
}

View file

@ -0,0 +1,58 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.regex;
import static com.google.common.base.CharMatcher.whitespace;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import java.util.List;
import java.util.function.Function;
import com.google.common.base.CharMatcher;
import com.google.common.base.Splitter;
/**
 * A named function, taking a list of string arguments and returning a string, which
 * {@code RegexTransformer} uses to convert CLDR values in special ways. See also
 * {@code IcuFunctions}.
 */
public final class NamedFunction implements Function<List<String>, String> {
    // Function names are restricted to lower-case ASCII letters and underscore.
    private static final CharMatcher VALID_NAME_CHARS =
        CharMatcher.is('_').or(CharMatcher.inRange('a', 'z'));
    // Argument lists are comma separated, with whitespace around each argument ignored.
    private static final Splitter ARG_LIST_SPLITTER =
        Splitter.on(',').trimResults(whitespace());

    /**
     * Returns a function with the given name which accepts at most {@code argCount}
     * arguments and delegates to {@code fn}.
     *
     * @throws IllegalArgumentException if the name is empty or contains anything other than
     *     lower-case ASCII letters and underscore, or if {@code argCount} is negative.
     */
    public static NamedFunction create(
        String name, int argCount, Function<List<String>, String> fn) {
        return new NamedFunction(name, argCount, fn);
    }

    private final String name;
    private final int maxArgs;
    private final Function<List<String>, String> delegate;

    private NamedFunction(String name, int argCount, Function<List<String>, String> fn) {
        boolean isValidName = !name.isEmpty() && VALID_NAME_CHARS.matchesAllOf(name);
        checkArgument(isValidName,
            "invalid function name (must be lower_case_underscore): %s", name);
        checkArgument(argCount >= 0, "invalid argument count: %s", argCount);
        this.name = name;
        this.maxArgs = argCount;
        this.delegate = checkNotNull(fn);
    }

    /**
     * Splits the given comma separated argument list and invokes this function with the
     * resulting arguments.
     *
     * @throws IllegalArgumentException if there are more arguments than this function allows.
     * @throws NullPointerException if the underlying function returns null.
     */
    public String call(String argList) {
        List<String> splitArgs = ARG_LIST_SPLITTER.splitToList(argList);
        int actualCount = splitArgs.size();
        checkArgument(actualCount <= maxArgs,
            "too many arguments for function '%s' (max=%s)", name, maxArgs);
        String result = apply(splitArgs);
        return checkNotNull(result,
            "named functions must never return null: function=%s", name);
    }

    /** Returns the name by which this function is referenced. */
    public String getName() {
        return name;
    }

    /** Invokes the underlying function directly, without any checking of the arguments. */
    @Override
    public String apply(List<String> args) {
        return delegate.apply(args);
    }
}

View file

@ -0,0 +1,173 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.regex;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.ImmutableListMultimap.toImmutableListMultimap;
import static com.google.common.collect.ImmutableSetMultimap.toImmutableSetMultimap;
import static java.util.function.Function.identity;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.regex.Pattern;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.CldrValue;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.ImmutableSetMultimap;
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer;
import org.unicode.icu.tool.cldrtoicu.RbPath;
/**
* Path/value transformer configured by {@code ldml2icu_xxx.txt} mapping and configuration files.
* See {@code ldml2icu_readme.txt} for details on the configuration file format and
* {@link PathValueTransformer} for the public API description and usage.
*
* <p>This class is thread safe.
*/
// TODO: Rewrite the readme to match current behaviour and describe edge cases properly.
public final class RegexTransformer extends PathValueTransformer {
/**
* Returns a new transformer based on transformation rules defined in the given configuration
* file contents, and using the specified functions for resolving ICU values.
*/
public static PathValueTransformer fromConfigLines(
List<String> lines, NamedFunction... functions) {
return new RegexTransformer(RuleParser.parseConfig(lines, Arrays.asList(functions)));
}
// Map of path prefixes grouped by DTD type (for early efficient filtering of paths).
private final ImmutableSetMultimap<CldrDataType, String> prefixMap;
// Transformation rules loading from the configuration file, grouped by path prefix.
private final ImmutableListMultimap<String, Rule> rulesMap;
// Functions which can generate a fallback value from a given resource bundle path.
private final ImmutableList<BiFunction<RbPath, DynamicVars, Optional<Result>>> fallbackFunctions;
// Records the total set of rules, removing them as they are matched. Used for reporting any
// unused rules for debugging purposes.
private final Set<Rule> unusedRules = new LinkedHashSet<>();
private RegexTransformer(List<Rule> rules) {
this.prefixMap =
rules.stream().collect(toImmutableSetMultimap(Rule::getDataType, Rule::getPathPrefix));
this.rulesMap =
rules.stream().collect(toImmutableListMultimap(Rule::getPathPrefix, identity()));
this.fallbackFunctions =
rules.stream().flatMap(Rule::getFallbackFunctions).collect(toImmutableList());
// Add all rules first and remove as they are matched.
this.unusedRules.addAll(rules);
}
@Override
public ImmutableList<Result> transform(CldrValue value) {
return transform(value, p -> null);
}
@Override
public ImmutableList<Result> transform(CldrValue value, DynamicVars varLookupFn) {
// This early rejection of non-matching paths, combined with "bucketing" the rules by path
// path prefix for easy lookup dramatically reduces the transformation time.
String pathPrefix = getPathPrefix(value);
if (!prefixMap.get(value.getDataType()).contains(pathPrefix)) {
return ImmutableList.of();
}
// Even though this is just derived from the value, resolve it here and pass it into each
// rule to avoid recalculating the same thing every time.
String fullXPath = getFullXPathWithoutSortIndices(value);
// Bucketing the rules by the path prefix means that each incoming value is only tested
// against likely matches. This reduces the number of tests per value by about 10x.
for (Rule rule : rulesMap.get(pathPrefix)) {
// We break after the first matching rule, since there is an implicit assumption
// that no paths will match more than one rule.
// TODO: Add a debug mode that checks that only one rule matches any given CLDR path.
ImmutableList<Result> results = rule.transform(value, fullXPath, varLookupFn);
if (!results.isEmpty()) {
unusedRules.remove(rule);
return results;
}
}
return ImmutableList.of();
}
// All "leaf" paths must have at least two elements, so we can find the "prefix" which is
// the first element after the DTD root. This corresponds to the value extracted via
// PATH_SPEC_PREFIX in the parser.
private static String getPathPrefix(CldrValue value) {
CldrPath prefix = value.getPath();
checkArgument(prefix.getLength() >= 2, "unexpectedly short path: %s", prefix);
while (prefix.getLength() > 2) {
prefix = prefix.getParent();
}
return prefix.getName();
}
// A regex to capture any sort-indices in the full path string (which must be removed).
private static final Pattern SORT_INDEX = Pattern.compile("(/\\w+)#[0-9]+");
// Note that the full path we get here contains the "sort index" suffix for ORDERED
// elements. This means that some element names are "foo#N" where N is the sort index.
// Since the regex transformer works around "ordered elements" in a completely different
// way and doesn't have them in the regular expressions, we can just remove them.
private static String getFullXPathWithoutSortIndices(CldrValue v) {
String fullPath = v.getFullPath();
for (CldrPath p = v.getPath(); p != null; p = p.getParent()) {
if (p.getSortIndex() != -1) {
// Only do expensive regex stuff if there's an "ordered" element with a sort index.
return SORT_INDEX.matcher(fullPath).replaceAll("$1");
}
}
// No path parts have a sort index, so the original full path string is safe to return.
return fullPath;
}
@Override
public ImmutableList<Result> getFallbackResultsFor(RbPath rbPath, DynamicVars varLookupFn) {
return fallbackFunctions.stream()
.map(f -> f.apply(rbPath, varLookupFn))
.filter(Optional::isPresent)
.map(Optional::get)
.collect(toImmutableList());
}
@Override public String toString() {
StringWriter buf = new StringWriter();
PrintWriter out = new PrintWriter(buf);
out.println(getClass().getName() + "{");
out.println(" Rules: " + rulesMap.size());
if (!unusedRules.isEmpty()) {
out.println(" Unused Rules:");
unusedRules.forEach(
r -> out.format(" [line=%3d] %s\n", r.getLineNumber(), r.getXpathSpec()));
}
out.println('}');
out.flush();
return buf.toString();
}
// Package use helper for substituting single-character place-holders like '$N' or '%X'.
static String substitute(String s, char token, Function<Character, String> replaceFn) {
if (s.indexOf(token) == -1) {
return s;
}
StringBuilder out = new StringBuilder();
int i = 0;
for (int j = s.indexOf(token); j != -1; i = j + 2, j = s.indexOf(token, i)) {
char varChar = s.charAt(j + 1);
String replacement =
checkNotNull(replaceFn.apply(varChar), "no such variable %s%s", token, varChar);
out.append(s, i, j).append(replacement);
}
return out.append(s.substring(i)).toString();
}
}

View file

@ -0,0 +1,632 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.regex;
import static com.google.common.base.CharMatcher.whitespace;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkElementIndex;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static java.util.Comparator.comparing;
import static java.util.Comparator.nullsLast;
import static org.unicode.cldr.api.CldrPath.parseDistinguishingPath;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.CldrValue;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.DynamicVars;
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
import org.unicode.icu.tool.cldrtoicu.RbPath;
/**
* A specification for building a result from the arguments in a matched xpath. Results always
* hold a reference to their originating specification to allow them to be ordered in the same
* order as the corresponding specifications in the configuration file.
*/
final class ResultSpec {
// Subtle ordering for results to ensure "config file order" for things in the same
// resource bundle while being "friendly" towards a global ordering. This is NOT consistent
// with equals if duplicate results exist.
//
// This is ESSENTIAL for correct grouping and ordering within resource bundles.
//
// In normal use this is expected only to be used to reorder results within a resource
// bundle (i.e. those sharing the same resource bundle path "key"). Resource bundles
// themselves can just be managed in "visitation order" or similar.
//
// Ordering priority is:
// 1: Result key (resource bundle): Groups results by resource bundle.
// 2: Result specification line number: Orders resource bundle contents by "file order".
// 3: Result distinguishing xpath: Tie breaking if duplicates are not yet removed.
//
// Note that this currently uses the String representation of the resource bundle path (key)
// as the primary order to match legacy behaviour. However it would be better to use the
// natural lexicographical RbPath order (the difference relates to having '/' as the
// separator in the string representation of the path). The string form of a path is a bad
// choice because some paths can contain a literal '/', which makes ordering problematic in
// rare cases. However changing this will have the effect of reordering path elements, which
// while it should be safe, must be done with caution.
// TODO: Fix this to use RbPath ordering and NOT the String representation
//
// The path of a result may be absent, so the null-friendly comparator must be applied to
// the extracted path itself. Wrapping the whole key-extracting comparator in nullsLast()
// would only guard against null results, and a tie-break between two results without a path
// would then fail with a NullPointerException.
private static final Comparator<AbstractResult> RESULT_ORDERING =
    Comparator.<AbstractResult, String>comparing(r -> r.getKey().toString())
        .thenComparing(r -> r.getSpec().lineNumber)
        .<CldrPath>thenComparing(
            r -> r.getPath().orElse(null), nullsLast(Comparator.<CldrPath>naturalOrder()));
// Splitter for any values (either in CLDR data or results specifications). Values are split
// on whitespace and empty strings are discarded. The only time values are split differently
// is when quoting exists in the "values" instruction.
private static final Splitter VALUE_SPLITTER = Splitter.on(whitespace()).omitEmptyStrings();
// Matcher for "&foo_bar(a,b,c)" which captures the function name (group 1) and the complete,
// non-empty argument list (group 2).
private static final Pattern FUNCTION = Pattern.compile("\\&(\\w++)\\(([^\\)]++)\\)");
// Resource bundle path specification with placeholders (e.g. "/foo/$1/bar") exactly as it
// appears in the configuration file.
private final String rbPathSpec;
// Declared instructions with which to generate result values (see Instruction).
private final ImmutableMap<Instruction, VarString> instructions;
// The index of the xpath argument whose value should be split to create multiple results,
// or -1 if no argument is split.
// This mechanism is used when an xpath attribute is a space separated list of values and
// one result should be created for each value (e.g. [@territories="AA BB CC"], but you want
// a resource bundle for each region code, such as "foo/AA/bar", "foo/BB/bar", "foo/CC/bar").
// At most one argument is ever split (corresponding to the first unquoted placeholder in
// the resource bundle path specification).
private final int splitArgIndex;
// The line number of the result specification in the file which defines the ordering of
// results within a resource bundle. This needn't be a line number, but must be unique for
// each specification.
private final int lineNumber;
// The named functions available to the parser. Ideally the rules and result specifications
// would be an inner class of some kind of context/environment and just share this.
private final ImmutableMap<String, NamedFunction> icuFunctions;
// The map of dynamic variables (looked up from CldrPaths when a rule is resolved).
private final Function<Character, CldrPath> dynamicVarFn;
// Creates a result specification from a resource bundle path specification and its
// instructions. The instruction and function maps are copied, and the index of the argument
// to be split (if any) is derived from the path specification.
ResultSpec(
    String rbPathSpec,
    Map<Instruction, VarString> instructions,
    int lineNumber,
    Map<String, NamedFunction> icuFunctions,
    Function<Character, CldrPath> dynamicVarFn) {
    String pathSpec = checkNotNull(rbPathSpec);
    this.rbPathSpec = pathSpec;
    this.instructions = ImmutableMap.copyOf(instructions);
    this.splitArgIndex = getSplitArgIndex(pathSpec);
    this.lineNumber = lineNumber;
    this.icuFunctions = ImmutableMap.copyOf(icuFunctions);
    this.dynamicVarFn = checkNotNull(dynamicVarFn);
}
/**
 * Transforms a path/value into a sequence of results. The given matcher has successfully
 * matched the path and contains the captured arguments corresponding to $1..$N in the
 * various result specification strings.
 *
 * <p>If this specification has a "splittable" argument whose captured value contains more
 * than one whitespace separated item, one result is produced per item (lazily, as the
 * returned stream is consumed); otherwise exactly one result is produced.
 *
 * @throws NullPointerException if any capturing group of the matcher did not participate
 *     in the match.
 */
Stream<Result> transform(
    CldrValue value, Matcher m, DynamicVars varLookupFn) {
    // Discard group(0) since that's always the full xpath that was matched, and we don't
    // need that any more (so "$N" is args.get(N - 1)).
    List<String> args = new ArrayList<>();
    for (int i = 1; i <= m.groupCount(); i++) {
        // Fail early (and clearly) for optional groups which did not participate in the
        // match, rather than letting a null argument cause problems during substitution.
        args.add(checkNotNull(m.group(i),
            "captured regex arguments must always be present\n"
                + "(use an non-capturing groups for optional arguments): %s", m.pattern()));
    }
    // The first unquoted argument in any resource bundle path declaration, is defined as
    // being "splittable". Typically this happens if the value of the captured xpath
    // argument is expected to be a list of items.
    //
    // In this case, we generate one result for each individual argument, replacing the
    // appropriate captured list with each split value in turn. Thus with original
    // arguments:
    //   ["foo", "bar baz", "quux"]
    // where splitArgIndex == 1, we get two results using the argument lists:
    //   ["foo", "bar", "quux"]
    //   ["foo", "baz", "quux"]
    //
    // Note also that since the splittability of the arguments is technically defined
    // by the resource bundle path specification (not the xpath regular expression) it
    // could differ per ResultSpec instance (but currently never does).
    if (splitArgIndex != -1) {
        List<String> splitArgs = VALUE_SPLITTER.splitToList(args.get(splitArgIndex));
        // Only bother if there was more than one argument there anyway.
        if (splitArgs.size() > 1) {
            return splitArgs.stream().map(a -> {
                // Give every result its own argument list. The stream is evaluated lazily,
                // so mutating one shared list from inside this function would make the
                // results depend on exactly how and when the stream is consumed.
                List<String> resultArgs = new ArrayList<>(args);
                resultArgs.set(splitArgIndex, a);
                return matchedResult(value, resultArgs, varLookupFn);
            });
        }
    }
    // No splittable argument, or a splittable argument with only one value.
    return Stream.of(matchedResult(value, args, varLookupFn));
}
// Builds a single matched result from the (possibly split) captured arguments by resolving,
// in this order, the resource bundle path, the output values and the result's xpath.
private Result matchedResult(
    CldrValue value, List<String> args, DynamicVars varLookupFn) {
    RbPath rbPath = getRbPath(args);
    ImmutableList<String> values = getValues(value.getValue(), args, varLookupFn);
    CldrPath resultPath = getResultPath(value.getPath(), args, varLookupFn);
    return new MatchedResult(rbPath, values, resultPath);
}
// Resolves the resource bundle path by substituting captured arguments into the path
// specification. Captured arguments may themselves contain '/' and so extend the path
// structure (e.g. "foo/$1/bar" might become "foo/x/y/bar").
//
// As a hack for timezone "metazone" paths, if the argument placeholder is quoted
// (e.g. "foo/"$1"/bar") then '/' in arguments is replaced by ':' and the quotes are retained
// (e.g. "foo/"x:y"/bar").
// TODO: Replace hard coded hack here with an explicit function in the config file.
private RbPath getRbPath(List<String> args) {
    // Working out whether quotes actually surround a placeholder would need more careful
    // parsing. Quotes are currently rare and only ever used for this purpose, so any quote
    // in the path specification is taken to trigger the behaviour.
    boolean hasQuotes = rbPathSpec.contains("\"");
    // The transformed list is a lazy view, so characters are only replaced in arguments
    // which actually get substituted into the path.
    List<String> pathArgs =
        hasQuotes ? Lists.transform(args, s -> s.replace('/', ':')) : args;
    return RbPath.parse(substituteArgs(rbPathSpec, pathArgs));
}
// Produces the output values for a result from the CLDR value and the "values" instruction
// (if there is one). Any functions referenced by the instruction are invoked here.
private ImmutableList<String> getValues(
    String value, List<String> args, DynamicVars varLookupFn) {
    VarString valuesSpec = instructions.get(Instruction.VALUES);
    if (valuesSpec == null) {
        // Without a "values" instruction the CLDR value is used _unsplit_. Use
        // "values={value}" in the result specification to have a CLDR value split.
        return ImmutableList.of(value);
    }
    // No dynamic %N variables are expected in a "values" instruction (they stand for CLDR
    // path mappings, which should never appear directly in ICU data), so the specification
    // is expected to be obtainable as a plain string at this point.
    //
    // The $N arguments go in first because they may need to be passed to functions.
    //
    // WARNING: This is not strictly correct, since an argument or a function result could
    // (in theory) contain the text "{value}", which would then be substituted unexpectedly.
    // A single pass handling arguments, function calls and "{value}" together would be
    // better, but the mapping file syntax has no well defined concept of escaping or
    // invocation order.
    // TODO: Fix this, possibly by rewriting the whole transformer "language" to be consistent.
    String template = substituteArgs(valuesSpec.get(), args);

    // Copy the template, replacing each function call with the result of invoking it.
    Matcher call = FUNCTION.matcher(template);
    StringBuilder expanded = new StringBuilder();
    int copiedUpTo = 0;
    while (call.find()) {
        expanded.append(template, copiedUpTo, call.start());
        String functionName = call.group(1);
        NamedFunction fn = icuFunctions.get(functionName);
        checkArgument(fn != null, "no such function: %s", functionName);
        // "{value}" is replaced in the argument list so that functions can be given the
        // CLDR value as well as captured path arguments. It is replaced again below, which
        // is questionable if a function ever returned "{value}" itself.
        String argList = call.group(2).replace("{value}", value);
        expanded.append(fn.call(argList));
        copiedUpTo = call.end();
    }
    expanded.append(template, copiedUpTo, template.length());

    // With functions invoked, handle any remaining "{value}" tokens and split the values
    // (taking quoting into account).
    return splitValues(expanded.toString().replace("{value}", value));
}
// IMPORTANT: The path of a result is one of:
// * The original distinguishing path.
// * The specified "base_xpath" (which must also be a distinguishing xpath).
// and it takes part in the (very subtle) equality semantics of results.
//
// "base_xpath" is a hack which exists because xpaths can only be matched in full rather
// than by prefix. In some cases this means the "same" result gets created many times from
// different distinguishing xpaths, perhaps even via different result specifications. Giving
// such duplicates the same "fake" xpath is what allows them to be deduplicated.
private CldrPath getResultPath(CldrPath path, List<String> args, DynamicVars varLookupFn) {
    VarString baseXpathSpec = instructions.get(Instruction.BASE_XPATH);
    if (baseXpathSpec != null) {
        String resolved = baseXpathSpec.apply(dynamicVarFn.andThen(varLookupFn)).get();
        return parseDistinguishingPath(substituteArgs(resolved, args));
    }
    // No "base_xpath" instruction, so the original path is used as-is.
    return path;
}
/**
 * Returns a fallback function if this specification has the "fallback=" instruction, and
 * an empty optional otherwise. The function takes a resolved resource bundle path and
 * returns the possible fallback values for it. Note that currently fallback values do not
 * support either quoting or grouping (but they easily could).
 */
Optional<BiFunction<RbPath, DynamicVars, Optional<Result>>> getFallbackFunction() {
    VarString declaredFallback = instructions.get(Instruction.FALLBACK);
    if (declaredFallback == null) {
        return Optional.empty();
    }
    // This is the only place where regular expressions get hacked about. The fallback
    // function must only return a value if the given resolved resource bundle path could
    // have been a match for the path specification.
    //
    // To avoid ambiguity between paths such as "foo/$1/$2/bar" and "foo/$1/bar" (which
    // should not both match) '/' is explicitly disallowed in argument values. In theory
    // that is a problem, since '/' should be an allowed character, but the issues caused
    // by ambiguous matching are worse.
    // TODO: Fix/replace all of this fallback mess with something cleaner.
    Pattern rbPathMatcher = getRbPathMatcher(rbPathSpec);
    // A further, frankly terrifying, hack to support fallback specifications with $N
    // argument substitution (this currently only happens once, but must be supported).
    // Just another reason to want to replace the current fallback mechanism.
    VarString fallbackSpec = maybeRewriteFallbackSpec(declaredFallback);
    return Optional.of(
        (p, varFn) -> getFallbackResult(p, varFn, rbPathMatcher, fallbackSpec));
}
// Returns the fallback result for the given resource bundle path, or an empty optional if
// the path could not have been produced by this specification's path pattern.
private Optional<Result> getFallbackResult(
    RbPath rbPath, DynamicVars varFn, Pattern rbPathMatcher, VarString fallbackSpec) {
    // Most resource bundle paths are not associated with this fallback at all.
    Matcher matcher = rbPathMatcher.matcher(rbPath.toString());
    boolean isCandidate = matcher.matches();
    if (!isCandidate) {
        return Optional.empty();
    }
    // Once the dynamic variables are supplied, the fallback specification is expected to be
    // resolvable to a string (potentially still containing $N placeholders which are filled
    // in from the groups captured from the resource bundle path).
    String specStr = fallbackSpec.apply(dynamicVarFn.andThen(varFn)).get();
    int groupCount = matcher.groupCount();
    if (groupCount > 0) {
        specStr = substituteArgs(specStr, n -> matcher.group(n + 1), groupCount);
    }
    // The fallback value is split _without_ considering quoting. This matches the original
    // behaviour but could cause all sorts of subtle issues if values contained quotes.
    // TODO: Rework transformation rules to make quoting behaviour deterministic.
    List<String> rawValues = VALUE_SPLITTER.splitToList(specStr);
    Iterable<String> values =
        rawValues.stream()
            // Fallback values that "look like" CLDR paths are auto-magically resolved.
            .map(v -> v.startsWith("//") ? varFn.apply(parseDistinguishingPath(v)) : v)
            .collect(toImmutableList());
    return Optional.of(new FallbackResult(rbPath, values));
}
// WARNING: Another very hacky behaviour (used exactly once) is that "$N" argument
// substitutions are allowed in fallback values. This is highly problematic because the
// fallback value must be synthesized only from the resource bundle path, so there is no
// way for this substitution to handle:
// 1: multi-valued list arguments
// 2: arguments that didn't appear in the resource bundle path
// 3: dynamic path variables (e.g. %D=//some/path)
//
// An example would be a resource bundle specification of:
//   /Baz/$2/$1
// with a fallback value of:
//   Foo$1/Bar$2
//
// Here the placeholders in the path specification are not in their natural order (and could
// even be duplicated), whereas the pattern calculated from the path specification captures
// groups in "natural order" (i.e. "/Baz/(...)/(...)"). The placeholders in the fallback
// specification must therefore be renumbered to match (e.g. "Foo$2/Bar$1").
// TODO: Figure out a way to remove all of this extreme complexity.
private VarString maybeRewriteFallbackSpec(
    VarString fallbackSpec) {
    Optional<String> resolved = fallbackSpec.resolve();
    // An absent string means the VarString still has unresolved "dynamic" variables for
    // late binding. This is okay, but should not be mixed with argument substitution.
    if (!resolved.isPresent()) {
        return fallbackSpec;
    }
    String fallback = resolved.get();
    // Quick rejection before the proper search for $N below ('$' is permitted as a literal
    // if it is not followed by a digit).
    if (!fallback.contains("$")) {
        return fallbackSpec;
    }
    Matcher fallbackArgs = ARG_PLACEHOLDER.matcher(fallback);
    if (!fallbackArgs.find()) {
        return fallbackSpec;
    }
    // The fallback specification has $N in it, triggering the super hacky behaviour.
    Matcher pathArgs = ARG_PLACEHOLDER.matcher(rbPathSpec);
    checkState(pathArgs.find(),
        "$N arguments in fallback must be present in the resource bundle path: %s",
        rbPathSpec);
    // The placeholder digits ('1'...'9') in the order in which they appear in the resource
    // bundle path. There can be duplicates (e.g. "/Foo/$1/Bar$1").
    List<Character> groupIds = new ArrayList<>();
    do {
        groupIds.add(pathArgs.group().charAt(1));
    } while (pathArgs.find());
    // Guards against a horrible bug if there were ever 10 or more placeholders
    // (essentially impossible with current data). An index >= 9 below would otherwise
    // produce "$X" where 'X' is not a digit.
    checkState(groupIds.size() < 10,
        "too many placeholders in resource bundle path: %s", rbPathSpec);
    // Renumber each placeholder in the fallback specification to the (1-based) position of
    // the first occurrence of the same placeholder in the resource bundle path.
    char[] rewritten = fallback.toCharArray();
    do {
        int digitPos = fallbackArgs.start() + 1;
        int newPlaceholderIndex = groupIds.indexOf(rewritten[digitPos]);
        checkState(newPlaceholderIndex != -1,
            "fallback values may only contain arguments from the resource bundle path: %s",
            fallback);
        rewritten[digitPos] = (char) ('1' + newPlaceholderIndex);
    } while (fallbackArgs.find());
    return VarString.of(new String(rewritten));
}
/**
 * Common base class of matched and fallback results, holding the resolved values and the
 * (optional) source path, and defining the ordering and equality semantics of results.
 */
private abstract class AbstractResult extends Result {
    // Split and resolved values for this result (see also "isGrouped()").
    private final ImmutableList<String> values;
    // The "source" CLDR path of a matched result (absent for a fallback result). This is
    // the resolved "base_xpath" if one was specified in the instructions.
    private final Optional<CldrPath> basePath;
    // Computed once up-front, since results are always expected to need deduplicating.
    private final int hashCode;

    AbstractResult(RbPath key, Iterable<String> values, Optional<CldrPath> path) {
        super(key);
        this.values = ImmutableList.copyOf(values);
        this.basePath = checkNotNull(path);
        // Hashes the same attributes, in the same order, as are tested in equals().
        this.hashCode = Objects.hash(getKey(), this.basePath, isGrouped(), this.values);
    }

    // Returns the specification from which this result was obtained. This is essential for
    // correct ordering and for determining fallback values, but plays no direct part in
    // result equality (since duplicate results can come from different specifications).
    final ResultSpec getSpec() {
        return ResultSpec.this;
    }

    // Returns the source path of this result (absent for fallback results).
    final Optional<CldrPath> getPath() {
        return basePath;
    }

    // Returns whether this result came from an explicit path match (this could equally be
    // recorded in a boolean field).
    final boolean wasMatched() {
        return this instanceof MatchedResult;
    }

    @Override
    public final ImmutableList<String> getValues() {
        return values;
    }

    @Override
    public final int compareTo(Result other) {
        checkArgument(other instanceof AbstractResult,
            "unknown result type: %s", other.getClass());
        AbstractResult that = (AbstractResult) other;
        return RESULT_ORDERING.compare(this, that);
    }

    @Override
    public final int hashCode() {
        return hashCode;
    }

    // The equality semantics of results are ESSENTIAL for correct behaviour, especially for
    // deduplication. See also "getSpec()", "getPath()", and RESULT_ORDERING.
    @Override
    public final boolean equals(Object obj) {
        // Results of different subclasses are never equal, so compare the classes directly
        // rather than using instanceof.
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        AbstractResult that = (AbstractResult) obj;
        // The result specification is deliberately NOT compared. Equal results can be
        // generated from different specifications (if "base_xpath" was used).
        if (!getKey().equals(that.getKey())) {
            return false;
        }
        if (!basePath.equals(that.basePath)) {
            return false;
        }
        if (isGrouped() != that.isGrouped()) {
            return false;
        }
        // Alternatively this could assert that values are equal if everything else is.
        return values.equals(that.values);
    }
}
// A result produced by an explicit path match, built from the captured arguments. It always
// carries the CLDR path it was matched from.
private final class MatchedResult extends AbstractResult {
    MatchedResult(RbPath key, Iterable<String> values, CldrPath path) {
        super(key, values, Optional.of(path));
    }

    // A matched result never acts as a fallback for any other result.
    @Override
    public boolean isFallbackFor(Result r) {
        return false;
    }

    // Grouping is driven purely by the presence of the "group" instruction on the enclosing
    // specification. The value of that instruction is not needed at all and it can be removed
    // from the configuration file at some point.
    @Override
    public boolean isGrouped() {
        boolean hasGroupInstruction = instructions.containsKey(Instruction.GROUP);
        return hasGroupInstruction;
    }
}
// A result holding possible fallback values for a specified resource bundle path. Fallback
// results have no source CLDR path.
private final class FallbackResult extends AbstractResult {
    FallbackResult(RbPath rbPath, Iterable<String> values) {
        super(rbPath, values, Optional.empty());
    }

    // Fallback values are never grouped. If grouping of fallback values is ever allowed (it's
    // not clear if it's a good idea), delete this method and move the other isGrouped()
    // implementation into AbstractResult.
    @Override
    public boolean isGrouped() {
        return false;
    }

    @Override
    public boolean isFallbackFor(Result r) {
        checkArgument(r instanceof AbstractResult, "unsupported result type: %s", r);
        AbstractResult candidate = (AbstractResult) r;
        if (candidate.wasMatched()) {
            // We are a fallback if we came from the same specification as a matched result.
            return getSpec().equals(candidate.getSpec());
        }
        // To prevent duplication of fallback results, we are also a fallback for any result
        // we are "equal()" to (equivalent fallback results can come from different input
        // paths).
        return equals(candidate);
    }
}
// ==== Static helper functions ====
// Matches any "$N" placeholder (N = 1...9) without capturing.
private static final Pattern ARG_PLACEHOLDER = Pattern.compile("\\$[1-9]");

// Turns a resource bundle path specification into a regular expression in which every "$N"
// placeholder becomes a capturing group and everything else is matched literally.
//
// Note that this code currently assumes that each "$N" placeholder matches a single path
// segment (i.e. the captured values cannot contain '/'). This is an artificial restriction
// since resource bundle paths can have quoting in, so we could detect quoted placeholders
// and allow any characters. However at the moment this isn't an issue, and none of the
// "$N" placeholders in the paths expects to match anything with '/' in.
//
// TODO: Fix this to handle quoted placeholders (e.g. "$N" or <$N>) properly.
private static Pattern getRbPathMatcher(String rbPathSpec) {
    // An RbPath instance's toString() does not have a leading '/' on it, so we'll have to
    // account for that here (or we could just remove the leading '/' from paths in the
    // config file...)
    if (rbPathSpec.startsWith("/")) {
        rbPathSpec = rbPathSpec.substring(1);
    }
    // Protect potential regex meta-characters in the original resource bundle path. Using
    // '\Q' and '\E' to mark quotation boundaries is the safest way to do this, but that
    // means we also need to handle '\E' in the original string (incredibly unlikely but it
    // would be super hard to debug if it ever happened).
    //
    // A literal '\E' must become: end-of-quote (\E), an escaped backslash followed by 'E'
    // (\\E) and start-of-quote (\Q). Emitting a bare second '\E' instead would leave an
    // unquoted '\E' in the expression rather than matching the literal text.
    // TODO: If resource paths cannot contain literal '\' or '$', add checks and simplify.
    String regex = "\\Q" + rbPathSpec.replace("\\E", "\\E\\\\E\\Q") + "\\E";
    // Remember that you could get "$1$2" here and the regex groups that replace them will
    // abut. Use reluctant matching (i.e. "+?") to avoid any backtracking in this case.
    // We assume that the substituted arguments contained at least one character, and so we
    // capture at least one character per group here.
    regex = ARG_PLACEHOLDER.matcher(regex).replaceAll("\\\\E([^/]+?)\\\\Q");
    return Pattern.compile(regex);
}
// Substitutes "$N" (N = 1...9) placeholders in the specification with the corresponding
// (zero-indexed) elements of the given list.
private static String substituteArgs(String spec, List<String> args) {
    int argCount = args.size();
    return substituteArgs(spec, args::get, argCount);
}

// Substitutes "$N" (N = 1...9) placeholders for values obtained from a zero-indexed
// function (i.e. "$N" --> args(N - 1)). The index is bounds-checked against "size".
private static String substituteArgs(String spec, Function<Integer, String> args, int size) {
    return RegexTransformer.substitute(spec, '$', digit -> {
        int index = checkElementIndex(digit - '1', size, "argument index");
        return args.apply(index);
    });
}
// Matches "$N" arguments, optionally including one enclosing quote character on either side.
private static final Pattern ARGUMENT = Pattern.compile("[<\"]?\\$(\\d)[\">]?");

// Logic mostly copied from original RegexManager class. Finds the first unquoted $N (N=1..9)
// and returns N-1 (or -1 if no match). We do not permit $0 to appear even though it is
// captured by the regex because it's just the entire path.
private static int getSplitArgIndex(String rbPath) {
    // Each match is a $N placeholder, possibly with surrounding quoting attached.
    Matcher placeholders = ARGUMENT.matcher(rbPath);
    while (placeholders.find()) {
        char first = rbPath.charAt(placeholders.start());
        char last = rbPath.charAt(placeholders.end() - 1);
        // Splitting occurs for the first unquoted placeholder, so skip <$N> and "$N".
        // Q: Why two different "quoting" schemes?
        // A: It's complex and relates to something called "hidden labels".
        boolean doubleQuoted = first == '"' && last == '"';
        boolean angleQuoted = first == '<' && last == '>';
        if (doubleQuoted || angleQuoted) {
            continue;
        }
        // Allowed "$N" argument placeholders go from $1 to $9 ($0 is disallowed) and
        // arguments are zero-indexed, so we expect an index from 0 to 8.
        int groupNumber = Integer.parseInt(placeholders.group(1));
        checkArgument(groupNumber >= 1 && groupNumber <= 9,
            "invalid split argument: %s", groupNumber);
        return groupNumber - 1;
    }
    return -1;
}
// Splits a possibly quoted string into values: unquoted regions are split with
// VALUE_SPLITTER, while each double-quoted region becomes a single value (with \" unescaped).
//
// This is a bit dubious as we don't detect or unescape \\. Thus it's impossible to represent
// a single '\' at the end of a quoted string (e.g. "$1" where the expansion of $1 has a
// trailing '\'). It's also impossible to have a value that should be split but which
// contains '"'.
//
// This mimics the original RegexManager behaviour where spaces and quotes in substituted
// values are _not_ escaped.
private static ImmutableList<String> splitValues(String value) {
    int quoteStart = nextBareQuoteIndex(value, 0);
    if (quoteStart == -1) {
        // No quoting at all, so the whole string is split directly.
        return ImmutableList.copyOf(VALUE_SPLITTER.split(value));
    }
    ImmutableList.Builder<String> out = ImmutableList.builder();
    int unquotedStart = 0;
    while (quoteStart != -1) {
        // Everything between the previous quoted region and this one is split normally.
        out.addAll(VALUE_SPLITTER.split(value.substring(unquotedStart, quoteStart)));
        int quoteEnd = nextBareQuoteIndex(value, quoteStart + 1);
        checkArgument(quoteEnd != -1, "mismatched quotes in splittable value: %s", value);
        // Remember to unescape any '"' found in the quoted regions.
        String quoted = value.substring(quoteStart + 1, quoteEnd);
        out.add(quoted.replace("\\\"", "\""));
        unquotedStart = quoteEnd + 1;
        quoteStart = nextBareQuoteIndex(value, unquotedStart);
    }
    // Whatever follows the final quoted region is split normally as well.
    out.addAll(VALUE_SPLITTER.split(value.substring(unquotedStart)));
    return out.build();
}
// Returns the index of the next '"' character, at or after index i, that's not preceded by
// a '\' (or -1 if there is no such character).
private static int nextBareQuoteIndex(String s, int i) {
    int quote = s.indexOf('"', i);
    // A quote at index 0 has no preceding character and so must be "bare".
    while (quote > 0 && s.charAt(quote - 1) == '\\') {
        // This quote is escaped, so continue from the next *quote* character. (Searching for
        // the next backslash here would return the position of a '\' or give up too early.)
        quote = s.indexOf('"', quote + 1);
    }
    return quote;
}
}

View file

@ -0,0 +1,180 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.regex;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.collect.ImmutableList.toImmutableList;
import java.util.Optional;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.CldrValue;
import com.google.common.collect.ImmutableList;
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.DynamicVars;
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
import org.unicode.icu.tool.cldrtoicu.RbPath;
/*
* Each rule corresponds to a single target xpath specification in the configuration file
* (lines starting //) but may have more than one result specification. For example:
*
* //supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"][@territories="(%W)"]
* ; /languageData/$1/primary/scripts ; values=$2
* ; /languageData/$1/primary/territories; values=$3
*
* is represented by a single rule with two result specifications.
*/
abstract class Rule {
    /**
     * Returns a rule for which all '%X' arguments have been resolved (almost all cases).
     * The given path regular expression is compiled once when the rule is created.
     */
    static Rule staticRule(
        CldrDataType dtdType,
        String prefix,
        Iterable<ResultSpec> specs,
        String pathRegex,
        String xpathSpec,
        int lineNumber) {
        return new StaticRule(dtdType, prefix, specs, pathRegex, xpathSpec, lineNumber);
    }

    /**
     * Returns a rule for which some '%X' arguments are unresolved until matching occurs.
     *
     * <p>Note that, despite its name, the {@code pathRegex} parameter is passed on as the path
     * prefix of the rule; the regular expression itself is derived from {@code varString}
     * each time the path pattern is requested.
     */
    static Rule dynamicRule(
        CldrDataType dtdType,
        String pathRegex,
        Iterable<ResultSpec> specs,
        VarString varString,
        Function<Character, CldrPath> varFn,
        String xpathSpec,
        int lineNumber) {
        return new DynamicRule(dtdType, pathRegex, specs, varString, varFn, xpathSpec, lineNumber);
    }

    // Type of CLDR path which can match this rule.
    private final CldrDataType dtdType;
    // The first path element below the root, used to do fast rejection of non-matching paths
    // and to "bucket" rules by their prefix to speed up matching.
    private final String pathPrefix;
    // One or more result specifications to be processed for matching CLDR paths/values.
    private final ImmutableList<ResultSpec> resultSpecs;
    // Debug information only to help determine unused rules.
    private final String xpathSpec;
    private final int lineNumber;

    private Rule(
        CldrDataType dtdType,
        String pathPrefix,
        Iterable<ResultSpec> resultSpecs,
        String xpathSpec,
        int lineNumber) {
        this.dtdType = checkNotNull(dtdType);
        this.pathPrefix = checkNotNull(pathPrefix);
        this.resultSpecs = ImmutableList.copyOf(resultSpecs);
        this.xpathSpec = checkNotNull(xpathSpec);
        this.lineNumber = lineNumber;
    }

    /** Returns the CLDR DTD type of the path that the rule can match. */
    final CldrDataType getDataType() {
        return dtdType;
    }

    /** Returns the name of the first path element below the path root. */
    final String getPathPrefix() {
        return pathPrefix;
    }

    /** Returns the regular expression against which CLDR path strings are matched. */
    abstract Pattern getPathPattern(DynamicVars varLookupFn);

    /**
     * Attempts to match the incoming xpath and (if successful) use captured arguments to
     * generate results from each result specification. Returns an empty list if the path does
     * not match this rule.
     */
    final ImmutableList<Result> transform(CldrValue v, String fullXPath, DynamicVars varFn) {
        Matcher m = getPathPattern(varFn).matcher(fullXPath);
        if (!m.matches()) {
            return ImmutableList.of();
        }
        // The matcher (and its captured groups) is shared by every result specification.
        return resultSpecs.stream()
            .flatMap(spec -> spec.transform(v, m, varFn))
            .collect(toImmutableList());
    }

    /**
     * Returns any fallback functions defined in results specifications. These are used to
     * determine the set of possible fallback values for a given resource bundle path.
     */
    final Stream<BiFunction<RbPath, DynamicVars, Optional<Result>>> getFallbackFunctions() {
        // Specifications without a fallback function are simply skipped.
        return resultSpecs.stream()
            .map(spec -> spec.getFallbackFunction())
            .filter(fn -> fn.isPresent())
            .map(fn -> fn.get());
    }

    // Debugging only
    final String getXpathSpec() {
        return xpathSpec;
    }

    // Debugging only
    final int getLineNumber() {
        return lineNumber;
    }

    // A rule whose path regular expression is fully known (and compiled) at creation time.
    private static final class StaticRule extends Rule {
        // The processed xpath specification yielding an xpath matching regular expression.
        // This is only suitable for matching incoming xpaths and cannot be processed in any
        // other way.
        private final Pattern xpathPattern;

        StaticRule(
            CldrDataType dtdType,
            String prefix,
            Iterable<ResultSpec> specs,
            String pathRegex,
            String xpathSpec,
            int lineNumber) {
            super(dtdType, prefix, specs, xpathSpec, lineNumber);
            this.xpathPattern = Pattern.compile(pathRegex);
        }

        @Override
        Pattern getPathPattern(DynamicVars varLookupFn) {
            // The lookup function is not needed since nothing is left to resolve.
            return xpathPattern;
        }
    }

    // A rule whose path regular expression still contains unresolved variables, so the
    // pattern is resolved and compiled on every request using the supplied lookup function.
    private static final class DynamicRule extends Rule {
        // The partially resolved xpath specification from which the regular expression is
        // obtained once the remaining variables have been looked up.
        private final VarString varString;
        // Maps a variable character to the CLDR path whose value resolves that variable.
        private final Function<Character, CldrPath> dynamicVarFn;

        DynamicRule(
            CldrDataType dtdType,
            String prefix,
            Iterable<ResultSpec> specs,
            VarString varString,
            Function<Character, CldrPath> varFn,
            String xpathSpec,
            int lineNumber) {
            super(dtdType, prefix, specs, xpathSpec, lineNumber);
            this.varString = checkNotNull(varString);
            this.dynamicVarFn = checkNotNull(varFn);
        }

        @Override
        Pattern getPathPattern(DynamicVars varLookupFn) {
            // Variable character --> CLDR path --> value (via the caller's lookup function).
            Function<Character, String> resolver = dynamicVarFn.andThen(varLookupFn);
            VarString resolved = varString.apply(resolver);
            return Pattern.compile(resolved.get());
        }
    }
}

View file

@ -0,0 +1,152 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.regex;
import static com.google.common.base.CharMatcher.whitespace;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static com.google.common.collect.Maps.filterValues;
import static com.google.common.collect.Maps.transformValues;
import static java.util.function.Function.identity;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrPath;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.escape.CharEscaperBuilder;
import com.google.common.escape.Escaper;
/** Parser for rule specifications in the regex transformer configuration files. */
final class RuleParser {
    // Pattern to capture first two path elements (for the dtd type and path prefix).
    private static final Pattern PATH_SPEC_PREFIX = Pattern.compile("//([^/]+)/([^/]+)/");
    // Preprocessing replaces %X variables defined in the configuration file. This helps to
    // keep the path specification a bit easier to read.
    private static final Pattern VAR = Pattern.compile("^%([A-Z])=(.*)$");
    // Multi-line rules start with " ; " for some optional amount of whitespace.
    private static final Pattern RULE_PARTS_SEPERATOR = Pattern.compile("\\s*+;\\s*+");
    // Splitter for the resource bundle / value declarations.
    private static final Splitter RULE_PARTS_SPLITTER =
        Splitter.on(RULE_PARTS_SEPERATOR).trimResults(whitespace()).omitEmptyStrings();
    // Splitter for instruction name/expressions.
    private static final Splitter INSTRUCTION_SPLITTER =
        Splitter.on('=').trimResults(whitespace()).limit(2);
    // Only '[',']' need escaping in path specifications (so we can write "foo[@bar="baz"]").
    private static final Escaper SPECIAL_CHARS_ESCAPER =
        new CharEscaperBuilder().addEscape('[', "\\[").addEscape(']', "\\]").toEscaper();

    /**
     * Parses a configuration file to create a sequence of transformation rules.
     *
     * @param configLines the lines of the configuration file, in order.
     * @param functions the named functions made available to result specifications.
     * @return the parsed rules, in the order they appear in the configuration.
     * @throws IllegalArgumentException if a line starting with '%' is not a valid variable
     *     declaration.
     * @throws RuntimeException if a rule cannot be parsed (the cause is attached).
     */
    static ImmutableList<Rule> parseConfig(
        List<String> configLines, List<NamedFunction> functions) {
        // Extract '%X' variable declarations in the first pass.
        ImmutableMap<Character, String> varMap = configLines.stream()
            .filter(s -> s.startsWith("%"))
            .map(RuleParser::matchVarDeclaration)
            .collect(ImmutableMap.toImmutableMap(m -> m.group(1).charAt(0), m -> m.group(2)));
        return new RuleParser(varMap, functions).parseLines(configLines);
    }

    // Returns a successful match of a '%X=...' declaration line, or fails with a message that
    // contains the offending line (a Matcher's toString() does not include its input). The
    // check is an explicit mapping step rather than a side effect hidden in Stream.peek().
    private static Matcher matchVarDeclaration(String line) {
        Matcher m = VAR.matcher(line);
        checkArgument(m.matches(), "invalid argument declaration: %s", line);
        return m;
    }

    // Variables whose value is a plain string (substituted when the configuration is parsed).
    private final ImmutableMap<Character, String> staticVarMap;
    // Variables whose value starts with "//" and is parsed as a CLDR path.
    private final ImmutableMap<Character, CldrPath> dynamicVarMap;
    // Named functions keyed by name.
    private final ImmutableMap<String, NamedFunction> fnMap;

    private RuleParser(ImmutableMap<Character, String> varMap, List<NamedFunction> functions) {
        this.staticVarMap = ImmutableMap.copyOf(filterValues(varMap, s -> !s.startsWith("//")));
        this.dynamicVarMap = ImmutableMap.copyOf(
            transformValues(
                filterValues(varMap, s -> s.startsWith("//")),
                CldrPath::parseDistinguishingPath));
        this.fnMap =
            functions.stream().collect(toImmutableMap(NamedFunction::getName, identity()));
    }

    // Parses every rule (lines starting with "//" plus any continuation lines) in the
    // configuration. All other lines are ignored here.
    private ImmutableList<Rule> parseLines(List<String> configLines) {
        List<Rule> rules = new ArrayList<>();
        for (int lineIndex = 0; lineIndex < configLines.size(); lineIndex++) {
            String line = configLines.get(lineIndex);
            try {
                if (line.startsWith("//")) {
                    // Either it's "//xpath ; resource-bundle-path ; values"
                    // Or "//xpath" with " ; resource-bundle-path ; values" on subsequent lines.
                    int ruleLineNumber = lineIndex + 1;
                    int xpathEnd = line.indexOf(";");
                    String xpath;
                    List<ResultSpec> specs = new ArrayList<>();
                    if (xpathEnd != -1) {
                        // Single line rule, extract result specification from trailing part.
                        xpath = whitespace().trimFrom(line.substring(0, xpathEnd));
                        // Keep leading " ; " in the transformation string since it matches the
                        // multi-rule case and is handled the same.
                        specs.add(parseResultSpec(line.substring(xpathEnd), lineIndex + 1));
                    } else {
                        xpath = line;
                        while (++lineIndex < configLines.size()
                            && RULE_PARTS_SEPERATOR.matcher(configLines.get(lineIndex)).lookingAt()) {
                            specs.add(parseResultSpec(configLines.get(lineIndex), lineIndex + 1));
                        }
                        // The loop above moved us past the last line of the rule, so readjust.
                        lineIndex--;
                    }
                    rules.add(parseRule(xpath, specs, ruleLineNumber));
                }
            } catch (Exception e) {
                // For multi-line rules "lineIndex" has advanced past the first line of the
                // rule, so report the text of the line the index refers to (rather than the
                // text of the rule's first line) to keep the number and the text consistent.
                // The index is always in range here since the only point at which it can
                // equal the list size is immediately undone by the decrement above.
                throw new RuntimeException(
                    String.format(
                        "parse error at line %d: %s", lineIndex + 1, configLines.get(lineIndex)),
                    e);
            }
        }
        return ImmutableList.copyOf(rules);
    }

    // Parses a single " ; <rb-path> ; <instruction>=<expression> ; ..." result specification.
    private ResultSpec parseResultSpec(String spec, int lineNumber) {
        // The result specifier still has leading separator (e.g. " ; /foo/bar/$1 ; value=$2"),
        // but that's okay because the splitter ignores empty results.
        List<String> rbPathAndInstructions = RULE_PARTS_SPLITTER.splitToList(spec);
        String rbPathSpec = rbPathAndInstructions.get(0);
        // Instructions without an '=' (e.g. "group") get an empty expression.
        ImmutableMap<Instruction, VarString> instructions =
            rbPathAndInstructions.stream()
                .skip(1)
                .map(INSTRUCTION_SPLITTER::splitToList)
                .collect(toImmutableMap(
                    p -> Instruction.forId(p.get(0)),
                    p -> VarString.of(p.size() > 1 ? p.get(1) : "", staticVarMap::get)));
        return new ResultSpec(rbPathSpec, instructions, lineNumber, fnMap, dynamicVarMap::get);
    }

    // Creates a static or dynamic rule from an xpath specification and its result
    // specifications.
    private Rule parseRule(String xpathSpec, List<ResultSpec> resultSpecs, int lineNumber) {
        // The escaped path is nearly a regular expression, but still contains '%X' variables.
        String escapedPathSpec = SPECIAL_CHARS_ESCAPER.escape(xpathSpec);
        Matcher m = PATH_SPEC_PREFIX.matcher(escapedPathSpec);
        checkArgument(m.lookingAt(), "unexpected path spec: %s", escapedPathSpec);
        // Extract the type and path prefix for rule grouping and fast rejection during matching.
        CldrDataType dtdType = CldrDataType.forXmlName(m.group(1));
        String pathPrefix = m.group(2);
        // If the variable string contains a "dynamic" argument, it cannot be resolved yet and
        // must result in a "dynamic" rule being created here (this is very rare though).
        VarString varString = VarString.of(escapedPathSpec, staticVarMap::get);
        Optional<String> resolved = varString.resolve();
        // Don't turn this into a "map().orElse()" chain (despite what your IDE might suggest)
        // because we don't want to create lots of unused dynamic rules!
        return resolved.isPresent()
            ? Rule.staticRule(
                dtdType, pathPrefix, resultSpecs, resolved.get(), xpathSpec, lineNumber)
            : Rule.dynamicRule(
                dtdType, pathPrefix, resultSpecs, varString, dynamicVarMap::get, xpathSpec, lineNumber);
    }
}

View file

@ -0,0 +1,90 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.regex;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import java.util.Optional;
import java.util.function.Function;
import com.google.common.base.CharMatcher;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
/**
* An immutable representation of a String with placeholders for variable substitution. A
* VarString can be "resolved" or "partially resolved" by providing a mapping from placeholder
* characters to strings, and any remaining unresolved variables are tracked. This is a very
* private bit of implementation detail with a far from ideal API, so it's probably best not to
* use it elsewhere without careful thought.
*/
final class VarString {
private static final CharMatcher VAR_CHAR = CharMatcher.inRange('A', 'Z');
static VarString of(String varString) {
ImmutableSet.Builder<Character> requiredChars = ImmutableSet.builder();
// Variable placeholders are any % followed by upper-case ASCII letter (A-Z).
// Other '%' chars are ignored.
for (int i = 0; i < varString.length() - 1; i++) {
if (varString.charAt(i) == '%') {
char c = varString.charAt(i + 1);
if (VAR_CHAR.matches(c)) {
requiredChars.add(c);
}
}
}
return new VarString(varString, requiredChars.build(), ImmutableMap.of());
}
static VarString of(String s, Function<Character, String> varFn) {
return of(s).apply(varFn);
}
private final String varString;
private final ImmutableSet<Character> requiredChars;
private final ImmutableMap<Character, String> varMap;
private VarString(
String varString,
ImmutableSet<Character> requiredChars,
ImmutableMap<Character, String> varMap) {
this.varString = checkNotNull(varString);
this.requiredChars = checkNotNull(requiredChars);
this.varMap = checkNotNull(varMap);
}
/** Applies a variable function to produce a new, potentially resolved, VarString. */
VarString apply(Function<Character, String> varFn) {
ImmutableMap.Builder<Character, String> newVarMap = ImmutableMap.builder();
newVarMap.putAll(this.varMap);
for (Character c : requiredChars) {
if (!varMap.containsKey(c)) {
// Allowed to return null if the function cannot resolve a variable.
String v = varFn.apply(c);
if (v != null) {
newVarMap.put(c, v);
}
}
}
return new VarString(varString, requiredChars, newVarMap.build());
}
/** Returns a resolved value if all variables are available for substitution. */
Optional<String> resolve() {
return varMap.keySet().equals(requiredChars)
? Optional.of(
RegexTransformer.substitute(varString, '%', c -> varMap.getOrDefault(c, "%" + c)))
: Optional.empty();
}
/** Returns the resolved value or fails if not all variables are available. */
String get() {
checkState(varMap.keySet().equals(requiredChars), "unresolved variable string: %s", this);
return RegexTransformer.substitute(varString, '%', c -> varMap.getOrDefault(c, "%" + c));
}
@Override public String toString() {
return varString + ": " + varMap;
}
}

View file

@ -0,0 +1,2 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License

View file

@ -0,0 +1,350 @@
# ldml2icu_locale.txt
#
# © 2016 and later: Unicode, Inc. and others.
#
# CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
# For terms of use, see http://www.unicode.org/copyright.html
#
# Used by LdmlLocaleMapper.
# Data-driven file for mapping LDML locale paths to ICU paths.
# See ldml2icu_readme.txt for a detailed explanation of this file.
# Variables
# Attribute value
%A=[^"']++
# Word
%W=[\w\-]++
# Greedy word match
%G=[\w\-]+
# Number match
%N=\d++
# The default numbering system to be used.
%D=//ldml/numbers/defaultNumberingSystem
# Main locale data
# Aliases
//ldml/dates/calendars/calendar[@type="(%A)"]/alias[@source="locale"][@path="../calendar[@type='(%A)']"]
; /calendar/$1:alias ; values=/LOCALE/calendar/$2
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/alias[@source="locale"][@path="../../calendar[@type='(%A)']/dayPeriods"]
; /calendar/$1/AmPmMarkers:alias ; values=/LOCALE/calendar/$2/AmPmMarkers
; /calendar/$1/AmPmMarkersNarrow:alias ; values=/LOCALE/calendar/$2/AmPmMarkersNarrow
; /calendar/$1/NoonMarker:alias ; values=/LOCALE/calendar/$2/NoonMarker
; /calendar/$1/NoonMarkerNarrow:alias ; values=/LOCALE/calendar/$2/NoonMarkerNarrow
//ldml/dates/calendars/calendar[@type="gregorian"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="wide"]/alias[@source="locale"][@path="../dayPeriodWidth[@type='abbreviated']"]
; /calendar/gregorian/AmPmMarkers:alias ; values=/LOCALE/calendar/gregorian/AmPmMarkersAbbr
//ldml/dates/calendars/calendar[@type="gregorian"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="narrow"]/alias[@source="locale"][@path="../dayPeriodWidth[@type='abbreviated']"]
; /calendar/gregorian/AmPmMarkersNarrow:alias ; values=/LOCALE/calendar/gregorian/AmPmMarkersAbbr
//ldml/dates/calendars/calendar[@type="(%A)"]/(eras|quarters|cyclicNameSets|monthPatterns)/alias[@source="locale"][@path="../../%W[@type='(%A)']/%W"]
; /calendar/$1/$2:alias ; values=/LOCALE/calendar/$3/$2
//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNarrow/alias[@source="locale"][@path="../eraAbbr"]
; /calendar/$1/eras/narrow:alias ; values=/LOCALE/calendar/$1/eras/abbreviated
//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNames/alias[@source="locale"][@path="../eraAbbr"]
; /calendar/$1/eras/wide:alias ; values=/LOCALE/calendar/$1/eras/abbreviated
//ldml/dates/calendars/calendar[@type="(%A)"]/(cyclicNameSet)s/\2[@type="(%W)"]/alias[@source="locale"][@path="../\2[@type='(%A)']"]
; /calendar/$1/$2s/$3:alias ; values=/LOCALE/calendar/$1/$2s/$4
//ldml/dates/calendars/calendar[@type="(%A)"]/(cyclicNameSet)s/\2[@type="(%W)"]/(cyclicName)Context[@type="(%W)"]/\4Width[@type="(%W)"]/alias[@source="locale"][@path="../../../\4Set[@type='(%A)']/\4Context[@type='(%A)']/\4Width[@type='(%A)']"]
; /calendar/$1/$2s/$3/$5/$6:alias ; values=/LOCALE/calendar/$1/$2s/$7/$8/$9
//ldml/dates/calendars/calendar[@type="(%A)"]/(cyclicNameSet)s/\2[@type="(%W)"]/(cyclicName)Context[@type="(%W)"]/\4Width[@type="(%W)"]/alias[@source="locale"][@path="../\4Width[@type='(%A)']"]
; /calendar/$1/$2s/$3/$5/$6:alias ; values=/LOCALE/calendar/$1/$2s/$3/$5/$7
//ldml/dates/calendars/calendar[@type="(%A)"]/(cyclicNameSet|monthPattern|quarter)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/alias[@source="locale"][@path="../../\2Context[@type='(%W)']/\2Width[@type='(%A)']"]
; /calendar/$1/$2s/$3/$4:alias ; values=/LOCALE/calendar/$1/$2s/$5/$6
//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/alias[@source="locale"][@path="../../%W[@type='(%A)']/%W"]
; /calendar/$1/$2Names:alias ; values=/LOCALE/calendar/$3/$2Names
//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/alias[@source="locale"][@path="../\2Width[@type='(%A)']"]
; /calendar/$1/$2Names/$3/$4:alias ; values=/LOCALE/calendar/$1/$2Names/$3/$5
//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/alias[@source="locale"][@path="../../\2Context[@type='(%W)']/\2Width[@type='(%A)']"]
; /calendar/$1/$2Names/$3/$4:alias ; values=/LOCALE/calendar/$1/$2Names/$5/$6
//ldml/dates/calendars/calendar[@type="(%A)"]/(monthPattern|quarter)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/alias[@source="locale"][@path="../\2Width[@type='(%A)']"]
; /calendar/$1/$2s/$3/$4:alias ; values=/LOCALE/calendar/$1/$2s/$3/$5
//ldml/dates/calendars/calendar[@type="(%A)"]/dateFormats/alias[@source="locale"][@path="../../calendar[@type='(%A)']/dateFormats"]
; /calendar/$1/DateTimePatterns:alias ; values=/LOCALE/calendar/$2/DateTimePatterns
//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/alias[@source="locale"][@path="../../calendar[@type='(%A)']/dateTimeFormats"]
; /calendar/$1/availableFormats:alias ; values=/LOCALE/calendar/$2/availableFormats
; /calendar/$1/appendItems:alias ; values=/LOCALE/calendar/$2/appendItems
; /calendar/$1/intervalFormats:alias ; values=/LOCALE/calendar/$2/intervalFormats
//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/(availableFormats|appendItems|intervalFormats)/alias[@source="locale"][@path="../../../calendar[@type='(%A)']/dateTimeFormats/\2"]
; /calendar/$1/$2:alias ; values=/LOCALE/calendar/$3/$2
//ldml/units/unitLength[@type="long"]/alias[@source="locale"][@path="../unitLength[@type='short']"]
; /units:alias ; values=/LOCALE/unitsShort
//ldml/units/unitLength[@type="narrow"]/alias[@source="locale"][@path="../unitLength[@type='short']"]
; /unitsNarrow:alias ; values=/LOCALE/unitsShort
//ldml/listPatterns/listPattern[@type="(%A)"]/alias[@source="locale"][@path="../listPattern"]
; /listPattern/$1/start:alias ; values=/LOCALE/listPattern/standard/start
; /listPattern/$1/middle:alias ; values=/LOCALE/listPattern/standard/middle
; /listPattern/$1/end:alias ; values=/LOCALE/listPattern/standard/end
; /listPattern/$1/2:alias ; values=/LOCALE/listPattern/standard/2
//ldml/listPatterns/listPattern[@type="(%A)"]/alias[@source="locale"][@path="../listPattern[@type='(%A)']"]
; /listPattern/$1/start:alias ; values=/LOCALE/listPattern/$2/start
; /listPattern/$1/middle:alias ; values=/LOCALE/listPattern/$2/middle
; /listPattern/$1/end:alias ; values=/LOCALE/listPattern/$2/end
; /listPattern/$1/2:alias ; values=/LOCALE/listPattern/$2/2
//ldml/numbers/currencyFormats[@numberSystem="(%A)"]/currencyFormatLength/currencyFormat[@type="accounting"]/alias[@source="locale"][@path="../(%W)[@type='standard']"] ; /NumberElements/$1/patterns/accountingFormat:alias ; values=/LOCALE/NumberElements/$1/patterns/$2
# Characters
//ldml/characters/exemplarCharacters[@type="auxiliary"] ; /AuxExemplarCharacters
//ldml/characters/exemplarCharacters[@type="currencySymbol"] ; /ExemplarCharactersCurrency
//ldml/characters/exemplarCharacters[@type="index"] ; /ExemplarCharactersIndex
//ldml/characters/exemplarCharacters[@type="punctuation"] ; /ExemplarCharactersPunctuation
//ldml/characters/exemplarCharacters[@type="numbers"] ; /ExemplarCharactersNumbers
//ldml/characters/exemplarCharacters ; /ExemplarCharacters
//ldml/characters/ellipsis[@type="(%A)"] ; /Ellipsis/$1
//ldml/characters/moreInformation ; /MoreInformation
//ldml/characters/special/icu:scripts/icu:script[@type="%N"] ; /LocaleScript
//ldml/characters/parseLenients[@scope="(%A)"][@level="(%A)"]/parseLenient[@sample="%A"] ; /parse/$1/$2
# Defaults
//ldml/dates/calendars/calendar[@type="(%A)"]/(monthPattern)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/\2[@type="(%W)"]
; /calendar/$1/$2s/$3/$4/$5
# Dates
//ldml/dates/calendars/calendar[@type="(%A)"]/cyclicNameSets/cyclicNameSet[@type="(%A)"]/cyclicNameContext[@type="(%A)"]/cyclicNameWidth[@type="(%A)"]/cyclicName[@type="(%A)"]
; /calendar/$1/cyclicNameSets/$2/$3/$4 ;
# ---- /calendar/xxx/DateTimePatterns
# Rules are split to force manual ordering within the array produced by them (they share the same output path).
#
# Note that (like many other places) the uncaptured "type" attributes are just expected to be "standard", and the %A
# variable is only used to save a bit of space. The final output array has 3 groups ("time" -> "date" -> "date-time")
# each with 4 elements based on the pattern length ("full" -> "long" -> "medium" -> "short") giving 12 patterns in
# total.
#
# However due to an awful hack, there end up being 13 values in the array, with the medium date-time value being
# duplicated at index 8. However this hack is done later, because the regex transformer does not permit the same
# CLDR path to emit values in different places in an array.
# Time patterns (4 x values)
//ldml/dates/calendars/calendar[@type="(%A)"]/(timeFormat)s/\2Length[@type="(%A)"]/\2[@type="%A"]/pattern[@type="%A"]
; /calendar/$1/DateTimePatterns
# Date patterns (4 x values)
#
# This is a weird edge case. When the number attribute is present in the xpath, its value needs to be grouped
# together with the xpath value in its own special array, which is treated like just another value in
# /DateTimePatterns. The group keyword is used here to specify that values from the same xpath should be grouped
# into their own separate array. Since each possible pattern length can have patterns with and without the number
# attribute, we must explicitly split the rules to enforce correct output order.
#
# So far (Jan 2014), this only happens in the Chinese calendar for ja/zh/zh_Hant and the Hebrew calendar for he,
# and all calendars for haw (which has numbers="M=romanlow").
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(full)"]/\2[@type="%A"]/pattern[@type="%A"]
; /calendar/$1/DateTimePatterns
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(full)"]/\2[@type="%A"]/pattern[@type="%A"][@numbers="(%A)"]
; /calendar/$1/DateTimePatterns ; values="{value}" $4 ; group
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(long)"]/\2[@type="%A"]/pattern[@type="%A"]
; /calendar/$1/DateTimePatterns
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(long)"]/\2[@type="%A"]/pattern[@type="%A"][@numbers="(%A)"]
; /calendar/$1/DateTimePatterns ; values="{value}" $4 ; group
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(medium)"]/\2[@type="%A"]/pattern[@type="%A"]
; /calendar/$1/DateTimePatterns
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(medium)"]/\2[@type="%A"]/pattern[@type="%A"][@numbers="(%A)"]
; /calendar/$1/DateTimePatterns ; values="{value}" $4 ; group
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(short)"]/\2[@type="%A"]/pattern[@type="%A"]
; /calendar/$1/DateTimePatterns
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(short)"]/\2[@type="%A"]/pattern[@type="%A"][@numbers="(%A)"]
; /calendar/$1/DateTimePatterns ; values="{value}" $4 ; group
# DateTime patterns (4 x values)
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateTimeFormat)s/\2Length[@type="(%A)"]/\2[@type="%A"]/pattern[@type="%A"]
; /calendar/$1/DateTimePatterns
# ----
//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/appendItems/appendItem[@request="(%A)"] ; /calendar/$1/appendItems/$2
//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/availableFormats/dateFormatItem[@id="(%A)"] ; /calendar/$1/availableFormats/$2
//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/availableFormats/dateFormatItem[@id="(%A)"][@count="(%A)"] ; /calendar/$1/availableFormats/$2/$3
//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/intervalFormats/intervalFormatItem[@id="(%A)"]/greatestDifference[@id="(%A)"] ; /calendar/$1/intervalFormats/$2/$3
//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/intervalFormats/intervalFormatFallback ; /calendar/$1/intervalFormats/fallback
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="wide"]/dayPeriod[@type="(am|pm)"][@alt="(%A)"] ; /calendar/$1/AmPmMarkers%$3
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="narrow"]/dayPeriod[@type="(am|pm)"][@alt="(%A)"] ; /calendar/$1/AmPmMarkersNarrow%$3
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="abbreviated"]/dayPeriod[@type="(am|pm)"][@alt="(%A)"] ; /calendar/$1/AmPmMarkersAbbr%$3
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="wide"]/dayPeriod[@type="(am|pm)"] ; /calendar/$1/AmPmMarkers
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="abbreviated"]/dayPeriod[@type="(am|pm)"] ; /calendar/$1/AmPmMarkersAbbr
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="narrow"]/dayPeriod[@type="(am|pm)"] ; /calendar/$1/AmPmMarkersNarrow
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="(stand-alone)"]/dayPeriodWidth[@type="(%A)"]/dayPeriod[@type="(am|pm)"][@alt="(%A)"] ; /calendar/$1/dayPeriod/$2/$3/$4%$5
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="(stand-alone)"]/dayPeriodWidth[@type="(%A)"]/dayPeriod[@type="(am|pm)"] ; /calendar/$1/dayPeriod/$2/$3/$4
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="(%A)"]/dayPeriodWidth[@type="(%A)"]/dayPeriod[@type="(?!am|pm)(%A)"][@alt="(%A)"] ; /calendar/$1/dayPeriod/$2/$3/$4%$5
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="(%A)"]/dayPeriodWidth[@type="(%A)"]/dayPeriod[@type="(?!am|pm)(%A)"] ; /calendar/$1/dayPeriod/$2/$3/$4
//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNarrow/era[@type="(%A)"][@alt="(%A)"] ; /calendar/$1/eras/narrow%$3
//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraAbbr/era[@type="(%A)"][@alt="(%A)"] ; /calendar/$1/eras/abbreviated%$3
//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNames/era[@type="(%A)"][@alt="(%A)"] ; /calendar/$1/eras/wide%$3
//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNarrow/era[@type="(%A)"] ; /calendar/$1/eras/narrow
//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraAbbr/era[@type="(%A)"] ; /calendar/$1/eras/abbreviated
//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNames/era[@type="(%A)"] ; /calendar/$1/eras/wide
# Leap year names go after other month names.
# "yeartype" is an #IMPLIED attribute in the DTD and it should implicitly default to "standard".
# In practice "standard" is never explicitly given, but it could be (so must match it here).
//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/%W[@type="(%A)"]/%W[@type="(%A)"]/%W[@type="(%A)"](?:[@yeartype="standard"])? ; /calendar/$1/$2Names/$3/$4
//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/%W[@type="(%A)"]/%W[@type="(%A)"]/%W[@type="(%A)"][@yeartype="leap"] ; /calendar/$1/$2Names/$3/$4
//ldml/dates/calendars/calendar[@type="(%A)"]/(quarters)/%W[@type="(%A)"]/%W[@type="(%A)"]/%W[@type="%A"] ; /calendar/$1/$2/$3/$4
//ldml/dates/fields/field[@type="(%A)"]/displayName[@alt="(%A)"] ; /fields/$1/dn%$2
//ldml/dates/fields/field[@type="(%A)"]/displayName ; /fields/$1/dn
//ldml/dates/fields/field[@type="(%A)"]/relative[@type="(%A)"] ; /fields/$1/relative/"$2"
//ldml/dates/fields/field[@type="(%A)"]/relativePeriod ; /fields/$1/relativePeriod
//ldml/dates/fields/field[@type="(%A)"]/relativeTime[@type="(%A)"]/relativeTimePattern[@count="(%A)"] ; /fields/$1/relativeTime/$2/$3
//ldml/dates/fields/field[@type="(%A)"]/alias[@source="locale"][@path="../field[@type='(%A)']"] ; /fields/$1:alias ; values=/LOCALE/fields/$2
//ldml/dates/timeZoneNames/regionFormat[@type="daylight"] ; /zoneStrings/regionFormatDaylight
//ldml/dates/timeZoneNames/regionFormat[@type="standard"] ; /zoneStrings/regionFormatStandard
//ldml/dates/timeZoneNames/(%GFormat) ; /zoneStrings/$1
//ldml/dates/timeZoneNames/metazone[@type="(%A)"]/(\w)%W/(\w)%W ; /zoneStrings/"meta:$1"/$2$3
//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)"]/exemplarCity[@alt="(%A)"] ; /zoneStrings/"$1:$2"/ec%$3
//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)"]/exemplarCity ; /zoneStrings/"$1:$2"/ec
//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)"]/(\w)%W/(\w)%W ; /zoneStrings/"$1:$2"/$3$4
//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)/(%W)"]/exemplarCity[@alt="(%A)"] ; /zoneStrings/"$1:$2:$3"/ec%$4
//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)/(%W)"]/exemplarCity ; /zoneStrings/"$1:$2:$3"/ec
# Locale Display Names
//ldml/localeDisplayNames/codePatterns/codePattern[@type="(%A)"] ; /codePatterns/$1
//ldml/localeDisplayNames/annotationPatterns/annotationPattern[@type="(%A)"] ; /codePatterns/$1
//ldml/localeDisplayNames/keys/key[@type="(%A)"] ; /Keys/$1
//ldml/localeDisplayNames/languages/language[@type="(%A)"][@alt="(%A)"] ; /Languages%$2/$1
//ldml/localeDisplayNames/languages/language[@type="(%A)"] ; /Languages/$1
//ldml/localeDisplayNames/localeDisplayPattern/localeKeyTypePattern ; /localeDisplayPattern/keyTypePattern
//ldml/localeDisplayNames/localeDisplayPattern/localePattern ; /localeDisplayPattern/pattern
//ldml/localeDisplayNames/localeDisplayPattern/localeSeparator ; /localeDisplayPattern/separator
//ldml/localeDisplayNames/measurementSystemNames/measurementSystemName[@type="(%A)"] ; /measurementSystemNames/$1
//ldml/localeDisplayNames/scripts/script[@type="(%A)"][@alt="(%A)"] ; /Scripts%$2/$1
//ldml/localeDisplayNames/scripts/script[@type="(%A)"] ; /Scripts/$1
//ldml/localeDisplayNames/territories/territory[@type="(%A)"][@alt="(%A)"] ; /Countries%$2/$1
//ldml/localeDisplayNames/territories/territory[@type="(%A)"] ; /Countries/$1
//ldml/localeDisplayNames/transformNames/transformName[@type="(%W)"] ; /transformNames/$1
//ldml/localeDisplayNames/types/type[@key="(%A)"][@type="(%A)"][@alt="(%A)"] ; /Types%$3/$1/$2
//ldml/localeDisplayNames/types/type[@key="(%A)"][@type="(%A)"] ; /Types/$1/$2
//ldml/localeDisplayNames/variants/variant[@type="(%A)"][@alt="(%A)"] ; /Variants%$2/$1
//ldml/localeDisplayNames/variants/variant[@type="(%A)"] ; /Variants/$1
# Numbers
//ldml/numbers/currencies/currency[@type="(%A)"]/displayName[@count="(%A)"] ; /CurrencyPlurals/$1/$2
//ldml/numbers/currencies/currency[@type="(%W)"]/symbol[@alt="(%A)"] ; /Currencies%$2/$1
# ---- /Currencies/XXX bundles
# Ordering of rules is critical here since they write into the same resource bundle path and the
# last 3 values are grouped together as a single value (via the special <FIFO> hidden label).
#
# Note that the <FIFO> label is needed here (not the "group" instruction) because the grouped
# values must be seen as having a resource bundle path that is a child of the "/Currencies/$1"
# path. This is so that the grouped values only appear when one of them is present rather than
# whenever any of the other values in the main resource bundle path exist.
#
# Due to the optional nature of the final sub-array in the bundle, it would be very hard to ever
# add more elements after it.
//ldml/numbers/currencies/currency[@type="(%W)"]/symbol
; /Currencies/$1 ; fallback=$1
//ldml/numbers/currencies/currency[@type="(%W)"]/displayName
; /Currencies/$1 ; fallback=$1
//ldml/numbers/currencies/currency[@type="(%W)"]/pattern[@type="standard"]
; /Currencies/$1/<FIFO> ; fallback=//ldml/numbers/currencyFormats[@numberSystem="%D"]/currencyFormatLength/currencyFormat[@type="standard"]/pattern[@type="standard"]
//ldml/numbers/currencies/currency[@type="(%W)"]/decimal
; /Currencies/$1/<FIFO> ; fallback=//ldml/numbers/symbols[@numberSystem="%D"]/decimal
//ldml/numbers/currencies/currency[@type="(%W)"]/group
; /Currencies/$1/<FIFO> ; fallback=//ldml/numbers/symbols[@numberSystem="%D"]/group
# ----
//ldml/numbers/currencyFormats[@numberSystem="%D"]/currencySpacing/(%W)/(%W) ; /currencySpacing/$1/$2
//ldml/numbers/currencyFormats[@numberSystem="%D"]/unitPattern[@count="(%W)"] ; /CurrencyUnitPatterns/$1
//ldml/numbers/defaultNumberingSystem[@alt="(%A)"] ; /NumberElements/default_$1
//ldml/numbers/defaultNumberingSystem ; /NumberElements/default
//ldml/numbers/minimumGroupingDigits ; /NumberElements/minimumGroupingDigits
//ldml/numbers/otherNumberingSystems/(%W) ; /NumberElements/$1
//ldml/numbers/symbols[@numberSystem="(%A)"]/(%W) ; /NumberElements/$1/symbols/$2
//ldml/numbers/(%GFormat)s[@numberSystem="(%W)"]/\1Length/\1[@type="standard"]/pattern[@type="standard"] ; /NumberElements/$2/patterns/$1
//ldml/numbers/currencyFormats[@numberSystem="(%W)"]/currencyFormatLength/currencyFormat[@type="accounting"]/pattern[@type="standard"] ; /NumberElements/$1/patterns/accountingFormat
//ldml/numbers/currencyFormats[@numberSystem="(%W)"]/currencyFormatLength[@type="short"]/currencyFormat[@type="standard"]/pattern[@type="(%N)"][@count="(%W)"] ; /NumberElements/$1/patternsShort/currencyFormat/$2/$3
//ldml/numbers/decimalFormats[@numberSystem="(%W)"]/decimalFormatLength[@type="short"]/decimalFormat[@type="standard"]/pattern[@type="(%N)"][@count="(%W)"] ; /NumberElements/$1/patternsShort/decimalFormat/$2/$3
//ldml/numbers/decimalFormats[@numberSystem="(%W)"]/decimalFormatLength[@type="long"]/decimalFormat[@type="standard"]/pattern[@type="(%N)"][@count="(%W)"] ; /NumberElements/$1/patternsLong/decimalFormat/$2/$3
//ldml/numbers/miscPatterns[@numberSystem="(%W)"]/pattern[@type="(%W)"] ; /NumberElements/$1/miscPatterns/$2
//ldml/numbers/minimalPairs/ordinalMinimalPairs[@ordinal="(%A)"] ; /NumberElements/minimalPairs/ordinal/$1
//ldml/numbers/minimalPairs/pluralMinimalPairs[@count="(%A)"] ; /NumberElements/minimalPairs/plural/$1
# Misc
# Ordering of rules is critical here since they write into the same resource bundle path.
//ldml/contextTransforms/contextTransformUsage[@type="(%W)"]/contextTransform[@type="uiListOrMenu"] ; /contextTransforms/$1:intvector ; values=&context_transform_index({value}) ; fallback=0
//ldml/contextTransforms/contextTransformUsage[@type="(%W)"]/contextTransform[@type="stand-alone"] ; /contextTransforms/$1:intvector ; values=&context_transform_index({value}) ; fallback=0
//ldml/delimiters/(%W) ; /delimiters/$1
//ldml/layout/orientation/(%G)Order ; /layout/$1s
//ldml/listPatterns/listPattern/listPatternPart[@type="(%A)"] ; /listPattern/standard/$1
//ldml/listPatterns/listPattern[@type="(%A)"]/listPatternPart[@type="(%A)"] ; /listPattern/$1/$2
//ldml/units/unitLength[@type="narrow"]/unit[@type="(\w++)-(%A)"]/displayName ; /unitsNarrow/$1/$2/dnam
//ldml/units/unitLength[@type="short"]/unit[@type="(\w++)-(%A)"]/displayName ; /unitsShort/$1/$2/dnam
//ldml/units/unitLength[@type="long"]/unit[@type="(\w++)-(%A)"]/displayName ; /units/$1/$2/dnam
//ldml/units/unitLength[@type="narrow"]/unit[@type="(\w++)-(%A)"]/unitPattern[@count="(%A)"] ; /unitsNarrow/$1/$2/$3
//ldml/units/unitLength[@type="short"]/unit[@type="(\w++)-(%A)"]/unitPattern[@count="(%A)"] ; /unitsShort/$1/$2/$3
//ldml/units/unitLength[@type="long"]/unit[@type="(\w++)-(%A)"]/unitPattern[@count="(%A)"] ; /units/$1/$2/$3
//ldml/units/unitLength[@type="narrow"]/compoundUnit[@type="(%A)"]/compoundUnitPattern ; /unitsNarrow/compound/$1
//ldml/units/unitLength[@type="short"]/compoundUnit[@type="(%A)"]/compoundUnitPattern ; /unitsShort/compound/$1
//ldml/units/unitLength[@type="long"]/compoundUnit[@type="(%A)"]/compoundUnitPattern ; /units/compound/$1
//ldml/units/unitLength[@type="narrow"]/coordinateUnit/displayName ; /unitsNarrow/coordinate/dnam
//ldml/units/unitLength[@type="short"]/coordinateUnit/displayName ; /unitsShort/coordinate/dnam
//ldml/units/unitLength[@type="long"]/coordinateUnit/displayName ; /units/coordinate/dnam
//ldml/units/unitLength[@type="narrow"]/coordinateUnit/coordinateUnitPattern[@type="(%A)"] ; /unitsNarrow/coordinate/$1
//ldml/units/unitLength[@type="short"]/coordinateUnit/coordinateUnitPattern[@type="(%A)"] ; /unitsShort/coordinate/$1
//ldml/units/unitLength[@type="long"]/coordinateUnit/coordinateUnitPattern[@type="(%A)"] ; /units/coordinate/$1
//ldml/units/unitLength[@type="narrow"]/unit[@type="(\w++)-(%A)"]/perUnitPattern ; /unitsNarrow/$1/$2/per
//ldml/units/unitLength[@type="short"]/unit[@type="(\w++)-(%A)"]/perUnitPattern ; /unitsShort/$1/$2/per
//ldml/units/unitLength[@type="long"]/unit[@type="(\w++)-(%A)"]/perUnitPattern ; /units/$1/$2/per
//ldml/units/durationUnit[@type="(%A)"]/durationUnitPattern ; /durationUnits/$1
//ldml/units/unitLength[@type="narrow"]/unit[@type="(\w++)-(%A)"]/alias[@source="locale"][@path="../unit[@type='(\w++)-(%A)']"] ; /unitsNarrow/$1/$2:alias ; values=/LOCALE/unitsNarrow/$3/$4
//ldml/units/unitLength[@type="short"]/unit[@type="(\w++)-(%A)"]/alias[@source="locale"][@path="../unit[@type='(\w++)-(%A)']"] ; /unitsShort/$1/$2:alias ; values=/LOCALE/unitsShort/$3/$4
//ldml/units/unitLength[@type="long"]/unit[@type="(\w++)-(%A)"]/alias[@source="locale"][@path="../unit[@type='(\w++)-(%A)']"] ; /units/$1/$2:alias ; values=/LOCALE/units/$3/$4
//ldml/characterLabels/characterLabelPattern[@type="(%A)"][@count="(%A)"] ; /characterLabelPattern/$1/$2
//ldml/characterLabels/characterLabelPattern[@type="(%A)"] ; /characterLabelPattern/$1
//ldml/characterLabels/characterLabel[@type="(%A)"] ; /characterLabel/$1

View file

@ -0,0 +1,386 @@
# README for configuration files used by org.unicode.icu.tool.cldrtoicu.regex.RegexTransformer.
#
# © 2019 and later: Unicode, Inc. and others.
#
# CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
# For terms of use, see http://www.unicode.org/copyright.html
======
Basics
======
The RegexTransformer class converts CLDR paths and values to ICU Resource Bundle paths
and values, based on a set of transformation rules typically loaded from a text file
(e.g. ldml2icu_locale.txt).
The basic format of transformation rules is:
<path-specification> ; <resource-bundle-specification> [; <instruction>=<argument>]*
A simple example of a transformation rule is:
//ldml/localeDisplayNames/keys/key[@type="(%A)"] ; /Keys/$1
which transforms CLDR values whose path matches the path specification, and emits:
* A resource bundle path "/Keys/xx", where 'xx' is the captured type attribute.
* A resource bundle value, which is just the CLDR value's base value.
A path specification can be thought of as a regular expression which matches the CLDR
path and can capture some element names or attribute values; however unlike a regular
expression, the '[',']' characters are treated as literals, similar to XPath expressions.
If a single CLDR value should produce more than one resource bundle path/value, then
it should be written:
<path-specification>
; <resource-bundle-1-specification> [; <instruction> ]*
; <resource-bundle-2-specification> [; <instruction> ]*
=====================
Argument Substitution
=====================
Before a rule can be matched, any %-variables must be substituted. These are defined
in the same configuration file as the rules, and look something like:
%W=[\w\-]++
or:
%D=//ldml/numbers/defaultNumberingSystem
The first case can be thought of as just a snippet of regular expression (in this case
something that matches hyphen separated words) and, importantly, here '[' and ']' are
treated as regular expression metacharacters. These arguments are static and will be
substituted exactly as-is into the regular expression to be used for matching.
The second case (used exactly once) is a dynamic argument which references a CLDR value
in the set of data being transformed. This is simply indicated by the fact that it starts
with '//'. This path is resolved and the value is substituted just prior to matching.
Variable names are limited to a single upper-case letter (A-Z).
===========================
Implicit Argument Splitting
===========================
This is a (somewhat non-obvious) mechanism which allows for a single rule to generate
multiple results from a single input path when an argument is a list of tokens.
Consider the rule:
//supplementalData/timeData/hours[@allowed="(%W)"][@preferred="(%W)"][@regions="(%W)"]
; /timeData/$3/allowed ; values=$1
; /timeData/$3/preferred ; values=$2
where the "regions" attribute (which is captured as '$3') contains a whitespace separated
list of region codes (e.g. "US GB AU NZ"). In this case the rule is applied once for each
region, producing paths such as "/timeData/US/allowed" or "/timeData/NZ/preferred". Note
that there is no explicit instruction to do this, it just happens.
The rule is that the first unquoted argument in the resource bundle path is always treated
as splittable.
To suppress this behaviour, the argument must be quoted (e.g. /timeData/"$3"/allowed). Now,
if there were another following unquoted argument, that would become implicitly splittable
(but only one argument is ever splittable).
============
Instructions
============
Additional instructions can be supplied to control value transformation and specify fallback
values. The set of instructions is:
* values: The most common instruction which defines how values are transformed.
* fallback: Defines a fallback value to be used if this rule was not matched.
There are two other special case instructions which should (if at all possible) not be used,
and might be removed at some point:
* group: Causes values to be grouped as sub-arrays for very specific use cases
(prefer using "Hidden Labels" where possible).
* base_xpath: Allows deduplication of results between multiple different rules (this is a
hack to work around limitations in how matching is performed).
-------------------
values=<expression>
-------------------
The "values" instruction defines an expression whose evaluated result becomes the output
resource bundle value(s). Unless quoting is present, this evaluated expression is split
on whitespace and can become multiple values in the resulting resource bundle.
Examples:
* values=$1 $2 $3
Produces three separate values in the resource bundle for the first three captured
arguments.
* values="$1 $2" $3
Produces two values in the resource bundle, the first of which is two captured values
separated by a space character.
* values={value}
Substitutes the CLDR value, but then performs whitespace splitting on the result. This
differs from the behaviour when no "values" instruction is present (which does not
split the results).
* values="{value}" $1
Produces two values, the first of which is the unsplit CLDR value, and the second is a
captured argument.
* values=&func($1, {value})
Invokes a transformation function, passing in a captured argument and the CLDR value,
and the result is then split. The set of functions available to a transformer is
configured when it is created.
Note that in the above examples, it is assumed that the $N arguments do not contain spaces.
If they did, it would result in more output values. To be strict about things, every value
which should not be split must be quoted (e.g. values="$1" "$2" "$3") but since captured
values are often IDs or other tokens, this is not what is seen in practice, so it is not
reflected in these examples.
---------------------
fallback=<expression>
---------------------
The fallback instruction provides a way for default values to be emitted for a path that
was not matched. Fallbacks are useful when several different rules produce values for the
same resource bundle. In this case the output path produced by one rule can be used as
the "key" for any unmatched rules with fallback values (to "fill in the gaps").
Consider the two rules which can emit the same resource bundle path:
//ldml/numbers/currencies/currency[@type="(%W)"]/symbol
; /Currencies/$1 ; fallback=$1
//ldml/numbers/currencies/currency[@type="(%W)"]/displayName
; /Currencies/$1 ; fallback=$1
These rules, if both matched, will produce two values for the same resource bundle path.
Consider the CLDR values:
//ldml/numbers/currencies/currency[@type="USD"]/symbol ==> "$"
//ldml/numbers/currencies/currency[@type="USD"]/displayName ==> "US Dollar"
After matching both of these paths, the values for the resource bundle "/Currencies/USD"
will be the array { "$", "US Dollar" }.
However, if only one value were present to be converted, the converter could use the
matched path "/Currencies/XXX" and infer the missing fallback value, ensuring that the
output array (if it was emitted at all) was always two values.
Note that in order for this to work, the fallback value must be derivable only from the
matched path. E.g. it cannot contain arguments that are not also present in the matched
path, and obviously cannot reference the "{value}" at all. Thus the following would not
be permitted:
//ldml/foo/bar[@type="(%W)"][@region=(%A)] ; /Foo/$1 ; fallback=$2
However the fallback value can reference existing CLDR or resource bundle paths (expected
to be present from other rules). For example:
fallback=/weekData/001:intvector[0]
or:
fallback=//ldml/numbers/symbols[@numberSystem="%D"]/decimal
The latter case is especially complex because it also uses the "dynamic" argument:
%D=//ldml/numbers/defaultNumberingSystem
So determining the resulting value will require:
1) resolving "//ldml/numbers/defaultNumberingSystem" to, for example, "arab"
2) looking up the value of "//ldml/numbers/symbols[@numberSystem="arab"]/decimal"
-----------------
base_xpath=<path>
-----------------
The base_xpath instruction allows a rule to specify a proxy path which is used in place of
the originally matched path in the returned result. This is a useful hack for cases where
values are derived from information in a path prefix.
Because path matching for transformation happens only on full paths, it is possible that
several distinct CLDR paths might effectively generate the same result if they share the
same prefix (i.e. paths in the same "sub hierarchy" of the CLDR data).
If this happens, then you end up generating "the same" result from different paths. To
fix this, a "surrogate" CLDR path can be specified as a proxy for the source path,
allowing several results to appear to have come from the same source, which results in
deduplication of the final value.
For example, the two rules:
//supplementalData/territoryInfo/territory[...][@writingPercent="(%N)"][@populationPercent="(%N)"][@officialStatus="(%W)"](?:[@references="%W"])?
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
//supplementalData/territoryInfo/territory[...][@writingPercent="(%N)"][@populationPercent="(%N)"](?:[@references="%W"])?
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
Produce the same results for different paths (with or without the "officialStatus"
attribute) but only one such result is desired. By specifying the same base_xpath on
both rules, the conversion logic can deduplicate these to produce only one result.
When using base_xpath, it is worth noting that:
1) Base xpaths must be valid "distinguishing" paths (but are never matched to any rule).
2) Base xpaths can use arguments to achieve the necessary level of uniqueness.
3) Rules which share the same base xpath must always produce the same values.
Note however that this is still very much a hack because, since two rules are responsible
for generating the same result, there is no well defined "line number" to use for ordering
of values. Thus this mechanism should only be used for rules which produce "single"
values, and must not be used in cases where the ordering of values in arrays is important.
This mechanism only exists because there is currently no mechanism for partial matching
or a way to match one path against multiple rules.
-----
group
-----
The "group" instruction should be considered a "last resort" hack for controlling value
grouping, in cases where "hidden labels" are not suitable (see below).
==============================
Value Arrays and Hidden Labels
==============================
In the simplest case, one rule produces one or more output path/values per matched CLDR
value (i.e. one-to-one or one-to-many). If that happens, then output ordering of the
resource bundle paths is just the natural resource bundle path ordering.
However it is also possible for several rules to produce values for a single output path
(i.e. many-to-one). When this happens there are some important details about how results
are grouped and ordered.
------------
Value Arrays
------------
If several rules produce results for the same resource bundle path, the values produced
by the rules are always ordered according to the order of the rules in the configuration
file (and it is best practice to group any such rules together for clarity).
If each rule produces multiple values, then depending on grouping, those values can either
be concatenated together in a single array or grouped individually to create an array
of arrays.
In the example below, there are four rules producing values for the same path:
//.../firstDay[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1)
//.../minDays[@count="(%N)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=$1
//.../weekendStart[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1) 0
//.../weekendEnd[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1) 86400000
The first two rules produce one value each, and the last two produce two values each. This
results in the resource bundle "/weekData/xxx:intvector" having a single array consisting
of six values. In the real configuration, these rules also use fallback instructions to
ensure that the resulting array of values is always six values, even if some CLDR paths are
not present.
-------------
Hidden Labels
-------------
Sometimes rules should produce separate "sub-arrays" of values, rather than having all the
values appended to a single array. Consider the following path/value pairs:
x/y: a
x/y: b
x/y: c
Which produce the resource bundle "x/y" with three values:
x{
y{
"a",
"b",
"c"
}
}
Now suppose we want to make a resource bundle where the values are grouped into their
own sub-array:
x{
y{
{ "a", "b", "c" }
}
}
We can think of this as coming from the path/value pairs:
x/y/-: a
x/y/-: b
x/y/-: c
where to represent the sub-array we introduce the idea of an empty path element '-'.
In a transformation rule, these "empty elements" are represented as "hidden labels", and look
like "<some-label>". They are treated as "normal" path elements for purposes of ordering and
grouping, but are treated as empty when the paths are written to the ICU data files.
For example the rule:
//.../currencyCodes[@type="(%W)"][@numeric="(%N)"].* ; /codeMappingsCurrency/<$1> ; values=$1 $2
Generates a series of grouped, 2-element sub-arrays split by the captured type attribute.
codeMappingsCurrency{
{ type-1, numeric-1 }
{ type-2, numeric-2 }
{ type-3, numeric-3 }
}
<FIFO> is a special hidden label which is replaced by an incrementing counter when
sorting paths. It ensures that values in the same array are sorted in the order that they
were encountered. However this mechanism imposes a strict requirement that the ordering
of CLDR values to be transformed matches the expected ICU value order, so it should be
avoided where possible to avoid this implicit, subtle dependency. Note that this mechanism
is currently only enabled for the transformation of "supplemental data" and may eventually
be removed.
Hidden labels are a neat solution which permits the generation of sub-array values, but they
don't quite work in every case. For example if you need to produce a resource bundle with a
mix of values and sub-arrays, like:
x{
y{
"a",
{ "b", "c" }
"d"
}
}
which can be thought of as coming from the path/value pairs:
x/y: a
x/y/<z>: b
x/y/<z>: c
x/y: d
we find that, after sorting the resource bundle paths, we end up with:
x/y: a
x/y: d
x/y/<z>: b
x/y/<z>: c
which produces the wrong result. This happens because values with different paths are
sorted primarily by their path. In cases like this, where a mix of values and sub-arrays
are required, the "group" instruction can be used instead.
For example:
//ldml/numbers/currencies/currency[@type="(%W)"]/symbol ; /Currencies/$1
//ldml/numbers/currencies/currency[@type="(%W)"]/displayName ; /Currencies/$1
//ldml/numbers/currencies/currency[@type="(%W)"]/pattern ; /Currencies/$1 ; group
//ldml/numbers/currencies/currency[@type="(%W)"]/decimal ; /Currencies/$1 ; group
//ldml/numbers/currencies/currency[@type="(%W)"]/group ; /Currencies/$1 ; group
Produces resource bundles which look like:
Currencies{
xxx{
"<symbol>",
"<display name>",
{ "<pattern>", "<decimal>", "<group>" }
}
}

View file

@ -0,0 +1,202 @@
# ldml2icu_supplemental.txt
#
# © 2016 and later: Unicode, Inc. and others.
#
# CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
# For terms of use, see http://www.unicode.org/copyright.html
#
# Used by SupplementalMapper.
# Data-driven file for mapping supplemental LDML paths to ICU paths.
# See ldml2icu_readme.txt for a detailed explanation of this file.
# Attribute value
%A=[^"']++
# Attribute value, no underscore
%B=[^"'_]++
# Word/Zone match
%W=[\s\w\-/]++
# Greedy word match
%G=[\s\w\-]+
# Number match
%N=[\d\.]++
# supplementalData.xml
//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@iso4217="(%W)"]
; /CurrencyMap/$1/<FIFO>/id ; values=$2
//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@iso4217="(%W)"][@tender="false"]
; /CurrencyMap/$1/<FIFO>/id ; values=$2
; /CurrencyMap/$1/<FIFO>/tender ; values=false
//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@from="(%W)"][@iso4217="(%W)"]
; /CurrencyMap/$1/<FIFO>/id ; values=$3
; /CurrencyMap/$1/<FIFO>/from:intvector ; values=&date($2, from)
//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@from="(%W)"][@iso4217="(%W)"][@tender="false"]
; /CurrencyMap/$1/<FIFO>/id ; values=$3
; /CurrencyMap/$1/<FIFO>/from:intvector ; values=&date($2, from)
; /CurrencyMap/$1/<FIFO>/tender ; values=false
//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@from="(%W)"][@to="(%W)"][@iso4217="(%W)"]
; /CurrencyMap/$1/<FIFO>/id ; values=$4
; /CurrencyMap/$1/<FIFO>/from:intvector ; values=&date($2, from)
; /CurrencyMap/$1/<FIFO>/to:intvector ; values=&date($3, to)
//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@from="(%W)"][@to="(%W)"][@iso4217="(%W)"][@tender="false"]
; /CurrencyMap/$1/<FIFO>/id ; values=$4
; /CurrencyMap/$1/<FIFO>/from:intvector ; values=&date($2, from)
; /CurrencyMap/$1/<FIFO>/to:intvector ; values=&date($3, to)
; /CurrencyMap/$1/<FIFO>/tender ; values=false
//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@to="(%W)"][@iso4217="(%W)"][@tender="false"]
; /CurrencyMap/$1/<FIFO>/id ; values=$3
; /CurrencyMap/$1/<FIFO>/to:intvector ; values=&date($2, to)
; /CurrencyMap/$1/<FIFO>/tender ; values=false
//supplementalData/currencyData/fractions/info[@iso4217="(%W)"][@digits="(%N)"][@rounding="(%N)"][@cashDigits="(%N)"][@cashRounding="(%N)"] ; /CurrencyMeta/$1:intvector ; values=$2 $3 $4 $5
//supplementalData/currencyData/fractions/info[@iso4217="(%W)"][@digits="(%N)"][@rounding="(%N)"][@cashRounding="(%N)"] ; /CurrencyMeta/$1:intvector ; values=$2 $3 $2 $4
//supplementalData/currencyData/fractions/info[@iso4217="(%W)"][@digits="(%N)"][@rounding="(%N)"] ; /CurrencyMeta/$1:intvector ; values=$2 $3 $2 $3
//supplementalData/calendarPreferenceData/calendarPreference[@territories="(%A)"][@ordering="(%A)"] ; /calendarPreferenceData/$1 ; values=$2
//supplementalData/codeMappings/territoryCodes[@type="(%W)"][@numeric="(%N)"][@alpha3="(%W)"].* ; /codeMappings/<$1> ; values=$1 $2 $3
//supplementalData/codeMappings/currencyCodes[@type="(%W)"][@numeric="(%N)"].* ; /codeMappingsCurrency/<$1> ; values=$1 $2
//supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"][@territories="(%W)"][@alt="secondary"]
; /languageData/$1/secondary/scripts ; values=$2
; /languageData/$1/secondary/territories ; values=$3
//supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"][@alt="secondary"] ; /languageData/$1/secondary/scripts ; values=$2
//supplementalData/languageData/language[@type="(%W)"][@territories="(%G)"][@alt="secondary"] ; /languageData/$1/secondary/territories ; values=$2
//supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"][@territories="(%W)"]
; /languageData/$1/primary/scripts ; values=$2
; /languageData/$1/primary/territories; values=$3
//supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"] ; /languageData/$1/primary/scripts ; values=$2
//supplementalData/languageData/language[@type="(%W)"][@territories="(%W)"] ; /languageData/$1/primary/territories ; values=$2
//supplementalData/territoryContainment/group[@type="(%W)"][@contains="(%A)"][@status="deprecated"] ; /territoryContainment/deprecated/$1 ; values=$2
//supplementalData/territoryContainment/group[@type="(%W)"][@contains="(%A)"][@status="grouping"] ; /territoryContainment/containedGroupings/$1 ; values=$2
//supplementalData/territoryContainment/group[@type="(%W)"][@contains="(%A)"][@grouping="true"] ; /territoryContainment/grouping/$1 ; values=$2
//supplementalData/territoryContainment/group[@type="(%W)"][@contains="(%A)"]; /territoryContainment/$1 ; values=$2
//supplementalData/subdivisionContainment/subgroup[@type="(%W)"][@contains="(%A)"]; /subdivisionContainment/$1 ; values=$2
//supplementalData/subdivisionContainment/subgroup[@type="(%W)"][@subtype="(%W)"][@contains="(%A)"]; /subdivisionContainment/$1-$2 ; values=$3
//supplementalData/weekData/firstDay[@day="(%W)"][@territories="(%W)"](?:[@references="(?:%A)"])?[@alt="(%A)"] ; /weekData%$3/$2:intvector ; values=&day_number($1) ; fallback=/weekData/001:intvector[0]
//supplementalData/weekData/firstDay[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1) ; fallback=/weekData/001:intvector[0]
//supplementalData/weekData/minDays[@count="(%N)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=$1 ; fallback=/weekData/001:intvector[1]
//supplementalData/weekData/weekendStart[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1) 0 ; fallback=/weekData/001:intvector[2] /weekData/001:intvector[3]
//supplementalData/weekData/weekendEnd[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1) 86400000 ; fallback=/weekData/001:intvector[4] /weekData/001:intvector[5]
//supplementalData/weekData/weekOfPreference[@locales="(%A)"][@ordering="(%A)"] ; /weekOfPreference/$1 ; values=$2
//supplementalData/timeData/hours[@allowed="(%W)"][@preferred="(%W)"][@regions="(%W)"]
; /timeData/$3/allowed ; values=$1
; /timeData/$3/preferred ; values=$2
//supplementalData/measurementData/measurementSystem[@type="metric"][@category="(%W)"][@territories="(%W)"] ; /measurementData/$2/MeasurementSystemCategory/$1:int ; values=0
//supplementalData/measurementData/measurementSystem[@type="US"][@category="(%W)"][@territories="(%W)"] ; /measurementData/$2/MeasurementSystemCategory/$1:int ; values=1
//supplementalData/measurementData/measurementSystem[@type="UK"][@category="(%W)"][@territories="(%W)"] ; /measurementData/$2/MeasurementSystemCategory/$1:int ; values=2
//supplementalData/measurementData/measurementSystem[@type="metric"][@territories="(%W)"] ; /measurementData/$1/MeasurementSystem:int ; values=0
//supplementalData/measurementData/measurementSystem[@type="US"][@territories="(%W)"] ; /measurementData/$1/MeasurementSystem:int ; values=1
//supplementalData/measurementData/measurementSystem[@type="UK"][@territories="(%W)"] ; /measurementData/$1/MeasurementSystem:int ; values=2
//supplementalData/measurementData/paperSize[@type="A4"][@territories="(%W)"] ; /measurementData/$1/PaperSize:intvector ; values=297 210
//supplementalData/measurementData/paperSize[@type="US-Letter"][@territories="(%W)"] ; /measurementData/$1/PaperSize:intvector ; values=279 216
//supplementalData/unitPreferenceData/unitPreferences[@category="(%W)"][@usage="(%A)"][@scope="small"]/unitPreference[@regions="(%A)"][@alt="informal"] ; /unitPreferenceData/$3/$1-$2-small-informal
//supplementalData/unitPreferenceData/unitPreferences[@category="(%W)"][@usage="(%A)"][@scope="small"]/unitPreference[@regions="(%A)"] ; /unitPreferenceData/$3/$1-$2-small
//supplementalData/unitPreferenceData/unitPreferences[@category="(%W)"][@usage="(%A)"]/unitPreference[@regions="(%A)"][@alt="informal"] ; /unitPreferenceData/$3/$1-$2-informal
//supplementalData/unitPreferenceData/unitPreferences[@category="(%W)"][@usage="(%A)"]/unitPreference[@regions="(%A)"] ; /unitPreferenceData/$3/$1-$2
//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@writingPercent="(%N)"][@populationPercent="(%N)"][@officialStatus="(%W)"](?:[@references="%W"])?
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
; /territoryInfo/$1/$5/writingShareF:int ; values=&exp($6,-2)
; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($7,-2)
; /territoryInfo/$1/$5/officialStatus ; values=$8
//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@writingPercent="(%N)"][@populationPercent="(%N)"](?:[@references="%W"])?
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
; /territoryInfo/$1/$5/writingShareF:int ; values=&exp($6,-2)
; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($7,-2)
//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@literacyPercent="(%N)"][@populationPercent="(%N)"][@officialStatus="(%W)"](?:[@references="%W"])?
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
; /territoryInfo/$1/$5/literacyShareF:int ; values=&exp($6,-2)
; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($7,-2)
; /territoryInfo/$1/$5/officialStatus ; values=$8
//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@literacyPercent="(%N)"][@populationPercent="(%N)"](?:[@references="%W"])?
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
; /territoryInfo/$1/$5/literacyShareF:int ; values=&exp($6,-2)
; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($7,-2)
//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@populationPercent="(%N)"][@officialStatus="(%W)"](?:[@references="%W"])?
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($6,-2)
; /territoryInfo/$1/$5/officialStatus ; values=$7
//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@populationPercent="(%N)"](?:[@references="%W"])?
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($6,-2)
# This only exists right now for 'ZZ', which has no <languagePopulation> child elements.
//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
//supplementalData/calendarData/calendar[@type="(%W)"]/calendarSystem[@type="(%W)"] ; /calendarData/$1/system ; values=$2
//supplementalData/calendarData/calendar[@type="(%W)"]/eras/era[@type="(%W)"][@(start|end)="(%A)"][@named="(%W)"]
; /calendarData/$1/eras/$2/$3:intvector ; values=&ymd($4)
; /calendarData/$1/eras/$2/named ; values=$5
//supplementalData/calendarData/calendar[@type="(%W)"]/eras/era[@type="(%W)"][@(start|end)="(%A)"]
; /calendarData/$1/eras/$2/$3:intvector ; values=&ymd($4)
# languageInfo.xml
//supplementalData/languageMatching/languageMatches[@type="(%B)_new"]/paradigmLocales[@locales="(%A)"] ; /languageMatchingInfo/$1/paradigmLocales ; values=$2
//supplementalData/languageMatching/languageMatches[@type="(%B)_new"]/matchVariable[@id="\$(%A)"][@value="(%A)"] ; /languageMatchingInfo/$1/matchVariable/$2 ; values=$3
//supplementalData/languageMatching/languageMatches[@type="(%B)_new"]/languageMatch[@desired="(%A)"][@supported="(%A)"][@distance="(%N)"][@oneway="true"] ; /languageMatchingNew/$1/<FIFO> ; values=$2 $3 $4 1
//supplementalData/languageMatching/languageMatches[@type="(%B)_new"]/languageMatch[@desired="(%A)"][@supported="(%A)"][@distance="(%N)"] ; /languageMatchingNew/$1/<FIFO> ; values=$2 $3 $4 0
//supplementalData/languageMatching/languageMatches[@type="(%B)"]/languageMatch[@desired="(%A)"][@supported="(%A)"][@percent="(%N)"][@oneway="true"] ; /languageMatching/$1/<FIFO> ; values=$2 $3 $4 1
//supplementalData/languageMatching/languageMatches[@type="(%B)"]/languageMatch[@desired="(%A)"][@supported="(%A)"][@percent="(%N)"] ; /languageMatching/$1/<FIFO> ; values=$2 $3 $4 0
# likelySubtags.xml
//supplementalData/likelySubtags/likelySubtag[@from="(%A)"][@to="(%A)"] ; /$1 ; values=$2
# metaZones.xml - metaZones.txt
//supplementalData/metaZones/mapTimezones[@type="metazones"]/mapZone[@type="(%A)"][@other="(%W)"][@territory="(%W)"] ; /mapTimezones/$2/$3 ; values=$1
//supplementalData/metaZones/metazoneInfo/timezone[@type="(%W)"]/usesMetazone[@mzone="(%W)"] ; /metazoneInfo/"$1"/<$2> ; values=$2
//supplementalData/metaZones/metazoneInfo/timezone[@type="(%W)"]/usesMetazone[@from="(%A)"][@mzone="(%W)"] ; /metazoneInfo/"$1"/<$2> ; values=$3 "$2" "9999-12-31 23:59"
//supplementalData/metaZones/metazoneInfo/timezone[@type="(%W)"]/usesMetazone[@from="(%A)"][@to="(%A)"][@mzone="(%W)"] ; /metazoneInfo/"$1"/<$2> ; values=$4 "$2" "$3"
//supplementalData/metaZones/metazoneInfo/timezone[@type="(%W)"]/usesMetazone[@to="(%A)"][@mzone="(%W)"] ; /metazoneInfo/"$1"/<1970-01-01 00:00> ; values=$3 "1970-01-01 00:00" "$2"
//supplementalData/primaryZones/primaryZone[@iso3166="(%W)"] ; /primaryZones/$1 ; values={value}
# numberingSystems.txt
//supplementalData/numberingSystems/numberingSystem[@type="algorithmic"][@id="(%W)"][@rules="(%A)"]
; /numberingSystems/$1/algorithmic:int ; values=1
; /numberingSystems/$1/desc ; values=&algorithm($2)
; /numberingSystems/$1/radix:int ; values=10
//supplementalData/numberingSystems/numberingSystem[@type="numeric"][@id="(%W)"][@digits="(%A)"]
; /numberingSystems/$1/algorithmic:int ; values=0
; /numberingSystems/$1/desc ; values=$2
; /numberingSystems/$1/radix:int ; values=10
# windowsZones.txt
//supplementalData/windowsZones/mapTimezones/mapZone[@type="(%A)"][@other="(%A)"][@territory="(%W)"] ; /mapTimezones/"$2"/$3 ; values="$1"
# genderList.txt
//supplementalData/gender/personList[@type="(%W)"][@locales="(%W)"] ; /genderList/$2 ; values=$1
# locale info
//supplementalData/parentLocales/parentLocale[@parent="(%A)"][@locales="(%A)"] ; /parentLocales/$1 ; values=$2
# supplementalMetadata.xml (metadata.txt)
//supplementalData/metadata/defaultContent[@locales="(%A)"] ; /defaultContent ; values=$1
//supplementalData/metadata/alias/(language|script|territory|subdivision|variant)Alias[@type="(%A)"][@replacement="(%A)"][@reason="(%A)"]
; /alias/$1/$2/reason ; values="$4"
; /alias/$1/$2/replacement ; values="$3"
# Region codes used by ICU's Region class
# Specify the value explicitly so that the LDMLConverter will split it.
//supplementalData/metadata/validity/variable[@type="choice"][@id="\$territory"] ; /regionCodes ; values={value}
# validity
//supplementalData/idValidity/id[@type="(%A)"][@idStatus="(%A)"] ; /idValidity/$1/$2 ; values={value}

View file

@ -0,0 +1,127 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.truth.Truth.assertThat;
import static com.google.common.truth.Truth8.assertThat;
import static org.junit.Assert.fail;
import static org.unicode.cldr.api.CldrPath.parseDistinguishingPath;
import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows;
import java.util.Arrays;
import java.util.List;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.unicode.cldr.api.CldrPath;
/**
 * Tests for {@code PathMatcher}: full, prefix and suffix matching against distinguishing CLDR
 * paths, wildcard segments, combined matchers, and rejection of malformed specifications.
 */
@RunWith(JUnit4.class)
public class PathMatcherTest {
    // Templates for the calendar month/day paths produced by monthInfo() and dayInfo().
    private static final String MONTH_PATH_FORMAT =
        "//ldml/dates/calendars/calendar[@type=\"%s\"]"
            + "/months/monthContext[@type=\"%s\"]"
            + "/monthWidth[@type=\"%s\"]"
            + "/month[@type=\"%d\"]";
    private static final String DAY_PATH_FORMAT =
        "//ldml/dates/calendars/calendar[@type=\"%s\"]"
            + "/days/dayContext[@type=\"%s\"]"
            + "/dayWidth[@type=\"%s\"]"
            + "/day[@type=\"%s\"]";

    /** Exercises prefix, suffix and full matching over a small fixed set of calendar paths. */
    @Test
    public void testMatcher() {
        CldrPath buddhistEra = parseDistinguishingPath(
            "//ldml/dates/calendars/calendar[@type=\"buddhist\"]/eras/eraAbbr/era[@type=\"0\"]");
        CldrPath chineseFirst = monthInfo("chinese", "format", "abbreviated", 1);
        CldrPath chineseSecond = monthInfo("chinese", "format", "abbreviated", 2);
        CldrPath genericFirst = monthInfo("generic", "stand-alone", "narrow", 1);
        CldrPath genericSecond = monthInfo("generic", "stand-alone", "narrow", 2);
        List<CldrPath> allPaths = Arrays.asList(
            buddhistEra, chineseFirst, chineseSecond, genericFirst, genericSecond);

        // The bare calendar element is a prefix of every path, but is neither a complete match
        // nor a suffix match for any of them.
        PathMatcher calendarPrefix = PathMatcher.of("ldml/dates/calendars/calendar");
        for (CldrPath path : allPaths) {
            assertThat(calendarPrefix.matchesPrefixOf(path)).isTrue();
        }
        for (CldrPath path : allPaths) {
            assertThat(calendarPrefix.matches(path)).isFalse();
        }
        for (CldrPath path : allPaths) {
            assertThat(calendarPrefix.matchesSuffixOf(path)).isFalse();
        }

        // Fixing the calendar type narrows the prefix match to the Chinese calendar paths.
        PathMatcher chinesePrefix =
            PathMatcher.of("ldml/dates/calendars/calendar[@type=\"chinese\"]");
        assertThat(allPaths.stream().filter(chinesePrefix::matchesPrefixOf))
            .containsExactly(chineseFirst, chineseSecond);

        // Wildcard attribute values match any month of any width.
        PathMatcher monthSuffix = PathMatcher.of("monthWidth[@type=*]/month[@type=*]");
        assertThat(allPaths.stream().filter(monthSuffix::matchesSuffixOf))
            .containsExactly(chineseFirst, chineseSecond, genericFirst, genericSecond);

        // An explicit width restricts the suffix match; a suffix specification is never a
        // complete match for these (longer) paths.
        PathMatcher narrowMonthSuffix =
            PathMatcher.of("monthWidth[@type=\"narrow\"]/month[@type=*]");
        assertThat(allPaths.stream().filter(narrowMonthSuffix::matchesSuffixOf))
            .containsExactly(genericFirst, genericSecond);
        assertThat(allPaths.stream().filter(narrowMonthSuffix::matches)).isEmpty();

        // A single trailing element with a fixed attribute value.
        PathMatcher firstMonthSuffix = PathMatcher.of("month[@type=\"1\"]");
        assertThat(allPaths.stream().filter(firstMonthSuffix::matchesSuffixOf))
            .containsExactly(chineseFirst, genericFirst);

        // A fully specified path matches exactly one of the test paths.
        PathMatcher exact = PathMatcher.of("ldml/dates"
            + "/calendars/calendar[@type=\"generic\"]"
            + "/months/monthContext[@type=\"stand-alone\"]"
            + "/monthWidth[@type=\"narrow\"]"
            + "/month[@type=\"2\"]");
        assertThat(allPaths.stream().filter(exact::matches)).containsExactly(genericSecond);
    }

    /** Checks that '*' can stand in for whole element names as well as attribute values. */
    @Test
    public void testWildcardSegment() {
        PathMatcher matcher = PathMatcher.of("ldml/dates"
            + "/calendars/calendar[@type=\"generic\"]"
            + "/*/*[@type=\"format\"]/*[@type=\"narrow\"]/*[@type=*]");

        CldrPath[] accepted = {
            monthInfo("generic", "format", "narrow", 1),
            monthInfo("generic", "format", "narrow", 9),
            dayInfo("generic", "format", "narrow", "sun"),
        };
        for (CldrPath path : accepted) {
            assertThat(matcher.matches(path)).isTrue();
        }

        // Each of these differs from the specification in exactly one fixed attribute.
        CldrPath[] rejected = {
            monthInfo("chinese", "format", "narrow", 1),
            monthInfo("generic", "stand-alone", "narrow", 1),
            dayInfo("generic", "format", "wide", "mon"),
        };
        for (CldrPath path : rejected) {
            assertThat(matcher.matches(path)).isFalse();
        }
    }

    /** Checks that a combined matcher accepts a path when any of its constituents does. */
    @Test
    public void testAnyOf() {
        PathMatcher narrowMonths =
            PathMatcher.of("monthWidth[@type=\"narrow\"]/month[@type=*]");
        PathMatcher narrowDays = PathMatcher.of("dayWidth[@type=\"narrow\"]/day[@type=*]");
        PathMatcher either = PathMatcher.anyOf(narrowMonths, narrowDays);

        assertThat(either.matchesSuffixOf(monthInfo("generic", "format", "narrow", 1)))
            .isTrue();
        assertThat(either.matchesSuffixOf(dayInfo("generic", "format", "narrow", "sun")))
            .isTrue();
        assertThat(either.matchesSuffixOf(monthInfo("generic", "format", "wide", 1)))
            .isFalse();
        assertThat(either.matchesSuffixOf(dayInfo("generic", "format", "wide", "mon")))
            .isFalse();
    }

    /** Checks that malformed path specifications are rejected when a matcher is created. */
    @Test
    public void testBadSpecifiers() {
        String[] invalidSpecs = {
            "",
            // Leading and trailing '/' are not permitted (they imply empty segments).
            "/foo/",
            "foo//bar",
            "foo/bad segment name",
            "foo/bar[type=*]",
            "foo/bar[@type=**]",
            "foo/bar[@type='double-quotes-only']",
        };
        for (String spec : invalidSpecs) {
            assertInvalidPathSpecification(spec);
        }
    }

    // Asserts that creating a matcher for the given specification fails with an error message
    // which has the expected prefix and quotes the offending specification.
    private void assertInvalidPathSpecification(String spec) {
        IllegalArgumentException thrown =
            assertThrows(IllegalArgumentException.class, () -> PathMatcher.of(spec));
        assertThat(thrown).hasMessageThat().startsWith("invalid path specification");
        assertThat(thrown).hasMessageThat().contains(spec);
    }

    // Builds the distinguishing path of a calendar month name.
    private static CldrPath monthInfo(String type, String context, String width, int number) {
        String path = String.format(MONTH_PATH_FORMAT, type, context, width, number);
        return parseDistinguishingPath(path);
    }

    // Builds the distinguishing path of a calendar day name.
    private static CldrPath dayInfo(String type, String context, String width, String id) {
        String path = String.format(DAY_PATH_FORMAT, type, context, width, id);
        return parseDistinguishingPath(path);
    }
}

View file

@ -0,0 +1,44 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import static org.unicode.icu.tool.cldrtoicu.testing.RbPathSubjectFactory.assertThat;
import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows;
import static com.google.common.truth.Truth.assertThat;
import static com.google.common.truth.Truth8.assertThat;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
/** Tests for {@code RbPath} creation, parsing and validation of path strings. */
@RunWith(JUnit4.class)
public class RbPathTest {
    /** The empty path has no segments and zero length. */
    @Test
    public void testEmpty() {
        assertThat(RbPath.empty()).hasSegments();
        assertThat(RbPath.empty()).hasLength(0);
    }

    /** {@code of()} takes segments verbatim, whereas {@code parse()} splits on '/'. */
    @Test
    public void testParseVsOf() {
        RbPath twoSegments = RbPath.of("foo", "bar");
        assertThat(twoSegments).hasSegments("foo", "bar");

        // A '/' passed to of() is part of the segment rather than a separator.
        RbPath unsplit = RbPath.of("foo/bar");
        assertThat(unsplit).hasSegments("foo/bar");

        RbPath parsed = RbPath.parse("foo/bar");
        assertThat(parsed).hasSegments("foo", "bar");
    }

    /** Malformed path strings are rejected by {@code parse()} with a descriptive message. */
    @Test
    public void testBadArgs() {
        // Each row is { path to parse, snippet expected in the error message }.
        String[][] badPaths = {
            {"", "empty path string"},
            {"foo//bar", "empty path segment"},
            {"foo/<bar/baz", "mismatched quoting"},
            {"foo/\"bar", "mismatched quoting"},
            {"foo/\"bar\"baz\"", "invalid character"},
            {"foo/bar baz", "invalid character"},
        };
        for (String[] row : badPaths) {
            assertBadPath(row[0], row[1]);
        }
    }

    // Asserts that parsing the given path fails with a message containing the given snippet.
    private static void assertBadPath(String path, String errorSnippet) {
        IllegalArgumentException thrown =
            assertThrows(IllegalArgumentException.class, () -> RbPath.parse(path));
        assertThat(thrown).hasMessageThat().contains(errorSnippet);
    }
}

View file

@ -0,0 +1,357 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.truth.Truth.assertThat;
import static com.google.common.truth.Truth.assertWithMessage;
import static com.google.common.truth.Truth8.assertThat;
import static java.util.Arrays.asList;
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
import static org.unicode.cldr.api.CldrValue.parseValue;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrValue;
import org.unicode.cldr.tool.LikelySubtags;
import org.unicode.cldr.util.LanguageTagCanonicalizer;
import org.unicode.cldr.util.LocaleIDParser;
import org.unicode.cldr.util.SupplementalDataInfo;
import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableSet;
/**
* Unit tests for the supplemental data API. These tests either use fake data for unit testing, or
* compare behaviour between this API and the equivalent CLDR utility tool for regression testing.
*/
@RunWith(JUnit4.class)
public class SupplementalDataTest {
private static SupplementalData regressionData;
private static LikelySubtags likelySubtags;
/**
 * Loads the CLDR-backed fixtures shared by the regression tests in this class.
 *
 * <p>The CLDR root directory is read from the {@code CLDR_DIR} system property. Both the
 * supplemental data under test and the CLDR likely-subtags helper it is compared against are
 * created from that directory.
 *
 * @throws IllegalStateException if the {@code CLDR_DIR} system property is not set.
 */
@BeforeClass
public static void loadRegressionData() {
    String cldrDir = System.getProperty("CLDR_DIR");
    // Fail with an actionable message rather than the bare NullPointerException which
    // Paths.get() would otherwise throw for an unset property.
    if (cldrDir == null) {
        throw new IllegalStateException(
            "the CLDR_DIR system property must be set to the root directory of the CLDR"
                + " release (e.g. -DCLDR_DIR=/path/to/cldr)");
    }
    Path cldrRoot = Paths.get(cldrDir);
    regressionData = SupplementalData
        .create(CldrDataSupplier.forCldrFilesIn(cldrRoot).getDataForType(SUPPLEMENTAL));
    SupplementalDataInfo sdi =
        SupplementalDataInfo.getInstance(cldrRoot.resolve("common/supplemental").toString());
    likelySubtags = new LikelySubtags(sdi);
}
/**
 * Checks explicit (non-truncation) parent locales, and that {@code getParent()} falls back
 * to truncation when no explicit parent is defined.
 */
@Test
public void testGetParent_explicit() {
    // "en_AU" and "en_GB" are given "en_001" as an explicit parent.
    SupplementalData data = fakeSupplementalData(parentLocales("en_001", "en_AU", "en_GB"));

    for (String child : asList("en_GB", "en_AU")) {
        assertThat(data.getExplicitParentLocaleOf(child)).hasValue("en_001");
    }
    for (String unlisted : asList("en_US", "en")) {
        assertThat(data.getExplicitParentLocaleOf(unlisted)).isEmpty();
    }

    // Each row is { locale ID, expected parent }.
    String[][] expectedParents = {
        {"en_GB", "en_001"},
        {"en_AU", "en_001"},
        {"en_001", "en"},
        {"en_US", "en"},
        {"en", "root"},
    };
    for (String[] row : expectedParents) {
        assertThat(data.getParent(row[0])).isEqualTo(row[1]);
    }
}
/** Checks how the parent is derived for locale IDs which end in a script subtag. */
@Test
public void testGetParent_likelyScript() {
    // Likely subtags determine the default script of a language ("Hans" for "zh" here).
    SupplementalData data = fakeSupplementalData(likelySubtag("zh", "zh_Hans_CN"));

    // Removing a non-default script makes the parent "root".
    String hantParent = data.getParent("zh_Hant");
    assertThat(hantParent).isEqualTo("root");

    // "Hans" is the default script, so the parent is obtained by simple truncation.
    String hansParent = data.getParent("zh_Hans");
    assertThat(hansParent).isEqualTo("zh");
}
/** Checks locale maximization against a small set of fake likely-subtag mappings. */
@Test
public void testMaximize() {
    SupplementalData data = fakeSupplementalData(
        likelySubtag("en", "en_Latn_US"),
        likelySubtag("pt", "pt_Latn_BR"),
        likelySubtag("und", "en_Latn_US"));

    // "root" cannot be maximized.
    assertThat(data.maximize("root")).isEmpty();

    // Each row is { locale ID, expected maximized ID }. Subtags already present in the input
    // are preserved, and only the missing ones are filled in.
    String[][] expected = {
        {"en", "en_Latn_US"},
        {"en_GB", "en_Latn_GB"},
        {"en_VARIANT", "en_Latn_US_VARIANT"},
        {"pt", "pt_Latn_BR"},
        {"pt_PT", "pt_Latn_PT"},
        {"und", "en_Latn_US"},
    };
    for (String[] row : expected) {
        assertThat(data.maximize(row[0])).hasValue(row[1]);
    }
}
/** Checks that "root" is returned unchanged, even when no alias data exists at all. */
@Test
public void testReplaceDeprecatedTags_iAmRoot() {
    String replaced = fakeSupplementalData().replaceDeprecatedTags("root");
    assertThat(replaced).isEqualTo("root");
}
/** Checks that IDs without deprecated subtags are returned exactly as given. */
@Test
public void testReplaceDeprecatedTags_sameSubtags() {
    SupplementalData data = fakeSupplementalData(likelySubtag("en", "en_Latn_US"));

    // "Latn" is the likely script for "en", but replacement must neither strip it from an ID
    // which has it nor add it to an ID which lacks it.
    for (String id : asList("en_Latn_GB", "en_GB")) {
        assertThat(data.replaceDeprecatedTags(id)).isEqualTo(id);
    }
}
/** Checks one-for-one replacement of a deprecated language, script or territory subtag. */
@Test
public void testReplaceDeprecatedTags_subtagReplacement() {
    SupplementalData data = fakeSupplementalData(
        languageAlias("cym", "cy"),
        scriptAlias("Qaai", "Zinh"),
        territoryAlias("YU", "RS"));

    // Deprecated region.
    String regionReplaced = data.replaceDeprecatedTags("en_YU");
    assertThat(regionReplaced).isEqualTo("en_RS");

    // Deprecated script.
    String scriptReplaced = data.replaceDeprecatedTags("ar_Qaai_IR");
    assertThat(scriptReplaced).isEqualTo("ar_Zinh_IR");

    // Deprecated language.
    String languageReplaced = data.replaceDeprecatedTags("cym_GB");
    assertThat(languageReplaced).isEqualTo("cy_GB");
}
/** Checks replacement when aliases map between IDs with differing numbers of subtags. */
@Test
public void testReplaceDeprecatedTags_complex() {
    SupplementalData data = fakeSupplementalData(
        languageAlias("sh", "sr_Latn"),
        languageAlias("zh_TW", "zh_Hant_TW"),
        languageAlias("tzm_Latn_MA", "tzm_MA"),
        territoryAlias("YU", "RS"),
        likelySubtag("sr", "sr_Cyrl_RS"),
        likelySubtag("zh_Hant", "zh_Hant_TW"));

    // Each row is { input ID, expected result }.
    String[][] expected = {
        // The "sh" -> "sr_Latn" alias wins over "sr" maximizing to "sr_Cyrl_RS".
        {"sh_YU", "sr_Latn_RS"},
        // An alias lookup is allowed to add subtags ...
        {"zh_TW", "zh_Hant_TW"},
        // ... but never removes them, despite the "tzm_Latn_MA" -> "tzm_MA" alias entry.
        {"tzm_Latn_MA", "tzm_Latn_MA"},
    };
    for (String[] row : expected) {
        assertThat(data.replaceDeprecatedTags(row[0])).isEqualTo(row[1]);
    }
}
/**
 * Checks that a default calendar is only reported for a locale when it differs from the
 * default calendar of that locale's parent.
 */
@Test
public void testGetDefaultCalendar() {
    SupplementalData data = fakeSupplementalData(
        defaultCalendar("gregorian", "001"),
        defaultCalendar("persian", "AF"),
        likelySubtag("uz", "uz_Latn_UZ"),
        likelySubtag("uz_AF", "uz_Arab_AF"),
        likelySubtag("uz_Arab", "uz_Arab_AF"));

    assertThat(data.getDefaultCalendar("root")).hasValue("gregorian");

    // Nothing is reported where "gregorian" is already the default in the parent locale.
    for (String gregorianLocale : asList("en_US", "uz")) {
        assertThat(data.getDefaultCalendar(gregorianLocale)).isEmpty();
    }

    for (String persianLocale : asList("uz_AF", "uz_Arab")) {
        assertThat(data.getDefaultCalendar(persianLocale)).hasValue("persian");
    }

    // Nothing is reported here because the parent "uz_Arab" already uses the persian calendar.
    assertThat(data.getDefaultCalendar("uz_Arab_AF")).isEmpty();
}
/** Checks the hard-coded default calendars used for "TRADITIONAL" variant locales. */
@Test
public void testGetDefaultCalendar_secretHacks() {
    SupplementalData data = fakeSupplementalData(
        defaultCalendar("gregorian", "001"),
        likelySubtag("ja", "ja_Jpan_JP"),
        likelySubtag("th", "th_Thai_TH"));

    // Nothing is reported where "gregorian" is already the default in the parent locale.
    for (String id : asList("ja_US", "ja")) {
        assertThat(data.getDefaultCalendar(id)).isEmpty();
    }

    // The territory-only calendar mapping in the CLDR data cannot express the traditional
    // calendar of a region, so these results come from special cases hard coded in
    // SupplementalData. They could be moved into the configuration API, but ideally they
    // would simply be derived from the CLDR data.
    String[][] traditional = {
        {"ja_JP_TRADITIONAL", "japanese"},
        {"ja_TRADITIONAL", "japanese"},
        {"th_TH_TRADITIONAL", "buddhist"},
        {"th_TRADITIONAL", "buddhist"},
    };
    for (String[] row : traditional) {
        assertThat(data.getDefaultCalendar(row[0])).hasValue(row[1]);
    }
}
/**
 * Regression test: the chain of parents computed from the real CLDR data must equal the chain
 * produced by CLDR's {@code LocaleIDParser} for every test locale ID.
 */
@Test
public void testGetParent_regression() {
    TEST_LOCALE_IDS.forEach(id -> {
        Iterable<String> actualChain = getIdChain(id, regressionData::getParent);
        Iterable<String> expectedChain = getIdChain(id, LocaleIDParser::getParent);
        assertWithMessage("id=%s", id).that(actualChain).isEqualTo(expectedChain);
    });
}
/**
 * Regression test: maximizing each test locale ID with the real CLDR data must give the same
 * result as CLDR's {@code LikelySubtags} helper.
 */
@Test
public void testMaximize_regression() {
    TEST_LOCALE_IDS.forEach(id -> {
        String actual = regressionData.maximize(id).orElse(null);
        assertWithMessage("id=%s", id).that(actual).isEqualTo(likelySubtags.maximize(id));
    });

    // "ars" is currently a special case: it is in the ICU data as an alias, but is not in the
    // CLDR data at all. Thus, while it is a structurally valid language code, it cannot be
    // maximized.
    assertThat(regressionData.maximize("ars")).isEmpty();
}
/**
 * Regression test: for every test locale ID, replacing deprecated tags and then maximizing
 * must give the same result as maximizing the output of CLDR's
 * {@code LanguageTagCanonicalizer}.
 */
@Test
public void testReplaceDeprecatedTags_regression() {
LanguageTagCanonicalizer ltc = new LanguageTagCanonicalizer();
for (String id : TEST_LOCALE_IDS) {
// Work around:
// https://unicode-org.atlassian.net/projects/CLDR/issues/CLDR-13194
// The canonicalizer is probed first and its result discarded: any ID for which it throws a
// NullPointerException is reported on stdout and excluded from the comparison below.
try {
ltc.transform(id);
} catch (NullPointerException e) {
System.out.println("--> " + id);
continue;
}
// Need to maximize to work around:
// https://unicode-org.atlassian.net/projects/CLDR/issues/CLDR-13196
// Both sides are maximized, so only the fully expanded IDs are compared.
assertWithMessage("id=%s", id)
.that(regressionData.maximize(regressionData.replaceDeprecatedTags(id)).orElse(null))
.isEqualTo(likelySubtags.maximize(ltc.transform(id)));
}
}
/**
 * Returns the chain of IDs obtained by repeatedly applying the given parent function, starting
 * at the given ID and ending with (and including) the first {@code "root"} reached.
 *
 * @param id the ID at which the chain starts.
 * @param fn the function which maps an ID to its parent ID.
 * @return the visited IDs in order, always ending in {@code "root"}.
 */
private static Iterable<String> getIdChain(String id, Function<String, String> fn) {
    List<String> visited = new ArrayList<>();
    for (String current = id; ; current = fn.apply(current)) {
        visited.add(current);
        // "root" terminates the chain, and the parent function is never applied to it.
        if (current.equals("root")) {
            return visited;
        }
    }
}
/**
 * Fixed set of locale IDs iterated over by the "*_regression" tests in this class (parent
 * lookup, maximization and deprecated tag replacement).
 */
// NOTE(review): the list appears to mix current IDs with legacy ones (e.g. "iw", "in",
// "sh_YU"), presumably so that deprecated tag replacement is exercised - confirm before editing.
private static final ImmutableSet<String> TEST_LOCALE_IDS = ImmutableSet.of(
"af", "af_NA", "af_ZA", "agq", "agq_CM", "ak", "ak_GH", "am", "am_ET", "ar", "ar_001",
"ar_AE", "ar_BH", "ar_DJ", "ar_DZ", "ar_EG", "ar_EH", "ar_ER", "ar_IL", "ar_IQ", "ar_JO",
"ar_KM", "ar_KW", "ar_LB", "ar_LY", "ar_MA", "ar_MR", "ar_OM", "ar_PS", "ar_QA", "ar_SA",
"ar_SD", "ar_SO", "ar_SS", "ar_SY", "ar_TD", "ar_TN", "ar_YE", "ars", "as", "as_IN",
"asa", "asa_TZ", "ast", "ast_ES", "az", "az_AZ", "az_Cyrl", "az_Cyrl_AZ", "az_Latn",
"az_Latn_AZ", "bas", "bas_CM", "be", "be_BY", "bem", "bem_ZM", "bez", "bez_TZ", "bg",
"bg_BG", "bm", "bm_ML", "bn", "bn_BD", "bn_IN", "bo", "bo_CN", "bo_IN", "br", "br_FR",
"brx", "brx_IN", "bs", "bs_Cyrl", "bs_Cyrl_BA", "bs_Latn", "bs_Latn_BA", "bs_BA", "ca",
"ca_AD", "ca_ES", "ca_FR", "ca_IT", "ccp", "ccp_BD", "ccp_IN", "ce", "ce_RU", "ceb",
"ceb_PH", "cgg", "cgg_UG", "chr", "chr_US", "ckb", "ckb_IQ", "ckb_IR", "cs", "cs_CZ", "cy",
"cy_GB", "da", "da_DK", "da_GL", "dav", "dav_KE", "de", "de_AT", "de_BE", "de_CH", "de_DE",
"de_IT", "de_LI", "de_LU", "dje", "dje_NE", "dsb", "dsb_DE", "dua", "dua_CM", "dyo",
"dyo_SN", "dz", "dz_BT", "ebu", "ebu_KE", "ee", "ee_GH", "ee_TG", "el", "el_CY", "el_GR",
"en", "en_001", "en_150", "en_AE", "en_AG", "en_AI", "en_AS", "en_AT", "en_AU", "en_BB",
"en_BE", "en_BI", "en_BM", "en_BS", "en_BW", "en_BZ", "en_CA", "en_CC", "en_CH", "en_CK",
"en_CM", "en_CX", "en_CY", "en_DE", "en_DG", "en_DK", "en_DM", "en_ER", "en_FI", "en_FJ",
"en_FK", "en_FM", "en_GB", "en_GD", "en_GG", "en_GH", "en_GI", "en_GM", "en_GU", "en_GY",
"en_HK", "en_IE", "en_IL", "en_IM", "en_IN", "en_IO", "en_JE", "en_JM", "en_KE", "en_KI",
"en_KN", "en_KY", "en_LC", "en_LR", "en_LS", "en_MG", "en_MH", "en_MO", "en_MP", "en_MS",
"en_MT", "en_MU", "en_MW", "en_MY", "en_NA", "en_NF", "en_NG", "en_NL", "en_NR", "en_NU",
"en_NZ", "en_PG", "en_PH", "en_PK", "en_PN", "en_PR", "en_PW", "en_RH", "en_RW", "en_SB",
"en_SC", "en_SD", "en_SE", "en_SG", "en_SH", "en_SI", "en_SL", "en_SS", "en_SX", "en_SZ",
"en_TC", "en_TK", "en_TO", "en_TT", "en_TV", "en_TZ", "en_UG", "en_UM", "en_US",
"en_US_POSIX", "en_VC", "en_VG", "en_VI", "en_VU", "en_WS", "en_ZA", "en_ZM", "en_ZW", "eo",
"eo_001", "es", "es_003", "es_419", "es_AR", "es_BO", "es_BR", "es_BZ", "es_CL", "es_CO",
"es_CR", "es_CU", "es_DO", "es_EA", "es_EC", "es_ES", "es_GQ", "es_GT", "es_HN", "es_IC",
"es_MX", "es_NI", "es_PA", "es_PE", "es_PH", "es_PR", "es_PY", "es_SV", "es_US", "es_UY",
"es_VE", "et", "et_EE", "eu", "eu_ES", "ewo", "ewo_CM", "fa", "fa_AF", "fa_IR", "ff",
"ff_CM", "ff_GN", "ff_Latn", "ff_Latn_BF", "ff_Latn_CM", "ff_Latn_GH", "ff_Latn_GM",
"ff_Latn_GN", "ff_Latn_GW", "ff_Latn_LR", "ff_Latn_MR", "ff_Latn_NE", "ff_Latn_NG",
"ff_Latn_SL", "ff_Latn_SN", "ff_MR", "ff_SN", "fi", "fi_FI", "fil", "fil_PH", "fo", "fo_DK",
"fo_FO", "fr", "fr_BE", "fr_BF", "fr_BI", "fr_BJ", "fr_BL", "fr_CA", "fr_CD", "fr_CF",
"fr_CG", "fr_CH", "fr_CI", "fr_CM", "fr_DJ", "fr_DZ", "fr_FR", "fr_GA", "fr_GF", "fr_GN",
"fr_GP", "fr_GQ", "fr_HT", "fr_KM", "fr_LU", "fr_MA", "fr_MC", "fr_MF", "fr_MG", "fr_ML",
"fr_MQ", "fr_MR", "fr_MU", "fr_NC", "fr_NE", "fr_PF", "fr_PM", "fr_RE", "fr_RW", "fr_SC",
"fr_SN", "fr_SY", "fr_TD", "fr_TG", "fr_TN", "fr_VU", "fr_WF", "fr_YT", "fur", "fur_IT",
"fy", "fy_NL", "ga", "ga_IE", "gd", "gd_GB", "gl", "gl_ES", "gsw", "gsw_CH", "gsw_FR",
"gsw_LI", "gu", "gu_IN", "guz", "guz_KE", "gv", "gv_IM", "ha", "ha_GH", "ha_NE", "ha_NG",
"haw", "haw_US", "he", "he_IL", "hi", "hi_IN", "hr", "hr_BA", "hr_HR", "hsb", "hsb_DE",
"hu", "hu_HU", "hy", "hy_AM", "ia", "ia_001", "id", "id_ID", "ig", "ig_NG", "ii", "ii_CN",
"in", "in_ID", "is", "is_IS", "it", "it_CH", "it_IT", "it_SM", "it_VA", "iw", "iw_IL", "ja",
"ja_JP", "jgo", "jgo_CM", "jmc", "jmc_TZ", "jv", "jv_ID", "ka", "ka_GE", "kab", "kab_DZ",
"kam", "kam_KE", "kde", "kde_TZ", "kea", "kea_CV", "khq", "khq_ML", "ki", "ki_KE", "kk",
"kk_KZ", "kkj", "kkj_CM", "kl", "kl_GL", "kln", "kln_KE", "km", "km_KH", "kn", "kn_IN",
"ko", "ko_KP", "ko_KR", "kok", "kok_IN", "ks", "ks_IN", "ksb", "ksb_TZ", "ksf", "ksf_CM",
"ksh", "ksh_DE", "ku", "ku_TR", "kw", "kw_GB", "ky", "ky_KG", "lag", "lag_TZ", "lb",
"lb_LU", "lg", "lg_UG", "lkt", "lkt_US", "ln", "ln_AO", "ln_CD", "ln_CF", "ln_CG", "lo",
"lo_LA", "lrc", "lrc_IQ", "lrc_IR", "lt", "lt_LT", "lu", "lu_CD", "luo", "luo_KE", "luy",
"luy_KE", "lv", "lv_LV", "mas", "mas_KE", "mas_TZ", "mer", "mer_KE", "mfe", "mfe_MU", "mg",
"mg_MG", "mgh", "mgh_MZ", "mgo", "mgo_CM", "mi", "mi_NZ", "mk", "mk_MK", "ml", "ml_IN",
"mn", "mn_MN", "mo", "mr", "mr_IN", "ms", "ms_BN", "ms_MY", "ms_SG", "mt", "mt_MT", "mua",
"mua_CM", "my", "my_MM", "mzn", "mzn_IR", "naq", "naq_NA", "nb", "nb_NO", "nb_SJ", "nd",
"nd_ZW", "nds", "nds_DE", "nds_NL", "ne", "ne_IN", "ne_NP", "nl", "nl_AW", "nl_BE", "nl_BQ",
"nl_CW", "nl_NL", "nl_SR", "nl_SX", "nmg", "nmg_CM", "nn", "nn_NO", "nnh", "nnh_CM", "no",
"no_NO", "nus", "nus_SS", "nyn", "nyn_UG", "om", "om_ET", "om_KE", "or", "or_IN", "os",
"os_GE", "os_RU", "pa", "pa_Arab", "pa_Arab_PK", "pa_Guru", "pa_Guru_IN", "pa_IN", "pa_PK",
"pl", "pl_PL", "ps", "ps_AF", "ps_PK", "pt", "pt_AO", "pt_BR", "pt_CH", "pt_CV", "pt_GQ",
"pt_GW", "pt_LU", "pt_MO", "pt_MZ", "pt_PT", "pt_ST", "pt_TL", "qu", "qu_BO", "qu_EC",
"qu_PE", "rm", "rm_CH", "rn", "rn_BI", "ro", "ro_MD", "ro_RO", "rof", "rof_TZ", "ru",
"ru_BY", "ru_KG", "ru_KZ", "ru_MD", "ru_RU", "ru_UA", "rw", "rw_RW", "rwk", "rwk_TZ", "sah",
"sah_RU", "saq", "saq_KE", "sbp", "sbp_TZ", "sd", "sd_PK", "se", "se_FI", "se_NO", "se_SE",
"seh", "seh_MZ", "ses", "ses_ML", "sg", "sg_CF", "sh", "sh_BA", "sh_CS", "sh_YU", "shi",
"shi_Latn", "shi_Latn_MA", "shi_Tfng", "shi_Tfng_MA", "shi_MA", "si", "si_LK", "sk",
"sk_SK", "sl", "sl_SI", "smn", "smn_FI", "sn", "sn_ZW", "so", "so_DJ", "so_ET", "so_KE",
"so_SO", "sq", "sq_AL", "sq_MK", "sq_XK", "sr", "sr_Cyrl", "sr_Cyrl_BA", "sr_Cyrl_ME",
"sr_Cyrl_RS", "sr_Cyrl_CS", "sr_Cyrl_XK", "sr_Cyrl_YU", "sr_Latn", "sr_Latn_BA",
"sr_Latn_ME", "sr_Latn_RS", "sr_Latn_CS", "sr_Latn_XK", "sr_Latn_YU", "sr_BA", "sr_ME",
"sr_RS", "sr_CS", "sr_YU", "sv", "sv_AX", "sv_FI", "sv_SE", "sw", "sw_CD", "sw_KE", "sw_TZ",
"sw_UG", "ta", "ta_IN", "ta_LK", "ta_MY", "ta_SG", "te", "te_IN", "teo", "teo_KE", "teo_UG",
"tg", "tg_TJ", "th", "th_TH", "ti", "ti_ER", "ti_ET", "tk", "tk_TM", "tl", "tl_PH", "to",
"to_TO", "tr", "tr_CY", "tr_TR", "tt", "tt_RU", "twq", "twq_NE", "tzm", "tzm_MA", "ug",
"ug_CN", "uk", "uk_UA", "ur", "ur_IN", "ur_PK", "uz", "uz_AF", "uz_Arab", "uz_Arab_AF",
"uz_Cyrl", "uz_Cyrl_UZ", "uz_Latn", "uz_Latn_UZ", "uz_UZ", "vai", "vai_Latn", "vai_Latn_LR",
"vai_LR", "vai_Vaii", "vai_Vaii_LR", "vi", "vi_VN", "vun", "vun_TZ", "wae", "wae_CH", "wo",
"wo_SN", "xh", "xh_ZA", "xog", "xog_UG", "yav", "yav_CM", "yi", "yi_001", "yo", "yo_BJ",
"yo_NG", "yue", "yue_Hans", "yue_Hans_CN", "yue_Hant", "yue_Hant_HK", "zgh", "zgh_MA", "zh",
"zh_Hans", "zh_Hans_CN", "zh_Hans_HK", "zh_Hans_MO", "zh_Hans_SG", "zh_Hant", "zh_Hant_HK",
"zh_Hant_MO", "zh_Hant_TW", "zh_CN", "zh_HK", "zh_MO", "zh_SG", "zh_TW", "zu", "zu_ZA");
/** Builds a supplemental-data {@code parentLocale} value mapping the given locales to a parent. */
private static CldrValue parentLocales(String parent, String... locales) {
    // The "locales" attribute is a single space-separated list.
    String localeList = Joiner.on(' ').join(locales);
    return supplementalData(
        "parentLocales/parentLocale[@parent=\"%s\"][@locales=\"%s\"]", parent, localeList);
}
/**
 * Builds a supplemental-data {@code calendarPreference} value whose ordering is the given
 * calendar and whose territories attribute is the space-joined territory list.
 */
private static CldrValue defaultCalendar(String calendar, String... territories) {
    String territoryList = Joiner.on(' ').join(territories);
    // Note the attribute order in the path: territories first, then ordering.
    return supplementalData(
        "calendarPreferenceData/calendarPreference[@territories=\"%s\"][@ordering=\"%s\"]",
        territoryList, calendar);
}
/** Builds a supplemental-data {@code likelySubtag} value with the given from/to attributes. */
private static CldrValue likelySubtag(String from, String to) {
    String pathTemplate = "likelySubtags/likelySubtag[@from=\"%s\"][@to=\"%s\"]";
    return supplementalData(pathTemplate, from, to);
}
/** Builds a supplemental-data {@code languageAlias} value with the given type and replacement. */
private static CldrValue languageAlias(String type, String replacement) {
    String pathTemplate = "metadata/alias/languageAlias[@type=\"%s\"][@replacement=\"%s\"]";
    return supplementalData(pathTemplate, type, replacement);
}
/** Builds a supplemental-data {@code scriptAlias} value with the given type and replacement. */
private static CldrValue scriptAlias(String type, String replacement) {
    String pathTemplate = "metadata/alias/scriptAlias[@type=\"%s\"][@replacement=\"%s\"]";
    return supplementalData(pathTemplate, type, replacement);
}
/** Builds a supplemental-data {@code territoryAlias} value with the given type and replacement. */
private static CldrValue territoryAlias(String type, String replacement) {
    String pathTemplate = "metadata/alias/territoryAlias[@type=\"%s\"][@replacement=\"%s\"]";
    return supplementalData(pathTemplate, type, replacement);
}
/**
 * Formats {@code path} (a {@code String.format} template relative to
 * {@code //supplementalData/}) with {@code args} and parses it as a value with an empty string
 * value.
 */
private static CldrValue supplementalData(String path, Object... args) {
    String fullPath = String.format("//supplementalData/" + path, args);
    return parseValue(fullPath, "");
}
/** Creates a {@code SupplementalData} instance backed only by the given values. */
private static SupplementalData fakeSupplementalData(CldrValue... values) {
    List<CldrValue> valueList = asList(values);
    return SupplementalData.create(CldrDataSupplier.forValues(valueList));
}
}

View file

@ -0,0 +1,538 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.regex;
import static com.google.common.truth.Truth.assertThat;
import static java.util.Arrays.asList;
import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows;
import static org.unicode.icu.tool.cldrtoicu.testing.ResultSubjectFactory.assertThat;
import java.util.List;
import javax.annotation.concurrent.Immutable;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.CldrValue;
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer;
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
import org.unicode.icu.tool.cldrtoicu.RbPath;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
/**
* Tests for the regex transformer class. Note that in most cases, the rules used here are taken
* directly from one of the config files, simply because it avoids having to invent valid paths
* for testing (and we still need "real" CLDR paths since the path parsing verifies attributes
* against the DTD metadata). Basing tests on real rules illustrates that all of these tests are
* asserting about relied-upon behaviour, however there is nothing inherently special about these
* paths.
*/
@RunWith(JUnit4.class)
public class RegexTransformerTest {
/** Rules with at most one captured group each yield exactly one result per input value. */
@Test
public void testSingleResults_singleCapture() {
    PathValueTransformer rules = transformer(
        "%A=[^\"']++",
        "%W=[\\w\\-]++",
        "//ldml/numbers/defaultNumberingSystem[@alt=\"(%A)\"] ; /NumberElements/default_$1",
        "//ldml/numbers/defaultNumberingSystem ; /NumberElements/default",
        "//ldml/numbers/otherNumberingSystems/(%W) ; /NumberElements/$1");

    // Path without any attribute: nothing is captured, the output path is fixed.
    ImmutableList<Result> plainDefault = rules.transform(
        CldrValue.parseValue("//ldml/numbers/defaultNumberingSystem", "foobar"));
    assertSingleResult(plainDefault, "NumberElements/default", "foobar");

    // The "alt" attribute value is captured and appended to the output path.
    ImmutableList<Result> altDefault = rules.transform(
        CldrValue.parseValue("//ldml/numbers/defaultNumberingSystem[@alt=\"foo\"]", "bar"));
    assertSingleResult(altDefault, "NumberElements/default_foo", "bar");

    // The captured element name becomes the last output path segment; note that the value
    // (which contains a space) is expected to stay a single value here.
    ImmutableList<Result> other = rules.transform(
        CldrValue.parseValue("//ldml/numbers/otherNumberingSystems/finance", "foo bar"));
    assertSingleResult(other, "NumberElements/finance", "foo bar");
}
/** A rule with several captured groups can use each of them in the output path. */
@Test
public void testSingleResults_multipleCapture() {
    PathValueTransformer rules = transformer(
        "%A=[^\"']++",
        "//ldml/characters"
            + "/parseLenients[@scope=\"(%A)\"][@level=\"(%A)\"]"
            + "/parseLenient[@sample=\"%A\"]"
            + " ; /parse/$1/$2");

    // The "sample" attribute is matched by the rule but not captured, so it never appears in
    // the output.
    ImmutableList<Result> lenientResults = rules.transform(
        CldrValue.parseValue(
            "//ldml/characters"
                + "/parseLenients[@scope=\"general\"][@level=\"lenient\"]"
                + "/parseLenient[@sample=\"ignored\"]",
            "foo"));
    assertSingleResult(lenientResults, "/parse/general/lenient", "foo");

    ImmutableList<Result> stricterResults = rules.transform(
        CldrValue.parseValue(
            "//ldml/characters"
                + "/parseLenients[@scope=\"number\"][@level=\"stricter\"]"
                + "/parseLenient[@sample=\"ignored\"]",
            "bar"));
    assertSingleResult(stricterResults, "/parse/number/stricter", "bar");
}
/** One input path followed by several output instructions yields one result per instruction. */
@Test
public void testMultipleResults() {
    PathValueTransformer rules = transformer(
        "%A=[^\"']++",
        "%W=[\\s\\w\\-/]++",
        "//supplementalData/numberingSystems"
            + "/numberingSystem[@type=\"numeric\"][@id=\"(%W)\"][@digits=\"(%A)\"]",
        " ; /numberingSystems/$1/algorithmic:int ; values=0",
        " ; /numberingSystems/$1/desc ; values=$2",
        " ; /numberingSystems/$1/radix:int ; values=10");
    CldrValue numeric = CldrValue.parseValue(
        "//supplementalData/numberingSystems"
            + "/numberingSystem[@type=\"numeric\"][@id=\"foo\"][@digits=\"bar\"]",
        "");

    ImmutableList<Result> results = rules.transform(numeric);

    // Expected key/value per result, in the order of the output instructions above.
    String[] expectedKeys = {
        "/numberingSystems/foo/algorithmic:int",
        "/numberingSystems/foo/desc",
        "/numberingSystems/foo/radix:int",
    };
    String[] expectedValues = {"0", "bar", "10"};
    assertThat(results).hasSize(3);
    for (int i = 0; i < 3; i++) {
        Result result = results.get(i);
        assertThat(result).hasKey(expectedKeys[i]);
        assertThat(result).hasValues(expectedValues[i]);
        assertThat(result).isGrouped(false);
    }
}
/**
 * Verifies that a captured argument containing spaces, when used unquoted in the output path,
 * produces one result per space-separated part, and that quoting an argument in the output
 * path prevents it from triggering the split.
 */
@Test
public void testImplicitArgumentSplitting() {
    PathValueTransformer transformer = transformer(
        "%A=[^\"']++",
        "%W=[\\s\\w\\-/]++",
        "//supplementalData/gender/personList[@type=\"(%W)\"][@locales=\"(%W)\"]"
            + " ; /genderList/$2 ; values=$1",
        "//supplementalData/windowsZones/mapTimezones"
            + "/mapZone[@type=\"(%A)\"][@other=\"(%A)\"][@territory=\"(%W)\"]"
            + " ; /mapTimezones/\"$2\"/$3 ; values=\"$1\"");
    // Implicit splitting is based on the first unquoted placeholder in the output path ($2 in
    // this case) and not the first captured group of the input path.
    CldrValue personList = CldrValue.parseValue(
        "//supplementalData/gender/personList[@type=\"neutral\"][@locales=\"xx yy zz\"]", "");
    ImmutableList<Result> results = transformer.transform(personList);
    assertThat(results).hasSize(3);
    assertThat(results.get(0)).hasKey("/genderList/xx");
    assertThat(results.get(0)).hasValues("neutral");
    assertThat(results.get(1)).hasKey("/genderList/yy");
    assertThat(results.get(1)).hasValues("neutral");
    assertThat(results.get(2)).hasKey("/genderList/zz");
    assertThat(results.get(2)).hasValues("neutral");
    // Quoting prevents the first captured argument with spaces from triggering multiple
    // results (it will trigger on the first un-quoted argument in the output path). This
    // quoting must appear in the output however since spaces are "structural" in paths in
    // ICU data files.
    CldrValue mapZone = CldrValue.parseValue(
        "//supplementalData/windowsZones/mapTimezones/mapZone"
            + "[@type=\"foo\"]"
            + "[@other=\"not split\"]"
            + "[@territory=\"XX YY ZZ\"]",
        "");
    results = transformer.transform(mapZone);
    assertThat(results).hasSize(3);
    // Each result's key and values are checked on the SAME index (previously the values of
    // results 0 and 1 were never verified because index 2 was checked three times).
    assertThat(results.get(0)).hasKey("/mapTimezones/\"not split\"/XX");
    assertThat(results.get(0)).hasValues("foo");
    assertThat(results.get(1)).hasKey("/mapTimezones/\"not split\"/YY");
    assertThat(results.get(1)).hasValues("foo");
    assertThat(results.get(2)).hasKey("/mapTimezones/\"not split\"/ZZ");
    assertThat(results.get(2)).hasValues("foo");
}
/**
 * Values produced by an explicit {@code values=} instruction are split on spaces unless the
 * placeholder is quoted in the instruction.
 */
@Test
public void testValueSplitting() {
    PathValueTransformer rules = transformer(
        "%A=[^\"']++",
        "%W=[\\s\\w\\-/]++",
        "//supplementalData/parentLocales/parentLocale[@parent=\"(%A)\"][@locales=\"(%A)\"]"
            + " ; /parentLocales/$1 ; values=$2",
        "//supplementalData/windowsZones/mapTimezones"
            + "/mapZone[@type=\"(%A)\"][@other=\"(%A)\"][@territory=\"(%W)\"]"
            + " ; /mapTimezones/\"$2\"/$3 ; values=\"$1\"");

    // Unquoted placeholder in the values instruction: the substituted text is split by space.
    ImmutableList<Result> splitResults = rules.transform(
        CldrValue.parseValue(
            "//supplementalData/parentLocales"
                + "/parentLocale[@parent=\"foo\"][@locales=\"value is split\"]",
            ""));
    assertSingleResult(splitResults, "/parentLocales/foo", "value", "is", "split");

    // Quoted placeholder in the values instruction: the substituted text stays one value.
    ImmutableList<Result> unsplitResults = rules.transform(
        CldrValue.parseValue(
            "//supplementalData/windowsZones/mapTimezones/mapZone"
                + "[@type=\"value is not split\"]"
                + "[@other=\"foo\"]"
                + "[@territory=\"XX\"]",
            ""));
    assertSingleResult(unsplitResults, "/mapTimezones/\"foo\"/XX", "value is not split");
}
/** Functions referenced as {@code &name(...)} in a values instruction are invoked on the arguments. */
@Test
public void testResultFunctionCalling() {
    // A two-argument function which swaps its arguments and joins them with a space. Joining
    // with a space is trivial, but it shows that a function's output is still subject to value
    // splitting unless quoted. In fact a common function (&ymd) is used to split
    // year/month/day strings using spaces exactly so they are treated as separate values.
    NamedFunction swap =
        NamedFunction.create("swap", 2, args -> args.get(1) + " " + args.get(0));
    // Note that the spaces around the function arguments in the rules below are ignored.
    List<String> rules = asList(
        "%A=[^\"']++",
        "%W=[\\s\\w\\-/]++",
        "//supplementalData/numberingSystems"
            + "/numberingSystem[@type=\"(%W)\"][@id=\"(%W)\"][@rules=\"(%A)\"]",
        " ; /numberingSystems/foo ; values=&swap( $1 , $2 ) $3",
        " ; /numberingSystems/bar ; values=\"&swap( $1, quux )\"",
        " ; /numberingSystems/baz ; values=\"&swap( $1-$2, $3{value} )\"");
    PathValueTransformer transformer = RegexTransformer.fromConfigLines(rules, swap);
    CldrValue input = CldrValue.parseValue(
        "//supplementalData/numberingSystems"
            + "/numberingSystem[@type=\"foo\"][@id=\"bar\"][@rules=\"baz\"]",
        "-VALUE");

    ImmutableList<Result> results = transformer.transform(input);

    assertThat(results).hasSize(3);
    // Unquoted: the swapped output is split, followed by the third argument.
    assertThat(results.get(0)).hasValues("bar", "foo", "baz");
    // Quoted: the function output remains a single value.
    assertThat(results.get(1)).hasValues("quux foo");
    assertThat(results.get(2)).hasValues("baz-VALUE foo-bar");
}
/** Documents how a double-quote inside a substituted value interferes with outer quoting. */
@Test
public void testResultFunctionCalling_edgeCases() {
    NamedFunction join =
        NamedFunction.create("join", 3, args -> args.get(0) + args.get(1) + args.get(2));
    List<String> rules = asList(
        "%A=[^\"']++",
        "%W=[\\s\\w\\-/]++",
        "//supplementalData/numberingSystems"
            + "/numberingSystem[@type=\"(%W)\"][@id=\"(%W)\"][@rules=\"(%A)\"]",
        " ; /numberingSystems/foo ; values=\"&join( {value} , $1 $2 $3, {value} )\"");
    PathValueTransformer transformer = RegexTransformer.fromConfigLines(rules, join);
    // This illustrates a fundamental problem with the way that quoting and splitting is
    // defined in this config language. Splitting is always done after value substitution,
    // which is just done as a single pass. Thus, if a value has a double-quote in it, it can
    // upset the quoting behaviour in odd ways. Here it prevents the outermost quoting from
    // working and results in multiple values where there should be one.
    //
    // To fix this, the implicit splitting should be replaced by a "split()" function and the
    // rules should be parsed into something approximating a proper expression AST.
    CldrValue valueWithQuote = CldrValue.parseValue(
        "//supplementalData/numberingSystems"
            + "/numberingSystem[@type=\"foo\"][@id=\"bar\"][@rules=\"baz\"]",
        "<< \" >>");

    ImmutableList<Result> results = transformer.transform(valueWithQuote);

    // If outer quoting worked, this would be a single value, not five.
    assertSingleResult(results, "/numberingSystems/foo", "<< ", ">>foo", "bar", "baz<<", " >>");
}
/** A variable defined as a CLDR path is resolved at transform time via the supplied function. */
@Test
public void testDynamicVars() {
    PathValueTransformer rules = transformer(
        "%W=[\\w\\-]++",
        "%D=//ldml/numbers/defaultNumberingSystem",
        "//ldml/numbers/currencyFormats[@numberSystem=\"%D\"]/currencySpacing/(%W)/(%W)",
        " ; /currencySpacing/$1/$2");
    // The path which the dynamic variable function is expected to be asked to resolve.
    CldrPath dynamicVarPath =
        CldrPath.parseDistinguishingPath("//ldml/numbers/defaultNumberingSystem");
    CldrValue spacing = CldrValue.parseValue(
        "//ldml/numbers/currencyFormats[@numberSystem=\"latn\"]"
            + "/currencySpacing/beforeCurrency/currencyMatch",
        "format");

    ImmutableList<Result> results = rules.transform(spacing, p -> {
        assertThat(p).isEqualTo(dynamicVarPath);
        return "latn";
    });

    assertSingleResult(results, "/currencySpacing/beforeCurrency/currencyMatch", "format");
}
/** Two rules sharing an output path each contribute their own fallback result for that path. */
@Test
public void testFallbacks_simple() {
    PathValueTransformer rules = transformer(
        "%W=[\\w\\-/]++",
        "//ldml/numbers/currencies/currency[@type=\"(%W)\"]/symbol"
            + " ; /Currencies/$1 ; fallback=$1",
        "//ldml/numbers/currencies/currency[@type=\"(%W)\"]/displayName"
            + " ; /Currencies/$1 ; fallback=$1");

    ImmutableList<Result> symbolResults = rules.transform(
        CldrValue.parseValue(
            "//ldml/numbers/currencies/currency[@type=\"Foo\"]/symbol", "symbol"));
    assertSingleResult(symbolResults, "Currencies/Foo", "symbol");

    ImmutableList<Result> nameResults = rules.transform(
        CldrValue.parseValue(
            "//ldml/numbers/currencies/currency[@type=\"Foo\"]/displayName", "name"));
    assertSingleResult(nameResults, "Currencies/Foo", "name");

    RbPath currencyPath = RbPath.of("Currencies", "Foo");
    ImmutableList<Result> fallbacks = rules.getFallbackResultsFor(currencyPath, p -> null);
    assertThat(fallbacks).hasSize(2);
    Result firstFallback = fallbacks.get(0);
    Result secondFallback = fallbacks.get(1);

    // Both fallbacks look like they are equal, but they didn't come from the same rule...
    assertThat(firstFallback).hasKey(currencyPath);
    assertThat(firstFallback).hasValues("Foo");
    assertThat(secondFallback).hasKey(currencyPath);
    assertThat(secondFallback).hasValues("Foo");

    // ... so they correspond to different matched results.
    Result symbol = symbolResults.get(0);
    Result name = nameResults.get(0);
    assertThat(firstFallback.isFallbackFor(symbol)).isTrue();
    assertThat(secondFallback.isFallbackFor(symbol)).isFalse();
    assertThat(firstFallback.isFallbackFor(name)).isFalse();
    assertThat(secondFallback.isFallbackFor(name)).isTrue();

    // And they are ordered by their appearance in the configuration file.
    assertThat(firstFallback).isLessThan(secondFallback);

    // BUT (and this is important) the fallback results are "equal". This is necessary for
    // other situations where results are generated from different rules but should be
    // considered "equal" for purposes of deduplication. Deduplication doesn't affect this
    // situation though (but it's worth being explicit in this test). This is all a bit subtle
    // and should be fixed properly at some point. See also "testBaseXpath()".
    assertThat(firstFallback).isEqualTo(secondFallback);
}
/** Fallback values may reorder and repeat the arguments captured from the output path. */
@Test
public void testFallbacks_multipleArgs() {
    // NOTE(review): "%A" is used in the path below but only "%W" is defined in this rule
    // set; this test never matches a CLDR path against the rule, so it may not matter -
    // confirm whether that is intentional.
    PathValueTransformer rules = transformer(
        "%W=[\\s\\w\\-/]++",
        "//supplementalData/calendarData"
            + "/calendar[@type=\"(%W)\"]/eras/era[@type=\"(%W)\"][@(start|end)=\"(%A)\"]",
        " ; /fake/$2/$4/$1/$3 ; fallback=$1 $2 $3 $4 $3 $2 $1");
    // Path elements are named after the $N index they correspond to in the output path, so
    // it's easy to see how reordering happens.
    RbPath fakePath = RbPath.of("fake", "two", "four", "one", "three");

    ImmutableList<Result> fallbacks = rules.getFallbackResultsFor(fakePath, p -> null);

    // This shows that the capturing of arguments done on the resource bundle path for the
    // fallback correctly reordered the arguments. Having this many reordered arguments in a
    // fallback is not something that really happens in the actual config files currently, but
    // it's complex logic and needs to be tested. Note also how captured arguments can appear
    // multiple times in the result.
    assertSingleResult(
        fallbacks, fakePath, "one", "two", "three", "four", "three", "two", "one");
}
/** Fallback values are split on spaces into separate values. */
@Test
public void testFallbacks_valueSplitting() {
    PathValueTransformer rules = transformer(
        "%A=[^\"']++",
        "//supplementalData/likelySubtags/likelySubtag[@from=\"(%A)\"][@to=\"(%A)\"]",
        " ; /fake/$1/$2 ; fallback=$1 and $2");
    RbPath fakePath = RbPath.of("fake", "Foo", "Bar");

    ImmutableList<Result> fallbackResults = rules.getFallbackResultsFor(fakePath, p -> null);

    assertSingleResult(fallbackResults, fakePath, "Foo", "and", "Bar");
}
/** A fallback which references an argument absent from the output path is rejected. */
@Test
public void testFallbacks_missingArgs() {
    // The output path only uses $1, but the fallback refers to $2.
    String[] badRules = {
        "%A=[^\"']++",
        "//supplementalData/likelySubtags/likelySubtag[@from=\"(%A)\"][@to=\"(%A)\"]",
        " ; /$1 ; fallback=$2",
    };

    IllegalStateException thrown =
        assertThrows(IllegalStateException.class, () -> transformer(badRules));

    // A bit brittle, but this message is important for debugging.
    assertThat(thrown).hasMessageThat()
        .contains("fallback values may only contain arguments from the resource bundle path");
    assertThat(thrown).hasMessageThat().contains("$2");
}
/** The {@code {value}} token is left as-is in fallback values. */
@Test
public void testFallbacks_noValueSubstitution() {
    PathValueTransformer rules = transformer(
        "%A=[^\"']++",
        "//supplementalData/likelySubtags/likelySubtag[@from=\"(%A)\"][@to=\"(%A)\"]",
        " ; /$1 ; fallback=$1-{value}");
    RbPath fooPath = RbPath.of("Foo");

    ImmutableList<Result> fallbackResults = rules.getFallbackResultsFor(fooPath, p -> null);

    // The {value} token is not substituted in a fallback because there is no value.
    // TODO: Make this into an error (since it's only ever going to happen by mistake)!
    assertSingleResult(fallbackResults, fooPath, "Foo-{value}");
}
/** Quotes around a placeholder in a fallback are kept literally in the resulting value. */
@Test
public void testFallbacks_noQuotingSupport() {
    PathValueTransformer rules = transformer(
        "%A=[^\"']++",
        "//supplementalData/likelySubtags/likelySubtag[@from=\"(%A)\"][@to=\"(%A)\"]",
        " ; /fake/$1 ; fallback=\"$1\"");
    RbPath fakePath = RbPath.of("fake", "Foo");

    ImmutableList<Result> fallbackResults = rules.getFallbackResultsFor(fakePath, p -> null);

    // Fallbacks could support quoting of placeholders, but to match legacy behaviour,
    // they don't yet. As it is you cannot prevent fallback values being split on spaces.
    assertSingleResult(fallbackResults, fakePath, "\"Foo\"");
}
/** Covers hidden labels ({@code <...>}) in output paths and quoting of timezone identifiers. */
@Test
public void testHiddenLabelsAndMetazones() {
    PathValueTransformer rules = transformer(
        "%A=[^\"']++",
        "%W=[\\s\\w\\-/]++",
        "//supplementalData/metaZones/metazoneInfo"
            + "/timezone[@type=\"(%W)\"]/usesMetazone[@mzone=\"(%W)\"]"
            + " ; /metazoneInfo/\"$1\"/<$2> ; values=$2",
        "//supplementalData/metaZones/metazoneInfo"
            + "/timezone[@type=\"(%W)\"]/usesMetazone[@to=\"(%A)\"][@mzone=\"(%W)\"]"
            + " ; /metazoneInfo/\"$1\"/<1970-01-01 00:00> ; values=$3 \"1970-01-01 00:00\" \"$2\"");

    ImmutableList<Result> paris = transformPath(
        rules,
        "//supplementalData/metaZones/metazoneInfo"
            + "/timezone[@type=\"Europe/Paris\"]/usesMetazone[@mzone=\"Europe_Central\"]");
    // The conversion from "Europe/Paris" to "Europe:Paris" is a built in special case when
    // quoting values with '/' in. It's only actually necessary for these timezone identifiers,
    // but the code is applied everywhere since that's easier. Ideally there'd be something
    // like the function calling mechanism to make this transformation explicit, but at the
    // moment, the output resource bundle paths have no way to control the transformation of
    // substituted arguments, so it has to be built in.
    assertSingleResult(
        paris, "/metazoneInfo/\"Europe:Paris\"/<Europe_Central>", "Europe_Central");

    ImmutableList<Result> london = transformPath(
        rules,
        "//supplementalData/metaZones/metazoneInfo"
            + "/timezone[@type=\"Europe/London\"]"
            + "/usesMetazone[@to=\"1971-10-31 02:00\"][@mzone=\"Europe_Central\"]");
    // This example demonstrates that things like ' ' or ':' (normally prohibited in resource
    // bundle path elements) are acceptable in hidden labels, since those will be stripped out
    // while writing the resulting data file. The date-time values are quoted in the rule to
    // ensure they are not split.
    assertSingleResult(
        london,
        "/metazoneInfo/\"Europe:London\"/<1970-01-01 00:00>",
        "Europe_Central", "1970-01-01 00:00", "1971-10-31 02:00");
}
/** Results from different rules are equal when they share the same {@code base_xpath}. */
@Test
public void testBaseXpath() {
    // In the real data, these rules define multiple results which reflect the actual
    // differences in the child elements, but the one tested here is only based on the
    // <territory> path prefix, which is the same for many child elements (which is all
    // that's ever actually transformed).
    //
    // So for a single path prefix you'll generate multiple identical results which need
    // to be de-duplicated, which can only happen if they are considered to have come
    // from the same source (since duplicate results happen all the time in general).
    //
    // This is what the base xpath does, it fakes a different source CLDR path which makes
    // the results "equal" (even though they came from different CLDR path sources).
    PathValueTransformer rules = transformer(
        "%W=[\\s\\w\\-/]++",
        "%N=[\\d\\.]++",
        "//supplementalData/territoryInfo"
            + "/territory[@type=\"(%W)\"][@gdp=\"(%N)\"][@literacyPercent=\"(%N)\"][@population=\"(%N)\"]"
            + "/languagePopulation[@type=\"(%W)\"][@populationPercent=\"(%N)\"]",
        " ; /territoryInfo/$1/territoryF:intvector"
            + " ; values=$2 $3 $4"
            + " ; base_xpath=//supplementalData/territoryInfo/territory[@type=\"$1\"]",
        // Same thing but with child element containing "writingPercent".
        "//supplementalData/territoryInfo"
            + "/territory[@type=\"(%W)\"][@gdp=\"(%N)\"][@literacyPercent=\"(%N)\"][@population=\"(%N)\"]"
            + "/languagePopulation[@type=\"(%W)\"][@writingPercent=\"(%N)\"][@populationPercent=\"(%N)\"]",
        " ; /territoryInfo/$1/territoryF:intvector"
            + " ; values=$2 $3 $4"
            + " ; base_xpath=//supplementalData/territoryInfo/territory[@type=\"$1\"]");

    // Both inputs share this <territory> prefix and differ only in the child element.
    String territoryPrefix =
        "//supplementalData/territoryInfo"
            + "/territory[@type=\"CI\"][@gdp=\"97160000000\"][@literacyPercent=\"57\"][@population=\"26260600\"]";
    ImmutableList<Result> withoutWriting = transformPath(
        rules,
        territoryPrefix + "/languagePopulation[@type=\"kfo\"][@populationPercent=\"0.3\"]");
    ImmutableList<Result> withWriting = transformPath(
        rules,
        territoryPrefix + "/languagePopulation[@type=\"sef\"][@writingPercent=\"5\"][@populationPercent=\"4\"]");

    assertSingleResult(
        withoutWriting, "/territoryInfo/CI/territoryF:intvector", "97160000000", "57", "26260600");
    assertSingleResult(
        withWriting, "/territoryInfo/CI/territoryF:intvector", "97160000000", "57", "26260600");
    // Even though they come from different rules, these results are treated as interchangeably
    // equal because the base path is the same. Without the base path this would not be equal.
    assertThat(withoutWriting).isEqualTo(withWriting);
}
/** The {@code group} instruction marks the results of a rule as grouped. */
@Test
public void testResultGrouping() {
    PathValueTransformer rules = transformer(
        "%W=[\\w\\-/]++",
        "//ldml/numbers/currencies/currency[@type=\"(%W)\"]/symbol ; /Currencies/$1",
        "//ldml/numbers/currencies/currency[@type=\"(%W)\"]/decimal ; /Currencies/$1 ; group");

    Result symbolResult = transformSingleResult(
        rules, "//ldml/numbers/currencies/currency[@type=\"USD\"]/symbol", "$");
    Result decimalResult = transformSingleResult(
        rules, "//ldml/numbers/currencies/currency[@type=\"USD\"]/decimal", ".");

    // Note that grouping is important for some data, but isn't very interesting at the basic
    // transformation level (it's just a bit). It's only interesting when the converter
    // combines multiple results together.
    assertThat(symbolResult).isGrouped(false);
    assertThat(decimalResult).isGrouped(true);
}
/** Creates a transformer from the given configuration lines (with no extra functions). */
private static PathValueTransformer transformer(String... configLines) {
    List<String> lines = asList(configLines);
    return RegexTransformer.fromConfigLines(lines);
}
/** Transforms the given CLDR path, paired with an empty string value. */
private static ImmutableList<Result> transformPath(
    PathValueTransformer transformer, String cldrPath) {
    CldrValue emptyValued = CldrValue.parseValue(cldrPath, "");
    return transformer.transform(emptyValued);
}
/** Transforms the given path/value pair, asserts there is exactly one result and returns it. */
private static Result transformSingleResult(
    PathValueTransformer transformer, String path, String value) {
    CldrValue cldrValue = CldrValue.parseValue(path, value);
    ImmutableList<Result> transformed = transformer.transform(cldrValue);
    assertThat(transformed).hasSize(1);
    return transformed.get(0);
}
/**
 * Asserts that {@code results} holds exactly one, ungrouped, result with the given key and
 * values.
 */
private static void assertSingleResult(List<Result> results, RbPath path, String... values) {
    assertThat(results).hasSize(1);
    Result only = results.get(0);
    assertThat(only).isGrouped(false);
    assertThat(only).hasKey(path);
    assertThat(only).hasValues(values);
}
/** As the {@code RbPath} overload, but with the expected key given as a string to be parsed. */
private static void assertSingleResult(List<Result> results, String path, String... values) {
    RbPath expectedKey = RbPath.parse(path);
    assertSingleResult(results, expectedKey, values);
}
}

View file

@ -0,0 +1,29 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.testing;
import static org.junit.Assert.fail;
/** Static assertion helpers (some of which can be removed if JUnit version is updated). */
public final class AssertUtils {
    /**
     * Functional interface acting as a lambda target for code which may throw {@code T}.
     *
     * @param <T> the type of throwable the code is declared to throw.
     */
    @FunctionalInterface
    public interface CheckedRunnable<T extends Throwable> {
        void run() throws T;
    }

    /**
     * Asserts that an exception is thrown by a given runnable.
     *
     * @param cls the expected type of the thrown exception.
     * @param fn the code expected to throw.
     * @return the thrown exception (an instance of {@code cls}), for further assertions.
     * @throws AssertionError if nothing was thrown, or if something that is not an instance of
     *     {@code cls} was thrown (in which case the unexpected throwable is attached as the
     *     cause, so its stack trace is not lost).
     */
    public static <T extends Throwable> T assertThrows(Class<T> cls, CheckedRunnable<T> fn) {
        try {
            fn.run();
        } catch (Throwable t) {
            if (cls.isInstance(t)) {
                return cls.cast(t);
            }
            // Keep the unexpected throwable as the cause; otherwise the only clue to what went
            // wrong would be its class name.
            throw new AssertionError(
                "expected " + cls.getName() + " but got " + t.getClass().getName(), t);
        }
        throw new AssertionError("expected " + cls.getName() + " but nothing was thrown");
    }

    private AssertUtils() {}
}

View file

@ -0,0 +1,33 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.testing;
import static com.google.common.base.Preconditions.checkArgument;
import com.google.common.truth.FailureMetadata;
import com.google.common.truth.Subject;
import org.unicode.icu.tool.cldrtoicu.RbPath;
/** Truth subject for making assertions about {@link RbPath} instances. */
public final class RbPathSubject extends Subject {
    /** Returns a factory for this subject, for use when chaining from other subjects. */
    public static Subject.Factory<RbPathSubject, RbPath> rbPaths() {
        return (metadata, path) -> new RbPathSubject(metadata, path);
    }

    // The path under test (kept locally so assertions can call RbPath methods on it).
    private final RbPath actual;

    protected RbPathSubject(FailureMetadata metadata, RbPath actual) {
        super(metadata, actual);
        this.actual = actual;
    }

    /** Asserts the value of the path, as segments (use this if a segment can contain '/'). */
    public final void hasSegments(String... segments) {
        RbPath expected = RbPath.of(segments);
        check("<segments>").that(actual).isEqualTo(expected);
    }

    /**
     * Asserts that the path has the given length.
     *
     * @throws IllegalArgumentException if {@code n} is negative.
     */
    public final void hasLength(int n) {
        // A negative expectation is a mistake in the test itself, not an assertion failure.
        checkArgument(n >= 0, "invalid path length: %s", n);
        check("length()").that(actual.length()).isEqualTo(n);
    }
}

View file

@ -0,0 +1,22 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.testing;
import com.google.common.truth.FailureMetadata;
import com.google.common.truth.Subject;
import com.google.common.truth.Truth;
import org.unicode.icu.tool.cldrtoicu.RbPath;
/**
 * Truth subject factory for asserting about resource bundle paths (makes tests much more
 * readable). Tests are expected to statically import {@link #assertThat(RbPath)}.
 */
public final class RbPathSubjectFactory implements Subject.Factory<RbPathSubject, RbPath> {
    RbPathSubjectFactory() {}

    /** Starts an assertion chain about the given resource bundle path. */
    public static RbPathSubject assertThat(RbPath result) {
        RbPathSubjectFactory factory = new RbPathSubjectFactory();
        return Truth.assertAbout(factory).that(result);
    }

    /** Creates the subject wrapping the given path. */
    @Override
    public RbPathSubject createSubject(FailureMetadata failureMetadata, RbPath that) {
        return new RbPathSubject(failureMetadata, that);
    }
}

View file

@ -0,0 +1,53 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.testing;
import static com.google.common.base.Preconditions.checkNotNull;
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
import org.unicode.icu.tool.cldrtoicu.RbPath;
import com.google.common.truth.ComparableSubject;
import com.google.common.truth.FailureMetadata;
import com.google.common.truth.IterableSubject;
import com.google.common.truth.Subject;
/** Truth subject for making assertions about a transformation {@link Result}. */
public final class ResultSubject extends ComparableSubject<Result> {
    /** Returns a factory for this subject, for use when chaining from other subjects. */
    public static Subject.Factory<ResultSubject, Result> results() {
        return ResultSubject::new;
    }

    private final Result actual;

    /**
     * @throws NullPointerException if {@code result} is null (this subject does not support
     *     assertions about null results).
     */
    protected ResultSubject(FailureMetadata metadata, Result result) {
        super(metadata, checkNotNull(result));
        this.actual = result;
    }

    /** Asserts whether the result is grouped or not. */
    public final void isGrouped(boolean grouped) {
        // The check passes when the values are equal, so no separate guard is needed.
        check("isGrouped()").that(actual.isGrouped()).isEqualTo(grouped);
    }

    /** Returns a subject for making further assertions about the values of the result. */
    public final IterableSubject hasValueListThat() {
        return check("getValues()").that(actual.getValues());
    }

    /** Asserts that the result has exactly the given values, in the given order. */
    public final void hasValues(String... values) {
        // Without inOrder() a result with correct but reordered values would pass.
        hasValueListThat().containsExactlyElementsIn(values).inOrder();
    }

    /** Returns a subject for making further assertions about the key of the result. */
    public final RbPathSubject hasKeyThat() {
        return check("getKey()").about(RbPathSubject.rbPaths()).that(actual.getKey());
    }

    /** Asserts that the key of the result is equal to the given path. */
    public final void hasKey(RbPath path) {
        hasKeyThat().isEqualTo(path);
    }

    /** Asserts that the key of the result is equal to the path parsed from the given string. */
    public final void hasKey(String path) {
        hasKey(RbPath.parse(path));
    }
}

View file

@ -0,0 +1,22 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.testing;
import com.google.common.truth.FailureMetadata;
import com.google.common.truth.Subject;
import com.google.common.truth.Truth;
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
/**
 * Truth subject factory for asserting about transformation results (makes tests much more
 * readable). Tests are expected to statically import {@link #assertThat(Result)}.
 */
public final class ResultSubjectFactory implements Subject.Factory<ResultSubject, Result> {
    /** Starts an assertion chain about the given transformation result. */
    public static ResultSubject assertThat(Result result) {
        return Truth.assertAbout(new ResultSubjectFactory()).that(result);
    }

    /** Creates the subject wrapping the given result. */
    @Override
    public ResultSubject createSubject(FailureMetadata failureMetadata, Result that) {
        return new ResultSubject(failureMetadata, that);
    }

    // Instances are only created via assertThat(), so the class is final and cannot be
    // constructed or subclassed from outside.
    private ResultSubjectFactory() {}
}