Mirror of https://github.com/unicode-org/icu.git (synced 2025-04-05 21:45:37 +00:00)
parent 97516f58b1
commit 2528d0bec1
46 changed files with 8684 additions and 0 deletions
7 tools/cldr/cldr-to-icu/.gitignore vendored Normal file
@@ -0,0 +1,7 @@
# Exclude the Maven local repository but keep the lib directory and the top-level readme.
/lib/**
!/lib/README.txt

# Ignore the default Maven target directory.
/target
55 tools/cldr/cldr-to-icu/README.txt Normal file
@@ -0,0 +1,55 @@
*********************************************************************
*** © 2019 and later: Unicode, Inc. and others.                   ***
*** License & terms of use: http://www.unicode.org/copyright.html ***
*********************************************************************

Basic instructions for running the LdmlConverter via Maven
==========================================================

Note that these instructions do not currently support configuration of the converter for things
such as limiting the set of files produced. That is supported in code and could be easily added
to the binary, or encapsulated via an Ant task, but currently it is not directly supported.
See the IcuConverterConfig class for the API by which this can be supported.


Important directories
---------------------

<CLDR_DIR>  = The root directory of the CLDR release.

<ICU_DIR>   = The root directory of the ICU release (probably a parent directory of where
              this README file is located). This is an optional property and defaults to
              the parent directory of the release from which it is run.

<DTD_CACHE> = The temporary cache directory in which DTD files are downloaded (this is the
              same directory as would be used when running tools from the CLDR project).
              Note that the need to specify this directory is scheduled to be removed after
              ICU release 65.

<OUT_DIR>   = The output directory into which ICU data files should be written.


Generating all ICU data
-----------------------

$ mvn exec:java \
    -DCLDR_DIR='<CLDR_DIR>' \
    -DCLDR_DTD_CACHE='<DTD_CACHE>' \
    -Dexec.args='<OUT_DIR>'


Running unit tests
------------------

$ mvn test \
    -DCLDR_DIR='<CLDR_DIR>' \
    -DCLDR_DTD_CACHE='<DTD_CACHE>'


Importing and running from an IDE
---------------------------------

This project should be easy to import into an IDE which supports Maven development, such
as IntelliJ or Eclipse. It uses a local Maven repository directory for the unpublished
CLDR libraries (which are included in the project), but otherwise gets all dependencies
via Maven's public repositories.
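For reference, the in-code configuration mentioned above can be sketched with the
IcuConverterConfig builder added in this commit. The class name ConvertExample and the directory
values are placeholders, and passing the resulting config to LdmlConverter is not shown here:

    import java.nio.file.Paths;
    import org.unicode.cldr.api.CldrDraftStatus;
    import org.unicode.icu.tool.cldrtoicu.IcuConverterConfig;
    import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
    import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig;

    public final class ConvertExample {
        public static void main(String[] args) {
            LdmlConverterConfig config = IcuConverterConfig.builder()
                .setCldrDir(Paths.get("<CLDR_DIR>"))                   // CLDR release root
                .setOutputDir(Paths.get("<OUT_DIR>"))                  // where ICU data files go
                .setSpecialsDir(Paths.get("<ICU_DIR>/icu4c/source/data/xml"))
                .setOutputTypes(OutputType.ALL)                        // or a subset, to limit output
                .setMinimalDraftStatus(CldrDraftStatus.CONTRIBUTED)    // the default
                .build();
            // The config would then be handed to the LdmlConverter entry point (not shown here).
        }
    }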
61 tools/cldr/cldr-to-icu/lib/README.txt Normal file
@@ -0,0 +1,61 @@
*********************************************************************
*** © 2019 and later: Unicode, Inc. and others.                   ***
*** License & terms of use: http://www.unicode.org/copyright.html ***
*********************************************************************

What is this directory and why is it empty?
-------------------------------------------

This is the root of a local Maven repository which needs to be populated before the
code in this project can be executed.

To do this, you need to have a local copy of the CLDR project configured on your
computer, and be able to build the API jar file and copy an existing utility
jar file. In the examples below it is assumed that <CLDR_ROOT> references this CLDR
release.


Regenerating the CLDR API jar
-----------------------------

To regenerate the CLDR API jar you need to build the "jar" target using the Ant
build.xml file in the "tools/java" directory of the CLDR project:

$ cd <CLDR_ROOT>/tools/java
$ ant clean jar

This should result in the cldr.jar file being built into that directory, which can then
be installed as a Maven dependency as described below.


Updating local Maven repository
-------------------------------

To update the local Maven repository (e.g. to install the CLDR jar), run the following
from this directory (lib/):

$ mvn install:install-file \
    -DgroupId=org.unicode.cldr \
    -DartifactId=cldr-api \
    -Dversion=0.1-SNAPSHOT \
    -Dpackaging=jar \
    -DgeneratePom=true \
    -DlocalRepositoryPath=. \
    -Dfile=<CLDR_ROOT>/tools/java/cldr.jar

And also (for the utility jar):

$ mvn install:install-file \
    -DgroupId=com.ibm.icu \
    -DartifactId=icu-utilities \
    -Dversion=0.1-SNAPSHOT \
    -Dpackaging=jar \
    -DgeneratePom=true \
    -DlocalRepositoryPath=. \
    -Dfile=<CLDR_ROOT>/tools/java/libs/utilities.jar

And if you have updated one of these libraries, also run:

$ mvn dependency:purge-local-repository -DsnapshotsOnly=true

If you choose to update the version number, then remember to update the root pom.xml.
83 tools/cldr/cldr-to-icu/pom.xml Normal file
@@ -0,0 +1,83 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- © 2019 and later: Unicode, Inc. and others.
     License & terms of use: http://www.unicode.org/copyright.html
     See README.txt for instructions on updating the local repository.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>org.unicode.icu</groupId>
  <artifactId>cldr-to-icu</artifactId>
  <version>1.0-SNAPSHOT</version>
  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.5.1</version>
        <configuration>
          <source>8</source>
          <target>8</target>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.codehaus.mojo</groupId>
        <artifactId>exec-maven-plugin</artifactId>
        <configuration>
          <mainClass>org.unicode.icu.tool.cldrtoicu.LdmlConverter</mainClass>
          <systemProperties>
            <property>
              <key>ICU_DIR</key>
              <value>${project.basedir}/../../..</value>
            </property>
          </systemProperties>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <!-- This is where the snapshots of the CLDR API and additional auxiliary jars are held. -->
  <repositories>
    <repository>
      <id>local-maven-repo</id>
      <url>file:///${project.basedir}/lib</url>
    </repository>
  </repositories>

  <dependencies>
    <dependency>
      <groupId>org.unicode.cldr</groupId>
      <artifactId>cldr-api</artifactId>
      <version>0.1-SNAPSHOT</version>
    </dependency>
    <dependency>
      <groupId>com.ibm.icu</groupId>
      <artifactId>icu-utilities</artifactId>
      <version>0.1-SNAPSHOT</version>
    </dependency>
    <dependency>
      <groupId>com.ibm.icu</groupId>
      <artifactId>icu4j</artifactId>
      <version>64.2</version>
    </dependency>
    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
      <version>27.1-jre</version>
    </dependency>
    <dependency>
      <groupId>com.google.truth</groupId>
      <artifactId>truth</artifactId>
      <version>1.0</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>com.google.truth.extensions</groupId>
      <artifactId>truth-java8-extension</artifactId>
      <version>1.0</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>
@@ -0,0 +1,381 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Arrays;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import org.unicode.cldr.api.CldrDraftStatus;
|
||||
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import com.google.common.collect.ImmutableSet;
|
||||
import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
|
||||
|
||||
/**
|
||||
* The converter config intended to generate the standard ICU data files. This used to be something
|
||||
* that was configured by text files such as "icu-locale-deprecates.xml" and "icu-config.
|
||||
*/
|
||||
public final class IcuConverterConfig implements LdmlConverterConfig {
|
||||
|
||||
private static final Optional<Path> DEFAULT_CLDR_DIR =
|
||||
Optional.ofNullable(System.getProperty("CLDR_DIR", null))
|
||||
.map(d -> Paths.get(d).toAbsolutePath());
|
||||
|
||||
private static final Optional<Path> DEFAULT_ICU_DIR =
|
||||
Optional.ofNullable(System.getProperty("ICU_DIR", null))
|
||||
.map(d -> Paths.get(d).toAbsolutePath());
|
||||
|
||||
/** The builder with which to specify configuration for the {@link LdmlConverter}. */
|
||||
public static final class Builder {
|
||||
private Path cldrDir = DEFAULT_CLDR_DIR.orElse(null);
|
||||
private Path outputDir =
|
||||
DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data")).orElse(null);
|
||||
private Path specialsDir =
|
||||
DEFAULT_ICU_DIR.map(d -> d.resolve("icu4c/source/data/xml")).orElse(null);
|
||||
private ImmutableSet<OutputType> outputTypes = OutputType.ALL;
|
||||
private CldrDraftStatus minimalDraftStatus = CldrDraftStatus.CONTRIBUTED;
|
||||
private boolean emitReport = false;
|
||||
|
||||
/**
|
||||
* Sets the CLDR base directory from which to load all CLDR data. This is optional if the
|
||||
* {@code CLDR_DIR} system property is set, which will be used instead.
|
||||
*/
|
||||
public Builder setCldrDir(Path cldrDir) {
|
||||
this.cldrDir = checkNotNull(cldrDir.toAbsolutePath());
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the output directory in which the ICU data directories and files will go. This is
|
||||
* optional if the {@code ICU_DIR} system property is set, which will be used to generate
|
||||
* the path instead (i.e. {@code "icu4c/source/data"} inside the ICU release directory).
|
||||
*/
|
||||
public Builder setOutputDir(Path outputDir) {
|
||||
this.outputDir = checkNotNull(outputDir);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the "specials" directory containing additional ICU specific data to be processed.
|
||||
* This is optional if the {@code ICU_DIR} system property is set, which will be used to
|
||||
* generate the path instead (i.e. {@code "icu4c/source/data/xml"} inside the ICU release
|
||||
* directory).
|
||||
*/
|
||||
public Builder setSpecialsDir(Path specialsDir) {
|
||||
this.specialsDir = checkNotNull(specialsDir);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the output types which will be converted. This is optional and defaults to {@link
|
||||
* OutputType#ALL}.
|
||||
*/
|
||||
public Builder setOutputTypes(Iterable<OutputType> types) {
|
||||
this.outputTypes = ImmutableSet.copyOf(types);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the minimum draft status for CLDR data to be converted (paths below this status are
|
||||
* ignored during conversion). This is optional and defaults to {@link
|
||||
* CldrDraftStatus#CONTRIBUTED}.
|
||||
*/
|
||||
public Builder setMinimalDraftStatus(CldrDraftStatus minimalDraftStatus) {
|
||||
this.minimalDraftStatus = checkNotNull(minimalDraftStatus);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setEmitReport(boolean emitReport) {
|
||||
this.emitReport = emitReport;
|
||||
return this;
|
||||
}
|
||||
|
||||
/** Returns a converter config from the current builder state. */
|
||||
public LdmlConverterConfig build() {
|
||||
return new IcuConverterConfig(this);
|
||||
}
|
||||
}
|
||||
|
||||
private final Path cldrDir;
|
||||
private final Path outputDir;
|
||||
private final Path specialsDir;
|
||||
private final ImmutableSet<OutputType> outputTypes;
|
||||
private final CldrDraftStatus minimalDraftStatus;
|
||||
private final boolean emitReport;
|
||||
|
||||
private IcuConverterConfig(Builder builder) {
|
||||
this.cldrDir = checkNotNull(builder.cldrDir,
|
||||
"must set a CLDR directory, or the CLDR_DIR system property");
|
||||
if (DEFAULT_CLDR_DIR.isPresent() && !this.cldrDir.equals(DEFAULT_CLDR_DIR.get())) {
|
||||
System.err.format(
|
||||
"Warning: Specified CLDR base directory does not appear to match the"
|
||||
+ " directory inferred by the 'CLDR_DIR' system property.\n"
|
||||
+ "Specified: %s\n"
|
||||
+ "Inferred: %s\n",
|
||||
this.cldrDir, DEFAULT_CLDR_DIR.get());
|
||||
}
|
||||
this.outputDir = checkNotNull(builder.outputDir);
|
||||
checkArgument(!Files.isRegularFile(outputDir),
    "specified output directory is not a directory: %s", outputDir);
|
||||
this.specialsDir = checkNotNull(builder.specialsDir,
|
||||
"must specify a 'specials' XML directory");
|
||||
checkArgument(Files.isDirectory(specialsDir),
|
||||
"specified specials directory does not exist: %s", specialsDir);
|
||||
this.outputTypes = builder.outputTypes;
|
||||
checkArgument(!this.outputTypes.isEmpty(),
|
||||
"must specify at least one output type to be generated (possible values are: %s)",
|
||||
Arrays.asList(OutputType.values()));
|
||||
this.minimalDraftStatus = builder.minimalDraftStatus;
|
||||
this.emitReport = builder.emitReport;
|
||||
}
|
||||
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
@Override public Path getCldrDirectory() {
|
||||
return cldrDir;
|
||||
}
|
||||
|
||||
@Override public Path getOutputDir() {
|
||||
return outputDir;
|
||||
}
|
||||
|
||||
@Override public Set<OutputType> getOutputTypes() {
|
||||
return outputTypes;
|
||||
}
|
||||
|
||||
@Override public CldrDraftStatus getMinimumDraftStatus() {
|
||||
return minimalDraftStatus;
|
||||
}
|
||||
|
||||
@Override public Path getSpecialsDir() {
|
||||
return specialsDir;
|
||||
}
|
||||
|
||||
@Override public boolean emitReport() {
|
||||
return emitReport;
|
||||
}
|
||||
|
||||
// Currently hard-coded "hacks" which could be encoded via the builder if wanted.
|
||||
|
||||
@Override public Map<String, String> getForcedAliases(IcuLocaleDir dir) {
|
||||
switch (dir) {
|
||||
case COLL:
|
||||
return ImmutableMap.<String, String>builder()
|
||||
// It is not at all clear why this is being done (we expect "sr_Latn_ME" normally).
|
||||
// TODO: Find out and document this properly.
|
||||
.put("sr_ME", "sr_Cyrl_ME")
|
||||
|
||||
// This appears to be a hack to avoid needing to copy and maintain the same "zh"
|
||||
// data for "yue". The files for "yue" in this directory should be empty otherwise.
|
||||
//
|
||||
// The maximized version of "yue_Hans" is "yue_Hans_CN" (vs "zh_Hans_CN"), and for
|
||||
// "yue" it's "yue_Hant_HK" (vs "zh_Hant_HK"), so the aliases are effectively just
|
||||
// rewriting the base language.
|
||||
.put("yue_Hans", "zh_Hans")
|
||||
.put("yue", "zh_Hant")
|
||||
.build();
|
||||
case RBNF:
|
||||
// It is not at all clear why this is being done. It's certainly not exactly the same
|
||||
// as above, since (a) the alias is reversed (b) "zh_Hant" does exist, with different
|
||||
// data than "yue", so this alias is not just rewriting the base language.
|
||||
// TODO: Find out and document this properly.
|
||||
return ImmutableMap.of("zh_Hant_HK", "yue");
|
||||
default:
|
||||
return ImmutableMap.of();
|
||||
}
|
||||
}
|
||||
|
||||
// This set of locale files in each directory denotes the supported/available locales for that
|
||||
// API. In most cases, it's the same set, but a few directories support only a subset of IDs.
|
||||
@Override public ImmutableSet<String> getTargetLocaleIds(IcuLocaleDir dir) {
|
||||
switch (dir) {
|
||||
case COLL:
|
||||
return COLL_LOCALE_IDS;
|
||||
case BRKITR:
|
||||
return BRKITR_LOCALE_IDS;
|
||||
case RBNF:
|
||||
return RBNF_LOCALE_IDS;
|
||||
default:
|
||||
return ICU_LOCALE_IDS;
|
||||
}
|
||||
}
|
||||
|
||||
// The primary set of locale IDs to be generated. Other, directory specific, sets should be
|
||||
// subsets of this. Some of these IDs are aliases, so XML files may not exist for all of them.
|
||||
//
|
||||
// This was further modified (in order to better match the set of generated ICU files) by:
|
||||
// * Removing "es_003" (which just seems to be ignored in current code)
|
||||
// * Adding: "en_NH", "sr_XK", "yue_CN", "yue_HK" (deprecated locale IDs in the manual config)
|
||||
// * Adding: "no_NO_NY" (a not even structurally valid ID that exists for very legacy reasons)
|
||||
private static final ImmutableSet<String> ICU_LOCALE_IDS = ImmutableSet.of(
|
||||
"root",
|
||||
// A
|
||||
"af", "af_NA", "af_ZA", "agq", "agq_CM", "ak", "ak_GH", "am", "am_ET", "ar", "ar_001",
|
||||
"ar_AE", "ar_BH", "ar_DJ", "ar_DZ", "ar_EG", "ar_EH", "ar_ER", "ar_IL", "ar_IQ",
|
||||
"ar_JO", "ar_KM", "ar_KW", "ar_LB", "ar_LY", "ar_MA", "ar_MR", "ar_OM", "ar_PS",
|
||||
"ar_QA", "ar_SA", "ar_SD", "ar_SO", "ar_SS", "ar_SY", "ar_TD", "ar_TN", "ar_YE", "ars",
|
||||
"as", "as_IN", "asa", "asa_TZ", "ast", "ast_ES", "az", "az_AZ", "az_Cyrl", "az_Cyrl_AZ",
|
||||
"az_Latn", "az_Latn_AZ",
|
||||
// B
|
||||
"bas", "bas_CM", "be", "be_BY", "bem", "bem_ZM", "bez", "bez_TZ", "bg", "bg_BG", "bm",
|
||||
"bm_ML", "bn", "bn_BD", "bn_IN", "bo", "bo_CN", "bo_IN", "br", "br_FR", "brx", "brx_IN",
|
||||
"bs", "bs_Cyrl", "bs_Cyrl_BA", "bs_Latn", "bs_Latn_BA", "bs_BA",
|
||||
// C
|
||||
"ca", "ca_AD", "ca_ES", "ca_FR", "ca_IT", "ccp", "ccp_BD", "ccp_IN", "ce", "ce_RU",
|
||||
"ceb", "ceb_PH", "cgg", "cgg_UG", "chr", "chr_US", "ckb", "ckb_IQ", "ckb_IR", "cs",
|
||||
"cs_CZ", "cy", "cy_GB",
|
||||
// D
|
||||
"da", "da_DK", "da_GL", "dav", "dav_KE", "de", "de_AT", "de_BE", "de_CH", "de_DE",
|
||||
"de_IT", "de_LI", "de_LU", "dje", "dje_NE", "dsb", "dsb_DE", "dua", "dua_CM", "dyo",
|
||||
"dyo_SN", "dz", "dz_BT",
|
||||
// E
|
||||
"ebu", "ebu_KE", "ee", "ee_GH", "ee_TG", "el", "el_CY", "el_GR", "en", "en_001",
|
||||
"en_150", "en_AE", "en_AG", "en_AI", "en_AS", "en_AT", "en_AU", "en_BB", "en_BE",
|
||||
"en_BI", "en_BM", "en_BS", "en_BW", "en_BZ", "en_CA", "en_CC", "en_CH", "en_CK",
|
||||
"en_CM", "en_CX", "en_CY", "en_DE", "en_DG", "en_DK", "en_DM", "en_ER", "en_FI",
|
||||
"en_FJ", "en_FK", "en_FM", "en_GB", "en_GD", "en_GG", "en_GH", "en_GI", "en_GM",
|
||||
"en_GU", "en_GY", "en_HK", "en_IE", "en_IL", "en_IM", "en_IN", "en_IO", "en_JE",
|
||||
"en_JM", "en_KE", "en_KI", "en_KN", "en_KY", "en_LC", "en_LR", "en_LS", "en_MG",
|
||||
"en_MH", "en_MO", "en_MP", "en_MS", "en_MT", "en_MU", "en_MW", "en_MY", "en_NA",
|
||||
"en_NF", "en_NG", "en_NH", "en_NL", "en_NR", "en_NU", "en_NZ", "en_PG", "en_PH",
|
||||
"en_PK", "en_PN", "en_PR", "en_PW", "en_RH", "en_RW", "en_SB", "en_SC", "en_SD",
|
||||
"en_SE", "en_SG", "en_SH", "en_SI", "en_SL", "en_SS", "en_SX", "en_SZ", "en_TC",
|
||||
"en_TK", "en_TO", "en_TT", "en_TV", "en_TZ", "en_UG", "en_UM", "en_US", "en_US_POSIX",
|
||||
"en_VC", "en_VG", "en_VI", "en_VU", "en_WS", "en_ZA", "en_ZM", "en_ZW", "eo",
|
||||
"eo_001", "es", "es_419", "es_AR", "es_BO", "es_BR", "es_BZ", "es_CL", "es_CO",
|
||||
"es_CR", "es_CU", "es_DO", "es_EA", "es_EC", "es_ES", "es_GQ", "es_GT", "es_HN",
|
||||
"es_IC", "es_MX", "es_NI", "es_PA", "es_PE", "es_PH", "es_PR", "es_PY", "es_SV",
|
||||
"es_US", "es_UY", "es_VE", "et", "et_EE", "eu", "eu_ES", "ewo", "ewo_CM",
|
||||
// F
|
||||
"fa", "fa_AF", "fa_IR", "ff", "ff_CM", "ff_GN", "ff_Latn", "ff_Latn_BF", "ff_Latn_CM",
|
||||
"ff_Latn_GH", "ff_Latn_GM", "ff_Latn_GN", "ff_Latn_GW", "ff_Latn_LR", "ff_Latn_MR",
|
||||
"ff_Latn_NE", "ff_Latn_NG", "ff_Latn_SL", "ff_Latn_SN", "ff_MR", "ff_SN", "fi",
|
||||
"fi_FI", "fil", "fil_PH", "fo", "fo_DK", "fo_FO", "fr", "fr_BE", "fr_BF", "fr_BI",
|
||||
"fr_BJ", "fr_BL", "fr_CA", "fr_CD", "fr_CF", "fr_CG", "fr_CH", "fr_CI", "fr_CM",
|
||||
"fr_DJ", "fr_DZ", "fr_FR", "fr_GA", "fr_GF", "fr_GN", "fr_GP", "fr_GQ", "fr_HT",
|
||||
"fr_KM", "fr_LU", "fr_MA", "fr_MC", "fr_MF", "fr_MG", "fr_ML", "fr_MQ", "fr_MR",
|
||||
"fr_MU", "fr_NC", "fr_NE", "fr_PF", "fr_PM", "fr_RE", "fr_RW", "fr_SC", "fr_SN",
|
||||
"fr_SY", "fr_TD", "fr_TG", "fr_TN", "fr_VU", "fr_WF", "fr_YT", "fur", "fur_IT",
|
||||
"fy", "fy_NL",
|
||||
// G
|
||||
"ga", "ga_IE", "gd", "gd_GB", "gl", "gl_ES", "gsw", "gsw_CH", "gsw_FR", "gsw_LI",
|
||||
"gu", "gu_IN", "guz", "guz_KE", "gv", "gv_IM",
|
||||
// H
|
||||
"ha", "ha_GH", "ha_NE", "ha_NG", "haw", "haw_US", "he", "he_IL", "hi", "hi_IN",
|
||||
"hr", "hr_BA", "hr_HR", "hsb", "hsb_DE", "hu", "hu_HU", "hy", "hy_AM",
|
||||
// I
|
||||
"ia", "ia_001", "id", "id_ID", "ig", "ig_NG", "ii", "ii_CN", "in", "in_ID", "is",
|
||||
"is_IS", "it", "it_CH", "it_IT", "it_SM", "it_VA", "iw", "iw_IL",
|
||||
// J
|
||||
"ja", "ja_JP", "ja_JP_TRADITIONAL", "jgo", "jgo_CM", "jmc", "jmc_TZ", "jv", "jv_ID",
|
||||
// K
|
||||
"ka", "ka_GE", "kab", "kab_DZ", "kam", "kam_KE", "kde", "kde_TZ", "kea", "kea_CV",
|
||||
"khq", "khq_ML", "ki", "ki_KE", "kk", "kk_KZ", "kkj", "kkj_CM", "kl", "kl_GL", "kln",
|
||||
"kln_KE", "km", "km_KH", "kn", "kn_IN", "ko", "ko_KP", "ko_KR", "kok", "kok_IN",
|
||||
"ks", "ks_IN", "ksb", "ksb_TZ", "ksf", "ksf_CM", "ksh", "ksh_DE", "ku", "ku_TR",
|
||||
"kw", "kw_GB", "ky", "ky_KG",
|
||||
// L
|
||||
"lag", "lag_TZ", "lb", "lb_LU", "lg", "lg_UG", "lkt", "lkt_US", "ln", "ln_AO",
|
||||
"ln_CD", "ln_CF", "ln_CG", "lo", "lo_LA", "lrc", "lrc_IQ", "lrc_IR", "lt", "lt_LT",
|
||||
"lu", "lu_CD", "luo", "luo_KE", "luy", "luy_KE", "lv", "lv_LV",
|
||||
// M
|
||||
"mas", "mas_KE", "mas_TZ", "mer", "mer_KE", "mfe", "mfe_MU", "mg", "mg_MG", "mgh",
|
||||
"mgh_MZ", "mgo", "mgo_CM", "mi", "mi_NZ", "mk", "mk_MK", "ml", "ml_IN", "mn",
|
||||
"mn_MN", "mo", "mr", "mr_IN", "ms", "ms_BN", "ms_MY", "ms_SG", "mt", "mt_MT", "mua",
|
||||
"mua_CM", "my", "my_MM", "mzn", "mzn_IR",
|
||||
// N
|
||||
"naq", "naq_NA", "nb", "nb_NO", "nb_SJ", "nd", "nd_ZW", "nds", "nds_DE", "nds_NL",
|
||||
"ne", "ne_IN", "ne_NP", "nl", "nl_AW", "nl_BE", "nl_BQ", "nl_CW", "nl_NL", "nl_SR",
|
||||
"nl_SX", "nmg", "nmg_CM", "nn", "nn_NO", "nnh", "nnh_CM", "no", "no_NO", "no_NO_NY",
|
||||
"nus", "nus_SS", "nyn", "nyn_UG",
|
||||
// O
|
||||
"om", "om_ET", "om_KE", "or", "or_IN", "os", "os_GE", "os_RU",
|
||||
// P
|
||||
"pa", "pa_Arab", "pa_Arab_PK", "pa_Guru", "pa_Guru_IN", "pa_IN", "pa_PK", "pl",
|
||||
"pl_PL", "ps", "ps_AF", "ps_PK", "pt", "pt_AO", "pt_BR", "pt_CH", "pt_CV", "pt_GQ",
|
||||
"pt_GW", "pt_LU", "pt_MO", "pt_MZ", "pt_PT", "pt_ST", "pt_TL",
|
||||
// Q
|
||||
"qu", "qu_BO", "qu_EC", "qu_PE",
|
||||
// R
|
||||
"rm", "rm_CH", "rn", "rn_BI", "ro", "ro_MD", "ro_RO", "rof", "rof_TZ", "ru",
|
||||
"ru_BY", "ru_KG", "ru_KZ", "ru_MD", "ru_RU", "ru_UA", "rw", "rw_RW", "rwk", "rwk_TZ",
|
||||
// S
|
||||
"sah", "sah_RU", "saq", "saq_KE", "sbp", "sbp_TZ", "sd", "sd_PK", "se", "se_FI",
|
||||
"se_NO", "se_SE", "seh", "seh_MZ", "ses", "ses_ML", "sg", "sg_CF", "sh", "sh_BA",
|
||||
"sh_CS", "sh_YU", "shi", "shi_Latn", "shi_Latn_MA", "shi_Tfng", "shi_Tfng_MA",
|
||||
"shi_MA", "si", "si_LK", "sk", "sk_SK", "sl", "sl_SI", "smn", "smn_FI", "sn",
|
||||
"sn_ZW", "so", "so_DJ", "so_ET", "so_KE", "so_SO", "sq", "sq_AL", "sq_MK", "sq_XK",
|
||||
"sr", "sr_Cyrl", "sr_Cyrl_BA", "sr_Cyrl_ME", "sr_Cyrl_RS", "sr_Cyrl_CS", "sr_Cyrl_XK",
|
||||
"sr_Cyrl_YU", "sr_Latn", "sr_Latn_BA", "sr_Latn_ME", "sr_Latn_RS", "sr_Latn_CS",
|
||||
"sr_Latn_XK", "sr_Latn_YU", "sr_BA", "sr_ME", "sr_RS", "sr_CS", "sr_XK", "sr_YU",
|
||||
"sv", "sv_AX", "sv_FI", "sv_SE", "sw", "sw_CD", "sw_KE", "sw_TZ", "sw_UG",
|
||||
// T
|
||||
"ta", "ta_IN", "ta_LK", "ta_MY", "ta_SG", "te", "te_IN", "teo", "teo_KE", "teo_UG",
|
||||
"tg", "tg_TJ", "th", "th_TH", "th_TH_TRADITIONAL", "ti", "ti_ER", "ti_ET", "tk",
|
||||
"tk_TM", "tl", "tl_PH", "to", "to_TO", "tr", "tr_CY", "tr_TR", "tt", "tt_RU",
|
||||
"twq", "twq_NE", "tzm", "tzm_MA",
|
||||
// U
|
||||
"ug", "ug_CN", "uk", "uk_UA", "ur", "ur_IN", "ur_PK", "uz", "uz_AF", "uz_Arab",
|
||||
"uz_Arab_AF", "uz_Cyrl", "uz_Cyrl_UZ", "uz_Latn", "uz_Latn_UZ", "uz_UZ",
|
||||
// V
|
||||
"vai", "vai_Latn", "vai_Latn_LR", "vai_LR", "vai_Vaii", "vai_Vaii_LR", "vi",
|
||||
"vi_VN", "vun", "vun_TZ",
|
||||
// W
|
||||
"wae", "wae_CH", "wo", "wo_SN",
|
||||
// X
|
||||
"xh", "xh_ZA", "xog", "xog_UG",
|
||||
// Y
|
||||
"yav", "yav_CM", "yi", "yi_001", "yo", "yo_BJ", "yo_NG", "yue", "yue_CN", "yue_HK",
|
||||
"yue_Hans", "yue_Hans_CN", "yue_Hant", "yue_Hant_HK",
|
||||
// Z
|
||||
"zgh", "zgh_MA", "zh", "zh_Hans", "zh_Hans_CN", "zh_Hans_HK", "zh_Hans_MO",
|
||||
"zh_Hans_SG", "zh_Hant", "zh_Hant_HK", "zh_Hant_MO", "zh_Hant_TW", "zh_CN",
|
||||
"zh_HK", "zh_MO", "zh_SG", "zh_TW", "zu", "zu_ZA");
|
||||
|
||||
private static final ImmutableSet<String> COLL_LOCALE_IDS = ImmutableSet.of(
|
||||
"root",
|
||||
// A-B
|
||||
"af", "am", "ars", "ar", "as", "az", "be", "bg", "bn", "bo", "bs_Cyrl", "bs",
|
||||
// C-F
|
||||
"ca", "ceb", "chr", "cs", "cy", "da", "de_AT", "de", "dsb", "dz", "ee", "el", "en",
|
||||
"en_US_POSIX", "en_US", "eo", "es", "et", "fa_AF", "fa", "fil", "fi", "fo", "fr_CA", "fr",
|
||||
// G-J
|
||||
"ga", "gl", "gu", "ha", "haw", "he", "hi", "hr", "hsb", "hu", "hy",
|
||||
"id_ID", "id", "ig", "in", "in_ID", "is", "it", "iw_IL", "iw", "ja",
|
||||
// K-P
|
||||
"ka", "kk", "kl", "km", "kn", "kok", "ko", "ku", "ky", "lb", "lkt", "ln", "lo", "lt", "lv",
|
||||
"mk", "ml", "mn", "mo", "mr", "ms", "mt", "my", "nb", "ne", "nl", "nn", "no_NO", "no",
|
||||
"om", "or", "pa_IN", "pa", "pa_Guru", "pl", "ps", "pt",
|
||||
// R-T
|
||||
"ro", "ru", "se", "sh_BA", "sh_CS", "sh", "sh_YU", "si", "sk", "sl", "smn", "sq",
|
||||
"sr_BA", "sr_Cyrl_ME", "sr_Latn", "sr_ME", "sr_RS", "sr", "sv", "sw",
|
||||
"ta", "te", "th", "tk", "to", "tr",
|
||||
// U-Z
|
||||
"ug", "uk", "ur", "uz", "vi", "wae", "wo", "xh", "yi", "yo", "yue_CN", "yue_Hans",
|
||||
"yue", "zh_CN", "zh_Hant", "zh_HK", "zh_MO", "zh_SG", "zh_TW", "zh", "zu");
|
||||
|
||||
private static final ImmutableSet<String> BRKITR_LOCALE_IDS = ImmutableSet.of(
|
||||
"root", "de", "el", "en", "en_US_POSIX", "en_US", "es", "fr", "it", "ja", "pt", "ru",
|
||||
"zh_Hant", "zh");
|
||||
|
||||
private static final ImmutableSet<String> RBNF_LOCALE_IDS = ImmutableSet.of(
|
||||
"root", "af", "ak", "am", "ars", "ar", "az", "be", "bg", "bs", "ca", "ccp", "chr", "cs",
|
||||
"cy", "da", "de_CH", "de", "ee", "el", "en_001", "en_IN", "en", "eo", "es_419", "es_DO",
|
||||
"es_GT", "es_HN", "es_MX", "es_NI", "es_PA", "es_PR", "es_SV", "es", "es_US", "et",
|
||||
"fa_AF", "fa", "ff", "fil", "fi", "fo", "fr_BE", "fr_CH", "fr", "ga", "he", "hi", "hr",
|
||||
"hu", "hy", "id", "in", "is", "it", "iw", "ja", "ka", "kl", "km", "ko", "ky", "lb",
|
||||
"lo", "lrc", "lt", "lv", "mk", "ms", "mt", "my", "nb", "nl", "nn", "no", "pl", "pt_PT",
|
||||
"pt", "qu", "ro", "ru", "se", "sh", "sk", "sl", "sq", "sr_Latn", "sr", "sv",
|
||||
"sw", "ta", "th", "tr", "uk", "vi", "yue_Hans", "yue", "zh_Hant_HK", "zh_Hant", "zh_HK",
|
||||
"zh_MO", "zh_TW", "zh");
|
||||
}
|
|
@@ -0,0 +1,165 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.NavigableSet;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.google.common.collect.ArrayListMultimap;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.ListMultimap;
|
||||
|
||||
/**
|
||||
* Mutable ICU data, represented as a mapping from resource bundle paths to a sequence of values.
|
||||
*/
|
||||
public final class IcuData {
|
||||
private static final RbPath RB_VERSION = RbPath.of("Version");
|
||||
private static final Pattern ARRAY_INDEX = Pattern.compile("(/[^\\[]++)(?:\\[(\\d++)\\])?$");
|
||||
|
||||
private final String name;
|
||||
private final boolean hasFallback;
|
||||
private final NavigableSet<RbPath> paths = new TreeSet<>();
|
||||
private final ListMultimap<RbPath, RbValue> rbPathToValues = ArrayListMultimap.create();
|
||||
private ImmutableList<String> commentLines = ImmutableList.of();
|
||||
|
||||
/**
|
||||
* IcuData constructor.
|
||||
*
|
||||
* @param name The name of the IcuData object, used as the name of the root node in the output file
|
||||
* @param hasFallback true if the output file has another ICU file as a fallback.
|
||||
*/
|
||||
public IcuData(String name, boolean hasFallback) {
|
||||
this.hasFallback = hasFallback;
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
/** @return whether data should fallback on data in other ICU files. */
|
||||
public boolean hasFallback() {
|
||||
return hasFallback;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the name of this ICU data instance. Used in the output filename, and in comments.
|
||||
*/
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
/** Sets additional comment lines for the top of the file. */
|
||||
public void setFileComment(String... commentLines) {
|
||||
setFileComment(Arrays.asList(commentLines));
|
||||
}
|
||||
|
||||
public void setFileComment(Iterable<String> commentLines) {
|
||||
this.commentLines = ImmutableList.copyOf(commentLines);
|
||||
}
|
||||
|
||||
public List<String> getFileComment() {
|
||||
return commentLines;
|
||||
}
|
||||
|
||||
/** Adds a singleton resource bundle value for a given path. */
|
||||
public void add(RbPath rbPath, String element) {
|
||||
add(rbPath, RbValue.of(element));
|
||||
}
|
||||
|
||||
/** Adds a single resource bundle value for a given path. */
|
||||
public void add(RbPath rbPath, RbValue rbValue) {
|
||||
rbPathToValues.put(rbPath, rbValue);
|
||||
paths.add(rbPath);
|
||||
}
|
||||
|
||||
/** Adds a sequence of resource bundle values for a given path. */
|
||||
public void add(RbPath rbPath, Iterable<RbValue> rbValues) {
|
||||
rbValues.forEach(v -> rbPathToValues.put(rbPath, v));
|
||||
paths.add(rbPath);
|
||||
}
|
||||
|
||||
/** Replaces all resource bundle values for a given path with the specified singleton value. */
|
||||
public void replace(RbPath rbPath, String element) {
|
||||
rbPathToValues.removeAll(rbPath);
|
||||
rbPathToValues.put(rbPath, RbValue.of(element));
|
||||
paths.add(rbPath);
|
||||
}
|
||||
|
||||
/** Replaces all resource bundle values for a given path with the specified value. */
|
||||
public void replace(RbPath rbPath, RbValue rbValue) {
|
||||
rbPathToValues.removeAll(rbPath);
|
||||
add(rbPath, rbValue);
|
||||
}
|
||||
|
||||
public void setVersion(String versionString) {
|
||||
add(RB_VERSION, versionString);
|
||||
}
|
||||
|
||||
public void addResults(ListMultimap<RbPath, PathValueTransformer.Result> resultsByRbPath) {
|
||||
for (RbPath rbPath : resultsByRbPath.keySet()) {
|
||||
for (PathValueTransformer.Result r : resultsByRbPath.get(rbPath)) {
|
||||
if (r.isGrouped()) {
|
||||
// Grouped results have all the values in a single value entry.
|
||||
add(rbPath, RbValue.of(r.getValues()));
|
||||
} else {
|
||||
if (rbPath.getSegment(rbPath.length() - 1).endsWith(":alias")) {
|
||||
r.getValues().forEach(v -> add(rbPath, RbValue.of(v)));
|
||||
} else {
|
||||
// Ungrouped results are one value per entry, but might be expanded into
|
||||
// grouped results if they are a path referencing a grouped entry.
|
||||
r.getValues().forEach(v -> add(rbPath, replacePathValues(v)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Replaces an ungrouped CLDR value of the form "/foo/bar" or "/foo/bar[N]", which is assumed
|
||||
* to be a reference to an existing value in a resource bundle. Note that the referenced bundle
|
||||
* might be grouped (i.e. an array with more than one element).
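*
* <p>For example (a hypothetical path): if "/weekData/001" has two values in this instance,
* then an ungrouped result value of "/weekData/001[1]" is replaced by the second of those
* values, while "/weekData/001" on its own refers to the first (index 0).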
|
||||
*/
|
||||
private RbValue replacePathValues(String value) {
|
||||
Matcher m = ARRAY_INDEX.matcher(value);
|
||||
if (!m.matches()) {
|
||||
return RbValue.of(value);
|
||||
}
|
||||
// The only constraint is that the "path" value starts with a leading '/', but parsing into
|
||||
// the RbPath ignores this. We must use "parse()" here, rather than RbPath.of(), since the
|
||||
// captured value contains '/' characters to represent path delimiters.
|
||||
RbPath replacePath = RbPath.parse(m.group(1));
|
||||
List<RbValue> replaceValues = get(replacePath);
|
||||
checkArgument(replaceValues != null, "Path %s is missing from IcuData", replacePath);
|
||||
// If no index is given (e.g. "/foo/bar") then treat it as index 0 (i.e. "/foo/bar[0]").
|
||||
int replaceIndex = m.group(2) != null ? Integer.parseInt(m.group(2)) : 0;
|
||||
return replaceValues.get(replaceIndex);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the mutable list of values associated with the given path (or null if there are no
|
||||
* associated values).
|
||||
*/
|
||||
public List<RbValue> get(RbPath rbPath) {
|
||||
return paths.contains(rbPath) ? rbPathToValues.get(rbPath) : null;
|
||||
}
|
||||
|
||||
/** Returns an unmodifiable view of the set of paths in this instance. */
|
||||
public Set<RbPath> getPaths() {
|
||||
return Collections.unmodifiableSet(paths);
|
||||
}
|
||||
|
||||
/** Returns whether the given path is present in this instance. */
|
||||
public boolean contains(RbPath rbPath) {
|
||||
return paths.contains(rbPath);
|
||||
}
|
||||
|
||||
/** Returns whether there are any paths in this instance. */
|
||||
public boolean isEmpty() {
|
||||
return paths.isEmpty();
|
||||
}
|
||||
}
|
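For orientation, a minimal sketch of how the IcuData API above might be used (the path and the
values are invented purely for illustration):

    IcuData data = new IcuData("en", true);                     // "en" names the output file
    data.setVersion("36");                                       // stored under the "Version" path
    data.add(RbPath.parse("/calendar/default"), "gregorian");    // hypothetical path and value
    data.setFileComment("Generated for illustration only.");     // header comment lines
    // data.contains(RbPath.of("Version")) and data.getPaths() can then be used for inspection.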
|
@@ -0,0 +1,381 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu;
|
||||
|
||||
import static com.google.common.base.CharMatcher.whitespace;
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static com.google.common.base.Preconditions.checkElementIndex;
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
import static com.google.common.base.Preconditions.checkState;
|
||||
import static com.google.common.collect.ImmutableList.toImmutableList;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Deque;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.collect.ArrayListMultimap;
|
||||
import com.google.common.collect.HashMultiset;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.ImmutableSetMultimap;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.ListMultimap;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Multiset;
|
||||
|
||||
/**
|
||||
* Helper tool to dump the resource bundle paths and values from an IcuData instance in a stable
|
||||
* ordering, to allow easy comparison in cases where ICU ordering changes. This could easily be
|
||||
* extended to be a more fully featured "diff" tool or a proper ICU data file parser.
|
||||
*
|
||||
* <p>This is a temporary debugging tool and should not be relied upon during any part of the data
|
||||
* generation process.
|
||||
*/
|
||||
final class IcuDataDumper {
|
||||
private static final Joiner LIST_JOINER = Joiner.on(',');
|
||||
private static final RbPath VERSION = RbPath.of("Version");
|
||||
|
||||
public static void main(String... args) throws IOException {
|
||||
Path fileOrDir;
|
||||
Optional<Pattern> name = Optional.empty();
|
||||
switch (args.length) {
|
||||
case 2:
|
||||
name = Optional.of(Pattern.compile(args[1]));
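// fall through: args[0] is still the file or directory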
|
||||
case 1:
|
||||
fileOrDir = Paths.get(args[0]);
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("Usage: <file-or-dir> [<name-pattern>]");
|
||||
}
|
||||
|
||||
if (Files.isDirectory(fileOrDir)) {
|
||||
walkDirectory(fileOrDir, name);
|
||||
} else {
|
||||
checkArgument(!name.isPresent(),
    "cannot specify a name pattern for a non-directory file: %s", fileOrDir);
|
||||
IcuDataParser parser = new IcuDataParser(fileOrDir);
|
||||
parser.parse();
|
||||
dump(parser.icuData);
|
||||
}
|
||||
}
|
||||
|
||||
private static void walkDirectory(Path fileOrDir, Optional<Pattern> name) throws IOException {
|
||||
Predicate<Path> matchesName =
|
||||
f -> name.map(n -> n.matcher(f.getFileName().toString()).matches()).orElse(true);
|
||||
List<IcuDataParser> icuParsers;
|
||||
try (Stream<Path> files = Files.walk(fileOrDir)) {
|
||||
icuParsers = files
|
||||
.filter(Files::isRegularFile)
|
||||
.filter(matchesName)
|
||||
.map(IcuDataParser::new)
|
||||
.collect(toImmutableList());
|
||||
}
|
||||
ListMultimap<RbPath, RbValue> allPaths = ArrayListMultimap.create();
|
||||
for (IcuDataParser p : icuParsers) {
|
||||
p.parse();
|
||||
for (RbPath k : p.icuData.keySet()) {
|
||||
List<RbValue> values = p.icuData.get(k);
|
||||
if (!allPaths.containsKey(k)) {
|
||||
allPaths.putAll(k, values);
|
||||
} else if (!VERSION.equals(k)) {
|
||||
checkState(allPaths.get(k).equals(values), "inconsistent data for path: %s", k);
|
||||
}
|
||||
}
|
||||
}
|
||||
dump(allPaths);
|
||||
}
|
||||
|
||||
private static void dump(ListMultimap<RbPath, RbValue> allPaths) {
|
||||
allPaths.keySet().stream()
|
||||
.sorted()
|
||||
.forEach(k -> System.out.println(k + " :: " + LIST_JOINER.join(allPaths.get(k))));
|
||||
}
|
||||
|
||||
private static final class IcuDataParser {
|
||||
// Path of file being parsed.
|
||||
private final Path path;
|
||||
|
||||
// Comments in header (before data starts), without comment characters.
|
||||
private final List<String> headerComment = new ArrayList<>();
|
||||
// ICU data name (the name of the root element).
|
||||
private String name = null;
|
||||
// ICU data values.
|
||||
private final ListMultimap<RbPath, RbValue> icuData = ArrayListMultimap.create();
|
||||
|
||||
// Current line number (1-indexed).
|
||||
private int lineNumber = 0;
|
||||
// The type of the previous line that was processed.
|
||||
private LineType lastType = LineType.COMMENT;
|
||||
// True when inside /* .. */ comments in the header.
|
||||
private boolean inBlockComment = false;
|
||||
// True when in the final top-level group at the end of parsing.
|
||||
private boolean inFinalGroup = false;
|
||||
// True when a partial (line wrapped) value has been read.
|
||||
private boolean isLineContinuation = false;
|
||||
// Current path while parsing (NOT including the root element).
|
||||
private Deque<String> pathStack = new ArrayDeque<>();
|
||||
// Current sequence of values for the path (as defined in the current path stack).
|
||||
private List<String> currentValue = new ArrayList<>();
|
||||
// Current partially read value of a multi-line value.
|
||||
private String wrappedValue = "";
|
||||
// Map of indices used to auto-generate names for anonymous path segments.
|
||||
// TODO: Check if this is even needed and remove if not.
|
||||
private Multiset<Integer> indices = HashMultiset.create();
|
||||
|
||||
IcuDataParser(Path path) {
|
||||
this.path = checkNotNull(path);
|
||||
}
|
||||
|
||||
public boolean parse() throws IOException {
|
||||
List<String> lines = Files.readAllLines(path);
|
||||
// Best approximation to a magic number we have (BOM plus inline comment). This stops
// us trying to parse the transliteration files, which are a different type.
|
||||
if (!lines.get(0).startsWith("\uFEFF//")) {
|
||||
return false;
|
||||
}
|
||||
lines.stream().map(whitespace()::trimFrom).forEach(this::processLineWithCheck);
|
||||
|
||||
// Sanity check for expected final state. Just checking the "lastType" should be enough
|
||||
// to catch everything else (due to transition rules and how the code tidies up) but it
|
||||
// seems prudent to sanity check everything just in case.
|
||||
checkState(lastType == LineType.GROUP_END);
|
||||
checkState(!inBlockComment);
|
||||
checkState(name != null);
|
||||
checkState(pathStack.isEmpty() && inFinalGroup);
|
||||
checkState(wrappedValue.isEmpty() && currentValue.isEmpty());
|
||||
return true;
|
||||
}
|
||||
|
||||
void processLineWithCheck(String line) {
|
||||
lineNumber++;
|
||||
if (lineNumber == 1 && line.startsWith("\uFEFF")) {
|
||||
line = line.substring(1);
|
||||
}
|
||||
try {
|
||||
processLine(line);
|
||||
} catch (RuntimeException e) {
|
||||
throw new RuntimeException(
|
||||
String.format("[%s:%s] %s (%s)", path, lineNumber, e.getMessage(), line),
|
||||
e);
|
||||
}
|
||||
}
|
||||
|
||||
void processLine(String line) {
|
||||
line = maybeTrimEndOfLineComment(line);
|
||||
if (line.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
LineMatch match = LineType.match(line, inBlockComment);
|
||||
checkState(match.getType().isValidTransitionFrom(lastType),
|
||||
"invalid state transition: %s --//-> %s", lastType, match.getType());
|
||||
boolean isEndOfWrappedValue = false;
|
||||
switch (match.getType()) {
|
||||
case COMMENT:
|
||||
if (name != null) {
|
||||
// Comments in data are ignored since they cannot be properly associated with
|
||||
// paths or values in an IcuData instance (only legacy tooling emits these).
|
||||
break;
|
||||
}
|
||||
if (line.startsWith("/*")) {
|
||||
inBlockComment = true;
|
||||
}
|
||||
headerComment.add(match.get(0));
|
||||
if (inBlockComment && line.contains("*/")) {
|
||||
checkState(line.indexOf("*/") == line.length() - 2,
|
||||
"unexpected end of comment block");
|
||||
inBlockComment = false;
|
||||
}
|
||||
break;
|
||||
|
||||
case INLINE_VALUE:
|
||||
icuData.put(
|
||||
getPathFromStack().extendBy(getSegment(match.get(0))),
|
||||
RbValue.of(unquote(match.get(1))));
|
||||
break;
|
||||
|
||||
case GROUP_START:
|
||||
checkState(currentValue.isEmpty());
|
||||
if (name == null) {
|
||||
name = match.get(0);
|
||||
checkState(name != null, "cannot have anonymous top-level group");
|
||||
} else {
|
||||
pathStack.push(getSegment(match.get(0)));
|
||||
}
|
||||
wrappedValue = "";
|
||||
isLineContinuation = false;
|
||||
break;
|
||||
|
||||
case QUOTED_VALUE:
|
||||
wrappedValue += unquote(match.get(0));
|
||||
isLineContinuation = !line.endsWith(",");
|
||||
if (!isLineContinuation) {
|
||||
currentValue.add(wrappedValue);
|
||||
wrappedValue = "";
|
||||
}
|
||||
break;
|
||||
|
||||
case VALUE:
|
||||
checkState(!isLineContinuation, "unexpected unquoted value");
|
||||
currentValue.add(match.get(0));
|
||||
break;
|
||||
|
||||
case GROUP_END:
|
||||
// Account for quoted values without trailing ',' just before group end.
|
||||
if (isLineContinuation) {
|
||||
currentValue.add(wrappedValue);
|
||||
isLineContinuation = false;
|
||||
}
|
||||
// Emit the collected sequence of values for the current path as an RbValue.
|
||||
if (!currentValue.isEmpty()) {
|
||||
icuData.put(getPathFromStack(), RbValue.of(currentValue));
|
||||
currentValue.clear();
|
||||
}
|
||||
// Annoyingly the name is outside the stack so the stack will empty before the last
|
||||
// end group.
|
||||
if (!pathStack.isEmpty()) {
|
||||
pathStack.pop();
|
||||
indices.setCount(pathStack.size(), 0);
|
||||
} else {
|
||||
checkState(!inFinalGroup, "unexpected group end");
|
||||
inFinalGroup = true;
|
||||
}
|
||||
break;
|
||||
|
||||
case UNKNOWN:
|
||||
throw new IllegalStateException("cannot parse line: " + match.get(0));
|
||||
}
|
||||
lastType = match.getType();
|
||||
}
|
||||
|
||||
private RbPath getPathFromStack() {
|
||||
if (pathStack.isEmpty()) {
|
||||
return RbPath.empty();
|
||||
}
|
||||
List<String> segments = new ArrayList<>();
|
||||
Iterables.addAll(segments, pathStack);
|
||||
if (segments.get(0).matches("<[0-9]{4}>")) {
|
||||
segments.remove(0);
|
||||
}
|
||||
return segments.isEmpty() ? RbPath.empty() : RbPath.of(Lists.reverse(segments));
|
||||
}
|
||||
|
||||
private String getSegment(String segmentOrNull) {
|
||||
if (segmentOrNull != null) {
|
||||
return segmentOrNull;
|
||||
}
|
||||
int depth = pathStack.size();
|
||||
int index = indices.count(depth);
|
||||
indices.add(depth, 1);
|
||||
return String.format("<%04d>", index);
|
||||
}
|
||||
|
||||
private String maybeTrimEndOfLineComment(String line) {
|
||||
// Once the name is set, we are past the header and into the data.
|
||||
if (name != null) {
|
||||
// Index to search for '//' from - must skip quoted values.
|
||||
int startIdx = line.startsWith("\"") ? line.indexOf('"', 1) + 1 : 0;
|
||||
int commentIdx = line.indexOf("//", startIdx);
|
||||
if (commentIdx != -1) {
|
||||
line = whitespace().trimTrailingFrom(line.substring(0, commentIdx));
|
||||
}
|
||||
}
|
||||
return line;
|
||||
}
|
||||
|
||||
private static String unquote(String s) {
|
||||
if (s.startsWith("\"") && s.endsWith("\"")) {
|
||||
return s.substring(1, s.length() - 1).replaceAll("\\\\([\"\\\\])", "$1");
|
||||
}
|
||||
checkState(!s.contains("\""), "invalid unquoted value: %s", s);
|
||||
return s;
|
||||
}
|
||||
|
||||
private static final class LineMatch {
|
||||
private final LineType type;
|
||||
private final Function<Integer, String> args;
|
||||
|
||||
LineMatch(LineType type, Function<Integer, String> args) {
|
||||
this.type = checkNotNull(type);
|
||||
this.args = checkNotNull(args);
|
||||
}
|
||||
|
||||
String get(int n) {
|
||||
return args.apply(n);
|
||||
}
|
||||
|
||||
LineType getType() {
|
||||
return type;
|
||||
}
|
||||
}
|
||||
|
||||
private enum LineType {
|
||||
// Comment start ('//' or '/*'), with any comment text captured.
|
||||
COMMENT("(?://|/\\*)\\s*(.*)"),
|
||||
// A combination of GROUP_START, VALUE and GROUP_END with whitespace.
|
||||
INLINE_VALUE("(?:(.*\\S)\\s*)?\\{\\s*((?:\".*\")|(?:[^\"{}]*\\S))\\s*\\}"),
|
||||
// Allows for empty segment names (anonymous arrays) which match 'null'.
|
||||
GROUP_START("(?:(.*\\S)\\s*)?\\{"),
|
||||
GROUP_END("\\}"),
|
||||
QUOTED_VALUE("(\".*\"),?"),
|
||||
VALUE("([^\"{}]+),?"),
|
||||
UNKNOWN(".*");
|
||||
|
||||
// Table of allowed transitions expected during parsing.
|
||||
// key=current state, values=set of permitted previous states
|
||||
private static ImmutableSetMultimap<LineType, LineType> TRANSITIONS =
|
||||
ImmutableSetMultimap.<LineType, LineType>builder()
|
||||
.putAll(COMMENT, COMMENT)
|
||||
.putAll(INLINE_VALUE, COMMENT, INLINE_VALUE, GROUP_START, GROUP_END)
|
||||
.putAll(GROUP_START, COMMENT, GROUP_START, GROUP_END, INLINE_VALUE)
|
||||
.putAll(VALUE, GROUP_START, VALUE, QUOTED_VALUE)
|
||||
.putAll(QUOTED_VALUE, GROUP_START, VALUE, QUOTED_VALUE)
|
||||
.putAll(GROUP_END, GROUP_END, INLINE_VALUE, VALUE, QUOTED_VALUE)
|
||||
.build();
|
||||
|
||||
private final Pattern pattern;
|
||||
|
||||
LineType(String regex) {
|
||||
this.pattern = Pattern.compile(regex);
|
||||
}
|
||||
|
||||
boolean isValidTransitionFrom(LineType lastType) {
|
||||
return TRANSITIONS.get(this).contains(lastType);
|
||||
}
|
||||
|
||||
static LineMatch match(String line, boolean inBlockComment) {
|
||||
// Block comments kinda suck and it'd be great if the ICU data only used '//' style
|
||||
// comments (it would definitely simplify any parsers out there). Once the
|
||||
// transition to the new transformation tools is complete, they can be changed to
|
||||
// only emit '//' style comments.
|
||||
if (inBlockComment) {
|
||||
if (line.startsWith("*")) {
|
||||
line = whitespace().trimLeadingFrom(line.substring(1));
|
||||
}
|
||||
return new LineMatch(COMMENT, ImmutableList.of(line)::get);
|
||||
}
|
||||
for (LineType type : TRANSITIONS.keySet()) {
|
||||
// Regex groups start at 1, but we want the getter function to be zero-indexed.
|
||||
Matcher m = type.pattern.matcher(line);
|
||||
if (m.matches()) {
|
||||
return new LineMatch(type, n -> {
|
||||
checkElementIndex(n, m.groupCount());
|
||||
return m.group(n + 1);
|
||||
});
|
||||
}
|
||||
}
|
||||
return new LineMatch(UNKNOWN, ImmutableList.of(line)::get);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
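For reference, an invocation of this dumper might look like the following (the exec-maven-plugin
main class override and the argument values are illustrative, not part of this commit):

    $ mvn exec:java \
        -Dexec.mainClass=org.unicode.icu.tool.cldrtoicu.IcuDataDumper \
        -Dexec.args='<OUT_DIR>/locales en.*'

This prints every resource bundle path with its values, sorted by path; when a directory is given,
data from all matching files is merged and checked for consistency first.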
|
@@ -0,0 +1,209 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
import static java.lang.Integer.parseInt;
|
||||
|
||||
import java.time.LocalDate;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.ZoneOffset;
|
||||
import java.util.function.Function;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.google.common.base.Ascii;
|
||||
import com.google.common.base.CharMatcher;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import org.unicode.icu.tool.cldrtoicu.regex.NamedFunction;
|
||||
|
||||
/**
|
||||
* The named functions used by the {@code RegexTransformer} for {@code ldml2icu_supplemental.txt}.
|
||||
*/
|
||||
final class IcuFunctions {
|
||||
/**
|
||||
* Converts an ISO date string to a space-separated pair of integer values representing the top
|
||||
* and bottom parts of a deconstructed millisecond epoch value (i.e. {@code
|
||||
* "<hi32bits> <low32bits>"}).
|
||||
*
|
||||
* <p>Note that the values are formatted as <em>signed</em> decimal values, so it's entirely
|
||||
* possible that the low bits value will appear as a negative number (the high bits won't
|
||||
* appear negative for many thousands of years).
|
||||
*
|
||||
* <ul>
|
||||
* <li>args[0] = ISO date string (e.g. "2019-05-23")
|
||||
* <li>args[1] = Date field type name (e.g. "from")
|
||||
* </ul>
|
||||
*/
|
||||
static final NamedFunction DATE_FN =
|
||||
NamedFunction.create("date", 2, args -> {
|
||||
long millis =
|
||||
DateFieldType.toEnum(args.get(1)).toEpochMillis(LocalDate.parse(args.get(0)));
|
||||
// Strictly speaking the masking is redundant and could be removed.
|
||||
int hiBits = (int) ((millis >>> 32) & 0xFFFFFFFFL);
|
||||
int loBits = (int) (millis & 0xFFFFFFFFL);
|
||||
return hiBits + " " + loBits;
|
||||
});
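// Worked example (for illustration): ("2019-05-23", "from") maps to 1558569600000 epoch
// milliseconds, which is emitted as "362 -503528448" (high and low 32-bit halves, signed).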
|
||||
|
||||
// TODO(dbeaumont): Improve this documentation (e.g. why is this being done, give examples?).
|
||||
/**
|
||||
* Inserts '%' into numberingSystems descriptions.
|
||||
*
|
||||
* <ul>
|
||||
* <li>args[0] = numbering system description (string)
|
||||
* </ul>
|
||||
*/
|
||||
static final NamedFunction ALGORITHM_FN =
|
||||
NamedFunction.create("algorithm", 1, args -> {
|
||||
String value = args.get(0);
|
||||
int percentPos = value.lastIndexOf('/') + 1;
|
||||
return value.substring(0, percentPos) + '%' + value.substring(percentPos);
|
||||
});
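// For illustration: a rules value such as "zh_Hant/SpelloutRules/spellout-cardinal" becomes
// "zh_Hant/SpelloutRules/%spellout-cardinal", and a value with no '/' (e.g. "armenian-upper")
// becomes "%armenian-upper".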
|
||||
|
||||
/**
|
||||
* Converts a number into a special integer that represents the number in normalized scientific
|
||||
* notation for ICU's RB parser.
|
||||
*
|
||||
* <p>Resultant integers are in the form "xxyyyyyy", where "xx" is the exponent offset by 50
|
||||
* and "yyyyyy" is the coefficient to 5 decimal places. Results may also have a leading '-' to
|
||||
* denote negative values.
|
||||
*
|
||||
* <p>For example:
|
||||
* <pre>{@code
|
||||
* 14660000000000 -> 1.466E13 -> 63146600
|
||||
* 0.0001 -> 1E-4 -> 46100000
|
||||
* -123.456 -> -1.23456E-2 -> -48123456
|
||||
* }</pre>
|
||||
*
|
||||
* <p>The additional exponent offset is applied directly to the calculated exponent and is used
|
||||
* to do things like converting percentages into their decimal representation (i.e. by passing
|
||||
* a value of "-2").
|
||||
*
|
||||
* <ul>
|
||||
* <li>args[0] = number to be converted (double)
|
||||
* <li>args[1] = additional exponent offset (integer)
|
||||
* </ul>
|
||||
*/
|
||||
static final NamedFunction EXP_FN =
|
||||
NamedFunction.create("exp", 2, args -> {
|
||||
double value = Double.parseDouble(args.get(0));
|
||||
if (value == 0) {
|
||||
return "0";
|
||||
}
|
||||
int exponent = 50;
|
||||
if (args.size() == 2) {
|
||||
exponent += Integer.parseInt(args.get(1));
|
||||
}
|
||||
String sign = value >= 0 ? "" : "-";
|
||||
value = Math.abs(value);
|
||||
while (value >= 10) {
|
||||
value /= 10;
|
||||
exponent++;
|
||||
}
|
||||
while (value < 1) {
|
||||
value *= 10;
|
||||
exponent--;
|
||||
}
|
||||
if (exponent < 0 || exponent > 99) {
|
||||
throw new IllegalArgumentException("Exponent out of bounds: " + exponent);
|
||||
}
|
||||
return sign + exponent + Math.round(value * 100000);
|
||||
});
|
||||
|
||||
// Allow for single digit values in any part and negative year values.
|
||||
private static final Pattern YMD = Pattern.compile("(-?[0-9]+)-([0-9]{1,2})-([0-9]{1,2})");
|
||||
|
||||
/**
|
||||
* Converts an ISO date string (i.e. "YYYY-MM-DD") into an ICU date string, which is
|
||||
* the same but with spaces instead of hyphens. Since functions are expanded before the
|
||||
* resulting value is split, this function will result in 3 separate values being created,
|
||||
* unless the function call is enclosed in quotes.
|
||||
*
|
||||
* <p>Note that for some cases (e.g. "eras") the year part can be negative (e.g. "-2165-1-1")
|
||||
* so this is not as simple as "split by hyphen".
|
||||
*
|
||||
* <ul>
|
||||
* <li>args[0] = ISO date string (e.g. "2019-05-23" or "-2165-1-1")
|
||||
* </ul>
|
||||
*/
|
||||
static final NamedFunction YMD_FN =
|
||||
NamedFunction.create("ymd", 1, args -> {
|
||||
Matcher m = YMD.matcher(args.get(0));
|
||||
checkArgument(m.matches(), "invalid year-month-day string: %s", args.get(0));
|
||||
// NOTE: Re-parsing is not optional since it removes leading zeros (needed for ICU).
|
||||
return String.format("%s %s %s",
|
||||
parseInt(m.group(1)), parseInt(m.group(2)), parseInt(m.group(3)));
|
||||
});
|
||||
|
||||
// For transforming day-of-week identifiers.
|
||||
private static final ImmutableMap<String, String> WEEKDAY_MAP_ID =
|
||||
ImmutableMap.<String, String>builder()
|
||||
.put("sun", "1")
|
||||
.put("mon", "2")
|
||||
.put("tues", "3")
|
||||
.put("wed", "4")
|
||||
.put("thu", "5")
|
||||
.put("fri", "6")
|
||||
.put("sat", "7")
|
||||
.build();
|
||||
|
||||
/**
|
||||
* Converts a day-of-week identifier into its ordinal value (e.g. "sun" --> 1, "mon" --> 2 ...).
|
||||
*/
|
||||
static final NamedFunction DAY_NUMBER_FN =
|
||||
NamedFunction.create("day_number", 1,
|
||||
args -> {
|
||||
String id = WEEKDAY_MAP_ID.get(args.get(0));
|
||||
checkArgument(id != null, "unknown weekday: %s", args.get(0));
|
||||
return id;
|
||||
});
|
||||
|
||||
// For transform IDs in <contextTransform> elements.
|
||||
private static final ImmutableMap<String, String> TRANSFORM_ID_MAP =
|
||||
ImmutableMap.of("no-change", "0", "titlecase-firstword", "1");
|
||||
|
||||
/**
|
||||
* Converts the transform type in the {@code <contextTransform>} element into its ICU index
|
||||
* (e.g. "titlecase-firstword" --> 1).
|
||||
*/
|
||||
static final NamedFunction CONTEXT_TRANSFORM_INDEX_FN =
|
||||
NamedFunction.create("context_transform_index", 1,
|
||||
args -> {
|
||||
String id = TRANSFORM_ID_MAP.get(args.get(0));
|
||||
checkArgument(id != null, "unknown contextTransform: %s", args.get(0));
|
||||
return id;
|
||||
});
|
||||
|
||||
// For DATE_FN only.
|
||||
private enum DateFieldType {
|
||||
from(LocalDate::atStartOfDay),
|
||||
// Remember that atTime() takes nanoseconds, not micro or milli.
|
||||
to(d -> d.atTime(23, 59, 59, 999_000_000));
|
||||
|
||||
private final Function<LocalDate, LocalDateTime> adjustFn;
|
||||
|
||||
DateFieldType(Function<LocalDate, LocalDateTime> adjustFn) {
|
||||
this.adjustFn = adjustFn;
|
||||
}
|
||||
|
||||
long toEpochMillis(LocalDate date) {
|
||||
return adjustFn.apply(date).toInstant(ZoneOffset.UTC).toEpochMilli();
|
||||
}
|
||||
|
||||
static DateFieldType toEnum(String value) {
|
||||
switch (Ascii.toLowerCase(CharMatcher.whitespace().trimFrom(value))) {
|
||||
case "from":
|
||||
case "start":
|
||||
return from;
|
||||
case "to":
|
||||
case "end":
|
||||
return to;
|
||||
default:
|
||||
throw new IllegalArgumentException(value + " is not a valid date field type");
|
||||
}
|
||||
}
|
||||
}
|
||||
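// Illustrative values for the two adjustments (computed from the code above, in UTC):
//   from.toEpochMillis(LocalDate.of(1970, 1, 2)) ->  86_400_000   (00:00:00.000)
//   to.toEpochMillis(LocalDate.of(1970, 1, 2))   -> 172_799_999   (23:59:59.999)
// i.e. "to" is inclusive up to the last millisecond of the given day.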
|
||||
private IcuFunctions() {}
|
||||
}
|
|
@@ -0,0 +1,313 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
import static java.util.stream.Collectors.joining;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.io.Writer;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Writes an IcuData object to a text file. A lot of this class was copied directly from the
|
||||
* original {@code IcuTextWriter} in the CLDR project and has a number of very idiosyncratic
|
||||
* behaviours. The behaviour of this class is currently tuned to produce perfect parity with
|
||||
* the original conversion tools, but once migration of the tools is complete, it should
|
||||
* probably be revisited and tidied up.
|
||||
*/
|
||||
// TODO: Link to a definitive specification for the ICU data files and remove the hacks!
|
||||
final class IcuTextWriter {
|
||||
private static final String INDENT = " ";
|
||||
// List of characters to escape in UnicodeSets
|
||||
// ('\' followed by any of '\', '[', ']', '{', '}', '-', '&', ':', '^', '=').
|
||||
private static final Pattern UNICODESET_ESCAPE =
|
||||
Pattern.compile("\\\\[\\\\\\[\\]\\{\\}\\-&:^=]");
|
||||
// Only escape \ and " from other strings.
|
||||
private static final Pattern STRING_ESCAPE = Pattern.compile("(?!')\\\\\\\\(?!')");
|
||||
private static final Pattern QUOTE_ESCAPE = Pattern.compile("\\\\?\"");
|
||||
|
||||
/** Write a file in ICU data format with the specified header. */
|
||||
static void writeToFile(IcuData icuData, Path outDir, List<String> header) {
|
||||
try {
|
||||
Files.createDirectories(outDir);
|
||||
try (Writer w = Files.newBufferedWriter(outDir.resolve(icuData.getName() + ".txt"));
|
||||
PrintWriter out = new PrintWriter(w)) {
|
||||
new IcuTextWriter(icuData).writeTo(out, header);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("cannot write ICU data file: " + icuData.getName(), e);
|
||||
}
|
||||
}
|
||||
|
||||
private final IcuData icuData;
|
||||
private int depth = 0;
|
||||
private boolean valueWasInline = false;
|
||||
|
||||
IcuTextWriter(IcuData icuData) {
|
||||
this.icuData = checkNotNull(icuData);
|
||||
}
|
||||
|
||||
// TODO: Write a UTF-8 header (see https://unicode-org.atlassian.net/browse/ICU-10197).
|
||||
private void writeTo(PrintWriter out, List<String> header) throws IOException {
|
||||
out.write('\uFEFF');
|
||||
writeHeaderAndComments(out, header, icuData.getFileComment());
|
||||
|
||||
// Write the ICU data to file. This takes the form:
|
||||
// ----
|
||||
// <name>{
|
||||
// foo{
|
||||
// bar{baz}
|
||||
// }
|
||||
// }
|
||||
// ----
|
||||
// So it's like every RbPath has an implicit prefix of the IcuData name.
|
||||
String root = icuData.getName();
|
||||
if (!icuData.hasFallback()) {
|
||||
root += ":table(nofallback)";
|
||||
}
|
||||
// TODO: Replace with "open(root, out)" once happy with differences (it adds a blank line).
|
||||
out.print(root);
|
||||
out.print("{");
|
||||
depth++;
|
||||
|
||||
RbPath lastPath = RbPath.empty();
|
||||
for (RbPath path : icuData.getPaths()) {
|
||||
// Close any blocks up to the common path length. Since paths are all distinct, the
|
||||
// common length should always be shorter than either path. We add 1 since we must also
|
||||
// account for the implicit root segment.
|
||||
int commonDepth = RbPath.getCommonPrefixLength(lastPath, path) + 1;
|
||||
// Before closing, the "cursor" is at the end of the last value written.
|
||||
closeLastPath(lastPath, commonDepth, out);
|
||||
// After opening the value will be ready for the next value to be written.
|
||||
openNextPath(path, out);
|
||||
valueWasInline = appendValues(icuData.getName(), path, icuData.get(path), out);
|
||||
lastPath = path;
|
||||
}
|
||||
closeLastPath(lastPath, 0, out);
|
||||
out.println();
|
||||
out.close();
|
||||
}
|
||||
|
||||
// Before: Cursor is at the end of the previous line.
|
||||
// After: Cursor is positioned immediately after the last closed '}'
|
||||
private void closeLastPath(RbPath lastPath, int minDepth, PrintWriter out) {
|
||||
if (valueWasInline) {
|
||||
depth--;
|
||||
out.print('}');
|
||||
valueWasInline = false;
|
||||
}
|
||||
while (depth > minDepth) {
|
||||
close(out);
|
||||
}
|
||||
}
|
||||
|
||||
// Before: Cursor is at the end of the previous line.
|
||||
// After: Cursor is positioned immediately after the newly opened '{'
|
||||
private void openNextPath(RbPath path, PrintWriter out) {
|
||||
while (depth <= path.length()) {
|
||||
// The -1 is to adjust for the implicit root element which means indentation (depth)
|
||||
// no longer matches the index of the segment we are writing.
|
||||
open(path.getSegment(depth - 1), out);
|
||||
}
|
||||
}
|
||||
|
||||
private void open(String label, PrintWriter out) {
|
||||
newLineAndIndent(out);
|
||||
depth++;
|
||||
// This handles the "magic" pseudo indexing paths that are added by RegexTransformer.
|
||||
// These take the form of "<any-string>" and are used to ensure that path order can be
|
||||
// well defined even for anonymous lists of items.
|
||||
if (!label.startsWith("<") && !label.endsWith(">")) {
|
||||
out.print(label);
|
||||
}
|
||||
out.print('{');
|
||||
}
|
||||
|
||||
private void close(PrintWriter out) {
|
||||
depth--;
|
||||
newLineAndIndent(out);
|
||||
out.print('}');
|
||||
}
|
||||
|
||||
private void newLineAndIndent(PrintWriter out) {
|
||||
out.println();
|
||||
for (int i = 0; i < depth; i++) {
|
||||
out.print(INDENT);
|
||||
}
|
||||
}
|
||||
|
||||
// Currently the "header" uses '//' line comments but the comments are in a block.
|
||||
// TODO: Sort this out so there isn't a messy mix of comment styles in the data files.
|
||||
private static void writeHeaderAndComments(
|
||||
PrintWriter out, List<String> header, List<String> comments) {
|
||||
header.forEach(out::println);
|
||||
if (!comments.isEmpty()) {
|
||||
// TODO: Don't use /* */ block quotes, just use inline // quotes.
|
||||
out.println(
|
||||
comments.stream().collect(joining("\n * ", "/**\n * ", "\n */")));
|
||||
}
|
||||
}
|
||||
|
||||
/** Inserts padding and values between braces. */
|
||||
private boolean appendValues(
|
||||
String name, RbPath rbPath, List<RbValue> values, PrintWriter out) {
|
||||
|
||||
RbValue onlyValue;
|
||||
boolean wasSingular = false;
|
||||
boolean quote = !rbPath.isIntPath();
|
||||
boolean isSequence = rbPath.endsWith(RB_SEQUENCE);
|
||||
if (values.size() == 1 && !mustBeArray(true, name, rbPath)) {
|
||||
onlyValue = values.get(0);
|
||||
if (onlyValue.size() == 1 && !mustBeArray(false, name, rbPath)) {
|
||||
// Value has a single element and is not being forced to be an array.
|
||||
String onlyElement = onlyValue.getElement(0);
|
||||
if (quote) {
|
||||
onlyElement = quoteInside(onlyElement);
|
||||
}
|
||||
// The numbers below are simply tuned to match the line wrapping in the original
|
||||
// CLDR code. The behaviour it produces is sometimes strange (wrapping a line just
|
||||
// for a single character) and could definitely be improved.
|
||||
// TODO: Simplify this and add hysteresis to ensure less "jarring" line wrapping.
|
||||
int maxWidth = Math.max(68, 80 - Math.min(4, rbPath.length()) * INDENT.length());
|
||||
if (onlyElement.length() <= maxWidth) {
|
||||
// Single element for path: don't add newlines.
|
||||
printValue(out, onlyElement, quote);
|
||||
wasSingular = true;
|
||||
} else {
|
||||
// Element too long to fit in one line, so wrap.
|
||||
int end;
|
||||
for (int i = 0; i < onlyElement.length(); i = end) {
|
||||
end = goodBreak(onlyElement, i + maxWidth);
|
||||
String part = onlyElement.substring(i, end);
|
||||
newLineAndIndent(out);
|
||||
printValue(out, part, quote);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Only one array for the rbPath, so don't add an extra set of braces.
|
||||
printArray(onlyValue, quote, isSequence, out);
|
||||
}
|
||||
} else {
|
||||
for (RbValue value : values) {
|
||||
if (value.size() == 1) {
|
||||
// Single-value array: print normally.
|
||||
printArray(value, quote, isSequence, out);
|
||||
} else {
|
||||
// Enclose this array in braces to separate it from other values.
|
||||
open("", out);
|
||||
printArray(value, quote, isSequence, out);
|
||||
close(out);
|
||||
}
|
||||
}
|
||||
}
|
||||
return wasSingular;
|
||||
}
|
||||
|
||||
private static final RbPath RB_SEQUENCE = RbPath.of("Sequence");
|
||||
private static final RbPath RB_RULES = RbPath.of("rules");
|
||||
private static final RbPath RB_LOCALE_SCRIPT = RbPath.of("LocaleScript");
|
||||
private static final RbPath RB_ERAS = RbPath.of("eras");
|
||||
private static final RbPath RB_NAMED = RbPath.of("named");
|
||||
private static final RbPath RB_CALENDAR_PREFERENCE_DATA = RbPath.of("calendarPreferenceData");
|
||||
private static final RbPath RB_METAZONE_INFO = RbPath.of("metazoneInfo");
|
||||
|
||||
/**
|
||||
* Wrapper for a hack to determine if the given rb path should always present its values as an
|
||||
* array.
|
||||
*/
|
||||
// TODO: Verify this is still needed, and either make it less hacky, or delete it.
|
||||
private static boolean mustBeArray(boolean topValues, String name, RbPath rbPath) {
|
||||
if (topValues) {
|
||||
// matches "rules/setNN" (hence the mucking about with raw segments).
|
||||
return name.equals("pluralRanges")
|
||||
&& rbPath.startsWith(RB_RULES)
|
||||
&& rbPath.getSegment(1).startsWith("set");
|
||||
}
|
||||
return rbPath.equals(RB_LOCALE_SCRIPT)
|
||||
|| (rbPath.contains(RB_ERAS)
|
||||
&& !rbPath.getSegment(rbPath.length() - 1).endsWith(":alias")
|
||||
&& !rbPath.endsWith(RB_NAMED))
|
||||
|| rbPath.startsWith(RB_CALENDAR_PREFERENCE_DATA)
|
||||
|| rbPath.startsWith(RB_METAZONE_INFO);
|
||||
}
|
||||
|
||||
private void printArray(RbValue rbValue, boolean quote, boolean isSequence, PrintWriter out) {
|
||||
for (int n = 0; n < rbValue.size(); n++) {
|
||||
newLineAndIndent(out);
|
||||
printValue(out, quoteInside(rbValue.getElement(n)), quote);
|
||||
if (!isSequence) {
|
||||
out.print(",");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void printValue(PrintWriter out, String value, boolean quote) {
|
||||
if (quote) {
|
||||
out.append('"').append(value).append('"');
|
||||
} else {
|
||||
out.append(value);
|
||||
}
|
||||
}
|
||||
|
||||
// Can a string be broken here? If not, back up until we can.
|
||||
// TODO: Either don't bother line wrapping or look at making this use a line-break iterator.
|
||||
private static int goodBreak(String quoted, int end) {
|
||||
if (end > quoted.length()) {
|
||||
return quoted.length();
|
||||
}
|
||||
// Don't break escaped Unicode characters.
|
||||
// Need to handle both e.g. \u4E00 and \U00020000
|
||||
for (int i = end - 1; i > end - 10;) {
|
||||
char current = quoted.charAt(i--);
|
||||
if (!Character.toString(current).matches("[0-9A-Fa-f]")) {
|
||||
if ((current == 'u' || current == 'U') && i > end - 10
|
||||
&& quoted.charAt(i) == '\\') {
|
||||
return i;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
while (end > 0) {
|
||||
char ch = quoted.charAt(end - 1);
|
||||
if (ch != '\\' && (ch < '\uD800' || ch > '\uDFFF')) {
|
||||
break;
|
||||
}
|
||||
--end;
|
||||
}
|
||||
return end;
|
||||
}
|
||||
|
||||
// Fix characters inside strings.
|
||||
private static String quoteInside(String item) {
|
||||
// Unicode-escape all quotes.
|
||||
item = QUOTE_ESCAPE.matcher(item).replaceAll("\\\\u0022");
|
||||
// Double up on backslashes, ignoring Unicode-escaped characters.
|
||||
Pattern pattern =
|
||||
item.startsWith("[") && item.endsWith("]") ? UNICODESET_ESCAPE : STRING_ESCAPE;
|
||||
Matcher matcher = pattern.matcher(item);
|
||||
|
||||
if (!matcher.find()) {
|
||||
return item;
|
||||
}
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
int start = 0;
|
||||
do {
|
||||
buffer.append(item, start, matcher.start());
|
||||
int punctuationChar = item.codePointAt(matcher.end() - 1);
|
||||
buffer.append("\\");
|
||||
if (punctuationChar == '\\') {
|
||||
buffer.append('\\');
|
||||
}
|
||||
buffer.append(matcher.group());
|
||||
start = matcher.end();
|
||||
} while (matcher.find());
|
||||
buffer.append(item.substring(start));
|
||||
return buffer.toString();
|
||||
}
|
||||
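// Illustrative escaping behaviour (traced by hand from the patterns above, so treat
// it as a sketch rather than a specification):
//   input value:    say "hi"
//   after escaping: say \u0022hi\u0022
// Every double quote (and any backslash immediately preceding it) is rewritten as the
// escape \u0022; backslash doubling then uses UNICODESET_ESCAPE only for values that
// look like UnicodeSets ("[...]"), and STRING_ESCAPE otherwise.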
}
|
|
@@ -0,0 +1,618 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.BRKITR;
|
||||
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.COLL;
|
||||
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.CURR;
|
||||
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.LANG;
|
||||
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.LOCALES;
|
||||
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.RBNF;
|
||||
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.REGION;
|
||||
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.UNIT;
|
||||
import static org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir.ZONE;
|
||||
import static java.util.stream.Collectors.toList;
|
||||
import static org.unicode.cldr.api.CldrDataType.BCP47;
|
||||
import static org.unicode.cldr.api.CldrDataType.LDML;
|
||||
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.unicode.cldr.api.CldrData;
|
||||
import org.unicode.cldr.api.CldrDataSupplier;
|
||||
import org.unicode.cldr.api.CldrDataType;
|
||||
|
||||
import com.google.common.base.CharMatcher;
|
||||
import com.google.common.collect.HashMultimap;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import com.google.common.collect.ImmutableSet;
|
||||
import com.google.common.collect.LinkedListMultimap;
|
||||
import com.google.common.collect.ListMultimap;
|
||||
import com.google.common.collect.SetMultimap;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.google.common.io.CharStreams;
|
||||
import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir;
|
||||
import org.unicode.icu.tool.cldrtoicu.mapper.Bcp47Mapper;
|
||||
import org.unicode.icu.tool.cldrtoicu.mapper.BreakIteratorMapper;
|
||||
import org.unicode.icu.tool.cldrtoicu.mapper.CollationMapper;
|
||||
import org.unicode.icu.tool.cldrtoicu.mapper.DayPeriodsMapper;
|
||||
import org.unicode.icu.tool.cldrtoicu.mapper.LocaleMapper;
|
||||
import org.unicode.icu.tool.cldrtoicu.mapper.PluralRangesMapper;
|
||||
import org.unicode.icu.tool.cldrtoicu.mapper.PluralsMapper;
|
||||
import org.unicode.icu.tool.cldrtoicu.mapper.RbnfMapper;
|
||||
import org.unicode.icu.tool.cldrtoicu.mapper.SupplementalMapper;
|
||||
import org.unicode.icu.tool.cldrtoicu.mapper.TransformsMapper;
|
||||
import org.unicode.icu.tool.cldrtoicu.regex.RegexTransformer;
|
||||
|
||||
/**
|
||||
* The main converter tool for CLDR to ICU data. To run this tool, you need to supply a suitable
|
||||
* {@link LdmlConverterConfig} instance. There is a simple {@code main()} method available in this
|
||||
* class which can be invoked passing just the desired output directory and which relies on the
|
||||
* presence of several system properties for the remainder of its parameters:
|
||||
* <ul>
|
||||
* <li>CLDR_DIR: The root of the CLDR release from which CLDR data is read.
|
||||
* <li>ICU_DIR: The root of the ICU release from which additional "specials" XML data is read.
|
||||
* <li>CLDR_DTD_CACHE: A temporary directory with the various DTDs cached (this is a legacy
|
||||
* requirement from the underlying CLDR libraries and might go away one day).
|
||||
* </ul>
|
||||
*/
|
||||
public final class LdmlConverter {
|
||||
// TODO: Do all supplemental data in one go and split similarly to locale data (using RbPath).
|
||||
private static final PathMatcher GENDER_LIST_PATHS =
|
||||
supplementalMatcher("gender");
|
||||
private static final PathMatcher LIKELY_SUBTAGS_PATHS =
|
||||
supplementalMatcher("likelySubtags");
|
||||
private static final PathMatcher METAZONE_PATHS =
|
||||
supplementalMatcher("metaZones", "primaryZones");
|
||||
private static final PathMatcher METADATA_PATHS =
|
||||
supplementalMatcher("metadata");
|
||||
private static final PathMatcher SUPPLEMENTAL_DATA_PATHS =
|
||||
supplementalMatcher(
|
||||
"calendarData",
|
||||
"calendarPreferenceData",
|
||||
"codeMappings",
|
||||
"codeMappingsCurrency",
|
||||
"idValidity",
|
||||
"languageData",
|
||||
"languageMatching",
|
||||
"measurementData",
|
||||
"parentLocales",
|
||||
"subdivisionContainment",
|
||||
"territoryContainment",
|
||||
"territoryInfo",
|
||||
"timeData",
|
||||
"unitPreferenceData",
|
||||
"weekData",
|
||||
"weekOfPreference");
|
||||
private static final PathMatcher CURRENCY_DATA_PATHS =
|
||||
supplementalMatcher("currencyData");
|
||||
private static final PathMatcher NUMBERING_SYSTEMS_PATHS =
|
||||
supplementalMatcher("numberingSystems");
|
||||
private static final PathMatcher WINDOWS_ZONES_PATHS =
|
||||
supplementalMatcher("windowsZones");
|
||||
|
||||
// Special IDs which are not supported via CLDR, but for which synthetic data is injected.
|
||||
// The "TRADITIONAL" variants are here because their calendar differs from the non-variant
|
||||
// locale. However CLDR cannot represent this currently because calendar defaults are in
|
||||
// supplemental data (rather than locale data) and are keyed only on territory.
|
||||
private static final ImmutableSet<String> PHANTOM_LOCALE_IDS =
|
||||
ImmutableSet.of("ja_JP_TRADITIONAL", "th_TH_TRADITIONAL");
|
||||
|
||||
// Special alias mapping which exists in ICU even though "no_NO_NY" is simply not a
|
||||
// structurally valid locale ID. This is injected manually when creating the alias map.
|
||||
// This does mean that nobody can ever parse the _keys_ of the alias map, but so far there
|
||||
// has been no need for that.
|
||||
// TODO: Get "ars" into CLDR and remove this hack.
|
||||
private static final Map<String, String> PHANTOM_ALIASES =
|
||||
ImmutableMap.of("ars", "ar_SA", "no_NO_NY", "nn_NO");
|
||||
|
||||
private static PathMatcher supplementalMatcher(String... spec) {
|
||||
checkArgument(spec.length > 0, "must supply at least one matcher spec");
|
||||
if (spec.length == 1) {
|
||||
return PathMatcher.of("supplementalData/" + spec[0]);
|
||||
}
|
||||
return PathMatcher.anyOf(
|
||||
Arrays.stream(spec)
|
||||
.map(s -> PathMatcher.of("supplementalData/" + s))
|
||||
.toArray(PathMatcher[]::new));
|
||||
}
|
||||
|
||||
private static RbPath RB_PARENT = RbPath.of("%%Parent");
|
||||
// The quotes below are only so we achieve parity with the manually written alias files.
|
||||
// TODO: Remove unnecessary quotes once the migration to this code is complete.
|
||||
private static RbPath RB_ALIAS = RbPath.of("\"%%ALIAS\"");
|
||||
// Special path for adding to empty files which only exist to complete the parent chain.
|
||||
// TODO: Confirm that this has no meaningful effect and unify "empty" file contents.
|
||||
private static RbPath RB_EMPTY_ALIAS = RbPath.of("___");
|
||||
|
||||
/** Provisional entry point until better config support exists. */
|
||||
public static void main(String... args) {
|
||||
convert(IcuConverterConfig.builder()
|
||||
.setOutputDir(Paths.get(args[0]))
|
||||
.setEmitReport(true)
|
||||
.build());
|
||||
}
|
||||
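// Illustrative invocation (all paths here are placeholders, and the classpath must
// already contain this tool and its dependencies):
//   java -DCLDR_DIR=/path/to/cldr -DICU_DIR=/path/to/icu -DCLDR_DTD_CACHE=/tmp/cldr-dtd \
//       org.unicode.icu.tool.cldrtoicu.LdmlConverter /path/to/icu-data-out
// The single program argument is the output directory; everything else comes from the
// system properties described in the class comment.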
|
||||
/**
|
||||
* Output types defining specific subsets of the ICU data which can be converted separately.
|
||||
* This closely mimics the original "NewLdml2IcuConverter" behaviour but could be simplified to
|
||||
* hide what are essentially implementation specific data splits.
|
||||
*/
|
||||
public enum OutputType {
|
||||
LOCALES(LDML, LdmlConverter::processLocales),
|
||||
BRKITR(LDML, LdmlConverter::processBrkitr),
|
||||
COLL(LDML, LdmlConverter::processCollation),
|
||||
RBNF(LDML, LdmlConverter::processRbnf),
|
||||
|
||||
DAY_PERIODS(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processDayPeriods("misc")),
|
||||
GENDER_LIST(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processSupplemental("genderList", GENDER_LIST_PATHS, "misc", false)),
|
||||
LIKELY_SUBTAGS(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processSupplemental("likelySubtags", LIKELY_SUBTAGS_PATHS, "misc", false)),
|
||||
SUPPLEMENTAL_DATA(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processSupplemental("supplementalData", SUPPLEMENTAL_DATA_PATHS, "misc", true)),
|
||||
CURRENCY_DATA(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processSupplemental("supplementalData", CURRENCY_DATA_PATHS, "curr", true)),
|
||||
METADATA(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processSupplemental("metadata", METADATA_PATHS, "misc", false)),
|
||||
META_ZONES(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processSupplemental("metaZones", METAZONE_PATHS, "misc", false)),
|
||||
NUMBERING_SYSTEMS(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processSupplemental("numberingSystems", NUMBERING_SYSTEMS_PATHS, "misc", false)),
|
||||
PLURALS(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processPlurals("misc")),
|
||||
PLURAL_RANGES(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processPluralRanges("misc")),
|
||||
WINDOWS_ZONES(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processSupplemental("windowsZones", WINDOWS_ZONES_PATHS, "misc", false)),
|
||||
TRANSFORMS(
|
||||
SUPPLEMENTAL,
|
||||
c -> c.processTransforms("translit")),
|
||||
KEY_TYPE_DATA(
|
||||
BCP47,
|
||||
c -> c.processKeyTypeData("misc")),
|
||||
|
||||
// Batching by type.
|
||||
DTD_LDML(LDML, c -> c.processAll(LDML)),
|
||||
DTD_SUPPLEMENTAL(SUPPLEMENTAL, c -> c.processAll(SUPPLEMENTAL)),
|
||||
DTD_BCP47(BCP47, c -> c.processAll(BCP47));
|
||||
|
||||
public static final ImmutableSet<OutputType> ALL =
|
||||
ImmutableSet.of(DTD_BCP47, DTD_SUPPLEMENTAL, DTD_LDML);
|
||||
|
||||
private final CldrDataType type;
|
||||
private final Consumer<LdmlConverter> converterFn;
|
||||
|
||||
OutputType(CldrDataType type, Consumer<LdmlConverter> converterFn) {
|
||||
this.type = checkNotNull(type);
|
||||
this.converterFn = checkNotNull(converterFn);
|
||||
}
|
||||
|
||||
void convert(LdmlConverter converter) {
|
||||
converterFn.accept(converter);
|
||||
}
|
||||
|
||||
CldrDataType getCldrType() {
|
||||
return type;
|
||||
}
|
||||
}
|
||||
|
||||
private static void convert(LdmlConverterConfig config) {
|
||||
CldrDataSupplier src = CldrDataSupplier
|
||||
.forCldrFilesIn(config.getCldrDirectory())
|
||||
.withDraftStatusAtLeast(config.getMinimumDraftStatus());
|
||||
new LdmlConverter(config, src).convertAll(config);
|
||||
}
|
||||
|
||||
// The configuration controlling conversion behaviour.
|
||||
private final LdmlConverterConfig config;
|
||||
// The supplier for all data to be converted.
|
||||
private final CldrDataSupplier src;
|
||||
// The set of available locale IDs.
|
||||
// TODO: Make available IDs include specials files (or fail if specials are not available).
|
||||
private final ImmutableSet<String> availableIds;
|
||||
// Supplemental data available to mappers if needed.
|
||||
private final SupplementalData supplementalData;
|
||||
// Transformer for locale data.
|
||||
private final PathValueTransformer localeTransformer;
|
||||
// Transformer for supplemental data.
|
||||
private final PathValueTransformer supplementalTransformer;
|
||||
// Header string to go into every ICU data file.
|
||||
private final ImmutableList<String> icuFileHeader;
|
||||
|
||||
private LdmlConverter(LdmlConverterConfig config, CldrDataSupplier src) {
|
||||
this.config = checkNotNull(config);
|
||||
this.src = checkNotNull(src);
|
||||
this.supplementalData = SupplementalData.create(src.getDataForType(SUPPLEMENTAL));
|
||||
// Sort the set of available locale IDs but add "root" at the front. This is the
|
||||
// set of non-alias locale IDs to be processed.
|
||||
Set<String> localeIds = new LinkedHashSet<>();
|
||||
localeIds.add("root");
|
||||
localeIds.addAll(
|
||||
Sets.intersection(src.getAvailableLocaleIds(), config.getTargetLocaleIds(LOCALES)));
|
||||
localeIds.addAll(PHANTOM_LOCALE_IDS);
|
||||
this.availableIds = ImmutableSet.copyOf(localeIds);
|
||||
|
||||
// Load the remaining path value transformers.
|
||||
this.supplementalTransformer =
|
||||
RegexTransformer.fromConfigLines(readLinesFromResource("/ldml2icu_supplemental.txt"),
|
||||
IcuFunctions.ALGORITHM_FN,
|
||||
IcuFunctions.DATE_FN,
|
||||
IcuFunctions.DAY_NUMBER_FN,
|
||||
IcuFunctions.EXP_FN,
|
||||
IcuFunctions.YMD_FN);
|
||||
this.localeTransformer =
|
||||
RegexTransformer.fromConfigLines(readLinesFromResource("/ldml2icu_locale.txt"),
|
||||
IcuFunctions.CONTEXT_TRANSFORM_INDEX_FN);
|
||||
this.icuFileHeader = ImmutableList.copyOf(readLinesFromResource("/ldml2icu_header.txt"));
|
||||
}
|
||||
|
||||
private void convertAll(LdmlConverterConfig config) {
|
||||
ListMultimap<CldrDataType, OutputType> groupByType = LinkedListMultimap.create();
|
||||
for (OutputType t : config.getOutputTypes()) {
|
||||
groupByType.put(t.getCldrType(), t);
|
||||
}
|
||||
for (CldrDataType cldrType : groupByType.keySet()) {
|
||||
for (OutputType t : groupByType.get(cldrType)) {
|
||||
t.convert(this);
|
||||
}
|
||||
}
|
||||
if (config.emitReport()) {
|
||||
System.out.println("Supplemental Data Transformer=" + supplementalTransformer);
|
||||
System.out.println("Locale Data Transformer=" + localeTransformer);
|
||||
}
|
||||
}
|
||||
|
||||
private static List<String> readLinesFromResource(String name) {
|
||||
try (InputStream in = LdmlConverter.class.getResourceAsStream(name)) {
|
||||
return CharStreams.readLines(new InputStreamReader(in));
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("cannot read resource: " + name, e);
|
||||
}
|
||||
}
|
||||
|
||||
private PathValueTransformer getLocaleTransformer() {
|
||||
return localeTransformer;
|
||||
}
|
||||
|
||||
private PathValueTransformer getSupplementalTransformer() {
|
||||
return supplementalTransformer;
|
||||
}
|
||||
|
||||
private void processAll(CldrDataType cldrType) {
|
||||
List<OutputType> targets = Arrays.stream(OutputType.values())
|
||||
.filter(t -> t.getCldrType().equals(cldrType))
|
||||
.filter(t -> !t.name().startsWith("DTD_"))
|
||||
.collect(toList());
|
||||
for (OutputType t : targets) {
|
||||
t.convert(this);
|
||||
}
|
||||
}
|
||||
|
||||
private Optional<CldrData> loadSpecialsData(String localeId) {
|
||||
String expected = localeId + ".xml";
|
||||
try (Stream<Path> files = Files.walk(config.getSpecialsDir())) {
|
||||
Set<Path> xmlFiles = files
|
||||
.filter(Files::isRegularFile)
|
||||
.filter(f -> f.getFileName().toString().equals(expected))
|
||||
.collect(Collectors.toSet());
|
||||
return !xmlFiles.isEmpty()
|
||||
? Optional.of(
|
||||
CldrDataSupplier.forCldrFiles(LDML, config.getMinimumDraftStatus(), xmlFiles))
|
||||
: Optional.empty();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(
|
||||
"error processing specials directory: " + config.getSpecialsDir(), e);
|
||||
}
|
||||
}
|
||||
|
||||
private void processLocales() {
|
||||
// TODO: Pre-load specials files to avoid repeatedly re-loading them.
|
||||
processAndSplitLocaleFiles(
|
||||
id -> LocaleMapper.process(
|
||||
id, src, loadSpecialsData(id), getLocaleTransformer(), supplementalData),
|
||||
CURR, LANG, LOCALES, REGION, UNIT, ZONE);
|
||||
}
|
||||
|
||||
private void processBrkitr() {
|
||||
processAndSplitLocaleFiles(
|
||||
id -> BreakIteratorMapper.process(id, src, loadSpecialsData(id)), BRKITR);
|
||||
}
|
||||
|
||||
private void processCollation() {
|
||||
processAndSplitLocaleFiles(
|
||||
id -> CollationMapper.process(id, src, loadSpecialsData(id)), COLL);
|
||||
}
|
||||
|
||||
private void processRbnf() {
|
||||
processAndSplitLocaleFiles(
|
||||
id -> RbnfMapper.process(id, src, loadSpecialsData(id)), RBNF);
|
||||
}
|
||||
|
||||
private void processAndSplitLocaleFiles(
|
||||
Function<String, IcuData> icuFn, IcuLocaleDir... splitDirs) {
|
||||
|
||||
SetMultimap<IcuLocaleDir, String> writtenLocaleIds = HashMultimap.create();
|
||||
Path baseDir = config.getOutputDir();
|
||||
|
||||
for (String id : config.getTargetLocaleIds(LOCALES)) {
|
||||
// Skip "target" IDs that are aliases (they are handled later).
|
||||
if (!availableIds.contains(id)) {
|
||||
continue;
|
||||
}
|
||||
IcuData icuData = icuFn.apply(id);
|
||||
|
||||
ListMultimap<IcuLocaleDir, RbPath> splitPaths = LinkedListMultimap.create();
|
||||
for (RbPath p : icuData.getPaths()) {
|
||||
String rootName = getBaseSegmentName(p.getSegment(0));
|
||||
splitPaths.put(LOCALE_SPLIT_INFO.getOrDefault(rootName, LOCALES), p);
|
||||
}
|
||||
|
||||
// We always write base languages (even if empty).
|
||||
boolean isBaseLanguage = !id.contains("_");
|
||||
// Run through all directories (not just the keySet() of the split path map) since we
|
||||
// sometimes write empty files.
|
||||
for (IcuLocaleDir dir : splitDirs) {
|
||||
Set<String> targetIds = config.getTargetLocaleIds(dir);
|
||||
if (!targetIds.contains(id)) {
|
||||
if (!splitPaths.get(dir).isEmpty()) {
|
||||
System.out.format(
|
||||
"target IDs for %s does not contain %s, but it has data: %s\n",
|
||||
dir, id, splitPaths.get(dir));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
Path outDir = baseDir.resolve(dir.getOutputDir());
|
||||
IcuData splitData = new IcuData(icuData.getName(), icuData.hasFallback());
|
||||
// The split data can still be empty for this directory, but that's expected.
|
||||
splitPaths.get(dir).forEach(p -> splitData.add(p, icuData.get(p)));
|
||||
// Adding a parent locale makes the data non-empty and forces it to be written.
|
||||
supplementalData.getExplicitParentLocaleOf(splitData.getName())
|
||||
.ifPresent(p -> splitData.add(RB_PARENT, p));
|
||||
if (!splitData.isEmpty() || isBaseLanguage || dir.includeEmpty()) {
|
||||
splitData.setVersion(CldrDataSupplier.getCldrVersionString());
|
||||
write(splitData, outDir);
|
||||
writtenLocaleIds.put(dir, id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (IcuLocaleDir dir : splitDirs) {
|
||||
Path outDir = baseDir.resolve(dir.getOutputDir());
|
||||
Set<String> targetIds = config.getTargetLocaleIds(dir);
|
||||
|
||||
Map<String, String> aliasMap = getAliasMap(targetIds, dir);
|
||||
aliasMap.forEach((s, t) -> {
|
||||
// It's only important to record which alias files are written because of forced
|
||||
// aliases, but since it's harmless otherwise, we just do it unconditionally.
|
||||
// Normal alias files don't affect the empty file calculation, but forced ones can.
|
||||
writtenLocaleIds.put(dir, s);
|
||||
writeAliasFile(s, t, outDir);
|
||||
});
|
||||
|
||||
calculateEmptyFiles(writtenLocaleIds.get(dir), aliasMap.values())
|
||||
.forEach(id -> writeEmptyFile(id, outDir, aliasMap.values()));
|
||||
}
|
||||
}
|
||||
|
||||
private Map<String, String> getAliasMap(Set<String> localeIds, IcuLocaleDir dir) {
|
||||
// There are four reasons for treating a locale ID as an alias.
|
||||
// 1: It contains deprecated subtags (e.g. "sr_YU", which should be "sr_Cyrl_RS").
|
||||
// 2: It has no CLDR data but is missing a script subtag.
|
||||
// 3: It is one of the special "phantom" alias which cannot be represented normally
|
||||
// and must be manually mapped (e.g. legacy locale IDs which don't even parse).
|
||||
// 4: It is a "super special" forced alias, which might replace existing aliases in
|
||||
// some output directories.
|
||||
Map<String, String> aliasMap = new LinkedHashMap<>();
|
||||
for (String id : localeIds) {
|
||||
if (PHANTOM_ALIASES.keySet().contains(id)) {
|
||||
checkArgument(!availableIds.contains(id),
|
||||
"phantom aliases should never be otherwise supported: %s\n"
|
||||
+ "(maybe the phantom alias can now be removed?)", id);
|
||||
aliasMap.put(id, PHANTOM_ALIASES.get(id));
|
||||
continue;
|
||||
}
|
||||
String canonicalId = supplementalData.replaceDeprecatedTags(id);
|
||||
if (!canonicalId.equals(id)) {
|
||||
// If the canonical form of an ID differs from the requested ID, then this is an
|
||||
// alias, and just needs to point to the canonical ID.
|
||||
aliasMap.put(id, canonicalId);
|
||||
continue;
|
||||
}
|
||||
if (availableIds.contains(id)) {
|
||||
// If it's canonical and supported, it's not an alias.
|
||||
continue;
|
||||
}
|
||||
// If the requested locale is not supported, maximize it and alias to that.
|
||||
String maximizedId = supplementalData.maximize(id)
|
||||
.orElseThrow(() -> new IllegalArgumentException("unsupported locale ID: " + id));
|
||||
// We can't alias to ourselves and we shouldn't be here if the ID was already maximal.
|
||||
checkArgument(!maximizedId.equals(id), "unsupported maximized locale ID: %s", id);
|
||||
aliasMap.put(id, maximizedId);
|
||||
}
|
||||
// Important that we overwrite entries which might already exist here, since we might have
|
||||
// already calculated a "natural" alias for something that we want to force (and we should
|
||||
// replace the existing target, since that affects how we determine empty files later).
|
||||
aliasMap.putAll(config.getForcedAliases(dir));
|
||||
return aliasMap;
|
||||
}
|
||||
|
||||
private static final CharMatcher PATH_MODIFIER = CharMatcher.anyOf(":%");
|
||||
|
||||
// Resource bundle path elements can have variants (e.g. "Currencies%narrow") or type
|
||||
// annotations (e.g. "languages:intvector"). We strip these when considering the element name.
|
||||
private static String getBaseSegmentName(String segment) {
|
||||
int idx = PATH_MODIFIER.indexIn(segment);
|
||||
return idx == -1 ? segment : segment.substring(0, idx);
|
||||
}
|
||||
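// Illustrative examples of the stripping above:
//   getBaseSegmentName("Currencies%narrow")   -> "Currencies"
//   getBaseSegmentName("languages:intvector") -> "languages"
//   getBaseSegmentName("zoneStrings")         -> "zoneStrings"   (no modifier, unchanged)
// The resulting base name is what LOCALE_SPLIT_INFO is keyed on when locale data is split.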
|
||||
private void processDayPeriods(String dir) {
|
||||
write(DayPeriodsMapper.process(src), dir);
|
||||
}
|
||||
|
||||
private void processPlurals(String dir) {
|
||||
write(PluralsMapper.process(src), dir);
|
||||
}
|
||||
|
||||
private void processPluralRanges(String dir) {
|
||||
write(PluralRangesMapper.process(src), dir);
|
||||
}
|
||||
|
||||
private void processKeyTypeData(String dir) {
|
||||
Bcp47Mapper.process(src).forEach(d -> write(d, dir));
|
||||
}
|
||||
|
||||
private void processTransforms(String dir) {
|
||||
Path transformDir = createDirectory(config.getOutputDir().resolve(dir));
|
||||
write(TransformsMapper.process(src, transformDir), transformDir);
|
||||
}
|
||||
|
||||
private static final RbPath RB_CLDR_VERSION = RbPath.of("cldrVersion");
|
||||
|
||||
private void processSupplemental(
|
||||
String label, PathMatcher paths, String dir, boolean addCldrVersion) {
|
||||
IcuData icuData =
|
||||
SupplementalMapper.process(src, getSupplementalTransformer(), label, paths);
|
||||
// A hack for "supplementalData.txt" since the "cldrVersion" value doesn't come from the
|
||||
// supplemental data XML files.
|
||||
if (addCldrVersion) {
|
||||
icuData.add(RB_CLDR_VERSION, CldrDataSupplier.getCldrVersionString());
|
||||
}
|
||||
write(icuData, dir);
|
||||
}
|
||||
|
||||
private void writeAliasFile(String srcId, String destId, Path dir) {
|
||||
IcuData icuData = new IcuData(srcId, true);
|
||||
icuData.add(RB_ALIAS, destId);
|
||||
write(icuData, dir);
|
||||
}
|
||||
|
||||
private void writeEmptyFile(String id, Path dir, Collection<String> aliasTargets) {
|
||||
IcuData icuData = new IcuData(id, true);
|
||||
// TODO: Document the reason for this (i.e. why does it matter what goes into empty files?)
|
||||
if (aliasTargets.contains(id)) {
|
||||
icuData.setFileComment("generated alias target");
|
||||
icuData.add(RB_EMPTY_ALIAS, "");
|
||||
} else {
|
||||
// These empty files only exist because the target of an alias has a parent locale
|
||||
// which is itself not in the set of written ICU files. An "indirect alias target".
|
||||
icuData.setVersion(CldrDataSupplier.getCldrVersionString());
|
||||
}
|
||||
write(icuData, dir);
|
||||
}
|
||||
|
||||
private void write(IcuData icuData, String dir) {
|
||||
write(icuData, config.getOutputDir().resolve(dir));
|
||||
}
|
||||
|
||||
private void write(IcuData icuData, Path dir) {
|
||||
createDirectory(dir);
|
||||
IcuTextWriter.writeToFile(icuData, dir, icuFileHeader);
|
||||
}
|
||||
|
||||
private Path createDirectory(Path dir) {
|
||||
try {
|
||||
Files.createDirectories(dir);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("cannot create directory: " + dir, e);
|
||||
}
|
||||
return dir;
|
||||
}
|
||||
|
||||
// The set of IDs to process is:
|
||||
// * any file that was written
|
||||
// * any alias target (not written)
|
||||
//
|
||||
// From which we generate the complete "closure" under the "getParent()" function. This set
|
||||
// contains all file (written or not) which need to exist to complete the locale hierarchy.
|
||||
//
|
||||
// Then we remove all the written files to just leave the ones that need to be generated.
|
||||
// This is a simple and robust approach that handles things like "gaps" in non-aliased
|
||||
// locale IDs, where an intermediate parent is not present.
|
||||
private ImmutableSet<String> calculateEmptyFiles(
|
||||
Set<String> writtenIds, Collection<String> aliasTargetIds) {
|
||||
|
||||
Set<String> seedIds = new HashSet<>(writtenIds);
|
||||
seedIds.addAll(aliasTargetIds);
|
||||
// Be nice and sort the output (makes debugging easier).
|
||||
Set<String> allIds = new TreeSet<>();
|
||||
for (String id : seedIds) {
|
||||
while (!id.equals("root") && !allIds.contains(id)) {
|
||||
allIds.add(id);
|
||||
id = supplementalData.getParent(id);
|
||||
}
|
||||
}
|
||||
return ImmutableSet.copyOf(Sets.difference(allIds, writtenIds));
|
||||
}
|
||||
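// Illustrative closure (the locale IDs here are hypothetical): if "xx_Latn_YY" was
// written but neither "xx_Latn" nor "xx" was, both parents end up in the returned set
// and are later emitted as empty files, so the fallback chain
// xx_Latn_YY -> xx_Latn -> xx -> root has no gaps (assuming simple truncation parents).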
|
||||
private static final ImmutableMap<String, IcuLocaleDir> LOCALE_SPLIT_INFO =
|
||||
ImmutableMap.<String, IcuLocaleDir>builder()
|
||||
// BRKITR
|
||||
.put("boundaries", BRKITR)
|
||||
.put("dictionaries", BRKITR)
|
||||
.put("exceptions", BRKITR)
|
||||
// COLL
|
||||
.put("collations", COLL)
|
||||
.put("depends", COLL)
|
||||
.put("UCARules", COLL)
|
||||
// CURR
|
||||
.put("Currencies", CURR)
|
||||
.put("CurrencyPlurals", CURR)
|
||||
.put("CurrencyUnitPatterns", CURR)
|
||||
.put("currencySpacing", CURR)
|
||||
// LANG
|
||||
.put("Keys", LANG)
|
||||
.put("Languages", LANG)
|
||||
.put("Scripts", LANG)
|
||||
.put("Types", LANG)
|
||||
.put("Variants", LANG)
|
||||
.put("characterLabelPattern", LANG)
|
||||
.put("codePatterns", LANG)
|
||||
.put("localeDisplayPattern", LANG)
|
||||
// RBNF
|
||||
.put("RBNFRules", RBNF)
|
||||
// REGION
|
||||
.put("Countries", REGION)
|
||||
// UNIT
|
||||
.put("durationUnits", UNIT)
|
||||
.put("units", UNIT)
|
||||
.put("unitsShort", UNIT)
|
||||
.put("unitsNarrow", UNIT)
|
||||
// ZONE
|
||||
.put("zoneStrings", ZONE)
|
||||
.build();
|
||||
}
|
|
@@ -0,0 +1,106 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.unicode.cldr.api.CldrDraftStatus;
|
||||
|
||||
import com.google.common.base.Ascii;
|
||||
import org.unicode.icu.tool.cldrtoicu.LdmlConverter.OutputType;
|
||||
|
||||
/** API for configuring the LDML converter. */
|
||||
public interface LdmlConverterConfig {
|
||||
/** Output directories for ICU locale data (this is not used for supplemental data). */
|
||||
enum IcuLocaleDir {
|
||||
/** Data for the break-iterator library. */
|
||||
BRKITR(true),
|
||||
/** Data for the collations library. */
|
||||
COLL(true),
|
||||
/** Currency data. */
|
||||
CURR(false),
|
||||
/** Language data. */
|
||||
LANG(false),
|
||||
/** General locale data. */
|
||||
LOCALES(true),
|
||||
/** Rule-based number formatter data. */
|
||||
RBNF(true),
|
||||
/** Region data. */
|
||||
REGION(false),
|
||||
/** Measurement and units data. */
|
||||
UNIT(false),
|
||||
/** Timezone data. */
|
||||
ZONE(false);
|
||||
|
||||
private final String dirName = Ascii.toLowerCase(name());
|
||||
private final boolean includeEmpty;
|
||||
|
||||
IcuLocaleDir(boolean includeEmpty) {
|
||||
this.includeEmpty = includeEmpty;
|
||||
}
|
||||
|
||||
/** Returns the relative output directory name. */
|
||||
String getOutputDir() {
|
||||
return dirName;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether the directory is expected to contain empty data files (used to advertise
|
||||
* the supported set of locales for the "service" provided by the data in that
|
||||
* directory).
|
||||
*/
|
||||
// TODO: Document why some directories include empty files while others do not.
|
||||
boolean includeEmpty() {
|
||||
return includeEmpty;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the set of output types to be converted. Use {@link OutputType#ALL} to convert
|
||||
* everything.
|
||||
*/
|
||||
Set<OutputType> getOutputTypes();
|
||||
|
||||
/** Returns the root directory in which the CLDR release is located. */
|
||||
Path getCldrDirectory();
|
||||
|
||||
/**
|
||||
* Returns an additional "specials" directory containing ICU-specific XML
|
||||
* files depending on the given output type. This is where the converter finds any XML
|
||||
* files using the "icu:" namespace.
|
||||
*/
|
||||
Path getSpecialsDir();
|
||||
|
||||
/**
|
||||
* Returns the root of the ICU output directory hierarchy into which ICU data files are
|
||||
* written.
|
||||
*/
|
||||
Path getOutputDir();
|
||||
|
||||
/** Returns the minimal draft status for CLDR data to be converted. */
|
||||
CldrDraftStatus getMinimumDraftStatus();
|
||||
|
||||
/**
|
||||
* Returns the set of locale IDs to be processed for the given directory.
|
||||
*
|
||||
* <p>This set can contain IDs which have no ICU data associated with them if they are
|
||||
* suitable aliases (e.g. they are deprecated versions of locale IDs for which data does
|
||||
* exist).
|
||||
*/
|
||||
Set<String> getTargetLocaleIds(IcuLocaleDir dir);
|
||||
|
||||
/**
|
||||
* Returns a map of locale IDs which specifies aliases which are applied to the given
|
||||
* directory in contradiction to the natural alias or parent ID which would otherwise
|
||||
* be generated. This is a mechanism for restructuring the parent chain and linking
|
||||
* locales together in non-standard and unexpected ways.
|
||||
*/
|
||||
Map<String, String> getForcedAliases(IcuLocaleDir dir);
|
||||
|
||||
/**
|
||||
* Whether to emit a summary report for debug purposes after conversion is complete.
|
||||
*/
|
||||
boolean emitReport();
|
||||
}
|
|
@@ -0,0 +1,259 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
import static com.google.common.base.Preconditions.checkPositionIndex;
|
||||
import static com.google.common.base.Preconditions.checkState;
|
||||
import static com.google.common.collect.ImmutableMap.toImmutableMap;
|
||||
import static org.unicode.cldr.api.AttributeKey.keyOf;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.unicode.cldr.api.AttributeKey;
|
||||
import org.unicode.cldr.api.CldrPath;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
|
||||
/**
|
||||
* An immutable matcher for {@link CldrPath} instances. A path matcher specification looks like
|
||||
* {@code "foo/*[@x="z"]/bar[@y=*]"}, where element names and attribute values can be wildcards.
|
||||
*
|
||||
* <p>Note that the path fragment represented by the specification does not include either leading
|
||||
* or trailing {@code '/'}. This is because matching can occur at any point in a {@code CldrPath}.
|
||||
* The choice of where to match in the path is governed by the match method used (e.g.
|
||||
* {@link PathMatcher#matchesSuffixOf(CldrPath)}).
|
||||
*/
|
||||
public abstract class PathMatcher {
|
||||
/** Parses the path specification into a matcher. */
|
||||
public static PathMatcher of(String pathSpec) {
|
||||
// Supported so far: "a", "a/b", "a/b[@x=*]"
|
||||
return new BasicMatcher(parse(pathSpec));
|
||||
}
|
||||
|
||||
/**
|
||||
* Combines the given matchers into a single composite matcher which tests all the given
|
||||
* matchers in order.
|
||||
*/
|
||||
public static PathMatcher anyOf(PathMatcher... matchers) {
|
||||
checkArgument(matchers.length > 0, "must supply at least one matcher");
|
||||
if (matchers.length == 1) {
|
||||
return checkNotNull(matchers[0]);
|
||||
}
|
||||
return new CompositeMatcher(ImmutableList.copyOf(matchers));
|
||||
}
|
||||
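// Illustrative usage (mirroring how supplementalMatcher() in LdmlConverter builds its
// matchers; the path specs here are just examples, and 'path' is some CldrPath):
//   PathMatcher weekData = PathMatcher.of("supplementalData/weekData");
//   PathMatcher either = PathMatcher.anyOf(weekData, PathMatcher.of("supplementalData/timeData"));
//   boolean underEither = either.matchesPrefixOf(path);  // true for paths starting with either spec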
|
||||
/** Attempts a full match against a given path. */
|
||||
public abstract boolean matches(CldrPath path);
|
||||
|
||||
/** Attempts a suffix match against a given path. */
|
||||
public abstract boolean matchesSuffixOf(CldrPath path);
|
||||
|
||||
/** Attempts a prefix match against a given path. */
|
||||
public abstract boolean matchesPrefixOf(CldrPath path);
|
||||
|
||||
// A matcher that simply combines a sequence of other matchers in order.
|
||||
private static final class CompositeMatcher extends PathMatcher {
|
||||
private final ImmutableList<PathMatcher> matchers;
|
||||
|
||||
private CompositeMatcher(ImmutableList<PathMatcher> matchers) {
|
||||
checkArgument(matchers.size() > 1);
|
||||
this.matchers = checkNotNull(matchers);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matches(CldrPath path) {
|
||||
for (PathMatcher m : matchers) {
|
||||
if (m.matches(path)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matchesSuffixOf(CldrPath path) {
|
||||
for (PathMatcher m : matchers) {
|
||||
if (m.matchesSuffixOf(path)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matchesPrefixOf(CldrPath path) {
|
||||
for (PathMatcher m : matchers) {
|
||||
if (m.matchesPrefixOf(path)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class BasicMatcher extends PathMatcher {
|
||||
private final ImmutableList<Predicate<CldrPath>> elementMatchers;
|
||||
|
||||
private BasicMatcher(List<Predicate<CldrPath>> elementMatchers) {
|
||||
this.elementMatchers = ImmutableList.copyOf(elementMatchers);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matches(CldrPath path) {
|
||||
return elementMatchers.size() == path.getLength() && matchRegion(path, 0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matchesSuffixOf(CldrPath path) {
|
||||
int start = path.getLength() - elementMatchers.size();
|
||||
return start >= 0 && matchRegion(path, start);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matchesPrefixOf(CldrPath path) {
|
||||
return path.getLength() >= elementMatchers.size() && matchRegion(path, 0);
|
||||
}
|
||||
|
||||
private boolean matchRegion(CldrPath path, int offset) {
|
||||
// offset is the path element corresponding to the "top most" element matcher; it
|
||||
// must be in the range 0 ... (path.length() - elementMatchers.size()).
|
||||
checkPositionIndex(offset, path.getLength() - elementMatchers.size());
|
||||
// First jump over the path parents until we find the last matcher.
|
||||
int matchPathLength = offset + elementMatchers.size();
|
||||
while (path.getLength() > matchPathLength) {
|
||||
path = path.getParent();
|
||||
}
|
||||
return matchForward(path, elementMatchers.size() - 1);
|
||||
}
|
||||
|
||||
private boolean matchForward(CldrPath path, int matcherIndex) {
|
||||
if (matcherIndex < 0) {
|
||||
return true;
|
||||
}
|
||||
return matchForward(path.getParent(), matcherIndex - 1)
|
||||
&& elementMatchers.get(matcherIndex).test(path);
|
||||
}
|
||||
}
|
||||
|
||||
// Make a new, non-interned, unique instance here which we can test by reference to
|
||||
// determine if the argument is to be captured (needed as ImmutableMap prohibits null).
|
||||
// DO NOT change this code to assign "*" as the value directly, it MUST be a new instance.
|
||||
private static final String WILDCARD = new String("*");
|
||||
|
||||
private static final Pattern ELEMENT_START_REGEX =
|
||||
Pattern.compile("(\\*|[-:\\w]+)(?:/|\\[|$)");
|
||||
private static final Pattern ATTRIBUTE_REGEX =
|
||||
Pattern.compile("\\[@([-:\\w]+)=(?:\\*|\"([^\"]*)\")\\]");
|
||||
|
||||
// element := foo, foo[@bar="baz"], foo[@bar=*]
|
||||
// pathspec := element{/element}*
|
||||
private static List<Predicate<CldrPath>> parse(String pathSpec) {
|
||||
List<Predicate<CldrPath>> specs = new ArrayList<>();
|
||||
int pos = 0;
|
||||
do {
|
||||
pos = parse(pathSpec, pos, specs);
|
||||
} while (pos >= 0);
|
||||
return specs;
|
||||
}
|
||||
|
||||
// Return next start index or -1.
|
||||
private static int parse(String pathSpec, int pos, List<Predicate<CldrPath>> specs) {
|
||||
Matcher m = ELEMENT_START_REGEX.matcher(pathSpec).region(pos, pathSpec.length());
|
||||
checkArgument(m.lookingAt(), "invalid path specification (index=%s): %s", pos, pathSpec);
|
||||
String name = m.group(1);
|
||||
Map<String, String> attributes = ImmutableMap.of();
|
||||
pos = m.end(1);
|
||||
if (pos < pathSpec.length() && pathSpec.charAt(pos) == '[') {
|
||||
// We have attributes to add.
|
||||
attributes = new LinkedHashMap<>();
|
||||
do {
|
||||
m = ATTRIBUTE_REGEX.matcher(pathSpec).region(pos, pathSpec.length());
|
||||
checkArgument(m.lookingAt(),
|
||||
"invalid path specification (index=%s): %s", pos, pathSpec);
|
||||
// Null if we matched the '*' wildcard.
|
||||
String value = m.group(2);
|
||||
attributes.put(m.group(1), value != null ? value : WILDCARD);
|
||||
pos = m.end();
|
||||
} while (pos < pathSpec.length() && pathSpec.charAt(pos) == '[');
|
||||
}
|
||||
// Wildcard matching is less efficient because attribute keys cannot be made in advance, so
|
||||
// since it's also very rare, we special case it.
|
||||
Predicate<CldrPath> matcher = name.equals(WILDCARD)
|
||||
? new WildcardElementMatcher(attributes)::match
|
||||
: new ElementMatcher(name, attributes)::match;
|
||||
specs.add(matcher);
|
||||
if (pos == pathSpec.length()) {
|
||||
return -1;
|
||||
}
|
||||
checkState(pathSpec.charAt(pos) == '/',
|
||||
"invalid path specification (index=%s): %s", pos, pathSpec);
|
||||
return pos + 1;
|
||||
}
|
||||
|
||||
// Matcher for path elements like "foo[@bar=*]" where the name is known in advance.
|
||||
private static final class ElementMatcher {
|
||||
private final String name;
|
||||
private final ImmutableMap<AttributeKey, String> attributes;
|
||||
|
||||
private ElementMatcher(String name, Map<String, String> attributes) {
|
||||
this.name = checkNotNull(name);
|
||||
this.attributes = attributes.entrySet().stream()
|
||||
.collect(toImmutableMap(e -> keyOf(name, e.getKey()), Entry::getValue));
|
||||
}
|
||||
|
||||
boolean match(CldrPath path) {
|
||||
if (!path.getName().equals(name)) {
|
||||
return false;
|
||||
}
|
||||
for (Entry<AttributeKey, String> e : attributes.entrySet()) {
|
||||
String actual = path.get(e.getKey());
|
||||
if (actual == null) {
|
||||
return false;
|
||||
}
|
||||
String expected = e.getValue();
|
||||
// DO NOT change this to use expected.equals(WILDCARD).
|
||||
if (expected != WILDCARD && !expected.equals(actual)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Matcher for path elements like "*[@bar=*]", where the name isn't known until match time.
|
||||
private static final class WildcardElementMatcher {
|
||||
private final ImmutableMap<String, String> attributes;
|
||||
|
||||
private WildcardElementMatcher(Map<String, String> attributes) {
|
||||
this.attributes = ImmutableMap.copyOf(attributes);
|
||||
}
|
||||
|
||||
private boolean match(CldrPath path) {
|
||||
// The wildcard matcher never fails due to the element name but must create new key
|
||||
// instances every time matching occurs (because the key name is dynamic). Since this
|
||||
// is rare, it's worth making into a separate case.
|
||||
for (Entry<String, String> attribute : attributes.entrySet()) {
|
||||
String actual = path.get(keyOf(path.getName(), attribute.getKey()));
|
||||
if (actual == null) {
|
||||
return false;
|
||||
}
|
||||
String expected = attribute.getValue();
|
||||
// DO NOT change this to use expected.equals(WILDCARD).
|
||||
if (expected != WILDCARD && !expected.equals(actual)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,130 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.unicode.cldr.api.CldrPath;
|
||||
import org.unicode.cldr.api.CldrValue;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
|
||||
/**
|
||||
* API for transforming CLDR path/value pairs. Transformed results support grouping by their key
|
||||
* and the ability to generate default "fallback" values to account for missing values in a group.
|
||||
*
|
||||
* <p>To transform some set of CLDR path/values:
|
||||
* <ol>
|
||||
* <li>Transform all desired path/value pairs into a set of matched results, discarding duplicates
|
||||
* (see {@link #transform(CldrValue)}).
|
||||
* <li>Group the results by key (e.g. into a {@code ListMultimap}).
|
||||
* <li>For each group, add any fallback values which don't yet exist for that key (see
|
||||
* {@link #getFallbackResultsFor(RbPath, DynamicVars)} and {@link Result#isFallbackFor(Result)}).
|
||||
* <li>Sort elements within each group and flatten result values (see {@link Result#isGrouped()}).
|
||||
* </ol>
|
||||
*
|
||||
* <p>For each unique key, this should yield a correctly ordered sequence of values (according to the
|
||||
* semantics of the chosen transformer implementation).
|
||||
*/
|
||||
public abstract class PathValueTransformer {
|
||||
/**
|
||||
* A result either obtained by transforming a path/value pair, or as a potential fallback for
|
||||
* some known key (see {@link PathValueTransformer#transform(CldrValue)} and
|
||||
* {@link PathValueTransformer#getFallbackResultsFor(RbPath, DynamicVars)}).
|
||||
*/
|
||||
public static abstract class Result implements Comparable<Result> {
|
||||
private final RbPath key;
|
||||
|
||||
protected Result(RbPath key) {
|
||||
this.key = checkNotNull(key);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the key of this result, used to group results and determine fallback values
|
||||
* according to the semantics of the chosen transformer.
|
||||
*/
|
||||
public RbPath getKey() {
|
||||
return key;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether the values in this result should be grouped or not. Un-grouped values
|
||||
* should be considered as individual values in a sequence and might be joined with values
|
||||
* from other results in the same group. Grouped values cannot be split and must appear
|
||||
* as a single value.
|
||||
*
|
||||
* <p>For example for the ordered results:
|
||||
* <pre>
|
||||
* Result X = { key=K, values=[ "a", "b" ], grouped=false }
|
||||
* Result Y = { key=K, values=[ "c", "d" ], grouped=false }
|
||||
* Result Z = { key=K, values=[ "e" ], grouped=false }
|
||||
* </pre>
|
||||
* the values for key {@code K} are conceptually {@code [ "a", "b", "c", "d", "e" ]}.
|
||||
*
|
||||
* <p>However if result {@code Y} has {@code grouped=true} then there are now 4 values
|
||||
* {@code [ "a", "b", ["c", "d"], "e" ]}, and if {@code X} is also grouped, then it is
|
||||
* {@code [ ["a", "b"], ["c", "d"], "e" ]}, producing only 3 top-level values.
|
||||
*/
|
||||
public abstract boolean isGrouped();
|
||||
|
||||
/**
|
||||
* Returns the transformed values of this result, which may or may not be grouped
|
||||
* according to {@link #isGrouped()}.
|
||||
*/
|
||||
public abstract ImmutableList<String> getValues();
|
||||
|
||||
/**
|
||||
* Returns whether this result is a fallback for some existing matched result. Fallback
|
||||
* results should only be used if they are not fallbacks for any existing result.
|
||||
*/
|
||||
public abstract boolean isFallbackFor(Result r);
|
||||
|
||||
/** Debug only string representation. */
|
||||
@Override
|
||||
public final String toString() {
|
||||
return String.format(
|
||||
"Result{ key='%s', grouped=%s, values=%s }",
|
||||
getKey(), isGrouped(), getValues());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A "typedef" for the function to do late binding of dynamic variables. This is used for edge
|
||||
* cases where a %N variable in the rules config is bound to a CLDR path (e.g. "//foo/bar")
|
||||
* which cannot be resolved until the rule is evaluated. Unfortunately the need to support late
|
||||
* binding of variables incurs significant additional complexity in the code, despite being
|
||||
* used in exactly one situation so far (the '%D' variable to represent the default numbering
|
||||
* scheme).
|
||||
*/
|
||||
// TODO: Figure out how to get rid of all of this mess.
|
||||
public interface DynamicVars extends Function<CldrPath, String> {}
|
||||
|
||||
/**
|
||||
* Transforms a CLDR value into a sequence of results (empty if the value was not matched by
|
||||
* any rule).
|
||||
*
|
||||
* @param cldrValue the value to transform.
|
||||
* @return the transformed result(s).
|
||||
*/
|
||||
public abstract ImmutableList<Result> transform(CldrValue cldrValue);
|
||||
|
||||
/**
|
||||
* Transforms a CLDR value into a sequence of results (empty if the value was not matched by
|
||||
* any rule). The dynamic variable function provides any "late bound" CLDR path variables to be
|
||||
* resolved from CLDR data during processing (e.g. "%D=//ldml/numbers/defaultNumberingSystem").
|
||||
*
|
||||
* @param cldrValue the value to transform.
|
||||
* @param varFn a function for resolving "late bound" variables.
|
||||
* @return the transformed result(s).
|
||||
*/
|
||||
public abstract ImmutableList<Result> transform(CldrValue cldrValue, DynamicVars varFn);
|
||||
|
||||
/**
|
||||
* Returns a possibly empty sequence of fallback results for a given key. A fallback result for
|
||||
* a key should be used only if it is not a fallback for any other result with that key; see
|
||||
* also {@link Result#isFallbackFor(Result)}.
|
||||
*/
|
||||
public abstract ImmutableList<Result> getFallbackResultsFor(RbPath key, DynamicVars varFn);
|
||||
}
|
|
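To make the grouping semantics described for Result#isGrouped() concrete, here is a minimal, self-contained sketch; the SimpleResult type and the flatten() helper are invented purely for illustration and stand in for real Result instances and the flattening step of the transformation pipeline.

import java.util.ArrayList;
import java.util.List;

// Minimal illustration of the grouping semantics documented above. SimpleResult and
// flatten() exist only for this sketch; they are not part of the converter.
final class GroupingSketch {
    static final class SimpleResult {
        final List<String> values;
        final boolean grouped;

        SimpleResult(List<String> values, boolean grouped) {
            this.values = values;
            this.grouped = grouped;
        }
    }

    // Ungrouped values are spliced into the surrounding sequence; a grouped result
    // contributes exactly one (nested) top-level value.
    static List<Object> flatten(List<SimpleResult> resultsForOneKey) {
        List<Object> flattened = new ArrayList<>();
        for (SimpleResult r : resultsForOneKey) {
            if (r.grouped) {
                flattened.add(r.values);
            } else {
                flattened.addAll(r.values);
            }
        }
        return flattened;
    }

    public static void main(String[] args) {
        List<SimpleResult> results = List.of(
            new SimpleResult(List.of("a", "b"), false),   // X
            new SimpleResult(List.of("c", "d"), true),    // Y (grouped)
            new SimpleResult(List.of("e"), false));       // Z
        // Prints "[a, b, [c, d], e]": four top-level values, matching the example above.
        System.out.println(flatten(results));
    }
}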
@@ -0,0 +1,232 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu;
|
||||
|
||||
import static com.google.common.base.CharMatcher.whitespace;
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static com.google.common.base.Preconditions.checkState;
|
||||
import static com.google.common.collect.ImmutableList.toImmutableList;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.Objects;
|
||||
import java.util.function.Function;
|
||||
|
||||
import com.google.common.base.CharMatcher;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Comparators;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.Iterables;
|
||||
|
||||
/**
|
||||
* A resource bundle path, used to identify entries in ICU data.
|
||||
*
|
||||
* <p>Immutable and thread safe.
|
||||
*/
|
||||
public final class RbPath implements Comparable<RbPath> {
|
||||
private static final Splitter PATH_SPLITTER = Splitter.on('/').trimResults();
|
||||
|
||||
// This defines ordering of paths in IcuData instances and thus the order in ICU data files.
|
||||
// If there's ever a reason to have a different "natural" order for paths, this Comparator
|
||||
// should be moved into the ICU file writer class(es).
|
||||
private static final Comparator<RbPath> ORDERING =
|
||||
Comparator.comparing(
|
||||
p -> p.segments,
|
||||
Comparators.lexicographical(Comparator.<String>naturalOrder()));
|
||||
|
||||
// Matches the definition of invariant characters in "uinvchar.cpp". We can make this all much
|
||||
// faster if needed with a custom matcher (it's just a 128 way bit lookup via 2 longs).
|
||||
private static final CharMatcher INVARIANT_CHARS =
|
||||
CharMatcher.ascii().and(CharMatcher.anyOf("!#$@[\\]^`{|}~").negate());
|
||||
|
||||
// Note that we must also prohibit double-quote from appearing anywhere other than surrounding
|
||||
// segment values. This is because some segment values can contain special ICU data characters
|
||||
// (e.g. ':') but must be treated as literals. There is no proper "escaping" mechanism in ICU
|
||||
// data for key values (since '\' is not an invariant, things like \\uxxxx are not possible).
|
||||
//
|
||||
// Ideally quoting would be done when the file is written, but that would require additional
|
||||
// complexity in RbPath, since suffixes like ":intvector" must not be quoted and must somehow
|
||||
// be distinguished from timezone "metazone" names which also contain ':'.
|
||||
private static final CharMatcher QUOTED_SEGMENT_CHARS =
|
||||
INVARIANT_CHARS
|
||||
.and(CharMatcher.javaIsoControl().negate())
|
||||
.and(CharMatcher.isNot('"'));
|
||||
private static final CharMatcher UNQUOTED_SEGMENT_CHARS =
|
||||
QUOTED_SEGMENT_CHARS.and(whitespace().negate());
|
||||
|
||||
// Characters allowed in path segments which separate the "base name" from any suffix (e.g.
|
||||
// the base name of "Foo:intvector" is "Foo").
|
||||
private static final CharMatcher SEGMENT_SEPARATORS = CharMatcher.anyOf("%:");
|
||||
|
||||
private static final RbPath EMPTY = new RbPath(ImmutableList.of());
|
||||
|
||||
public static RbPath empty() {
|
||||
return EMPTY;
|
||||
}
|
||||
|
||||
public static RbPath of(String... segments) {
|
||||
return of(Arrays.asList(segments));
|
||||
}
|
||||
|
||||
public static RbPath of(Iterable<String> segments) {
|
||||
return new RbPath(segments);
|
||||
}
|
||||
|
||||
public static RbPath parse(String path) {
|
||||
checkArgument(!path.isEmpty(), "cannot parse an empty path string");
|
||||
// Allow leading '/', but don't allow empty segments anywhere else.
|
||||
if (path.startsWith("/")) {
|
||||
path = path.substring(1);
|
||||
}
|
||||
return new RbPath(PATH_SPLITTER.split(path));
|
||||
}
|
||||
|
||||
static int getCommonPrefixLength(RbPath lhs, RbPath rhs) {
|
||||
int maxLength = Math.min(lhs.length(), rhs.length());
|
||||
int n = 0;
|
||||
while (n < maxLength && lhs.getSegment(n).equals(rhs.getSegment(n))) {
|
||||
n++;
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
private final ImmutableList<String> segments;
|
||||
private final int hashCode;
|
||||
|
||||
private RbPath(Iterable<String> segments) {
|
||||
this.segments = ImmutableList.copyOf(segments);
|
||||
this.hashCode = Objects.hash(this.segments);
|
||||
for (String segment : this.segments) {
|
||||
checkArgument(!segment.isEmpty(),
|
||||
"empty path segments not permitted: %s", this.segments);
|
||||
// Either the label is quoted (e.g. "foo") or it is bar (e.g. foo) but it can only
|
||||
// contain double quotes at either end, or not at all. If the string is quoted, only
|
||||
// validate the content, and not the quotes themselves.
|
||||
String toValidate;
|
||||
switch (segment.charAt(0)) {
|
||||
case '<':
|
||||
// Allow anything in hidden labels, since they will be removed later and never
|
||||
// appear in the final ICU data.
|
||||
checkArgument(segment.endsWith(">"),
|
||||
"mismatched quoting for hidden label: %s", segment);
|
||||
continue;
|
||||
|
||||
case '"':
|
||||
checkArgument(segment.endsWith("\""),
|
||||
"mismatched quoting for segment: %s", segment);
|
||||
checkArgument(
|
||||
QUOTED_SEGMENT_CHARS.matchesAllOf(segment.substring(1, segment.length() - 1)),
|
||||
"invalid character in unquoted resource bundle path segment: %s", segment);
|
||||
break;
|
||||
|
||||
default:
|
||||
checkArgument(
|
||||
UNQUOTED_SEGMENT_CHARS.matchesAllOf(segment),
|
||||
"invalid character in unquoted resource bundle path segment: %s", segment);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public int length() {
|
||||
return segments.size();
|
||||
}
|
||||
|
||||
public String getSegment(int n) {
|
||||
return segments.get(n);
|
||||
}
|
||||
|
||||
public RbPath getParent() {
|
||||
checkState(length() > 0, "cannot get parent of the empty path");
|
||||
return length() > 1 ? new RbPath(segments.subList(0, length() - 1)) : EMPTY;
|
||||
}
|
||||
|
||||
public boolean isAnonymous() {
|
||||
return length() > 0 && segments.get(length() - 1).charAt(0) == '<';
|
||||
}
|
||||
|
||||
public RbPath extendBy(String... parts) {
|
||||
return new RbPath(Iterables.concat(segments, Arrays.asList(parts)));
|
||||
}
|
||||
|
||||
public RbPath extendBy(RbPath suffix) {
|
||||
return new RbPath(Iterables.concat(segments, suffix.segments));
|
||||
}
|
||||
|
||||
public RbPath mapSegments(Function<? super String, String> fn) {
|
||||
return new RbPath(segments.stream().map(fn).collect(toImmutableList()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether the first element of this path is prefixed by the given "base name".
|
||||
*
|
||||
* <p>Resource bundle paths relating to semantically similar data are typically grouped by the
|
||||
* same first path element. This is not as simple as just comparing the first element, as in
|
||||
* {@code path.startsWith(prefix)} however, since path elements can have suffixes, such as
|
||||
* {@code "Foo:alias"} or {@code "Foo%subtype"}.
|
||||
*
|
||||
* @param baseName the base name to test for.
|
||||
* @return true is the "base name" of the first path element is the given prefix.
|
||||
*/
|
||||
public boolean hasPrefix(String baseName) {
|
||||
checkArgument(!baseName.isEmpty() && SEGMENT_SEPARATORS.matchesNoneOf(baseName));
|
||||
if (length() == 0) {
|
||||
return false;
|
||||
}
|
||||
String firstElement = getSegment(0);
|
||||
// Slightly subtle (but safe) access to the separator character, since:
|
||||
// (!a.equals(b) && a.startsWith(b)) ==> a.length() > b.length().
|
||||
return firstElement.equals(baseName)
|
||||
|| (firstElement.startsWith(baseName)
|
||||
&& SEGMENT_SEPARATORS.matches(firstElement.charAt(baseName.length())));
|
||||
}
|
||||
|
||||
public boolean startsWith(RbPath prefix) {
|
||||
return prefix.length() <= length() && matchesSublist(prefix, 0);
|
||||
}
|
||||
|
||||
public boolean endsWith(RbPath suffix) {
|
||||
return suffix.length() <= length() && matchesSublist(suffix, length() - suffix.length());
|
||||
}
|
||||
|
||||
public boolean contains(RbPath path) {
|
||||
int maxOffset = length() - path.length();
|
||||
for (int i = 0; i <= maxOffset; i++) {
|
||||
if (matchesSublist(path, i)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Assume length check has been done.
|
||||
private boolean matchesSublist(RbPath path, int offset) {
|
||||
for (int i = 0; i < path.length(); i++) {
|
||||
if (!path.getSegment(i).equals(getSegment(i + offset))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
boolean isIntPath() {
|
||||
String lastElement = segments.get(segments.size() - 1);
|
||||
return lastElement.endsWith(":int") || lastElement.endsWith(":intvector");
|
||||
}
|
||||
|
||||
@Override public int compareTo(RbPath other) {
|
||||
return ORDERING.compare(this, other);
|
||||
}
|
||||
|
||||
@Override public boolean equals(Object other) {
|
||||
return (other instanceof RbPath) && segments.equals(((RbPath) other).segments);
|
||||
}
|
||||
|
||||
@Override public int hashCode() {
|
||||
return hashCode;
|
||||
}
|
||||
|
||||
@Override public String toString() {
|
||||
return String.join("/", segments);
|
||||
}
|
||||
}
|
|
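A short usage sketch of the path operations above; it assumes the RbPath class from this file is on the classpath, and the path strings themselves are made up for illustration.

import org.unicode.icu.tool.cldrtoicu.RbPath;

// Usage sketch only; the paths are hypothetical examples.
final class RbPathSketch {
    public static void main(String[] args) {
        RbPath path = RbPath.parse("/calendar/gregorian/DateTimePatterns");
        System.out.println(path.length());        // 3 (the leading '/' is allowed and dropped)
        System.out.println(path.getSegment(0));   // calendar
        System.out.println(path.getParent());     // calendar/gregorian

        // hasPrefix() compares the "base name" of the first segment, so it also matches
        // segments carrying suffixes such as "Foo:intvector" or "Foo%subtype".
        RbPath vector = RbPath.of("Foo:intvector", "bar");
        System.out.println(vector.hasPrefix("Foo"));              // true
        System.out.println(vector.startsWith(RbPath.of("Foo")));  // false (exact segments only)
    }
}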
@@ -0,0 +1,58 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Objects;
|
||||
import java.util.function.Function;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
|
||||
/**
|
||||
* A resource bundle value containing a sequence of elements. This is a very thin wrapper over an
|
||||
* immutable list, with a few additional constraints (e.g. cannot be empty).
|
||||
*
|
||||
* <p>Immutable and thread safe.
|
||||
*/
|
||||
public final class RbValue {
|
||||
private final ImmutableList<String> elements;
|
||||
|
||||
/** Returns a resource bundle value of the given elements. */
|
||||
public static RbValue of(String... elements) {
|
||||
return of(Arrays.asList(elements));
|
||||
}
|
||||
|
||||
/** Returns a resource bundle value of the given elements. */
|
||||
public static RbValue of(Iterable<String> elements) {
|
||||
return new RbValue(elements);
|
||||
}
|
||||
|
||||
private RbValue(Iterable<String> elements) {
|
||||
this.elements = ImmutableList.copyOf(elements);
|
||||
checkArgument(!this.elements.isEmpty(), "Resource bundle values cannot be empty");
|
||||
}
|
||||
|
||||
/** Returns the (non-zero) number of elements in this value. */
|
||||
public int size() {
|
||||
return elements.size();
|
||||
}
|
||||
|
||||
/** Returns the Nth element of this value. */
|
||||
public String getElement(int n) {
|
||||
return elements.get(n);
|
||||
}
|
||||
|
||||
@Override public int hashCode() {
|
||||
return Objects.hashCode(elements);
|
||||
}
|
||||
|
||||
@Override public boolean equals(Object obj) {
|
||||
return obj instanceof RbValue && elements.equals(((RbValue) obj).elements);
|
||||
}
|
||||
|
||||
@Override public String toString() {
|
||||
return elements.toString();
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,593 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu;
|
||||
|
||||
import static com.google.common.base.CharMatcher.whitespace;
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
import static com.google.common.base.Preconditions.checkState;
|
||||
import static com.google.common.collect.ImmutableMap.toImmutableMap;
|
||||
import static java.util.function.Function.identity;
|
||||
import static org.unicode.cldr.api.AttributeKey.keyOf;
|
||||
import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Function;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.unicode.cldr.api.AttributeKey;
|
||||
import org.unicode.cldr.api.CldrData;
|
||||
|
||||
import com.google.common.base.Ascii;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.common.collect.HashBasedTable;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import com.google.common.collect.ImmutableTable;
|
||||
import com.google.common.collect.Table;
|
||||
|
||||
/**
|
||||
* Auxiliary APIs for processing locale IDs and other supplemental data needed by business logic
|
||||
* in some mapper classes.
|
||||
*
|
||||
* <p>When a {@link SupplementalData} instance is used in a mapper class, it is imperative that it is
|
||||
* built using the same underlying CLDR data. The only reason mapper classes do not create their
|
||||
* own instances directly is the relative cost of processing all the supplemental data each time.
|
||||
*/
|
||||
// TODO: This should be moved into the API and leverage some of the existing utility functions.
|
||||
public final class SupplementalData {
|
||||
private static final Pattern SCRIPT_SUBTAG = Pattern.compile("[A-Z][a-z]{3}");
|
||||
|
||||
private static final PathMatcher ALIAS =
|
||||
PathMatcher.of("supplementalData/metadata/alias/*[@type=*]");
|
||||
|
||||
private static final PathMatcher PARENT_LOCALE =
|
||||
PathMatcher.of("supplementalData/parentLocales/parentLocale[@parent=*]");
|
||||
private static final AttributeKey PARENT = keyOf("parentLocale", "parent");
|
||||
private static final AttributeKey LOCALES = keyOf("parentLocale", "locales");
|
||||
|
||||
private static final PathMatcher CALENDER_PREFERENCE =
|
||||
PathMatcher.of("supplementalData/calendarPreferenceData/calendarPreference[@territories=*]");
|
||||
private static final AttributeKey CALENDER_TERRITORIES =
|
||||
keyOf("calendarPreference", "territories");
|
||||
private static final AttributeKey CALENDER_ORDERING =
|
||||
keyOf("calendarPreference", "ordering");
|
||||
|
||||
private static final PathMatcher LIKELY_SUBTAGS =
|
||||
PathMatcher.of("supplementalData/likelySubtags/likelySubtag[@from=*]");
|
||||
private static final AttributeKey SUBTAG_FROM = keyOf("likelySubtag", "from");
|
||||
private static final AttributeKey SUBTAG_TO = keyOf("likelySubtag", "to");
|
||||
|
||||
private static final Splitter LIST_SPLITTER =
|
||||
Splitter.on(whitespace()).omitEmptyStrings();
|
||||
|
||||
// Aliases come in three flavours. Note that the TERRITORY aliases map to a _list_ rather than
|
||||
// a single value (it's structurally always a list, but only territory aliases have a need for
|
||||
// more than one value).
|
||||
private enum Alias {
|
||||
LANGUAGE, SCRIPT, TERRITORY;
|
||||
|
||||
private static final ImmutableMap<String, Alias> TYPE_MAP =
|
||||
Arrays.stream(values())
|
||||
.collect(toImmutableMap(a -> Ascii.toLowerCase(a.name()) + "Alias", identity()));
|
||||
|
||||
private final String elementName = Ascii.toLowerCase(name()) + "Alias";
|
||||
final AttributeKey typeKey = AttributeKey.keyOf(elementName, "type");
|
||||
final AttributeKey replacementKey = AttributeKey.keyOf(elementName, "replacement");
|
||||
|
||||
static Optional<Alias> forElementName(String name) {
|
||||
return Optional.ofNullable(TYPE_MAP.get(name));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a supplemental data API instance from the given CLDR data.
|
||||
*
|
||||
* @param supplementalData the raw CLDR supplemental data instance.
|
||||
* @return the supplemental data API.
|
||||
*/
|
||||
static SupplementalData create(CldrData supplementalData) {
|
||||
Table<Alias, String, String> aliasTable = HashBasedTable.create();
|
||||
Map<String, String> parentLocaleMap = new HashMap<>();
|
||||
Map<String, String> defaultCalendarMap = new HashMap<>();
|
||||
Map<String, String> likelySubtagMap = new HashMap<>();
|
||||
|
||||
supplementalData.accept(
|
||||
ARBITRARY,
|
||||
v -> {
|
||||
if (ALIAS.matches(v.getPath())) {
|
||||
// Territory alias replacements can be a list of values (e.g. when countries
|
||||
// break up). We use the first (geo-politically most significant) value. This
|
||||
// doesn't happen for languages or scripts, but could in theory.
|
||||
Alias.forElementName(v.getPath().getName()).ifPresent(
|
||||
alias -> aliasTable.put(
|
||||
alias,
|
||||
alias.typeKey.valueFrom(v),
|
||||
alias.replacementKey.valueFrom(v)));
|
||||
} else if (PARENT_LOCALE.matches(v.getPath())) {
|
||||
String p = PARENT.valueFrom(v);
|
||||
LOCALES.listOfValuesFrom(v).forEach(c -> parentLocaleMap.put(c, p));
|
||||
} else if (CALENDER_PREFERENCE.matches(v.getPath())) {
|
||||
String c = CALENDER_ORDERING.listOfValuesFrom(v).get(0);
|
||||
CALENDER_TERRITORIES.listOfValuesFrom(v).forEach(t -> defaultCalendarMap.put(t, c));
|
||||
} else if (LIKELY_SUBTAGS.matches(v.getPath())) {
|
||||
likelySubtagMap.put(SUBTAG_FROM.valueFrom(v), SUBTAG_TO.valueFrom(v));
|
||||
}
|
||||
});
|
||||
|
||||
// WARNING: The original mapper code determines the full set of deprecated territories and
|
||||
// then removes the following hard-coded list without any explanation as to why. While this
|
||||
// is presumably to "undeprecate" them for the purposes of the locale processing, there's
|
||||
// no explanation of where this list comes from, and thus no way to maintain it.
|
||||
//
|
||||
// asList("062", "172", "200", "830", "AN", "CS", "QU")
|
||||
// .forEach(t -> aliasTable.remove(Alias.TERRITORY, t));
|
||||
// TODO: Understand and document what on Earth this is all about or delete this comment.
|
||||
|
||||
return new SupplementalData(
|
||||
aliasTable, parentLocaleMap, defaultCalendarMap, likelySubtagMap);
|
||||
}
|
||||
|
||||
// A simple-as-possible, mutable, locale ID data "struct" to handle the IDs used during ICU
|
||||
// data generation. Because this is mutable, it is thoroughly unsuitable for general use.
|
||||
private static final class LocaleId {
|
||||
// From: https://unicode.org/reports/tr35/#Identifiers
|
||||
// Locale ID is:
|
||||
// (<language>(_<script>)?|<script>)(_<region>)?(_<variant>)*
|
||||
//
|
||||
// However in CLDR data, there's always a language (even if it's "und"), and never more
|
||||
// than one variant, so this can be simplified to:
|
||||
// <language>(_<script>)?(_<region>)?(_<variant>)?
|
||||
//
|
||||
// * Required language is lowercase 2 or 3 letter language ID (e.g. "en", "gsw").
|
||||
// Note that the specification allows for languages 5-8 characters long, but in reality
|
||||
// this has never occurred yet, so it's ignored in this code.
|
||||
//
|
||||
// * Script is 4-letter Xxxx script identifier (e.g. "Latn").
|
||||
// The specification permits any casing for script subtags, but since all the data uses
|
||||
// the capitalized "Xxxx" form, that's what this code expects.
|
||||
//
|
||||
// * Region is the uppercase 2-letter CLDR region code ("GB") or the 3-digit numeric
|
||||
// identifier (e.g. "001").
|
||||
//
|
||||
// * Variants are a bit complex; either 5-8 length alphanumerics, or length 4 but starting
|
||||
// with a digit (this avoids any ambiguity with script subtags). However because ICU
|
||||
// violates this rule by using "TRADITIONAL" (11-letters) the length restriction is
|
||||
// merely "longer than 5".
|
||||
//
|
||||
// Finally, CLDR data only uses '_' as the separator, whereas the specification allows
|
||||
// for either '-' or '_'.
|
||||
//
|
||||
// The regex for unambiguously capturing the parts of a locale ID from the CLDR data is:
|
||||
private static final Pattern LOCALE_ID =
|
||||
Pattern.compile("([a-z]{2,3})"
|
||||
+ "(?:_([A-Z][a-z]{3}))?"
|
||||
+ "(?:_([A-Z]{2}|[0-9]{3}))?"
|
||||
+ "(?:_([a-zA-Z]{5,}|[0-9][a-zA-Z0-9]{3}))?");
|
||||
|
||||
static LocaleId parse(String localeId) {
|
||||
Matcher m = LOCALE_ID.matcher(checkNotNull(localeId, "locale ID cannot be null"));
|
||||
checkArgument(m.matches(), "invalid locale ID: %s", localeId);
|
||||
return of(m.group(1), m.group(2), m.group(3)).setVariant(m.group(4));
|
||||
}
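// For illustration, the regex above splits CLDR-style locale IDs as follows (the
// inputs are examples only):
//   "en"                -> language="en"
//   "gsw_Latn"          -> language="gsw", script="Latn"
//   "en_001"            -> language="en", region="001"
//   "sr_Cyrl_RS"        -> language="sr", script="Cyrl", region="RS"
//   "ja_JP_TRADITIONAL" -> language="ja", region="JP", variant="TRADITIONAL"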
|
||||
|
||||
static LocaleId of(String language, String script, String region) {
|
||||
return new LocaleId().setLanguage(language).setScript(script).setRegion(region);
|
||||
}
|
||||
|
||||
// Only the language subtag is non-nullable.
|
||||
private String languageSubtag;
|
||||
private String scriptSubtag;
|
||||
private String regionSubtag;
|
||||
private String variantSubtag;
|
||||
|
||||
String getLanguage() {
|
||||
return languageSubtag;
|
||||
}
|
||||
|
||||
String getScript() {
|
||||
return scriptSubtag;
|
||||
}
|
||||
|
||||
String getRegion() {
|
||||
return regionSubtag;
|
||||
}
|
||||
|
||||
String getVariant() {
|
||||
return variantSubtag;
|
||||
}
|
||||
|
||||
LocaleId setLanguage(String languageSubtag) {
|
||||
checkNotNull(languageSubtag, "language subtag must not be null");
|
||||
checkArgument(!languageSubtag.isEmpty(), "language subtag must not be empty");
|
||||
this.languageSubtag = languageSubtag;
|
||||
return this;
|
||||
}
|
||||
|
||||
LocaleId setScript(String scriptSubtag) {
|
||||
this.scriptSubtag = Strings.emptyToNull(scriptSubtag);
|
||||
return this;
|
||||
}
|
||||
|
||||
LocaleId setRegion(String regionSubtag) {
|
||||
this.regionSubtag = Strings.emptyToNull(regionSubtag);
|
||||
return this;
|
||||
}
|
||||
|
||||
LocaleId setVariant(String variantSubtag) {
|
||||
this.variantSubtag = Strings.emptyToNull(variantSubtag);
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override public String toString() {
|
||||
StringBuilder id = new StringBuilder(languageSubtag);
|
||||
if (scriptSubtag != null) {
|
||||
id.append("_").append(scriptSubtag);
|
||||
}
|
||||
if (regionSubtag != null) {
|
||||
id.append("_").append(regionSubtag);
|
||||
}
|
||||
if (variantSubtag != null) {
|
||||
id.append("_").append(variantSubtag);
|
||||
}
|
||||
return id.toString();
|
||||
}
|
||||
|
||||
@Override public boolean equals(Object o) {
|
||||
if (!(o instanceof LocaleId)) {
|
||||
return false;
|
||||
}
|
||||
LocaleId other = (LocaleId) o;
|
||||
return Objects.equals(languageSubtag, other.languageSubtag)
|
||||
&& Objects.equals(scriptSubtag, other.scriptSubtag)
|
||||
&& Objects.equals(regionSubtag, other.regionSubtag)
|
||||
&& Objects.equals(variantSubtag, other.variantSubtag);
|
||||
}
|
||||
|
||||
@Override public int hashCode() {
|
||||
return Objects.hash(languageSubtag, scriptSubtag, regionSubtag, variantSubtag);
|
||||
}
|
||||
}
|
||||
|
||||
private final ImmutableTable<Alias, String, String> aliasTable;
|
||||
private final ImmutableMap<String, String> parentLocaleMap;
|
||||
private final ImmutableMap<String, String> defaultCalendarMap;
|
||||
private final ImmutableMap<String, String> likelySubtagMap;
|
||||
|
||||
private SupplementalData(
|
||||
Table<Alias, String, String> aliasTable,
|
||||
Map<String, String> parentLocaleMap,
|
||||
Map<String, String> defaultCalendarMap,
|
||||
Map<String, String> likelySubtagMap) {
|
||||
this.aliasTable = ImmutableTable.copyOf(aliasTable);
|
||||
this.parentLocaleMap = ImmutableMap.copyOf(parentLocaleMap);
|
||||
this.defaultCalendarMap = ImmutableMap.copyOf(defaultCalendarMap);
|
||||
this.likelySubtagMap = ImmutableMap.copyOf(likelySubtagMap);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the "maximized" form of a given locale ID, by adding likely subtags where possible.
|
||||
*/
|
||||
public Optional<String> maximize(String localeId) {
|
||||
return addLikelySubtags(localeId).map(Object::toString);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the locale ID with any deprecated elements replaced. This is an
|
||||
* implementation of the algorithm specified in
|
||||
* <a href="http://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers">the LDML
|
||||
* specification</a> but without any "minimizing" of the final result (as happens for
|
||||
* canonicalization in the CLDR tools).
|
||||
*/
|
||||
public String replaceDeprecatedTags(String localeId) {
|
||||
if (localeId.equals("root")) {
|
||||
return localeId;
|
||||
}
|
||||
LocaleId id = LocaleId.parse(localeId);
|
||||
|
||||
// ---- LDML Specification ----
|
||||
// If the region subtag matches the type attribute of a territoryAlias element in
|
||||
// Supplemental Data, replace the region subtag with the replacement value, as follows:
|
||||
//
|
||||
// * If there is a single territory in the replacement, use it.
|
||||
// * If there are multiple territories:
|
||||
// * Look up the most likely territory for the base language code (and script, if there
|
||||
// is one).
|
||||
// * If that likely territory is in the list, use it.
|
||||
// * Otherwise, use the first territory in the list.
|
||||
// ----
|
||||
// However there is a footnote that says:
|
||||
// Formally, replacement of multiple territories uses Section 4.3 Likely Subtags.
|
||||
// However, there are a small number of cases of multiple territories, so the mappings
|
||||
// can be precomputed. This results in a faster lookup with a very small subset of the
|
||||
// likely subtags data.
|
||||
//
|
||||
// Note that (contrary to the order implied by the LDML specification) this step is
|
||||
// performed _before_ the language alias lookup. This is to allow IDs such as "sr_YU" to
|
||||
// work, where "YU" should be replaced with "RS" and _then_ "sr_RS" is expanded to
|
||||
// "sr_Cryl_RS" by the language alias lookup. In the other order, you just get "sr_RS" out.
|
||||
//
|
||||
// TODO: Can we simplify this by just using "addLikelySubtags()" when region is missing?
|
||||
if (id.getRegion() != null) {
|
||||
String replacementRegions = aliasTable.get(Alias.TERRITORY, id.getRegion());
|
||||
if (replacementRegions != null) {
|
||||
List<String> regions = LIST_SPLITTER.splitToList(replacementRegions);
|
||||
checkArgument(!regions.isEmpty(), "invalid empty region list for %s", localeId);
|
||||
if (regions.size() == 1) {
|
||||
id.setRegion(regions.get(0));
|
||||
} else {
|
||||
LocaleId key = LocaleId.of(id.getLanguage(), id.getScript(), null);
|
||||
String likelyId = likelySubtagMap.get(key.toString());
|
||||
if (likelyId == null) {
|
||||
likelyId = likelySubtagMap.get(key.setScript(null).toString());
|
||||
}
|
||||
String likelyRegion =
|
||||
likelyId != null ? LocaleId.parse(likelyId).getRegion() : null;
|
||||
if (regions.contains(likelyRegion)) {
|
||||
id.setRegion(likelyRegion);
|
||||
} else {
|
||||
id.setRegion(regions.get(0));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// While it's not mentioned in the LDML specification, there is data in the alias table for
|
||||
// replacement scripts (currently it contains exactly one entry with one value). Because
|
||||
// it's not clear if this is intended to only be single values or a list (and how to handle
|
||||
// it if it were a list), there's a hard check to ensure it's only ever a single value.
|
||||
if (id.getScript() != null) {
|
||||
String replacementScript = aliasTable.get(Alias.SCRIPT, id.getScript());
|
||||
if (replacementScript != null) {
|
||||
checkArgument(whitespace().matchesNoneOf(replacementScript),
|
||||
"unexpected list of replacement scripts: %s", replacementScript);
|
||||
id.setScript(replacementScript);
|
||||
}
|
||||
}
|
||||
|
||||
// ---- LDML Specification ----
|
||||
// If the language subtag matches the type attribute of a languageAlias element in
|
||||
// Supplemental Data, replace the language subtag with the replacement value.
|
||||
//
|
||||
// If there are additional subtags in the replacement value, add them to the result, but
|
||||
// only if there is no corresponding subtag already in the tag.
|
||||
// ----
|
||||
// Contrary to the precise wording of the specification, we don't just check the language
|
||||
// subtag, since language aliases can contain script and even region information. Instead
|
||||
// we check the alias table using the same order as defined in subtag maximizing:
|
||||
//
|
||||
// <language>_<script>_<region>
|
||||
// <language>_<region>
|
||||
// <language>_<script>
|
||||
// <language>
|
||||
//
|
||||
// There is no need to check for "und" however since that's not aliased anything, but since
|
||||
// it shares the same code it's harmless to do so.
|
||||
resolveLocaleId(id, s -> aliasTable.get(Alias.LANGUAGE, s))
|
||||
.ifPresent(resolvedId -> {
|
||||
id.setLanguage(checkNotNull(resolvedId.getLanguage(),
|
||||
"missing language subtag in language alias: %s", resolvedId));
|
||||
if (id.getScript() == null) {
|
||||
id.setScript(resolvedId.getScript());
|
||||
}
|
||||
if (id.getRegion() == null) {
|
||||
id.setRegion(resolvedId.getRegion());
|
||||
}
|
||||
if (id.getVariant() == null) {
|
||||
id.setVariant(resolvedId.getVariant());
|
||||
}
|
||||
});
|
||||
return id.toString();
|
||||
}
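// Worked example based on the comments above (the alias and likely-subtag entries
// are assumed for illustration, not read from real data here):
//   replaceDeprecatedTags("sr_YU")
//     1. Territory alias lookup: "YU" -> "RS", giving "sr_RS".
//     2. No script alias applies.
//     3. The language alias lookup (via resolveLocaleId) may then add missing
//        subtags, e.g. yielding "sr_Cyrl_RS" as described above.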
|
||||
|
||||
/**
|
||||
* Returns a suitable default calendar for a given locale if it's different from the default
|
||||
* calendar inferred by the locale's parent.
|
||||
*
|
||||
* <p>Note that since the default calendar data is keyed from territory (region subtag) rather
|
||||
* than the complete locale ID, it is impossible to encode some real life cases (e.g. the fact
|
||||
* that "ja_JP_TRADITIONAL" has a different default calendar to "ja_JP"). This is currently
|
||||
* handled with hard-coded special casing, but should probably be data-driven eventually.
|
||||
*/
|
||||
public Optional<String> getDefaultCalendar(String localeId) {
|
||||
Optional<String> calendar = getSpecialCaseCalendar(localeId);
|
||||
if (calendar.isPresent()) {
|
||||
return calendar;
|
||||
}
|
||||
String t = territoryOf(localeId);
|
||||
calendar = Optional.ofNullable(defaultCalendarMap.get(t));
|
||||
if (!calendar.isPresent()) {
|
||||
return Optional.empty();
|
||||
}
|
||||
String rootCalendar = defaultCalendarMap.get("001");
|
||||
checkState(!rootCalendar.isEmpty(), "missing root calendar");
|
||||
if (localeId.equals("root")) {
|
||||
return Optional.of(rootCalendar);
|
||||
}
|
||||
// All locales reach "root" eventually, and that maps to territory "001" which
|
||||
// we already know has a value, so this loop *must* exit.
|
||||
String parentCalendar;
|
||||
do {
|
||||
localeId = getParent(localeId);
|
||||
String territory = territoryOf(localeId);
|
||||
parentCalendar = defaultCalendarMap.get(territory);
|
||||
} while (parentCalendar == null);
|
||||
return parentCalendar.equals(calendar.get()) ? Optional.empty() : calendar;
|
||||
}
|
||||
|
||||
// Hack to work around the limitation that CLDR data cannot represent default calendars that
|
||||
// change because of non-territory information. Since this is limited to exactly two cases at
|
||||
// the moment, and is unlikely to be expanded, it's being done directly in code.
|
||||
private Optional<String> getSpecialCaseCalendar(String localeId) {
|
||||
Optional<String> maximized = maximize(localeId);
|
||||
if (maximized.isPresent()) {
|
||||
switch (maximized.get()) {
|
||||
case "ja_Jpan_JP_TRADITIONAL":
|
||||
return Optional.of("japanese");
|
||||
case "th_Thai_TH_TRADITIONAL":
|
||||
return Optional.of("buddhist");
|
||||
}
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the parent of a non-root locale ID. This is more complex than simple truncation for
|
||||
* two reasons:
|
||||
* <ul>
|
||||
* <li>There may be an explicit parent locale ID specified in the CLDR data.
|
||||
* <li>Removal of non-default script subtags makes the parent locale "root" (unless there
|
||||
* was an explicit parent specified).
|
||||
* </ul>
|
||||
* Note that all valid locale ID parent "chains" must end up at "root" eventually.
|
||||
*
|
||||
* For example (showing parent "chains"):
|
||||
* <ul>
|
||||
* <li>{@code en_GB} --> {@code en_001} --> {@code en} --> {@code root}
|
||||
* <li>{@code en_Cyrl_RU} --> {@code en_Cyrl} --> {@code root}
|
||||
* </ul>
|
||||
*
|
||||
* @throws IllegalArgumentException if the given locale ID is invalid or "root".
|
||||
*/
|
||||
public String getParent(String localeId) {
|
||||
checkState(!localeId.equals("root"), "cannot ask for parent of 'root' locale");
|
||||
// Always defer to an explicit parent locale set in the CLDR data.
|
||||
Optional<String> explicitParent = getExplicitParentLocaleOf(localeId);
|
||||
if (explicitParent.isPresent()) {
|
||||
return explicitParent.get();
|
||||
}
|
||||
// Now look for the start of the last ID "part" in order to truncate.
|
||||
int lastPartSeperatorIndex = localeId.lastIndexOf('_');
|
||||
// The parent of a base language ID (e.g. "en" or "fr") is always "root".
|
||||
if (lastPartSeperatorIndex == -1) {
|
||||
return "root";
|
||||
}
|
||||
String parentId = localeId.substring(0, lastPartSeperatorIndex);
|
||||
|
||||
// However, if the script of the locale is what's being truncated and it's NOT the default
|
||||
// script for the language, return "root" as the parent rather than truncating.
|
||||
String lastPart = localeId.substring(lastPartSeperatorIndex + 1);
|
||||
if (SCRIPT_SUBTAG.matcher(lastPart).matches() && !lastPart.equals(scriptOf(parentId))) {
|
||||
return "root";
|
||||
}
|
||||
return !parentId.isEmpty() ? parentId : "root";
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the explicit parent of a locale ID if specified in the CLDR data.
|
||||
*
|
||||
* Note that this method will not return a value for most locale IDs, since they do not have
|
||||
* an explicit parent set. If you just want "normal" parent of a locale ID, use {@link
|
||||
* #getParent(String)}.
|
||||
*/
|
||||
public Optional<String> getExplicitParentLocaleOf(String localeId) {
|
||||
return Optional.ofNullable(parentLocaleMap.get(localeId));
|
||||
}
|
||||
|
||||
private String territoryOf(String localeId) {
|
||||
return localeId.equals("root")
|
||||
? "001"
|
||||
: addLikelySubtags(localeId).map(LocaleId::getRegion).orElse("ZZ");
|
||||
}
|
||||
|
||||
private String scriptOf(String localeId) {
|
||||
return addLikelySubtags(localeId).map(LocaleId::getScript).orElse("Zzzz");
|
||||
}
|
||||
|
||||
// From: https://unicode.org/reports/tr35/#Likely_Subtags
|
||||
//
|
||||
// Add Likely Subtags
|
||||
// ------------------
|
||||
// Given a source locale X, to return a locale Y where the empty subtags have been filled in
|
||||
// by the most likely subtags. A subtag is called empty if it is a missing script or region
|
||||
// subtag, or it is a base language subtag with the value "und".
|
||||
//
|
||||
// Canonicalize
|
||||
// ------------
|
||||
// Make sure the input locale is in canonical form ...
|
||||
// ...
|
||||
// Remove the script code 'Zzzz' and the region code 'ZZ' if they occur.
|
||||
//
|
||||
// Note that this implementation does not need to handle "grandfathered" tags.
|
||||
private Optional<LocaleId> addLikelySubtags(String localeId) {
|
||||
if (localeId.equals("root")) {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
LocaleId id = LocaleId.parse(localeId);
|
||||
// ---- LDML Specification ----
|
||||
// Remove the script code 'Zzzz' and the region code 'ZZ' if they occur.
|
||||
if ("Zzzz".equals(id.getScript())) {
|
||||
id.setScript(null);
|
||||
}
|
||||
if ("ZZ".equals(id.getRegion())) {
|
||||
id.setRegion(null);
|
||||
}
|
||||
// ---- LDML Specification ----
|
||||
// A subtag is called empty if it is a missing script or region subtag, or it is a base
|
||||
// language subtag with the value "und"
|
||||
if (!id.getLanguage().equals("und") && id.getScript() != null && id.getRegion() != null) {
|
||||
// We are already canonical, so just return.
|
||||
return Optional.of(id);
|
||||
}
|
||||
Optional<LocaleId> optTags = resolveLocaleId(id, likelySubtagMap::get);
|
||||
if (!optTags.isPresent()) {
|
||||
return Optional.empty();
|
||||
}
|
||||
LocaleId subtags = optTags.get();
|
||||
checkArgument(!subtags.getLanguage().equals("und"), "invalid subtags: %s", subtags);
|
||||
// Replace "missing" elements in the original ID with likely subtags.
|
||||
if (id.getLanguage().equals("und")) {
|
||||
id.setLanguage(subtags.getLanguage());
|
||||
}
|
||||
if (id.getScript() == null) {
|
||||
id.setScript(checkNotNull(subtags.getScript()));
|
||||
}
|
||||
if (id.getRegion() == null) {
|
||||
id.setRegion(checkNotNull(subtags.getRegion()));
|
||||
}
|
||||
// Language is not "und" and both script and region subtags are set!
|
||||
return Optional.of(id);
|
||||
}
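// Worked example (assuming no likely-subtag entry exists for "en_GB" itself and the
// common entry "en" -> "en_Latn_US" is present):
//   addLikelySubtags("en_GB")
//     - The script is missing, so the ID is not yet maximal.
//     - resolveLocaleId tries "en_GB", then "en", which matches "en_Latn_US".
//     - Only the missing subtag is filled in, giving "en_Latn_GB".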
|
||||
|
||||
// From: https://unicode.org/reports/tr35/#Likely_Subtags
|
||||
//
|
||||
// Lookup
|
||||
// ------
|
||||
// Lookup each of the following in order, and stop on the first match:
|
||||
// <language>_<script>_<region>
|
||||
// <language>_<region>
|
||||
// <language>_<script>
|
||||
// <language>
|
||||
// "und"_<script>
|
||||
private Optional<LocaleId> resolveLocaleId(LocaleId id, Function<String, String> fn) {
|
||||
String lang = id.getLanguage();
|
||||
String script = id.getScript();
|
||||
String region = id.getRegion();
|
||||
Stream<LocaleId> candidateIds = Stream.of(
|
||||
LocaleId.of(lang, script, region),
|
||||
LocaleId.of(lang, null, region),
|
||||
LocaleId.of(lang, script, null),
|
||||
LocaleId.of(lang, null, null));
|
||||
// Only add "und"_<script> if there's a script, otherwise you end up maximizing "und" on
|
||||
// its own ("en_Latn_US") which is not intended.
|
||||
if (script != null) {
|
||||
candidateIds = Stream.concat(candidateIds, Stream.of(LocaleId.of("und", script, null)));
|
||||
}
|
||||
return candidateIds
|
||||
// Remove duplicate IDs (keeps the first one encountered).
|
||||
.distinct()
|
||||
.map(Object::toString)
|
||||
.map(fn)
|
||||
.filter(Objects::nonNull)
|
||||
.findFirst()
|
||||
.map(LocaleId::parse);
|
||||
}
|
||||
}
|
|
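The parent-chain rules documented in getParent() can be illustrated with a small self-contained sketch. The explicit-parent entry and the default-script parameter below are simplified, hypothetical stand-ins for the CLDR-driven lookups in the real class.

import java.util.Map;
import java.util.regex.Pattern;

// Self-contained sketch of the parent-chain rules described above, reproducing the
// chains "en_GB -> en_001 -> en -> root" and "en_Cyrl_RU -> en_Cyrl -> root".
final class ParentChainSketch {
    // Hypothetical subset of the parentLocales data.
    private static final Map<String, String> EXPLICIT_PARENTS = Map.of("en_GB", "en_001");
    private static final Pattern SCRIPT_SUBTAG = Pattern.compile("[A-Z][a-z]{3}");

    static String parentOf(String localeId, String defaultScript) {
        String explicit = EXPLICIT_PARENTS.get(localeId);
        if (explicit != null) {
            return explicit;
        }
        int cut = localeId.lastIndexOf('_');
        if (cut == -1) {
            return "root";
        }
        String truncated = localeId.substring(0, cut);
        String lastPart = localeId.substring(cut + 1);
        // Dropping a non-default script jumps straight to "root".
        if (SCRIPT_SUBTAG.matcher(lastPart).matches() && !lastPart.equals(defaultScript)) {
            return "root";
        }
        return truncated;
    }

    public static void main(String[] args) {
        // Assume "Latn" is the default script for "en".
        String id = "en_GB";
        while (!id.equals("root")) {
            System.out.print(id + " -> ");
            id = parentOf(id, "Latn");
        }
        System.out.println("root");                          // en_GB -> en_001 -> en -> root
        System.out.println(parentOf("en_Cyrl_RU", "Latn"));  // en_Cyrl
        System.out.println(parentOf("en_Cyrl", "Latn"));     // root
    }
}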
@@ -0,0 +1,246 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.mapper;
|
||||
|
||||
import static com.google.common.base.Ascii.toLowerCase;
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
import static com.google.common.base.Preconditions.checkState;
|
||||
import static org.unicode.cldr.api.AttributeKey.keyOf;
|
||||
import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
|
||||
import static org.unicode.cldr.api.CldrDataType.BCP47;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
|
||||
import org.unicode.cldr.api.AttributeKey;
|
||||
import org.unicode.cldr.api.CldrData.PrefixVisitor;
|
||||
import org.unicode.cldr.api.CldrData.ValueVisitor;
|
||||
import org.unicode.cldr.api.CldrDataSupplier;
|
||||
import org.unicode.cldr.api.CldrDataType;
|
||||
import org.unicode.cldr.api.CldrPath;
|
||||
import org.unicode.cldr.api.CldrValue;
|
||||
|
||||
import com.google.common.base.Ascii;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import com.google.common.collect.Sets;
|
||||
import org.unicode.icu.tool.cldrtoicu.IcuData;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbPath;
|
||||
|
||||
/**
|
||||
* A mapper to collect BCP-47 data from {@link CldrDataType#BCP47 BCP47} data under paths
|
||||
* matching:
|
||||
* <pre>{@code
|
||||
* //ldmlBCP47/keyword/key[@name=*]/type[@name=*]
|
||||
* }</pre>
|
||||
*/
|
||||
public final class Bcp47Mapper {
|
||||
// Other attributes (e.g. "alias") are value attributes and don't need to be matched here.
|
||||
private static final PathMatcher KEY = PathMatcher.of("ldmlBCP47/keyword/key[@name=*]");
|
||||
private static final AttributeKey KEY_NAME = keyOf("key", "name");
|
||||
private static final AttributeKey KEY_ALIAS = keyOf("key", "alias");
|
||||
private static final AttributeKey KEY_VALUE_TYPE = keyOf("key", "valueType");
|
||||
|
||||
private static final PathMatcher TYPE = PathMatcher.of("type[@name=*]");
|
||||
private static final AttributeKey TYPE_NAME = keyOf("type", "name");
|
||||
private static final AttributeKey TYPE_ALIASES = keyOf("type", "alias");
|
||||
private static final AttributeKey PREFERRED_TYPE_NAME = keyOf("type", "preferred");
|
||||
|
||||
// Deprecation of the data is not the same as deprecation of attributes themselves. This
|
||||
// deprecation relates to identifying data which exists, but is no longer the right way to
|
||||
// represent things (which means it can be important for clients to know about).
|
||||
private static final AttributeKey KEY_DEPRECATED = keyOf("key", "deprecated");
|
||||
private static final AttributeKey TYPE_DEPRECATED = keyOf("type", "deprecated");
|
||||
|
||||
// Attributes that can be emitted under the /keyInfo or /typeInfo paths for auxiliary
|
||||
// information in the ICU data. If the value is equal to the declared default, it is ignored.
|
||||
// NOTE: The need for hard-coded default values is a hack because there's no nice way (yet)
|
||||
// to determine the default for implicit values via the DTD. Ideally this would be automatic
|
||||
// and the AttributeKey class would be able to have a method like "isDefault(String value)".
|
||||
private static final ImmutableMap<AttributeKey, String> INFO_ATTRIBUTES =
|
||||
ImmutableMap.of(KEY_VALUE_TYPE, "", KEY_DEPRECATED, "false", TYPE_DEPRECATED, "false");
|
||||
|
||||
private static final RbPath RB_KEYMAP = RbPath.of("keyMap");
|
||||
private static final RbPath RB_TYPE_ALIAS = RbPath.of("typeAlias", "timezone:alias");
|
||||
private static final RbPath RB_MAP_ALIAS = RbPath.of("typeMap", "timezone:alias");
|
||||
private static final RbPath RB_BCP_ALIAS = RbPath.of("bcpTypeAlias", "tz:alias");
|
||||
|
||||
/**
|
||||
* Processes data from the given supplier to generate Timezone and BCP-47 ICU data.
|
||||
*
|
||||
* @param src the CLDR data supplier to process.
|
||||
* @return A list of IcuData instances containing BCP-47 data to be written to files.
|
||||
*/
|
||||
public static ImmutableList<IcuData> process(CldrDataSupplier src) {
|
||||
Bcp47Visitor visitor = new Bcp47Visitor();
|
||||
src.getDataForType(BCP47).accept(ARBITRARY, visitor);
|
||||
visitor.addKeyMapValues();
|
||||
return ImmutableList.of(visitor.keyTypeData.icuData, visitor.tzData.icuData);
|
||||
}
|
||||
|
||||
// Outer visitor which handles "key" paths by installing sub-visitor methods to process
|
||||
// each child "type" element. Depending on the key name, values are stored in different
|
||||
// IcuData instances.
|
||||
private static final class Bcp47Visitor implements PrefixVisitor {
|
||||
private final ValueCollector tzData =
|
||||
new ValueCollector(new IcuData("timezoneTypes", false));
|
||||
private final ValueCollector keyTypeData =
|
||||
new ValueCollector(new IcuData("keyTypeData", false));
|
||||
|
||||
// The current key name from the parent path element (set when a prefix is matched).
|
||||
@Nullable private String keyName = null;
|
||||
// A map collecting each key and values as they are visited.
|
||||
// TODO: Convert this to a Map<RbPath, String> which involves removing the '@' prefix hack.
|
||||
private Map<String, String> keyMap = new LinkedHashMap<>();
|
||||
|
||||
@Override
|
||||
public void visitPrefixStart(CldrPath prefix, Context ctx) {
|
||||
if (KEY.matches(prefix)) {
|
||||
// Don't inline this since it also sets the field!!
|
||||
keyName = Ascii.toLowerCase(KEY_NAME.valueFrom(prefix));
|
||||
|
||||
// How the data is visited is the same for both timezone and other BCP-47 data,
|
||||
// it's just split into different data files, so we just install a different
|
||||
// instance of the visitor class according to where the data in this sub-hierarchy
|
||||
// should end up.
|
||||
ctx.install(keyName.equals("tz") ? tzData : keyTypeData);
|
||||
}
|
||||
}
|
||||
|
||||
// Post processing to add additional captured attribute values and some special cases.
|
||||
private void addKeyMapValues() {
|
||||
IcuData keyData = keyTypeData.icuData;
|
||||
// Add all the keyMap values into the IcuData file.
|
||||
for (Entry<String, String> kmData : keyMap.entrySet()) {
|
||||
String bcpKey = kmData.getKey();
|
||||
String key = kmData.getValue();
|
||||
if (bcpKey.startsWith("@")) {
|
||||
// Undoing the weird hack in addInfoAttributes(). This can be done better.
|
||||
// We use "parse()" because these are full paths, and not single elements.
|
||||
keyData.add(RbPath.parse(bcpKey.substring(1)), key);
|
||||
continue;
|
||||
}
|
||||
if (bcpKey.equals(key)) {
|
||||
// An empty value indicates that the BCP47 key is the same as the legacy key.
|
||||
bcpKey = "";
|
||||
}
|
||||
keyData.add(RB_KEYMAP.extendBy(key), bcpKey);
|
||||
}
|
||||
// Add aliases for timezone data.
|
||||
keyData.add(RB_TYPE_ALIAS, "/ICUDATA/timezoneTypes/typeAlias/timezone");
|
||||
keyData.add(RB_MAP_ALIAS, "/ICUDATA/timezoneTypes/typeMap/timezone");
|
||||
keyData.add(RB_BCP_ALIAS, "/ICUDATA/timezoneTypes/bcpTypeAlias/tz");
|
||||
}
|
||||
|
||||
private final class ValueCollector implements ValueVisitor {
|
||||
// Mutable ICU data that values are collected into during visitation.
|
||||
private final IcuData icuData;
|
||||
|
||||
ValueCollector(IcuData data) {
|
||||
this.icuData = checkNotNull(data);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visit(CldrValue value) {
|
||||
checkArgument(TYPE.matchesSuffixOf(value.getPath()),
|
||||
"unexpected child element: %s", value.getPath());
|
||||
String typeName = TYPE_NAME.valueFrom(value);
|
||||
// Note that if a "preferred" type exists, we treat the value specially and add
|
||||
// it only as an alias. We expected values with a preferred replacement to
|
||||
// always be explicitly deprecated.
|
||||
Optional<String> prefName = PREFERRED_TYPE_NAME.optionalValueFrom(value);
|
||||
if (prefName.isPresent()) {
|
||||
checkState(KEY_DEPRECATED.booleanValueFrom(value, false)
|
||||
|| TYPE_DEPRECATED.booleanValueFrom(value, false),
|
||||
"unexpected 'preferred' attribute for non-deprecated value: %s", value);
|
||||
icuData.add(RbPath.of("bcpTypeAlias", keyName, typeName), prefName.get());
|
||||
return;
|
||||
}
|
||||
// Note: There are some deprecated values which don't have a preferred
|
||||
// replacement and these will be processed below (in particular we need to emit
|
||||
// the fact that they are deprecated).
|
||||
|
||||
// According to the old mapper code, it's an error not to have an alias, but
|
||||
// it's emitted via debug logging and not actually enforced.
|
||||
// TODO: Consider making this an error if possible.
|
||||
String keyAlias = toLowerCase(KEY_ALIAS.valueFrom(value, keyName));
|
||||
|
||||
keyMap.put(keyName, keyAlias);
|
||||
RbPath typeMapPrefix = RbPath.of("typeMap", keyAlias);
|
||||
List<String> typeAliases = TYPE_ALIASES.listOfValuesFrom(value);
|
||||
if (typeAliases.isEmpty()) {
|
||||
// Generate type map entry using empty value (an empty value indicates same
|
||||
// type name is used for both BCP47 and legacy type).
|
||||
icuData.add(typeMapPrefix.extendBy(typeName), "");
|
||||
} else {
|
||||
String mainAlias = typeAliases.get(0);
|
||||
icuData.add(typeMapPrefix.extendBy(quoteAlias(mainAlias)), typeName);
|
||||
// Put additional aliases as secondary aliases referencing the main alias.
|
||||
RbPath typeAliasPrefix = RbPath.of("typeAlias", keyAlias);
|
||||
typeAliases.stream()
|
||||
.skip(1)
|
||||
.map(Bcp47Visitor::quoteAlias)
|
||||
.forEach(a -> icuData.add(typeAliasPrefix.extendBy(a), mainAlias));
|
||||
}
|
||||
addInfoAttributes(keyName, typeName, value.getValueAttributes());
|
||||
}
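// For example, given a hypothetical entry resembling the CLDR calendar data,
// key name="ca" with type name="islamicc" preferred="islamic-civil" deprecated="true",
// the type is emitted only as an alias:
//   bcpTypeAlias/ca/islamicc -> "islamic-civil"
// whereas a non-deprecated type name="gregory" alias="gregorian" (under key alias
// "calendar") produces:
//   typeMap/calendar/gregorian -> "gregory"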
|
||||
|
||||
// Add any additional attributes present to the attribute map. Note that this code was
|
||||
// copied from largely undocumented code, and the precise reasoning for why this is
|
||||
// needed or why it's done this way is not completely clear. It is very likely that it
|
||||
// can be simplified.
|
||||
//
|
||||
// The '@' symbol added here is just a magic token that gets stripped off again in the
|
||||
// addKeyMapValues() method; it appears to just be a way to distinguish keys added via
|
||||
// this method vs during the visit method. A better approach might just be to have two
|
||||
// maps.
|
||||
// TODO: Remove the use of '@' and simplify the logic for "info" attributes (infoMap?).
|
||||
private void addInfoAttributes(
|
||||
String keyName, String typeName, ImmutableMap<AttributeKey, String> attributes) {
|
||||
// Only emit deprecation for the "key" level, even if all types below that are also
|
||||
// marked as deprecated. Only do this for a subset of attributes (INFO_ATTRIBUTES).
|
||||
Set<AttributeKey> keys =
|
||||
Sets.intersection(attributes.keySet(), INFO_ATTRIBUTES.keySet());
|
||||
for (AttributeKey a : keys) {
|
||||
String value = attributes.get(a);
|
||||
// Skip empty or default values in attributes.
|
||||
if (value.isEmpty() || INFO_ATTRIBUTES.get(a).equals(value)) {
|
||||
continue;
|
||||
}
|
||||
// The ID for the xxxInfo paths in ICU is the path fragment at which the
|
||||
// attribute exists. Since we only process complete paths here, we must do a
|
||||
// bit of reconstruction based on the element name of the attribute we are
|
||||
// processing. This relies on explicit knowledge that the paths are "<key>" or
|
||||
// "<key>/<type>". This all gets less messy if we switch to RbPath.
|
||||
String id =
|
||||
a.getElementName().equals("key") ? keyName : keyName + "/" + typeName;
|
||||
keyMap.put(
|
||||
"@" + a.getElementName() + "Info/" + a.getAttributeName() + "/" + id,
|
||||
value);
|
||||
}
|
||||
}
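// For example (hypothetical): a key "vt" with deprecated="true" adds the keyMap entry
//   "@keyInfo/deprecated/vt" -> "true"
// which addKeyMapValues() later converts into the resource path keyInfo/deprecated/vt.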
|
||||
}
|
||||
|
||||
/**
|
||||
* Escapes alias values containing '/' so they can appear in resource bundle paths. This
|
||||
* function replaces '/' with ':' and quotes the result (e.g. foo/bar -> "foo:bar").
|
||||
*
|
||||
* <p>This is needed for timezone "metazone" ID strings which are of the form 'Foo/Bar'
|
||||
* in the CLDR data.
|
||||
*/
|
||||
// TODO: Switch to RbPath and do quoting automatically when ICU data is written out.
|
||||
private static String quoteAlias(String str) {
|
||||
return str.indexOf('/') == -1 ? str : '"' + str.replace('/', ':') + '"';
|
||||
}
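// For example, a timezone alias value such as "America/New_York" becomes the quoted
// segment "America:New_York" (surrounded by double quotes), while values without a
// '/' are returned unchanged.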
|
||||
}
|
||||
|
||||
private Bcp47Mapper() {}
|
||||
}
|
|
@@ -0,0 +1,147 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.mapper;
|
||||
|
||||
import static org.unicode.cldr.api.AttributeKey.keyOf;
|
||||
import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
|
||||
import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
|
||||
import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import org.unicode.cldr.api.AttributeKey;
|
||||
import org.unicode.cldr.api.CldrData;
|
||||
import org.unicode.cldr.api.CldrDataSupplier;
|
||||
import org.unicode.cldr.api.CldrDataType;
|
||||
import org.unicode.cldr.api.CldrPath;
|
||||
import org.unicode.cldr.api.CldrValue;
|
||||
|
||||
import com.google.common.escape.UnicodeEscaper;
|
||||
import org.unicode.icu.tool.cldrtoicu.IcuData;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbPath;
|
||||
|
||||
/**
|
||||
* A mapper to collect break-iterator data from {@link CldrDataType#LDML LDML} data under
|
||||
* paths matching:
|
||||
* <pre>{@code
|
||||
* //ldml/segmentations/segmentation/suppressions/suppression
|
||||
* //ldml/special/icu:breakIteratorData/...
|
||||
* }</pre>
|
||||
*/
|
||||
// TODO: This class can almost certainly be replaced with a small RegexTransformer config.
|
||||
public final class BreakIteratorMapper {
|
||||
// The "type" attribute is not required here, so cannot appear in the matcher.
|
||||
private static final PathMatcher SUPPRESSION =
|
||||
PathMatcher.of("ldml/segmentations/segmentation/suppressions/suppression");
|
||||
private static final AttributeKey SEGMENTATION_TYPE = keyOf("segmentation", "type");
|
||||
|
||||
// Note: This could be done with an intermediate matcher for
|
||||
// "ldml/special/icu:breakIteratorData" but there are so few "special" values it's not worth it
|
||||
private static final PathMatcher BOUNDARIES =
|
||||
PathMatcher.of("ldml/special/icu:breakIteratorData/icu:boundaries/*");
|
||||
private static final PathMatcher DICTIONARY =
|
||||
PathMatcher.of("ldml/special/icu:breakIteratorData/icu:dictionaries/icu:dictionary");
|
||||
|
||||
private static final AttributeKey DICTIONARY_DEP = keyOf("icu:dictionary", "icu:dependency");
|
||||
private static final AttributeKey DICTIONARY_TYPE = keyOf("icu:dictionary", "type");
|
||||
|
||||
/**
|
||||
* Processes data from the given supplier to generate break-iterator data for the given
* locale ID.
|
||||
*
|
||||
* @param localeId the locale ID to generate data for.
|
||||
* @param src the CLDR data supplier to process.
|
||||
* @param icuSpecialData additional ICU data (in the "icu:" namespace)
|
||||
* @return IcuData containing break-iterator data for the given locale ID.
|
||||
*/
|
||||
public static IcuData process(
|
||||
String localeId, CldrDataSupplier src, Optional<CldrData> icuSpecialData) {
|
||||
|
||||
BreakIteratorMapper mapper = new BreakIteratorMapper(localeId);
|
||||
icuSpecialData.ifPresent(s -> s.accept(ARBITRARY, mapper::addSpecials));
|
||||
src.getDataForLocale(localeId, UNRESOLVED).accept(DTD, mapper::addSuppression);
|
||||
return mapper.icuData;
|
||||
}
|
||||
|
||||
// The per-locale ICU data being collected by this visitor.
|
||||
private final IcuData icuData;
|
||||
|
||||
private BreakIteratorMapper(String localeId) {
|
||||
this.icuData = new IcuData(localeId, true);
|
||||
}
|
||||
|
||||
private void addSuppression(CldrValue v) {
|
||||
if (SUPPRESSION.matches(v.getPath())) {
|
||||
String type = SEGMENTATION_TYPE.valueFrom(v);
|
||||
// TODO: Understand and document why we escape values here, but not for collation data.
|
||||
icuData.add(
|
||||
RbPath.of("exceptions", type + ":array"),
|
||||
ESCAPE_NON_ASCII.escape(v.getValue()));
|
||||
}
|
||||
}
|
||||
|
||||
private void addSpecials(CldrValue v) {
|
||||
CldrPath p = v.getPath();
|
||||
if (BOUNDARIES.matches(p)) {
|
||||
addDependency(
|
||||
getDependencyName(v),
|
||||
getBoundaryType(v),
|
||||
getBoundaryDependency(v));
|
||||
} else if (DICTIONARY.matches(p)) {
|
||||
addDependency(
|
||||
getDependencyName(v),
|
||||
DICTIONARY_TYPE.valueFrom(v),
|
||||
DICTIONARY_DEP.optionalValueFrom(v));
|
||||
}
|
||||
}
|
||||
|
||||
private void addDependency(String name, String type, Optional<String> dependency) {
|
||||
icuData.add(
|
||||
RbPath.of(name, type + ":process(dependency)"),
|
||||
dependency.orElseThrow(() -> new IllegalArgumentException("missing dependency")));
|
||||
}
|
||||
|
||||
// Must match the BOUNDARIES or DICTIONARY path.
|
||||
private static String getDependencyName(CldrValue value) {
|
||||
return stripXmlNamespace(value.getPath().getParent().getName());
|
||||
}
|
||||
|
||||
// Must match the BOUNDARIES path.
|
||||
private static String getBoundaryType(CldrValue value) {
|
||||
String elementName = value.getPath().getName();
|
||||
String type = stripXmlNamespace(elementName);
|
||||
return keyOf(elementName, "alt")
|
||||
.optionalValueFrom(value).map(a -> type + "_" + a).orElse(type);
|
||||
}
|
||||
|
||||
// Must match the BOUNDARIES path.
|
||||
private static Optional<String> getBoundaryDependency(CldrValue value) {
|
||||
return keyOf(value.getPath().getName(), "icu:dependency").optionalValueFrom(value);
|
||||
}
|
||||
|
||||
// Strips the first prefix of the form "xxx:" from a string.
|
||||
private static String stripXmlNamespace(String s) {
|
||||
return s.substring(s.indexOf(':') + 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert characters outside the range U+0020 to U+007F to Unicode escapes, and convert
|
||||
* backslash to a double backslash. This class is super slow for non-ASCII escaping due to
|
||||
* using "String.format()", however there's < 100 values that need any escaping, so it's fine.
|
||||
*/
|
||||
private static final UnicodeEscaper ESCAPE_NON_ASCII = new UnicodeEscaper() {
|
||||
private final char[] DOUBLE_BACKSLASH = "\\\\".toCharArray();
|
||||
|
||||
@Override
|
||||
protected char[] escape(int cp) {
|
||||
// Returning null means "do not escape".
|
||||
if (0x0020 <= cp && cp <= 0x007F) {
|
||||
return cp == '\\' ? DOUBLE_BACKSLASH : null;
|
||||
} else if (cp <= 0xFFFF) {
|
||||
return String.format("\\u%04X", cp).toCharArray();
|
||||
}
|
||||
return String.format("\\U%08X", cp).toCharArray();
|
||||
}
|
||||
};
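// Worked examples of the escaper above (illustrative only): "abc" is left unchanged, a single
// backslash '\' becomes the two characters "\\", U+00E9 (é) becomes the six characters
// "\u00E9", and a supplementary character such as U+1F600 becomes the ten characters
// "\U0001F600".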
|
||||
}
|
|
@ -0,0 +1,198 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.mapper;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static org.unicode.cldr.api.AttributeKey.keyOf;
|
||||
import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
|
||||
import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import org.unicode.cldr.api.AttributeKey;
|
||||
import org.unicode.cldr.api.CldrData;
|
||||
import org.unicode.cldr.api.CldrData.PrefixVisitor;
|
||||
import org.unicode.cldr.api.CldrDataSupplier;
|
||||
import org.unicode.cldr.api.CldrDataType;
|
||||
import org.unicode.cldr.api.CldrPath;
|
||||
import org.unicode.cldr.api.CldrValue;
|
||||
|
||||
import com.google.common.base.CharMatcher;
|
||||
import com.google.common.base.Splitter;
|
||||
import org.unicode.icu.tool.cldrtoicu.IcuData;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbPath;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbValue;
|
||||
|
||||
/**
|
||||
* A mapper to collect collation data from {@link CldrDataType#LDML LDML} data via the paths:
|
||||
* <pre>{@code
|
||||
* //ldml/collations/*
|
||||
* //ldml/special/icu:UCARules
|
||||
* //ldml/special/icu:depends
|
||||
* }</pre>
|
||||
*/
|
||||
public final class CollationMapper {
|
||||
private static final PathMatcher COLLATIONS = PathMatcher.of("ldml/collations");
|
||||
|
||||
// Note that the 'type' attribute is optional, so cannot be in the path matcher.
|
||||
// However since the CLDR data never actually omits the value, it would be easy to change the
|
||||
// attribute metadata to stop it being an implicit attribute and then it could appear.
|
||||
private static final PathMatcher COLLATION_RULE = PathMatcher.of("collation/cr");
|
||||
private static final AttributeKey COLLATION_TYPE = keyOf("collation", "type");
|
||||
private static final AttributeKey COLLATION_RULE_ALT = keyOf("cr", "alt");
|
||||
|
||||
private static final PathMatcher DEFAULT_COLLATION = PathMatcher.of("defaultCollation");
|
||||
|
||||
private static final PathMatcher SPECIAL = PathMatcher.of("ldml/special");
|
||||
private static final AttributeKey SPECIAL_RULES = keyOf("icu:UCARules", "icu:uca_rules");
|
||||
private static final AttributeKey SPECIAL_DEP = keyOf("icu:depends", "icu:dependency");
|
||||
|
||||
private static final RbPath RB_COLLATIONS_DEFAULT = RbPath.of("collations", "default");
|
||||
private static final RbPath RB_STANDARD_SEQUENCE =
|
||||
RbPath.of("collations", "standard", "Sequence");
|
||||
private static final RbPath RB_STANDARD_VERSION =
|
||||
RbPath.of("collations", "standard", "Version");
|
||||
|
||||
private static final Splitter LINE_SPLITTER =
|
||||
Splitter.on('\n').trimResults().omitEmptyStrings();
|
||||
|
||||
/**
|
||||
* Processes data from the given supplier to generate collation data for the given locale ID.
|
||||
*
|
||||
* @param localeId the locale ID to generate data for.
|
||||
* @param src the CLDR data supplier to process.
|
||||
* @param icuSpecialData additional ICU data (in the "icu:" namespace)
|
||||
* @return IcuData containing collation data for the given locale ID.
|
||||
*/
|
||||
public static IcuData process(
|
||||
String localeId, CldrDataSupplier src, Optional<CldrData> icuSpecialData) {
|
||||
|
||||
CollationVisitor visitor = new CollationVisitor(localeId);
|
||||
icuSpecialData.ifPresent(s -> s.accept(ARBITRARY, visitor));
|
||||
src.getDataForLocale(localeId, UNRESOLVED).accept(ARBITRARY, visitor);
|
||||
return visitor.icuData;
|
||||
}
|
||||
|
||||
final static class CollationVisitor implements PrefixVisitor {
|
||||
private final IcuData icuData;
|
||||
|
||||
CollationVisitor(String localeId) {
|
||||
this.icuData = new IcuData(localeId, true);
|
||||
// Super special hack case because the XML data is a bit broken for the root collation
// data (there's an empty <collation> element that's a non-leaf element and thus not
// visited), but we should add an empty sequence to the output data.
|
||||
if (localeId.equals("root")) {
|
||||
icuData.replace(RB_STANDARD_SEQUENCE, "");
|
||||
// TODO: Collation versioning probably needs to be improved.
|
||||
icuData.replace(RB_STANDARD_VERSION, CldrDataSupplier.getCldrVersionString());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visitPrefixStart(CldrPath prefix, Context ctx) {
|
||||
if (COLLATIONS.matchesPrefixOf(prefix)) {
|
||||
ctx.install(this::collectRules);
|
||||
} else if (SPECIAL.matchesPrefixOf(prefix)) {
|
||||
ctx.install(this::maybeAddSpecial);
|
||||
}
|
||||
}
|
||||
|
||||
private void collectRules(CldrValue v) {
|
||||
CldrPath p = v.getPath();
|
||||
if (COLLATION_RULE.matchesSuffixOf(p)) {
|
||||
String type = COLLATION_TYPE.valueFrom(v);
|
||||
RbPath rbPath = RbPath.of("collations", type, "Sequence");
|
||||
|
||||
// WARNING: This is almost certainly a bug, since while @alt can have the value
// "short", it can also have other values. This code was copied from the original
// CollationMapper, which has the line:
// isShort = attr.getValue("alt") != null;
|
||||
boolean isShort = COLLATION_RULE_ALT.optionalValueFrom(v).isPresent();
|
||||
|
||||
// Note that it's not clear why there's a check for "contains()" here. The code
|
||||
// from which this was derived is largely undocumented and this check could have
|
||||
// been overly defensive (perhaps a duplicate key should be an error?).
|
||||
if (isShort || !icuData.contains(rbPath)) {
|
||||
RbValue rules = RbValue.of(
|
||||
LINE_SPLITTER.splitToList(v.getValue()).stream()
|
||||
.map(CollationMapper::removeComment)
|
||||
.filter(s -> !s.isEmpty())::iterator);
|
||||
icuData.replace(rbPath, rules);
|
||||
icuData.replace(
|
||||
RbPath.of("collations", type, "Version"),
|
||||
CldrDataSupplier.getCldrVersionString());
|
||||
}
|
||||
} else if (DEFAULT_COLLATION.matchesSuffixOf(p)) {
|
||||
icuData.add(RB_COLLATIONS_DEFAULT, v.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
// This is a bit special since the attribute we want to add depends on the element we are
|
||||
// visiting (which is somewhat unusual in the transformation classes).
|
||||
private void maybeAddSpecial(CldrValue value) {
|
||||
AttributeKey key;
|
||||
switch (value.getPath().getName()) {
|
||||
case "icu:UCARules":
|
||||
key = SPECIAL_RULES;
|
||||
break;
|
||||
case "icu:depends":
|
||||
key = SPECIAL_DEP;
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
// substring(4) just removes the "icu:" prefix (which we know is present in the key).
|
||||
RbPath rbPath = RbPath.of(
|
||||
String.format("%s:process(%s)",
|
||||
key.getElementName().substring(4), key.getAttributeName().substring(4)));
|
||||
icuData.add(rbPath, key.valueFrom(value));
|
||||
}
|
||||
}
|
||||
|
||||
// Collation data can contain # to mark an end-of-line comment, but it can also contain data
|
||||
// with # in it. In the latter case it must be in a single-quoted string (e.g. 'x#y'). However
|
||||
// the precise semantics of the quoting rules are not particularly clear, so this method
|
||||
// assumes that:
|
||||
// * single quote (apostrophe) begins and ends quoting.
|
||||
// * outside a quoted section, all characters are literal.
|
||||
// * inside a quoted section, backslash '\' escapes any single character (e.g \a, \', \\)
|
||||
private static String removeComment(String s) {
|
||||
int i = findCommentStart(s);
|
||||
if (i >= 0) {
|
||||
s = CharMatcher.whitespace().trimTrailingFrom(s.substring(0, i));
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
// Returns the index of the first unquoted '#' in the string.
|
||||
private static int findCommentStart(String s) {
|
||||
boolean quoted = false;
|
||||
for (int i = 0; i < s.length(); i++) {
|
||||
switch (s.charAt(i)) {
|
||||
case '\'':
|
||||
quoted = !quoted;
|
||||
break;
|
||||
|
||||
case '\\':
|
||||
if (quoted) {
|
||||
i++;
|
||||
}
|
||||
break;
|
||||
|
||||
case '#':
|
||||
if (!quoted) {
|
||||
return i;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
// Do nothing and consume the character
|
||||
}
|
||||
}
|
||||
checkArgument(!quoted, "mismatched quotes in: %s", s);
|
||||
return -1;
|
||||
}
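// Worked examples of the comment stripping above (illustrative): removeComment("&a < b # note")
// returns "&a < b"; removeComment("&x < 'a#b' # note") returns "&x < 'a#b'" because the first
// '#' is quoted; and a line with an unterminated quote (e.g. "&x < 'a#b") fails the
// mismatched-quotes check with an IllegalArgumentException.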
|
||||
|
||||
private CollationMapper() {}
|
||||
}
|
|
@ -0,0 +1,98 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.mapper;
|
||||
|
||||
import static org.unicode.cldr.api.AttributeKey.keyOf;
|
||||
import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
|
||||
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import org.unicode.cldr.api.AttributeKey;
|
||||
import org.unicode.cldr.api.CldrData;
|
||||
import org.unicode.cldr.api.CldrData.PrefixVisitor;
|
||||
import org.unicode.cldr.api.CldrDataSupplier;
|
||||
import org.unicode.cldr.api.CldrDataType;
|
||||
import org.unicode.cldr.api.CldrPath;
|
||||
import org.unicode.cldr.api.CldrValue;
|
||||
|
||||
import org.unicode.icu.tool.cldrtoicu.IcuData;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbPath;
|
||||
|
||||
/**
|
||||
* A mapper to collect day-period data from {@link CldrDataType#SUPPLEMENTAL SUPPLEMENTAL}
|
||||
* data via the paths:
|
||||
* <pre>{@code
|
||||
* //supplementalData/dayPeriodRuleSet/*
|
||||
* }</pre>
|
||||
*/
|
||||
public final class DayPeriodsMapper {
|
||||
private static final PathMatcher RULESET =
|
||||
PathMatcher.of("supplementalData/dayPeriodRuleSet");
|
||||
private static final AttributeKey RULESET_TYPE = keyOf("dayPeriodRuleSet", "type");
|
||||
|
||||
private static final PathMatcher RULES = PathMatcher.of("dayPeriodRules[@locales=*]");
|
||||
private static final AttributeKey RULES_LOCALES = keyOf("dayPeriodRules", "locales");
|
||||
|
||||
private static final PathMatcher RULE = PathMatcher.of("dayPeriodRule[@type=*]");
|
||||
private static final AttributeKey RULE_TYPE = keyOf("dayPeriodRule", "type");
|
||||
|
||||
private static final RbPath RB_LOCALES = RbPath.of("locales");
|
||||
|
||||
/**
|
||||
* Processes data from the given supplier to generate day-period ICU data.
|
||||
*
|
||||
* @param src the CLDR data supplier to process.
|
||||
* @return the IcuData instance to be written to a file.
|
||||
*/
|
||||
public static IcuData process(CldrDataSupplier src) {
|
||||
RuleSetVisitor mapper = new RuleSetVisitor();
|
||||
CldrData data = src.getDataForType(SUPPLEMENTAL);
|
||||
data.accept(ARBITRARY, mapper);
|
||||
return mapper.icuData;
|
||||
}
|
||||
|
||||
private static final class RuleSetVisitor implements PrefixVisitor {
|
||||
// The mutable ICU data into which results are collected during visitation.
|
||||
private final IcuData icuData = new IcuData("dayPeriods", false);
|
||||
private int setNum = 0;
|
||||
|
||||
@Override
|
||||
public void visitPrefixStart(CldrPath prefix, Context ctx) {
|
||||
if (RULESET.matches(prefix)) {
|
||||
ctx.install(new RuleVisitor(RULESET_TYPE.optionalValueFrom(prefix)));
|
||||
}
|
||||
}
|
||||
|
||||
private final class RuleVisitor implements PrefixVisitor {
|
||||
private final RbPath localePrefix;
|
||||
|
||||
private RuleVisitor(Optional<String> type) {
|
||||
// If there's a given type, add it to the prefix path.
|
||||
this.localePrefix = type.map(t -> RbPath.of("locales_" + t)).orElse(RB_LOCALES);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visitPrefixStart(CldrPath prefix, Context ctx) {
|
||||
if (RULES.matchesSuffixOf(prefix)) {
|
||||
// Sets are arbitrarily identified by the string "setNN".
|
||||
String setName = "set" + (++setNum);
|
||||
RULES_LOCALES.listOfValuesFrom(prefix)
|
||||
.forEach(locale -> icuData.add(localePrefix.extendBy(locale), setName));
|
||||
ctx.install(this::visitRule);
|
||||
}
|
||||
}
|
||||
|
||||
private void visitRule(CldrValue value) {
|
||||
if (RULE.matchesSuffixOf(value.getPath())) {
|
||||
RbPath prefix = RbPath.of("rules", "set" + setNum, RULE_TYPE.valueFrom(value));
|
||||
value.getValueAttributes()
|
||||
.forEach((k, v) -> icuData.add(prefix.extendBy(k.getAttributeName()), v));
|
||||
}
|
||||
}
|
||||
}
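// Sketch of the resulting structure (locales and attribute values are hypothetical): a
// <dayPeriodRules locales="en cy"> element becomes locales/en and locales/cy entries pointing
// at "set1", and a <dayPeriodRule type="morning1" from="06:00" before="12:00"/> inside it adds
// rules/set1/morning1/from = "06:00" and rules/set1/morning1/before = "12:00".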
|
||||
}
|
||||
|
||||
private DayPeriodsMapper() {}
|
||||
}
|
|
@ -0,0 +1,183 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.mapper;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
import static com.google.common.base.Preconditions.checkState;
|
||||
import static com.google.common.collect.Ordering.natural;
|
||||
import static org.unicode.cldr.api.AttributeKey.keyOf;
|
||||
import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
|
||||
import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.RESOLVED;
|
||||
import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import org.unicode.cldr.api.AttributeKey;
|
||||
import org.unicode.cldr.api.CldrData;
|
||||
import org.unicode.cldr.api.CldrData.ValueVisitor;
|
||||
import org.unicode.cldr.api.CldrDataSupplier;
|
||||
import org.unicode.cldr.api.CldrDataType;
|
||||
import org.unicode.cldr.api.CldrValue;
|
||||
|
||||
import com.google.common.collect.ImmutableListMultimap;
|
||||
import com.google.common.collect.LinkedHashMultimap;
|
||||
import com.google.common.collect.SetMultimap;
|
||||
import org.unicode.icu.tool.cldrtoicu.IcuData;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.DynamicVars;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbPath;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbValue;
|
||||
import org.unicode.icu.tool.cldrtoicu.SupplementalData;
|
||||
|
||||
/**
|
||||
* Generate locale {@link IcuData} by transforming {@link CldrDataType#LDML LDML} data using a
|
||||
* {@link PathValueTransformer}.
|
||||
*
|
||||
* <p>This is currently driven by the {@code ldml2icu_locale.txt} configuration file via a
|
||||
* {@code RegexTransformer}, but could use any {@link PathValueTransformer} implementation.
|
||||
*/
|
||||
public final class LocaleMapper {
|
||||
// Match territory paths so we can skip processing deprecated territories.
|
||||
private static final PathMatcher TERRITORY = PathMatcher.of(
|
||||
"ldml/localeDisplayNames/territories/territory[@type=*]");
|
||||
private static final AttributeKey TERRITORY_TYPE = keyOf("territory", "type");
|
||||
|
||||
// The default calendar (only set if different from the inherited parent value).
|
||||
private static final RbPath RB_CALENDAR = RbPath.of("calendar", "default");
|
||||
|
||||
/**
|
||||
* Processes data from the given supplier to generate general locale data for the given locale
|
||||
* ID.
|
||||
*
|
||||
* @param localeId the locale ID to generate data for.
|
||||
* @param src the CLDR data supplier to process.
|
||||
* @param icuSpecialData additional ICU data (in the "icu:" namespace)
|
||||
* @param transformer the transformer to match and transform each CLDR path/value pair.
|
||||
* @param supplementalData additional necessary data derived from
|
||||
* {@link org.unicode.cldr.api.CldrDataType#SUPPLEMENTAL SUPPLEMENTAL} data.
|
||||
* @return IcuData containing locale data for the given locale ID.
|
||||
*/
|
||||
public static IcuData process(
|
||||
String localeId,
|
||||
CldrDataSupplier src,
|
||||
Optional<CldrData> icuSpecialData,
|
||||
PathValueTransformer transformer,
|
||||
SupplementalData supplementalData) {
|
||||
|
||||
IcuData icuData = new IcuData(localeId, true);
|
||||
// Write out the results into the IcuData class, preserving result grouping and expanding
|
||||
// path references as necessary.
|
||||
ResultsCollector collector = new ResultsCollector(transformer);
|
||||
icuData.addResults(collector.collectResultsFor(localeId, src, icuSpecialData));
|
||||
doDateTimeHack(icuData);
|
||||
supplementalData.getDefaultCalendar(icuData.getName())
|
||||
.ifPresent(c -> icuData.add(RB_CALENDAR, c));
|
||||
return icuData;
|
||||
}
|
||||
|
||||
// This is an awful hack for post-processing the date-time format patterns to inject a 13th
|
||||
// pattern at index 8, which is just a duplicate of the "medium" date-time pattern. The reasons
|
||||
// for this are lost in the mists of time, but essentially there's ICU library code that just
|
||||
// expects the value at index 8 to be this "default" value, and reads the date-time values
|
||||
// starting at index 9.
|
||||
//
|
||||
// Before the hack, the "medium" date-time pattern is at index 10, since there are 3 groups:
|
||||
// "time" -> "date" -> "date-time"
|
||||
// with 4 patterns each:
|
||||
// "full" -> "long" -> "medium" -> "short"
|
||||
private static void doDateTimeHack(IcuData icuData) {
|
||||
for (RbPath rbPath : icuData.getPaths()) {
|
||||
if (rbPath.length() == 3
|
||||
&& rbPath.getSegment(0).equals("calendar")
|
||||
&& rbPath.getSegment(2).equals("DateTimePatterns")) {
|
||||
// This cannot be null and should not be empty, since the path is in this data.
|
||||
List<RbValue> valuesToHack = icuData.get(rbPath);
|
||||
checkArgument(valuesToHack.size() == 12,
|
||||
"unexpected number of date/time patterns for '%s': %s", rbPath, valuesToHack);
|
||||
valuesToHack.add(8, valuesToHack.get(10));
|
||||
}
|
||||
}
|
||||
}
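// For illustration, the 12 incoming patterns are indexed as:
//   0-3:  time      (full, long, medium, short)
//   4-7:  date      (full, long, medium, short)
//   8-11: date-time (full, long, medium, short)
// so the "medium" date-time pattern sits at index 10. After the insertion above it is
// duplicated at index 8, the date-time patterns shift to indices 9-12, and the list has
// 13 entries.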
|
||||
|
||||
private static final class ResultsCollector {
|
||||
private final PathValueTransformer transformer;
|
||||
private final Set<RbPath> validRbPaths = new HashSet<>();
|
||||
|
||||
// WARNING: TreeMultimap() is NOT suitable here, even though it would sort the values for
|
||||
// each key. The reason is that result comparison is not "consistent with equals", and
|
||||
// TreeMultimap uses the comparator to decide if two elements are equal (not the equals()
|
||||
// method), and it does this even if using the add() method of the sorted set (this is in
|
||||
// fact in violation of the stated behaviour of Set#add).
|
||||
private final SetMultimap<RbPath, Result> resultsByRbPath = LinkedHashMultimap.create();
|
||||
|
||||
ResultsCollector(PathValueTransformer transformer) {
|
||||
this.transformer = checkNotNull(transformer);
|
||||
}
|
||||
|
||||
ImmutableListMultimap<RbPath, Result> collectResultsFor(
|
||||
String localeId, CldrDataSupplier src, Optional<CldrData> icuSpecialData) {
|
||||
|
||||
CldrData unresolved = src.getDataForLocale(localeId, UNRESOLVED);
|
||||
CldrData resolved = src.getDataForLocale(localeId, RESOLVED);
|
||||
DynamicVars varFn = p -> {
|
||||
CldrValue cldrValue = resolved.get(p);
|
||||
return cldrValue != null ? cldrValue.getValue() : null;
|
||||
};
|
||||
|
||||
collectPaths(unresolved, varFn);
|
||||
collectResults(resolved, varFn);
|
||||
icuSpecialData.ifPresent(s -> collectSpecials(s, varFn));
|
||||
|
||||
ImmutableListMultimap.Builder<RbPath, Result> out = ImmutableListMultimap.builder();
|
||||
out.orderValuesBy(natural());
|
||||
for (RbPath rbPath : resultsByRbPath.keySet()) {
|
||||
Set<Result> existingResults = resultsByRbPath.get(rbPath);
|
||||
out.putAll(rbPath, existingResults);
|
||||
for (Result fallback : transformer.getFallbackResultsFor(rbPath, varFn)) {
|
||||
if (existingResults.stream().noneMatch(fallback::isFallbackFor)) {
|
||||
out.put(rbPath, fallback);
|
||||
}
|
||||
}
|
||||
}
|
||||
return out.build();
|
||||
}
|
||||
|
||||
private void collectPaths(CldrData unresolved, DynamicVars varFn) {
|
||||
ValueVisitor collectPaths =
|
||||
v -> transformer.transform(v, varFn).forEach(this::collectResultPath);
|
||||
unresolved.accept(DTD, collectPaths);
|
||||
}
|
||||
|
||||
private void collectResultPath(Result result) {
|
||||
RbPath rbPath = result.getKey();
|
||||
validRbPaths.add(rbPath);
|
||||
if (rbPath.isAnonymous()) {
|
||||
RbPath parent = rbPath.getParent();
|
||||
checkState(!parent.isAnonymous(),
|
||||
"anonymous paths should not be nested: %s", rbPath);
|
||||
validRbPaths.add(parent);
|
||||
}
|
||||
}
|
||||
|
||||
void collectResults(CldrData resolved, DynamicVars varFn) {
|
||||
ValueVisitor collectResults =
|
||||
v -> transformer.transform(v, varFn).stream()
|
||||
.filter(r -> validRbPaths.contains(r.getKey()))
|
||||
.forEach(r -> resultsByRbPath.put(r.getKey(), r));
|
||||
resolved.accept(DTD, collectResults);
|
||||
}
|
||||
|
||||
private void collectSpecials(CldrData cldrData, DynamicVars varFn) {
|
||||
cldrData.accept(DTD, v ->
|
||||
transformer.transform(v, varFn).forEach(r -> resultsByRbPath.put(r.getKey(), r)));
|
||||
}
|
||||
}
|
||||
|
||||
private LocaleMapper() {}
|
||||
}
|
|
@ -0,0 +1,88 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.mapper;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkState;
|
||||
import static org.unicode.cldr.api.AttributeKey.keyOf;
|
||||
import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
|
||||
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
|
||||
|
||||
import org.unicode.cldr.api.AttributeKey;
|
||||
import org.unicode.cldr.api.CldrData;
|
||||
import org.unicode.cldr.api.CldrData.PrefixVisitor;
|
||||
import org.unicode.cldr.api.CldrDataSupplier;
|
||||
import org.unicode.cldr.api.CldrDataType;
|
||||
import org.unicode.cldr.api.CldrPath;
|
||||
import org.unicode.cldr.api.CldrValue;
|
||||
|
||||
import org.unicode.icu.tool.cldrtoicu.IcuData;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbPath;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbValue;
|
||||
|
||||
/**
|
||||
* A mapper to collect plural data from {@link CldrDataType#SUPPLEMENTAL SUPPLEMENTAL} data via
|
||||
* the paths:
|
||||
* <pre>{@code
|
||||
* //supplementalData/plurals/pluralRanges[@locales=*]/...
|
||||
* }</pre>
|
||||
*/
|
||||
public final class PluralRangesMapper {
|
||||
private static final PathMatcher RANGES =
|
||||
PathMatcher.of("supplementalData/plurals/pluralRanges[@locales=*]");
|
||||
private static final AttributeKey RANGES_LOCALES = keyOf("pluralRanges", "locales");
|
||||
|
||||
private static final PathMatcher RANGE = PathMatcher.of("pluralRange[@start=*][@end=*]");
|
||||
private static final AttributeKey RANGE_START = keyOf("pluralRange", "start");
|
||||
private static final AttributeKey RANGE_END = keyOf("pluralRange", "end");
|
||||
private static final AttributeKey RANGE_RESULT = keyOf("pluralRange", "result");
|
||||
|
||||
private static final RbPath RB_RULES = RbPath.of("rules");
|
||||
private static final RbPath RB_LOCALES = RbPath.of("locales");
|
||||
|
||||
/**
|
||||
* Processes data from the given supplier to generate plural-range ICU data.
|
||||
*
|
||||
* @param src the CLDR data supplier to process.
|
||||
* @return the IcuData instance to be written to a file.
|
||||
*/
|
||||
public static IcuData process(CldrDataSupplier src) {
|
||||
PluralRangesVisitor visitor = new PluralRangesVisitor();
|
||||
CldrData data = src.getDataForType(SUPPLEMENTAL);
|
||||
data.accept(ARBITRARY, visitor);
|
||||
return visitor.icuData;
|
||||
}
|
||||
|
||||
private static final class PluralRangesVisitor implements PrefixVisitor {
|
||||
private final IcuData icuData = new IcuData("pluralRanges", false);
|
||||
|
||||
private int setIndex = 0;
|
||||
private String ruleLabel = null;
|
||||
|
||||
@Override
|
||||
public void visitPrefixStart(CldrPath prefix, Context ctx) {
|
||||
// Captured type is either "cardinal" or "ordinal" (and will cause exception otherwise).
|
||||
if (RANGES.matches(prefix)) {
|
||||
ruleLabel = String.format("set%02d", setIndex++);
|
||||
RANGES_LOCALES.listOfValuesFrom(prefix)
|
||||
.forEach(l -> icuData.add(RB_LOCALES.extendBy(l), ruleLabel));
|
||||
ctx.install(this::visitRange);
|
||||
}
|
||||
}
|
||||
|
||||
private void visitRange(CldrValue value) {
|
||||
checkState(RANGE.matchesSuffixOf(value.getPath()),
|
||||
"unexpected path: %s", value.getPath());
|
||||
// Note: "range:start" and "range:end" are optional attributes, but the CLDR DTD
|
||||
// specifies a default via comments. They should probably be changed to just have a
|
||||
// default in the DTD (and possibly converted to use an enum here).
|
||||
icuData.add(RB_RULES.extendBy(ruleLabel),
|
||||
RbValue.of(
|
||||
RANGE_START.valueFrom(value, "all"),
|
||||
RANGE_END.valueFrom(value, "all"),
|
||||
RANGE_RESULT.valueFrom(value)));
|
||||
}
|
||||
}
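// Sketch of the resulting structure (locales and range values are hypothetical): a
// <pluralRanges locales="en fr"> block becomes locales/en and locales/fr entries pointing at
// "set00", and each <pluralRange start="one" end="few" result="few"/> inside it adds the
// triple (one, few, few) under rules/set00; omitted start/end attributes default to "all".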
|
||||
|
||||
private PluralRangesMapper() {}
|
||||
}
|
|
@ -0,0 +1,150 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.mapper;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
import static com.google.common.base.Preconditions.checkState;
|
||||
import static org.unicode.cldr.api.AttributeKey.keyOf;
|
||||
import static org.unicode.cldr.api.CldrData.PathOrder.ARBITRARY;
|
||||
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.unicode.cldr.api.AttributeKey;
|
||||
import org.unicode.cldr.api.CldrData;
|
||||
import org.unicode.cldr.api.CldrData.PrefixVisitor;
|
||||
import org.unicode.cldr.api.CldrDataSupplier;
|
||||
import org.unicode.cldr.api.CldrDataType;
|
||||
import org.unicode.cldr.api.CldrPath;
|
||||
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import com.google.common.collect.Iterables;
|
||||
import org.unicode.icu.tool.cldrtoicu.IcuData;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbPath;
|
||||
|
||||
/**
|
||||
* A mapper to collect plural data from {@link CldrDataType#SUPPLEMENTAL SUPPLEMENTAL} data via
|
||||
* the paths:
|
||||
* <pre>{@code
|
||||
* //supplementalData/plurals[@type=*]/pluralRules[@locales=*]/pluralRule[@count=*]
|
||||
* }</pre>
|
||||
*/
|
||||
public final class PluralsMapper {
|
||||
private static final PathMatcher PLURALS = PathMatcher.of("supplementalData/plurals[@type=*]");
|
||||
private static final AttributeKey PLURALS_TYPE = keyOf("plurals", "type");
|
||||
|
||||
private static final PathMatcher RULES = PathMatcher.of("pluralRules[@locales=*]");
|
||||
private static final AttributeKey RULES_LOCALES = keyOf("pluralRules", "locales");
|
||||
|
||||
private static final PathMatcher RULE = PathMatcher.of("pluralRule[@count=*]");
|
||||
private static final AttributeKey RULE_COUNT = keyOf("pluralRule", "count");
|
||||
|
||||
private static final ImmutableMap<String, RbPath> ICU_PREFIX_MAP =
|
||||
ImmutableMap.of("cardinal", RbPath.of("locales"), "ordinal", RbPath.of("locales_ordinals"));
|
||||
|
||||
/**
|
||||
* Processes data from the given supplier to generate plural ICU data.
|
||||
*
|
||||
* @param src the CLDR data supplier to process.
|
||||
* @return the IcuData instance to be written to a file.
|
||||
*/
|
||||
public static IcuData process(CldrDataSupplier src) {
|
||||
PluralsVisitor visitor = new PluralsVisitor();
|
||||
CldrData data = src.getDataForType(SUPPLEMENTAL);
|
||||
// Note: We explicitly reset the type to mimic the order of the existing code, since this
|
||||
// affects the set indices we generate during processing. Ideally this would all be immune
|
||||
// to ordering (or just enforce DTD ordering) but right now it's very dependent on
|
||||
// mimicking the order of the existing code to get identical output.
|
||||
data.accept(ARBITRARY, visitor.setType("cardinal"));
|
||||
data.accept(ARBITRARY, visitor.setType("ordinal"));
|
||||
return visitor.icuData;
|
||||
}
|
||||
|
||||
private static final class PluralsVisitor implements PrefixVisitor {
|
||||
// The mutable ICU data into which results are collected during visitation.
|
||||
// In a post XML-aware API, is recording the XML file names really a good idea?
|
||||
private final IcuData icuData = new IcuData("plurals", false);
|
||||
// Filter for the type we are processing now (this could be removed if we don't mind which
|
||||
// order the types are processed, and switching to DTD ordering would make it stable).
|
||||
private String type = null;
|
||||
private final List<ImmutableMap<String, String>> previousRules = new ArrayList<>();
|
||||
|
||||
// Hack method to allow a single type to be processed at a time (the visitor would otherwise
|
||||
// happily handle both types in a single pass). We can't do this as two different visitors
|
||||
// (one for each type) because the current behaviour relies on carrying over the calculated
|
||||
// set numbers from one pass to the next. Once migration is complete we should revisit this
|
||||
// and allow this visitor to work in a single pass (probably with DTD order for stability).
|
||||
PluralsVisitor setType(String type) {
|
||||
this.type = checkNotNull(type);
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visitPrefixStart(CldrPath prefix, Context ctx) {
|
||||
if (PLURALS.matches(prefix)) {
|
||||
// Note: "plurals:type" is an optional attribute but the CLDR DTD specifies a
|
||||
// default via comments. It should probably be changed to just have a default in
|
||||
// the DTD.
|
||||
if (PLURALS_TYPE.valueFrom(prefix, "cardinal").equals(type)) {
|
||||
ctx.install(new RulesVisitor(ICU_PREFIX_MAP.get(type)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private final class RulesVisitor implements PrefixVisitor {
|
||||
private final RbPath icuPrefix;
|
||||
private final List<String> locales = new ArrayList<>();
|
||||
private final Map<String, String> rules = new LinkedHashMap<>();
|
||||
|
||||
RulesVisitor(RbPath icuPrefix) {
|
||||
this.icuPrefix = checkNotNull(icuPrefix);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visitPrefixStart(CldrPath prefix, Context ctx) {
|
||||
if (RULES.matchesSuffixOf(prefix)) {
|
||||
Iterables.addAll(locales, RULES_LOCALES.listOfValuesFrom(prefix));
|
||||
ctx.install(value -> {
|
||||
if (RULE.matchesSuffixOf(value.getPath())) {
|
||||
rules.put(RULE_COUNT.valueFrom(value), value.getValue());
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visitPrefixEnd(CldrPath prefix) {
|
||||
checkState(!locales.isEmpty(), "missing locale data for plurals: %s", prefix);
|
||||
// Note: The original mapper code "sort of" coped with empty rules, but it's not
|
||||
// completely well behaved (or documented), so since this doesn't happen in the
|
||||
// current CLDR data, I decided to just prohibit it in the new code. Support can
|
||||
// easily be added in once the expected semantics are clear.
|
||||
checkState(!rules.isEmpty(), "missing rule data for plurals: %s", prefix);
|
||||
|
||||
// Have we seen this set of rules before? If so, reuse the existing index. Note
|
||||
// that an IDE might report this call as suspicious because the key is not yet an
|
||||
// immutable map (saves creating immutable maps just to check for inclusion) but
|
||||
// this is fine because collection equality is based only on contents, not
|
||||
// collection type.
|
||||
int idx = previousRules.indexOf(rules);
|
||||
if (idx == -1) {
|
||||
int newIdx = previousRules.size();
|
||||
rules.forEach((k, v) -> icuData.add(RbPath.of("rules", "set" + newIdx, k), v));
|
||||
// Since "rules" is mutable and reused, we must take an immutable copy here.
|
||||
previousRules.add(ImmutableMap.copyOf(rules));
|
||||
idx = newIdx;
|
||||
}
|
||||
String setName = "set" + idx;
|
||||
locales.forEach(locale -> icuData.add(icuPrefix.extendBy(locale), setName));
|
||||
rules.clear();
|
||||
locales.clear();
|
||||
}
|
||||
}
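// Illustrative consequence of the de-duplication above (locales are hypothetical): if the
// rules collected for "en" produce a count-to-rule map identical to one already recorded as
// set3, no new rules/setN entries are written and locales/en simply points at "set3".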
|
||||
}
|
||||
|
||||
private PluralsMapper() {}
|
||||
}
|
|
@ -0,0 +1,145 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.mapper;
|
||||
|
||||
import static org.unicode.cldr.api.AttributeKey.keyOf;
|
||||
import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
|
||||
import static org.unicode.cldr.api.CldrDataSupplier.CldrResolution.UNRESOLVED;
|
||||
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
import org.unicode.cldr.api.AttributeKey;
|
||||
import org.unicode.cldr.api.CldrData;
|
||||
import org.unicode.cldr.api.CldrData.PrefixVisitor;
|
||||
import org.unicode.cldr.api.CldrDataSupplier;
|
||||
import org.unicode.cldr.api.CldrDataType;
|
||||
import org.unicode.cldr.api.CldrPath;
|
||||
|
||||
import com.google.common.escape.UnicodeEscaper;
|
||||
import org.unicode.icu.tool.cldrtoicu.IcuData;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbPath;
|
||||
|
||||
/**
|
||||
* A mapper to collect plural data from {@link CldrDataType#LDML LDML} data via the paths:
|
||||
* <pre>{@code
|
||||
* //ldml/rbnf/rulesetGrouping[@type=*]/ruleset[@type=*]
|
||||
* }</pre>
|
||||
*/
|
||||
// TODO: This class can almost certainly be written using RegexTransformer and a small config.
|
||||
public final class RbnfMapper {
|
||||
private static final PathMatcher RULE_SET =
|
||||
PathMatcher.of("ldml/rbnf/rulesetGrouping[@type=*]/ruleset[@type=*]");
|
||||
private static final AttributeKey GROUPING_TYPE = keyOf("rulesetGrouping", "type");
|
||||
private static final AttributeKey RULESET_TYPE = keyOf("ruleset", "type");
|
||||
|
||||
private static final PathMatcher RBNF_RULE = PathMatcher.of("rbnfrule");
|
||||
private static final AttributeKey RBNF_VALUE = keyOf("rbnfrule", "value");
|
||||
private static final AttributeKey RBNF_RADIX = keyOf("rbnfrule", "radix");
|
||||
private static final AttributeKey RULESET_ACCESS = keyOf("ruleset", "access");
|
||||
|
||||
private static final RbPath RB_PARENT = RbPath.of("%%Parent");
|
||||
// This is the ICU path prefix, below which everything generated by this visitor will go.
|
||||
private static final RbPath RB_ROOT = RbPath.of("RBNFRules");
|
||||
|
||||
/**
|
||||
* Processes data from the given supplier to generate RBNF data for the given locale ID.
|
||||
*
|
||||
* @param localeId the locale ID to generate data for.
|
||||
* @param src the CLDR data supplier to process.
|
||||
* @param icuSpecialData additional ICU data (in the "icu:" namespace)
|
||||
* @return IcuData containing RBNF data for the given locale ID.
|
||||
*/
|
||||
public static IcuData process(
|
||||
String localeId, CldrDataSupplier src, Optional<CldrData> icuSpecialData) {
|
||||
|
||||
// Using DTD order is essential here because the RBNF paths contain ordered elements,
|
||||
// so we must ensure that they appear in sorted order (otherwise we'd have to do more
|
||||
// work at this end to re-sort the results).
|
||||
RulesetVisitor visitor = new RulesetVisitor(localeId);
|
||||
icuSpecialData.ifPresent(s -> s.accept(DTD, visitor));
|
||||
src.getDataForLocale(localeId, UNRESOLVED).accept(DTD, visitor);
|
||||
return visitor.icuData;
|
||||
}
|
||||
|
||||
static final class RulesetVisitor implements PrefixVisitor {
|
||||
|
||||
private final IcuData icuData;
|
||||
|
||||
private RulesetVisitor(String localeId) {
|
||||
this.icuData = new IcuData(localeId, true);
|
||||
}
|
||||
|
||||
@Override public void visitPrefixStart(CldrPath prefix, Context context) {
|
||||
if (RULE_SET.matchesPrefixOf(prefix)) {
|
||||
RbPath rbPath = RB_ROOT.extendBy(GROUPING_TYPE.valueFrom(prefix));
|
||||
String rulesetType = RULESET_TYPE.valueFrom(prefix);
|
||||
boolean isStrict = !"lenient-parse".equals(rulesetType);
|
||||
|
||||
// This is rather hacky because the access attribute lives on the parent path
// element, but we cannot use it until we visit the child values (because it's a
// value attribute and will not be in the prefix path). So we need to add the header
// only once, just before we start adding the values relating to the child
// elements, which is why we need a flag.
//
// This cannot be a plain boolean local variable since it must be "effectively final"
// to be captured by the lambda below.
|
||||
AtomicBoolean hasHeader = new AtomicBoolean(false);
|
||||
context.install(
|
||||
value -> {
|
||||
if (RBNF_RULE.matchesSuffixOf(value.getPath())) {
|
||||
if (!hasHeader.get()) {
|
||||
boolean isPrivate =
|
||||
RULESET_ACCESS.valueFrom(value, "public").equals("private");
|
||||
icuData.add(rbPath, (isPrivate ? "%%" : "%") + rulesetType + ":");
|
||||
hasHeader.set(true);
|
||||
}
|
||||
String rulePrefix = "";
|
||||
if (isStrict) {
|
||||
String basePrefix = RBNF_VALUE.valueFrom(value);
|
||||
rulePrefix = RBNF_RADIX.optionalValueFrom(value)
|
||||
.map(r -> basePrefix + "/" + r)
|
||||
.orElse(basePrefix);
|
||||
rulePrefix += ": ";
|
||||
}
|
||||
icuData.add(
|
||||
rbPath,
|
||||
rulePrefix + ESCAPE_RBNF_DATA.escape(value.getValue()));
|
||||
}
|
||||
});
|
||||
}
|
||||
}
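// Sketch of the output for one rule (the element content is hypothetical): a public ruleset
// "spellout-numbering" containing <rbnfrule value="20" radix="10">vingt[-→→];</rbnfrule>
// first emits the header "%spellout-numbering:" (or "%%..." if access is private), then the
// line "20/10: vingt[->>];" (the arrows are rewritten by the escaper below); a
// "lenient-parse" ruleset gets no value/radix prefix at all.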
|
||||
|
||||
/*
|
||||
* Convert characters outside the range U+0020 to U+007F to Unicode escapes, and convert
|
||||
* backslash to a double backslash. This class is super slow for non-ASCII escaping due to
|
||||
* using "String.format()", however there's < 100 values that need any escaping, so it's
|
||||
* fine.
|
||||
*/
|
||||
private static final UnicodeEscaper ESCAPE_RBNF_DATA = new UnicodeEscaper() {
|
||||
private final char[] DOUBLE_BACKSLASH = "\\\\".toCharArray();
|
||||
private final char[] LEFT_ANGLE = "<".toCharArray();
|
||||
private final char[] RIGHT_ANGLE = ">".toCharArray();
|
||||
|
||||
@Override
|
||||
protected char[] escape(int cp) {
|
||||
// Returning null means "do not escape".
|
||||
switch (cp) {
|
||||
case '\\':
|
||||
return DOUBLE_BACKSLASH;
|
||||
case '←':
|
||||
return LEFT_ANGLE;
|
||||
case '→':
|
||||
return RIGHT_ANGLE;
|
||||
default:
|
||||
if (0x0020 <= cp && cp <= 0x007F) {
|
||||
return null;
|
||||
} else if (cp <= 0xFFFF) {
|
||||
return String.format("\\u%04X", cp).toCharArray();
|
||||
}
|
||||
return String.format("\\U%08X", cp).toCharArray();
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
|
@ -0,0 +1,119 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.mapper;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
import static com.google.common.collect.Ordering.natural;
|
||||
import static org.unicode.cldr.api.CldrData.PathOrder.NESTED_GROUPING;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
import org.unicode.cldr.api.CldrData;
|
||||
import org.unicode.cldr.api.CldrDataSupplier;
|
||||
import org.unicode.cldr.api.CldrDataType;
|
||||
import org.unicode.cldr.api.CldrValue;
|
||||
|
||||
import com.google.common.collect.ImmutableListMultimap;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import com.google.common.collect.LinkedHashMultimap;
|
||||
import com.google.common.collect.SetMultimap;
|
||||
import org.unicode.icu.tool.cldrtoicu.IcuData;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbPath;
|
||||
|
||||
/**
|
||||
* Generate supplemental {@link IcuData} by transforming {@link CldrDataType#SUPPLEMENTAL
|
||||
* SUPPLEMENTAL} data using a {@link PathValueTransformer}.
|
||||
*
|
||||
* <p>This is currently driven by the {@code ldml2icu_supplemental.txt} configuration file via a
|
||||
* {@code RegexTransformer}, but could use any {@link PathValueTransformer} implementation.
|
||||
*/
|
||||
public final class SupplementalMapper {
|
||||
private static final RbPath RB_FIFO = RbPath.of("<FIFO>");
|
||||
|
||||
/**
|
||||
* Processes a subset of supplemental data from the given supplier.
|
||||
*
|
||||
* @param src the CLDR data supplier to process.
|
||||
* @param transformer the transformer to match and transform each CLDR path/value pair.
|
||||
* @param icuName the name for the generated IcuData.
|
||||
* @param includePaths a matcher to select the CLDR paths to be transformed.
|
||||
* @return An IcuData instance containing the specified subset of supplemental data with the
|
||||
* given ICU name.
|
||||
*/
|
||||
// TODO: Improve external data splitting and remove need for a PathMatcher here.
|
||||
public static IcuData process(
|
||||
CldrDataSupplier src, PathValueTransformer transformer, String icuName,
|
||||
PathMatcher includePaths) {
|
||||
ResultsCollector collector = new ResultsCollector(includePaths, transformer);
|
||||
// Write out the results into the IcuData class, preserving result grouping and expanding
|
||||
// path references as necessary.
|
||||
IcuData icuData = new IcuData(icuName, false);
|
||||
icuData.addResults(collector.getResults(src));
|
||||
return icuData;
|
||||
}
|
||||
|
||||
private static final class ResultsCollector {
|
||||
private final PathMatcher pathMatcher;
|
||||
private final PathValueTransformer transformer;
|
||||
|
||||
// WARNING: TreeMultimap() is NOT suitable here, even though it would sort the values for
|
||||
// each key. The reason is that result comparison is not "consistent with equals", and
|
||||
// TreeMultimap uses the comparator to decide if two elements are equal (not the equals()
|
||||
// method), and it does this even if using the add() method of the sorted set (this is in
|
||||
// fact in violation of the stated behaviour of Set#add).
|
||||
private final SetMultimap<RbPath, Result> resultsByRbPath = LinkedHashMultimap.create();
|
||||
private int fifoCounter = 0;
|
||||
|
||||
ResultsCollector(PathMatcher pathMatcher, PathValueTransformer transformer) {
|
||||
this.pathMatcher = checkNotNull(pathMatcher);
|
||||
this.transformer = checkNotNull(transformer);
|
||||
}
|
||||
|
||||
private void visit(CldrValue value) {
|
||||
if (pathMatcher.matchesPrefixOf(value.getPath())) {
|
||||
for (Result r : transformer.transform(value)) {
|
||||
RbPath rbPath = r.getKey();
|
||||
if (rbPath.contains(RB_FIFO)) {
|
||||
// The fifo counter needs to be formatted with leading zeros for sorting.
|
||||
rbPath = rbPath.mapSegments(
|
||||
s -> s.equals("<FIFO>") ? String.format("<%04d>", fifoCounter) : s);
|
||||
}
|
||||
resultsByRbPath.put(rbPath, r);
|
||||
}
|
||||
fifoCounter++;
|
||||
}
|
||||
}
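// Illustrative effect of the <FIFO> handling above (the path itself is hypothetical): with
// fifoCounter at 12, a result path such as "convertUnits/<FIFO>/factor" is rewritten to
// "convertUnits/<0012>/factor", so unlabeled results keep their encounter order when sorted.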
|
||||
|
||||
ImmutableListMultimap<RbPath, Result> getResults(CldrDataSupplier supplier) {
|
||||
// DTD and NESTED_GROUPING order differ because of how the magic <FIFO> label works (it
|
||||
// basically enforces "encounter order" onto things in unlabeled sequences, which matches
|
||||
// the old behaviour). If it wouldn't break anything, it might be worth moving to DTD order
|
||||
// to remove any lingering implicit dependencies on the CLDR data behaviour.
|
||||
CldrData supplementalData = supplier.getDataForType(CldrDataType.SUPPLEMENTAL);
|
||||
PathValueTransformer.DynamicVars varFn = p -> {
|
||||
CldrValue cldrValue = supplementalData.get(p);
|
||||
return cldrValue != null ? cldrValue.getValue() : null;
|
||||
};
|
||||
|
||||
supplementalData.accept(NESTED_GROUPING, this::visit);
|
||||
|
||||
ImmutableListMultimap.Builder<RbPath, Result> out = ImmutableListMultimap.builder();
|
||||
out.orderValuesBy(natural());
|
||||
for (RbPath rbPath : resultsByRbPath.keySet()) {
|
||||
Set<Result> existingResults = resultsByRbPath.get(rbPath);
|
||||
out.putAll(rbPath, existingResults);
|
||||
for (Result fallback : transformer.getFallbackResultsFor(rbPath, varFn)) {
|
||||
if (existingResults.stream().noneMatch(fallback::isFallbackFor)) {
|
||||
out.put(rbPath, fallback);
|
||||
}
|
||||
}
|
||||
}
|
||||
return out.build();
|
||||
}
|
||||
}
|
||||
|
||||
private SupplementalMapper() {}
|
||||
}
|
|
@ -0,0 +1,183 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.mapper;
|
||||
|
||||
import static com.google.common.base.CharMatcher.whitespace;
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
import static java.nio.file.StandardOpenOption.CREATE;
|
||||
import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING;
|
||||
import static org.unicode.cldr.api.AttributeKey.keyOf;
|
||||
import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
|
||||
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.unicode.cldr.api.AttributeKey;
|
||||
import org.unicode.cldr.api.CldrData.ValueVisitor;
|
||||
import org.unicode.cldr.api.CldrDataSupplier;
|
||||
import org.unicode.cldr.api.CldrDataType;
|
||||
import org.unicode.cldr.api.CldrValue;
|
||||
|
||||
import org.unicode.icu.tool.cldrtoicu.IcuData;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathMatcher;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbPath;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbValue;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
|
||||
/**
|
||||
* A mapper to collect transliteration data from {@link CldrDataType#SUPPLEMENTAL SUPPLEMENTAL}
|
||||
* data via the paths:
|
||||
* <pre>{@code
|
||||
* //supplementalData/transforms/transform/tRule
|
||||
* }</pre>
|
||||
*
|
||||
* <p>This mapper also writes out the transform rule files into a specified directory.
|
||||
*/
|
||||
public final class TransformsMapper {
|
||||
private static final PathMatcher TRULE =
|
||||
PathMatcher.of("supplementalData/transforms/transform/tRule");
|
||||
private static final AttributeKey TRANSFORM_SOURCE = keyOf("transform", "source");
|
||||
private static final AttributeKey TRANSFORM_TARGET = keyOf("transform", "target");
|
||||
private static final AttributeKey TRANSFORM_DIRECTION = keyOf("transform", "direction");
|
||||
private static final AttributeKey TRANSFORM_VARIANT = keyOf("transform", "variant");
|
||||
private static final AttributeKey TRANSFORM_VISIBILITY = keyOf("transform", "visibility");
|
||||
private static final AttributeKey TRANSFORM_ALIAS = keyOf("transform", "alias");
|
||||
private static final AttributeKey TRANSFORM_BACKALIAS = keyOf("transform", "backwardAlias");
|
||||
|
||||
private static final RbPath RB_TRANSLITERATOR_IDS = RbPath.of("RuleBasedTransliteratorIDs");
|
||||
|
||||
// This decomposes some accented characters with accents in the "Mn" (Mark, non-spacing)
|
||||
// Unicode range by representing the accents in the \u1234 hex form. For example, it converts:
|
||||
// "ɪ̈" to "ɪ\u0308" and "ɯ̽" to "ɯ\u033D". This does not affect all accented character (e.g.
|
||||
// ä) and the precise reason this is done was never clearly documented in the code from which
|
||||
// this code was derived (but it seems necessary to generate the expected output in the
|
||||
// transliteration rules).
|
||||
//
|
||||
// This is one of the only apparently-necessary direct dependencies on the icu4j library.
// TODO: Make this depend on icu4j from this project rather than the older version from CLDR.
|
||||
private static final Transliterator FIXUP = Transliterator.getInstance("[:Mn:]any-hex/java");
|
||||
|
||||
// Don't rename these enum constants, they need to match the data directly.
|
||||
private enum Direction { forward, backward, both }
|
||||
private enum Visibility { internal, external }
|
||||
|
||||
/**
|
||||
* Processes data from the given supplier to generate transliteration ICU data, writing
|
||||
* auxiliary transliteration rule files in the process. This is a potentially destructive call
|
||||
* and will overwrite existing transformation rule files in the specified directory.
|
||||
*
|
||||
* @param src the CLDR data supplier to process.
|
||||
* @param ruleFileOutputDir the directory into which transliteration rule files will be written.
|
||||
* @return the IcuData instance to be written to a file.
|
||||
*/
|
||||
public static IcuData process(CldrDataSupplier src, Path ruleFileOutputDir) {
|
||||
RuleVisitor visitor = new RuleVisitor(p -> {
|
||||
Path file = ruleFileOutputDir.resolve(p);
|
||||
try {
|
||||
return new PrintWriter(Files.newBufferedWriter(file, CREATE, TRUNCATE_EXISTING));
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("error opening file: " + file, e);
|
||||
}
|
||||
});
|
||||
src.getDataForType(SUPPLEMENTAL).accept(DTD, visitor);
|
||||
return visitor.icuData;
|
||||
}
|
||||
|
||||
private static class RuleVisitor implements ValueVisitor {
|
||||
private final IcuData icuData = new IcuData("root", false);
|
||||
private final Function<Path, PrintWriter> outFn;
|
||||
|
||||
RuleVisitor(Function<Path, PrintWriter> outFn) {
|
||||
this.outFn = checkNotNull(outFn);
|
||||
icuData.setFileComment("File: root.txt");
|
||||
|
||||
// I have _no_ idea what any of this is about, I'm just trying to mimic the original
|
||||
// (complex and undocumented) code in "ConvertTransforms.java".
|
||||
icuData.add(RbPath.of("TransliteratorNamePattern"), "{0,choice,0#|1#{1}|2#{1}-{2}}");
|
||||
// Note that this quoting of path segments is almost certainly unnecessary. It matches
|
||||
// the old "ConvertTransforms" behaviour, but '%' is used elsewhere without quoting, so
|
||||
// it seems very likely that it's not needed here.
|
||||
// TODO: Once migration done, remove quotes here & check in RbPath for unwanted quotes.
|
||||
icuData.add(RbPath.of("\"%Translit%Hex\""), "%Translit%Hex");
|
||||
icuData.add(RbPath.of("\"%Translit%UnicodeName\""), "%Translit%UnicodeName");
|
||||
icuData.add(RbPath.of("\"%Translit%UnicodeChar\""), "%Translit%UnicodeChar");
|
||||
// Special case, where Latin is a no-op.
|
||||
icuData.add(RbPath.of("TransliterateLATIN"), RbValue.of("", ""));
|
||||
// Some hard-coded special case mappings.
|
||||
icuData.add(
|
||||
RB_TRANSLITERATOR_IDS.extendBy("Tone-Digit", "alias"),
|
||||
"Pinyin-NumericPinyin");
|
||||
icuData.add(
|
||||
RB_TRANSLITERATOR_IDS.extendBy("Digit-Tone", "alias"),
|
||||
"NumericPinyin-Pinyin");
|
||||
}
|
||||
|
||||
@Override public void visit(CldrValue value) {
|
||||
// The other possible element is "comment" but we currently ignore those.
|
||||
if (TRULE.matches(value.getPath())) {
|
||||
String source = getExpectedOptionalAttribute(value, TRANSFORM_SOURCE);
|
||||
String target = getExpectedOptionalAttribute(value, TRANSFORM_TARGET);
|
||||
Optional<String> variant = TRANSFORM_VARIANT.optionalValueFrom(value);
|
||||
String baseFilename = source + "_" + target;
|
||||
String filename =
|
||||
variant.map(v -> baseFilename + "_" + v).orElse(baseFilename) + ".txt";
|
||||
writeRootIndexEntry(value, source, target, variant, filename);
|
||||
writeDataFile(filename, value);
|
||||
}
|
||||
}
|
||||
|
||||
private void writeDataFile(String filename, CldrValue value) {
|
||||
try (PrintWriter out = outFn.apply(Paths.get(filename))) {
|
||||
out.println("\uFEFF# © 2016 and later: Unicode, Inc. and others.");
|
||||
out.println("# License & terms of use: http://www.unicode.org/copyright.html#License");
|
||||
out.println("#");
|
||||
out.println("# File: " + filename);
|
||||
out.println("# Generated from CLDR");
|
||||
out.println("#");
|
||||
out.println();
|
||||
out.println(FIXUP.transliterate(whitespace().trimFrom(value.getValue())));
|
||||
out.println();
|
||||
}
|
||||
}
|
||||
|
||||
private void writeRootIndexEntry(
|
||||
CldrValue value, String source, String target, Optional<String> variant, String filename) {
|
||||
Visibility visibility = TRANSFORM_VISIBILITY.valueFrom(value, Visibility.class);
|
||||
String status = visibility == Visibility.internal ? "internal" : "file";
|
||||
|
||||
Direction dir = TRANSFORM_DIRECTION.valueFrom(value, Direction.class);
|
||||
if (dir != Direction.backward) {
|
||||
String id = getId(source, target, variant);
|
||||
TRANSFORM_ALIAS.listOfValuesFrom(value)
|
||||
.forEach(a -> icuData.add(RB_TRANSLITERATOR_IDS.extendBy(a, "alias"), id));
|
||||
RbPath rbPrefix = RB_TRANSLITERATOR_IDS.extendBy(id, status);
|
||||
icuData.add(rbPrefix.extendBy("resource:process(transliterator)"), filename);
|
||||
icuData.add(rbPrefix.extendBy("direction"), "FORWARD");
|
||||
}
|
||||
if (dir != Direction.forward) {
|
||||
String id = getId(target, source, variant);
|
||||
TRANSFORM_BACKALIAS.listOfValuesFrom(value)
|
||||
.forEach(a -> icuData.add(RB_TRANSLITERATOR_IDS.extendBy(a, "alias"), id));
|
||||
RbPath rbPrefix = RB_TRANSLITERATOR_IDS.extendBy(id, status);
|
||||
icuData.add(rbPrefix.extendBy("resource:process(transliterator)"), filename);
|
||||
icuData.add(rbPrefix.extendBy("direction"), "REVERSE");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static String getId(String from, String to, Optional<String> variant) {
|
||||
String baseId = from + "-" + to;
|
||||
return variant.map(v -> baseId + "/" + v).orElse(baseId);
|
||||
}
|
||||
|
||||
private static String getExpectedOptionalAttribute(CldrValue value, AttributeKey key) {
|
||||
return key.optionalValueFrom(value).orElseThrow(() ->
|
||||
new IllegalArgumentException(String.format("missing data for %s in: %s", key, value)));
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,26 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.regex;
|
||||
|
||||
import com.google.common.base.Ascii;
|
||||
|
||||
/** Instructions in result specifications (e.g. "values=..." or "fallback=..."). */
|
||||
enum Instruction {
|
||||
/** Defines processing and transformation of CLDR values. */
|
||||
VALUES,
|
||||
/** Defines fallback values to be used if no result was matched in a resource bundle. */
|
||||
FALLBACK,
|
||||
/** Defines an xpath used to hack result equality to make deduplication work. */
|
||||
BASE_XPATH,
|
||||
// TODO: Figure out how to remove this hack (probably by supporting partial matches).
|
||||
/**
|
||||
* Defines whether result values should be appended one at a time to a resource bundle
|
||||
* (default) or grouped into a separate array.
|
||||
*/
|
||||
GROUP;
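// For example (illustrative; see the sample rule in the Rule class comment), a result
// specification such as "; /languageData/$1/primary/scripts ; values=$2" uses the VALUES
// instruction.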
|
||||
|
||||
/** Returns the instruction enum for its ID as it appears in the configuration file. */
|
||||
static Instruction forId(String id) {
|
||||
return Instruction.valueOf(Ascii.toUpperCase(id));
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,58 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.regex;
|
||||
|
||||
import static com.google.common.base.CharMatcher.whitespace;
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
|
||||
import com.google.common.base.CharMatcher;
|
||||
import com.google.common.base.Splitter;
|
||||
|
||||
/**
|
||||
* Function used by {@code RegexTransformer} to convert CLDR values in special ways. See also
|
||||
* {@code IcuFunctions}.
|
||||
*/
|
||||
public final class NamedFunction implements Function<List<String>, String> {
|
||||
private static final CharMatcher NAME_CHARS =
|
||||
CharMatcher.inRange('a', 'z').or(CharMatcher.is('_'));
|
||||
private static final Splitter ARG_SPLITTER = Splitter.on(',').trimResults(whitespace());
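// Illustrative usage (the function name here is hypothetical): a function registered via
// create("my_fn", 2, args -> ...) can be invoked from a result specification as "&my_fn($1,$2)",
// at which point call() splits and trims the argument list before applying the function.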
|
||||
|
||||
public static NamedFunction create(
|
||||
String name, int argCount, Function<List<String>, String> fn) {
|
||||
return new NamedFunction(name, argCount, fn);
|
||||
}
|
||||
|
||||
private final String name;
|
||||
private final int maxArgs;
|
||||
private final Function<List<String>, String> fn;
|
||||
|
||||
private NamedFunction(String name, int argCount, Function<List<String>, String> fn) {
|
||||
checkArgument(!name.isEmpty() && NAME_CHARS.matchesAllOf(name),
|
||||
"invalid function name (must be lower_case_underscore): %s", name);
|
||||
checkArgument(argCount >= 0, "invalid argument count: %s", argCount);
|
||||
this.name = name;
|
||||
this.maxArgs = argCount;
|
||||
this.fn = checkNotNull(fn);
|
||||
}
|
||||
|
||||
public String call(String argList) {
|
||||
List<String> args = ARG_SPLITTER.splitToList(argList);
|
||||
checkArgument(args.size() <= maxArgs,
|
||||
"too many arguments for function '%s' (max=%s)", name, maxArgs);
|
||||
return checkNotNull(apply(args),
|
||||
"named functions must never return null: function=%s", name);
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String apply(List<String> args) {
|
||||
return fn.apply(args);
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,173 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.regex;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
import static com.google.common.collect.ImmutableList.toImmutableList;
|
||||
import static com.google.common.collect.ImmutableListMultimap.toImmutableListMultimap;
|
||||
import static com.google.common.collect.ImmutableSetMultimap.toImmutableSetMultimap;
|
||||
import static java.util.function.Function.identity;
|
||||
|
||||
import java.io.PrintWriter;
|
||||
import java.io.StringWriter;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.function.Function;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.unicode.cldr.api.CldrDataType;
|
||||
import org.unicode.cldr.api.CldrPath;
|
||||
import org.unicode.cldr.api.CldrValue;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.ImmutableListMultimap;
|
||||
import com.google.common.collect.ImmutableSetMultimap;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbPath;
|
||||
|
||||
/**
|
||||
* Path/value transformer configured by {@code ldml2icu_xxx.txt} mapping and configuration files.
|
||||
* See {@code ldml2icu_readme.txt} for details on the configuration file format and
|
||||
* {@link PathValueTransformer} for the public API description and usage.
|
||||
*
|
||||
* <p>This class is thread safe.
|
||||
*/
|
||||
// TODO: Rewrite the readme to match current behaviour and describe edge cases properly.
|
||||
public final class RegexTransformer extends PathValueTransformer {
|
||||
/**
|
||||
* Returns a new transformer based on transformation rules defined in the given configuration
|
||||
* file contents, and using the specified functions for resolving ICU values.
|
||||
*/
|
||||
public static PathValueTransformer fromConfigLines(
|
||||
List<String> lines, NamedFunction... functions) {
|
||||
return new RegexTransformer(RuleParser.parseConfig(lines, Arrays.asList(functions)));
|
||||
}
|
||||
|
||||
// Map of path prefixes grouped by DTD type (for early efficient filtering of paths).
|
||||
private final ImmutableSetMultimap<CldrDataType, String> prefixMap;
|
||||
// Transformation rules loaded from the configuration file, grouped by path prefix.
|
||||
private final ImmutableListMultimap<String, Rule> rulesMap;
|
||||
// Functions which can generate a fallback value from a given resource bundle path.
|
||||
private final ImmutableList<BiFunction<RbPath, DynamicVars, Optional<Result>>> fallbackFunctions;
|
||||
// Records the total set of rules, removing them as they are matched. Used for reporting any
|
||||
// unused rules for debugging purposes.
|
||||
private final Set<Rule> unusedRules = new LinkedHashSet<>();
|
||||
|
||||
private RegexTransformer(List<Rule> rules) {
|
||||
this.prefixMap =
|
||||
rules.stream().collect(toImmutableSetMultimap(Rule::getDataType, Rule::getPathPrefix));
|
||||
this.rulesMap =
|
||||
rules.stream().collect(toImmutableListMultimap(Rule::getPathPrefix, identity()));
|
||||
this.fallbackFunctions =
|
||||
rules.stream().flatMap(Rule::getFallbackFunctions).collect(toImmutableList());
|
||||
// Add all rules first and remove as they are matched.
|
||||
this.unusedRules.addAll(rules);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ImmutableList<Result> transform(CldrValue value) {
|
||||
return transform(value, p -> null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ImmutableList<Result> transform(CldrValue value, DynamicVars varLookupFn) {
|
||||
// This early rejection of non-matching paths, combined with "bucketing" the rules by path
|
||||
// prefix for easy lookup, dramatically reduces the transformation time.
|
||||
String pathPrefix = getPathPrefix(value);
|
||||
if (!prefixMap.get(value.getDataType()).contains(pathPrefix)) {
|
||||
return ImmutableList.of();
|
||||
}
|
||||
// Even though this is just derived from the value, resolve it here and pass it into each
|
||||
// rule to avoid recalculating the same thing every time.
|
||||
String fullXPath = getFullXPathWithoutSortIndices(value);
|
||||
// Bucketing the rules by the path prefix means that each incoming value is only tested
|
||||
// against likely matches. This reduces the number of tests per value by about 10x.
|
||||
for (Rule rule : rulesMap.get(pathPrefix)) {
|
||||
// We break after the first matching rule, since there is an implicit assumption
|
||||
// that no paths will match more than one rule.
|
||||
// TODO: Add a debug mode that checks that only one rule matches any given CLDR path.
|
||||
ImmutableList<Result> results = rule.transform(value, fullXPath, varLookupFn);
|
||||
if (!results.isEmpty()) {
|
||||
unusedRules.remove(rule);
|
||||
return results;
|
||||
}
|
||||
}
|
||||
return ImmutableList.of();
|
||||
}
|
||||
|
||||
// All "leaf" paths must have at least two elements, so we can find the "prefix" which is
|
||||
// the first element after the DTD root. This corresponds to the value extracted via
|
||||
// PATH_SPEC_PREFIX in the parser.
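// Illustrative example (hypothetical path): for "//supplementalData/territoryInfo/...", the
// prefix returned here is "territoryInfo".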
|
||||
private static String getPathPrefix(CldrValue value) {
|
||||
CldrPath prefix = value.getPath();
|
||||
checkArgument(prefix.getLength() >= 2, "unexpectedly short path: %s", prefix);
|
||||
while (prefix.getLength() > 2) {
|
||||
prefix = prefix.getParent();
|
||||
}
|
||||
return prefix.getName();
|
||||
}
|
||||
|
||||
// A regex to capture any sort-indices in the full path string (which must be removed).
|
||||
private static final Pattern SORT_INDEX = Pattern.compile("(/\\w+)#[0-9]+");
|
||||
|
||||
// Note that the full path we get here contains the "sort index" suffix for ORDERED
|
||||
// elements. This means that some element names are "foo#N" where N is the sort index.
|
||||
// Since the regex transformer works around "ordered elements" in a completely different
|
||||
// way and doesn't have them in the regular expressions, we can just remove them.
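// Illustrative example (hypothetical path): ".../foo/bar#2/baz" becomes ".../foo/bar/baz"
// once the sort index is stripped.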
|
||||
private static String getFullXPathWithoutSortIndices(CldrValue v) {
|
||||
String fullPath = v.getFullPath();
|
||||
for (CldrPath p = v.getPath(); p != null; p = p.getParent()) {
|
||||
if (p.getSortIndex() != -1) {
|
||||
// Only do expensive regex stuff if there's an "ordered" element with a sort index.
|
||||
return SORT_INDEX.matcher(fullPath).replaceAll("$1");
|
||||
}
|
||||
}
|
||||
// No path parts have a sort index, so the original full path string is safe to return.
|
||||
return fullPath;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ImmutableList<Result> getFallbackResultsFor(RbPath rbPath, DynamicVars varLookupFn) {
|
||||
return fallbackFunctions.stream()
|
||||
.map(f -> f.apply(rbPath, varLookupFn))
|
||||
.filter(Optional::isPresent)
|
||||
.map(Optional::get)
|
||||
.collect(toImmutableList());
|
||||
}
|
||||
|
||||
@Override public String toString() {
|
||||
StringWriter buf = new StringWriter();
|
||||
PrintWriter out = new PrintWriter(buf);
|
||||
out.println(getClass().getName() + "{");
|
||||
out.println(" Rules: " + rulesMap.size());
|
||||
if (!unusedRules.isEmpty()) {
|
||||
out.println(" Unused Rules:");
|
||||
unusedRules.forEach(
|
||||
r -> out.format(" [line=%3d] %s\n", r.getLineNumber(), r.getXpathSpec()));
|
||||
}
|
||||
out.println('}');
|
||||
out.flush();
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
// Package-private helper for substituting single-character placeholders like '$N' or '%X'.
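// For example (illustrative): substitute("/foo/$1/bar", '$', c -> "X") returns "/foo/X/bar".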
|
||||
static String substitute(String s, char token, Function<Character, String> replaceFn) {
|
||||
if (s.indexOf(token) == -1) {
|
||||
return s;
|
||||
}
|
||||
StringBuilder out = new StringBuilder();
|
||||
int i = 0;
|
||||
for (int j = s.indexOf(token); j != -1; i = j + 2, j = s.indexOf(token, i)) {
|
||||
char varChar = s.charAt(j + 1);
|
||||
String replacement =
|
||||
checkNotNull(replaceFn.apply(varChar), "no such variable %s%s", token, varChar);
|
||||
out.append(s, i, j).append(replacement);
|
||||
}
|
||||
return out.append(s.substring(i)).toString();
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,632 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.regex;
|
||||
|
||||
import static com.google.common.base.CharMatcher.whitespace;
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static com.google.common.base.Preconditions.checkElementIndex;
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
import static com.google.common.base.Preconditions.checkState;
|
||||
import static com.google.common.collect.ImmutableList.toImmutableList;
|
||||
import static java.util.Comparator.comparing;
|
||||
import static java.util.Comparator.nullsLast;
|
||||
import static org.unicode.cldr.api.CldrPath.parseDistinguishingPath;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.function.Function;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.unicode.cldr.api.CldrPath;
|
||||
import org.unicode.cldr.api.CldrValue;
|
||||
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import com.google.common.collect.Lists;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.DynamicVars;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbPath;
|
||||
|
||||
/**
|
||||
* A specification for building a result from the arguments in a matched xpath. Results always
|
||||
* hold a reference to their originating specification to allow them to be ordered in the same
|
||||
* order as the corresponding specifications in the configuration file.
|
||||
*/
|
||||
final class ResultSpec {
|
||||
// Subtle ordering for results to ensure "config file order" for things in the same
|
||||
// resource bundle while being "friendly" towards a global ordering. This is NOT consistent
|
||||
// with equals if duplicate results exist.
|
||||
//
|
||||
// This is ESSENTIAL for correct grouping and ordering within resource bundles.
|
||||
//
|
||||
// In normal use this is expected only to be used to reorder results within a resource
|
||||
// bundle (i.e. those sharing the same resource bundle path "key"). Resource bundles
|
||||
// themselves can just be managed in "visitation order" or similar.
|
||||
//
|
||||
// Ordering priority is:
|
||||
// 1: Result key (resource bundle): Groups results by resource bundle.
|
||||
// 2: Result specification line number: Orders resource bundle contents by "file order".
|
||||
// 3: Result distinguishing xpath: Tie breaking if duplicates are not yet removed.
|
||||
//
|
||||
// Note that this currently uses the String representation of the resource bundle path (key)
|
||||
// as the primary order to match legacy behaviour. However it would be better to use the
|
||||
// natural lexicographical RbPath order (the difference relates to having '/' as the
|
||||
// separator in the string representation of the path). The string form of a path is a bad
|
||||
// choice because some paths can contain a literal '/', which makes ordering problematic in
|
||||
// rare cases. However, changing this will have the effect of reordering path elements, which
|
||||
// while it should be safe, must be done with caution.
|
||||
// TODO: Fix this to use RbPath ordering and NOT the String representation
|
||||
private static final Comparator<AbstractResult> RESULT_ORDERING =
|
||||
Comparator.<AbstractResult, String>comparing(r -> r.getKey().toString())
|
||||
.thenComparing(r -> r.getSpec().lineNumber)
|
||||
.thenComparing(nullsLast(comparing(r -> r.getPath().orElse(null))));
|
||||
|
||||
// Splitter for any values (either in CLDR data or result specifications). The only time
|
||||
// values are split differently is when quoting exists in the "values" instruction.
|
||||
private static final Splitter VALUE_SPLITTER = Splitter.on(whitespace()).omitEmptyStrings();
|
||||
|
||||
// Matcher for "&foo_bar(a,b,c)" which captures function name and complete argument list.
|
||||
private static final Pattern FUNCTION = Pattern.compile("\\&(\\w++)\\(([^\\)]++)\\)");
|
||||
|
||||
// Resource bundle path specification with placeholders (e.g. "/foo/$1/bar") exactly as it
|
||||
// appears in the configuration file.
|
||||
private final String rbPathSpec;
|
||||
|
||||
// Declared instructions with which to generate result values (see Instruction).
|
||||
private final ImmutableMap<Instruction, VarString> instructions;
|
||||
|
||||
// The index of the xpath argument whose value should be split to create multiple results.
|
||||
// This mechanism is used when an xpath attribute is a space separated list of values and
|
||||
// one result should be created for each value. E.g. with [@territories="AA BB CC"] you want
|
||||
// a resource bundle for each region code (e.g. "foo/XX/bar", "foo/YY/bar", "foo/ZZ/bar").
|
||||
// At most one argument is ever split (corresponding to the first unquoted placeholder in
|
||||
// the resource bundle path specification).
|
||||
private final int splitArgIndex;
|
||||
|
||||
// The line number of the result specification in the file which defines the ordering of
|
||||
// results within a resource bundle. This needn't be a line number, but must be unique for
|
||||
// each specification.
|
||||
private final int lineNumber;
|
||||
|
||||
// The named functions available to the parser. Ideally the rules and result specifications
|
||||
// would be an inner class of some kind of context/environment and just share this.
|
||||
private final ImmutableMap<String, NamedFunction> icuFunctions;
|
||||
|
||||
// The map of dynamic variables (looked up from CldrPaths when a rule is resolved).
|
||||
private final Function<Character, CldrPath> dynamicVarFn;
|
||||
|
||||
ResultSpec(
|
||||
String rbPathSpec,
|
||||
Map<Instruction, VarString> instructions,
|
||||
int lineNumber,
|
||||
Map<String, NamedFunction> icuFunctions,
|
||||
Function<Character, CldrPath> dynamicVarFn) {
|
||||
this.rbPathSpec = checkNotNull(rbPathSpec);
|
||||
this.instructions = ImmutableMap.copyOf(instructions);
|
||||
this.splitArgIndex = getSplitArgIndex(rbPathSpec);
|
||||
this.lineNumber = lineNumber;
|
||||
this.icuFunctions = ImmutableMap.copyOf(icuFunctions);
|
||||
this.dynamicVarFn = checkNotNull(dynamicVarFn);
|
||||
}
|
||||
|
||||
/**
|
||||
* Transforms a path/value into a sequence of results. The given matcher has successfully
|
||||
* matched the path and contains the captured arguments corresponding to $1..$N in the
|
||||
* various result specification strings.
|
||||
*/
|
||||
Stream<Result> transform(
|
||||
CldrValue value, Matcher m, DynamicVars varLookupFn) {
|
||||
// Discard group(0) since that's always the full xpath that was matched, and we don't
|
||||
// need that any more (so "$N" is args.get(N - 1)).
|
||||
List<String> args = new ArrayList<>();
|
||||
for (int i = 1; i <= m.groupCount(); i++) {
|
||||
// Important since we turn this into an ImmutableList (which is null-hostile).
|
||||
args.add(checkNotNull(m.group(i),
|
||||
"captured regex arguments must always be present\n"
|
||||
+ "(use an non-capturing groups for optional arguments): %s", m.pattern()));
|
||||
}
|
||||
|
||||
// The first unquoted argument in any resource bundle path declaration is defined as
|
||||
// being "splittable". Typically this happens if the value of the captured xpath
|
||||
// argument is expected to be a list of items.
|
||||
//
|
||||
// In this case, we generate one result for each individual argument, replacing the
|
||||
// appropriate captured list with each split value in turn. Thus with original
|
||||
// arguments:
|
||||
// ["foo", "bar baz", "quux"]
|
||||
// where splitArgIndex == 1, we get two results using the argument lists:
|
||||
// ["foo", "bar", "quux"]
|
||||
// ["foo", "baz", "quux"]
|
||||
//
|
||||
// Note also that since the splittability of the arguments is technically defined
|
||||
// by the resource bundle path specification (not the xpath regular expression) it
|
||||
// could differ per ResultSpec instance (but currently never does).
|
||||
if (splitArgIndex != -1) {
|
||||
List<String> splitArgs = VALUE_SPLITTER.splitToList(args.get(splitArgIndex));
|
||||
// Only bother if there was more than one argument there anyway.
|
||||
if (splitArgs.size() > 1) {
|
||||
return splitArgs.stream().map(a -> {
|
||||
args.set(splitArgIndex, a);
|
||||
return matchedResult(value, args, varLookupFn);
|
||||
});
|
||||
}
|
||||
}
|
||||
// No splittable argument, or a splittable argument with only one value.
|
||||
return Stream.of(matchedResult(value, args, varLookupFn));
|
||||
}
|
||||
|
||||
// Simple helper to make results.
|
||||
private Result matchedResult(
|
||||
CldrValue value, List<String> args, DynamicVars varLookupFn) {
|
||||
return new MatchedResult(
|
||||
getRbPath(args),
|
||||
getValues(value.getValue(), args, varLookupFn),
|
||||
getResultPath(value.getPath(), args, varLookupFn));
|
||||
}
|
||||
|
||||
// Resource bundle paths are a bit special (unsurprisingly). The captured arguments can
|
||||
// contain '/' and will extend the path structure. Thus "foo/$1/bar" might end up as
|
||||
// "foo/x/y/bar" after argument substitution.
|
||||
//
|
||||
// However (a hack for timezone "metazone" paths) if the argument placeholder is quoted
|
||||
// (e.g. "foo/"$1"/bar") then '/' in arguments is replaced by ':' and quotes are retained
|
||||
// (e.g. "foo/"x:y"/bar).
|
||||
// TODO: Replace hard coded hack here with an explicit function in the config file.
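// Illustrative example (hypothetical values): a specification of foo/"$1"/bar with
// $1 = "America/New_York" produces the path segments: foo, "America:New_York", bar.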
|
||||
private RbPath getRbPath(List<String> args) {
|
||||
// Without more careful parsing, it's hard to figure out if quotes in a resource bundle
|
||||
// path specification are around a placeholder or not. Since quotes are only used in a
|
||||
// small number of cases currently, and only for this purpose, we just assume that any
|
||||
// quotes in the path specification should trigger this behaviour.
|
||||
if (rbPathSpec.contains("\"")) {
|
||||
// Use a lazy transforming list to avoid char replacement in arguments that don't
|
||||
// appear in the resource bundle path.
|
||||
args = Lists.transform(args, s -> s.replace('/', ':'));
|
||||
}
|
||||
String path = substituteArgs(rbPathSpec, args);
|
||||
return RbPath.parse(path);
|
||||
}
|
||||
|
||||
// Create an array of output values according to the CLDR value (if present) and the
|
||||
// "values" instruction in the result specification (if present). Any functions present in
|
||||
// the "values" instruction are invoked here.
|
||||
private ImmutableList<String> getValues(
|
||||
String value, List<String> args, DynamicVars varLookupFn) {
|
||||
VarString valuesSpec = instructions.get(Instruction.VALUES);
|
||||
if (valuesSpec == null) {
|
||||
// No "values" instruction, so just use the _unsplit_ CLDR value. To split a CLDR
|
||||
// value use "values={value}" in the result specification.
|
||||
return ImmutableList.of(value);
|
||||
}
|
||||
// The "value" instruction is not expected to have any dynamic %N variables in it,
|
||||
// since those only represent CLDR path mappings, which should not be directly present
|
||||
// in the ICU data. Hence the valuesSpec should have been fully resolved by the static
|
||||
// variables applied earlier and we should just need to resolve() it into a String.
|
||||
String resolved = valuesSpec.get();
|
||||
|
||||
// First substitute the $N arguments in since they need to be passed to the
|
||||
// functions.
|
||||
//
|
||||
// WARNING: This doesn't strictly work, since an argument or function result could
|
||||
// (in theory) contain the string "{value}" which would then be substituted in an
|
||||
// unexpected way. The better way to do this is with a single pass which handles
|
||||
// arguments, function calling and the special "{value}" token together. This comes
|
||||
// down to the fact that the mapping file syntax doesn't have a well defined concept
|
||||
// of escaping or invocation order.
|
||||
// TODO: Fix this, possibly by rewriting the whole transformer "language" to be consistent.
|
||||
resolved = substituteArgs(resolved, args);
|
||||
|
||||
Matcher m = FUNCTION.matcher(resolved);
|
||||
if (m.find()) {
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
int index = 0;
|
||||
do {
|
||||
// Append up to the start of the function call.
|
||||
buffer.append(resolved, index, m.start());
|
||||
|
||||
// Replace '{value}' here so functions can be called with the CLDR value as well
|
||||
// as captured path arguments. We also have to replace it below, which is all a bit
|
||||
// dodgy if a function ever returned '{value}'.
|
||||
NamedFunction fn = icuFunctions.get(m.group(1));
|
||||
checkArgument(fn != null, "no such function: %s", m.group(1));
|
||||
buffer.append(fn.call(m.group(2).replace("{value}", value)));
|
||||
index = m.end();
|
||||
} while (m.find());
|
||||
resolved = buffer.append(resolved.substring(index)).toString();
|
||||
}
|
||||
// Having done function invocation, we handle the special "{value}" token and split
|
||||
// the value (taking quoting into account).
|
||||
return splitValues(resolved.replace("{value}", value));
|
||||
}
|
||||
|
||||
// IMPORTANT: The path of a result is either:
|
||||
// * The original distinguishing path
|
||||
// * The specified "base_xpath" (which must also be a distinguishing xpath).
|
||||
// and this is used as part of the equality semantics (which are very subtle).
|
||||
//
|
||||
// The existence of "base_xpath" is a hack to get around the fact the xpaths can only be
|
||||
// matched in full, rather than by a prefix. For some cases this means that the "same"
|
||||
// result will be created many times by potentially different distinguishing xpaths,
|
||||
// perhaps even via different result specifications. "base_xpath" exists as a hack to give
|
||||
// these duplicate results the same "fake" xpath, so deduplication can occur.
|
||||
private CldrPath getResultPath(CldrPath path, List<String> args, DynamicVars varLookupFn) {
|
||||
VarString basePath = instructions.get(Instruction.BASE_XPATH);
|
||||
if (basePath == null) {
|
||||
return path;
|
||||
}
|
||||
String resolvedBasePath = basePath.apply(dynamicVarFn.andThen(varLookupFn)).get();
|
||||
return parseDistinguishingPath(substituteArgs(resolvedBasePath, args));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a fallback function if this specification has the "fallback=" instruction.
|
||||
* The function takes a resolved resource bundle path and returns the possible fallback
|
||||
* values for it. Note that currently fallback values do not support either quoting or
|
||||
* grouping (but they easily could).
|
||||
*/
|
||||
Optional<BiFunction<RbPath, DynamicVars, Optional<Result>>> getFallbackFunction() {
|
||||
VarString fallbackSpec = instructions.get(Instruction.FALLBACK);
|
||||
if (fallbackSpec == null) {
|
||||
return Optional.empty();
|
||||
}
|
||||
// This is the only place where any hacking of regular expressions occurs. The fallback
|
||||
// function must only return a value if the given resolved resource bundle path could
|
||||
// have been a match for the path specification.
|
||||
//
|
||||
// In order to avoid ambiguity for paths such as "foo/$1/$2/bar" and "foo/$1/bar" which
|
||||
// should not both be matched, we explicitly disallow '/' in argument values. In theory
|
||||
// this is problematic, since '/' should be an allowed character, but the issues caused
|
||||
// by ambiguous matching are worse.
|
||||
// TODO: Fix/replace all of this fallback mess with something cleaner.
|
||||
Pattern rbPathMatcher = getRbPathMatcher(rbPathSpec);
|
||||
|
||||
// Another, frankly terrifying, bit of hackery to support fallback specifications with
|
||||
// $N argument substitution (this currently only happens once, but must be supported).
|
||||
// Just another reason to want to replace the current fallback mechanism.
|
||||
fallbackSpec = maybeRewriteFallbackSpec(fallbackSpec);
|
||||
|
||||
// Just copying here to make it effectively final.
|
||||
VarString finalFallbackSpec = fallbackSpec;
|
||||
return Optional.of(
|
||||
(p, varFn) -> getFallbackResult(p, varFn, rbPathMatcher, finalFallbackSpec));
|
||||
}
|
||||
|
||||
private Optional<Result> getFallbackResult(
|
||||
RbPath rbPath, DynamicVars varFn, Pattern rbPathMatcher, VarString fallbackSpec) {
|
||||
// Check if the given rbPath could be associated with this fallback (most are not).
|
||||
Matcher matcher = rbPathMatcher.matcher(rbPath.toString());
|
||||
if (!matcher.matches()) {
|
||||
return Optional.empty();
|
||||
}
|
||||
// Expect that once any dynamic variables are provided to the fallback specification,
|
||||
// we can get the resolved fallback specification (potentially with $N placeholders to
|
||||
// be filled in from the resource bundle path).
|
||||
String specStr = fallbackSpec.apply(dynamicVarFn.andThen(varFn)).get();
|
||||
if (matcher.groupCount() > 0) {
|
||||
specStr = substituteArgs(specStr, n -> matcher.group(n + 1), matcher.groupCount());
|
||||
}
|
||||
|
||||
// Split the fallback value _without_ considering quoting. This matches the original
|
||||
// behaviour but could cause all sorts of subtle issues if values contained quotes.
|
||||
// TODO: Rework transformation rules to make quoting behaviour deterministic.
|
||||
Iterable<String> values =
|
||||
VALUE_SPLITTER.splitToList(specStr).stream()
|
||||
// Fallback values that "look like" CLDR paths are auto-magically resolved.
|
||||
.map(v -> v.startsWith("//") ? varFn.apply(parseDistinguishingPath(v)) : v)
|
||||
.collect(toImmutableList());
|
||||
return Optional.of(new FallbackResult(rbPath, values));
|
||||
}
|
||||
|
||||
// WARNING: Another very hacky behaviour (used exactly once) is that "$N" argument
|
||||
// substitutions are allowed in fallback values. This is highly problematic because,
|
||||
// since the fallback value must be synthesized only from the resource bundle path,
|
||||
// there's no way for this substitution to handle:
|
||||
// 1: multi-valued list arguments
|
||||
// 2: arguments that didn't appear in the resource bundle path
|
||||
// 3: dynamic path variables (e.g. %D=//some/path)
|
||||
//
|
||||
// An example would be something like a resource bundle specification of:
|
||||
// /Baz/$2/$1
|
||||
// and a fallback value of:
|
||||
// Foo$1/Bar$2
|
||||
//
|
||||
// Here the order of substitution is not maintained and the original path specification
|
||||
// has values that are not naturally ordered (or possibly even duplicated). The pattern
|
||||
// we calculate from the resource bundle path specification will match/capture groups in
|
||||
// "natural order" (i.e. "/Baz/(...)/(...)") so we have to rewrite the order of the
|
||||
// placeholders in the fallback specification to match (e.g. "Foo$2/Bar$1").
|
||||
// TODO: Figure out a way to remove all of this extreme complexity.
|
||||
private VarString maybeRewriteFallbackSpec(
|
||||
VarString fallbackSpec) {
|
||||
Optional<String> fallback = fallbackSpec.resolve();
|
||||
// If the fallback string is not present, it's because the VarString still has
|
||||
// unresolved "dynamic" variables for late binding. This is okay, but should not
|
||||
// be mixed with argument substitution.
|
||||
if (!fallback.isPresent() || !fallback.get().contains("$")) {
|
||||
return fallbackSpec;
|
||||
}
|
||||
// After the quick rejection check for '$', do a proper search for $N variables (since
|
||||
// '$' is permitted as a literal if not followed by a digit).
|
||||
Matcher fallbackMatcher = ARG_PLACEHOLDER.matcher(fallback.get());
|
||||
if (!fallbackMatcher.find()) {
|
||||
return fallbackSpec;
|
||||
}
|
||||
|
||||
// Fallback spec has $N in it, triggering super hacky behaviour.
|
||||
Matcher pathMatcher = ARG_PLACEHOLDER.matcher(rbPathSpec);
|
||||
checkState(pathMatcher.find(),
|
||||
"$N arguments in fallback must be present in the resource bundle path: %s",
|
||||
rbPathSpec);
|
||||
// Explicit group characters ("1"..."9") in the order they appear in the
|
||||
// resource bundle path. There can be duplicates (e.g. "/Foo/$1/Bar$1").
|
||||
List<Character> groupIds = new ArrayList<>();
|
||||
do {
|
||||
groupIds.add(pathMatcher.group().charAt(1));
|
||||
} while (pathMatcher.find());
|
||||
|
||||
// Special check to avoid a horrible bug if we ever had more than 9 distinct
|
||||
// placeholders (essentially impossible with current data). If it did happen,
|
||||
// the returned index below would be >= 9 and we would get "$X", where 'X' was
|
||||
// not a numeric value.
|
||||
checkState(groupIds.size() < 10,
|
||||
"too many placeholders in resource bundle path: %s", rbPathSpec);
|
||||
|
||||
// Now find each placeholder in the fallback specification string and map it to
|
||||
// the equivalent index for the path matcher we just created.
|
||||
StringBuilder rewrittenFallbackSpec = new StringBuilder(fallback.get());
|
||||
do {
|
||||
int placeholderPos = fallbackMatcher.start() + 1;
|
||||
// The new ID is the index of the corresponding placeholder offset by '1'.
|
||||
char placeholderDigit = rewrittenFallbackSpec.charAt(placeholderPos);
|
||||
int newPlaceholderIndex = groupIds.indexOf(placeholderDigit);
|
||||
checkState(newPlaceholderIndex != -1,
|
||||
"fallback values may only contain arguments from the resource bundle path: %s",
|
||||
fallback.get());
|
||||
rewrittenFallbackSpec.setCharAt(placeholderPos, (char)('1' + newPlaceholderIndex));
|
||||
} while (fallbackMatcher.find());
|
||||
return VarString.of(rewrittenFallbackSpec.toString());
|
||||
}
|
||||
|
||||
/** Base class of either a matched or a fallback result. */
|
||||
private abstract class AbstractResult extends Result {
|
||||
// Split and resolved values for this result (see also "isGrouped()").
|
||||
private final ImmutableList<String> values;
|
||||
|
||||
// The "source" CLDR path of a matched result (omitted if this is a fallback result).
|
||||
// Note that this is the resolved "base_xpath" if it was specified in the instructions.
|
||||
private final Optional<CldrPath> basePath;
|
||||
|
||||
// Calculated eagerly since we always expect results to need to be deduplicated.
|
||||
private final int hashCode;
|
||||
|
||||
AbstractResult(RbPath key, Iterable<String> values, Optional<CldrPath> path) {
|
||||
super(key);
|
||||
this.values = ImmutableList.copyOf(values);
|
||||
this.basePath = checkNotNull(path);
|
||||
// Same attributes in the same order as tested for in equals().
|
||||
this.hashCode = Objects.hash(getKey(), getPath(), isGrouped(), getValues());
|
||||
}
|
||||
|
||||
// Returns the specification from which this result was obtained. This is essential for
|
||||
// correct ordering and determining fallback values, but is not directly used for
|
||||
// determining result equality (since duplicate results can be generated by different
|
||||
// specifications).
|
||||
final ResultSpec getSpec() {
|
||||
return ResultSpec.this;
|
||||
}
|
||||
|
||||
final Optional<CldrPath> getPath() {
|
||||
return basePath;
|
||||
}
|
||||
|
||||
final boolean wasMatched() {
|
||||
// We could also do this via a boolean field.
|
||||
return this instanceof MatchedResult;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final ImmutableList<String> getValues() {
|
||||
return values;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final int compareTo(Result other) {
|
||||
checkArgument(other instanceof AbstractResult,
|
||||
"unknown result type: %s", other.getClass());
|
||||
return RESULT_ORDERING.compare(this, (AbstractResult) other);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final int hashCode() {
|
||||
return hashCode;
|
||||
}
|
||||
|
||||
// Equality semantics of results is ESSENTIAL for correct behaviour, especially the
|
||||
// deduplication of results. See also "getSpec()", "getPath()", and RESULT_ORDERING.
|
||||
@Override
|
||||
public final boolean equals(Object obj) {
|
||||
// Different subclasses are never equal, so test class directly (not instanceof).
|
||||
if (obj == null || !getClass().equals(obj.getClass())) {
|
||||
return false;
|
||||
}
|
||||
AbstractResult other = (AbstractResult) obj;
|
||||
// DO NOT test the result specifier here. Equal results can be generated from
|
||||
// different result specifications (if "base_xpath" was used).
|
||||
return getKey().equals(other.getKey())
|
||||
&& getPath().equals(other.getPath())
|
||||
&& isGrouped() == other.isGrouped()
|
||||
// Alternatively assert that values are equal if everything else is.
|
||||
&& getValues().equals(other.getValues());
|
||||
}
|
||||
}
|
||||
|
||||
// Result created for an explicit path match using captured arguments.
|
||||
private final class MatchedResult extends AbstractResult {
|
||||
MatchedResult(RbPath key, Iterable<String> values, CldrPath path) {
|
||||
super(key, values, Optional.of(path));
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isGrouped() {
|
||||
// We don't need to use the "group" value at all and it can be removed from the
|
||||
// configuration file at some point.
|
||||
return instructions.containsKey(Instruction.GROUP);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isFallbackFor(Result r) {
|
||||
// Matched results are never a fallback for anything.
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Result created to hold possible fallback values for a specified resource bundle path.
|
||||
private final class FallbackResult extends AbstractResult {
|
||||
FallbackResult(RbPath rbPath, Iterable<String> values) {
|
||||
super(rbPath, values, Optional.empty());
|
||||
}
|
||||
|
||||
// Delete this method and move the other one into AbstractResult if we decide to allow
|
||||
// grouping for fallback values (it's not clear if it's a good idea).
|
||||
@Override
|
||||
public boolean isGrouped() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isFallbackFor(Result r) {
|
||||
// We are a fallback if we came from the same specification as a matched result.
|
||||
// To prevent duplication of fallback results, we also return true if we
|
||||
// are "equal()" to the given result (equivalent fallback results can come from
|
||||
// different input paths).
|
||||
checkArgument(r instanceof AbstractResult, "unsupported result type: %s", r);
|
||||
AbstractResult result = (AbstractResult) r;
|
||||
return result.wasMatched() ? getSpec().equals(result.getSpec()) : equals(result);
|
||||
}
|
||||
}
|
||||
|
||||
// ==== Static helper functions ====
|
||||
|
||||
// Matches any "$N" placeholder without capturing.
|
||||
private static final Pattern ARG_PLACEHOLDER = Pattern.compile("\\$[1-9]");
|
||||
|
||||
// Turn "$N" into a capturing groups.
|
||||
//
|
||||
// Note that this code currently assumes that each "$N" placeholder matches a single path
|
||||
// segment (i.e. the captured values cannot contain '/'). This is an artificial restriction
|
||||
// since resource bundle paths can have quoting in, so we could detect quoted placeholders
|
||||
// and allow any characters. However at the moment this isn't an issue, and none of the
|
||||
// "$N" placeholders in the paths expects to match anything with '/' in.
|
||||
//
|
||||
// TODO: Fix this to handle quoted placeholders (e.g. "$N" or <$N>) properly.
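// Illustrative example (hypothetical spec): "/foo/$1/bar" becomes the regular expression
// "\Qfoo/\E([^/]+?)\Q/bar\E" after the leading '/' is removed and "$1" is converted.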
|
||||
private static Pattern getRbPathMatcher(String rbPathSpec) {
|
||||
// An RbPath instance's toString() does not have a leading '/' on it, so we'll have to
|
||||
// account for that here (or we could just remove the leading '/' from paths in the
|
||||
// config file...).
|
||||
if (rbPathSpec.startsWith("/")) {
|
||||
rbPathSpec = rbPathSpec.substring(1);
|
||||
}
|
||||
// Protect potential regex meta-characters in the original resource bundle path. Using
|
||||
// '\Q' and '\E' to mark quotation boundaries is the safest way to do this, but that
|
||||
// means we also need to handle '\E' in the original string (incredibly unlikely but it
|
||||
// would be super hard to debug if it ever happened).
|
||||
// TODO: If resource paths cannot contain literal '\' or '$', add checks and simplify.
|
||||
String regex = "\\Q" + rbPathSpec.replace("\\E", "\\E\\E\\Q") + "\\E";
|
||||
|
||||
// Remember that you could get "$1$2" here and the regex groups that replace them will
|
||||
// abut. Use reluctant matching (i.e. "+?") to avoid any backtracking in this case.
|
||||
// We assume that the substituted arguments contained at least one character, and so we
|
||||
// capture at least one character per group here.
|
||||
regex = ARG_PLACEHOLDER.matcher(regex).replaceAll("\\\\E([^/]+?)\\\\Q");
|
||||
return Pattern.compile(regex);
|
||||
}
|
||||
|
||||
private static String substituteArgs(String spec, List<String> args) {
|
||||
return substituteArgs(spec, args::get, args.size());
|
||||
}
|
||||
|
||||
// Substitutes "$N" (N = 1...9) placeholders for values obtained from a zero-indexed
|
||||
// function (i.e. "$N" --> args(N - 1)).
|
||||
private static String substituteArgs(String spec, Function<Integer, String> args, int size) {
|
||||
return RegexTransformer.substitute(
|
||||
spec, '$', c -> args.apply(checkElementIndex(c - '1', size, "argument index")));
|
||||
}
|
||||
|
||||
// Matches arguments with or without enclosing quotes.
|
||||
private static final Pattern ARGUMENT = Pattern.compile("[<\"]?\\$(\\d)[\">]?");
|
||||
|
||||
// Logic mostly copied from original RegexManager class. Finds first unquoted $N (N=1..9)
|
||||
// and returns N-1 (or -1 if no match). We do not permit $0 to appear even though it is
|
||||
// captured by the regex because it's just the entire path.
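// Illustrative example (hypothetical path): for "/foo/<$1>/baz/$2" the quoted "<$1>" is
// skipped and the method returns 1 (the zero-based index of the "$2" placeholder).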
|
||||
private static int getSplitArgIndex(String rbPath) {
|
||||
// Captures a $N placeholder, but might catch surrounding quoting as well.
|
||||
Matcher matcher = ARGUMENT.matcher(rbPath);
|
||||
while (matcher.find()) {
|
||||
char startChar = rbPath.charAt(matcher.start());
|
||||
char endChar = rbPath.charAt(matcher.end() - 1);
|
||||
// Splitting occurs for the first unquoted placeholder, so ignore <$1> and "$N".
|
||||
// Q: Why two different "quoting" schemes?
|
||||
// A: It's complex and relates to something called "hidden labels".
|
||||
boolean shouldSplit = !((startChar == '"' && endChar == '"') ||
|
||||
(startChar == '<' && endChar == '>'));
|
||||
if (shouldSplit) {
|
||||
// Allowed "$N" argument placeholders go from $1 to $9 ($0 is disallowed) and
|
||||
// arguments are zero-indexed, so we expect an index from 0 to 8.
|
||||
int groupNumber = Integer.parseInt(matcher.group(1));
|
||||
checkArgument(groupNumber >= 1 && groupNumber <= 9,
|
||||
"invalid split argument: %s", groupNumber);
|
||||
return groupNumber - 1;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Splits a possibly quoted string, where we need to handle \". This is a bit dubious
|
||||
// though as we don't detect or unescape \\. Thus it's impossible to represent a single '\'
|
||||
// at the end of a quoted string (e.g. "$1" where the expansion of $1 has a trailing '\').
|
||||
// It's also impossible to have a value that should be split but which contains '"'.
|
||||
//
|
||||
// This mimics the original RegexManager behaviour, where spaces and quotes in
|
||||
// substituted values are _not_ escaped.
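// Illustrative example: the input 'a "b c" d' is split into ["a", "b c", "d"], whereas the
// whitespace splitter alone would produce ["a", "b", "c", "d"].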
|
||||
private static ImmutableList<String> splitValues(String value) {
|
||||
int qstart = nextBareQuoteIndex(value, 0);
|
||||
if (qstart == -1) {
|
||||
return ImmutableList.copyOf(VALUE_SPLITTER.split(value));
|
||||
}
|
||||
ImmutableList.Builder<String> values = ImmutableList.builder();
|
||||
int rawStart = 0;
|
||||
do {
|
||||
values.addAll(VALUE_SPLITTER.split(value.substring(rawStart, qstart)));
|
||||
int qend = nextBareQuoteIndex(value, qstart + 1);
|
||||
checkArgument(qend != -1, "mismatched quotes in splittable value: %s", value);
|
||||
// Remember to unescape any '"' found in the quoted regions.
|
||||
values.add(value.substring(qstart + 1, qend).replace("\\\"", "\""));
|
||||
rawStart = qend + 1;
|
||||
qstart = nextBareQuoteIndex(value, qend + 1);
|
||||
} while (qstart != -1);
|
||||
values.addAll(VALUE_SPLITTER.split(value.substring(rawStart)));
|
||||
return values.build();
|
||||
}
|
||||
|
||||
// Returns the index of the next '"' character that's not preceded by a '\'.
|
||||
private static int nextBareQuoteIndex(String s, int i) {
|
||||
i = s.indexOf('"', i);
|
||||
// If i == 0, then '"' is the first char and must be "bare".
|
||||
if (i > 0) {
|
||||
do {
|
||||
if (s.charAt(i - 1) != '\\') {
|
||||
break;
|
||||
}
|
||||
i = s.indexOf('"', i + 1);
|
||||
} while (i >= 0);
|
||||
}
|
||||
return i;
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,180 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.regex;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
import static com.google.common.collect.ImmutableList.toImmutableList;
|
||||
|
||||
import java.util.Optional;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.function.Function;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.unicode.cldr.api.CldrDataType;
|
||||
import org.unicode.cldr.api.CldrPath;
|
||||
import org.unicode.cldr.api.CldrValue;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.DynamicVars;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbPath;
|
||||
|
||||
/*
|
||||
* Each rule corresponds to a single target xpath specification in the configuration file
|
||||
* (lines starting //) but may have more than one result specification. For example:
|
||||
*
|
||||
* //supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"][@territories="(%W)"]
|
||||
* ; /languageData/$1/primary/scripts ; values=$2
|
||||
* ; /languageData/$1/primary/territories; values=$3
|
||||
*
|
||||
* is represented by a single rule with two result specifications.
|
||||
*/
|
||||
abstract class Rule {
|
||||
/** Returns a rule for which all '%X' arguments have been resolved (almost all cases). */
|
||||
static Rule staticRule(
|
||||
CldrDataType dtdType,
|
||||
String prefix,
|
||||
Iterable<ResultSpec> specs,
|
||||
String pathRegex,
|
||||
String xpathSpec,
|
||||
int lineNumber) {
|
||||
|
||||
return new StaticRule(dtdType, prefix, specs, pathRegex, xpathSpec, lineNumber);
|
||||
}
|
||||
|
||||
/** Returns a rule for which some '%X' arguments are unresolved until matching occurs. */
|
||||
static Rule dynamicRule(
|
||||
CldrDataType dtdType,
|
||||
String pathRegex,
|
||||
Iterable<ResultSpec> specs,
|
||||
VarString varString,
|
||||
Function<Character, CldrPath> varFn,
|
||||
String xpathSpec,
|
||||
int lineNumber) {
|
||||
|
||||
return new DynamicRule(dtdType, pathRegex, specs, varString, varFn, xpathSpec, lineNumber);
|
||||
}
|
||||
|
||||
// Type of CLDR path which can match this rule.
|
||||
private final CldrDataType dtdType;
|
||||
// The first path element below the root, used to do fast rejection of non-matching paths
|
||||
// and to "bucket" rules by their prefix to speed up matching.
|
||||
private final String pathPrefix;
|
||||
// One or more result specifications to be processed for matching CLDR paths/values.
|
||||
private final ImmutableList<ResultSpec> resultSpecs;
|
||||
// Debug information only to help determine unused rules.
|
||||
private final String xpathSpec;
|
||||
private final int lineNumber;
|
||||
|
||||
private Rule(
|
||||
CldrDataType dtdType,
|
||||
String pathPrefix,
|
||||
Iterable<ResultSpec> resultSpecs,
|
||||
String xpathSpec,
|
||||
int lineNumber) {
|
||||
|
||||
this.dtdType = checkNotNull(dtdType);
|
||||
this.pathPrefix = checkNotNull(pathPrefix);
|
||||
this.resultSpecs = ImmutableList.copyOf(resultSpecs);
|
||||
this.xpathSpec = checkNotNull(xpathSpec);
|
||||
this.lineNumber = lineNumber;
|
||||
}
|
||||
|
||||
/** Returns the CLDR DTD type of the path that the rule can match. */
|
||||
final CldrDataType getDataType() {
|
||||
return dtdType;
|
||||
}
|
||||
|
||||
/** Returns the name of the first path element below the path root. */
|
||||
final String getPathPrefix() {
|
||||
return pathPrefix;
|
||||
}
|
||||
|
||||
/** Returns the regular expression against which CLDR path strings are matched. */
|
||||
abstract Pattern getPathPattern(DynamicVars varLookupFn);
|
||||
|
||||
/**
|
||||
* Attempts to match the incoming xpath and (if successful) use captured arguments to
|
||||
* generate one result for each result specification.
|
||||
*/
|
||||
final ImmutableList<Result> transform(CldrValue v, String fullXPath, DynamicVars varFn) {
|
||||
Matcher m = getPathPattern(varFn).matcher(fullXPath);
|
||||
return m.matches()
|
||||
? resultSpecs.stream()
|
||||
.flatMap(r -> r.transform(v, m, varFn))
|
||||
.collect(toImmutableList())
|
||||
: ImmutableList.of();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns any fallback functions defined in results specifications. These are used to
|
||||
* determine the set of possible fallback values for a given resource bundle path.
|
||||
*/
|
||||
final Stream<BiFunction<RbPath, DynamicVars, Optional<Result>>> getFallbackFunctions() {
|
||||
return resultSpecs.stream()
|
||||
.map(ResultSpec::getFallbackFunction)
|
||||
.filter(Optional::isPresent)
|
||||
.map(Optional::get);
|
||||
}
|
||||
|
||||
// Debugging only
|
||||
final String getXpathSpec() {
|
||||
return xpathSpec;
|
||||
}
|
||||
|
||||
// Debugging only
|
||||
final int getLineNumber() {
|
||||
return lineNumber;
|
||||
}
|
||||
|
||||
private static final class StaticRule extends Rule {
|
||||
// The processed xpath specification yielding an xpath matching regular expression. This is
|
||||
// only suitable for matching incoming xpaths and cannot be processed in any other way.
|
||||
private final Pattern xpathPattern;
|
||||
|
||||
StaticRule(
|
||||
CldrDataType dtdType,
|
||||
String prefix,
|
||||
Iterable<ResultSpec> specs,
|
||||
String pathRegex,
|
||||
String xpathSpec,
|
||||
int lineNumber) {
|
||||
|
||||
super(dtdType, prefix, specs, xpathSpec, lineNumber);
|
||||
this.xpathPattern = Pattern.compile(pathRegex);
|
||||
}
|
||||
|
||||
@Override
|
||||
Pattern getPathPattern(DynamicVars varLookupFn) {
|
||||
return xpathPattern;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class DynamicRule extends Rule {
|
||||
// The processed xpath specification yielding an xpath matching regular expression. This is
|
||||
// only suitable for matching incoming xpaths and cannot be processed in any other way.
|
||||
private final VarString varString;
|
||||
private final Function<Character, CldrPath> dynamicVarFn;
|
||||
|
||||
DynamicRule(
|
||||
CldrDataType dtdType,
|
||||
String prefix,
|
||||
Iterable<ResultSpec> specs,
|
||||
VarString varString,
|
||||
Function<Character, CldrPath> varFn,
|
||||
String xpathSpec,
|
||||
int lineNumber) {
|
||||
|
||||
super(dtdType, prefix, specs, xpathSpec, lineNumber);
|
||||
this.varString = checkNotNull(varString);
|
||||
this.dynamicVarFn = checkNotNull(varFn);
|
||||
}
|
||||
|
||||
@Override Pattern getPathPattern(DynamicVars varLookupFn) {
|
||||
String pathRegex = varString.apply(dynamicVarFn.andThen(varLookupFn)).get();
|
||||
return Pattern.compile(pathRegex);
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,152 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.regex;
|
||||
|
||||
import static com.google.common.base.CharMatcher.whitespace;
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static com.google.common.collect.ImmutableMap.toImmutableMap;
|
||||
import static com.google.common.collect.Maps.filterValues;
|
||||
import static com.google.common.collect.Maps.transformValues;
|
||||
import static java.util.function.Function.identity;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.unicode.cldr.api.CldrDataType;
|
||||
import org.unicode.cldr.api.CldrPath;
|
||||
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import com.google.common.escape.CharEscaperBuilder;
|
||||
import com.google.common.escape.Escaper;
|
||||
|
||||
/** Parser for rule specifications in the regex transformer configuration files. */
|
||||
final class RuleParser {
|
||||
// Pattern to capture first two path elements (for the dtd type and path prefix).
|
||||
private static final Pattern PATH_SPEC_PREFIX = Pattern.compile("//([^/]+)/([^/]+)/");
|
||||
|
||||
// Preprocessing replaces %X variables defined in the configuration file. This helps to
|
||||
// keep the path specification a bit easier to read.
|
||||
private static final Pattern VAR = Pattern.compile("^%([A-Z])=(.*)$");
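// For example (illustrative): "%D=//some/path" declares a dynamic path variable %D, while a
// declaration whose value does not start with "//" (e.g. "%W=[\w ]+") is treated as a static
// regex fragment substituted directly into path specifications.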
|
||||
|
||||
// Continuation lines of multi-line rules start with " ; " (optional whitespace around ';').
|
||||
private static final Pattern RULE_PARTS_SEPERATOR = Pattern.compile("\\s*+;\\s*+");
|
||||
|
||||
// Splitter for the resource bundle / value declarations.
|
||||
private static final Splitter RULE_PARTS_SPLITTER =
|
||||
Splitter.on(RULE_PARTS_SEPERATOR).trimResults(whitespace()).omitEmptyStrings();
|
||||
|
||||
// Splitter for instruction name/expressions.
|
||||
private static final Splitter INSTRUCTION_SPLITTER =
|
||||
Splitter.on('=').trimResults(whitespace()).limit(2);
|
||||
|
||||
// Only '[',']' need escaping in path specifications (so we can write "foo[@bar="baz"]").
|
||||
private static final Escaper SPECIAL_CHARS_ESCAPER =
|
||||
new CharEscaperBuilder().addEscape('[', "\\[").addEscape(']', "\\]").toEscaper();
|
||||
|
||||
/** Parses a configuration file to create a sequence of transformation rules. */
|
||||
static ImmutableList<Rule> parseConfig(
|
||||
List<String> configLines, List<NamedFunction> functions) {
|
||||
// Extract '%X' variable declarations in the first pass.
|
||||
ImmutableMap<Character, String> varMap = configLines.stream()
|
||||
.filter(s -> s.startsWith("%"))
|
||||
.map(VAR::matcher)
|
||||
.peek(m -> checkArgument(m.matches(), "invalid argument declaration: %s", m))
|
||||
.collect(ImmutableMap.toImmutableMap(m -> m.group(1).charAt(0), m -> m.group(2)));
|
||||
return new RuleParser(varMap, functions).parseLines(configLines);
|
||||
}
|
||||
|
||||
private final ImmutableMap<Character, String> staticVarMap;
|
||||
private final ImmutableMap<Character, CldrPath> dynamicVarMap;
|
||||
private final ImmutableMap<String, NamedFunction> fnMap;
|
||||
|
||||
private RuleParser(ImmutableMap<Character, String> varMap, List<NamedFunction> functions) {
|
||||
this.staticVarMap = ImmutableMap.copyOf(filterValues(varMap, s -> !s.startsWith("//")));
|
||||
this.dynamicVarMap = ImmutableMap.copyOf(
|
||||
transformValues(
|
||||
filterValues(varMap, s -> s.startsWith("//")),
|
||||
CldrPath::parseDistinguishingPath));
|
||||
this.fnMap =
|
||||
functions.stream().collect(toImmutableMap(NamedFunction::getName, identity()));
|
||||
}
|
||||
|
||||
private ImmutableList<Rule> parseLines(List<String> configLines) {
|
||||
List<Rule> rules = new ArrayList<>();
|
||||
for (int lineIndex = 0; lineIndex < configLines.size(); lineIndex++) {
|
||||
String line = configLines.get(lineIndex);
|
||||
try {
|
||||
if (line.startsWith("//")) {
|
||||
// Either it's "//xpath ; resource-bundle-path ; values"
|
||||
// Or "//xpath" with " ; resource-bundle-path ; values" on subsequent lines.
|
||||
int ruleLineNumber = lineIndex + 1;
|
||||
int xpathEnd = line.indexOf(";");
|
||||
String xpath;
|
||||
List<ResultSpec> specs = new ArrayList<>();
|
||||
if (xpathEnd != -1) {
|
||||
// Single line rule, extract result specification from trailing part.
|
||||
xpath = whitespace().trimFrom(line.substring(0, xpathEnd));
|
||||
// Keep leading " ; " in the transformation string since it matches the
|
||||
// multi-rule case and is handled the same.
|
||||
specs.add(parseResultSpec(line.substring(xpathEnd), lineIndex + 1));
|
||||
} else {
|
||||
xpath = line;
|
||||
while (++lineIndex < configLines.size()
|
||||
&& RULE_PARTS_SEPERATOR.matcher(configLines.get(lineIndex)).lookingAt()) {
|
||||
specs.add(parseResultSpec(configLines.get(lineIndex), lineIndex + 1));
|
||||
}
|
||||
// The loop above moved us past the last line of the rule, so readjust.
|
||||
lineIndex--;
|
||||
}
|
||||
rules.add(parseRule(xpath, specs, ruleLineNumber));
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(
|
||||
String.format("parse error at line %d: %s", lineIndex + 1, line), e);
|
||||
}
|
||||
}
|
||||
return ImmutableList.copyOf(rules);
|
||||
}
|
||||
|
||||
private ResultSpec parseResultSpec(String spec, int lineNumber) {
|
||||
// The result specifier still has leading separator (e.g. " ; /foo/bar/$1 ; value=$2"),
|
||||
// but that's okay because the splitter ignores empty results.
|
||||
List<String> rbPathAndInstructions = RULE_PARTS_SPLITTER.splitToList(spec);
|
||||
String rbPathSpec = rbPathAndInstructions.get(0);
|
||||
|
||||
ImmutableMap<Instruction, VarString> instructions =
|
||||
rbPathAndInstructions.stream()
|
||||
.skip(1)
|
||||
.map(INSTRUCTION_SPLITTER::splitToList)
|
||||
.collect(toImmutableMap(
|
||||
p -> Instruction.forId(p.get(0)),
|
||||
p -> VarString.of(p.size() > 1 ? p.get(1) : "", staticVarMap::get)));
|
||||
return new ResultSpec(rbPathSpec, instructions, lineNumber, fnMap, dynamicVarMap::get);
|
||||
}
|
||||
|
||||
private Rule parseRule(String xpathSpec, List<ResultSpec> resultSpecs, int lineNumber) {
|
||||
// The escaped path is nearly a regular expression, but still contains '%X' variables.
|
||||
String escapedPathSpec = SPECIAL_CHARS_ESCAPER.escape(xpathSpec);
|
||||
Matcher m = PATH_SPEC_PREFIX.matcher(escapedPathSpec);
|
||||
checkArgument(m.lookingAt(), "unexpected path spec: %s", escapedPathSpec);
|
||||
|
||||
// Extract type a path prefix for rule grouping and fast rejection during matching.
|
||||
CldrDataType dtdType = CldrDataType.forXmlName(m.group(1));
|
||||
String pathPrefix = m.group(2);
|
||||
|
||||
// If the variable string contains a "dynamic" argument, is cannot be resolved yet and
|
||||
// must result in a "dynamic" rule being created here (this is very rare though).
|
||||
VarString varString = VarString.of(escapedPathSpec, staticVarMap::get);
|
||||
Optional<String> resolved = varString.resolve();
|
||||
// Don't turn this into a "map().orElse()" chain (despite what your IDE might suggest)
|
||||
// because we don't want to create lots of unused dynamic rules!
|
||||
return resolved.isPresent()
|
||||
? Rule.staticRule(
|
||||
dtdType, pathPrefix, resultSpecs, resolved.get(), xpathSpec, lineNumber)
|
||||
: Rule.dynamicRule(
|
||||
dtdType, pathPrefix, resultSpecs, varString, dynamicVarMap::get, xpathSpec, lineNumber);
|
||||
}
|
||||
}

90
tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/regex/VarString.java
Normal file

@@ -0,0 +1,90 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.regex;

import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;

import java.util.Optional;
import java.util.function.Function;

import com.google.common.base.CharMatcher;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;

/**
 * An immutable representation of a String with placeholders for variable substitution. A
 * VarString can be "resolved" or "partially resolved" by providing a mapping from placeholder
 * characters to strings, and any remaining unresolved variables are tracked. This is a very
 * private bit of implementation detail with a far from ideal API, so it's probably best not to
 * use it elsewhere without careful thought.
 */
final class VarString {
    private static final CharMatcher VAR_CHAR = CharMatcher.inRange('A', 'Z');

    static VarString of(String varString) {
        ImmutableSet.Builder<Character> requiredChars = ImmutableSet.builder();
        // Variable placeholders are any '%' followed by an upper-case ASCII letter (A-Z).
        // Other '%' chars are ignored.
        for (int i = 0; i < varString.length() - 1; i++) {
            if (varString.charAt(i) == '%') {
                char c = varString.charAt(i + 1);
                if (VAR_CHAR.matches(c)) {
                    requiredChars.add(c);
                }
            }
        }
        return new VarString(varString, requiredChars.build(), ImmutableMap.of());
    }

    static VarString of(String s, Function<Character, String> varFn) {
        return of(s).apply(varFn);
    }

    private final String varString;
    private final ImmutableSet<Character> requiredChars;
    private final ImmutableMap<Character, String> varMap;

    private VarString(
        String varString,
        ImmutableSet<Character> requiredChars,
        ImmutableMap<Character, String> varMap) {
        this.varString = checkNotNull(varString);
        this.requiredChars = checkNotNull(requiredChars);
        this.varMap = checkNotNull(varMap);
    }

    /** Applies a variable function to produce a new, potentially resolved, VarString. */
    VarString apply(Function<Character, String> varFn) {
        ImmutableMap.Builder<Character, String> newVarMap = ImmutableMap.builder();
        newVarMap.putAll(this.varMap);
        for (Character c : requiredChars) {
            if (!varMap.containsKey(c)) {
                // Allowed to return null if the function cannot resolve a variable.
                String v = varFn.apply(c);
                if (v != null) {
                    newVarMap.put(c, v);
                }
            }
        }
        return new VarString(varString, requiredChars, newVarMap.build());
    }

    /** Returns a resolved value if all variables are available for substitution. */
    Optional<String> resolve() {
        return varMap.keySet().equals(requiredChars)
            ? Optional.of(
                RegexTransformer.substitute(varString, '%', c -> varMap.getOrDefault(c, "%" + c)))
            : Optional.empty();
    }

    /** Returns the resolved value or fails if not all variables are available. */
    String get() {
        checkState(varMap.keySet().equals(requiredChars), "unresolved variable string: %s", this);
        return RegexTransformer.substitute(varString, '%', c -> varMap.getOrDefault(c, "%" + c));
    }

    @Override public String toString() {
        return varString + ": " + varMap;
    }
}

@@ -0,0 +1,2 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License

350
tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_locale.txt
Normal file

@@ -0,0 +1,350 @@
# ldml2icu_locale.txt
|
||||
#
|
||||
# © 2016 and later: Unicode, Inc. and others.
|
||||
#
|
||||
# CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
|
||||
# For terms of use, see http://www.unicode.org/copyright.html
|
||||
#
|
||||
# Used by LdmlLocaleMapper.
|
||||
# Data-driven file for mapping LDML locale paths to ICU paths.
|
||||
# See ldml2icu_readme.txt for a detailed explanation of this file.
|
||||
|
||||
# Variables
|
||||
# Attribute value
|
||||
%A=[^"']++
|
||||
# Word
|
||||
%W=[\w\-]++
|
||||
# Greedy word match
|
||||
%G=[\w\-]+
|
||||
# Number match
|
||||
%N=\d++
|
||||
# The default numbering system to be used.
|
||||
%D=//ldml/numbers/defaultNumberingSystem
|
||||
|
||||
# Main locale data
|
||||
|
||||
# Aliases
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/alias[@source="locale"][@path="../calendar[@type='(%A)']"]
|
||||
; /calendar/$1lo ; values=/LOCALE/calendar/$2
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/alias[@source="locale"][@path="../../calendar[@type='(%A)']/dayPeriods"]
|
||||
; /calendar/$1/AmPmMarkers:alias ; values=/LOCALE/calendar/$2/AmPmMarkers
|
||||
; /calendar/$1/AmPmMarkersNarrow:alias ; values=/LOCALE/calendar/$2/AmPmMarkersNarrow
|
||||
; /calendar/$1/NoonMarker:alias ; values=/LOCALE/calendar/$2/NoonMarker
|
||||
; /calendar/$1/NoonMarkerNarrow:alias ; values=/LOCALE/calendar/$2/NoonMarkerNarrow
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="gregorian"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="wide"]/alias[@source="locale"][@path="../dayPeriodWidth[@type='abbreviated']"]
|
||||
; /calendar/gregorian/AmPmMarkers:alias ; values=/LOCALE/calendar/gregorian/AmPmMarkersAbbr
|
||||
//ldml/dates/calendars/calendar[@type="gregorian"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="narrow"]/alias[@source="locale"][@path="../dayPeriodWidth[@type='abbreviated']"]
|
||||
; /calendar/gregorian/AmPmMarkersNarrow:alias ; values=/LOCALE/calendar/gregorian/AmPmMarkersAbbr
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(eras|quarters|cyclicNameSets|monthPatterns)/alias[@source="locale"][@path="../../%W[@type='(%A)']/%W"]
|
||||
; /calendar/$1/$2:alias ; values=/LOCALE/calendar/$3/$2
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNarrow/alias[@source="locale"][@path="../eraAbbr"]
|
||||
; /calendar/$1/eras/narrow:alias ; values=/LOCALE/calendar/$1/eras/abbreviated
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNames/alias[@source="locale"][@path="../eraAbbr"]
|
||||
; /calendar/$1/eras/wide:alias ; values=/LOCALE/calendar/$1/eras/abbreviated
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(cyclicNameSet)s/\2[@type="(%W)"]/alias[@source="locale"][@path="../\2[@type='(%A)']"]
|
||||
; /calendar/$1/$2s/$3:alias ; values=/LOCALE/calendar/$1/$2s/$4
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(cyclicNameSet)s/\2[@type="(%W)"]/(cyclicName)Context[@type="(%W)"]/\4Width[@type="(%W)"]/alias[@source="locale"][@path="../../../\4Set[@type='(%A)']/\4Context[@type='(%A)']/\4Width[@type='(%A)']"]
|
||||
; /calendar/$1/$2s/$3/$5/$6:alias ; values=/LOCALE/calendar/$1/$2s/$7/$8/$9
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(cyclicNameSet)s/\2[@type="(%W)"]/(cyclicName)Context[@type="(%W)"]/\4Width[@type="(%W)"]/alias[@source="locale"][@path="../\4Width[@type='(%A)']"]
|
||||
; /calendar/$1/$2s/$3/$5/$6:alias ; values=/LOCALE/calendar/$1/$2s/$3/$5/$7
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(cyclicNameSet|monthPattern|quarter)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/alias[@source="locale"][@path="../../\2Context[@type='(%W)']/\2Width[@type='(%A)']"]
|
||||
; /calendar/$1/$2s/$3/$4:alias ; values=/LOCALE/calendar/$1/$2s/$5/$6
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/alias[@source="locale"][@path="../../%W[@type='(%A)']/%W"]
|
||||
; /calendar/$1/$2Names:alias ; values=/LOCALE/calendar/$3/$2Names
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/alias[@source="locale"][@path="../\2Width[@type='(%A)']"]
|
||||
; /calendar/$1/$2Names/$3/$4:alias ; values=/LOCALE/calendar/$1/$2Names/$3/$5
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/alias[@source="locale"][@path="../../\2Context[@type='(%W)']/\2Width[@type='(%A)']"]
|
||||
; /calendar/$1/$2Names/$3/$4:alias ; values=/LOCALE/calendar/$1/$2Names/$5/$6
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(monthPattern|quarter)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/alias[@source="locale"][@path="../\2Width[@type='(%A)']"]
|
||||
; /calendar/$1/$2s/$3/$4:alias ; values=/LOCALE/calendar/$1/$2s/$3/$5
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dateFormats/alias[@source="locale"][@path="../../calendar[@type='(%A)']/dateFormats"]
|
||||
; /calendar/$1/DateTimePatterns:alias ; values=/LOCALE/calendar/$2/DateTimePatterns
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/alias[@source="locale"][@path="../../calendar[@type='(%A)']/dateTimeFormats"]
|
||||
; /calendar/$1/availableFormats:alias ; values=/LOCALE/calendar/$2/availableFormats
|
||||
; /calendar/$1/appendItems:alias ; values=/LOCALE/calendar/$2/appendItems
|
||||
; /calendar/$1/intervalFormats:alias ; values=/LOCALE/calendar/$2/intervalFormats
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/(availableFormats|appendItems|intervalFormats)/alias[@source="locale"][@path="../../../calendar[@type='(%A)']/dateTimeFormats/\2"]
|
||||
; /calendar/$1/$2:alias ; values=/LOCALE/calendar/$3/$2
|
||||
|
||||
//ldml/units/unitLength[@type="long"]/alias[@source="locale"][@path="../unitLength[@type='short']"]
|
||||
; /units:alias ; values=/LOCALE/unitsShort
|
||||
//ldml/units/unitLength[@type="narrow"]/alias[@source="locale"][@path="../unitLength[@type='short']"]
|
||||
; /unitsNarrow:alias ; values=/LOCALE/unitsShort
|
||||
|
||||
//ldml/listPatterns/listPattern[@type="(%A)"]/alias[@source="locale"][@path="../listPattern"]
|
||||
; /listPattern/$1/start:alias ; values=/LOCALE/listPattern/standard/start
|
||||
; /listPattern/$1/middle:alias ; values=/LOCALE/listPattern/standard/middle
|
||||
; /listPattern/$1/end:alias ; values=/LOCALE/listPattern/standard/end
|
||||
; /listPattern/$1/2:alias ; values=/LOCALE/listPattern/standard/2
|
||||
//ldml/listPatterns/listPattern[@type="(%A)"]/alias[@source="locale"][@path="../listPattern[@type='(%A)']"]
|
||||
; /listPattern/$1/start:alias ; values=/LOCALE/listPattern/$2/start
|
||||
; /listPattern/$1/middle:alias ; values=/LOCALE/listPattern/$2/middle
|
||||
; /listPattern/$1/end:alias ; values=/LOCALE/listPattern/$2/end
|
||||
; /listPattern/$1/2:alias ; values=/LOCALE/listPattern/$2/2
|
||||
|
||||
//ldml/numbers/currencyFormats[@numberSystem="(%A)"]/currencyFormatLength/currencyFormat[@type="accounting"]/alias[@source="locale"][@path="../(%W)[@type='standard']"] ; /NumberElements/$1/patterns/accountingFormat:alias ; values=/LOCALE/NumberElements/$1/patterns/$2
|
||||
|
||||
# Characters
|
||||
|
||||
//ldml/characters/exemplarCharacters[@type="auxiliary"] ; /AuxExemplarCharacters
|
||||
//ldml/characters/exemplarCharacters[@type="currencySymbol"] ; /ExemplarCharactersCurrency
|
||||
//ldml/characters/exemplarCharacters[@type="index"] ; /ExemplarCharactersIndex
|
||||
//ldml/characters/exemplarCharacters[@type="punctuation"] ; /ExemplarCharactersPunctuation
|
||||
//ldml/characters/exemplarCharacters[@type="numbers"] ; /ExemplarCharactersNumbers
|
||||
//ldml/characters/exemplarCharacters ; /ExemplarCharacters
|
||||
|
||||
//ldml/characters/ellipsis[@type="(%A)"] ; /Ellipsis/$1
|
||||
//ldml/characters/moreInformation ; /MoreInformation
|
||||
//ldml/characters/special/icu:scripts/icu:script[@type="%N"] ; /LocaleScript
|
||||
|
||||
//ldml/characters/parseLenients[@scope="(%A)"][@level="(%A)"]/parseLenient[@sample="%A"] ; /parse/$1/$2
|
||||
|
||||
# Defaults
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(monthPattern)s/\2Context[@type="(%W)"]/\2Width[@type="(%W)"]/\2[@type="(%W)"]
|
||||
; /calendar/$1/$2s/$3/$4/$5
|
||||
|
||||
# Dates
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/cyclicNameSets/cyclicNameSet[@type="(%A)"]/cyclicNameContext[@type="(%A)"]/cyclicNameWidth[@type="(%A)"]/cyclicName[@type="(%A)"]
|
||||
; /calendar/$1/cyclicNameSets/$2/$3/$4 ;
|
||||
|
||||
# ---- /calendar/xxx/DateTimePatterns
|
||||
# Rules are split to force manual ordering within the array produced by them (they share the same output path).
|
||||
#
|
||||
# Note that (like many other places) the uncaptured "type" attributes are just expected to be "standard", and the %A
|
||||
# variable is only used to save a bit of space. The final output array has 3 groups ("time" -> "date" -> "date-time")
|
||||
# each with 4 elements based on the pattern length ("full" -> "long" -> "medium" -> "short"), giving 12 patterns in
|
||||
# total.
|
||||
#
|
||||
# However, due to an awful hack, there end up being 13 values in the array, with the medium date-time value being
# duplicated at index 8. This duplication is applied later, however, because the regex transformer does not permit the same
|
||||
# CLDR path to emit values in different places in an array.
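#
# As an illustrative sketch (indices 0-based), the 12 values emitted by the rules below are ordered:
#   [0..3]   time patterns       (full, long, medium, short)
#   [4..7]   date patterns       (full, long, medium, short)
#   [8..11]  date-time patterns  (full, long, medium, short)
# with the extra copy of the medium date-time pattern later inserted at index 8 (making 13 values in total).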
|
||||
|
||||
# Time patterns (4 x values)
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(timeFormat)s/\2Length[@type="(%A)"]/\2[@type="%A"]/pattern[@type="%A"]
|
||||
; /calendar/$1/DateTimePatterns
|
||||
|
||||
# Date patterns (4 x values)
|
||||
#
|
||||
# This is a weird edge case. When the "numbers" attribute is present in the xpath, its value needs to be grouped
|
||||
# together with the xpath value in its own special array, which is treated like just another value in
|
||||
# /DateTimePatterns. The group keyword is used here to specify that values from the same xpath should be grouped
|
||||
# into their own separate array. Since each possible pattern length can have patterns with and without the number
|
||||
# attribute, we must explicitly split the rules to enforce correct output order.
|
||||
#
|
||||
# So far (Jan 2014), this only happens in the Chinese calendar for ja/zh/zh_Hant and the Hebrew calendar for he,
|
||||
# and all calendars for haw (which has numbers="M=romanlow").
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(full)"]/\2[@type="%A"]/pattern[@type="%A"]
|
||||
; /calendar/$1/DateTimePatterns
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(full)"]/\2[@type="%A"]/pattern[@type="%A"][@numbers="(%A)"]
|
||||
; /calendar/$1/DateTimePatterns ; values="{value}" $4 ; group
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(long)"]/\2[@type="%A"]/pattern[@type="%A"]
|
||||
; /calendar/$1/DateTimePatterns
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(long)"]/\2[@type="%A"]/pattern[@type="%A"][@numbers="(%A)"]
|
||||
; /calendar/$1/DateTimePatterns ; values="{value}" $4 ; group
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(medium)"]/\2[@type="%A"]/pattern[@type="%A"]
|
||||
; /calendar/$1/DateTimePatterns
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(medium)"]/\2[@type="%A"]/pattern[@type="%A"][@numbers="(%A)"]
|
||||
; /calendar/$1/DateTimePatterns ; values="{value}" $4 ; group
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(short)"]/\2[@type="%A"]/pattern[@type="%A"]
|
||||
; /calendar/$1/DateTimePatterns
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateFormat)s/\2Length[@type="(short)"]/\2[@type="%A"]/pattern[@type="%A"][@numbers="(%A)"]
|
||||
; /calendar/$1/DateTimePatterns ; values="{value}" $4 ; group
|
||||
|
||||
# DateTime patterns (4 x values)
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(dateTimeFormat)s/\2Length[@type="(%A)"]/\2[@type="%A"]/pattern[@type="%A"]
|
||||
; /calendar/$1/DateTimePatterns
|
||||
# ----
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/appendItems/appendItem[@request="(%A)"] ; /calendar/$1/appendItems/$2
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/availableFormats/dateFormatItem[@id="(%A)"] ; /calendar/$1/availableFormats/$2
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/availableFormats/dateFormatItem[@id="(%A)"][@count="(%A)"] ; /calendar/$1/availableFormats/$2/$3
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/intervalFormats/intervalFormatItem[@id="(%A)"]/greatestDifference[@id="(%A)"] ; /calendar/$1/intervalFormats/$2/$3
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dateTimeFormats/intervalFormats/intervalFormatFallback ; /calendar/$1/intervalFormats/fallback
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="wide"]/dayPeriod[@type="(am|pm)"][@alt="(%A)"] ; /calendar/$1/AmPmMarkers%$3
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="narrow"]/dayPeriod[@type="(am|pm)"][@alt="(%A)"] ; /calendar/$1/AmPmMarkersNarrow%$3
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="abbreviated"]/dayPeriod[@type="(am|pm)"][@alt="(%A)"] ; /calendar/$1/AmPmMarkersAbbr%$3
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="wide"]/dayPeriod[@type="(am|pm)"] ; /calendar/$1/AmPmMarkers
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="abbreviated"]/dayPeriod[@type="(am|pm)"] ; /calendar/$1/AmPmMarkersAbbr
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="format"]/dayPeriodWidth[@type="narrow"]/dayPeriod[@type="(am|pm)"] ; /calendar/$1/AmPmMarkersNarrow
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="(stand-alone)"]/dayPeriodWidth[@type="(%A)"]/dayPeriod[@type="(am|pm)"][@alt="(%A)"] ; /calendar/$1/dayPeriod/$2/$3/$4%$5
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="(stand-alone)"]/dayPeriodWidth[@type="(%A)"]/dayPeriod[@type="(am|pm)"] ; /calendar/$1/dayPeriod/$2/$3/$4
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="(%A)"]/dayPeriodWidth[@type="(%A)"]/dayPeriod[@type="(?!am|pm)(%A)"][@alt="(%A)"] ; /calendar/$1/dayPeriod/$2/$3/$4%$5
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/dayPeriods/dayPeriodContext[@type="(%A)"]/dayPeriodWidth[@type="(%A)"]/dayPeriod[@type="(?!am|pm)(%A)"] ; /calendar/$1/dayPeriod/$2/$3/$4
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNarrow/era[@type="(%A)"][@alt="(%A)"] ; /calendar/$1/eras/narrow%$3
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraAbbr/era[@type="(%A)"][@alt="(%A)"] ; /calendar/$1/eras/abbreviated%$3
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNames/era[@type="(%A)"][@alt="(%A)"] ; /calendar/$1/eras/wide%$3
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNarrow/era[@type="(%A)"] ; /calendar/$1/eras/narrow
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraAbbr/era[@type="(%A)"] ; /calendar/$1/eras/abbreviated
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/eras/eraNames/era[@type="(%A)"] ; /calendar/$1/eras/wide
|
||||
|
||||
# Leap year names go after other month names.
|
||||
# "yeartype" is an #IMPLIED attribute in the DTD and it should implicitly default to "standard".
|
||||
# In practice "standard" is never explicitly given, but it could be (so must match it here).
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/%W[@type="(%A)"]/%W[@type="(%A)"]/%W[@type="(%A)"](?:[@yeartype="standard"])? ; /calendar/$1/$2Names/$3/$4
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(day|month)s/%W[@type="(%A)"]/%W[@type="(%A)"]/%W[@type="(%A)"][@yeartype="leap"] ; /calendar/$1/$2Names/$3/$4
|
||||
|
||||
//ldml/dates/calendars/calendar[@type="(%A)"]/(quarters)/%W[@type="(%A)"]/%W[@type="(%A)"]/%W[@type="%A"] ; /calendar/$1/$2/$3/$4
|
||||
|
||||
//ldml/dates/fields/field[@type="(%A)"]/displayName[@alt="(%A)"] ; /fields/$1/dn%$2
|
||||
//ldml/dates/fields/field[@type="(%A)"]/displayName ; /fields/$1/dn
|
||||
//ldml/dates/fields/field[@type="(%A)"]/relative[@type="(%A)"] ; /fields/$1/relative/"$2"
|
||||
//ldml/dates/fields/field[@type="(%A)"]/relativePeriod ; /fields/$1/relativePeriod
|
||||
//ldml/dates/fields/field[@type="(%A)"]/relativeTime[@type="(%A)"]/relativeTimePattern[@count="(%A)"] ; /fields/$1/relativeTime/$2/$3
|
||||
|
||||
//ldml/dates/fields/field[@type="(%A)"]/alias[@source="locale"][@path="../field[@type='(%A)']"] ; /fields/$1:alias ; values=/LOCALE/fields/$2
|
||||
|
||||
//ldml/dates/timeZoneNames/regionFormat[@type="daylight"] ; /zoneStrings/regionFormatDaylight
|
||||
//ldml/dates/timeZoneNames/regionFormat[@type="standard"] ; /zoneStrings/regionFormatStandard
|
||||
//ldml/dates/timeZoneNames/(%GFormat) ; /zoneStrings/$1
|
||||
|
||||
//ldml/dates/timeZoneNames/metazone[@type="(%A)"]/(\w)%W/(\w)%W ; /zoneStrings/"meta:$1"/$2$3
|
||||
|
||||
//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)"]/exemplarCity[@alt="(%A)"] ; /zoneStrings/"$1:$2"/ec%$3
|
||||
//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)"]/exemplarCity ; /zoneStrings/"$1:$2"/ec
|
||||
//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)"]/(\w)%W/(\w)%W ; /zoneStrings/"$1:$2"/$3$4
|
||||
//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)/(%W)"]/exemplarCity[@alt="(%A)"] ; /zoneStrings/"$1:$2:$3"/ec%$4
|
||||
//ldml/dates/timeZoneNames/zone[@type="(%W)/(%W)/(%W)"]/exemplarCity ; /zoneStrings/"$1:$2:$3"/ec
|
||||
|
||||
# Locale Display Names
|
||||
|
||||
//ldml/localeDisplayNames/codePatterns/codePattern[@type="(%A)"] ; /codePatterns/$1
|
||||
//ldml/localeDisplayNames/annotationPatterns/annotationPattern[@type="(%A)"] ; /codePatterns/$1
|
||||
|
||||
//ldml/localeDisplayNames/keys/key[@type="(%A)"] ; /Keys/$1
|
||||
|
||||
//ldml/localeDisplayNames/languages/language[@type="(%A)"][@alt="(%A)"] ; /Languages%$2/$1
|
||||
//ldml/localeDisplayNames/languages/language[@type="(%A)"] ; /Languages/$1
|
||||
|
||||
//ldml/localeDisplayNames/localeDisplayPattern/localeKeyTypePattern ; /localeDisplayPattern/keyTypePattern
|
||||
//ldml/localeDisplayNames/localeDisplayPattern/localePattern ; /localeDisplayPattern/pattern
|
||||
//ldml/localeDisplayNames/localeDisplayPattern/localeSeparator ; /localeDisplayPattern/separator
|
||||
|
||||
//ldml/localeDisplayNames/measurementSystemNames/measurementSystemName[@type="(%A)"] ; /measurementSystemNames/$1
|
||||
|
||||
//ldml/localeDisplayNames/scripts/script[@type="(%A)"][@alt="(%A)"] ; /Scripts%$2/$1
|
||||
//ldml/localeDisplayNames/scripts/script[@type="(%A)"] ; /Scripts/$1
|
||||
|
||||
//ldml/localeDisplayNames/territories/territory[@type="(%A)"][@alt="(%A)"] ; /Countries%$2/$1
|
||||
//ldml/localeDisplayNames/territories/territory[@type="(%A)"] ; /Countries/$1
|
||||
|
||||
//ldml/localeDisplayNames/transformNames/transformName[@type="(%W)"] ; /transformNames/$1
|
||||
|
||||
//ldml/localeDisplayNames/types/type[@key="(%A)"][@type="(%A)"][@alt="(%A)"] ; /Types%$3/$1/$2
|
||||
//ldml/localeDisplayNames/types/type[@key="(%A)"][@type="(%A)"] ; /Types/$1/$2
|
||||
|
||||
//ldml/localeDisplayNames/variants/variant[@type="(%A)"][@alt="(%A)"] ; /Variants%$2/$1
|
||||
//ldml/localeDisplayNames/variants/variant[@type="(%A)"] ; /Variants/$1
|
||||
|
||||
# Numbers
|
||||
|
||||
//ldml/numbers/currencies/currency[@type="(%A)"]/displayName[@count="(%A)"] ; /CurrencyPlurals/$1/$2
|
||||
|
||||
//ldml/numbers/currencies/currency[@type="(%W)"]/symbol[@alt="(%A)"] ; /Currencies%$2/$1
|
||||
|
||||
# ---- /Currencies/XXX bundles
|
||||
# Ordering of rules is critical here since they write into the same resource bundle path and the
|
||||
# last 3 values are grouped together as a single value (via the special <FIFO> hidden label).
|
||||
#
|
||||
# Note that the <FIFO> label is needed here (not the "group" instruction) because the grouped
|
||||
# values must be seen as having a resource bundle path that is a child of the "/Currencies/$1"
|
||||
# path. This is so that the grouped values only appear when one of them is present rather than
|
||||
# whenever any of the other values in the main resource bundle path exist.
|
||||
#
|
||||
# Due to the optional nature of the final sub-array in the bundle, it would be very hard to ever
|
||||
# add more elements after it.
|
||||
//ldml/numbers/currencies/currency[@type="(%W)"]/symbol
|
||||
; /Currencies/$1 ; fallback=$1
|
||||
//ldml/numbers/currencies/currency[@type="(%W)"]/displayName
|
||||
; /Currencies/$1 ; fallback=$1
|
||||
//ldml/numbers/currencies/currency[@type="(%W)"]/pattern[@type="standard"]
|
||||
; /Currencies/$1/<FIFO> ; fallback=//ldml/numbers/currencyFormats[@numberSystem="%D"]/currencyFormatLength/currencyFormat[@type="standard"]/pattern[@type="standard"]
|
||||
//ldml/numbers/currencies/currency[@type="(%W)"]/decimal
|
||||
; /Currencies/$1/<FIFO> ; fallback=//ldml/numbers/symbols[@numberSystem="%D"]/decimal
|
||||
//ldml/numbers/currencies/currency[@type="(%W)"]/group
|
||||
; /Currencies/$1/<FIFO> ; fallback=//ldml/numbers/symbols[@numberSystem="%D"]/group
|
||||
# ----
|
||||
|
||||
//ldml/numbers/currencyFormats[@numberSystem="%D"]/currencySpacing/(%W)/(%W) ; /currencySpacing/$1/$2
|
||||
//ldml/numbers/currencyFormats[@numberSystem="%D"]/unitPattern[@count="(%W)"] ; /CurrencyUnitPatterns/$1
|
||||
|
||||
//ldml/numbers/defaultNumberingSystem[@alt="(%A)"] ; /NumberElements/default_$1
|
||||
//ldml/numbers/defaultNumberingSystem ; /NumberElements/default
|
||||
//ldml/numbers/minimumGroupingDigits ; /NumberElements/minimumGroupingDigits
|
||||
//ldml/numbers/otherNumberingSystems/(%W) ; /NumberElements/$1
|
||||
|
||||
//ldml/numbers/symbols[@numberSystem="(%A)"]/(%W) ; /NumberElements/$1/symbols/$2
|
||||
//ldml/numbers/(%GFormat)s[@numberSystem="(%W)"]/\1Length/\1[@type="standard"]/pattern[@type="standard"] ; /NumberElements/$2/patterns/$1
|
||||
//ldml/numbers/currencyFormats[@numberSystem="(%W)"]/currencyFormatLength/currencyFormat[@type="accounting"]/pattern[@type="standard"] ; /NumberElements/$1/patterns/accountingFormat
|
||||
//ldml/numbers/currencyFormats[@numberSystem="(%W)"]/currencyFormatLength[@type="short"]/currencyFormat[@type="standard"]/pattern[@type="(%N)"][@count="(%W)"] ; /NumberElements/$1/patternsShort/currencyFormat/$2/$3
|
||||
//ldml/numbers/decimalFormats[@numberSystem="(%W)"]/decimalFormatLength[@type="short"]/decimalFormat[@type="standard"]/pattern[@type="(%N)"][@count="(%W)"] ; /NumberElements/$1/patternsShort/decimalFormat/$2/$3
|
||||
//ldml/numbers/decimalFormats[@numberSystem="(%W)"]/decimalFormatLength[@type="long"]/decimalFormat[@type="standard"]/pattern[@type="(%N)"][@count="(%W)"] ; /NumberElements/$1/patternsLong/decimalFormat/$2/$3
|
||||
|
||||
//ldml/numbers/miscPatterns[@numberSystem="(%W)"]/pattern[@type="(%W)"] ; /NumberElements/$1/miscPatterns/$2
|
||||
//ldml/numbers/minimalPairs/ordinalMinimalPairs[@ordinal="(%A)"] ; /NumberElements/minimalPairs/ordinal/$1
|
||||
//ldml/numbers/minimalPairs/pluralMinimalPairs[@count="(%A)"] ; /NumberElements/minimalPairs/plural/$1
|
||||
|
||||
# Misc
|
||||
|
||||
# Ordering of rules is critical here since they write into the same resource bundle path.
|
||||
//ldml/contextTransforms/contextTransformUsage[@type="(%W)"]/contextTransform[@type="uiListOrMenu"] ; /contextTransforms/$1:intvector ; values=&context_transform_index({value}) ; fallback=0
|
||||
//ldml/contextTransforms/contextTransformUsage[@type="(%W)"]/contextTransform[@type="stand-alone"] ; /contextTransforms/$1:intvector ; values=&context_transform_index({value}) ; fallback=0
|
||||
|
||||
//ldml/delimiters/(%W) ; /delimiters/$1
|
||||
|
||||
//ldml/layout/orientation/(%G)Order ; /layout/$1s
|
||||
|
||||
//ldml/listPatterns/listPattern/listPatternPart[@type="(%A)"] ; /listPattern/standard/$1
|
||||
//ldml/listPatterns/listPattern[@type="(%A)"]/listPatternPart[@type="(%A)"] ; /listPattern/$1/$2
|
||||
|
||||
//ldml/units/unitLength[@type="narrow"]/unit[@type="(\w++)-(%A)"]/displayName ; /unitsNarrow/$1/$2/dnam
|
||||
//ldml/units/unitLength[@type="short"]/unit[@type="(\w++)-(%A)"]/displayName ; /unitsShort/$1/$2/dnam
|
||||
//ldml/units/unitLength[@type="long"]/unit[@type="(\w++)-(%A)"]/displayName ; /units/$1/$2/dnam
|
||||
|
||||
//ldml/units/unitLength[@type="narrow"]/unit[@type="(\w++)-(%A)"]/unitPattern[@count="(%A)"] ; /unitsNarrow/$1/$2/$3
|
||||
//ldml/units/unitLength[@type="short"]/unit[@type="(\w++)-(%A)"]/unitPattern[@count="(%A)"] ; /unitsShort/$1/$2/$3
|
||||
//ldml/units/unitLength[@type="long"]/unit[@type="(\w++)-(%A)"]/unitPattern[@count="(%A)"] ; /units/$1/$2/$3
|
||||
|
||||
//ldml/units/unitLength[@type="narrow"]/compoundUnit[@type="(%A)"]/compoundUnitPattern ; /unitsNarrow/compound/$1
|
||||
//ldml/units/unitLength[@type="short"]/compoundUnit[@type="(%A)"]/compoundUnitPattern ; /unitsShort/compound/$1
|
||||
//ldml/units/unitLength[@type="long"]/compoundUnit[@type="(%A)"]/compoundUnitPattern ; /units/compound/$1
|
||||
|
||||
//ldml/units/unitLength[@type="narrow"]/coordinateUnit/displayName ; /unitsNarrow/coordinate/dnam
|
||||
//ldml/units/unitLength[@type="short"]/coordinateUnit/displayName ; /unitsShort/coordinate/dnam
|
||||
//ldml/units/unitLength[@type="long"]/coordinateUnit/displayName ; /units/coordinate/dnam
|
||||
|
||||
//ldml/units/unitLength[@type="narrow"]/coordinateUnit/coordinateUnitPattern[@type="(%A)"] ; /unitsNarrow/coordinate/$1
|
||||
//ldml/units/unitLength[@type="short"]/coordinateUnit/coordinateUnitPattern[@type="(%A)"] ; /unitsShort/coordinate/$1
|
||||
//ldml/units/unitLength[@type="long"]/coordinateUnit/coordinateUnitPattern[@type="(%A)"] ; /units/coordinate/$1
|
||||
|
||||
//ldml/units/unitLength[@type="narrow"]/unit[@type="(\w++)-(%A)"]/perUnitPattern ; /unitsNarrow/$1/$2/per
|
||||
//ldml/units/unitLength[@type="short"]/unit[@type="(\w++)-(%A)"]/perUnitPattern ; /unitsShort/$1/$2/per
|
||||
//ldml/units/unitLength[@type="long"]/unit[@type="(\w++)-(%A)"]/perUnitPattern ; /units/$1/$2/per
|
||||
|
||||
//ldml/units/durationUnit[@type="(%A)"]/durationUnitPattern ; /durationUnits/$1
|
||||
|
||||
//ldml/units/unitLength[@type="narrow"]/unit[@type="(\w++)-(%A)"]/alias[@source="locale"][@path="../unit[@type='(\w++)-(%A)']"] ; /unitsNarrow/$1/$2:alias ; values=/LOCALE/unitsNarrow/$3/$4
|
||||
//ldml/units/unitLength[@type="short"]/unit[@type="(\w++)-(%A)"]/alias[@source="locale"][@path="../unit[@type='(\w++)-(%A)']"] ; /unitsShort/$1/$2:alias ; values=/LOCALE/unitsShort/$3/$4
|
||||
//ldml/units/unitLength[@type="long"]/unit[@type="(\w++)-(%A)"]/alias[@source="locale"][@path="../unit[@type='(\w++)-(%A)']"] ; /units/$1/$2:alias ; values=/LOCALE/units/$3/$4
|
||||
|
||||
//ldml/characterLabels/characterLabelPattern[@type="(%A)"][@count="(%A)"] ; /characterLabelPattern/$1/$2
|
||||
//ldml/characterLabels/characterLabelPattern[@type="(%A)"] ; /characterLabelPattern/$1
|
||||
//ldml/characterLabels/characterLabel[@type="(%A)"] ; /characterLabel/$1
|

386
tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_readme.txt
Normal file

@@ -0,0 +1,386 @@
# README for configuration files used by org.unicode.icu.tool.cldrtoicu.regex.RegexTransformer.
|
||||
#
|
||||
# © 2019 and later: Unicode, Inc. and others.
|
||||
#
|
||||
# CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
|
||||
# For terms of use, see http://www.unicode.org/copyright.html
|
||||
|
||||
======
|
||||
Basics
|
||||
======
|
||||
|
||||
The RegexTransformer class converts CLDR paths and values to ICU Resource Bundle paths
|
||||
and values, based on a set of transformation rules typically loaded from a text file
|
||||
(e.g. ldml2icu_locale.txt).
|
||||
|
||||
The basic format of transformation rules is:
|
||||
<path-specification> ; <resource-bundle-specification> [; <instruction>=<argument>]*
|
||||
|
||||
A simple example of a transformation rule is:
|
||||
|
||||
//ldml/localeDisplayNames/keys/key[@type="(%A)"] ; /Keys/$1
|
||||
|
||||
which transforms CLDR values whose path matches the path specification, and emits:
|
||||
* A resource bundle path "/Keys/xx", where 'xx' is the captured type attribute.
|
||||
* A resource bundle value, which is just the CLDR value's base value.
|
||||
|
||||
A path specification can be thought of as a regular expression which matches the CLDR
|
||||
path and can capture some element names or attribute values; however unlike a regular
|
||||
expression, the '[',']' characters are treated as literals, similar to XPath expressions.
|
||||
|
||||
If a single CLDR value should produce more than one resource bundle path/value, then
|
||||
it should be written:
|
||||
|
||||
<path-specification>
|
||||
; <resource-bundle-1-specification> [; <instruction> ]*
|
||||
; <resource-bundle-2-specification> [; <instruction> ]*
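
A real rule of this form (taken from ldml2icu_supplemental.txt) is:

    //supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"][@territories="(%W)"]
        ; /languageData/$1/primary/scripts     ; values=$2
        ; /languageData/$1/primary/territories ; values=$3

which emits two resource bundle paths (each with its own values) for a single CLDR path.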
|
||||
|
||||
=====================
|
||||
Argument Substitution
|
||||
=====================
|
||||
|
||||
Before a rule can be matched, any %-variables must be substituted. These are defined
|
||||
in the same configuration file as the rules, and look something like:
|
||||
%W=[\w\-]++
|
||||
or:
|
||||
%D=//ldml/numbers/defaultNumberingSystem
|
||||
|
||||
The first case can be thought of as just a snippet of regular expression (in this case
|
||||
something that matches hyphen separated words) and, importantly, here '[' and ']' are
|
||||
treated as regular expression metacharacters. These arguments are static and will be
|
||||
substituted exactly as-is into the regular expression to be used for matching.
|
||||
|
||||
The second case (used exactly once) is a dynamic argument which references a CLDR value
|
||||
in the set of data being transformed. This is simply indicated by the fact that it starts
|
||||
with '//'. This path is resolved and the value is substituted just prior to matching.
|
||||
|
||||
Variable names are limited to a single upper-case letter (A-Z).
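
For example, with %W defined as above, the path specification:

    //ldml/delimiters/(%W)

is expanded, prior to matching, into a regular expression equivalent to:

    //ldml/delimiters/([\w\-]++)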
|
||||
|
||||
===========================
|
||||
Implicit Argument Splitting
|
||||
===========================
|
||||
|
||||
This is a (somewhat non-obvious) mechanism which allows for a single rule to generate
|
||||
multiple results from a single input path when an argument is a list of tokens.
|
||||
|
||||
Consider the rule:
|
||||
|
||||
//supplementalData/timeData/hours[@allowed="(%W)"][@preferred="(%W)"][@regions="(%W)"]
|
||||
; /timeData/$3/allowed ; values=$1
|
||||
; /timeData/$3/preferred ; values=$2
|
||||
|
||||
where the "regions" attributes (which is captured as '$3') contains a whitespace separated
|
||||
list of region codes (e.g. "US GB AU NZ"). In this case the rule is applied once for each
|
||||
region, producing paths such as "/timeData/US/allowed" or "/timeData/NZ/preferred". Note
|
||||
that there is no explicit instruction to do this, it just happens.
|
||||
|
||||
The rule is that the first unquoted argument in the resource bundle path is always treated
|
||||
as splittable.
|
||||
|
||||
To suppress this behaviour, the argument must be quoted (e.g. /timeData/"$3"/allowed). Now,
|
||||
if there were another following unquoted argument, that would become implicitly splittable
|
||||
(but only one argument is ever splittable).
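
For example (all attribute values here are illustrative), the single CLDR path:

    //supplementalData/timeData/hours[@allowed="H"][@preferred="H"][@regions="AT BE"]

is applied once per region in the "regions" list, producing:

    /timeData/AT/allowed   = "H"
    /timeData/AT/preferred = "H"
    /timeData/BE/allowed   = "H"
    /timeData/BE/preferred = "H"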
|
||||
|
||||
============
|
||||
Instructions
|
||||
============
|
||||
|
||||
Additional instructions can be supplied to control value transformation and specify fallback
|
||||
values. The set of instructions is:
|
||||
* values: The most common instruction which defines how values are transformed.
|
||||
* fallback: Defines a fallback value to be used if this rule was not matched.
|
||||
|
||||
There are two other special case instructions which should (if at all possible) not be used,
|
||||
and might be removed at some point:
|
||||
* group: Causes values to be grouped as sub-arrays for very specific use cases
|
||||
(prefer using "Hidden Labels" where possible).
|
||||
* base_xpath: Allows deduplication of results between multiple different rules (this is a
|
||||
hack to work around limitations in how matching is performed).
|
||||
|
||||
-------------------
|
||||
values=<expression>
|
||||
-------------------
|
||||
|
||||
The "values" instruction defines an expression whose evaluated result becomes the output
|
||||
resource bundle value(s). Unless quoting is present, this evaluated expression is split
|
||||
on whitespace and can become multiple values in the resulting resource bundle.
|
||||
|
||||
Examples:
|
||||
|
||||
* values=$1 $2 $3
|
||||
|
||||
Produces three separate values in the resource bundle for the first three captured
|
||||
arguments.
|
||||
|
||||
* values="$1 $2" $3
|
||||
|
||||
Produces two values in the resource bundle, the first of which is two captured values
|
||||
separated by a space character.
|
||||
|
||||
* values={value}
|
||||
|
||||
Substitutes the CLDR value, but then performs whitespace splitting on the result. This
|
||||
differs from the behaviour when no "values" instruction is present (which does not
|
||||
split the results).
|
||||
|
||||
* values="{value}" $1
|
||||
|
||||
Produces two values, the first of which is the unsplit CLDR value, and the second is a
|
||||
captured argument.
|
||||
|
||||
* values=&func($1, {value})
|
||||
|
||||
Invokes a transformation function, passing in a captured argument and the CLDR value,
|
||||
and the result is then split. The set of functions available to a transformer is
|
||||
configured when it is created.
|
||||
|
||||
Note that in the above examples, it is assumed that the $N arguments do not contain spaces.
|
||||
If they did, it would result in more output values. To be strict about things, every value
|
||||
which should not be split must be quoted (e.g. values="$1" "$2" "$3") but since captured
|
||||
values are often IDs or other tokens, this is not what is seen in practice, so it is not
|
||||
reflected in these examples.
|
||||
|
||||
---------------------
|
||||
fallback=<expression>
|
||||
---------------------
|
||||
|
||||
The fallback instruction provides a way for default values to be emitted for a path that
|
||||
was not matched. Fallbacks are useful when several different rules produce values for the
|
||||
same resource bundle. In this case the output path produced by one rule can be used as
|
||||
the "key" for any unmatched rules with fallback values (to "fill in the gaps").
|
||||
|
||||
Consider the two rules which can emit the same resource bundle path:
|
||||
|
||||
//ldml/numbers/currencies/currency[@type="(%W)"]/symbol
|
||||
; /Currencies/$1 ; fallback=$1
|
||||
//ldml/numbers/currencies/currency[@type="(%W)"]/displayName
|
||||
; /Currencies/$1 ; fallback=$1
|
||||
|
||||
These rules, if both matched, will produce two values for the same resource bundle path.
|
||||
Consider the CLDR values:
|
||||
|
||||
//ldml/numbers/currencies/currency[@type="USD"]/symbol ==> "$"
|
||||
//ldml/numbers/currencies/currency[@type="USD"]/displayName ==> "US Dollar"
|
||||
|
||||
After matching both of these paths, the values for the resource bundle "/Currencies/USD"
|
||||
will be the array { "$", "US Dollar" }.
|
||||
|
||||
However, if only one value were present to be converted, the converter could use the
|
||||
matched path "/Currencies/XXX" and infer the missing fallback value, ensuring that the
|
||||
output array (if it was emitted at all) always contained two values.
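
As an illustrative sketch, if the CLDR data contained only a symbol for some currency
code "XYZ":

    //ldml/numbers/currencies/currency[@type="XYZ"]/symbol ==> "Xz$"

then the displayName rule would not match anything, so its fallback value ($1, i.e. "XYZ")
would be used instead, and the array for "/Currencies/XYZ" would still contain two values:
{ "Xz$", "XYZ" }.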
|
||||
|
||||
Note that in order for this to work, the fallback value must be derivable only from the
|
||||
matched path. E.g. it cannot contain arguments that are not also present in the matched
|
||||
path, and obviously cannot reference the "{value}" at all. Thus the following would not
|
||||
be permitted:
|
||||
|
||||
//ldml/foo/bar[@type="(%W)"][@region=(%A)] ; /Foo/$1 ; fallback=$2
|
||||
|
||||
However the fallback value can reference existing CLDR or resource bundle paths (expected
|
||||
to be present from other rules). For example:
|
||||
fallback=/weekData/001:intvector[0]
|
||||
or:
|
||||
fallback=//ldml/numbers/symbols[@numberSystem="%D"]/decimal
|
||||
|
||||
The latter case is especially complex because it also uses the "dynamic" argument:
|
||||
%D=//ldml/numbers/defaultNumberingSystem
|
||||
|
||||
So determining the resulting value will require:
|
||||
1) resolving "//ldml/numbers/defaultNumberingSystem" to, for example, "arab"
|
||||
2) looking up the value of "//ldml/numbers/symbols[@numberSystem="arab"]/decimal"
|
||||
|
||||
-----------------
|
||||
base_xpath=<path>
|
||||
-----------------
|
||||
|
||||
The base_xpath instruction allows a rule to specify a proxy path which is used in place of
|
||||
the originally matched path in the returned result. This is a useful hack for cases where
|
||||
values are derived from information in a path prefix.
|
||||
|
||||
Because path matching for transformation happens only on full paths, it is possible that
|
||||
several distinct CLDR paths might effectively generate the same result if they share the
|
||||
same prefix (i.e. paths in the same "sub hierarchy" of the CLDR data).
|
||||
|
||||
If this happens, then you end up generating "the same" result from different paths. To
|
||||
fix this, a "surrogate" CLDR path can be specified as a proxy for the source path,
|
||||
allowing several results to appear to have come from the same source, which results in
|
||||
deduplication of the final value.
|
||||
|
||||
For example, the two rules:
|
||||
|
||||
//supplementalData/territoryInfo/territory[...][@writingPercent="(%N)"][@populationPercent="(%N)"][@officialStatus="(%W)"](?:[@references="%W"])?
|
||||
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
|
||||
|
||||
//supplementalData/territoryInfo/territory[...][@writingPercent="(%N)"][@populationPercent="(%N)"](?:[@references="%W"])?
|
||||
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
|
||||
|
||||
Produce the same results for different paths (with or without the "officialStatus"
|
||||
attribute) but only one such result is desired. By specifying the same base_xpath on
|
||||
both rules, the conversion logic can deduplicate these to produce only one result.
|
||||
|
||||
When using base_xpath, it is worth noting that:
|
||||
1) Base xpaths must be valid "distinguishing" paths (but are never matched to any rule).
|
||||
2) Base xpaths can use arguments to achieve the necessary level of uniqueness.
|
||||
3) Rules which share the same base xpath must always produce the same values.
|
||||
|
||||
Note, however, that this is still very much a hack: because two rules are responsible
|
||||
for generating the same result, there is no well defined "line number" to use for ordering
|
||||
of values. Thus this mechanism should only be used for rules which produce "single"
|
||||
values, and must not be used in cases where the ordering of values in arrays is important.
|
||||
|
||||
This mechanism only exists because there is currently no mechanism for partial matching
|
||||
or a way to match one path against multiple rules.
|
||||
|
||||
-----
|
||||
group
|
||||
-----
|
||||
|
||||
The "group" instruction should be considered a "last resort" hack for controlling value
|
||||
grouping, in cases where "hidden labels" are not suitable (see below).
|
||||
|
||||
==============================
|
||||
Value Arrays and Hidden Labels
|
||||
==============================
|
||||
|
||||
In the simplest case, one rule produces one or more output path/values per matched CLDR
|
||||
value (i.e. one-to-one or one-to-many). If that happens, then output ordering of the
|
||||
resource bundle paths is just the natural resource bundle path ordering.
|
||||
|
||||
However it is also possible for several rules to produce values for a single output path
|
||||
(i.e. many-to-one). When this happens there are some important details about how results
|
||||
are grouped and ordered.
|
||||
|
||||
------------
|
||||
Value Arrays
|
||||
------------
|
||||
|
||||
If several rules produce results for the same resource bundle path, the values produced
|
||||
by the rules are always ordered according to the order of the rules in the configuration
file (and it is best practice to group any such rules together for clarity).
|
||||
|
||||
If each rule produces multiple values, then depending on grouping, those values can either
|
||||
be concatenated together in a single array or grouped individually to create an array
|
||||
of arrays.
|
||||
|
||||
In the example below, there are four rules producing values for the same path:
|
||||
|
||||
//.../firstDay[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1)
|
||||
//.../minDays[@count="(%N)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=$1
|
||||
//.../weekendStart[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1) 0
|
||||
//.../weekendEnd[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1) 86400000
|
||||
|
||||
The first two rules produce one value each, and the last two produce two values each. This
|
||||
results in the resource bundle "/weekData/xxx:intvector" having a single array consisting
|
||||
of six values. In the real configuration, these rules also use fallback instructions to
|
||||
ensure that the resulting array of values is always six values, even if some CLDR paths are
|
||||
not present.
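
Schematically, the resulting array for a matched territory "xx" is therefore:

    /weekData/xx:intvector { <first-day>, <min-days>, <weekend-start-day>, 0, <weekend-end-day>, 86400000 }

where the first two values come from the first two rules, and each of the last two rules
contributes a day number followed by a literal millisecond value.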
|
||||
|
||||
-------------
|
||||
Hidden Labels
|
||||
-------------
|
||||
|
||||
Sometimes rules should produce separate "sub-arrays" of values, rather than having all the
|
||||
values appended to a single array. Consider the following path/value pairs:
|
||||
|
||||
x/y: a
|
||||
x/y: b
|
||||
x/y: c
|
||||
|
||||
Which produce the resource bundle "x/y" with three values:
|
||||
|
||||
x{
|
||||
y{
|
||||
"a",
|
||||
"b",
|
||||
"c"
|
||||
}
|
||||
}
|
||||
|
||||
Now suppose we want to make a resource bundle where the values are grouped into their
|
||||
own sub-array:
|
||||
|
||||
x{
|
||||
y{
|
||||
{ "a", "b", "c" }
|
||||
}
|
||||
}
|
||||
|
||||
We can think of this as coming from the path/value pairs:
|
||||
|
||||
x/y/-: a
|
||||
x/y/-: b
|
||||
x/y/-: c
|
||||
|
||||
where to represent the sub-array we introduce the idea of an empty path element '-'.
|
||||
|
||||
In a transformation rule, these "empty elements" are represented as "hidden labels", and look
|
||||
like "<some-label>". They are treated as "normal" path elements for purposes of ordering and
|
||||
grouping, but are treated as empty when the paths are written to the ICU data files.
|
||||
|
||||
For example the rule:
|
||||
|
||||
//.../currencyCodes[@type="(%W)"][@numeric="(%N)"].* ; /codeMappingsCurrency/<$1> ; values=$1 $2
|
||||
|
||||
Generates a series of grouped, 2-element sub-arrays split by the captured type attribute.
|
||||
|
||||
codeMappingsCurrency{
|
||||
{ type-1, numeric-1 }
|
||||
{ type-2, numeric-2 }
|
||||
{ type-3, numeric-3 }
|
||||
}
|
||||
|
||||
<FIFO> is a special hidden label which is substituted with an incrementing count when
|
||||
sorting paths. It ensures that values in the same array are sorted in the order that they
|
||||
were encountered. However this mechanism imposes a strict requirement that the ordering
|
||||
of CLDR values to be transformed matches the expected ICU value order, so it should be
|
||||
avoided where possible because of this implicit, subtle dependency. Note that this mechanism
|
||||
is currently only enabled for the transformation of "supplemental data" and may eventually
|
||||
be removed.
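
As an illustrative sketch, the CurrencyMap rules in ldml2icu_supplemental.txt write each
historical currency entry for a region under "/CurrencyMap/$1/<FIFO>/...". Because <FIFO>
sorts by encounter order, the anonymous sub-bundles for one region (the region code "XX"
below is made up) retain the order in which the <currency> elements appeared in the CLDR
data:

    CurrencyMap{
        XX{
            { ... }    // id/from/to values from the first <currency> element for "XX"
            { ... }    // id/from/to values from the second <currency> element for "XX"
        }
    }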
|
||||
|
||||
Hidden labels are a neat solution which permits the generation of sub-array values, but they
|
||||
don't quite work in every case. For example if you need to produce a resource bundle with a
|
||||
mix of values and sub-arrays, like:
|
||||
|
||||
x{
|
||||
y{
|
||||
"a",
|
||||
{ "b", "c" }
|
||||
"d"
|
||||
}
|
||||
}
|
||||
|
||||
which can be thought of as coming from the path/value pairs:
|
||||
|
||||
x/y: a
|
||||
x/y/<z>: b
|
||||
x/y/<z>: c
|
||||
x/y: d
|
||||
|
||||
we find that, after sorting the resource bundle paths, we end up with:
|
||||
|
||||
x/y: a
|
||||
x/y: d
|
||||
x/y/<z>: b
|
||||
x/y/<z>: c
|
||||
|
||||
which produces the wrong result. This happens because values with different paths are
|
||||
sorted primarily by their path. In cases like this, where a mix of values and sub-arrays
|
||||
are required, the "group" instruction can be used instead.
|
||||
|
||||
For example:
|
||||
|
||||
//ldml/numbers/currencies/currency[@type="(%W)"]/symbol ; /Currencies/$1
|
||||
//ldml/numbers/currencies/currency[@type="(%W)"]/displayName ; /Currencies/$1
|
||||
//ldml/numbers/currencies/currency[@type="(%W)"]/pattern ; /Currencies/$1 ; group
|
||||
//ldml/numbers/currencies/currency[@type="(%W)"]/decimal ; /Currencies/$1 ; group
|
||||
//ldml/numbers/currencies/currency[@type="(%W)"]/group ; /Currencies/$1 ; group
|
||||
|
||||
Produces resource bundles which look like:
|
||||
|
||||
Currencies{
|
||||
xxx{
|
||||
"<symbol>",
|
||||
"<display name>",
|
||||
{ "<pattern>", "<decimal>", "<group>" }
|
||||
}
|
||||
}
|

202
tools/cldr/cldr-to-icu/src/main/resources/ldml2icu_supplemental.txt
Normal file

@@ -0,0 +1,202 @@
# ldml2icu_supplemental.txt
|
||||
#
|
||||
# © 2016 and later: Unicode, Inc. and others.
|
||||
#
|
||||
# CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
|
||||
# For terms of use, see http://www.unicode.org/copyright.html
|
||||
#
|
||||
# Used by SupplementalMapper.
|
||||
# Data-driven file for mapping supplemental LDML paths to ICU paths.
|
||||
# See ldml2icu_readme.txt for a detailed explanation of this file.
|
||||
|
||||
# Attribute value
|
||||
%A=[^"']++
|
||||
# Attribute value, no underscore
|
||||
%B=[^"'_]++
|
||||
# Word/Zone match
|
||||
%W=[\s\w\-/]++
|
||||
# Greedy word match
|
||||
%G=[\s\w\-]+
|
||||
# Number match
|
||||
%N=[\d\.]++
|
||||
|
||||
# supplementalData.xml
|
||||
//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@iso4217="(%W)"]
|
||||
; /CurrencyMap/$1/<FIFO>/id ; values=$2
|
||||
//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@iso4217="(%W)"][@tender="false"]
|
||||
; /CurrencyMap/$1/<FIFO>/id ; values=$2
|
||||
; /CurrencyMap/$1/<FIFO>/tender ; values=false
|
||||
//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@from="(%W)"][@iso4217="(%W)"]
|
||||
; /CurrencyMap/$1/<FIFO>/id ; values=$3
|
||||
; /CurrencyMap/$1/<FIFO>/from:intvector ; values=&date($2, from)
|
||||
//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@from="(%W)"][@iso4217="(%W)"][@tender="false"]
|
||||
; /CurrencyMap/$1/<FIFO>/id ; values=$3
|
||||
; /CurrencyMap/$1/<FIFO>/from:intvector ; values=&date($2, from)
|
||||
; /CurrencyMap/$1/<FIFO>/tender ; values=false
|
||||
//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@from="(%W)"][@to="(%W)"][@iso4217="(%W)"]
|
||||
; /CurrencyMap/$1/<FIFO>/id ; values=$4
|
||||
; /CurrencyMap/$1/<FIFO>/from:intvector ; values=&date($2, from)
|
||||
; /CurrencyMap/$1/<FIFO>/to:intvector ; values=&date($3, to)
|
||||
//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@from="(%W)"][@to="(%W)"][@iso4217="(%W)"][@tender="false"]
|
||||
; /CurrencyMap/$1/<FIFO>/id ; values=$4
|
||||
; /CurrencyMap/$1/<FIFO>/from:intvector ; values=&date($2, from)
|
||||
; /CurrencyMap/$1/<FIFO>/to:intvector ; values=&date($3, to)
|
||||
; /CurrencyMap/$1/<FIFO>/tender ; values=false
|
||||
//supplementalData/currencyData/region[@iso3166="(%W)"]/currency[@to="(%W)"][@iso4217="(%W)"][@tender="false"]
|
||||
; /CurrencyMap/$1/<FIFO>/id ; values=$3
|
||||
; /CurrencyMap/$1/<FIFO>/to:intvector ; values=&date($2, to)
|
||||
; /CurrencyMap/$1/<FIFO>/tender ; values=false
|
||||
//supplementalData/currencyData/fractions/info[@iso4217="(%W)"][@digits="(%N)"][@rounding="(%N)"][@cashDigits="(%N)"][@cashRounding="(%N)"] ; /CurrencyMeta/$1:intvector ; values=$2 $3 $4 $5
|
||||
//supplementalData/currencyData/fractions/info[@iso4217="(%W)"][@digits="(%N)"][@rounding="(%N)"][@cashRounding="(%N)"] ; /CurrencyMeta/$1:intvector ; values=$2 $3 $2 $4
|
||||
//supplementalData/currencyData/fractions/info[@iso4217="(%W)"][@digits="(%N)"][@rounding="(%N)"] ; /CurrencyMeta/$1:intvector ; values=$2 $3 $2 $3
|
||||
|
||||
//supplementalData/calendarPreferenceData/calendarPreference[@territories="(%A)"][@ordering="(%A)"] ; /calendarPreferenceData/$1 ; values=$2
|
||||
//supplementalData/codeMappings/territoryCodes[@type="(%W)"][@numeric="(%N)"][@alpha3="(%W)"].* ; /codeMappings/<$1> ; values=$1 $2 $3
|
||||
|
||||
//supplementalData/codeMappings/currencyCodes[@type="(%W)"][@numeric="(%N)"].* ; /codeMappingsCurrency/<$1> ; values=$1 $2
|
||||
|
||||
//supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"][@territories="(%W)"][@alt="secondary"]
|
||||
; /languageData/$1/secondary/scripts ; values=$2
|
||||
; /languageData/$1/secondary/territories ; values=$3
|
||||
//supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"][@alt="secondary"] ; /languageData/$1/secondary/scripts ; values=$2
|
||||
//supplementalData/languageData/language[@type="(%W)"][@territories="(%G)"][@alt="secondary"] ; /languageData/$1/secondary/territories ; values=$2
|
||||
|
||||
//supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"][@territories="(%W)"]
|
||||
; /languageData/$1/primary/scripts ; values=$2
|
||||
; /languageData/$1/primary/territories ; values=$3
|
||||
//supplementalData/languageData/language[@type="(%W)"][@scripts="(%W)"] ; /languageData/$1/primary/scripts ; values=$2
|
||||
//supplementalData/languageData/language[@type="(%W)"][@territories="(%W)"] ; /languageData/$1/primary/territories ; values=$2
|
||||
|
||||
//supplementalData/territoryContainment/group[@type="(%W)"][@contains="(%A)"][@status="deprecated"] ; /territoryContainment/deprecated/$1 ; values=$2
|
||||
//supplementalData/territoryContainment/group[@type="(%W)"][@contains="(%A)"][@status="grouping"] ; /territoryContainment/containedGroupings/$1 ; values=$2
|
||||
//supplementalData/territoryContainment/group[@type="(%W)"][@contains="(%A)"][@grouping="true"] ; /territoryContainment/grouping/$1 ; values=$2
|
||||
//supplementalData/territoryContainment/group[@type="(%W)"][@contains="(%A)"]; /territoryContainment/$1 ; values=$2
|
||||
|
||||
//supplementalData/subdivisionContainment/subgroup[@type="(%W)"][@contains="(%A)"]; /subdivisionContainment/$1 ; values=$2
|
||||
//supplementalData/subdivisionContainment/subgroup[@type="(%W)"][@subtype="(%W)"][@contains="(%A)"]; /subdivisionContainment/$1-$2 ; values=$3
|
||||
|
||||
//supplementalData/weekData/firstDay[@day="(%W)"][@territories="(%W)"](?:[@references="(?:%A)"])?[@alt="(%A)"] ; /weekData%$3/$2:intvector ; values=&day_number($1) ; fallback=/weekData/001:intvector[0]
|
||||
|
||||
//supplementalData/weekData/firstDay[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1) ; fallback=/weekData/001:intvector[0]
|
||||
//supplementalData/weekData/minDays[@count="(%N)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=$1 ; fallback=/weekData/001:intvector[1]
|
||||
//supplementalData/weekData/weekendStart[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1) 0 ; fallback=/weekData/001:intvector[2] /weekData/001:intvector[3]
|
||||
//supplementalData/weekData/weekendEnd[@day="(%W)"][@territories="(%W)"] ; /weekData/$2:intvector ; values=&day_number($1) 86400000 ; fallback=/weekData/001:intvector[4] /weekData/001:intvector[5]
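# Purely illustrative note (not part of the original data): the firstDay/minDays/weekendStart/
# weekendEnd rules above all contribute to a single 6-element intvector per territory, laid out
# (as the fallback indices show) as:
#   { firstDay, minDays, weekendStartDay, weekendStartMillis, weekendEndDay, weekendEndMillis }
# For example, assuming &day_number maps sun=1 .. sat=7, a region with firstDay=mon, minDays=4,
# weekendStart=sat and weekendEnd=sun would come out roughly as:
#   XX:intvector{ 2, 4, 7, 0, 1, 86400000 }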
|
||||
|
||||
//supplementalData/weekData/weekOfPreference[@locales="(%A)"][@ordering="(%A)"] ; /weekOfPreference/$1 ; values=$2
|
||||
|
||||
//supplementalData/timeData/hours[@allowed="(%W)"][@preferred="(%W)"][@regions="(%W)"]
|
||||
; /timeData/$3/allowed ; values=$1
|
||||
; /timeData/$3/preferred ; values=$2
|
||||
|
||||
//supplementalData/measurementData/measurementSystem[@type="metric"][@category="(%W)"][@territories="(%W)"] ; /measurementData/$2/MeasurementSystemCategory/$1:int ; values=0
|
||||
//supplementalData/measurementData/measurementSystem[@type="US"][@category="(%W)"][@territories="(%W)"] ; /measurementData/$2/MeasurementSystemCategory/$1:int ; values=1
|
||||
//supplementalData/measurementData/measurementSystem[@type="UK"][@category="(%W)"][@territories="(%W)"] ; /measurementData/$2/MeasurementSystemCategory/$1:int ; values=2
|
||||
|
||||
//supplementalData/measurementData/measurementSystem[@type="metric"][@territories="(%W)"] ; /measurementData/$1/MeasurementSystem:int ; values=0
|
||||
//supplementalData/measurementData/measurementSystem[@type="US"][@territories="(%W)"] ; /measurementData/$1/MeasurementSystem:int ; values=1
|
||||
//supplementalData/measurementData/measurementSystem[@type="UK"][@territories="(%W)"] ; /measurementData/$1/MeasurementSystem:int ; values=2
|
||||
//supplementalData/measurementData/paperSize[@type="A4"][@territories="(%W)"] ; /measurementData/$1/PaperSize:intvector ; values=297 210
|
||||
//supplementalData/measurementData/paperSize[@type="US-Letter"][@territories="(%W)"] ; /measurementData/$1/PaperSize:intvector ; values=279 216
|
||||
|
||||
//supplementalData/unitPreferenceData/unitPreferences[@category="(%W)"][@usage="(%A)"][@scope="small"]/unitPreference[@regions="(%A)"][@alt="informal"] ; /unitPreferenceData/$3/$1-$2-small-informal
|
||||
//supplementalData/unitPreferenceData/unitPreferences[@category="(%W)"][@usage="(%A)"][@scope="small"]/unitPreference[@regions="(%A)"] ; /unitPreferenceData/$3/$1-$2-small
|
||||
//supplementalData/unitPreferenceData/unitPreferences[@category="(%W)"][@usage="(%A)"]/unitPreference[@regions="(%A)"][@alt="informal"] ; /unitPreferenceData/$3/$1-$2-informal
|
||||
//supplementalData/unitPreferenceData/unitPreferences[@category="(%W)"][@usage="(%A)"]/unitPreference[@regions="(%A)"] ; /unitPreferenceData/$3/$1-$2
|
||||
|
||||
//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@writingPercent="(%N)"][@populationPercent="(%N)"][@officialStatus="(%W)"](?:[@references="%W"])?
|
||||
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
|
||||
; /territoryInfo/$1/$5/writingShareF:int ; values=&exp($6,-2)
|
||||
; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($7,-2)
|
||||
; /territoryInfo/$1/$5/officialStatus ; values=$8
|
||||
|
||||
//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@writingPercent="(%N)"][@populationPercent="(%N)"](?:[@references="%W"])?
|
||||
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
|
||||
; /territoryInfo/$1/$5/writingShareF:int ; values=&exp($6,-2)
|
||||
; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($7,-2)
|
||||
|
||||
//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@literacyPercent="(%N)"][@populationPercent="(%N)"][@officialStatus="(%W)"](?:[@references="%W"])?
|
||||
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
|
||||
; /territoryInfo/$1/$5/literacyShareF:int ; values=&exp($6,-2)
|
||||
; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($7,-2)
|
||||
; /territoryInfo/$1/$5/officialStatus ; values=$8
|
||||
|
||||
//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@literacyPercent="(%N)"][@populationPercent="(%N)"](?:[@references="%W"])?
|
||||
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
|
||||
; /territoryInfo/$1/$5/literacyShareF:int ; values=&exp($6,-2)
|
||||
; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($7,-2)
|
||||
|
||||
//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@populationPercent="(%N)"][@officialStatus="(%W)"](?:[@references="%W"])?
|
||||
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
|
||||
; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($6,-2)
|
||||
; /territoryInfo/$1/$5/officialStatus ; values=$7
|
||||
|
||||
//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]/languagePopulation[@type="(%W)"][@populationPercent="(%N)"](?:[@references="%W"])?
|
||||
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
|
||||
; /territoryInfo/$1/$5/populationShareF:int ; values=&exp($6,-2)
|
||||
|
||||
# This only exists right now for 'ZZ', which has no <languagePopulation> child elements.
|
||||
//supplementalData/territoryInfo/territory[@type="(%W)"][@gdp="(%N)"][@literacyPercent="(%N)"][@population="(%N)"]
|
||||
; /territoryInfo/$1/territoryF:intvector ; values=&exp($2) &exp($3,-2) &exp($4) ; base_xpath=//supplementalData/territoryInfo/territory[@type="$1"]
|
||||
|
||||
//supplementalData/calendarData/calendar[@type="(%W)"]/calendarSystem[@type="(%W)"] ; /calendarData/$1/system ; values=$2
|
||||
//supplementalData/calendarData/calendar[@type="(%W)"]/eras/era[@type="(%W)"][@(start|end)="(%A)"][@named="(%W)"]
|
||||
; /calendarData/$1/eras/$2/$3:intvector ; values=&ymd($4)
|
||||
; /calendarData/$1/eras/$2/named ; values=$5
|
||||
//supplementalData/calendarData/calendar[@type="(%W)"]/eras/era[@type="(%W)"][@(start|end)="(%A)"]
|
||||
; /calendarData/$1/eras/$2/$3:intvector ; values=&ymd($4)
|
||||
|
||||
# languageInfo.xml
|
||||
|
||||
//supplementalData/languageMatching/languageMatches[@type="(%B)_new"]/paradigmLocales[@locales="(%A)"] ; /languageMatchingInfo/$1/paradigmLocales ; values=$2
|
||||
//supplementalData/languageMatching/languageMatches[@type="(%B)_new"]/matchVariable[@id="\$(%A)"][@value="(%A)"] ; /languageMatchingInfo/$1/matchVariable/$2 ; values=$3
|
||||
|
||||
//supplementalData/languageMatching/languageMatches[@type="(%B)_new"]/languageMatch[@desired="(%A)"][@supported="(%A)"][@distance="(%N)"][@oneway="true"] ; /languageMatchingNew/$1/<FIFO> ; values=$2 $3 $4 1
|
||||
//supplementalData/languageMatching/languageMatches[@type="(%B)_new"]/languageMatch[@desired="(%A)"][@supported="(%A)"][@distance="(%N)"] ; /languageMatchingNew/$1/<FIFO> ; values=$2 $3 $4 0
|
||||
|
||||
//supplementalData/languageMatching/languageMatches[@type="(%B)"]/languageMatch[@desired="(%A)"][@supported="(%A)"][@percent="(%N)"][@oneway="true"] ; /languageMatching/$1/<FIFO> ; values=$2 $3 $4 1
|
||||
//supplementalData/languageMatching/languageMatches[@type="(%B)"]/languageMatch[@desired="(%A)"][@supported="(%A)"][@percent="(%N)"] ; /languageMatching/$1/<FIFO> ; values=$2 $3 $4 0
|
||||
|
||||
# likelySubtags.xml
|
||||
//supplementalData/likelySubtags/likelySubtag[@from="(%A)"][@to="(%A)"] ; /$1 ; values=$2
|
||||
|
||||
# metaZones.xml - metaZones.txt
|
||||
//supplementalData/metaZones/mapTimezones[@type="metazones"]/mapZone[@type="(%A)"][@other="(%W)"][@territory="(%W)"] ; /mapTimezones/$2/$3 ; values=$1
|
||||
//supplementalData/metaZones/metazoneInfo/timezone[@type="(%W)"]/usesMetazone[@mzone="(%W)"] ; /metazoneInfo/"$1"/<$2> ; values=$2
|
||||
//supplementalData/metaZones/metazoneInfo/timezone[@type="(%W)"]/usesMetazone[@from="(%A)"][@mzone="(%W)"] ; /metazoneInfo/"$1"/<$2> ; values=$3 "$2" "9999-12-31 23:59"
|
||||
//supplementalData/metaZones/metazoneInfo/timezone[@type="(%W)"]/usesMetazone[@from="(%A)"][@to="(%A)"][@mzone="(%W)"] ; /metazoneInfo/"$1"/<$2> ; values=$4 "$2" "$3"
|
||||
//supplementalData/metaZones/metazoneInfo/timezone[@type="(%W)"]/usesMetazone[@to="(%A)"][@mzone="(%W)"] ; /metazoneInfo/"$1"/<1970-01-01 00:00> ; values=$3 "1970-01-01 00:00" "$2"
|
||||
|
||||
//supplementalData/primaryZones/primaryZone[@iso3166="(%W)"] ; /primaryZones/$1 ; values={value}
|
||||
|
||||
# numberingSystems.txt
|
||||
//supplementalData/numberingSystems/numberingSystem[@type="algorithmic"][@id="(%W)"][@rules="(%A)"]
|
||||
; /numberingSystems/$1/algorithmic:int ; values=1
|
||||
; /numberingSystems/$1/desc ; values=&algorithm($2)
|
||||
; /numberingSystems/$1/radix:int ; values=10
|
||||
|
||||
//supplementalData/numberingSystems/numberingSystem[@type="numeric"][@id="(%W)"][@digits="(%A)"]
|
||||
; /numberingSystems/$1/algorithmic:int ; values=0
|
||||
; /numberingSystems/$1/desc ; values=$2
|
||||
; /numberingSystems/$1/radix:int ; values=10
|
||||
|
||||
# windowsZones.txt
|
||||
//supplementalData/windowsZones/mapTimezones/mapZone[@type="(%A)"][@other="(%A)"][@territory="(%W)"] ; /mapTimezones/"$2"/$3 ; values="$1"
|
||||
|
||||
# genderList.txt
|
||||
//supplementalData/gender/personList[@type="(%W)"][@locales="(%W)"] ; /genderList/$2 ; values=$1
|
||||
|
||||
# locale info
|
||||
//supplementalData/parentLocales/parentLocale[@parent="(%A)"][@locales="(%A)"] ; /parentLocales/$1 ; values=$2
|
||||
|
||||
# supplementalMetadata.xml (metadata.txt)
|
||||
//supplementalData/metadata/defaultContent[@locales="(%A)"] ; /defaultContent ; values=$1
|
||||
//supplementalData/metadata/alias/(language|script|territory|subdivision|variant)Alias[@type="(%A)"][@replacement="(%A)"][@reason="(%A)"]
|
||||
; /alias/$1/$2/reason ; values="$4"
|
||||
; /alias/$1/$2/replacement ; values="$3"
|
||||
|
||||
# Region codes used by ICU's Region class
|
||||
# Specify the value explicitly so that the LDMLConverter will split it.
|
||||
//supplementalData/metadata/validity/variable[@type="choice"][@id="\$territory"] ; /regionCodes ; values={value}
|
||||
|
||||
# validity
|
||||
//supplementalData/idValidity/id[@type="(%A)"][@idStatus="(%A)"] ; /idValidity/$1/$2 ; values={value}
|
|
@@ -0,0 +1,127 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu;
|
||||
|
||||
import static com.google.common.truth.Truth.assertThat;
|
||||
import static com.google.common.truth.Truth8.assertThat;
|
||||
import static org.junit.Assert.fail;
|
||||
import static org.unicode.cldr.api.CldrPath.parseDistinguishingPath;
|
||||
import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.junit.runners.JUnit4;
|
||||
import org.unicode.cldr.api.CldrPath;
|
||||
|
||||
@RunWith(JUnit4.class)
|
||||
public class PathMatcherTest {
|
||||
@Test
|
||||
public void testMatcher() {
|
||||
CldrPath calEra = parseDistinguishingPath(
|
||||
"//ldml/dates/calendars/calendar[@type=\"buddhist\"]/eras/eraAbbr/era[@type=\"0\"]");
|
||||
CldrPath chineseMon1 = monthInfo("chinese", "format", "abbreviated", 1);
|
||||
CldrPath chineseMon2 = monthInfo("chinese", "format", "abbreviated", 2);
|
||||
CldrPath genericMon1 = monthInfo("generic", "stand-alone", "narrow", 1);
|
||||
CldrPath genericMon2 = monthInfo("generic", "stand-alone", "narrow", 2);
|
||||
List<CldrPath> calPaths =
|
||||
Arrays.asList(calEra, chineseMon1, chineseMon2, genericMon1, genericMon2);
|
||||
|
||||
PathMatcher anyCalendarPaths = PathMatcher.of("ldml/dates/calendars/calendar");
|
||||
assertThat(calPaths.stream().allMatch(anyCalendarPaths::matchesPrefixOf)).isTrue();
|
||||
assertThat(calPaths.stream().noneMatch(anyCalendarPaths::matches)).isTrue();
|
||||
assertThat(calPaths.stream().noneMatch(anyCalendarPaths::matchesSuffixOf)).isTrue();
|
||||
|
||||
PathMatcher chineseCalendars =
|
||||
PathMatcher.of("ldml/dates/calendars/calendar[@type=\"chinese\"]");
|
||||
assertThat(calPaths.stream().filter(chineseCalendars::matchesPrefixOf))
|
||||
.containsExactly(chineseMon1, chineseMon2);
|
||||
|
||||
PathMatcher anyMonth = PathMatcher.of("monthWidth[@type=*]/month[@type=*]");
|
||||
assertThat(calPaths.stream().filter(anyMonth::matchesSuffixOf))
|
||||
.containsExactly(chineseMon1, chineseMon2, genericMon1, genericMon2);
|
||||
|
||||
PathMatcher narrowMonth = PathMatcher.of("monthWidth[@type=\"narrow\"]/month[@type=*]");
|
||||
assertThat(calPaths.stream().filter(narrowMonth::matchesSuffixOf))
|
||||
.containsExactly(genericMon1, genericMon2);
|
||||
assertThat(calPaths.stream().filter(narrowMonth::matches)).isEmpty();
|
||||
|
||||
PathMatcher firstMonth = PathMatcher.of("month[@type=\"1\"]");
|
||||
assertThat(calPaths.stream().filter(firstMonth::matchesSuffixOf))
|
||||
.containsExactly(chineseMon1, genericMon1);
|
||||
|
||||
PathMatcher fullMatch = PathMatcher.of("ldml/dates"
|
||||
+ "/calendars/calendar[@type=\"generic\"]"
|
||||
+ "/months/monthContext[@type=\"stand-alone\"]"
|
||||
+ "/monthWidth[@type=\"narrow\"]"
|
||||
+ "/month[@type=\"2\"]");
|
||||
assertThat(calPaths.stream().filter(fullMatch::matches)).containsExactly(genericMon2);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWildcardSegment() {
|
||||
PathMatcher wildcard = PathMatcher.of("ldml/dates"
|
||||
+ "/calendars/calendar[@type=\"generic\"]"
|
||||
+ "/*/*[@type=\"format\"]/*[@type=\"narrow\"]/*[@type=*]");
|
||||
|
||||
assertThat(wildcard.matches(monthInfo("generic", "format", "narrow", 1))).isTrue();
|
||||
assertThat(wildcard.matches(monthInfo("generic", "format", "narrow", 9))).isTrue();
|
||||
assertThat(wildcard.matches(dayInfo("generic", "format", "narrow", "sun"))).isTrue();
|
||||
|
||||
assertThat(wildcard.matches(monthInfo("chinese", "format", "narrow", 1))).isFalse();
|
||||
assertThat(wildcard.matches(monthInfo("generic", "stand-alone", "narrow", 1))).isFalse();
|
||||
assertThat(wildcard.matches(dayInfo("generic", "format", "wide", "mon"))).isFalse();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAnyOf() {
|
||||
PathMatcher monthMatch = PathMatcher.of("monthWidth[@type=\"narrow\"]/month[@type=*]");
|
||||
PathMatcher dayMatch = PathMatcher.of("dayWidth[@type=\"narrow\"]/day[@type=*]");
|
||||
PathMatcher combined = PathMatcher.anyOf(monthMatch, dayMatch);
|
||||
|
||||
assertThat(combined.matchesSuffixOf(monthInfo("generic", "format", "narrow", 1))).isTrue();
|
||||
assertThat(combined.matchesSuffixOf(dayInfo("generic", "format", "narrow", "sun"))).isTrue();
|
||||
|
||||
assertThat(combined.matchesSuffixOf(monthInfo("generic", "format", "wide", 1))).isFalse();
|
||||
assertThat(combined.matchesSuffixOf(dayInfo("generic", "format", "wide", "mon"))).isFalse();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBadSpecifiers() {
|
||||
assertInvalidPathSpecification("");
|
||||
// Leading and trailing '/' are not permitted (they imply empty segments).
|
||||
assertInvalidPathSpecification("/foo/");
|
||||
assertInvalidPathSpecification("foo//bar");
|
||||
assertInvalidPathSpecification("foo/bad segment name");
|
||||
assertInvalidPathSpecification("foo/bar[type=*]");
|
||||
assertInvalidPathSpecification("foo/bar[@type=**]");
|
||||
assertInvalidPathSpecification("foo/bar[@type='double-quotes-only']");
|
||||
}
|
||||
|
||||
private void assertInvalidPathSpecification(String spec) {
|
||||
IllegalArgumentException e =
|
||||
assertThrows(IllegalArgumentException.class, () -> PathMatcher.of(spec));
|
||||
assertThat(e).hasMessageThat().startsWith("invalid path specification");
|
||||
assertThat(e).hasMessageThat().contains(spec);
|
||||
}
|
||||
|
||||
private static CldrPath monthInfo(String type, String context, String width, int number) {
|
||||
return CldrPath.parseDistinguishingPath(String.format(
|
||||
"//ldml/dates/calendars/calendar[@type=\"%s\"]"
|
||||
+ "/months/monthContext[@type=\"%s\"]"
|
||||
+ "/monthWidth[@type=\"%s\"]"
|
||||
+ "/month[@type=\"%d\"]",
|
||||
type, context, width, number));
|
||||
}
|
||||
|
||||
private static CldrPath dayInfo(String type, String context, String width, String id) {
|
||||
return CldrPath.parseDistinguishingPath(String.format(
|
||||
"//ldml/dates/calendars/calendar[@type=\"%s\"]"
|
||||
+ "/days/dayContext[@type=\"%s\"]"
|
||||
+ "/dayWidth[@type=\"%s\"]"
|
||||
+ "/day[@type=\"%s\"]",
|
||||
type, context, width, id));
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,44 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu;
|
||||
|
||||
import static org.unicode.icu.tool.cldrtoicu.testing.RbPathSubjectFactory.assertThat;
|
||||
import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows;
|
||||
import static com.google.common.truth.Truth.assertThat;
|
||||
import static com.google.common.truth.Truth8.assertThat;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.junit.runners.JUnit4;
|
||||
|
||||
@RunWith(JUnit4.class)
|
||||
public class RbPathTest {
|
||||
@Test
|
||||
public void testEmpty() {
|
||||
assertThat(RbPath.empty()).hasSegments();
|
||||
assertThat(RbPath.empty()).hasLength(0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParseVsOf() {
|
||||
assertThat(RbPath.of("foo", "bar")).hasSegments("foo", "bar");
|
||||
assertThat(RbPath.of("foo/bar")).hasSegments("foo/bar");
|
||||
assertThat(RbPath.parse("foo/bar")).hasSegments("foo", "bar");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBadArgs() {
|
||||
assertBadPath("", "empty path string");
|
||||
assertBadPath("foo//bar", "empty path segment");
|
||||
assertBadPath("foo/<bar/baz", "mismatched quoting");
|
||||
assertBadPath("foo/\"bar", "mismatched quoting");
|
||||
assertBadPath("foo/\"bar\"baz\"", "invalid character");
|
||||
assertBadPath("foo/bar baz", "invalid character");
|
||||
}
|
||||
|
||||
private static void assertBadPath(String path, String errorSnippet) {
|
||||
IllegalArgumentException e =
|
||||
assertThrows(IllegalArgumentException.class, () -> RbPath.parse(path));
|
||||
assertThat(e).hasMessageThat().contains(errorSnippet);
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,357 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu;
|
||||
|
||||
import static com.google.common.truth.Truth.assertThat;
|
||||
import static com.google.common.truth.Truth.assertWithMessage;
|
||||
import static com.google.common.truth.Truth8.assertThat;
|
||||
import static java.util.Arrays.asList;
|
||||
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
|
||||
import static org.unicode.cldr.api.CldrValue.parseValue;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.junit.runners.JUnit4;
|
||||
import org.unicode.cldr.api.CldrDataSupplier;
|
||||
import org.unicode.cldr.api.CldrValue;
|
||||
import org.unicode.cldr.tool.LikelySubtags;
|
||||
import org.unicode.cldr.util.LanguageTagCanonicalizer;
|
||||
import org.unicode.cldr.util.LocaleIDParser;
|
||||
import org.unicode.cldr.util.SupplementalDataInfo;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.collect.ImmutableSet;
|
||||
|
||||
/**
|
||||
* Unit tests for the supplemental data API. These tests either use fake data for unit testing, or
|
||||
* compare behaviour between this API and the equivalent CLDR utility tool for regression testing.
|
||||
*/
|
||||
@RunWith(JUnit4.class)
|
||||
public class SupplementalDataTest {
|
||||
private static SupplementalData regressionData;
|
||||
private static LikelySubtags likelySubtags;
|
||||
|
||||
@BeforeClass
|
||||
public static void loadRegressionData() {
|
||||
Path cldrRoot = Paths.get(System.getProperty("CLDR_DIR"));
|
||||
regressionData = SupplementalData
|
||||
.create(CldrDataSupplier.forCldrFilesIn(cldrRoot).getDataForType(SUPPLEMENTAL));
|
||||
SupplementalDataInfo sdi =
|
||||
SupplementalDataInfo.getInstance(cldrRoot.resolve("common/supplemental").toString());
|
||||
likelySubtags = new LikelySubtags(sdi);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetParent_explicit() {
|
||||
// Locales with an explicit (non-truncation) parent (a.k.a. "English is weird").
|
||||
SupplementalData fakeData = fakeSupplementalData(parentLocales("en_001", "en_AU", "en_GB"));
|
||||
|
||||
assertThat(fakeData.getExplicitParentLocaleOf("en_GB")).hasValue("en_001");
|
||||
assertThat(fakeData.getExplicitParentLocaleOf("en_AU")).hasValue("en_001");
|
||||
assertThat(fakeData.getExplicitParentLocaleOf("en_US")).isEmpty();
|
||||
assertThat(fakeData.getExplicitParentLocaleOf("en")).isEmpty();
|
||||
|
||||
assertThat(fakeData.getParent("en_GB")).isEqualTo("en_001");
|
||||
assertThat(fakeData.getParent("en_AU")).isEqualTo("en_001");
|
||||
assertThat(fakeData.getParent("en_001")).isEqualTo("en");
|
||||
assertThat(fakeData.getParent("en_US")).isEqualTo("en");
|
||||
assertThat(fakeData.getParent("en")).isEqualTo("root");
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetParent_likelyScript() {
|
||||
// To figure out default scripts we use likely subtags.
|
||||
SupplementalData fakeData = fakeSupplementalData(likelySubtag("zh", "zh_Hans_CN"));
|
||||
|
||||
// When removing a non-default script, the parent becomes "root".
|
||||
assertThat(fakeData.getParent("zh_Hant")).isEqualTo("root");
|
||||
// "Hans" is recognized as the default script, so the parent is obtained via truncation.
|
||||
assertThat(fakeData.getParent("zh_Hans")).isEqualTo("zh");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMaximize() {
|
||||
SupplementalData fakeData = fakeSupplementalData(
|
||||
likelySubtag("en", "en_Latn_US"),
|
||||
likelySubtag("pt", "pt_Latn_BR"),
|
||||
likelySubtag("und", "en_Latn_US"));
|
||||
|
||||
// You cannot maximize "root".
|
||||
assertThat(fakeData.maximize("root")).isEmpty();
|
||||
// Existing subtags preserved.
|
||||
assertThat(fakeData.maximize("en")).hasValue("en_Latn_US");
|
||||
assertThat(fakeData.maximize("en_GB")).hasValue("en_Latn_GB");
|
||||
assertThat(fakeData.maximize("en_VARIANT")).hasValue("en_Latn_US_VARIANT");
|
||||
// Some other similar examples.
|
||||
assertThat(fakeData.maximize("pt")).hasValue("pt_Latn_BR");
|
||||
assertThat(fakeData.maximize("pt_PT")).hasValue("pt_Latn_PT");
|
||||
assertThat(fakeData.maximize("und")).hasValue("en_Latn_US");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReplaceDeprecatedTags_iAmRoot() {
|
||||
SupplementalData fakeData = fakeSupplementalData();
|
||||
assertThat(fakeData.replaceDeprecatedTags("root")).isEqualTo("root");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReplaceDeprecatedTags_sameSubtags() {
|
||||
SupplementalData fakeData = fakeSupplementalData(likelySubtag("en", "en_Latn_US"));
|
||||
|
||||
// Replacement does not minimize or maximize results (even though "Latn" is likely).
|
||||
assertThat(fakeData.replaceDeprecatedTags("en_Latn_GB")).isEqualTo("en_Latn_GB");
|
||||
assertThat(fakeData.replaceDeprecatedTags("en_GB")).isEqualTo("en_GB");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReplaceDeprecatedTags_subtagReplacement() {
|
||||
SupplementalData fakeData = fakeSupplementalData(
|
||||
languageAlias("cym", "cy"),
|
||||
scriptAlias("Qaai", "Zinh"),
|
||||
territoryAlias("YU", "RS"));
|
||||
|
||||
// Region is deprecated
|
||||
assertThat(fakeData.replaceDeprecatedTags("en_YU")).isEqualTo("en_RS");
|
||||
// Script is deprecated
|
||||
assertThat(fakeData.replaceDeprecatedTags("ar_Qaai_IR")).isEqualTo("ar_Zinh_IR");
|
||||
// Language is deprecated
|
||||
assertThat(fakeData.replaceDeprecatedTags("cym_GB")).isEqualTo("cy_GB");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReplaceDeprecatedTags_complex() {
|
||||
SupplementalData fakeData = fakeSupplementalData(
|
||||
languageAlias("sh", "sr_Latn"),
|
||||
languageAlias("zh_TW", "zh_Hant_TW"),
|
||||
languageAlias("tzm_Latn_MA", "tzm_MA"),
|
||||
territoryAlias("YU", "RS"),
|
||||
likelySubtag("sr", "sr_Cyrl_RS"),
|
||||
likelySubtag("zh_Hant", "zh_Hant_TW"));
|
||||
|
||||
// "sh" -> "sr_Latn", taking precedence over the fact that "sr" maximizes to "sr_Cyrl_RS".
|
||||
assertThat(fakeData.replaceDeprecatedTags("sh_YU")).isEqualTo("sr_Latn_RS");
|
||||
// Alias lookup can, however, add tags depending on the situation.
|
||||
assertThat(fakeData.replaceDeprecatedTags("zh_TW")).isEqualTo("zh_Hant_TW");
|
||||
// But it will NOT remove tags (even though the languageAlias table contains an entry from
|
||||
// "tzm_Latn_MA" to "tzm_MA").
|
||||
assertThat(fakeData.replaceDeprecatedTags("tzm_Latn_MA")).isEqualTo("tzm_Latn_MA");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetDefaultCalendar() {
|
||||
SupplementalData fakeData = fakeSupplementalData(
|
||||
defaultCalendar("gregorian", "001"),
|
||||
defaultCalendar("persian", "AF"),
|
||||
likelySubtag("uz", "uz_Latn_UZ"),
|
||||
likelySubtag("uz_AF", "uz_Arab_AF"),
|
||||
likelySubtag("uz_Arab", "uz_Arab_AF"));
|
||||
assertThat(fakeData.getDefaultCalendar("root")).hasValue("gregorian");
|
||||
// Empty because "gregorian" is the default found in the parent locale.
|
||||
assertThat(fakeData.getDefaultCalendar("en_US")).isEmpty();
|
||||
assertThat(fakeData.getDefaultCalendar("uz")).isEmpty();
|
||||
assertThat(fakeData.getDefaultCalendar("uz_AF")).hasValue("persian");
|
||||
assertThat(fakeData.getDefaultCalendar("uz_Arab")).hasValue("persian");
|
||||
// Empty because "uz_Arab" defines the persian calendar.
|
||||
assertThat(fakeData.getDefaultCalendar("uz_Arab_AF")).isEmpty();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetDefaultCalendar_secretHacks() {
|
||||
SupplementalData fakeData = fakeSupplementalData(
|
||||
defaultCalendar("gregorian", "001"),
|
||||
likelySubtag("ja", "ja_Jpan_JP"),
|
||||
likelySubtag("th", "th_Thai_TH"));
|
||||
// Empty because "gregorian" is the default found in the parent locale.
|
||||
assertThat(fakeData.getDefaultCalendar("ja_US")).isEmpty();
|
||||
assertThat(fakeData.getDefaultCalendar("ja")).isEmpty();
|
||||
|
||||
// Traditional calendars for a region cannot be represented via the territory-only based
|
||||
// CLDR data calendar mapping, so they exist as hard coded "hacks" in SupplementalData.
|
||||
// They could be pulled out into the configuration API, but they should ideally just be
|
||||
// derived from CLDR data directly.
|
||||
assertThat(fakeData.getDefaultCalendar("ja_JP_TRADITIONAL")).hasValue("japanese");
|
||||
assertThat(fakeData.getDefaultCalendar("ja_TRADITIONAL")).hasValue("japanese");
|
||||
assertThat(fakeData.getDefaultCalendar("th_TH_TRADITIONAL")).hasValue("buddhist");
|
||||
assertThat(fakeData.getDefaultCalendar("th_TRADITIONAL")).hasValue("buddhist");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetParent_regression() {
|
||||
for (String id : TEST_LOCALE_IDS) {
|
||||
assertWithMessage("id=%s", id)
|
||||
.that(getIdChain(id, regressionData::getParent))
|
||||
.isEqualTo(getIdChain(id, LocaleIDParser::getParent));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMaximize_regression() {
|
||||
for (String id : TEST_LOCALE_IDS) {
|
||||
assertWithMessage("id=%s", id)
|
||||
.that(regressionData.maximize(id).orElse(null))
|
||||
.isEqualTo(likelySubtags.maximize(id));
|
||||
}
|
||||
|
||||
// ars is currently a special case since it's in the ICU data as an alias, but not in the CLDR
|
||||
// data at all. Thus, while it's a structurally valid language code, it cannot be maximized.
|
||||
assertThat(regressionData.maximize("ars")).isEmpty();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReplaceDeprecatedTags_regression() {
|
||||
LanguageTagCanonicalizer ltc = new LanguageTagCanonicalizer();
|
||||
for (String id : TEST_LOCALE_IDS) {
|
||||
// Work around:
|
||||
// https://unicode-org.atlassian.net/projects/CLDR/issues/CLDR-13194
|
||||
try {
|
||||
ltc.transform(id);
|
||||
} catch (NullPointerException e) {
|
||||
System.out.println("--> " + id);
|
||||
continue;
|
||||
}
|
||||
// Need to maximize to work around:
|
||||
// https://unicode-org.atlassian.net/projects/CLDR/issues/CLDR-13196
|
||||
assertWithMessage("id=%s", id)
|
||||
.that(regressionData.maximize(regressionData.replaceDeprecatedTags(id)).orElse(null))
|
||||
.isEqualTo(likelySubtags.maximize(ltc.transform(id)));
|
||||
}
|
||||
}
|
||||
|
||||
private static Iterable<String> getIdChain(String id, Function<String, String> fn) {
|
||||
List<String> chain = new ArrayList<>();
|
||||
while (!id.equals("root")) {
|
||||
chain.add(id);
|
||||
id = fn.apply(id);
|
||||
}
|
||||
chain.add(id);
|
||||
return chain;
|
||||
}
|
||||
|
||||
private static final ImmutableSet<String> TEST_LOCALE_IDS = ImmutableSet.of(
|
||||
"af", "af_NA", "af_ZA", "agq", "agq_CM", "ak", "ak_GH", "am", "am_ET", "ar", "ar_001",
|
||||
"ar_AE", "ar_BH", "ar_DJ", "ar_DZ", "ar_EG", "ar_EH", "ar_ER", "ar_IL", "ar_IQ", "ar_JO",
|
||||
"ar_KM", "ar_KW", "ar_LB", "ar_LY", "ar_MA", "ar_MR", "ar_OM", "ar_PS", "ar_QA", "ar_SA",
|
||||
"ar_SD", "ar_SO", "ar_SS", "ar_SY", "ar_TD", "ar_TN", "ar_YE", "ars", "as", "as_IN",
|
||||
"asa", "asa_TZ", "ast", "ast_ES", "az", "az_AZ", "az_Cyrl", "az_Cyrl_AZ", "az_Latn",
|
||||
"az_Latn_AZ", "bas", "bas_CM", "be", "be_BY", "bem", "bem_ZM", "bez", "bez_TZ", "bg",
|
||||
"bg_BG", "bm", "bm_ML", "bn", "bn_BD", "bn_IN", "bo", "bo_CN", "bo_IN", "br", "br_FR",
|
||||
"brx", "brx_IN", "bs", "bs_Cyrl", "bs_Cyrl_BA", "bs_Latn", "bs_Latn_BA", "bs_BA", "ca",
|
||||
"ca_AD", "ca_ES", "ca_FR", "ca_IT", "ccp", "ccp_BD", "ccp_IN", "ce", "ce_RU", "ceb",
|
||||
"ceb_PH", "cgg", "cgg_UG", "chr", "chr_US", "ckb", "ckb_IQ", "ckb_IR", "cs", "cs_CZ", "cy",
|
||||
"cy_GB", "da", "da_DK", "da_GL", "dav", "dav_KE", "de", "de_AT", "de_BE", "de_CH", "de_DE",
|
||||
"de_IT", "de_LI", "de_LU", "dje", "dje_NE", "dsb", "dsb_DE", "dua", "dua_CM", "dyo",
|
||||
"dyo_SN", "dz", "dz_BT", "ebu", "ebu_KE", "ee", "ee_GH", "ee_TG", "el", "el_CY", "el_GR",
|
||||
"en", "en_001", "en_150", "en_AE", "en_AG", "en_AI", "en_AS", "en_AT", "en_AU", "en_BB",
|
||||
"en_BE", "en_BI", "en_BM", "en_BS", "en_BW", "en_BZ", "en_CA", "en_CC", "en_CH", "en_CK",
|
||||
"en_CM", "en_CX", "en_CY", "en_DE", "en_DG", "en_DK", "en_DM", "en_ER", "en_FI", "en_FJ",
|
||||
"en_FK", "en_FM", "en_GB", "en_GD", "en_GG", "en_GH", "en_GI", "en_GM", "en_GU", "en_GY",
|
||||
"en_HK", "en_IE", "en_IL", "en_IM", "en_IN", "en_IO", "en_JE", "en_JM", "en_KE", "en_KI",
|
||||
"en_KN", "en_KY", "en_LC", "en_LR", "en_LS", "en_MG", "en_MH", "en_MO", "en_MP", "en_MS",
|
||||
"en_MT", "en_MU", "en_MW", "en_MY", "en_NA", "en_NF", "en_NG", "en_NL", "en_NR", "en_NU",
|
||||
"en_NZ", "en_PG", "en_PH", "en_PK", "en_PN", "en_PR", "en_PW", "en_RH", "en_RW", "en_SB",
|
||||
"en_SC", "en_SD", "en_SE", "en_SG", "en_SH", "en_SI", "en_SL", "en_SS", "en_SX", "en_SZ",
|
||||
"en_TC", "en_TK", "en_TO", "en_TT", "en_TV", "en_TZ", "en_UG", "en_UM", "en_US",
|
||||
"en_US_POSIX", "en_VC", "en_VG", "en_VI", "en_VU", "en_WS", "en_ZA", "en_ZM", "en_ZW", "eo",
|
||||
"eo_001", "es", "es_003", "es_419", "es_AR", "es_BO", "es_BR", "es_BZ", "es_CL", "es_CO",
|
||||
"es_CR", "es_CU", "es_DO", "es_EA", "es_EC", "es_ES", "es_GQ", "es_GT", "es_HN", "es_IC",
|
||||
"es_MX", "es_NI", "es_PA", "es_PE", "es_PH", "es_PR", "es_PY", "es_SV", "es_US", "es_UY",
|
||||
"es_VE", "et", "et_EE", "eu", "eu_ES", "ewo", "ewo_CM", "fa", "fa_AF", "fa_IR", "ff",
|
||||
"ff_CM", "ff_GN", "ff_Latn", "ff_Latn_BF", "ff_Latn_CM", "ff_Latn_GH", "ff_Latn_GM",
|
||||
"ff_Latn_GN", "ff_Latn_GW", "ff_Latn_LR", "ff_Latn_MR", "ff_Latn_NE", "ff_Latn_NG",
|
||||
"ff_Latn_SL", "ff_Latn_SN", "ff_MR", "ff_SN", "fi", "fi_FI", "fil", "fil_PH", "fo", "fo_DK",
|
||||
"fo_FO", "fr", "fr_BE", "fr_BF", "fr_BI", "fr_BJ", "fr_BL", "fr_CA", "fr_CD", "fr_CF",
|
||||
"fr_CG", "fr_CH", "fr_CI", "fr_CM", "fr_DJ", "fr_DZ", "fr_FR", "fr_GA", "fr_GF", "fr_GN",
|
||||
"fr_GP", "fr_GQ", "fr_HT", "fr_KM", "fr_LU", "fr_MA", "fr_MC", "fr_MF", "fr_MG", "fr_ML",
|
||||
"fr_MQ", "fr_MR", "fr_MU", "fr_NC", "fr_NE", "fr_PF", "fr_PM", "fr_RE", "fr_RW", "fr_SC",
|
||||
"fr_SN", "fr_SY", "fr_TD", "fr_TG", "fr_TN", "fr_VU", "fr_WF", "fr_YT", "fur", "fur_IT",
|
||||
"fy", "fy_NL", "ga", "ga_IE", "gd", "gd_GB", "gl", "gl_ES", "gsw", "gsw_CH", "gsw_FR",
|
||||
"gsw_LI", "gu", "gu_IN", "guz", "guz_KE", "gv", "gv_IM", "ha", "ha_GH", "ha_NE", "ha_NG",
|
||||
"haw", "haw_US", "he", "he_IL", "hi", "hi_IN", "hr", "hr_BA", "hr_HR", "hsb", "hsb_DE",
|
||||
"hu", "hu_HU", "hy", "hy_AM", "ia", "ia_001", "id", "id_ID", "ig", "ig_NG", "ii", "ii_CN",
|
||||
"in", "in_ID", "is", "is_IS", "it", "it_CH", "it_IT", "it_SM", "it_VA", "iw", "iw_IL", "ja",
|
||||
"ja_JP", "jgo", "jgo_CM", "jmc", "jmc_TZ", "jv", "jv_ID", "ka", "ka_GE", "kab", "kab_DZ",
|
||||
"kam", "kam_KE", "kde", "kde_TZ", "kea", "kea_CV", "khq", "khq_ML", "ki", "ki_KE", "kk",
|
||||
"kk_KZ", "kkj", "kkj_CM", "kl", "kl_GL", "kln", "kln_KE", "km", "km_KH", "kn", "kn_IN",
|
||||
"ko", "ko_KP", "ko_KR", "kok", "kok_IN", "ks", "ks_IN", "ksb", "ksb_TZ", "ksf", "ksf_CM",
|
||||
"ksh", "ksh_DE", "ku", "ku_TR", "kw", "kw_GB", "ky", "ky_KG", "lag", "lag_TZ", "lb",
|
||||
"lb_LU", "lg", "lg_UG", "lkt", "lkt_US", "ln", "ln_AO", "ln_CD", "ln_CF", "ln_CG", "lo",
|
||||
"lo_LA", "lrc", "lrc_IQ", "lrc_IR", "lt", "lt_LT", "lu", "lu_CD", "luo", "luo_KE", "luy",
|
||||
"luy_KE", "lv", "lv_LV", "mas", "mas_KE", "mas_TZ", "mer", "mer_KE", "mfe", "mfe_MU", "mg",
|
||||
"mg_MG", "mgh", "mgh_MZ", "mgo", "mgo_CM", "mi", "mi_NZ", "mk", "mk_MK", "ml", "ml_IN",
|
||||
"mn", "mn_MN", "mo", "mr", "mr_IN", "ms", "ms_BN", "ms_MY", "ms_SG", "mt", "mt_MT", "mua",
|
||||
"mua_CM", "my", "my_MM", "mzn", "mzn_IR", "naq", "naq_NA", "nb", "nb_NO", "nb_SJ", "nd",
|
||||
"nd_ZW", "nds", "nds_DE", "nds_NL", "ne", "ne_IN", "ne_NP", "nl", "nl_AW", "nl_BE", "nl_BQ",
|
||||
"nl_CW", "nl_NL", "nl_SR", "nl_SX", "nmg", "nmg_CM", "nn", "nn_NO", "nnh", "nnh_CM", "no",
|
||||
"no_NO", "nus", "nus_SS", "nyn", "nyn_UG", "om", "om_ET", "om_KE", "or", "or_IN", "os",
|
||||
"os_GE", "os_RU", "pa", "pa_Arab", "pa_Arab_PK", "pa_Guru", "pa_Guru_IN", "pa_IN", "pa_PK",
|
||||
"pl", "pl_PL", "ps", "ps_AF", "ps_PK", "pt", "pt_AO", "pt_BR", "pt_CH", "pt_CV", "pt_GQ",
|
||||
"pt_GW", "pt_LU", "pt_MO", "pt_MZ", "pt_PT", "pt_ST", "pt_TL", "qu", "qu_BO", "qu_EC",
|
||||
"qu_PE", "rm", "rm_CH", "rn", "rn_BI", "ro", "ro_MD", "ro_RO", "rof", "rof_TZ", "ru",
|
||||
"ru_BY", "ru_KG", "ru_KZ", "ru_MD", "ru_RU", "ru_UA", "rw", "rw_RW", "rwk", "rwk_TZ", "sah",
|
||||
"sah_RU", "saq", "saq_KE", "sbp", "sbp_TZ", "sd", "sd_PK", "se", "se_FI", "se_NO", "se_SE",
|
||||
"seh", "seh_MZ", "ses", "ses_ML", "sg", "sg_CF", "sh", "sh_BA", "sh_CS", "sh_YU", "shi",
|
||||
"shi_Latn", "shi_Latn_MA", "shi_Tfng", "shi_Tfng_MA", "shi_MA", "si", "si_LK", "sk",
|
||||
"sk_SK", "sl", "sl_SI", "smn", "smn_FI", "sn", "sn_ZW", "so", "so_DJ", "so_ET", "so_KE",
|
||||
"so_SO", "sq", "sq_AL", "sq_MK", "sq_XK", "sr", "sr_Cyrl", "sr_Cyrl_BA", "sr_Cyrl_ME",
|
||||
"sr_Cyrl_RS", "sr_Cyrl_CS", "sr_Cyrl_XK", "sr_Cyrl_YU", "sr_Latn", "sr_Latn_BA",
|
||||
"sr_Latn_ME", "sr_Latn_RS", "sr_Latn_CS", "sr_Latn_XK", "sr_Latn_YU", "sr_BA", "sr_ME",
|
||||
"sr_RS", "sr_CS", "sr_YU", "sv", "sv_AX", "sv_FI", "sv_SE", "sw", "sw_CD", "sw_KE", "sw_TZ",
|
||||
"sw_UG", "ta", "ta_IN", "ta_LK", "ta_MY", "ta_SG", "te", "te_IN", "teo", "teo_KE", "teo_UG",
|
||||
"tg", "tg_TJ", "th", "th_TH", "ti", "ti_ER", "ti_ET", "tk", "tk_TM", "tl", "tl_PH", "to",
|
||||
"to_TO", "tr", "tr_CY", "tr_TR", "tt", "tt_RU", "twq", "twq_NE", "tzm", "tzm_MA", "ug",
|
||||
"ug_CN", "uk", "uk_UA", "ur", "ur_IN", "ur_PK", "uz", "uz_AF", "uz_Arab", "uz_Arab_AF",
|
||||
"uz_Cyrl", "uz_Cyrl_UZ", "uz_Latn", "uz_Latn_UZ", "uz_UZ", "vai", "vai_Latn", "vai_Latn_LR",
|
||||
"vai_LR", "vai_Vaii", "vai_Vaii_LR", "vi", "vi_VN", "vun", "vun_TZ", "wae", "wae_CH", "wo",
|
||||
"wo_SN", "xh", "xh_ZA", "xog", "xog_UG", "yav", "yav_CM", "yi", "yi_001", "yo", "yo_BJ",
|
||||
"yo_NG", "yue", "yue_Hans", "yue_Hans_CN", "yue_Hant", "yue_Hant_HK", "zgh", "zgh_MA", "zh",
|
||||
"zh_Hans", "zh_Hans_CN", "zh_Hans_HK", "zh_Hans_MO", "zh_Hans_SG", "zh_Hant", "zh_Hant_HK",
|
||||
"zh_Hant_MO", "zh_Hant_TW", "zh_CN", "zh_HK", "zh_MO", "zh_SG", "zh_TW", "zu", "zu_ZA");
|
||||
|
||||
private static CldrValue parentLocales(String parent, String... locales) {
|
||||
return supplementalData(
|
||||
"parentLocales/parentLocale[@parent=\"%s\"][@locales=\"%s\"]",
|
||||
parent, Joiner.on(' ').join(locales));
|
||||
}
|
||||
|
||||
private static CldrValue defaultCalendar(String calendar, String... territories) {
|
||||
return supplementalData(
|
||||
"calendarPreferenceData/calendarPreference[@territories=\"%s\"][@ordering=\"%s\"]",
|
||||
Joiner.on(' ').join(territories), calendar);
|
||||
}
|
||||
|
||||
private static CldrValue likelySubtag(String from, String to) {
|
||||
return supplementalData(
|
||||
"likelySubtags/likelySubtag[@from=\"%s\"][@to=\"%s\"]", from, to);
|
||||
}
|
||||
|
||||
private static CldrValue languageAlias(String type, String replacement) {
|
||||
return supplementalData(
|
||||
"metadata/alias/languageAlias[@type=\"%s\"][@replacement=\"%s\"]", type, replacement);
|
||||
}
|
||||
|
||||
private static CldrValue scriptAlias(String type, String replacement) {
|
||||
return supplementalData(
|
||||
"metadata/alias/scriptAlias[@type=\"%s\"][@replacement=\"%s\"]", type, replacement);
|
||||
}
|
||||
|
||||
private static CldrValue territoryAlias(String type, String replacement) {
|
||||
return supplementalData(
|
||||
"metadata/alias/territoryAlias[@type=\"%s\"][@replacement=\"%s\"]", type, replacement);
|
||||
}
|
||||
|
||||
private static CldrValue supplementalData(String path, Object... args) {
|
||||
return parseValue(String.format("//supplementalData/" + path, args), "");
|
||||
}
|
||||
|
||||
private static SupplementalData fakeSupplementalData(CldrValue... values) {
|
||||
return SupplementalData.create(CldrDataSupplier.forValues(asList(values)));
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,538 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.regex;
|
||||
|
||||
import static com.google.common.truth.Truth.assertThat;
|
||||
import static java.util.Arrays.asList;
|
||||
import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows;
|
||||
import static org.unicode.icu.tool.cldrtoicu.testing.ResultSubjectFactory.assertThat;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import javax.annotation.concurrent.Immutable;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.junit.runners.JUnit4;
|
||||
import org.unicode.cldr.api.CldrPath;
|
||||
import org.unicode.cldr.api.CldrValue;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbPath;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.Iterables;
|
||||
|
||||
/**
|
||||
* Tests for the regex transformer class. Note that in most cases, the rules used here are taken
|
||||
* directly from one of the config files, simply because it avoids having to invent valid paths
|
||||
* for testing (and we still need "real" CLDR paths since the path parsing verifies attributes
|
||||
* against the DTD metadata). Basing tests on real rules illustrates that all of these tests are
|
||||
* asserting relied-upon behaviour; however, there is nothing inherently special about these
|
||||
* paths.
|
||||
*/
|
||||
@RunWith(JUnit4.class)
|
||||
public class RegexTransformerTest {
|
||||
@Test
|
||||
public void testSingleResults_singleCapture() {
|
||||
PathValueTransformer transformer = transformer(
|
||||
"%A=[^\"']++",
|
||||
"%W=[\\w\\-]++",
|
||||
"//ldml/numbers/defaultNumberingSystem[@alt=\"(%A)\"] ; /NumberElements/default_$1",
|
||||
"//ldml/numbers/defaultNumberingSystem ; /NumberElements/default",
|
||||
"//ldml/numbers/otherNumberingSystems/(%W) ; /NumberElements/$1");
|
||||
|
||||
CldrValue defaultNumberingSystem =
|
||||
CldrValue.parseValue("//ldml/numbers/defaultNumberingSystem", "foobar");
|
||||
assertSingleResult(
|
||||
transformer.transform(defaultNumberingSystem), "NumberElements/default", "foobar");
|
||||
|
||||
CldrValue altNumberingSystem =
|
||||
CldrValue.parseValue("//ldml/numbers/defaultNumberingSystem[@alt=\"foo\"]", "bar");
|
||||
assertSingleResult(
|
||||
transformer.transform(altNumberingSystem), "NumberElements/default_foo", "bar");
|
||||
|
||||
CldrValue otherNumberingSystems =
|
||||
CldrValue.parseValue("//ldml/numbers/otherNumberingSystems/finance", "foo bar");
|
||||
assertSingleResult(
|
||||
transformer.transform(otherNumberingSystems), "NumberElements/finance", "foo bar");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSingleResults_multipleCapture() {
|
||||
PathValueTransformer transformer = transformer(
|
||||
"%A=[^\"']++",
|
||||
"//ldml/characters"
|
||||
+ "/parseLenients[@scope=\"(%A)\"][@level=\"(%A)\"]"
|
||||
+ "/parseLenient[@sample=\"%A\"]"
|
||||
+ " ; /parse/$1/$2");
|
||||
|
||||
CldrValue lenient = CldrValue.parseValue(
|
||||
"//ldml/characters"
|
||||
+ "/parseLenients[@scope=\"general\"][@level=\"lenient\"]"
|
||||
+ "/parseLenient[@sample=\"ignored\"]",
|
||||
"foo");
|
||||
assertSingleResult(
|
||||
transformer.transform(lenient), "/parse/general/lenient", "foo");
|
||||
|
||||
CldrValue stricter = CldrValue.parseValue(
|
||||
"//ldml/characters"
|
||||
+ "/parseLenients[@scope=\"number\"][@level=\"stricter\"]"
|
||||
+ "/parseLenient[@sample=\"ignored\"]",
|
||||
"bar");
|
||||
assertSingleResult(
|
||||
transformer.transform(stricter), "/parse/number/stricter", "bar");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultipleResults() {
|
||||
PathValueTransformer transformer = transformer(
|
||||
"%A=[^\"']++",
|
||||
"%W=[\\s\\w\\-/]++",
|
||||
"//supplementalData/numberingSystems"
|
||||
+ "/numberingSystem[@type=\"numeric\"][@id=\"(%W)\"][@digits=\"(%A)\"]",
|
||||
" ; /numberingSystems/$1/algorithmic:int ; values=0",
|
||||
" ; /numberingSystems/$1/desc ; values=$2",
|
||||
" ; /numberingSystems/$1/radix:int ; values=10");
|
||||
|
||||
CldrValue value = CldrValue.parseValue(
|
||||
"//supplementalData/numberingSystems"
|
||||
+ "/numberingSystem[@type=\"numeric\"][@id=\"foo\"][@digits=\"bar\"]",
|
||||
"");
|
||||
ImmutableList<Result> results = transformer.transform(value);
|
||||
assertThat(results).hasSize(3);
|
||||
assertThat(results.get(0)).hasKey("/numberingSystems/foo/algorithmic:int");
|
||||
assertThat(results.get(0)).hasValues("0");
|
||||
assertThat(results.get(0)).isGrouped(false);
|
||||
|
||||
assertThat(results.get(1)).hasKey("/numberingSystems/foo/desc");
|
||||
assertThat(results.get(1)).hasValues("bar");
|
||||
assertThat(results.get(1)).isGrouped(false);
|
||||
|
||||
assertThat(results.get(2)).hasKey("/numberingSystems/foo/radix:int");
|
||||
assertThat(results.get(2)).hasValues("10");
|
||||
assertThat(results.get(2)).isGrouped(false);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testImplicitArgumentSplitting() {
|
||||
PathValueTransformer transformer = transformer(
|
||||
"%A=[^\"']++",
|
||||
"%W=[\\s\\w\\-/]++",
|
||||
"//supplementalData/gender/personList[@type=\"(%W)\"][@locales=\"(%W)\"]"
|
||||
+ " ; /genderList/$2 ; values=$1",
|
||||
"//supplementalData/windowsZones/mapTimezones"
|
||||
+ "/mapZone[@type=\"(%A)\"][@other=\"(%A)\"][@territory=\"(%W)\"]"
|
||||
+ " ; /mapTimezones/\"$2\"/$3 ; values=\"$1\"");
|
||||
|
||||
// Implicit splitting is based on the first unquoted placeholder in the output path ($2 in
|
||||
// this case) and not the first captured group of the input path.
|
||||
CldrValue personList = CldrValue.parseValue(
|
||||
"//supplementalData/gender/personList[@type=\"neutral\"][@locales=\"xx yy zz\"]", "");
|
||||
ImmutableList<Result> results = transformer.transform(personList);
|
||||
assertThat(results).hasSize(3);
|
||||
assertThat(results.get(0)).hasKey("/genderList/xx");
|
||||
assertThat(results.get(0)).hasValues("neutral");
|
||||
assertThat(results.get(1)).hasKey("/genderList/yy");
|
||||
assertThat(results.get(1)).hasValues("neutral");
|
||||
assertThat(results.get(2)).hasKey("/genderList/zz");
|
||||
assertThat(results.get(2)).hasValues("neutral");
|
||||
|
||||
// Quoting prevents the first captured argument with spaces from triggering multiple
|
||||
// results (it will trigger on the first un-quoted argument in the output path). This
|
||||
// quoting must appear in the output, however, since spaces are "structural" in paths in
|
||||
// ICU data files.
|
||||
CldrValue mapZone = CldrValue.parseValue(
|
||||
"//supplementalData/windowsZones/mapTimezones/mapZone"
|
||||
+ "[@type=\"foo\"]"
|
||||
+ "[@other=\"not split\"]"
|
||||
+ "[@territory=\"XX YY ZZ\"]",
|
||||
"");
|
||||
results = transformer.transform(mapZone);
|
||||
assertThat(results).hasSize(3);
|
||||
assertThat(results.get(0)).hasKey("/mapTimezones/\"not split\"/XX");
|
||||
assertThat(results.get(0)).hasValues("foo");
|
||||
assertThat(results.get(1)).hasKey("/mapTimezones/\"not split\"/YY");
|
||||
assertThat(results.get(1)).hasValues("foo");
|
||||
assertThat(results.get(2)).hasKey("/mapTimezones/\"not split\"/ZZ");
|
||||
assertThat(results.get(2)).hasValues("foo");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testValueSplitting() {
|
||||
PathValueTransformer transformer = transformer(
|
||||
"%A=[^\"']++",
|
||||
"%W=[\\s\\w\\-/]++",
|
||||
"//supplementalData/parentLocales/parentLocale[@parent=\"(%A)\"][@locales=\"(%A)\"]"
|
||||
+ " ; /parentLocales/$1 ; values=$2",
|
||||
"//supplementalData/windowsZones/mapTimezones"
|
||||
+ "/mapZone[@type=\"(%A)\"][@other=\"(%A)\"][@territory=\"(%W)\"]"
|
||||
+ " ; /mapTimezones/\"$2\"/$3 ; values=\"$1\"");
|
||||
|
||||
// Because the value is expressed via an explicit values instruction, it is split by space.
|
||||
CldrValue parentLocale = CldrValue.parseValue(
|
||||
"//supplementalData/parentLocales"
|
||||
+ "/parentLocale[@parent=\"foo\"][@locales=\"value is split\"]",
|
||||
"");
|
||||
assertSingleResult(transformer.transform(parentLocale),
|
||||
"/parentLocales/foo", "value", "is", "split");
|
||||
|
||||
// However if a placeholder is quoted in the value instruction, it is not split.
|
||||
CldrValue mapZone = CldrValue.parseValue(
|
||||
"//supplementalData/windowsZones/mapTimezones/mapZone"
|
||||
+ "[@type=\"value is not split\"]"
|
||||
+ "[@other=\"foo\"]"
|
||||
+ "[@territory=\"XX\"]",
|
||||
"");
|
||||
assertSingleResult(transformer.transform(mapZone),
|
||||
"/mapTimezones/\"foo\"/XX", "value is not split");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testResultFunctionCalling() {
|
||||
List<String> configLines = asList(
|
||||
"%A=[^\"']++",
|
||||
"%W=[\\s\\w\\-/]++",
|
||||
"//supplementalData/numberingSystems"
|
||||
+ "/numberingSystem[@type=\"(%W)\"][@id=\"(%W)\"][@rules=\"(%A)\"]",
|
||||
" ; /numberingSystems/foo ; values=&swap( $1 , $2 ) $3",
|
||||
" ; /numberingSystems/bar ; values=\"&swap( $1, quux )\"",
|
||||
" ; /numberingSystems/baz ; values=\"&swap( $1-$2, $3{value} )\"");
|
||||
|
||||
CldrValue numberingSystem = CldrValue.parseValue(
|
||||
"//supplementalData/numberingSystems"
|
||||
+ "/numberingSystem[@type=\"foo\"][@id=\"bar\"][@rules=\"baz\"]",
|
||||
"-VALUE");
|
||||
|
||||
// Note that joining with a space is rather a trivial function, but it does illustrate that
|
||||
// a function's output is still subject to value splitting unless quoted. In fact a common
|
||||
// function (&ymd) is used to split year/month/day strings using spaces exactly so they are
|
||||
// treated as separate values.
|
||||
// Note also that the spaces around the arguments to the function are ignored however.
|
||||
NamedFunction swapFn =
|
||||
NamedFunction.create("swap", 2, args -> args.get(1) + " " + args.get(0));
|
||||
PathValueTransformer transformer = RegexTransformer.fromConfigLines(configLines, swapFn);
|
||||
ImmutableList<Result> results = transformer.transform(numberingSystem);
|
||||
|
||||
assertThat(results).hasSize(3);
|
||||
assertThat(results.get(0)).hasValues("bar", "foo", "baz");
|
||||
assertThat(results.get(1)).hasValues("quux foo");
|
||||
assertThat(results.get(2)).hasValues("baz-VALUE foo-bar");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testResultFunctionCalling_edgeCases() {
|
||||
List<String> configLines = asList(
|
||||
"%A=[^\"']++",
|
||||
"%W=[\\s\\w\\-/]++",
|
||||
"//supplementalData/numberingSystems"
|
||||
+ "/numberingSystem[@type=\"(%W)\"][@id=\"(%W)\"][@rules=\"(%A)\"]",
|
||||
" ; /numberingSystems/foo ; values=\"&join( {value} , $1 $2 $3, {value} )\"");
|
||||
|
||||
// This illustrates a fundamental problem with the way that quoting and splitting is
|
||||
// defined in this config language. Splitting is always done after value substitution,
|
||||
// which is just done as a single pass. Thus, if a value has a double-quote in it, it can
|
||||
// upset the quoting behaviour in odd ways. Here it prevents the outermost quoting from
|
||||
// working and results in multiple values where there should be one.
|
||||
//
|
||||
// To fix this, the implicit splitting should be replaced by a "split()" function and the
|
||||
// rules should be parsed into something approximating a proper expression AST.
|
||||
CldrValue badValue = CldrValue.parseValue(
|
||||
"//supplementalData/numberingSystems"
|
||||
+ "/numberingSystem[@type=\"foo\"][@id=\"bar\"][@rules=\"baz\"]",
|
||||
"<< \" >>");
|
||||
|
||||
NamedFunction joinFn =
|
||||
NamedFunction.create("join", 3, args -> args.get(0) + args.get(1) + args.get(2));
|
||||
PathValueTransformer transformer = RegexTransformer.fromConfigLines(configLines, joinFn);
|
||||
ImmutableList<Result> results = transformer.transform(badValue);
|
||||
// If outer quoting worked, this would be a single value, not five.
|
||||
assertSingleResult(results, "/numberingSystems/foo", "<< ", ">>foo", "bar", "baz<<", " >>");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDynamicVars() {
|
||||
PathValueTransformer transformer = transformer(
|
||||
"%W=[\\w\\-]++",
|
||||
"%D=//ldml/numbers/defaultNumberingSystem",
|
||||
"//ldml/numbers/currencyFormats[@numberSystem=\"%D\"]/currencySpacing/(%W)/(%W)",
|
||||
" ; /currencySpacing/$1/$2");
|
||||
CldrValue cldrValue = CldrValue.parseValue(
|
||||
"//ldml/numbers/currencyFormats[@numberSystem=\"latn\"]"
|
||||
+ "/currencySpacing/beforeCurrency/currencyMatch",
|
||||
"format");
|
||||
// The path we expect to be resolved by the dynamic variable function.
|
||||
CldrPath expectedPath =
|
||||
CldrPath.parseDistinguishingPath("//ldml/numbers/defaultNumberingSystem");
|
||||
ImmutableList<Result> format = transformer.transform(cldrValue, p -> {
|
||||
assertThat(p).isEqualTo(expectedPath);
|
||||
return "latn";
|
||||
});
|
||||
assertSingleResult(format, "/currencySpacing/beforeCurrency/currencyMatch", "format");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFallbacks_simple() {
|
||||
PathValueTransformer transformer = transformer(
|
||||
"%W=[\\w\\-/]++",
|
||||
"//ldml/numbers/currencies/currency[@type=\"(%W)\"]/symbol"
|
||||
+ " ; /Currencies/$1 ; fallback=$1",
|
||||
"//ldml/numbers/currencies/currency[@type=\"(%W)\"]/displayName"
|
||||
+ " ; /Currencies/$1 ; fallback=$1");
|
||||
|
||||
ImmutableList<Result> symbol = transformer.transform(
|
||||
CldrValue.parseValue(
|
||||
"//ldml/numbers/currencies/currency[@type=\"Foo\"]/symbol", "symbol"));
|
||||
assertSingleResult(symbol, "Currencies/Foo", "symbol");
|
||||
ImmutableList<Result> name = transformer.transform(
|
||||
CldrValue.parseValue(
|
||||
"//ldml/numbers/currencies/currency[@type=\"Foo\"]/displayName", "name"));
|
||||
assertSingleResult(name, "Currencies/Foo", "name");
|
||||
|
||||
RbPath rbPath = RbPath.of("Currencies", "Foo");
|
||||
ImmutableList<Result> fallbacks = transformer.getFallbackResultsFor(rbPath, p -> null);
|
||||
assertThat(fallbacks).hasSize(2);
|
||||
|
||||
// Both fallbacks look like they are equal, but they didn't come from the same rule...
|
||||
assertThat(fallbacks.get(0)).hasKey(rbPath);
|
||||
assertThat(fallbacks.get(0)).hasValues("Foo");
|
||||
assertThat(fallbacks.get(1)).hasKey(rbPath);
|
||||
assertThat(fallbacks.get(1)).hasValues("Foo");
|
||||
|
||||
// ... so they correspond to different matched results.
|
||||
assertThat(fallbacks.get(0).isFallbackFor(symbol.get(0))).isTrue();
|
||||
assertThat(fallbacks.get(1).isFallbackFor(symbol.get(0))).isFalse();
|
||||
|
||||
assertThat(fallbacks.get(0).isFallbackFor(name.get(0))).isFalse();
|
||||
assertThat(fallbacks.get(1).isFallbackFor(name.get(0))).isTrue();
|
||||
|
||||
// And they are ordered by their appearance in the configuration file.
|
||||
assertThat(fallbacks.get(0)).isLessThan(fallbacks.get(1));
|
||||
|
||||
// BUT (and this is important) the fallback results are "equal". This is necessary for
|
||||
// other situations where results are generated from different rules but should be
|
||||
// considered "equal" for purposes of deduplication. Deduplication doesn't affect this
|
||||
// situation though (but it's worth being explicit in this test). This is all a bit subtle
|
||||
// and should be fixed properly at some point. See also "testBaseXpath()".
|
||||
assertThat(fallbacks.get(0)).isEqualTo(fallbacks.get(1));
|
||||
}
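
For context, a converter consuming this API would normally only apply a fallback whose
originating rule actually matched something for the locale. A rough sketch of that filtering,
using the isFallbackFor() check exercised above (matchedResults is a hypothetical collection
of results already produced):

    ImmutableList<Result> candidates = transformer.getFallbackResultsFor(rbPath, p -> null);
    List<Result> applicable = candidates.stream()
        .filter(fb -> matchedResults.stream().anyMatch(fb::isFallbackFor))
        .collect(Collectors.toList());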
|
||||
|
||||
@Test
|
||||
public void testFallbacks_multipleArgs() {
|
||||
PathValueTransformer transformer = transformer(
|
||||
"%W=[\\s\\w\\-/]++",
|
||||
"//supplementalData/calendarData"
|
||||
+ "/calendar[@type=\"(%W)\"]/eras/era[@type=\"(%W)\"][@(start|end)=\"(%A)\"]",
|
||||
" ; /fake/$2/$4/$1/$3 ; fallback=$1 $2 $3 $4 $3 $2 $1");
|
||||
// Path elements match the $N indices so it's easy to see how reordering happens.
|
||||
RbPath rbPath = RbPath.of("fake", "two", "four", "one", "three");
|
||||
// This shows that the capturing of arguments done on the resource bundle path for the
|
||||
// fallback correctly reordered the arguments. Having this many reordered arguments in a
|
||||
// fallback is not something that really happens in the actual config files currently, but
|
||||
// it's complex logic and needs to be tested. Note also how captured arguments can appear
|
||||
// multiple times in the result.
|
||||
assertSingleResult(
|
||||
transformer.getFallbackResultsFor(rbPath, p -> null),
|
||||
rbPath,
|
||||
"one", "two", "three", "four", "three", "two", "one");
|
||||
}
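
Spelled out, the capture and reordering above work as follows:

    /fake/$2/$4/$1/$3  matched against  fake/two/four/one/three
        =>  $1=one, $2=two, $3=three, $4=four
    fallback "$1 $2 $3 $4 $3 $2 $1"
        =>  one two three four three two one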
|
||||
|
||||
@Test
|
||||
public void testFallbacks_valueSplitting() {
|
||||
PathValueTransformer transformer = transformer(
|
||||
"%A=[^\"']++",
|
||||
"//supplementalData/likelySubtags/likelySubtag[@from=\"(%A)\"][@to=\"(%A)\"]",
|
||||
" ; /fake/$1/$2 ; fallback=$1 and $2");
|
||||
|
||||
RbPath rbPath = RbPath.of("fake", "Foo", "Bar");
|
||||
ImmutableList<Result> fallbacks = transformer.getFallbackResultsFor(rbPath, p -> null);
|
||||
assertSingleResult(fallbacks, rbPath, "Foo", "and", "Bar");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFallbacks_missingArgs() {
|
||||
IllegalStateException e = assertThrows(
|
||||
IllegalStateException.class,
|
||||
() -> transformer(
|
||||
"%A=[^\"']++",
|
||||
"//supplementalData/likelySubtags/likelySubtag[@from=\"(%A)\"][@to=\"(%A)\"]",
|
||||
" ; /$1 ; fallback=$2"));
|
||||
// A bit brittle, but this message is important for debugging.
|
||||
assertThat(e).hasMessageThat()
|
||||
.contains("fallback values may only contain arguments from the resource bundle path");
|
||||
assertThat(e).hasMessageThat().contains("$2");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFallbacks_noValueSubstitution() {
|
||||
PathValueTransformer transformer = transformer(
|
||||
"%A=[^\"']++",
|
||||
"//supplementalData/likelySubtags/likelySubtag[@from=\"(%A)\"][@to=\"(%A)\"]",
|
||||
" ; /$1 ; fallback=$1-{value}");
|
||||
|
||||
RbPath rbPath = RbPath.of("Foo");
|
||||
ImmutableList<Result> fallbacks = transformer.getFallbackResultsFor(rbPath, p -> null);
|
||||
// The {value} token is not substituted in a fallback because there is no value.
|
||||
// TODO: Make this into an error (since it's only ever going to happen by mistake)!
|
||||
assertSingleResult(fallbacks, rbPath, "Foo-{value}");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFallbacks_noQuotingSupport() {
|
||||
PathValueTransformer transformer = transformer(
|
||||
"%A=[^\"']++",
|
||||
"//supplementalData/likelySubtags/likelySubtag[@from=\"(%A)\"][@to=\"(%A)\"]",
|
||||
" ; /fake/$1 ; fallback=\"$1\"");
|
||||
|
||||
RbPath rbPath = RbPath.of("fake", "Foo");
|
||||
ImmutableList<Result> fallbacks = transformer.getFallbackResultsFor(rbPath, p -> null);
|
||||
// Fallbacks could support quoting of placeholders, but to match legacy behaviour,
|
||||
// they don't yet. As it is, you cannot prevent fallback values from being split on spaces.
|
||||
assertSingleResult(fallbacks, rbPath, "\"Foo\"");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHiddenLabelsAndMetazones() {
|
||||
PathValueTransformer transformer = transformer(
|
||||
"%A=[^\"']++",
|
||||
"%W=[\\s\\w\\-/]++",
|
||||
"//supplementalData/metaZones/metazoneInfo"
|
||||
+ "/timezone[@type=\"(%W)\"]/usesMetazone[@mzone=\"(%W)\"]"
|
||||
+ " ; /metazoneInfo/\"$1\"/<$2> ; values=$2",
|
||||
"//supplementalData/metaZones/metazoneInfo"
|
||||
+ "/timezone[@type=\"(%W)\"]/usesMetazone[@to=\"(%A)\"][@mzone=\"(%W)\"]"
|
||||
+ " ; /metazoneInfo/\"$1\"/<1970-01-01 00:00> ; values=$3 \"1970-01-01 00:00\" \"$2\"");
|
||||
|
||||
ImmutableList<Result> parisTz = transformPath(
|
||||
transformer,
|
||||
"//supplementalData/metaZones/metazoneInfo"
|
||||
+ "/timezone[@type=\"Europe/Paris\"]/usesMetazone[@mzone=\"Europe_Central\"]");
|
||||
|
||||
// The conversion from "Europe/Paris" to "Europe:Paris" is a built-in special case when
|
||||
// quoting values containing '/'. It's only actually necessary for these timezone identifiers,
|
||||
// but the code is applied everywhere since that's easier. Ideally there'd be something
|
||||
// like the function calling mechanism to make this transformation explicit, but at the
|
||||
// moment, the output resource bundle paths have no way to control the transformation of
|
||||
// substituted arguments, so it has to be built in.
|
||||
assertSingleResult(
|
||||
parisTz, "/metazoneInfo/\"Europe:Paris\"/<Europe_Central>", "Europe_Central");
|
||||
|
||||
ImmutableList<Result> britishTz = transformPath(
|
||||
transformer,
|
||||
"//supplementalData/metaZones/metazoneInfo"
|
||||
+ "/timezone[@type=\"Europe/London\"]"
|
||||
+ "/usesMetazone[@to=\"1971-10-31 02:00\"][@mzone=\"Europe_Central\"]");
|
||||
|
||||
// This example demonstrates that things like ' ' or ':' (normally prohibited in resource
|
||||
// bundle path elements) are acceptable in hidden labels, since those will be stripped out
|
||||
// while writing the resulting data file. The date-time values are quoted in the rule to
|
||||
// ensure they are not split.
|
||||
assertSingleResult(
|
||||
britishTz,
|
||||
"/metazoneInfo/\"Europe:London\"/<1970-01-01 00:00>",
|
||||
"Europe_Central", "1970-01-01 00:00", "1971-10-31 02:00");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBaseXpath() {
|
||||
PathValueTransformer transformer = transformer(
|
||||
"%W=[\\s\\w\\-/]++",
|
||||
"%N=[\\d\\.]++",
|
||||
|
||||
// In the real data, these rules define multiple results which reflect the actual
|
||||
// differences in the child elements, but the one tested here is only based on the
|
||||
// <territory> path prefix, which is the same for many child elements (which is all
|
||||
// that's ever actually transformed).
|
||||
//
|
||||
// So for a single path prefix you'll generate multiple identical results which need
|
||||
// to be de-duplicated, which can only happen if they are considered to have come
|
||||
// from the same source (since duplicate results happen all the time in general).
|
||||
//
|
||||
// This is what the base xpath does: it fakes a different source CLDR path, which makes
|
||||
// the results "equal" (even though they came from different CLDR source paths).
|
||||
"//supplementalData/territoryInfo"
|
||||
+ "/territory[@type=\"(%W)\"][@gdp=\"(%N)\"][@literacyPercent=\"(%N)\"][@population=\"(%N)\"]"
|
||||
+ "/languagePopulation[@type=\"(%W)\"][@populationPercent=\"(%N)\"]",
|
||||
" ; /territoryInfo/$1/territoryF:intvector"
|
||||
+ " ; values=$2 $3 $4"
|
||||
+ " ; base_xpath=//supplementalData/territoryInfo/territory[@type=\"$1\"]",
|
||||
|
||||
// Same thing but with child element containing "writingPercent".
|
||||
"//supplementalData/territoryInfo"
|
||||
+ "/territory[@type=\"(%W)\"][@gdp=\"(%N)\"][@literacyPercent=\"(%N)\"][@population=\"(%N)\"]"
|
||||
+ "/languagePopulation[@type=\"(%W)\"][@writingPercent=\"(%N)\"][@populationPercent=\"(%N)\"]",
|
||||
" ; /territoryInfo/$1/territoryF:intvector"
|
||||
+ " ; values=$2 $3 $4"
|
||||
+ " ; base_xpath=//supplementalData/territoryInfo/territory[@type=\"$1\"]");
|
||||
|
||||
String commonPrefix =
|
||||
"//supplementalData/territoryInfo"
|
||||
+ "/territory[@type=\"CI\"][@gdp=\"97160000000\"][@literacyPercent=\"57\"][@population=\"26260600\"]";
|
||||
|
||||
ImmutableList<Result> firstResult = transformPath(
|
||||
transformer,
|
||||
commonPrefix + "/languagePopulation[@type=\"kfo\"][@populationPercent=\"0.3\"]");
|
||||
|
||||
ImmutableList<Result> secondResult = transformPath(
|
||||
transformer,
|
||||
commonPrefix + "/languagePopulation[@type=\"sef\"][@writingPercent=\"5\"][@populationPercent=\"4\"]");
|
||||
|
||||
assertSingleResult(
|
||||
firstResult, "/territoryInfo/CI/territoryF:intvector", "97160000000", "57", "26260600");
|
||||
assertSingleResult(
|
||||
secondResult, "/territoryInfo/CI/territoryF:intvector", "97160000000", "57", "26260600");
|
||||
|
||||
// Even though they come from different rules, these results are treated as interchangeably
|
||||
// equal because the base path is the same. Without the base path this would not be equal.
|
||||
assertThat(firstResult).isEqualTo(secondResult);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testResultGrouping() {
|
||||
PathValueTransformer transformer = transformer(
|
||||
"%W=[\\w\\-/]++",
|
||||
"//ldml/numbers/currencies/currency[@type=\"(%W)\"]/symbol ; /Currencies/$1",
|
||||
"//ldml/numbers/currencies/currency[@type=\"(%W)\"]/decimal ; /Currencies/$1 ; group");
|
||||
|
||||
Result ungrouped = transformSingleResult(
|
||||
transformer, "//ldml/numbers/currencies/currency[@type=\"USD\"]/symbol", "$");
|
||||
Result grouped = transformSingleResult(
|
||||
transformer, "//ldml/numbers/currencies/currency[@type=\"USD\"]/decimal", ".");
|
||||
|
||||
// Note that grouping is important for some data, but isn't very interesting at the basic
|
||||
// transformation level (it's just a bit). It's only interesting when the converter
|
||||
// combines multiple results together.
|
||||
assertThat(ungrouped).isGrouped(false);
|
||||
assertThat(grouped).isGrouped(true);
|
||||
}
|
||||
|
||||
private static PathValueTransformer transformer(String... configLines) {
|
||||
return RegexTransformer.fromConfigLines(asList(configLines));
|
||||
}
|
||||
|
||||
private static ImmutableList<Result> transformPath(
|
||||
PathValueTransformer transformer, String cldrPath) {
|
||||
|
||||
return transformer.transform(CldrValue.parseValue(cldrPath, ""));
|
||||
}
|
||||
|
||||
private static Result transformSingleResult(
|
||||
PathValueTransformer transformer, String path, String value) {
|
||||
|
||||
ImmutableList<Result> results =
|
||||
transformer.transform(CldrValue.parseValue(path, value));
|
||||
assertThat(results).hasSize(1);
|
||||
return results.get(0);
|
||||
}
|
||||
|
||||
private static void assertSingleResult(List<Result> results, RbPath path, String... values) {
|
||||
assertThat(results).hasSize(1);
|
||||
assertThat(results.get(0)).isGrouped(false);
|
||||
assertThat(results.get(0)).hasKey(path);
|
||||
assertThat(results.get(0)).hasValues(values);
|
||||
}
|
||||
|
||||
private static void assertSingleResult(List<Result> results, String path, String... values) {
|
||||
assertSingleResult(results, RbPath.parse(path), values);
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,29 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.testing;
|
||||
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
/** Static assertion helpers (some of which can be removed if the JUnit version is updated). */
|
||||
public final class AssertUtils {
|
||||
// Functional interface acting as a lambda target.
|
||||
public interface CheckedRunnable<T extends Throwable> {
|
||||
void run() throws T;
|
||||
}
|
||||
|
||||
/** Asserts that an exception is thrown by a given runnable. */
|
||||
public static <T extends Throwable> T assertThrows(Class<T> cls, CheckedRunnable<T> fn) {
|
||||
try {
|
||||
fn.run();
|
||||
} catch (Throwable t) {
|
||||
if (cls.isInstance(t)) {
|
||||
return cls.cast(t);
|
||||
}
|
||||
fail("expected " + cls.getName() + " but got " + t.getClass().getName());
|
||||
}
|
||||
fail("expected " + cls.getName() + " but nothing was thrown");
|
||||
throw new AssertionError("unreachable!");
|
||||
}
|
||||
|
||||
private AssertUtils() {}
|
||||
}
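
A trivial usage sketch (illustrative only); the transformer tests above use assertThrows in
exactly this way to capture an expected exception and then assert on its message via Truth:

    IllegalStateException e = assertThrows(
        IllegalStateException.class, () -> { throw new IllegalStateException("boom"); });
    assertThat(e).hasMessageThat().isEqualTo("boom");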
|
|
@@ -0,0 +1,33 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.testing;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
|
||||
import com.google.common.truth.FailureMetadata;
|
||||
import com.google.common.truth.Subject;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbPath;
|
||||
|
||||
public final class RbPathSubject extends Subject {
|
||||
// For use when chaining from other subjects.
|
||||
public static Subject.Factory<RbPathSubject, RbPath> rbPaths() {
|
||||
return RbPathSubject::new;
|
||||
}
|
||||
|
||||
private final RbPath actual;
|
||||
|
||||
protected RbPathSubject(FailureMetadata metadata, RbPath actual) {
|
||||
super(metadata, actual);
|
||||
this.actual = actual;
|
||||
}
|
||||
|
||||
/** Asserts the value of the path, as segments (use this if a segment can contain '/'). */
|
||||
public final void hasSegments(String... segments) {
|
||||
check("<segments>").that(actual).isEqualTo(RbPath.of(segments));
|
||||
}
|
||||
|
||||
public final void hasLength(int n) {
|
||||
checkArgument(n >= 0, "invalid path length: %s", n);
|
||||
check("length()").that(actual.length()).isEqualTo(n);
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,22 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.testing;
|
||||
|
||||
import com.google.common.truth.FailureMetadata;
|
||||
import com.google.common.truth.Subject;
|
||||
import com.google.common.truth.Truth;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbPath;
|
||||
|
||||
/** Truth subject for asserting about resource bundle paths (makes tests much more readable). */
|
||||
public final class RbPathSubjectFactory implements Subject.Factory<RbPathSubject, RbPath> {
|
||||
public static RbPathSubject assertThat(RbPath result) {
|
||||
return Truth.assertAbout(new RbPathSubjectFactory()).that(result);
|
||||
}
|
||||
|
||||
@Override
|
||||
public RbPathSubject createSubject(FailureMetadata failureMetadata, RbPath that) {
|
||||
return new RbPathSubject(failureMetadata, that);
|
||||
}
|
||||
|
||||
RbPathSubjectFactory() {}
|
||||
}
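
A short usage sketch, assuming a static import of RbPathSubjectFactory.assertThat:

    RbPath path = RbPath.of("Currencies", "USD");
    assertThat(path).hasLength(2);
    assertThat(path).hasSegments("Currencies", "USD");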
|
|
@@ -0,0 +1,53 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.testing;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
|
||||
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
|
||||
import org.unicode.icu.tool.cldrtoicu.RbPath;
|
||||
|
||||
import com.google.common.truth.ComparableSubject;
|
||||
import com.google.common.truth.FailureMetadata;
|
||||
import com.google.common.truth.IterableSubject;
|
||||
import com.google.common.truth.Subject;
|
||||
|
||||
public final class ResultSubject extends ComparableSubject<Result> {
|
||||
// For use when chaining from other subjects.
|
||||
public static Subject.Factory<ResultSubject, Result> results() {
|
||||
return ResultSubject::new;
|
||||
}
|
||||
|
||||
private final Result actual;
|
||||
|
||||
protected ResultSubject(FailureMetadata metadata, Result result) {
|
||||
super(metadata, checkNotNull(result));
|
||||
this.actual = result;
|
||||
}
|
||||
|
||||
public final void isGrouped(boolean grouped) {
|
||||
if (grouped != actual.isGrouped()) {
|
||||
check("isGrouped()").that(actual.isGrouped()).isEqualTo(grouped);
|
||||
}
|
||||
}
|
||||
|
||||
public final IterableSubject hasValueListThat() {
|
||||
return check("getValues()").that(actual.getValues());
|
||||
}
|
||||
|
||||
public final void hasValues(String... values) {
|
||||
hasValueListThat().containsExactlyElementsIn(values);
|
||||
}
|
||||
|
||||
public final RbPathSubject hasKeyThat() {
|
||||
return check("getKey()").about(RbPathSubject.rbPaths()).that(actual.getKey());
|
||||
}
|
||||
|
||||
public final void hasKey(RbPath path) {
|
||||
hasKeyThat().isEqualTo(path);
|
||||
}
|
||||
|
||||
public final void hasKey(String path) {
|
||||
hasKey(RbPath.parse(path));
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,22 @@
|
|||
// © 2019 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
package org.unicode.icu.tool.cldrtoicu.testing;
|
||||
|
||||
import com.google.common.truth.FailureMetadata;
|
||||
import com.google.common.truth.Subject;
|
||||
import com.google.common.truth.Truth;
|
||||
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
|
||||
|
||||
/** Truth subject for asserting about transformation results (makes tests much more readable). */
|
||||
public class ResultSubjectFactory implements Subject.Factory<ResultSubject, Result> {
|
||||
public static ResultSubject assertThat(Result result) {
|
||||
return Truth.assertAbout(new ResultSubjectFactory()).that(result);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ResultSubject createSubject(FailureMetadata failureMetadata, Result that) {
|
||||
return new ResultSubject(failureMetadata, that);
|
||||
}
|
||||
|
||||
private ResultSubjectFactory() {}
|
||||
}
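
And the equivalent usage sketch for transformation results, assuming a static import of
ResultSubjectFactory.assertThat and any non-empty list of results; this is the pattern the
assertSingleResult() helpers in the transformer tests are built on:

    Result result = results.get(0);
    assertThat(result).isGrouped(false);
    assertThat(result).hasKey("Currencies/USD");
    assertThat(result).hasValues("$");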
|