From 566e0f86866cc7f16122b56b40d61b3d2430b317 Mon Sep 17 00:00:00 2001 From: David Beaumont Date: Mon, 4 May 2020 10:13:49 +0000 Subject: [PATCH] ICU-21084 Migrating ICU tools to use PathMatcher --- .../icu/tool/cldrtoicu/CldrDataProcessor.java | 457 ++++++++++++++++++ .../icu/tool/cldrtoicu/LdmlConverter.java | 32 +- .../icu/tool/cldrtoicu/PathMatcher.java | 260 ---------- .../icu/tool/cldrtoicu/PseudoLocales.java | 72 +-- .../icu/tool/cldrtoicu/SupplementalData.java | 9 +- .../tool/cldrtoicu/mapper/Bcp47Mapper.java | 294 ++++++----- .../cldrtoicu/mapper/BreakIteratorMapper.java | 73 ++- .../cldrtoicu/mapper/CollationMapper.java | 161 +++--- .../cldrtoicu/mapper/DayPeriodsMapper.java | 81 ++-- .../cldrtoicu/mapper/PluralRangesMapper.java | 60 +-- .../tool/cldrtoicu/mapper/PluralsMapper.java | 186 ++++--- .../icu/tool/cldrtoicu/mapper/RbnfMapper.java | 157 +++--- .../cldrtoicu/mapper/SupplementalMapper.java | 15 +- .../cldrtoicu/mapper/TransformsMapper.java | 133 +++-- .../cldrtoicu/AlternateLocaleDataTest.java | 31 +- .../tool/cldrtoicu/CldrDataProcessorTest.java | 154 ++++++ .../icu/tool/cldrtoicu/PathMatcherTest.java | 158 ------ .../mapper/SupplementalMapperTest.java | 14 +- 18 files changed, 1247 insertions(+), 1100 deletions(-) create mode 100644 tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/CldrDataProcessor.java delete mode 100644 tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PathMatcher.java create mode 100644 tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/CldrDataProcessorTest.java delete mode 100644 tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/PathMatcherTest.java diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/CldrDataProcessor.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/CldrDataProcessor.java new file mode 100644 index 00000000000..27ee75f1a25 --- /dev/null +++ 
b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/CldrDataProcessor.java @@ -0,0 +1,457 @@ +// © 2019 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +package org.unicode.icu.tool.cldrtoicu; + +import static com.google.common.base.Preconditions.checkNotNull; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.BiConsumer; +import java.util.function.BiFunction; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.function.Supplier; + +import org.unicode.cldr.api.CldrData; +import org.unicode.cldr.api.CldrData.PathOrder; +import org.unicode.cldr.api.CldrData.PrefixVisitor; +import org.unicode.cldr.api.CldrData.PrefixVisitor.Context; +import org.unicode.cldr.api.CldrPath; +import org.unicode.cldr.api.CldrValue; +import org.unicode.cldr.api.PathMatcher; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Lists; + +/** + * An immutable processor which can be configured to process CLDR data according to a series of + * mappings from CLDR paths to "actions". + * + *

In typical use a processor would be statically created to bind paths and handler functions + * (actions) together, with {@link CldrDataProcessor#process(CldrData, Object, PathOrder)} + * then being called once for each {@link CldrData} instance. + * + *

A processor is built by adding a mixture of "actions" to a builder. An action either defines + * how to handle a single value (see {@link SubProcessor#addValueAction addValueAction()}) or how + * to start a new sub-processor at a specific point in the data hierarchy (see {@link + * SubProcessor#addAction addAction()} or {@link SubProcessor#addSubprocessor addSubprocessor()}). + * + * @param the main "state" type used by the processor for the top-level processing. + */ +public class CldrDataProcessor { + /** Returns a processor builder which operates on a "state" of type {@code }. */ + public static Builder builder() { + return new Builder<>(); + } + + /** + * A builder for processing a CLDR data sub-hierarchy. + * + * @param the "state" type used by the processor. + */ + public static abstract class SubProcessor { + final List> prefixActions = new ArrayList<>(); + final List> valueActions = new ArrayList<>(); + + private SubProcessor() { } + + /** + * Binds a subtype action to a {@link PathMatcher} prefix pattern, returning a new builder + * for the sub-hierarchy. + * + *

This method is intended for cases where the subtype state does not depend on the + * parent state or the path prefix, but needs some post-processing. For example, the + * subtype state might just be a {@code List} and the elements added to it must be + * combined with the parent state after sub-hierarchy processing is complete. + * + *

{@code
+         * processor
+         *     .addAction("//parent/path", ArrayList::new, ParentState::addValues)
+         *     .addValueAction("value/suffix", List::add);
+         * }
+ * + * @param pattern the path pattern for the prefix where sub-processing starts. + * @param newStateFn a supplier of subtype state instances for each sub-processing step. + * @param doneFn called after each sub-processing step. + */ + public SubProcessor addAction( + String pattern, Supplier newStateFn, BiConsumer doneFn) { + return addAction(pattern, (t, p) -> newStateFn.get(), doneFn); + } + + /** + * Binds a subtype action to a {@link PathMatcher} prefix pattern, returning a new builder + * for the sub-hierarchy. + * + *

This method is similar to {@link #addAction(String, Supplier, BiConsumer)} but is + * intended for cases where the subtype state depends on the parent path prefix. + * + *

{@code
+         * processor
+         *     .addAction("//parent/path[@type=*]", SubState::fromType, ParentState::addSubState)
+         *     .addValueAction("value/suffix", SubState::collectValue);
+         * }
+ * + * @param pattern the path pattern for the prefix where sub-processing starts. + * @param newStateFn a supplier of subtype state instances for each sub-processing step. + * @param doneFn called after each sub-processing step. + */ + public SubProcessor addAction( + String pattern, Function newStateFn, BiConsumer doneFn) { + return addAction(pattern, (t, p) -> newStateFn.apply(p), doneFn); + } + + /** + * Binds a subtype action to a {@link PathMatcher} prefix pattern, returning a new builder + * for the sub-hierarchy. + * + *

This method is intended for the case where the subtype state is derived from the + * parent state (e.g. an inner class) but does not depend on the path prefix at which the + * sub-hierarchy is rooted. + * + *

{@code
+         * processor
+         *     .addAction("//parent/path", ParentState::newValueCollector)
+         *     .addValueAction("value/suffix", ValueCollector::addValue);
+         * }
+ * + * @param pattern the path pattern for the prefix where sub-processing starts. + * @param newStateFn a supplier of subtype state instances for each sub-processing step. + */ + public SubProcessor addAction(String pattern, Function newStateFn) { + return addAction(pattern, (t, p) -> newStateFn.apply(t)); + } + + /** + * Binds a subtype action to a {@link PathMatcher} prefix pattern, returning a new builder + * for the sub-hierarchy. + * + *

This method is intended for the case where the subtype state is derived from the + * parent state (e.g. an inner class) and the path prefix at which the sub-hierarchy is + * rooted. + * + *

{@code
+         * processor
+         *     .addAction("//parent/path[@type=*]", ParentState::newCollectorOfType)
+         *     .addValueAction("value/suffix", ValueCollector::addValue);
+         * }
+ * + * @param pattern the path pattern for the prefix where sub-processing starts. + * @param newStateFn a supplier of subtype state instances for each sub-processing step. + */ + public SubProcessor addAction( + String pattern, BiFunction newStateFn) { + return addAction(pattern, newStateFn, (t, y) -> {}); + } + + /** + * Binds a subtype action to a {@link PathMatcher} prefix pattern, returning a new builder + * for the sub-hierarchy. + * + *

This method is the most general purpose way to add a sub-hierarchy action and is + * intended for the most complex cases, where subtype state depends on parent state and + * path prefix, and post processing is required. All other implementations of {@code + * addAction} simply delegate to this one in one way or another. + * + *

{@code
+         * processor
+         *     .addAction("//parent/path[@type=*]", ParentState::newCollector, ParentState::done)
+         *     .addValueAction("value/suffix", ValueCollector::addValue);
+         * }
+ * + * @param pattern the path pattern for the prefix where sub-processing starts. + * @param newStateFn a supplier of subtype state instances for each sub-processing step. + * @param doneFn called after each sub-processing step. + */ + public SubProcessor addAction( + String pattern, + BiFunction newStateFn, + BiConsumer doneFn) { + + PrefixBuilder action = + new PrefixBuilder<>(getMatcher(pattern), newStateFn, doneFn); + prefixActions.add(action); + return action; + } + + /** + * Returns a new sub-processor for the specified sub-hierarchy rooted at the given + * {@link PathMatcher} prefix pattern. The new processor builder has the same state type as + * the parent. + * + *

This method is intended for the case where multiple sub-processors are needed below + * a certain point in the hierarchy, but they all operate on the same state instance. + * + *

{@code
+         * SubProcessor<MyCollector> subprocessor = processor.addSubprocessor("//parent/path");
+         * subprocessor.addValueAction("value/suffix", MyCollector::addValue);
+         * subprocessor.addValueAction("other/suffix", MyCollector::addOtherValue);
+         * }
+ * + * @param pattern the path pattern for the prefix where sub-processing starts. + */ + public SubProcessor addSubprocessor(String pattern) { + return addAction(pattern, (t, p) -> t); + } + + /** + * Returns a new sub-processor for the specified sub-hierarchy rooted at the given + * {@link PathMatcher} prefix pattern. The new processor builder has the same state type as + * the parent. + * + *

This method is intended for the case where some setup is required before a + * sub-hierarchy is processed, but the sub-processor state is the same. + * + *

{@code
+         * SubProcessor<MyCollector> subprocessor =
+         *     processor.addSubprocessor("//parent/path", MyCollector::startFn);
+         * subprocessor.addValueAction("value/suffix", MyCollector::addValue);
+         * }
+ * + * @param startFn a handler called when sub-processing begins + * @param pattern the path pattern for the prefix where sub-processing starts. + */ + public SubProcessor addSubprocessor(String pattern, BiConsumer startFn) { + return addAction(pattern, (t, p) -> { + startFn.accept(t, p); + return t; + }); + } + + /** + * Adds an action to handle {@link CldrValue}s found in the current sub-hierarchy + * visitation which match the given {@link PathMatcher} leaf-path pattern. + * + *

This method is expected to be called at least once for each sub-hierarchy processor + * in order to handle the actual CLDR values being processed, and the path pattern should + * match leaf-paths in the CLDR data hierarchy, rather than path prefixes. + * + *

Multiple value actions can be added to a sub-hierarchy processor, and paths are + * matched in the order the actions are added. It is also possible to mix sub-hierarchy + * actions and value actions on the same processor, but note that sub-hierarchy processors + * will take precedence, so you cannot try to match the same value in both a sub-hierarchy + * processor and a value action. + * + * For example: + *

{@code
+         * processor
+         *     .addAction("//parent/path", ...)
+         *     .addValueAction("value/suffix", ...);
+         * // This will never match any values since the sub-hierarchy processor takes precedence!
+         * processor.addValueAction("//parent/path/value/suffix", ...);
+         * }
+ * + * @param pattern the CLDR path suffix idenifying the values to be processed. + * @param doFn the action to be carried out for each value. + */ + public void addValueAction(String pattern, BiConsumer doFn) { + valueActions.add(new ValueAction<>(getMatcher(pattern), doFn)); + } + + abstract PathMatcher getMatcher(String pattern); + } + + /** + * A root builder of a CLDR data processor. + * + * @param the processor state type. + */ + public static final class Builder extends SubProcessor { + private Builder() { } + + /** Returns the immutable CLDR data processor. */ + public CldrDataProcessor build() { + return new CldrDataProcessor<>( + Lists.transform(prefixActions, PrefixBuilder::build), valueActions); + } + + @Override + PathMatcher getMatcher(String pattern) { + return PathMatcher.of(pattern); + } + } + + /** + * A sub-hierarchy data processor rooted at some specified path prefix. + * + * @param the subtype processor state. + * @param the parent processor state. + */ + private static class PrefixBuilder extends SubProcessor { + private final PathMatcher matcher; + private final BiFunction newStateFn; + private final BiConsumer doneFn; + + PrefixBuilder( + PathMatcher matcher, + BiFunction newStateFn, + BiConsumer doneFn) { + this.matcher = checkNotNull(matcher); + this.newStateFn = checkNotNull(newStateFn); + this.doneFn = checkNotNull(doneFn); + } + + PrefixAction build() { + List> actions = Lists.transform(prefixActions, PrefixBuilder::build); + return new PrefixAction<>(actions, valueActions, matcher, newStateFn, doneFn); + } + + @Override PathMatcher getMatcher(String pattern) { + return matcher.withSuffix(pattern); + } + } + + private final ImmutableList> prefixActions; + private final ImmutableList> valueActions; + + private CldrDataProcessor( + List> prefixActions, + List> valueActions) { + this.prefixActions = ImmutableList.copyOf(prefixActions); + this.valueActions = ImmutableList.copyOf(valueActions); + } + + /** + * Processes a CLDR data instance 
according to the actions registered for this processor in DTD + * order. This method is preferred over {@link #process(CldrData, Object, PathOrder)} and + * eventually the ability to even specify a path order for processing will be removed. + * + *

This is the main method used to drive the processing of some CLDR data and is typically + * used like: + * + *

{@code
+     * MyResult result = CLDR_PROCESSOR.process(data, new MyResult());
+     * }
+ *

or: + *

{@code
+     * MyResult result = CLDR_PROCESSOR.process(data, MyResult.newBuilder()).build();
+     * }
+ * + * @param data the CLDR data to be processed. + * @param state an instance of the "primary" state. + * @return the given primary state (after modification). + */ + public T process(CldrData data, T state) { + return process(data, state, PathOrder.DTD); + } + + /** + * Processes a CLDR data instance according to the actions registered for this processor. + * Callers should prefer using {@link #process(CldrData, Object)} whenever possible and avoid + * relying on path ordering for processing. + * + * @param data the CLDR data to be processed. + * @param state an instance of the "primary" state. + * @param pathOrder the order in which CLDR paths should be visited. + * @return the given primary state (after modification). + */ + public T process(CldrData data, T state, PathOrder pathOrder) { + data.accept(pathOrder, new DispatchingVisitor<>(this, state, s -> {})); + return state; + } + + private void dispatchPrefixActions(T state, CldrPath prefix, Context context) { + for (PrefixAction a : prefixActions) { + if (a.matches(state, prefix, context)) { + break; + } + } + } + + private void dispatchValueActions(T state, CldrValue value) { + for (ValueAction a : valueActions) { + if (a.matches(state, value)) { + break; + } + } + } + + /* + * Implementation notes: + * + * "PrefixAction" is a critical part of the design of the path visitor. It acts as a bridge + * between the parent visitation (with state type 'T') and child visitation (state type 'S'). + * + * It is the only class to need to know about both types. Both types are known when the + * CldrDataProcessor is made, but during visitation the caller of the "matches" method doesn't + * need to know about the child type, which is why the parent can just have a list of + * "PrefixAction" and don't need any magical recasting. + * + * It might only be a few lines of code, but it can only exist in a class which knows about + * both parent and child types (obtaining a new child state is a function of the parent state). 
+ */ + static final class PrefixAction extends CldrDataProcessor { + private final PathMatcher matcher; + private final BiFunction newStateFn; + private final BiConsumer doneFn; + + PrefixAction( + List> prefixActions, + List> valueActions, + PathMatcher matcher, + BiFunction newStateFn, + BiConsumer doneFn) { + super(prefixActions, valueActions); + this.matcher = checkNotNull(matcher); + this.newStateFn = checkNotNull(newStateFn); + this.doneFn = checkNotNull(doneFn); + } + + public boolean matches(T state, CldrPath prefix, Context context) { + if (matcher.locallyMatches(prefix)) { + Consumer doneFn = childState -> this.doneFn.accept(state, childState); + context.install( + new DispatchingVisitor<>(this, newStateFn.apply(state, prefix), doneFn), + DispatchingVisitor::done); + return true; + } + return false; + } + } + + private static final class ValueAction { + private final PathMatcher matcher; + private BiConsumer doFn; + + ValueAction(PathMatcher matcher, BiConsumer doFn) { + this.matcher = checkNotNull(matcher); + this.doFn = checkNotNull(doFn); + } + + boolean matches(T state, CldrValue value) { + if (matcher.locallyMatches(value.getPath())) { + doFn.accept(state, value); + return true; + } + return false; + } + } + + private static final class DispatchingVisitor implements PrefixVisitor { + CldrDataProcessor processor; + private final T state; + private final Consumer doneFn; + + DispatchingVisitor(CldrDataProcessor processor, T state, Consumer doneFn) { + this.processor = checkNotNull(processor); + this.state = checkNotNull(state); + this.doneFn = checkNotNull(doneFn); + } + + @Override + public void visitPrefixStart(CldrPath prefix, Context context) { + processor.dispatchPrefixActions(state, prefix, context); + } + + @Override + public void visitValue(CldrValue value) { + processor.dispatchValueActions(state, value); + } + + // Important: This is NOT visitPrefixEnd() since that happens multiple times and isn't + // going to be called for the prefix at 
which this visitor was started. + void done() { + doneFn.accept(state); + } + } +} diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java index 794f41008c9..7a5fe8d522e 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java @@ -37,12 +37,15 @@ import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.TreeSet; +import java.util.function.Predicate; import java.util.stream.Collectors; import java.util.stream.Stream; import org.unicode.cldr.api.CldrData; import org.unicode.cldr.api.CldrDataSupplier; import org.unicode.cldr.api.CldrDataType; +import org.unicode.cldr.api.CldrPath; +import org.unicode.cldr.api.PathMatcher; import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir; import org.unicode.icu.tool.cldrtoicu.mapper.Bcp47Mapper; import org.unicode.icu.tool.cldrtoicu.mapper.BreakIteratorMapper; @@ -83,15 +86,15 @@ import com.google.common.io.CharStreams; */ public final class LdmlConverter { // TODO: Do all supplemental data in one go and split similarly to locale data (using RbPath). 
- private static final PathMatcher GENDER_LIST_PATHS = + private static final Predicate GENDER_LIST_PATHS = supplementalMatcher("gender"); - private static final PathMatcher LIKELY_SUBTAGS_PATHS = + private static final Predicate LIKELY_SUBTAGS_PATHS = supplementalMatcher("likelySubtags"); - private static final PathMatcher METAZONE_PATHS = + private static final Predicate METAZONE_PATHS = supplementalMatcher("metaZones", "primaryZones"); - private static final PathMatcher METADATA_PATHS = + private static final Predicate METADATA_PATHS = supplementalMatcher("metadata"); - private static final PathMatcher SUPPLEMENTAL_DATA_PATHS = + private static final Predicate SUPPLEMENTAL_DATA_PATHS = supplementalMatcher( "calendarData", "calendarPreferenceData", @@ -109,22 +112,23 @@ public final class LdmlConverter { "unitPreferenceData", "weekData", "weekOfPreference"); - private static final PathMatcher CURRENCY_DATA_PATHS = + private static final Predicate CURRENCY_DATA_PATHS = supplementalMatcher("currencyData"); - private static final PathMatcher NUMBERING_SYSTEMS_PATHS = + private static final Predicate NUMBERING_SYSTEMS_PATHS = supplementalMatcher("numberingSystems"); - private static final PathMatcher WINDOWS_ZONES_PATHS = + private static final Predicate WINDOWS_ZONES_PATHS = supplementalMatcher("windowsZones"); - private static PathMatcher supplementalMatcher(String... spec) { + private static Predicate supplementalMatcher(String... 
spec) { checkArgument(spec.length > 0, "must supply at least one matcher spec"); if (spec.length == 1) { - return PathMatcher.of("supplementalData/" + spec[0]); + return PathMatcher.of("//supplementalData/" + spec[0])::matchesPrefixOf; } - return PathMatcher.anyOf( + return Arrays.stream(spec) - .map(s -> PathMatcher.of("supplementalData/" + s)) - .toArray(PathMatcher[]::new)); + .map(s -> PathMatcher.of("//supplementalData/" + s)) + .map(m -> ((Predicate) m::matchesPrefixOf)) + .reduce(p -> false, Predicate::or); } private static RbPath RB_PARENT = RbPath.of("%%Parent"); @@ -514,7 +518,7 @@ public final class LdmlConverter { private static final RbPath RB_CLDR_VERSION = RbPath.of("cldrVersion"); private void processSupplemental( - String label, PathMatcher paths, String dir, boolean addCldrVersion) { + String label, Predicate paths, String dir, boolean addCldrVersion) { IcuData icuData = SupplementalMapper.process(src, supplementalTransformer, label, paths); // A hack for "supplementalData.txt" since the "cldrVersion" value doesn't come from the diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PathMatcher.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PathMatcher.java deleted file mode 100644 index 77f0d765b05..00000000000 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PathMatcher.java +++ /dev/null @@ -1,260 +0,0 @@ -// © 2019 and later: Unicode, Inc. and others. 
-// License & terms of use: http://www.unicode.org/copyright.html -package org.unicode.icu.tool.cldrtoicu; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; -import static com.google.common.base.Preconditions.checkPositionIndex; -import static com.google.common.base.Preconditions.checkState; -import static com.google.common.collect.ImmutableMap.toImmutableMap; -import static org.unicode.cldr.api.AttributeKey.keyOf; - -import java.util.ArrayList; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.function.Predicate; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.unicode.cldr.api.AttributeKey; -import org.unicode.cldr.api.CldrPath; - -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; - -/** - * An immutable matcher for {@link CldrPath} instances. A path matcher specification looks like - * {@code "foo/*[@x="z"]/bar[@y=*]"}, where element names and attribute values can be wildcards. - * - *

Note that the path fragment represented by the specification does not include either leading - * or trailing {@code '/'}. This is because matching can occur at any point in a {@link CldrPath}. - * The choice of where to match in the path is governed by the match method used (e.g. - * {@link PathMatcher#matchesSuffixOf(CldrPath)}. - */ -public abstract class PathMatcher { - /** Parses the path specification into a matcher. */ - public static PathMatcher of(String pathSpec) { - // Supported so far: "a", "a/b", "a/b[@x=*]" - return new BasicMatcher(parse(pathSpec)); - } - - /** - * Combines the given matchers into a single composite matcher which tests all the given - * matchers in order. - */ - public static PathMatcher anyOf(PathMatcher... matchers) { - checkArgument(matchers.length > 0, "must supply at least one matcher"); - if (matchers.length == 1) { - return checkNotNull(matchers[0]); - } - return new CompositeMatcher(ImmutableList.copyOf(matchers)); - } - - /** Attempts a full match against a given path. */ - public abstract boolean matches(CldrPath path); - - /** Attempts a suffix match against a given path. */ - public abstract boolean matchesSuffixOf(CldrPath path); - - /** Attempts a prefix match against a given path. */ - public abstract boolean matchesPrefixOf(CldrPath path); - - // A matcher that simply combines a sequences of other matchers in order. 
- private static final class CompositeMatcher extends PathMatcher { - private final ImmutableList matchers; - - private CompositeMatcher(ImmutableList matchers) { - checkArgument(matchers.size() > 1); - this.matchers = checkNotNull(matchers); - } - - @Override - public boolean matches(CldrPath path) { - for (PathMatcher m : matchers) { - if (m.matches(path)) { - return true; - } - } - return false; - } - - @Override - public boolean matchesSuffixOf(CldrPath path) { - for (PathMatcher m : matchers) { - if (m.matchesSuffixOf(path)) { - return true; - } - } - return false; - } - - @Override - public boolean matchesPrefixOf(CldrPath path) { - for (PathMatcher m : matchers) { - if (m.matchesPrefixOf(path)) { - return true; - } - } - return false; - } - } - - private static final class BasicMatcher extends PathMatcher { - private final ImmutableList> elementMatchers; - - private BasicMatcher(List> elementMatchers) { - this.elementMatchers = ImmutableList.copyOf(elementMatchers); - } - - @Override - public boolean matches(CldrPath path) { - return elementMatchers.size() == path.getLength() && matchRegion(path, 0); - } - - @Override - public boolean matchesSuffixOf(CldrPath path) { - int start = path.getLength() - elementMatchers.size(); - return start >= 0 && matchRegion(path, start); - } - - @Override - public boolean matchesPrefixOf(CldrPath path) { - return path.getLength() >= elementMatchers.size() && matchRegion(path, 0); - } - - private boolean matchRegion(CldrPath path, int offset) { - // offset is the path element corresponding the the "top most" element matcher, it - // must be in the range 0 ... (path.length() - elementMatchers.size()). - checkPositionIndex(offset, path.getLength() - elementMatchers.size()); - // First jump over the path parents until we find the last matcher. 
- int matchPathLength = offset + elementMatchers.size(); - while (path.getLength() > matchPathLength) { - path = path.getParent(); - } - return matchForward(path, elementMatchers.size() - 1); - } - - private boolean matchForward(CldrPath path, int matcherIndex) { - if (matcherIndex < 0) { - return true; - } - return matchForward(path.getParent(), matcherIndex - 1) - && elementMatchers.get(matcherIndex).test(path); - } - } - - // Make a new, non-interned, unique instance here which we can test by reference to - // determine if the argument is to be captured (needed as ImmutableMap prohibits null). - // DO NOT change this code to assign "*" as the value directly, it MUST be a new instance. - @SuppressWarnings("StringOperationCanBeSimplified") - private static final String WILDCARD = new String("*"); - - private static final Pattern ELEMENT_START_REGEX = - Pattern.compile("(\\*|[-:\\w]+)(?:/|\\[|$)"); - private static final Pattern ATTRIBUTE_REGEX = - Pattern.compile("\\[@([-:\\w]+)=(?:\\*|\"([^\"]*)\")]"); - - // element := foo, foo[@bar="baz"], foo[@bar=*] - // pathspec := element{/element}* - private static List> parse(String pathSpec) { - List> specs = new ArrayList<>(); - int pos = 0; - do { - pos = parse(pathSpec, pos, specs); - } while (pos >= 0); - return specs; - } - - // Return next start index or -1. - private static int parse(String pathSpec, int pos, List> specs) { - Matcher m = ELEMENT_START_REGEX.matcher(pathSpec).region(pos, pathSpec.length()); - checkArgument(m.lookingAt(), "invalid path specification (index=%s): %s", pos, pathSpec); - String name = m.group(1); - Map attributes = ImmutableMap.of(); - pos = m.end(1); - if (pos < pathSpec.length() && pathSpec.charAt(pos) == '[') { - // We have attributes to add. 
- attributes = new LinkedHashMap<>(); - do { - m = ATTRIBUTE_REGEX.matcher(pathSpec).region(pos, pathSpec.length()); - checkArgument(m.lookingAt(), - "invalid path specification (index=%s): %s", pos, pathSpec); - // Null if we matched the '*' wildcard. - String value = m.group(2); - attributes.put(m.group(1), value != null ? value : WILDCARD); - pos = m.end(); - } while (pos < pathSpec.length() && pathSpec.charAt(pos) == '['); - } - // Wildcard matching is less efficient because attribute keys cannot be made in advance, so - // since it's also very rare, we special case it. - Predicate matcher = name.equals(WILDCARD) - ? new WildcardElementMatcher(attributes)::match - : new ElementMatcher(name, attributes)::match; - specs.add(matcher); - if (pos == pathSpec.length()) { - return -1; - } - checkState(pathSpec.charAt(pos) == '/', - "invalid path specification (index=%s): %s", pos, pathSpec); - return pos + 1; - } - - // Matcher for path elements like "foo[@bar=*]" where the name is known in advance. - private static final class ElementMatcher { - private final String name; - private final ImmutableMap attributes; - - private ElementMatcher(String name, Map attributes) { - this.name = checkNotNull(name); - this.attributes = attributes.entrySet().stream() - .collect(toImmutableMap(e -> keyOf(name, e.getKey()), Entry::getValue)); - } - - boolean match(CldrPath path) { - if (!path.getName().equals(name)) { - return false; - } - for (Entry e : attributes.entrySet()) { - String actual = path.get(e.getKey()); - if (actual == null) { - return false; - } - String expected = e.getValue(); - // DO NOT change this to use expected.equals(WILDCARD). - if (expected != WILDCARD && !expected.equals(actual)) { - return false; - } - } - return true; - } - } - - // Matcher for path elements like "*[@bar=*]", where the name isn't known until match time. 
- private static final class WildcardElementMatcher { - private final ImmutableMap attributes; - - private WildcardElementMatcher(Map attributes) { - this.attributes = ImmutableMap.copyOf(attributes); - } - - private boolean match(CldrPath path) { - // The wildcard matcher never fails due to the element name but must create new key - // instances every time matching occurs (because the key name is dynamic). Since this - // is rare, it's worth making into a separate case. - for (Entry attribute : attributes.entrySet()) { - String actual = path.get(keyOf(path.getName(), attribute.getKey())); - if (actual == null) { - return false; - } - String expected = attribute.getValue(); - // DO NOT change this to use expected.equals(WILDCARD). - if (expected != WILDCARD && !expected.equals(actual)) { - return false; - } - } - return true; - } - } -} diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PseudoLocales.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PseudoLocales.java index 850d17d9aad..8747b1f0607 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PseudoLocales.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/PseudoLocales.java @@ -4,6 +4,7 @@ package org.unicode.icu.tool.cldrtoicu; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.ImmutableMap.toImmutableMap; import static java.lang.Character.DIRECTIONALITY_LEFT_TO_RIGHT; import static java.util.function.Function.identity; @@ -26,8 +27,11 @@ import org.unicode.cldr.api.CldrDataType; import org.unicode.cldr.api.CldrDraftStatus; import org.unicode.cldr.api.CldrPath; import org.unicode.cldr.api.CldrValue; +import org.unicode.cldr.api.FilteredData; +import org.unicode.cldr.api.PathMatcher; import 
com.google.common.base.CharMatcher; +import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Sets; @@ -147,43 +151,52 @@ public final class PseudoLocales { } private static final class PseudoLocaleData extends FilteredData { + private static final PathMatcher LDML = PathMatcher.of("//ldml"); + private static final PathMatcher AUX_EXEMPLARS = - PathMatcher.of("ldml/characters/exemplarCharacters[@type=\"auxiliary\"]"); + ldml("characters/exemplarCharacters[@type=\"auxiliary\"]"); private static final PathMatcher NUMBERING_SYSTEM = - PathMatcher.of("ldml/numbers/defaultNumberingSystem"); + ldml("numbers/defaultNumberingSystem"); // These paths were mostly derived from looking at the previous implementation's behaviour // and can be modified as needed. Notably there are no "units" here (but they were also // excluded in the original code). - private static final PathMatcher PSEUDO_PATHS = PathMatcher.anyOf( - ldml("localeDisplayNames"), - ldml("delimiters"), - ldml("dates/calendars/calendar"), - ldml("dates/fields"), - ldml("dates/timeZoneNames"), - ldml("listPatterns"), - ldml("posix/messages"), - ldml("characterLabels"), - ldml("typographicNames")); - - // Paths which contain non-localizable data. It is important that these paths catch all the - // non-localizable sub-paths of the list above. This list must be accurate. 
- private static final PathMatcher EXCLUDE_PATHS = PathMatcher.anyOf( - ldml("localeDisplayNames/localeDisplayPattern"), - ldml("dates/timeZoneNames/fallbackFormat")); + private static final Predicate IS_PSEUDO_PATH = + matchAnyLdmlPrefix( + "localeDisplayNames", + "delimiters", + "dates/calendars/calendar", + "dates/fields", + "dates/timeZoneNames", + "listPatterns", + "posix/messages", + "characterLabels", + "typographicNames") + .and(matchAnyLdmlPrefix( + "localeDisplayNames/localeDisplayPattern", + "dates/timeZoneNames/fallbackFormat") + .negate()); // The expectation is that all non-alias paths with values under these roots are "date/time // pattern like" (such as "E h:mm:ss B") in which care must be taken to not pseudo localize // the patterns in such as way as to break them. This list must be accurate. - private static final PathMatcher PATTERN_PATHS = PathMatcher.anyOf( - ldml("dates/calendars/calendar/timeFormats"), - ldml("dates/calendars/calendar/dateFormats"), - ldml("dates/calendars/calendar/dateTimeFormats"), - ldml("dates/timeZoneNames/hourFormat")); + private static final Predicate IS_PATTERN_PATH = matchAnyLdmlPrefix( + "dates/calendars/calendar/timeFormats", + "dates/calendars/calendar/dateFormats", + "dates/calendars/calendar/dateTimeFormats", + "dates/timeZoneNames/hourFormat"); - private static PathMatcher ldml(String matcherSuffix) { - return PathMatcher.of("ldml/" + matcherSuffix); + private static PathMatcher ldml(String paths) { + return LDML.withSuffix(paths); + } + + private static Predicate matchAnyLdmlPrefix(String... paths) { + ImmutableList> collect = + Arrays.stream(paths) + .map(s -> (Predicate) ldml(s)::matchesPrefixOf) + .collect(toImmutableList()); + return p -> collect.stream().anyMatch(e -> e.test(p)); } // Look for any attribute in the path with "narrow" in its value. Since "narrow" values @@ -223,7 +236,7 @@ public final class PseudoLocales { CldrValue defaultReturnValue = isResolved ? 
value : null; // This makes it look like we have explicit values only for the included paths. - if (!PSEUDO_PATHS.matchesPrefixOf(path) || EXCLUDE_PATHS.matchesPrefixOf(path)) { + if (!IS_PSEUDO_PATH.test(path)) { return defaultReturnValue; } String fullPath = value.getFullPath(); @@ -232,7 +245,7 @@ public final class PseudoLocales { if (IS_NARROW.test(fullPath)) { return defaultReturnValue; } - String text = createMessage(value.getValue(), PATTERN_PATHS.matchesPrefixOf(path)); + String text = createMessage(value.getValue(), IS_PATTERN_PATH.test(path)); return CldrValue.parseValue(fullPath, text); } @@ -357,7 +370,7 @@ public final class PseudoLocales { public void addFragment(String text, boolean isLocalizable) { if (isLocalizable) { boolean wrapping = false; - for (int index = 0; index < text.length();) { + for (int index = 0; index < text.length(); ) { int codePoint = text.codePointAt(index); index += Character.charCount(codePoint); byte directionality = Character.getDirectionality(codePoint); @@ -383,5 +396,6 @@ public final class PseudoLocales { }; } - private PseudoLocales() {} + private PseudoLocales() { + } } diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/SupplementalData.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/SupplementalData.java index 1f0756802ff..ddc64786ecf 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/SupplementalData.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/SupplementalData.java @@ -26,6 +26,7 @@ import java.util.stream.Stream; import org.unicode.cldr.api.AttributeKey; import org.unicode.cldr.api.CldrDataSupplier; import org.unicode.cldr.api.CldrDataType; +import org.unicode.cldr.api.PathMatcher; import com.google.common.base.Ascii; import com.google.common.base.Splitter; @@ -57,22 +58,22 @@ public final class SupplementalData { private static final Pattern SCRIPT_SUBTAG = Pattern.compile("[A-Z][a-z]{3}"); 
private static final PathMatcher ALIAS = - PathMatcher.of("supplementalData/metadata/alias/*[@type=*]"); + PathMatcher.of("//supplementalData/metadata/alias/*[@type=*]"); private static final PathMatcher PARENT_LOCALE = - PathMatcher.of("supplementalData/parentLocales/parentLocale[@parent=*]"); + PathMatcher.of("//supplementalData/parentLocales/parentLocale[@parent=*]"); private static final AttributeKey PARENT = keyOf("parentLocale", "parent"); private static final AttributeKey LOCALES = keyOf("parentLocale", "locales"); private static final PathMatcher CALENDER_PREFERENCE = - PathMatcher.of("supplementalData/calendarPreferenceData/calendarPreference[@territories=*]"); + PathMatcher.of("//supplementalData/calendarPreferenceData/calendarPreference[@territories=*]"); private static final AttributeKey CALENDER_TERRITORIES = keyOf("calendarPreference", "territories"); private static final AttributeKey CALENDER_ORDERING = keyOf("calendarPreference", "ordering"); private static final PathMatcher LIKELY_SUBTAGS = - PathMatcher.of("supplementalData/likelySubtags/likelySubtag[@from=*]"); + PathMatcher.of("//supplementalData/likelySubtags/likelySubtag[@from=*]"); private static final AttributeKey SUBTAG_FROM = keyOf("likelySubtag", "from"); private static final AttributeKey SUBTAG_TO = keyOf("likelySubtag", "to"); diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/Bcp47Mapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/Bcp47Mapper.java index 1ba305ae4cc..8be6af89b53 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/Bcp47Mapper.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/Bcp47Mapper.java @@ -3,8 +3,6 @@ package org.unicode.icu.tool.cldrtoicu.mapper; import static com.google.common.base.Ascii.toLowerCase; -import static com.google.common.base.Preconditions.checkArgument; -import static 
com.google.common.base.Preconditions.checkNotNull; import static com.google.common.base.Preconditions.checkState; import static org.unicode.cldr.api.AttributeKey.keyOf; import static org.unicode.cldr.api.CldrData.PathOrder.DTD; @@ -17,19 +15,15 @@ import java.util.Map.Entry; import java.util.Optional; import java.util.Set; -import javax.annotation.Nullable; - import org.unicode.cldr.api.AttributeKey; import org.unicode.cldr.api.CldrData; -import org.unicode.cldr.api.CldrData.PrefixVisitor; -import org.unicode.cldr.api.CldrData.ValueVisitor; import org.unicode.cldr.api.CldrDataSupplier; import org.unicode.cldr.api.CldrDataType; import org.unicode.cldr.api.CldrPath; import org.unicode.cldr.api.CldrValue; import org.unicode.icu.tool.cldrtoicu.IcuData; -import org.unicode.icu.tool.cldrtoicu.PathMatcher; import org.unicode.icu.tool.cldrtoicu.RbPath; +import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Ascii; @@ -46,12 +40,10 @@ import com.google.common.collect.Sets; */ public final class Bcp47Mapper { // Other attributes (e.g. "alias") are value attributes and don't need to be matched here. 
- private static final PathMatcher KEY = PathMatcher.of("ldmlBCP47/keyword/key[@name=*]"); private static final AttributeKey KEY_NAME = keyOf("key", "name"); private static final AttributeKey KEY_ALIAS = keyOf("key", "alias"); private static final AttributeKey KEY_VALUE_TYPE = keyOf("key", "valueType"); - private static final PathMatcher TYPE = PathMatcher.of("type[@name=*]"); private static final AttributeKey TYPE_NAME = keyOf("type", "name"); private static final AttributeKey TYPE_ALIASES = keyOf("type", "alias"); private static final AttributeKey PREFERRED_TYPE_NAME = keyOf("type", "preferred"); @@ -75,6 +67,15 @@ public final class Bcp47Mapper { private static final RbPath RB_MAP_ALIAS = RbPath.of("typeMap", "timezone:alias"); private static final RbPath RB_BCP_ALIAS = RbPath.of("bcpTypeAlias", "tz:alias"); + private static final CldrDataProcessor BCP47_PROCESSOR; + static { + CldrDataProcessor.Builder processor = CldrDataProcessor.builder(); + processor + .addAction("//ldmlBCP47/keyword/key[@name=*]", (m, p) -> m.new ValueCollector(p)) + .addValueAction("type[@name=*]", ValueCollector::collect); + BCP47_PROCESSOR = processor.build(); + } + /** * Processes data from the given supplier to generate Timezone and BCP-47 ICU data. * @@ -87,169 +88,146 @@ public final class Bcp47Mapper { @VisibleForTesting // It's easier to supply a fake data instance than a fake supplier. static ImmutableList process(CldrData cldrData) { - Bcp47Visitor visitor = new Bcp47Visitor(); - cldrData.accept(DTD, visitor); - visitor.addKeyMapValues(); - return ImmutableList.of(visitor.keyTypeData.icuData, visitor.tzData.icuData); + Bcp47Mapper mapper = BCP47_PROCESSOR.process(cldrData, new Bcp47Mapper(), DTD); + mapper.addKeyMapValues(); + return ImmutableList.of(mapper.keyTypeData, mapper.tzData); } // Outer visitor which handles "key" paths by installing sub-visitor methods to process // each child "type" element. 
Depending on the key name, values are stored in different // IcuData instances. - private static final class Bcp47Visitor implements PrefixVisitor { - private final ValueCollector tzData = - new ValueCollector(new IcuData("timezoneTypes", false)); - private final ValueCollector keyTypeData = - new ValueCollector(new IcuData("keyTypeData", false)); + private final IcuData tzData = new IcuData("timezoneTypes", false); + private final IcuData keyTypeData = new IcuData("keyTypeData", false); + // A map collecting each key and values as they are visited. + // TODO: Convert this to a Map which involves removing the '@' prefix hack. + private Map keyMap = new LinkedHashMap<>(); - // The current key name from the parent path element (set when a prefix is matched). - @Nullable private String keyName = null; - // A map collecting each key and values as they are visited. - // TODO: Convert this to a Map which involves removing the '@' prefix hack. - private Map keyMap = new LinkedHashMap<>(); + private Bcp47Mapper() { } - @Override - public void visitPrefixStart(CldrPath prefix, Context ctx) { - if (KEY.matches(prefix)) { - // Don't inline this since it also sets the field!! - keyName = Ascii.toLowerCase(KEY_NAME.valueFrom(prefix)); - - // How the data is visited is the same for both timezone and other BCP-47 data, - // it's just split into different data files, so we just install a different - // instance of the visitor class according to where the data in this sub-hierarchy - // should end up. - ctx.install(keyName.equals("tz") ? tzData : keyTypeData); + // Post processing to add additional captured attribute values and some special cases. + private void addKeyMapValues() { + IcuData keyData = keyTypeData; + // Add all the keyMap values into the IcuData file. + for (Entry kmData : keyMap.entrySet()) { + String bcpKey = kmData.getKey(); + String key = kmData.getValue(); + if (bcpKey.startsWith("@")) { + // Undoing the weird hack in addInfoAttributes(). 
This can be done better. + // We use "parse()" because these are full paths, and not single elements. + keyData.add(RbPath.parse(bcpKey.substring(1)), key); + continue; } + if (bcpKey.equals(key)) { + // An empty value indicates that the BCP47 key is same as the legacy key. + bcpKey = ""; + } + keyData.add(RB_KEYMAP.extendBy(key), bcpKey); + } + // Add aliases for timezone data. + keyData.add(RB_TYPE_ALIAS, "/ICUDATA/timezoneTypes/typeAlias/timezone"); + keyData.add(RB_MAP_ALIAS, "/ICUDATA/timezoneTypes/typeMap/timezone"); + keyData.add(RB_BCP_ALIAS, "/ICUDATA/timezoneTypes/bcpTypeAlias/tz"); + } + + private final class ValueCollector { + private final String keyName; + // Mutable data to be written into (differs depending on the key name). + private final IcuData icuData; + + ValueCollector(CldrPath prefix) { + this.keyName = Ascii.toLowerCase(KEY_NAME.valueFrom(prefix)); + this.icuData = keyName.equals("tz") ? tzData : keyTypeData; } - // Post processing to add additional captured attribute values and some special cases. - private void addKeyMapValues() { - IcuData keyData = keyTypeData.icuData; - // Add all the keyMap values into the IcuData file. - for (Entry kmData : keyMap.entrySet()) { - String bcpKey = kmData.getKey(); - String key = kmData.getValue(); - if (bcpKey.startsWith("@")) { - // Undoing the weird hack in addInfoAttributes(). This can be done better. - // We use "parse()" because these are full paths, and not single elements. - keyData.add(RbPath.parse(bcpKey.substring(1)), key); + private void collect(CldrValue value) { + String typeName = TYPE_NAME.valueFrom(value); + // Note that if a "preferred" type exists, we treat the value specially and add + // it only as an alias. We expected values with a preferred replacement to + // always be explicitly deprecated. 
+ Optional prefName = PREFERRED_TYPE_NAME.optionalValueFrom(value); + if (prefName.isPresent()) { + checkState(KEY_DEPRECATED.booleanValueFrom(value, false) + || TYPE_DEPRECATED.booleanValueFrom(value, false), + "unexpected 'preferred' attribute for non-deprecated value: %s", value); + icuData.add(RbPath.of("bcpTypeAlias", keyName, typeName), prefName.get()); + return; + } + // Note: There are some deprecated values which don't have a preferred + // replacement and these will be processed below (in particular we need to emit + // the fact that they are deprecated). + + // Not all key elements have an alias. E.g. in calendar.xml: + // + // But we still add it as a alias to itself (which is later turned into a path with + // an empty value). + String keyAlias = toLowerCase(KEY_ALIAS.valueFrom(value, keyName)); + + keyMap.put(keyName, keyAlias); + RbPath typeMapPrefix = RbPath.of("typeMap", keyAlias); + + List typeAliases = TYPE_ALIASES.listOfValuesFrom(value); + if (typeAliases.isEmpty()) { + // Generate type map entry using empty value (an empty value indicates same + // type name is used for both BCP47 and legacy type). + icuData.add(typeMapPrefix.extendBy(typeName), ""); + } else { + String mainAlias = typeAliases.get(0); + icuData.add(typeMapPrefix.extendBy(quoteAlias(mainAlias)), typeName); + // Put additional aliases as secondary aliases referencing the main alias. + RbPath typeAliasPrefix = RbPath.of("typeAlias", keyAlias); + typeAliases.stream() + .skip(1) + .map(Bcp47Mapper::quoteAlias) + .forEach(a -> icuData.add(typeAliasPrefix.extendBy(a), mainAlias)); + } + addInfoAttributes(keyName, typeName, value.getValueAttributes()); + } + + // Add any additional attributes present to the attribute map. Note that this code was + // copied from largely undocumented code, and the precise reasoning for why this is + // needed or why it's done this way is not completely clear. It is very likely that it + // can be simplified. 
+ // + // The '@' symbol added here is just a magic token that gets stripped off again in the + // addKeyMapValues() method, it appears to just be a way to distinguish keys added via + // this method vs during the collect method. A better approach might just be to have two + // maps. + // TODO: Remove the use of '@' and simplify the logic for "info" attributes (infoMap?). + private void addInfoAttributes( + String keyName, String typeName, ImmutableMap attributes) { + // Only emit deprecation for the "key" level, even if all types below that are also + // marked as deprecated. Only do this for a subset of attributes (INFO_ATTRIBUTES). + Set keys = + Sets.intersection(attributes.keySet(), INFO_ATTRIBUTES.keySet()); + for (AttributeKey a : keys) { + String value = attributes.get(a); + // Skip empty or default values in attributes. + if (value.isEmpty() || INFO_ATTRIBUTES.get(a).equals(value)) { continue; } - if (bcpKey.equals(key)) { - // An empty value indicates that the BCP47 key is same as the legacy key. - bcpKey = ""; - } - keyData.add(RB_KEYMAP.extendBy(key), bcpKey); + // The ID for the xxxInfo paths in ICU is the path fragment at which the + // attribute exists. Since we only process complete paths here, we must do a + // bit of reconstruction based on the element name of the attribute we are + // processing. This relies on explicit knowledge that the paths are "" or + // "/". This all gets less messy if we switch to RbPath. + String id = + a.getElementName().equals("key") ? keyName : keyName + "/" + typeName; + keyMap.put( + "@" + a.getElementName() + "Info/" + a.getAttributeName() + "/" + id, + value); } - // Add aliases for timezone data. 
- keyData.add(RB_TYPE_ALIAS, "/ICUDATA/timezoneTypes/typeAlias/timezone"); - keyData.add(RB_MAP_ALIAS, "/ICUDATA/timezoneTypes/typeMap/timezone"); - keyData.add(RB_BCP_ALIAS, "/ICUDATA/timezoneTypes/bcpTypeAlias/tz"); - } - - private final class ValueCollector implements ValueVisitor { - // Mutable ICU data collected into during visitation. - private final IcuData icuData; - - ValueCollector(IcuData data) { - this.icuData = checkNotNull(data); - } - - @Override - public void visit(CldrValue value) { - checkArgument(TYPE.matchesSuffixOf(value.getPath()), - "unexpected child element: %s", value.getPath()); - String typeName = TYPE_NAME.valueFrom(value); - // Note that if a "preferred" type exists, we treat the value specially and add - // it only as an alias. We expected values with a preferred replacement to - // always be explicitly deprecated. - Optional prefName = PREFERRED_TYPE_NAME.optionalValueFrom(value); - if (prefName.isPresent()) { - checkState(KEY_DEPRECATED.booleanValueFrom(value, false) - || TYPE_DEPRECATED.booleanValueFrom(value, false), - "unexpected 'preferred' attribute for non-deprecated value: %s", value); - icuData.add(RbPath.of("bcpTypeAlias", keyName, typeName), prefName.get()); - return; - } - // Note: There are some deprecated values which don't have a preferred - // replacement and these will be processed below (in particular we need to emit - // the fact that they are deprecated). - - // Not all key elements have an alias. E.g. in calendar.xml: - // - // But we still add it as a alias to itself (which is later turned into a path with - // an empty value). - String keyAlias = toLowerCase(KEY_ALIAS.valueFrom(value, keyName)); - - keyMap.put(keyName, keyAlias); - RbPath typeMapPrefix = RbPath.of("typeMap", keyAlias); - - List typeAliases = TYPE_ALIASES.listOfValuesFrom(value); - if (typeAliases.isEmpty()) { - // Generate type map entry using empty value (an empty value indicates same - // type name is used for both BCP47 and legacy type). 
- icuData.add(typeMapPrefix.extendBy(typeName), ""); - } else { - String mainAlias = typeAliases.get(0); - icuData.add(typeMapPrefix.extendBy(quoteAlias(mainAlias)), typeName); - // Put additional aliases as secondary aliases referencing the main alias. - RbPath typeAliasPrefix = RbPath.of("typeAlias", keyAlias); - typeAliases.stream() - .skip(1) - .map(Bcp47Visitor::quoteAlias) - .forEach(a -> icuData.add(typeAliasPrefix.extendBy(a), mainAlias)); - } - addInfoAttributes(keyName, typeName, value.getValueAttributes()); - } - - // Add any additional attributes present to the attribute map. Note that this code was - // copied from largely undocumented code, and the precise reasoning for why this is - // needed or why it's done this way is not completely clear. It is very likely that it - // can be simplified. - // - // The '@' symbol added here is just a magic token that gets stripped off again in the - // addKeyMapValues() method, it appears to just be a way to distinguish keys added via - // this method vs during the visit method. A better approach might just be to have two - // maps. - // TODO: Remove the use of '@' and simplify the logic for "info" attributes (infoMap?). - private void addInfoAttributes( - String keyName, String typeName, ImmutableMap attributes) { - // Only emit deprecation for the "key" level, even if all types below that are also - // marked as deprecated. Only do this for a subset of attributes (INFO_ATTRIBUTES). - Set keys = - Sets.intersection(attributes.keySet(), INFO_ATTRIBUTES.keySet()); - for (AttributeKey a : keys) { - String value = attributes.get(a); - // Skip empty or default values in attributes. - if (value.isEmpty() || INFO_ATTRIBUTES.get(a).equals(value)) { - continue; - } - // The ID for the xxxInfo paths in ICU is the path fragment at which the - // attribute exists. Since we only process complete paths here, we must do a - // bit of reconstruction based on the element name of the attribute we are - // processing. 
This relies on explicit knowledge that the paths are "" or - // "/". This all gets less messy if we switch to RbPath. - String id = - a.getElementName().equals("key") ? keyName : keyName + "/" + typeName; - keyMap.put( - "@" + a.getElementName() + "Info/" + a.getAttributeName() + "/" + id, - value); - } - } - } - - /** - * Escapes alias values containing '/' so they can appear in resource bundle paths. This - * function replaces '/' with ':' and quotes the result (e.g. foo/bar -> "foo:bar"). - * - *

This is needed for timezone "metazone" ID strings which are of the form 'Foo/Bar' - * in the CLDR data. - */ - // TODO: Switch to RbPath and do quoting automatically when ICU data is written out. - private static String quoteAlias(String str) { - return str.indexOf('/') == -1 ? str : '"' + str.replace('/', ':') + '"'; } } - private Bcp47Mapper() {} + /** + * Escapes alias values containing '/' so they can appear in resource bundle paths. This + * function replaces '/' with ':' and quotes the result (e.g. foo/bar -> "foo:bar"). + * + *

This is needed for timezone "metazone" ID strings which are of the form 'Foo/Bar' + * in the CLDR data. + */ + // TODO: Switch to RbPath and do quoting automatically when ICU data is written out. + private static String quoteAlias(String str) { + return str.indexOf('/') == -1 ? str : '"' + str.replace('/', ':') + '"'; + } } diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/BreakIteratorMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/BreakIteratorMapper.java index 8451d73d92b..abff6f82f79 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/BreakIteratorMapper.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/BreakIteratorMapper.java @@ -4,18 +4,17 @@ package org.unicode.icu.tool.cldrtoicu.mapper; import static com.google.common.base.Preconditions.checkNotNull; import static org.unicode.cldr.api.AttributeKey.keyOf; -import static org.unicode.cldr.api.CldrData.PathOrder.DTD; import java.util.Optional; import org.unicode.cldr.api.AttributeKey; import org.unicode.cldr.api.CldrData; import org.unicode.cldr.api.CldrDataType; -import org.unicode.cldr.api.CldrPath; import org.unicode.cldr.api.CldrValue; import org.unicode.icu.tool.cldrtoicu.IcuData; -import org.unicode.icu.tool.cldrtoicu.PathMatcher; import org.unicode.icu.tool.cldrtoicu.RbPath; +import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor; +import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor.SubProcessor; import com.google.common.escape.UnicodeEscaper; @@ -29,20 +28,25 @@ import com.google.common.escape.UnicodeEscaper; */ // TODO: This class can almost certainly be replace with a small RegexTransformer config. public final class BreakIteratorMapper { - // The "type" attribute in /suppressions/ is not required so cannot be in the matcher. And - // its default (and only) value is "standard". - // TODO: Understand and document why this is the case. 
- private static final PathMatcher SUPPRESSION = PathMatcher.of( - "ldml/segmentations/segmentation[@type=*]/suppressions/suppression"); + + private static final CldrDataProcessor CLDR_PROCESSOR; + static { + CldrDataProcessor.Builder processor = CldrDataProcessor.builder(); + // The "type" attribute in /suppressions/ is not required so cannot be in the matcher. And + // its default (and only) value is "standard". + // TODO: Understand and document why this is the case. + processor.addValueAction( + "//ldml/segmentations/segmentation[@type=*]/suppressions/suppression", + BreakIteratorMapper::addSuppression); + SubProcessor specials = + processor.addSubprocessor("//ldml/special/icu:breakIteratorData"); + specials.addValueAction("icu:boundaries/*", BreakIteratorMapper::addBoundary); + specials.addValueAction( + "icu:dictionaries/icu:dictionary", BreakIteratorMapper::addDictionary); + CLDR_PROCESSOR = processor.build(); + } + private static final AttributeKey SEGMENTATION_TYPE = keyOf("segmentation", "type"); - - // Note: This could be done with an intermediate matcher for - // "ldml/special/icu:breakIteratorData" but there are so few "special" values it's not worth it - private static final PathMatcher BOUNDARIES = - PathMatcher.of("ldml/special/icu:breakIteratorData/icu:boundaries/*"); - private static final PathMatcher DICTIONARY = - PathMatcher.of("ldml/special/icu:breakIteratorData/icu:dictionaries/icu:dictionary"); - private static final AttributeKey DICTIONARY_DEP = keyOf("icu:dictionary", "icu:dependency"); private static final AttributeKey DICTIONARY_TYPE = keyOf("icu:dictionary", "type"); @@ -59,8 +63,8 @@ public final class BreakIteratorMapper { IcuData icuData, CldrData cldrData, Optional icuSpecialData) { BreakIteratorMapper mapper = new BreakIteratorMapper(icuData); - icuSpecialData.ifPresent(s -> s.accept(DTD, mapper::addSpecials)); - cldrData.accept(DTD, mapper::addSuppression); + icuSpecialData.ifPresent(d -> CLDR_PROCESSOR.process(d, mapper)); + 
CLDR_PROCESSOR.process(cldrData, mapper); return mapper.icuData; } @@ -72,28 +76,21 @@ public final class BreakIteratorMapper { } private void addSuppression(CldrValue v) { - if (SUPPRESSION.matches(v.getPath())) { - String type = SEGMENTATION_TYPE.valueFrom(v); - // TODO: Understand and document why we escape values here, but not for collation data. - icuData.add( - RbPath.of("exceptions", type + ":array"), - ESCAPE_NON_ASCII.escape(v.getValue())); - } + String type = SEGMENTATION_TYPE.valueFrom(v); + // TODO: Understand and document why we escape values here, but not for collation data. + icuData.add( + RbPath.of("exceptions", type + ":array"), ESCAPE_NON_ASCII.escape(v.getValue())); } - private void addSpecials(CldrValue v) { - CldrPath p = v.getPath(); - if (BOUNDARIES.matches(p)) { - addDependency( - getDependencyName(v), - getBoundaryType(v), - getBoundaryDependency(v)); - } else if (DICTIONARY.matches(p)) { - addDependency( - getDependencyName(v), - DICTIONARY_TYPE.valueFrom(v), - DICTIONARY_DEP.optionalValueFrom(v)); - } + private void addBoundary(CldrValue v) { + addDependency(getDependencyName(v), getBoundaryType(v), getBoundaryDependency(v)); + } + + private void addDictionary(CldrValue v) { + addDependency( + getDependencyName(v), + DICTIONARY_TYPE.valueFrom(v), + DICTIONARY_DEP.optionalValueFrom(v)); } private void addDependency(String name, String type, Optional dependency) { diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/CollationMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/CollationMapper.java index 207de901b43..16287605e27 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/CollationMapper.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/CollationMapper.java @@ -11,14 +11,13 @@ import java.util.Optional; import org.unicode.cldr.api.AttributeKey; import org.unicode.cldr.api.CldrData; -import 
org.unicode.cldr.api.CldrData.PrefixVisitor; import org.unicode.cldr.api.CldrDataType; -import org.unicode.cldr.api.CldrPath; import org.unicode.cldr.api.CldrValue; import org.unicode.icu.tool.cldrtoicu.IcuData; -import org.unicode.icu.tool.cldrtoicu.PathMatcher; import org.unicode.icu.tool.cldrtoicu.RbPath; import org.unicode.icu.tool.cldrtoicu.RbValue; +import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor; +import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor.SubProcessor; import com.google.common.base.CharMatcher; import com.google.common.base.Splitter; @@ -32,18 +31,22 @@ import com.google.common.base.Splitter; * } */ public final class CollationMapper { - private static final PathMatcher COLLATIONS = PathMatcher.of("ldml/collations"); - // Note that the 'type' attribute is optional, so cannot be in the path matcher. - // However since the CLDR data never actually omits the value, it would be easy to change the - // attribute metadata to stop it being an implicit attribute and then it could appear. - private static final PathMatcher COLLATION_RULE = PathMatcher.of("collation/cr"); + private static final CldrDataProcessor CLDR_PROCESSOR; + static { + CldrDataProcessor.Builder processor = CldrDataProcessor.builder(); + SubProcessor collations = processor.addSubprocessor("//ldml/collations"); + collations.addValueAction("collation/cr", CollationMapper::collectRule); + collations.addValueAction("defaultCollation", CollationMapper::collectDefault); + // This could be a separate processor, since the specials data only contains these paths, + // but it's not clear if in future it could also contain any collation rules. 
+ processor.addValueAction("//ldml/special/*", CollationMapper::maybeAddSpecial); + CLDR_PROCESSOR = processor.build(); + } + private static final AttributeKey COLLATION_TYPE = keyOf("collation", "type"); private static final AttributeKey COLLATION_RULE_ALT = keyOf("cr", "alt"); - private static final PathMatcher DEFAULT_COLLATION = PathMatcher.of("defaultCollation"); - - private static final PathMatcher SPECIAL = PathMatcher.of("ldml/special"); private static final AttributeKey SPECIAL_RULES = keyOf("icu:UCARules", "icu:uca_rules"); private static final AttributeKey SPECIAL_DEP = keyOf("icu:depends", "icu:dependency"); @@ -68,88 +71,76 @@ public final class CollationMapper { public static IcuData process( IcuData icuData, CldrData cldrData, Optional icuSpecialData, String cldrVersion) { - CollationVisitor visitor = new CollationVisitor(icuData, cldrVersion); - icuSpecialData.ifPresent(s -> s.accept(DTD, visitor)); - cldrData.accept(DTD, visitor); - return visitor.icuData; + CollationMapper mapper = new CollationMapper(icuData, cldrVersion); + icuSpecialData.ifPresent(specialData -> CLDR_PROCESSOR.process(specialData, mapper, DTD)); + CLDR_PROCESSOR.process(cldrData, mapper, DTD); + return icuData; } - final static class CollationVisitor implements PrefixVisitor { - private final IcuData icuData; - private final String cldrVersion; + private final IcuData icuData; + private final String cldrVersion; - CollationVisitor(IcuData icuData, String cldrVersion) { - this.icuData = checkNotNull(icuData); - this.cldrVersion = checkNotNull(cldrVersion); - // Super special hack case because the XML data is a bit broken for the root collation - // data (there's an empty element that's a non-leaf element and thus not - // visited, but we should add an empty sequence to the output data. 
- // TODO: Fix CLDR (https://unicode-org.atlassian.net/projects/CLDR/issues/CLDR-13131) - if (icuData.getName().equals("root")) { - icuData.replace(RB_STANDARD_SEQUENCE, ""); - // TODO: Collation versioning probably needs to be improved. - icuData.replace(RB_STANDARD_VERSION, cldrVersion); - } + private CollationMapper(IcuData icuData, String cldrVersion) { + this.icuData = checkNotNull(icuData); + this.cldrVersion = checkNotNull(cldrVersion); + // Super special hack case because the XML data is a bit broken for the root collation + // data (there's an empty element that's a non-leaf element and thus not + // visited, but we should add an empty sequence to the output data. + // TODO: Fix CLDR (https://unicode-org.atlassian.net/projects/CLDR/issues/CLDR-13131) + if (icuData.getName().equals("root")) { + icuData.replace(RB_STANDARD_SEQUENCE, ""); + // TODO: Collation versioning probably needs to be improved. + icuData.replace(RB_STANDARD_VERSION, cldrVersion); } + } - @Override - public void visitPrefixStart(CldrPath prefix, Context ctx) { - if (COLLATIONS.matchesPrefixOf(prefix)) { - ctx.install(this::collectRules); - } else if (SPECIAL.matchesPrefixOf(prefix)) { - ctx.install(this::maybeAddSpecial); - } + private void collectRule(CldrValue v) { + String type = COLLATION_TYPE.valueFrom(v); + RbPath rbPath = RbPath.of("collations", type, "Sequence"); + + // WARNING: This is almost certainly a bug, since while @type can have the value + // "short" it can also have other values. This code was copied from CollationMapper + // which has the line; + // isShort = attr.getValue("alt") != null; + // TODO: Raise a ticket to examine this. + boolean isShort = COLLATION_RULE_ALT.optionalValueFrom(v).isPresent(); + + // Note that it's not clear why there's a check for "contains()" here. The code + // from which this was derived is largely undocumented and this check could have + // been overly defensive (perhaps a duplicate key should be an error?). 
+ if (isShort || !icuData.getPaths().contains(rbPath)) { + RbValue rules = RbValue.of( + LINE_SPLITTER.splitToList(v.getValue()).stream() + .map(CollationMapper::removeComment) + .filter(s -> !s.isEmpty())::iterator); + icuData.replace(rbPath, rules); + icuData.replace(RbPath.of("collations", type, "Version"), cldrVersion); } + } - private void collectRules(CldrValue v) { - CldrPath p = v.getPath(); - if (COLLATION_RULE.matchesSuffixOf(p)) { - String type = COLLATION_TYPE.valueFrom(v); - RbPath rbPath = RbPath.of("collations", type, "Sequence"); + private void collectDefault(CldrValue v) { + icuData.add(RB_COLLATIONS_DEFAULT, v.getValue()); + } - // WARNING: This is almost certainly a bug, since while @type can have the value - // "short" it can also have other values. This code was copied from CollationMapper - // which has the line; - // isShort = attr.getValue("alt") != null; - // TODO: Raise a ticket to examine this. - boolean isShort = COLLATION_RULE_ALT.optionalValueFrom(v).isPresent(); - - // Note that it's not clear why there's a check for "contains()" here. The code - // from which this was derived is largely undocumented and this check could have - // been overly defensive (perhaps a duplicate key should be an error?). - if (isShort || !icuData.getPaths().contains(rbPath)) { - RbValue rules = RbValue.of( - LINE_SPLITTER.splitToList(v.getValue()).stream() - .map(CollationMapper::removeComment) - .filter(s -> !s.isEmpty())::iterator); - icuData.replace(rbPath, rules); - icuData.replace(RbPath.of("collations", type, "Version"), cldrVersion); - } - } else if (DEFAULT_COLLATION.matchesSuffixOf(p)) { - icuData.add(RB_COLLATIONS_DEFAULT, v.getValue()); - } - } - - // This is a bit special since the attribute we want to add depends on the element we are - // visiting (which is somewhat unusual in the transformation classes). 
- private void maybeAddSpecial(CldrValue value) { - AttributeKey key; - switch (value.getPath().getName()) { - case "icu:UCARules": - key = SPECIAL_RULES; - break; - case "icu:depends": - key = SPECIAL_DEP; - break; - default: - return; - } - // substring(4) just removes the "icu:" prefix (which we know is present in the key). - RbPath rbPath = RbPath.of( - String.format("%s:process(%s)", - key.getElementName().substring(4), key.getAttributeName().substring(4))); - icuData.add(rbPath, key.valueFrom(value)); + // This is a bit special since the attribute we want to add depends on the element we are + // visiting (which is somewhat unusual in the transformation classes). + private void maybeAddSpecial(CldrValue value) { + AttributeKey key; + switch (value.getPath().getName()) { + case "icu:UCARules": + key = SPECIAL_RULES; + break; + case "icu:depends": + key = SPECIAL_DEP; + break; + default: + return; } + // substring(4) just removes the "icu:" prefix (which we know is present in the key). 
+ RbPath rbPath = RbPath.of( + String.format("%s:process(%s)", + key.getElementName().substring(4), key.getAttributeName().substring(4))); + icuData.add(rbPath, key.valueFrom(value)); } // Collation data can contain # to mark an end-of-line comment, but it can also contain data @@ -195,6 +186,4 @@ public final class CollationMapper { checkArgument(!quoted, "mismatched quotes in: %s", s); return -1; } - - private CollationMapper() {} } diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/DayPeriodsMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/DayPeriodsMapper.java index ef0e2966c32..18cb14cc9d1 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/DayPeriodsMapper.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/DayPeriodsMapper.java @@ -6,18 +6,15 @@ import static org.unicode.cldr.api.AttributeKey.keyOf; import static org.unicode.cldr.api.CldrData.PathOrder.NESTED_GROUPING; import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL; -import java.util.Optional; - import org.unicode.cldr.api.AttributeKey; import org.unicode.cldr.api.CldrData; -import org.unicode.cldr.api.CldrData.PrefixVisitor; import org.unicode.cldr.api.CldrDataSupplier; import org.unicode.cldr.api.CldrDataType; import org.unicode.cldr.api.CldrPath; import org.unicode.cldr.api.CldrValue; import org.unicode.icu.tool.cldrtoicu.IcuData; -import org.unicode.icu.tool.cldrtoicu.PathMatcher; import org.unicode.icu.tool.cldrtoicu.RbPath; +import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor; import com.google.common.annotations.VisibleForTesting; @@ -29,14 +26,18 @@ import com.google.common.annotations.VisibleForTesting; * } */ public final class DayPeriodsMapper { - private static final PathMatcher RULESET = - PathMatcher.of("supplementalData/dayPeriodRuleSet"); + + private static final CldrDataProcessor CLDR_PROCESSOR; + static { + 
CldrDataProcessor.Builder processor = CldrDataProcessor.builder(); + processor.addAction("//supplementalData/dayPeriodRuleSet", (m, p) -> m.new Ruleset(p)) + .addSubprocessor("dayPeriodRules[@locales=*]", Ruleset::prefixStart) + .addValueAction("dayPeriodRule[@type=*]", Ruleset::visitRule); + CLDR_PROCESSOR = processor.build(); + } + private static final AttributeKey RULESET_TYPE = keyOf("dayPeriodRuleSet", "type"); - - private static final PathMatcher RULES = PathMatcher.of("dayPeriodRules[@locales=*]"); private static final AttributeKey RULES_LOCALES = keyOf("dayPeriodRules", "locales"); - - private static final PathMatcher RULE = PathMatcher.of("dayPeriodRule[@type=*]"); private static final AttributeKey RULE_TYPE = keyOf("dayPeriodRule", "type"); private static final RbPath RB_LOCALES = RbPath.of("locales"); @@ -53,51 +54,33 @@ public final class DayPeriodsMapper { @VisibleForTesting // It's easier to supply a fake data instance than a fake supplier. static IcuData process(CldrData data) { - RuleSetVisitor mapper = new RuleSetVisitor(); - data.accept(NESTED_GROUPING, mapper); - return mapper.icuData; + return CLDR_PROCESSOR.process(data, new DayPeriodsMapper(), NESTED_GROUPING).icuData; } - private static final class RuleSetVisitor implements PrefixVisitor { - // Mutable ICU data collected into during visitation. - private final IcuData icuData = new IcuData("dayPeriods", false); - private int setNum = 0; + // Mutable ICU data collected into during visitation. 
+ private final IcuData icuData = new IcuData("dayPeriods", false); + private int setNum = 0; - @Override - public void visitPrefixStart(CldrPath prefix, Context ctx) { - if (RULESET.matches(prefix)) { - ctx.install(new RuleVisitor(RULESET_TYPE.optionalValueFrom(prefix))); - } + private final class Ruleset { + private RbPath localePrefix; + + Ruleset(CldrPath prefix) { + this.localePrefix = RULESET_TYPE.optionalValueFrom(prefix) + .map(t -> RbPath.of("locales_" + t)) + .orElse(RB_LOCALES); } - private final class RuleVisitor implements PrefixVisitor { - private final RbPath localePrefix; + private void prefixStart(CldrPath prefix) { + // Sets are arbitrarily identified by the string "setNN". + String setName = "set" + (++setNum); + RULES_LOCALES.listOfValuesFrom(prefix) + .forEach(locale -> icuData.add(localePrefix.extendBy(locale), setName)); + } - private RuleVisitor(Optional type) { - // If there's a given type, add it to the prefix path. - this.localePrefix = type.map(t -> RbPath.of("locales_" + t)).orElse(RB_LOCALES); - } - - @Override - public void visitPrefixStart(CldrPath prefix, Context ctx) { - if (RULES.matchesSuffixOf(prefix)) { - // Sets are arbitrarily identified by the string "setNN". 
- String setName = "set" + (++setNum); - RULES_LOCALES.listOfValuesFrom(prefix) - .forEach(locale -> icuData.add(localePrefix.extendBy(locale), setName)); - ctx.install(this::visitRule); - } - } - - private void visitRule(CldrValue value) { - if (RULE.matchesSuffixOf(value.getPath())) { - RbPath prefix = RbPath.of("rules", "set" + setNum, RULE_TYPE.valueFrom(value)); - value.getValueAttributes() - .forEach((k, v) -> icuData.add(prefix.extendBy(k.getAttributeName()), v)); - } - } + private void visitRule(CldrValue value) { + RbPath prefix = RbPath.of("rules", "set" + setNum, RULE_TYPE.valueFrom(value)); + value.getValueAttributes() + .forEach((k, v) -> icuData.add(prefix.extendBy(k.getAttributeName()), v)); } } - - private DayPeriodsMapper() {} } diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/PluralRangesMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/PluralRangesMapper.java index c811a8e1c39..826ffee89c0 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/PluralRangesMapper.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/PluralRangesMapper.java @@ -2,22 +2,20 @@ // License & terms of use: http://www.unicode.org/copyright.html package org.unicode.icu.tool.cldrtoicu.mapper; -import static com.google.common.base.Preconditions.checkState; import static org.unicode.cldr.api.AttributeKey.keyOf; import static org.unicode.cldr.api.CldrData.PathOrder.NESTED_GROUPING; import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL; import org.unicode.cldr.api.AttributeKey; import org.unicode.cldr.api.CldrData; -import org.unicode.cldr.api.CldrData.PrefixVisitor; import org.unicode.cldr.api.CldrDataSupplier; import org.unicode.cldr.api.CldrDataType; import org.unicode.cldr.api.CldrPath; import org.unicode.cldr.api.CldrValue; import org.unicode.icu.tool.cldrtoicu.IcuData; -import org.unicode.icu.tool.cldrtoicu.PathMatcher; 
import org.unicode.icu.tool.cldrtoicu.RbPath; import org.unicode.icu.tool.cldrtoicu.RbValue; +import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor; import com.google.common.annotations.VisibleForTesting; @@ -29,15 +27,18 @@ import com.google.common.annotations.VisibleForTesting; * } */ public final class PluralRangesMapper { - // Note that this mapper only matches when there's no "type" specified on the "plurals" element. - // This is a bit weird, since the PluralsMapper expects a type (e.g. cardinal or ordinal) to - // be present. Really this just illustrates that the plural ranges just should not be under the - // same parent element as plurals. - private static final PathMatcher RANGES = - PathMatcher.of("supplementalData/plurals/pluralRanges[@locales=*]"); - private static final AttributeKey RANGES_LOCALES = keyOf("pluralRanges", "locales"); - private static final PathMatcher RANGE = PathMatcher.of("pluralRange[@start=*][@end=*]"); + private static final CldrDataProcessor CLDR_PROCESSOR; + static { + CldrDataProcessor.Builder processor = CldrDataProcessor.builder(); + processor + .addAction( + "//supplementalData/plurals/pluralRanges[@locales=*]", (m, p) -> m.new Ranges(p)) + .addValueAction("pluralRange[@start=*][@end=*]", Ranges::visitRange); + CLDR_PROCESSOR = processor.build(); + } + + private static final AttributeKey RANGES_LOCALES = keyOf("pluralRanges", "locales"); private static final AttributeKey RANGE_START = keyOf("pluralRange", "start"); private static final AttributeKey RANGE_END = keyOf("pluralRange", "end"); private static final AttributeKey RANGE_RESULT = keyOf("pluralRange", "result"); @@ -52,46 +53,37 @@ public final class PluralRangesMapper { * @return the IcuData instance to be written to a file. 
*/ public static IcuData process(CldrDataSupplier src) { - CldrData data = src.getDataForType(SUPPLEMENTAL); - return process(data); + return process(src.getDataForType(SUPPLEMENTAL)); } @VisibleForTesting // It's easier to supply a fake data instance than a fake supplier. static IcuData process(CldrData data) { - PluralRangesVisitor visitor = new PluralRangesVisitor(); - data.accept(NESTED_GROUPING, visitor); - return visitor.icuData; + return CLDR_PROCESSOR.process(data, new PluralRangesMapper(), NESTED_GROUPING).icuData; } - private static final class PluralRangesVisitor implements PrefixVisitor { - private final IcuData icuData = new IcuData("pluralRanges", false); + private final IcuData icuData = new IcuData("pluralRanges", false); + private int setIndex = 0; - private int setIndex = 0; - private String ruleLabel = null; + private PluralRangesMapper() { } - @Override - public void visitPrefixStart(CldrPath prefix, Context ctx) { - if (RANGES.matches(prefix)) { - ruleLabel = String.format("set%02d", setIndex++); - RANGES_LOCALES.listOfValuesFrom(prefix) - .forEach(l -> icuData.add(RB_LOCALES.extendBy(l), ruleLabel)); - ctx.install(this::visitRange); - } + private final class Ranges { + private final String label; + + Ranges(CldrPath prefix) { + this.label = String.format("set%02d", setIndex++); + RANGES_LOCALES.listOfValuesFrom(prefix) + .forEach(l -> icuData.add(RB_LOCALES.extendBy(l), label)); } private void visitRange(CldrValue value) { - checkState(RANGE.matchesSuffixOf(value.getPath()), - "unexpected path: %s", value.getPath()); // Note: "range:start" and "range:end" are optional attributes, but the CLDR DTD // specifies a default via comments. They should probably be changed to just have a // default in the DTD (and possibly converted to use an enum here). 
- icuData.add(RB_RULES.extendBy(ruleLabel), + icuData.add(RB_RULES.extendBy(label), RbValue.of( RANGE_START.valueFrom(value, "all"), RANGE_END.valueFrom(value, "all"), RANGE_RESULT.valueFrom(value))); } } - - private PluralRangesMapper() {} -} +} \ No newline at end of file diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/PluralsMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/PluralsMapper.java index dd517817709..7f95f485a63 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/PluralsMapper.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/PluralsMapper.java @@ -2,30 +2,29 @@ // License & terms of use: http://www.unicode.org/copyright.html package org.unicode.icu.tool.cldrtoicu.mapper; -import static com.google.common.base.Preconditions.checkNotNull; import static com.google.common.base.Preconditions.checkState; import static org.unicode.cldr.api.AttributeKey.keyOf; import static org.unicode.cldr.api.CldrData.PathOrder.NESTED_GROUPING; import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL; import java.util.ArrayList; -import java.util.LinkedHashMap; import java.util.List; -import java.util.Map; import org.unicode.cldr.api.AttributeKey; import org.unicode.cldr.api.CldrData; -import org.unicode.cldr.api.CldrData.PrefixVisitor; import org.unicode.cldr.api.CldrDataSupplier; import org.unicode.cldr.api.CldrDataType; import org.unicode.cldr.api.CldrPath; +import org.unicode.cldr.api.CldrValue; +import org.unicode.cldr.api.FilteredData; +import org.unicode.cldr.api.PathMatcher; import org.unicode.icu.tool.cldrtoicu.IcuData; -import org.unicode.icu.tool.cldrtoicu.PathMatcher; import org.unicode.icu.tool.cldrtoicu.RbPath; +import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor; import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; import 
com.google.common.collect.ImmutableMap; -import com.google.common.collect.Iterables; /** * A mapper to collect plural data from {@link CldrDataType#SUPPLEMENTAL SUPPLEMENTAL} data via @@ -35,15 +34,21 @@ import com.google.common.collect.Iterables; * } */ public final class PluralsMapper { - private static final PathMatcher PLURALS = PathMatcher.of("supplementalData/plurals[@type=*]"); + private static final AttributeKey PLURALS_TYPE = keyOf("plurals", "type"); - - private static final PathMatcher RULES = PathMatcher.of("pluralRules[@locales=*]"); private static final AttributeKey RULES_LOCALES = keyOf("pluralRules", "locales"); - - private static final PathMatcher RULE = PathMatcher.of("pluralRule[@count=*]"); private static final AttributeKey RULE_COUNT = keyOf("pluralRule", "count"); + private static final CldrDataProcessor CLDR_PROCESSOR; + static { + CldrDataProcessor.Builder processor = CldrDataProcessor.builder(); + processor + .addAction("//supplementalData/plurals[@type=*]", (m, p) -> m.new Plurals(p)) + .addAction("pluralRules[@locales=*]", Rules::new, Plurals::addRules) + .addValueAction("pluralRule[@count=*]", Rules::addRule); + CLDR_PROCESSOR = processor.build(); + } + private static final ImmutableMap ICU_PREFIX_MAP = ImmutableMap.of("cardinal", RbPath.of("locales"), "ordinal", RbPath.of("locales_ordinals")); @@ -54,103 +59,94 @@ public final class PluralsMapper { * @return the IcuData instance to be written to a file. */ public static IcuData process(CldrDataSupplier src) { - CldrData data = src.getDataForType(SUPPLEMENTAL); - return process(data); + return process(src.getDataForType(SUPPLEMENTAL)); } @VisibleForTesting // It's easier to supply a fake data instance than a fake supplier. 
static IcuData process(CldrData data) { - PluralsVisitor visitor = new PluralsVisitor(); - // Note: We explicitly reset the type to mimic the order of the existing code, since this + PluralsMapper mapper = new PluralsMapper(); + // Note: We explicitly filter by type to mimic the order of the existing code, since this // affects the set indices we generate during processing. Ideally this would all be immune // to ordering (or just enforce DTD ordering) but right now it's very dependent on - // mimicking the order of the existing code to get identical output. - data.accept(NESTED_GROUPING, visitor.setType("cardinal")); - data.accept(NESTED_GROUPING, visitor.setType("ordinal")); - return visitor.icuData; + // mimicking the order of the existing code to get identical output. Once DTD order is + // everywhere, this can just be a single pass over the original data. + CLDR_PROCESSOR.process(filterByType(data, "cardinal"), mapper, NESTED_GROUPING); + CLDR_PROCESSOR.process(filterByType(data, "ordinal"), mapper, NESTED_GROUPING); + return mapper.icuData; } - private static final class PluralsVisitor implements PrefixVisitor { - // Mutable ICU data collected into during visitation. - // In a post XML-aware API, is recording the XML file names really a good idea? - private final IcuData icuData = new IcuData("plurals", false); - // Filter for the type we are processing now (this could be removed if we don't mind which - // order the types are processed, and switching to DTD ordering would make it stable). - private String type = null; - private final List> previousRules = new ArrayList<>(); + // Mutable ICU data collected into during visitation. + // In a post XML-aware API, is recording the XML file names really a good idea? 
+ private final IcuData icuData = new IcuData("plurals", false); + private final List> previousRules = new ArrayList<>(); - // Hack method to allow a single type to be processed at a time (the visitor would otherwise - // happily handle both types in a single pass). We can't do this as two different visitors - // (one for each type) because the current behaviour relies on carrying over the calculated - // set numbers from one pass to the next. Once migration is complete we should revisit this - // and allow this visitor to work in a single pass (probably with DTD order for stability). - PluralsVisitor setType(String type) { - this.type = checkNotNull(type); - return this; + private class Plurals { + private final RbPath icuPrefix; + + Plurals(CldrPath prefix) { + // Note: "plurals:type" is an optional attribute but the CLDR DTD specifies a + // default via comments. It should probably be changed to just have a default in + // the DTD. + this.icuPrefix = ICU_PREFIX_MAP.get(PLURALS_TYPE.valueFrom(prefix, "cardinal")); } - @Override - public void visitPrefixStart(CldrPath prefix, Context ctx) { - if (PLURALS.matches(prefix)) { - // Note: "plurals:type" is an optional attribute but the CLDR DTD specifies a - // default via comments. It should probably be changed to just have a default in - // the DTD. 
- if (PLURALS_TYPE.valueFrom(prefix, "cardinal").equals(type)) { - ctx.install(new RulesVisitor(ICU_PREFIX_MAP.get(type))); - } - } - } - - private final class RulesVisitor implements PrefixVisitor { - private final RbPath icuPrefix; - private final List locales = new ArrayList<>(); - private final Map rules = new LinkedHashMap<>(); - - RulesVisitor(RbPath icuPrefix) { - this.icuPrefix = checkNotNull(icuPrefix); - } - - @Override - public void visitPrefixStart(CldrPath prefix, Context ctx) { - if (RULES.matchesSuffixOf(prefix)) { - Iterables.addAll(locales, RULES_LOCALES.listOfValuesFrom(prefix)); - ctx.install(value -> { - if (RULE.matchesSuffixOf(value.getPath())) { - rules.put(RULE_COUNT.valueFrom(value), value.getValue()); - } - }); - } - } - - @Override - public void visitPrefixEnd(CldrPath prefix) { - checkState(!locales.isEmpty(), "missing locale data for plurals: %s", prefix); - // Note: The original mapper code "sort of" coped with empty rules, but it's not - // completely well behaved (or documented), so since this doesn't happen in the - // current CLDR data, I decided to just prohibit it in the new code. Support can - // easily be added in once the expected semantics are clear. - checkState(!rules.isEmpty(), "missing rule data for plurals: %s", prefix); - - // Have we seen this set of rules before? If so, reuse the existing index. Note - // that an IDE might report this call as suspicious because the key is not yet an - // immutable map (saves creating immutable maps just to check for inclusion) but - // this is fine because collection equality is based only on contents, not - // collection type. - int idx = previousRules.indexOf(rules); - if (idx == -1) { - int newIdx = previousRules.size(); - rules.forEach((k, v) -> icuData.add(RbPath.of("rules", "set" + newIdx, k), v)); - // Since "rules" is mutable and reused, we must take an immutable copy here. 
- previousRules.add(ImmutableMap.copyOf(rules)); - idx = newIdx; - } - String setName = "set" + idx; - locales.forEach(locale -> icuData.add(icuPrefix.extendBy(locale), setName)); - rules.clear(); - locales.clear(); + private void addRules(Rules r) { + ImmutableMap rules = r.getRules(); + // Note: The original mapper code "sort of" coped with empty rules, but it's not + // completely well behaved (or documented), so since this doesn't happen in the + // current CLDR data, I decided to just prohibit it in the new code. Support can + // easily be added in once the expected semantics are clear. + checkState(!rules.isEmpty(), "missing rule data for plurals"); + + // Have we seen this set of rules before? If so, reuse the existing index. Note + // that an IDE might report this call as suspicious because the key is not yet an + // immutable map (saves creating immutable maps just to check for inclusion) but + // this is fine because collection equality is based only on contents, not + // collection type. + int idx = previousRules.indexOf(rules); + if (idx == -1) { + int newIdx = previousRules.size(); + rules.forEach((k, v) -> icuData.add(RbPath.of("rules", "set" + newIdx, k), v)); + // Since "rules" is mutable and reused, we must take an immutable copy here. 
+ previousRules.add(rules); + idx = newIdx; } + String setName = "set" + idx; + r.getLocales().forEach(locale -> icuData.add(icuPrefix.extendBy(locale), setName)); } } - private PluralsMapper() {} + private static class Rules { + private final ImmutableList locales; + private final ImmutableMap.Builder map = ImmutableMap.builder(); + + private Rules(CldrPath prefix) { + this.locales = ImmutableList.copyOf(RULES_LOCALES.listOfValuesFrom(prefix)); + checkState(!locales.isEmpty(), "missing locale data for plurals: %s", prefix); + } + + private void addRule(CldrValue value) { + map.put(RULE_COUNT.valueFrom(value), value.getValue()); + } + + private ImmutableList getLocales() { + return locales; + } + + private ImmutableMap getRules() { + return map.build(); + } + } + + // A hack to allow us to process "cardinal" data before "ordinal" data (even though DTD order + // is the other way round). Once DTD order is the only ordering used, this can be removed. + private static CldrData filterByType(CldrData data, String pluralType) { + PathMatcher matcher = + PathMatcher.of("//supplementalData/plurals[@type=\"" + pluralType + "\"]"); + return new FilteredData(data) { + @Override protected CldrValue filter(CldrValue value) { + return matcher.matchesPrefixOf(value.getPath()) ? 
value : null; + } + }; + } } diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/RbnfMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/RbnfMapper.java index 9377e8d78dc..e227dcb2091 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/RbnfMapper.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/RbnfMapper.java @@ -7,16 +7,15 @@ import static org.unicode.cldr.api.AttributeKey.keyOf; import static org.unicode.cldr.api.CldrData.PathOrder.DTD; import java.util.Optional; -import java.util.concurrent.atomic.AtomicBoolean; import org.unicode.cldr.api.AttributeKey; import org.unicode.cldr.api.CldrData; -import org.unicode.cldr.api.CldrData.PrefixVisitor; import org.unicode.cldr.api.CldrDataType; import org.unicode.cldr.api.CldrPath; +import org.unicode.cldr.api.CldrValue; import org.unicode.icu.tool.cldrtoicu.IcuData; -import org.unicode.icu.tool.cldrtoicu.PathMatcher; import org.unicode.icu.tool.cldrtoicu.RbPath; +import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor; import com.google.common.escape.UnicodeEscaper; @@ -26,21 +25,26 @@ import com.google.common.escape.UnicodeEscaper; * //ldml/rbnf/rulesetGrouping[@type=*]/ruleset[@type=*] * } */ -// TODO: This class can almost certainly be written using RegexTransformer and a small config. 
public final class RbnfMapper { - private static final PathMatcher RULE_SET = - PathMatcher.of("ldml/rbnf/rulesetGrouping[@type=*]/ruleset[@type=*]"); private static final AttributeKey GROUPING_TYPE = keyOf("rulesetGrouping", "type"); private static final AttributeKey RULESET_TYPE = keyOf("ruleset", "type"); - - private static final PathMatcher RBNF_RULE = PathMatcher.of("rbnfrule"); + private static final AttributeKey RULESET_ACCESS = keyOf("ruleset", "access"); private static final AttributeKey RBNF_VALUE = keyOf("rbnfrule", "value"); private static final AttributeKey RBNF_RADIX = keyOf("rbnfrule", "radix"); - private static final AttributeKey RULESET_ACCESS = keyOf("ruleset", "access"); // This is the ICU path prefix, below which everything generated by this visitor will go. private static final RbPath RB_ROOT = RbPath.of("RBNFRules"); + private static final CldrDataProcessor RBNF_PROCESSOR; + static { + CldrDataProcessor.Builder processor = CldrDataProcessor.builder(); + processor + .addAction( + "//ldml/rbnf/rulesetGrouping[@type=*]/ruleset[@type=*]", (m, p) -> m.new Ruleset(p)) + .addValueAction("rbnfrule", Ruleset::addRule); + RBNF_PROCESSOR = processor.build(); + } + /** * Processes data from the given supplier to generate RBNF data for a set of locale IDs. * @@ -55,89 +59,78 @@ public final class RbnfMapper { // Using DTD order is essential here because the RBNF paths contain ordered elements, // so we must ensure that they appear in sorted order (otherwise we'd have to do more // work at this end to re-sort the results). 
- RulesetVisitor visitor = new RulesetVisitor(icuData); - icuSpecialData.ifPresent(s -> s.accept(DTD, visitor)); - cldrData.accept(DTD, visitor); - return visitor.icuData; + RbnfMapper mapper = new RbnfMapper(icuData); + icuSpecialData.ifPresent(s -> RBNF_PROCESSOR.process(s, mapper, DTD)); + RBNF_PROCESSOR.process(cldrData, mapper, DTD); + return mapper.icuData; } - static final class RulesetVisitor implements PrefixVisitor { + private final IcuData icuData; - private final IcuData icuData; + private RbnfMapper(IcuData icuData) { + this.icuData = checkNotNull(icuData); + } - private RulesetVisitor(IcuData icuData) { - this.icuData = checkNotNull(icuData); + private class Ruleset { + private final RbPath rbPath; + private final String rulesetType; + private final boolean isStrict; + private boolean hasHeader = false; + + Ruleset(CldrPath prefix) { + this.rbPath = RB_ROOT.extendBy(GROUPING_TYPE.valueFrom(prefix)); + this.rulesetType = RULESET_TYPE.valueFrom(prefix); + this.isStrict = !"lenient-parse".equals(rulesetType); } - @Override public void visitPrefixStart(CldrPath prefix, Context context) { - if (RULE_SET.matchesPrefixOf(prefix)) { - RbPath rbPath = RB_ROOT.extendBy(GROUPING_TYPE.valueFrom(prefix)); - String rulesetType = RULESET_TYPE.valueFrom(prefix); - boolean isStrict = !"lenient-parse".equals(rulesetType); - - // This is rather hacky because the access attribute lives on the parent path - // element, but we cannot use it until we visit the child values (because it's a - // value attribute and will not be in the prefix path). So we need to add the - // header only once, just before we start adding the values relating to the child - // elements, so we need a flag. - // - // This cannot be a boolean field since it must be "effectively final". 
- AtomicBoolean hasHeader = new AtomicBoolean(false); - context.install( - value -> { - if (RBNF_RULE.matchesSuffixOf(value.getPath())) { - if (!hasHeader.get()) { - boolean isPrivate = - RULESET_ACCESS.valueFrom(value, "public").equals("private"); - icuData.add(rbPath, (isPrivate ? "%%" : "%") + rulesetType + ":"); - hasHeader.set(true); - } - String rulePrefix = ""; - if (isStrict) { - String basePrefix = RBNF_VALUE.valueFrom(value); - rulePrefix = RBNF_RADIX.optionalValueFrom(value) - .map(r -> basePrefix + "/" + r) - .orElse(basePrefix); - rulePrefix += ": "; - } - icuData.add( - rbPath, - rulePrefix + ESCAPE_RBNF_DATA.escape(value.getValue())); - } - }); + void addRule(CldrValue value) { + // This is a bit hacky because the access attribute lives on the parent path element, + // but we cannot use it until we visit the child values (because it's a value attribute + // and will not be in the prefix path) so we need to add the header only once here. + if (!hasHeader) { + boolean isPrivate = RULESET_ACCESS.valueFrom(value, "public").equals("private"); + icuData.add(rbPath, (isPrivate ? "%%" : "%") + rulesetType + ":"); + hasHeader = true; } + // Prefix is: "@value: ", "@value/@radix: " or empty (for non strict rules). + String rulePrefix = isStrict + ? RBNF_VALUE.valueFrom(value) + + RBNF_RADIX.optionalValueFrom(value).map(r -> "/" + r).orElse("") + + ": " + : ""; + icuData.add(rbPath, rulePrefix + ESCAPE_RBNF_DATA.escape(value.getValue())); } + } - /* - * Convert characters outside the range U+0020 to U+007F to Unicode escapes, and convert - * backslash to a double backslash. This class is super slow for non-ASCII escaping due to - * using "String.format()", however there's < 100 values that need any escaping, so it's - * fine. 
- */ - private static final UnicodeEscaper ESCAPE_RBNF_DATA = new UnicodeEscaper() { - private final char[] DOUBLE_BACKSLASH = "\\\\".toCharArray(); - private final char[] LEFT_ANGLE = "<".toCharArray(); - private final char[] RIGHT_ANGLE = ">".toCharArray(); + /* + * Convert characters outside the range U+0020 to U+007F to Unicode escapes, and convert + * backslash to a double backslash. This class is super slow for non-ASCII escaping due to + * using "String.format()", however there's < 100 values that need any escaping, so it's + * fine. + */ + private static final UnicodeEscaper ESCAPE_RBNF_DATA = new UnicodeEscaper() { + private final char[] DOUBLE_BACKSLASH = "\\\\".toCharArray(); + private final char[] LEFT_ANGLE = "<".toCharArray(); + private final char[] RIGHT_ANGLE = ">".toCharArray(); - @Override - protected char[] escape(int cp) { - // Returning null means "do not escape". - switch (cp) { - case '\\': - return DOUBLE_BACKSLASH; - case '←': - return LEFT_ANGLE; - case '→': - return RIGHT_ANGLE; - default: - if (0x0020 <= cp && cp <= 0x007F) { - return null; - } else if (cp <= 0xFFFF) { - return String.format("\\u%04X", cp).toCharArray(); - } - return String.format("\\U%08X", cp).toCharArray(); + @Override + protected char[] escape(int cp) { + // Returning null means "do not escape". 
+ switch (cp) { + case '\\': + return DOUBLE_BACKSLASH; + case '←': + return LEFT_ANGLE; + case '→': + return RIGHT_ANGLE; + default: + if (0x0020 <= cp && cp <= 0x007F) { + return null; + } else if (cp <= 0xFFFF) { + return String.format("\\u%04X", cp).toCharArray(); } + return String.format("\\U%08X", cp).toCharArray(); } - }; - } + } + }; } diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/SupplementalMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/SupplementalMapper.java index b3c57050254..a0654b3ae6c 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/SupplementalMapper.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/SupplementalMapper.java @@ -5,11 +5,13 @@ package org.unicode.icu.tool.cldrtoicu.mapper; import static com.google.common.base.Preconditions.checkNotNull; import static org.unicode.cldr.api.CldrData.PathOrder.NESTED_GROUPING; +import java.util.function.Predicate; + import org.unicode.cldr.api.CldrDataSupplier; import org.unicode.cldr.api.CldrDataType; +import org.unicode.cldr.api.CldrPath; import org.unicode.cldr.api.CldrValue; import org.unicode.icu.tool.cldrtoicu.IcuData; -import org.unicode.icu.tool.cldrtoicu.PathMatcher; import org.unicode.icu.tool.cldrtoicu.PathValueTransformer; import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result; import org.unicode.icu.tool.cldrtoicu.RbPath; @@ -36,18 +38,21 @@ public final class SupplementalMapper extends AbstractPathValueMapper { */ // TODO: Improve external data splitting and remove need for a PathMatcher here. 
public static IcuData process( - CldrDataSupplier src, PathValueTransformer transformer, String icuName, PathMatcher paths) { + CldrDataSupplier src, + PathValueTransformer transformer, + String icuName, + Predicate paths) { IcuData icuData = new IcuData(icuName, false); new SupplementalMapper(src, transformer, paths).addIcuData(icuData); return icuData; } - private final PathMatcher paths; + private final Predicate paths; private int fifoCounter = 0; private SupplementalMapper( - CldrDataSupplier src, PathValueTransformer transformer, PathMatcher pathFilter) { + CldrDataSupplier src, PathValueTransformer transformer, Predicate pathFilter) { super(src.getDataForType(CldrDataType.SUPPLEMENTAL), transformer); this.paths = checkNotNull(pathFilter); @@ -63,7 +68,7 @@ public final class SupplementalMapper extends AbstractPathValueMapper { } private void visit(CldrValue value) { - if (paths.matchesPrefixOf(value.getPath())) { + if (paths.test(value.getPath())) { transformValue(value).forEach(this::collectResult); fifoCounter++; } diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapper.java index ece1aaa62e2..f1ce048700a 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapper.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapper.java @@ -6,7 +6,6 @@ import static com.google.common.base.CharMatcher.whitespace; import static com.google.common.base.Preconditions.checkNotNull; import static java.nio.file.StandardOpenOption.CREATE_NEW; import static org.unicode.cldr.api.AttributeKey.keyOf; -import static org.unicode.cldr.api.CldrData.PathOrder.DTD; import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL; import java.io.IOException; @@ -20,14 +19,13 @@ import java.util.function.Function; import org.unicode.cldr.api.AttributeKey; 
import org.unicode.cldr.api.CldrData; -import org.unicode.cldr.api.CldrData.ValueVisitor; import org.unicode.cldr.api.CldrDataSupplier; import org.unicode.cldr.api.CldrDataType; import org.unicode.cldr.api.CldrValue; import org.unicode.icu.tool.cldrtoicu.IcuData; -import org.unicode.icu.tool.cldrtoicu.PathMatcher; import org.unicode.icu.tool.cldrtoicu.RbPath; import org.unicode.icu.tool.cldrtoicu.RbValue; +import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; @@ -43,8 +41,15 @@ import com.ibm.icu.text.Transliterator; *

This mapper also writes out the transform rule files into a specified directory. */ public final class TransformsMapper { - private static final PathMatcher TRULE = - PathMatcher.of("supplementalData/transforms/transform/tRule"); + + private static final CldrDataProcessor CLDR_PROCESSOR; + static { + CldrDataProcessor.Builder processor = CldrDataProcessor.builder(); + processor.addValueAction( + "//supplementalData/transforms/transform/tRule", TransformsMapper::processRule); + CLDR_PROCESSOR = processor.build(); + } + private static final AttributeKey TRANSFORM_SOURCE = keyOf("transform", "source"); private static final AttributeKey TRANSFORM_TARGET = keyOf("transform", "target"); private static final AttributeKey TRANSFORM_DIRECTION = keyOf("transform", "direction"); @@ -99,74 +104,68 @@ public final class TransformsMapper { static IcuData process( CldrData cldrData, Function fileWriterFn, List header) { - RuleVisitor visitor = new RuleVisitor(fileWriterFn, header); - cldrData.accept(DTD, visitor); - addSpecialCaseValues(visitor.icuData); - return visitor.icuData; + TransformsMapper mapper = new TransformsMapper(fileWriterFn, header); + CLDR_PROCESSOR.process(cldrData, mapper); + addSpecialCaseValues(mapper.icuData); + return mapper.icuData; } - private static class RuleVisitor implements ValueVisitor { - private final IcuData icuData = new IcuData("root", false); - private final Function outFn; - private final ImmutableList header; + private final IcuData icuData = new IcuData("root", false); + private final Function outFn; + private final ImmutableList header; - RuleVisitor(Function outFn, List header) { - this.outFn = checkNotNull(outFn); - this.header = ImmutableList.copyOf(header); - icuData.setFileComment("File: root.txt"); + private TransformsMapper(Function outFn, List header) { + this.outFn = checkNotNull(outFn); + this.header = ImmutableList.copyOf(header); + icuData.setFileComment("File: root.txt"); + } + + private void processRule(CldrValue value) { + 
String source = getExpectedOptionalAttribute(value, TRANSFORM_SOURCE); + String target = getExpectedOptionalAttribute(value, TRANSFORM_TARGET); + Optional variant = TRANSFORM_VARIANT.optionalValueFrom(value); + String baseFilename = source + "_" + target; + String filename = variant.map(v -> baseFilename + "_" + v).orElse(baseFilename) + ".txt"; + writeRootIndexEntry(value, source, target, variant, filename); + writeDataFile(filename, value); + } + + private void writeDataFile(String filename, CldrValue value) { + try (PrintWriter out = outFn.apply(Paths.get(filename))) { + out.print("\uFEFF"); + header.forEach(s -> out.println("# " + s)); + out.println("#"); + out.println("# File: " + filename); + out.println("# Generated from CLDR"); + out.println("#"); + out.println(); + out.println(FIXUP.transliterate(whitespace().trimFrom(value.getValue()))); + out.println(); } + } - @Override public void visit(CldrValue value) { - // The other possible element is "comment" but we currently ignore those. - if (TRULE.matches(value.getPath())) { - String source = getExpectedOptionalAttribute(value, TRANSFORM_SOURCE); - String target = getExpectedOptionalAttribute(value, TRANSFORM_TARGET); - Optional variant = TRANSFORM_VARIANT.optionalValueFrom(value); - String baseFilename = source + "_" + target; - String filename = - variant.map(v -> baseFilename + "_" + v).orElse(baseFilename) + ".txt"; - writeRootIndexEntry(value, source, target, variant, filename); - writeDataFile(filename, value); - } + private void writeRootIndexEntry( + CldrValue value, String source, String target, Optional variant, String filename) { + Visibility visibility = TRANSFORM_VISIBILITY.valueFrom(value, Visibility.class); + String status = visibility == Visibility.internal ? "internal" : "file"; + + Direction dir = TRANSFORM_DIRECTION.valueFrom(value, Direction.class); + // TODO: Consider checks for unused data (e.g. forward aliases in a backward rule). 
+ if (dir != Direction.backward) { + String id = getId(source, target, variant); + TRANSFORM_ALIAS.listOfValuesFrom(value) + .forEach(a -> icuData.add(RB_TRANSLITERATOR_IDS.extendBy(a, "alias"), id)); + RbPath rbPrefix = RB_TRANSLITERATOR_IDS.extendBy(id, status); + icuData.add(rbPrefix.extendBy("resource:process(transliterator)"), filename); + icuData.add(rbPrefix.extendBy("direction"), "FORWARD"); } - - private void writeDataFile(String filename, CldrValue value) { - try (PrintWriter out = outFn.apply(Paths.get(filename))) { - out.print("\uFEFF"); - header.forEach(s -> out.println("# " + s)); - out.println("#"); - out.println("# File: " + filename); - out.println("# Generated from CLDR"); - out.println("#"); - out.println(); - out.println(FIXUP.transliterate(whitespace().trimFrom(value.getValue()))); - out.println(); - } - } - - private void writeRootIndexEntry( - CldrValue value, String source, String target, Optional variant, String filename) { - Visibility visibility = TRANSFORM_VISIBILITY.valueFrom(value, Visibility.class); - String status = visibility == Visibility.internal ? "internal" : "file"; - - Direction dir = TRANSFORM_DIRECTION.valueFrom(value, Direction.class); - // TODO: Consider checks for unused data (e.g. forward aliases in a backward rule). 
- if (dir != Direction.backward) { - String id = getId(source, target, variant); - TRANSFORM_ALIAS.listOfValuesFrom(value) - .forEach(a -> icuData.add(RB_TRANSLITERATOR_IDS.extendBy(a, "alias"), id)); - RbPath rbPrefix = RB_TRANSLITERATOR_IDS.extendBy(id, status); - icuData.add(rbPrefix.extendBy("resource:process(transliterator)"), filename); - icuData.add(rbPrefix.extendBy("direction"), "FORWARD"); - } - if (dir != Direction.forward) { - String id = getId(target, source, variant); - TRANSFORM_BACKALIAS.listOfValuesFrom(value) - .forEach(a -> icuData.add(RB_TRANSLITERATOR_IDS.extendBy(a, "alias"), id)); - RbPath rbPrefix = RB_TRANSLITERATOR_IDS.extendBy(id, status); - icuData.add(rbPrefix.extendBy("resource:process(transliterator)"), filename); - icuData.add(rbPrefix.extendBy("direction"), "REVERSE"); - } + if (dir != Direction.forward) { + String id = getId(target, source, variant); + TRANSFORM_BACKALIAS.listOfValuesFrom(value) + .forEach(a -> icuData.add(RB_TRANSLITERATOR_IDS.extendBy(a, "alias"), id)); + RbPath rbPrefix = RB_TRANSLITERATOR_IDS.extendBy(id, status); + icuData.add(rbPrefix.extendBy("resource:process(transliterator)"), filename); + icuData.add(rbPrefix.extendBy("direction"), "REVERSE"); } } diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/AlternateLocaleDataTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/AlternateLocaleDataTest.java index 0e7cd5ee375..494a81957d9 100644 --- a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/AlternateLocaleDataTest.java +++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/AlternateLocaleDataTest.java @@ -46,14 +46,14 @@ public class AlternateLocaleDataTest { FakeDataSupplier src = new FakeDataSupplier() .addLocaleData("xx", target, source, other) .addInheritedData("xx", inherited); - CldrDataSupplier transformed = AlternateLocaleData.transform( - src, - ImmutableMap.of(target.getPath(), source.getPath()), - 
ImmutableTable.of()); + CldrDataSupplier transformed = + AlternateLocaleData.transform( + src, ImmutableMap.of(target.getPath(), source.getPath()), ImmutableTable.of()); CldrData unresolved = transformed.getDataForLocale("xx", UNRESOLVED); CldrData resolved = transformed.getDataForLocale("xx", RESOLVED); + // Note that the source is always removed (unless it's also a target). assertValuesUnordered(unresolved, altValue, other); assertValuesUnordered(resolved, altValue, other, inherited); } @@ -67,10 +67,9 @@ public class AlternateLocaleDataTest { ldml("numbers/currencies/currency[@type=\"USD\"][@alt=\"short\"]/displayName", "Name"); FakeDataSupplier src = new FakeDataSupplier().addLocaleData("xx", target); - CldrDataSupplier transformed = AlternateLocaleData.transform( - src, - ImmutableMap.of(target.getPath(), source.getPath()), - ImmutableTable.of()); + CldrDataSupplier transformed = + AlternateLocaleData.transform( + src, ImmutableMap.of(target.getPath(), source.getPath()), ImmutableTable.of()); CldrData unresolved = transformed.getDataForLocale("xx", UNRESOLVED); CldrData resolved = transformed.getDataForLocale("xx", RESOLVED); @@ -87,19 +86,21 @@ public class AlternateLocaleDataTest { ldml("numbers/currencies/currency[@type=\"USD\"]/displayName", "Full Display Name"); CldrValue source = ldml("numbers/currencies/currency[@type=\"USD\"][@alt=\"short\"]/displayName", "Name"); + CldrValue other = + ldml("numbers/currencies/currency[@type=\"EUR\"]/displayName", "Euro"); - FakeDataSupplier src = new FakeDataSupplier().addLocaleData("xx", source); + FakeDataSupplier src = new FakeDataSupplier().addLocaleData("xx", source, other); CldrDataSupplier transformed = - AlternateLocaleData.transform(src, ImmutableMap.of(target.getPath(), source.getPath()), - ImmutableTable.of()); + AlternateLocaleData.transform( + src, ImmutableMap.of(target.getPath(), source.getPath()), ImmutableTable.of()); CldrData unresolved = transformed.getDataForLocale("xx", UNRESOLVED); CldrData 
resolved = transformed.getDataForLocale("xx", RESOLVED); - // If there's no target the alt-path mapping is incomplete and we do nothing (this matches - // the old CLDR tool behaviour and reasonable but can hide inconsistencies in CLDR data). - assertValuesUnordered(unresolved, source); - assertValuesUnordered(resolved, source); + // Even though the missing target is not matched (so no change there) the source is always + // removed from the transformed data. + assertValuesUnordered(unresolved, other); + assertValuesUnordered(resolved, other); } @Test diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/CldrDataProcessorTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/CldrDataProcessorTest.java new file mode 100644 index 00000000000..bf312c522d9 --- /dev/null +++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/CldrDataProcessorTest.java @@ -0,0 +1,154 @@ +// © 2020 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +package org.unicode.icu.tool.cldrtoicu; + +import static com.google.common.truth.Truth.assertThat; + +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Objects; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; +import org.unicode.cldr.api.AttributeKey; +import org.unicode.cldr.api.CldrData; +import org.unicode.cldr.api.CldrDataSupplier; +import org.unicode.cldr.api.CldrValue; + +import com.google.common.collect.ImmutableMap; + +@RunWith(JUnit4.class) +public class CldrDataProcessorTest { + + private static final AttributeKey TERRITORY_TYPE = AttributeKey.keyOf("territory", "type"); + private static final AttributeKey CURRENCY_TYPE = AttributeKey.keyOf("currency", "type"); + + // An overly simplistic value type for currency for testing purposes. 
In real code you would + // probably want an immutable type and a separate builder, or a mutable type just to collect + // values that doesn't need equals/hashcode (this class serves 2 purposes in the test). + private static final class CurrencyData { + final String key; + String name = ""; + String symbol = ""; + + CurrencyData(String key) { + this.key = key; + } + + CurrencyData(String key, String name, String symbol) { + this.key = key; + this.name = name; + this.symbol = symbol; + } + + @Override public boolean equals(Object o) { + if (o instanceof CurrencyData) { + CurrencyData that = (CurrencyData) o; + return key.equals(that.key) && name.equals(that.name) && symbol.equals(that.symbol); + } + return false; + } + + @Override public int hashCode() { + return Objects.hash(key, name, symbol); + } + + @Override public String toString() { + return String.format("CurrencyData{name=%s, symbol='%s'}", name, symbol); + } + } + + // For collecting processed values. + private static final class State { + ImmutableMap names = ImmutableMap.of(); + ImmutableMap currencies = ImmutableMap.of(); + + void setNames(Map map) { + names = ImmutableMap.copyOf(map); + } + + void setCurrencies(Map map) { + currencies = ImmutableMap.copyOf(map); + } + } + + private static final CldrDataProcessor VISITOR = createTestVisitor(); + + private static CldrDataProcessor createTestVisitor() { + // Note that this is deliberately doing things the "messy" way by creating and then copying + // a map. This is to show an extra level of processing in tests. You could just have a + // value action which adds the territory to a map in the State object. + CldrDataProcessor.Builder builder = CldrDataProcessor.builder(); + builder + .addAction( + "//ldml/localeDisplayNames/territories", + () -> new LinkedHashMap(), + State::setNames) + .addValueAction( + "territory[@type=*]", + (map, value) -> map.put(value.getPath().get(TERRITORY_TYPE), value.getValue())); + + // Another convoluted example for testing. 
This has the same additional level for a map + // just so we can show a 3-level processor. In real code this wouldn't look so messy. + CldrDataProcessor.SubProcessor currencyProcessor = builder + .addAction( + "//ldml/numbers/currencies", + () -> new LinkedHashMap(), + State::setCurrencies) + .addAction( + "currency[@type=*]", + (map, path) -> new CurrencyData(path.get(CURRENCY_TYPE)), + (map, data) -> map.put(data.key, data)); + currencyProcessor.addValueAction( + "displayName", + (data, value) -> data.name = value.getValue()); + currencyProcessor.addValueAction( + "symbol", + (data, value) -> data.symbol = value.getValue()); + + return builder.build(); + } + + @Test + public void testTwoLevelProcessing() { + CldrData data = CldrDataSupplier.forValues(Arrays.asList( + ldml("localeDisplayNames/territories/territory[@type=\"BE\"]", "Belgium"), + ldml("localeDisplayNames/territories/territory[@type=\"CH\"]", "Switzerland"), + ldml("localeDisplayNames/territories/territory[@type=\"IN\"]", "India"))); + + State state = VISITOR.process(data, new State(), CldrData.PathOrder.DTD); + + assertThat(state.names) + .containsExactly( + "BE", "Belgium", + "CH", "Switzerland", + "IN", "India") + .inOrder(); + } + + @Test + public void testThreeLevelProcessing() { + CldrData data = CldrDataSupplier.forValues(Arrays.asList( + ldml("numbers/currencies/currency[@type=\"EUR\"]/displayName", "euro"), + ldml("numbers/currencies/currency[@type=\"EUR\"]/symbol", "€"), + ldml("numbers/currencies/currency[@type=\"CHF\"]/displayName", "Swiss franc"), + ldml("numbers/currencies/currency[@type=\"CHF\"]/symbol", "Fr."), + ldml("numbers/currencies/currency[@type=\"INR\"]/displayName", "Indian rupee"), + ldml("numbers/currencies/currency[@type=\"INR\"]/symbol", "₹"))); + + State state = VISITOR.process(data, new State(), CldrData.PathOrder.DTD); + + assertThat(state.currencies) + .containsExactly( + "CHF", new CurrencyData("CHF", "Swiss franc", "Fr."), + "EUR", new CurrencyData("EUR", "euro", 
"€"), + "INR", new CurrencyData("INR", "Indian rupee", "₹")) + .inOrder(); + } + + private static CldrValue ldml(String path, String value) { + return CldrValue.parseValue("//ldml/" + path, value); + } +} diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/PathMatcherTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/PathMatcherTest.java deleted file mode 100644 index ff9c54cdc21..00000000000 --- a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/PathMatcherTest.java +++ /dev/null @@ -1,158 +0,0 @@ -// © 2019 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -package org.unicode.icu.tool.cldrtoicu; - -import static com.google.common.truth.Truth.assertThat; -import static com.google.common.truth.Truth8.assertThat; -import static org.unicode.cldr.api.CldrPath.parseDistinguishingPath; -import static org.unicode.icu.tool.cldrtoicu.testing.AssertUtils.assertThrows; - -import java.util.Arrays; -import java.util.List; - -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; -import org.unicode.cldr.api.CldrPath; - -@RunWith(JUnit4.class) -public class PathMatcherTest { - @Test - public void testMatcher() { - CldrPath calEra = parseDistinguishingPath( - "//ldml/dates/calendars/calendar[@type=\"buddhist\"]/eras/eraAbbr/era[@type=\"0\"]"); - CldrPath chineseMon1 = monthInfo("chinese", "format", "abbreviated", 1); - CldrPath chineseMon2 = monthInfo("chinese", "format", "abbreviated", 2); - CldrPath genericMon1 = monthInfo("generic", "stand-alone", "narrow", 1); - CldrPath genericMon2 = monthInfo("generic", "stand-alone", "narrow", 2); - List calPaths = - Arrays.asList(calEra, chineseMon1, chineseMon2, genericMon1, genericMon2); - - PathMatcher anyCalendarPaths = PathMatcher.of("ldml/dates/calendars/calendar"); - assertThat(calPaths.stream().allMatch(anyCalendarPaths::matchesPrefixOf)).isTrue(); - 
assertThat(calPaths.stream().noneMatch(anyCalendarPaths::matches)).isTrue(); - assertThat(calPaths.stream().noneMatch(anyCalendarPaths::matchesSuffixOf)).isTrue(); - - PathMatcher chineseCalendars = - PathMatcher.of("ldml/dates/calendars/calendar[@type=\"chinese\"]"); - assertThat(calPaths.stream().filter(chineseCalendars::matchesPrefixOf)) - .containsExactly(chineseMon1, chineseMon2); - - PathMatcher anyMonth = PathMatcher.of("monthWidth[@type=*]/month[@type=*]"); - assertThat(calPaths.stream().filter(anyMonth::matchesSuffixOf)) - .containsExactly(chineseMon1, chineseMon2, genericMon1, genericMon2); - - PathMatcher narrowMonth = PathMatcher.of("monthWidth[@type=\"narrow\"]/month[@type=*]"); - assertThat(calPaths.stream().filter(narrowMonth::matchesSuffixOf)) - .containsExactly(genericMon1, genericMon2); - assertThat(calPaths.stream().filter(narrowMonth::matches)).isEmpty(); - - PathMatcher firstMonth = PathMatcher.of("month[@type=\"1\"]"); - assertThat(calPaths.stream().filter(firstMonth::matchesSuffixOf)) - .containsExactly(chineseMon1, genericMon1); - - PathMatcher fullMatch = PathMatcher.of("ldml/dates" - + "/calendars/calendar[@type=\"generic\"]" - + "/months/monthContext[@type=\"stand-alone\"]" - + "/monthWidth[@type=\"narrow\"]" - + "/month[@type=\"2\"]"); - assertThat(calPaths.stream().filter(fullMatch::matches)).containsExactly(genericMon2); - } - - @Test - public void testWildcardSegment() { - PathMatcher wildcard = PathMatcher.of("ldml/dates" - + "/calendars/calendar[@type=\"generic\"]" - + "/*/*[@type=\"format\"]/*[@type=\"narrow\"]/*[@type=*]"); - - assertThat(wildcard.matches(monthInfo("generic", "format", "narrow", 1))).isTrue(); - assertThat(wildcard.matches(monthInfo("generic", "format", "narrow", 9))).isTrue(); - assertThat(wildcard.matches(dayInfo("generic", "format", "narrow", "sun"))).isTrue(); - - assertThat(wildcard.matches(monthInfo("chinese", "format", "narrow", 1))).isFalse(); - assertThat(wildcard.matches(monthInfo("generic", 
"stand-alone", "narrow", 1))).isFalse(); - assertThat(wildcard.matches(dayInfo("generic", "format", "wide", "mon"))).isFalse(); - } - - @Test - public void testAnyOf_match() { - PathMatcher narrowMonth = - PathMatcher.of("ldml/dates/calendars/calendar[@type=*]/months" - + "/monthContext[@type=\"format\"]/monthWidth[@type=\"narrow\"]/month[@type=*]"); - PathMatcher narrowDay = - PathMatcher.of("ldml/dates/calendars/calendar[@type=*]/days" - + "/dayContext[@type=\"format\"]/dayWidth[@type=\"narrow\"]/day[@type=*]"); - PathMatcher prefix = PathMatcher.anyOf(narrowMonth, narrowDay); - - assertThat(prefix.matches(monthInfo("gregorian", "format", "narrow", 1))).isTrue(); - assertThat(prefix.matches(dayInfo("buddhist", "format", "narrow", "sun"))).isTrue(); - - assertThat(prefix.matches(monthInfo("hindu", "format", "wide", 1))).isFalse(); - assertThat(prefix.matches(dayInfo("hindu", "format", "wide", "mon"))).isFalse(); - } - - @Test - public void testAnyOf_suffix() { - PathMatcher monthSuffix = PathMatcher.of("monthWidth[@type=\"narrow\"]/month[@type=*]"); - PathMatcher daySuffix = PathMatcher.of("dayWidth[@type=\"narrow\"]/day[@type=*]"); - PathMatcher suffix = PathMatcher.anyOf(monthSuffix, daySuffix); - - assertThat(suffix.matchesSuffixOf(monthInfo("generic", "format", "narrow", 1))).isTrue(); - assertThat(suffix.matchesSuffixOf(dayInfo("generic", "format", "narrow", "sun"))).isTrue(); - - assertThat(suffix.matchesSuffixOf(monthInfo("generic", "format", "wide", 1))).isFalse(); - assertThat(suffix.matchesSuffixOf(dayInfo("generic", "format", "wide", "mon"))).isFalse(); - } - - @Test - public void testAnyOf_prefix() { - PathMatcher monthPrefix = - PathMatcher.of("ldml/dates/calendars/calendar[@type=\"gregorian\"]/months"); - PathMatcher dayPrefix = - PathMatcher.of("ldml/dates/calendars/calendar[@type=\"buddhist\"]/days"); - PathMatcher prefix = PathMatcher.anyOf(monthPrefix, dayPrefix); - - assertThat(prefix.matchesPrefixOf(monthInfo("gregorian", "format", "narrow", 
1))).isTrue(); - assertThat(prefix.matchesPrefixOf(dayInfo("buddhist", "format", "narrow", "sun"))).isTrue(); - - assertThat(prefix.matchesPrefixOf(monthInfo("hindu", "format", "wide", 1))).isFalse(); - assertThat(prefix.matchesPrefixOf(dayInfo("hindu", "format", "wide", "mon"))).isFalse(); - } - - @Test - public void testBadSpecifiers() { - assertInvalidPathSpecification(""); - // Leading and trailing '/' are not permitted (they imply empty segments. - assertInvalidPathSpecification("/foo/"); - assertInvalidPathSpecification("foo//bar"); - assertInvalidPathSpecification("foo/bad segment name"); - assertInvalidPathSpecification("foo/bar[type=*]"); - assertInvalidPathSpecification("foo/bar[@type=**]"); - assertInvalidPathSpecification("foo/bar[@type='double-quotes-only']"); - } - - private void assertInvalidPathSpecification(String spec) { - IllegalArgumentException e = - assertThrows(IllegalArgumentException.class, () -> PathMatcher.of(spec)); - assertThat(e).hasMessageThat().startsWith("invalid path specification"); - assertThat(e).hasMessageThat().contains(spec); - } - - private static CldrPath monthInfo(String type, String context, String width, int number) { - return CldrPath.parseDistinguishingPath(String.format( - "//ldml/dates/calendars/calendar[@type=\"%s\"]" - + "/months/monthContext[@type=\"%s\"]" - + "/monthWidth[@type=\"%s\"]" - + "/month[@type=\"%d\"]", - type, context, width, number)); - } - - private static CldrPath dayInfo(String type, String context, String width, String id) { - return CldrPath.parseDistinguishingPath(String.format( - "//ldml/dates/calendars/calendar[@type=\"%s\"]" - + "/days/dayContext[@type=\"%s\"]" - + "/dayWidth[@type=\"%s\"]" - + "/day[@type=\"%s\"]", - type, context, width, id)); - } -} diff --git a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/mapper/SupplementalMapperTest.java b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/mapper/SupplementalMapperTest.java index 
8085616fd90..1a9a25ccdaa 100644 --- a/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/mapper/SupplementalMapperTest.java +++ b/tools/cldr/cldr-to-icu/src/test/java/org/unicode/icu/tool/cldrtoicu/mapper/SupplementalMapperTest.java @@ -5,12 +5,15 @@ package org.unicode.icu.tool.cldrtoicu.mapper; import static org.unicode.cldr.api.CldrValue.parseValue; import static org.unicode.icu.tool.cldrtoicu.testing.IcuDataSubjectFactory.assertThat; +import java.util.function.Predicate; + import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; +import org.unicode.cldr.api.CldrPath; import org.unicode.cldr.api.CldrValue; +import org.unicode.cldr.api.PathMatcher; import org.unicode.icu.tool.cldrtoicu.IcuData; -import org.unicode.icu.tool.cldrtoicu.PathMatcher; import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result; import org.unicode.icu.tool.cldrtoicu.testing.FakeDataSupplier; import org.unicode.icu.tool.cldrtoicu.testing.FakeResult; @@ -29,8 +32,7 @@ public class SupplementalMapperTest { supplementalData("likelySubtags/likelySubtag[@from=\"Foo\"][@to=\"Bar\"]"), simpleResult("/Foo", "Bar")); - PathMatcher allPaths = PathMatcher.of("supplementalData"); - IcuData icuData = SupplementalMapper.process(src, transformer, "name", allPaths); + IcuData icuData = SupplementalMapper.process(src, transformer, "name", p -> true); assertThat(icuData).getPaths().hasSize(1); assertThat(icuData).hasValuesFor("/Foo", "Bar"); @@ -55,8 +57,7 @@ public class SupplementalMapperTest { supplementalData("currencyData/region[@iso3166=\"US\"]/currency[@iso4217=\"USD\"]"), simpleResult("/CurrencyMap/US//id", "USD")); - PathMatcher allPaths = PathMatcher.of("supplementalData"); - IcuData icuData = SupplementalMapper.process(src, transformer, "name", allPaths); + IcuData icuData = SupplementalMapper.process(src, transformer, "name", p -> true); assertThat(icuData).getPaths().hasSize(3); 
assertThat(icuData).hasValuesFor("/CurrencyMap/US/<0000>/id", "USD"); @@ -73,7 +74,8 @@ public class SupplementalMapperTest { supplementalData("currencyData/region[@iso3166=\"US\"]/currency[@iso4217=\"USN\"]"), simpleResult("/CurrencyMap/US//id", "USN")); - PathMatcher filter = PathMatcher.of("supplementalData/likelySubtags"); + Predicate filter = + PathMatcher.of("//supplementalData/likelySubtags")::matchesPrefixOf; IcuData icuData = SupplementalMapper.process(src, transformer, "name", filter); assertThat(icuData).getPaths().hasSize(1);