From 841cd7fcccf88195b1cd6463d128f66b90dc8c6e Mon Sep 17 00:00:00 2001 From: David Beaumont Date: Thu, 12 Sep 2019 13:58:39 +0200 Subject: [PATCH] ICU-20693 Adding support for deletion of existing files prior to ICU data generation. --- tools/cldr/cldr-to-icu/build-icu-data.xml | 59 +++- .../icu/tool/cldrtoicu/IcuTextWriter.java | 15 +- .../icu/tool/cldrtoicu/LdmlConverter.java | 23 +- .../tool/cldrtoicu/LdmlConverterConfig.java | 4 +- .../ant/CleanOutputDirectoryTask.java | 281 ++++++++++++++++++ .../cldrtoicu/mapper/TransformsMapper.java | 6 +- 6 files changed, 368 insertions(+), 20 deletions(-) create mode 100644 tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/CleanOutputDirectoryTask.java diff --git a/tools/cldr/cldr-to-icu/build-icu-data.xml b/tools/cldr/cldr-to-icu/build-icu-data.xml index 82b00f331ac..aa4bd4b1c82 100644 --- a/tools/cldr/cldr-to-icu/build-icu-data.xml +++ b/tools/cldr/cldr-to-icu/build-icu-data.xml @@ -10,7 +10,10 @@ - + + + + @@ -66,8 +69,13 @@ since some supplemental data is also written to the curr/ directory. See LdmlConverter.OutputType for the full list of valid types. --> - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuTextWriter.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuTextWriter.java index 18763299282..a91ad264ec0 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuTextWriter.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/IcuTextWriter.java @@ -3,12 +3,16 @@ package org.unicode.icu.tool.cldrtoicu; import static com.google.common.base.Preconditions.checkNotNull; +import static java.nio.file.StandardOpenOption.CREATE; +import static java.nio.file.StandardOpenOption.CREATE_NEW; +import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING; import static java.util.stream.Collectors.joining; import java.io.IOException; import java.io.PrintWriter; import java.io.Writer; import java.nio.file.Files; +import java.nio.file.OpenOption; import java.nio.file.Path; import java.util.List; import java.util.regex.Matcher; @@ -34,11 +38,18 @@ final class IcuTextWriter { private static final Pattern STRING_ESCAPE = Pattern.compile("(?!')\\\\\\\\(?!')"); private static final Pattern QUOTE_ESCAPE = Pattern.compile("\\\\?\""); + private static final OpenOption[] ONLY_NEW_FILES = { CREATE_NEW }; + private static final OpenOption[] OVERWRITE_FILES = { CREATE, TRUNCATE_EXISTING }; + /** Write a file in ICU data format with the specified header. */ - static void writeToFile(IcuData icuData, Path outDir, List header) { + static void writeToFile( + IcuData icuData, Path outDir, List header, boolean allowOverwrite) { + try { Files.createDirectories(outDir); - try (Writer w = Files.newBufferedWriter(outDir.resolve(icuData.getName() + ".txt")); + Path file = outDir.resolve(icuData.getName() + ".txt"); + OpenOption[] fileOptions = allowOverwrite ? OVERWRITE_FILES : ONLY_NEW_FILES; + try (Writer w = Files.newBufferedWriter(file, fileOptions); PrintWriter out = new PrintWriter(w)) { new IcuTextWriter(icuData).writeTo(out, header); } diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java index 6d2d600465b..5aef1d9f036 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverter.java @@ -33,7 +33,6 @@ import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; -import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; @@ -225,7 +224,7 @@ public final class LdmlConverter { this.localeTransformer = RegexTransformer.fromConfigLines(readLinesFromResource("/ldml2icu_locale.txt"), IcuFunctions.CONTEXT_TRANSFORM_INDEX_FN); - this.fileHeader = ImmutableList.copyOf(readLinesFromResource("/ldml2icu_header.txt")); + this.fileHeader = readLinesFromResource("/ldml2icu_header.txt"); } private void convertAll() { @@ -237,9 +236,9 @@ public final class LdmlConverter { } } - private static List readLinesFromResource(String name) { + private static ImmutableList readLinesFromResource(String name) { try (InputStream in = LdmlConverter.class.getResourceAsStream(name)) { - return CharStreams.readLines(new InputStreamReader(in)); + return ImmutableList.copyOf(CharStreams.readLines(new InputStreamReader(in))); } catch (IOException e) { throw new RuntimeException("cannot read resource: " + name, e); } @@ -336,7 +335,7 @@ public final class LdmlConverter { if (!splitData.getPaths().isEmpty() || isBaseLanguage || dir.includeEmpty()) { splitData.setVersion(CldrDataSupplier.getCldrVersionString()); - write(splitData, outDir); + write(splitData, outDir, false); writtenLocaleIds.put(dir, id); } } @@ -472,7 +471,7 @@ public final class LdmlConverter { case TRANSFORMS: Path transformDir = createDirectory(config.getOutputDir().resolve("translit")); - write(TransformsMapper.process(src, transformDir, fileHeader), transformDir); + write(TransformsMapper.process(src, transformDir, fileHeader), transformDir, false); break; case KEY_TYPE_DATA: @@ -502,7 +501,9 @@ public final class LdmlConverter { private void writeAliasFile(String srcId, String destId, Path dir) { IcuData icuData = new IcuData(srcId, true); icuData.add(RB_ALIAS, destId); - write(icuData, dir); + // Allow overwrite for aliases since some are "forced" and overwrite existing targets. + // TODO: Maybe tighten this up so only forced aliases for existing targets are overwritten. + write(icuData, dir, true); } private void writeEmptyFile(String id, Path dir, Collection aliasTargets) { @@ -516,16 +517,16 @@ public final class LdmlConverter { // which is itself not in the set of written ICU files. An "indirect alias target". icuData.setVersion(CldrDataSupplier.getCldrVersionString()); } - write(icuData, dir); + write(icuData, dir, false); } private void write(IcuData icuData, String dir) { - write(icuData, config.getOutputDir().resolve(dir)); + write(icuData, config.getOutputDir().resolve(dir), false); } - private void write(IcuData icuData, Path dir) { + private void write(IcuData icuData, Path dir, boolean allowOverwrite) { createDirectory(dir); - IcuTextWriter.writeToFile(icuData, dir, fileHeader); + IcuTextWriter.writeToFile(icuData, dir, fileHeader, allowOverwrite); } private Path createDirectory(Path dir) { diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverterConfig.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverterConfig.java index de7252c5182..d319527c593 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverterConfig.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/LdmlConverterConfig.java @@ -42,7 +42,7 @@ public interface LdmlConverterConfig { } /** Returns the relative output directory name. */ - String getOutputDir() { + public String getOutputDir() { return dirName; } @@ -51,7 +51,7 @@ public interface LdmlConverterConfig { * the supported set of locales for the "service" provided by the data in that * directory). */ - // TODO: Document why there's a difference between directories for empty directories. + // TODO: Document why there's a difference between directories for empty files. boolean includeEmpty() { return includeEmpty; } diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/CleanOutputDirectoryTask.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/CleanOutputDirectoryTask.java new file mode 100644 index 00000000000..ddd34ea9c8a --- /dev/null +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/ant/CleanOutputDirectoryTask.java @@ -0,0 +1,281 @@ +// © 2019 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +package org.unicode.icu.tool.cldrtoicu.ant; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static java.nio.file.LinkOption.NOFOLLOW_LINKS; +import static java.util.stream.Collectors.joining; +import static java.util.stream.Collectors.partitioningBy; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.UncheckedIOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.tools.ant.BuildException; +import org.apache.tools.ant.Task; +import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir; + +import com.google.common.base.CharMatcher; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Iterables; +import com.google.common.io.CharStreams; + +// Note: Auto-magical Ant methods are listed as "unused" by IDEs, unless the warning is suppressed. +public final class CleanOutputDirectoryTask extends Task { + private static final ImmutableSet ALLOWED_DIRECTORIES = + Stream + .concat( + Stream.of("misc", "translit"), + Arrays.stream(IcuLocaleDir.values()).map(IcuLocaleDir::getOutputDir)) + .sorted() + .collect(toImmutableSet()); + + private static final CharMatcher NOT_WHITESPACE = CharMatcher.whitespace().negate(); + + private Path root = null; + private boolean forceDelete = false; + private final List outputDirs = new ArrayList<>(); + private final ImmutableList headerLines; + + public CleanOutputDirectoryTask() { + // TODO: Consider passing in header lines via Ant? + this.headerLines = readLinesFromResource("/ldml2icu_header.txt"); + } + + public static final class Retain extends Task { + private Path path = null; + + // Don't use "Path" for the argument type because that always makes an absolute path (e.g. + // relative to the working directory for the Ant task). We want relative paths. + @SuppressWarnings("unused") + public void setPath(String path) { + Path p = Paths.get(path).normalize(); + checkBuild(!p.isAbsolute() && !p.startsWith(".."), "invalid path: %s", path); + this.path = p; + } + + @Override + public void init() throws BuildException { + checkBuild(path != null, "missing 'path' attribute"); + } + } + + public static final class Dir extends Task { + private String name; + private final Set retained = new HashSet<>(); + + @SuppressWarnings("unused") + public void setName(String name) { + checkBuild(ALLOWED_DIRECTORIES.contains(name), + "unknown directory name '%s'; allowed values: %s", name, ALLOWED_DIRECTORIES); + this.name = name; + } + + @SuppressWarnings("unused") + public void addConfiguredRetain(Retain retain) { + retained.add(retain.path); + } + + @Override + public void init() throws BuildException { + checkBuild(name != null, "missing 'name' attribute"); + } + } + + @SuppressWarnings("unused") + public void setRoot(Path root) { + this.root = root; + } + + @SuppressWarnings("unused") + public void setForceDelete(boolean forceDelete) { + this.forceDelete = forceDelete; + } + + @SuppressWarnings("unused") + public void addConfiguredDir(Dir dir) { + outputDirs.add(dir); + } + + @Override + public void execute() throws BuildException { + checkBuild(root != null, "missing 'root' attribute"); + checkBuild(!outputDirs.isEmpty(), "missing elements"); + + if (!Files.exists(root)) { + log("Root directory '" + root + "' does not exist (nothing to clean)"); + return; + } + checkBuild(Files.isDirectory(root), "specified root '%s' is not a directory", root); + + Set autogenFiles = new TreeSet<>(); + Set unknownFiles = new TreeSet<>(); + for (Dir dirInfo : outputDirs) { + Path dirPath = root.resolve(dirInfo.name); + if (!Files.exists(dirPath)) { + continue; + } + checkBuild(Files.isDirectory(dirPath), "'%s' is not a directory", dirPath); + + // Note: For now we just walk the immediate contents of each output directory and don't + // attempt to recursively process things. Only a couple of output directories have + // sub-directories anyway, and we never write files into them anyway. + try (Stream files = Files.list(dirPath)) { + Map> map = files + .filter(p -> couldDelete(p, dirPath, dirInfo)) + .parallel() + .collect(partitioningBy(this::wasAutoGenerated)); + unknownFiles.addAll(map.get(false)); + autogenFiles.addAll(map.get(true)); + } catch (IOException e) { + throw new BuildException("Error processing directory: " + dirPath, e); + } + } + + if (!unknownFiles.isEmpty() && !forceDelete) { + // If there are NO safe files, then something weird is going on (perhaps a change in + // the header file). + if (autogenFiles.isEmpty()) { + log("Error determining 'safe' files for deletion (no auto-generated files found)."); + log(unknownFiles.size() + " files would be deleted for 'clean' task"); + logPartioned(unknownFiles); + log("Set '-DforceDelete=true' to delete all files not listed in" + + " ."); + } else { + // A mix of safe and unsafe files is weird, but in this case it should be a + // relatively small number of files (e.g. adding a new manually maintained file or + // accidental editing of header lines). + log("Unknown files exist which cannot be determined to be auto-generated"); + log("Files:"); + logPartioned(unknownFiles); + log(String.format("%d unknown files or directories found", unknownFiles.size())); + log("Set '-DforceDelete=true' to delete these files, or add them to" + + " ."); + } + throw new BuildException("Unsafe files cannot be deleted"); + } + if (!unknownFiles.isEmpty()) { + checkState(forceDelete, "unexpected flag state (forceDelete should be true here)"); + List filesToDelete = + unknownFiles.stream() + .filter(p -> !Files.isDirectory(p)) + .collect(Collectors.toList()); + log(String.format("Force deleting %,d files...\n", filesToDelete.size())); + deleteAllFiles(filesToDelete); + + List unknownDirs = + unknownFiles.stream() + .filter(p -> Files.isDirectory(p)) + .collect(Collectors.toList()); + if (!unknownDirs.isEmpty()) { + log("Add the following directories to the task:"); + logPartioned(unknownDirs); + throw new BuildException("Unsafe directories cannot be deleted"); + } + } + if (!autogenFiles.isEmpty()) { + log(String.format("Deleting %,d auto-generated files...\n", autogenFiles.size())); + deleteAllFiles(autogenFiles); + } + } + + private void logPartioned(Iterable files) { + Iterables.partition(files, 5) + .forEach(f -> log( + f.stream().map(p -> root.relativize(p).toString()).collect(joining(", ")))); + } + + private boolean couldDelete(Path path, Path dir, Dir dirInfo) { + return !dirInfo.retained.contains(dir.relativize(path)); + } + + private boolean wasAutoGenerated(Path path) { + if (!Files.isRegularFile(path, NOFOLLOW_LINKS)) { + // Directories, symbolic links, devices etc. + return false; + } + try (BufferedReader r = Files.newBufferedReader(path)) { + // A byte-order-mark (BOM) is added to ICU data files, but not JSON deps files, so just + // treat it as optional everywhere (it's not the important thing we check here). + r.mark(1); + int maybeByteOrderMark = r.read(); + if (maybeByteOrderMark != '\uFEFF') { + // Also reset if the file was empty, but that should be harmless. + r.reset(); + } + for (String headerLine : headerLines) { + String line = r.readLine(); + if (line == null) { + return false; + } + int headerStart = skipComment(line); + if (headerStart < 0 + || !line.regionMatches(headerStart, headerLine, 0, headerLine.length())) { + return false; + } + } + return true; + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + private static int skipComment(String line) { + if (line.startsWith("#")) { + return toCommentStart(line, 1); + } else if (line.startsWith("//")) { + return toCommentStart(line, 2); + } + return -1; + } + + // Not just "index-of" since a comment start followed by only whitespace is NOT a failure to + // find a comment (since the header might have an empty line in it, which should be okay). + private static int toCommentStart(String line, int offset) { + int index = NOT_WHITESPACE.indexIn(line, offset); + return index >= 0 ? index : line.length(); + } + + private static void deleteAllFiles(Iterable files) { + for (Path p : files) { + try { + // This is a code error, since only files should be passed here. + checkArgument(!Files.isDirectory(p), "Cannot delete directories: %s", p); + Files.deleteIfExists(p); + } catch (IOException e) { + throw new BuildException("Error deleting file: " + p, e); + } + } + } + + private static void checkBuild(boolean condition, String message, Object... args) { + if (!condition) { + throw new BuildException(String.format(message, args)); + } + } + + private static ImmutableList readLinesFromResource(String name) { + try (InputStream in = CleanOutputDirectoryTask.class.getResourceAsStream(name)) { + return ImmutableList.copyOf(CharStreams.readLines(new InputStreamReader(in))); + } catch (IOException e) { + throw new RuntimeException("cannot read resource: " + name, e); + } + } +} diff --git a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapper.java b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapper.java index e0844e653dd..ece1aaa62e2 100644 --- a/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapper.java +++ b/tools/cldr/cldr-to-icu/src/main/java/org/unicode/icu/tool/cldrtoicu/mapper/TransformsMapper.java @@ -4,8 +4,7 @@ package org.unicode.icu.tool.cldrtoicu.mapper; import static com.google.common.base.CharMatcher.whitespace; import static com.google.common.base.Preconditions.checkNotNull; -import static java.nio.file.StandardOpenOption.CREATE; -import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING; +import static java.nio.file.StandardOpenOption.CREATE_NEW; import static org.unicode.cldr.api.AttributeKey.keyOf; import static org.unicode.cldr.api.CldrData.PathOrder.DTD; import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL; @@ -86,7 +85,8 @@ public final class TransformsMapper { Function fileWriterFn = p -> { Path file = ruleFileOutputDir.resolve(p); try { - return new PrintWriter(Files.newBufferedWriter(file, CREATE, TRUNCATE_EXISTING)); + // Specify "CREATE_NEW" since we don't want to overwrite any existing files. + return new PrintWriter(Files.newBufferedWriter(file, CREATE_NEW)); } catch (IOException e) { throw new RuntimeException("error opening file: " + file, e); }