ICU-20693 Adding support for deletion of existing files prior to ICU data generation.

This commit is contained in:
David Beaumont 2019-09-12 13:58:39 +02:00 committed by David Beaumont
parent 43826cccba
commit 841cd7fccc
6 changed files with 368 additions and 20 deletions

View file

@ -10,7 +10,10 @@
<!-- TODO: Add things like copying of a template directory and deleting previous files
(perhaps always generate into a temporary directory and copy back to avoid having
inconsistent state when the conversion is cancelled). -->
<project name="Convert" default="convert" basedir=".">
<project name="Convert" default="all" basedir=".">
<target name="all" depends="init-args, prepare-jar, clean, convert"/>
<!-- Initialize the properties which were not already set on the command line. -->
<target name="init-args">
<property environment="env"/>
@ -66,8 +69,13 @@
since some supplemental data is also written to the curr/ directory.
See LdmlConverter.OutputType for the full list of valid types. -->
<!-- TODO: Find out what people actually want here and switch to that. -->
<!-- TODO: Find out what common use cases are and use them. -->
<property name="outputTypes" value=""/>
<!-- Override to force the 'clean' task to delete files it cannot determine to be
auto-generated by this tool. This is useful if the file header changes since
the heading is what's used to recognize auto-generated files. -->
<property name="forceDelete" value="false"/>
</target>
<!-- Build a standalone JAR which is called by Ant (and which avoids needing to mess
@ -322,4 +330,51 @@
locales="xx,yy_ZZ"/> -->
</convert>
</target>
<target name="clean" depends="init-args, prepare-jar">
<taskdef name="outputDirectories" classname="org.unicode.icu.tool.cldrtoicu.ant.CleanOutputDirectoryTask">
<classpath>
<pathelement path="target/cldr-to-icu-1.0-SNAPSHOT-jar-with-dependencies.jar"/>
</classpath>
</taskdef>
<outputDirectories root="${outDir}" forceDelete="${forceDelete}">
<dir name="brkitr">
<retain path="dictionaries"/>
<retain path="rules"/>
</dir>
<dir name="coll">
<retain path="de__PHONEBOOK.txt"/>
<retain path="de_.txt"/>
<retain path="es__TRADITIONAL.txt"/>
<retain path="es_.txt"/>
</dir>
<dir name="curr"/>
<dir name="lang"/>
<dir name="locales"/>
<dir name="misc">
<retain path="currencyNumericCodes.txt"/>
<retain path="icudata.rc"/>
<retain path="icustd.txt"/>
<retain path="icuver.txt"/>
<retain path="langInfo.txt"/>
<retain path="zoneinfo64.txt"/>
<!-- This file should be removed before the next release. -->
<retain path="miscfiles.mk"/>
</dir>
<dir name="rbnf"/>
<dir name="region"/>
<dir name="translit">
<retain path="en.txt"/>
<retain path="el.txt"/>
<!-- This file should be removed before the next release. -->
<retain path="trnsfiles.mk"/>
</dir>
<dir name="unit"/>
<dir name="zone">
<retain path="tzdbNames.txt"/>
</dir>
</outputDirectories>
</target>
</project>

View file

@ -3,12 +3,16 @@
package org.unicode.icu.tool.cldrtoicu;
import static com.google.common.base.Preconditions.checkNotNull;
import static java.nio.file.StandardOpenOption.CREATE;
import static java.nio.file.StandardOpenOption.CREATE_NEW;
import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING;
import static java.util.stream.Collectors.joining;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.nio.file.Files;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import java.util.List;
import java.util.regex.Matcher;
@ -34,11 +38,18 @@ final class IcuTextWriter {
private static final Pattern STRING_ESCAPE = Pattern.compile("(?!')\\\\\\\\(?!')");
private static final Pattern QUOTE_ESCAPE = Pattern.compile("\\\\?\"");
private static final OpenOption[] ONLY_NEW_FILES = { CREATE_NEW };
private static final OpenOption[] OVERWRITE_FILES = { CREATE, TRUNCATE_EXISTING };
/** Write a file in ICU data format with the specified header. */
static void writeToFile(IcuData icuData, Path outDir, List<String> header) {
static void writeToFile(
IcuData icuData, Path outDir, List<String> header, boolean allowOverwrite) {
try {
Files.createDirectories(outDir);
try (Writer w = Files.newBufferedWriter(outDir.resolve(icuData.getName() + ".txt"));
Path file = outDir.resolve(icuData.getName() + ".txt");
OpenOption[] fileOptions = allowOverwrite ? OVERWRITE_FILES : ONLY_NEW_FILES;
try (Writer w = Files.newBufferedWriter(file, fileOptions);
PrintWriter out = new PrintWriter(w)) {
new IcuTextWriter(icuData).writeTo(out, header);
}

View file

@ -33,7 +33,6 @@ import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
@ -225,7 +224,7 @@ public final class LdmlConverter {
this.localeTransformer =
RegexTransformer.fromConfigLines(readLinesFromResource("/ldml2icu_locale.txt"),
IcuFunctions.CONTEXT_TRANSFORM_INDEX_FN);
this.fileHeader = ImmutableList.copyOf(readLinesFromResource("/ldml2icu_header.txt"));
this.fileHeader = readLinesFromResource("/ldml2icu_header.txt");
}
private void convertAll() {
@ -237,9 +236,9 @@ public final class LdmlConverter {
}
}
private static List<String> readLinesFromResource(String name) {
private static ImmutableList<String> readLinesFromResource(String name) {
try (InputStream in = LdmlConverter.class.getResourceAsStream(name)) {
return CharStreams.readLines(new InputStreamReader(in));
return ImmutableList.copyOf(CharStreams.readLines(new InputStreamReader(in)));
} catch (IOException e) {
throw new RuntimeException("cannot read resource: " + name, e);
}
@ -336,7 +335,7 @@ public final class LdmlConverter {
if (!splitData.getPaths().isEmpty() || isBaseLanguage || dir.includeEmpty()) {
splitData.setVersion(CldrDataSupplier.getCldrVersionString());
write(splitData, outDir);
write(splitData, outDir, false);
writtenLocaleIds.put(dir, id);
}
}
@ -472,7 +471,7 @@ public final class LdmlConverter {
case TRANSFORMS:
Path transformDir = createDirectory(config.getOutputDir().resolve("translit"));
write(TransformsMapper.process(src, transformDir, fileHeader), transformDir);
write(TransformsMapper.process(src, transformDir, fileHeader), transformDir, false);
break;
case KEY_TYPE_DATA:
@ -502,7 +501,9 @@ public final class LdmlConverter {
private void writeAliasFile(String srcId, String destId, Path dir) {
IcuData icuData = new IcuData(srcId, true);
icuData.add(RB_ALIAS, destId);
write(icuData, dir);
// Allow overwrite for aliases since some are "forced" and overwrite existing targets.
// TODO: Maybe tighten this up so only forced aliases for existing targets are overwritten.
write(icuData, dir, true);
}
private void writeEmptyFile(String id, Path dir, Collection<String> aliasTargets) {
@ -516,16 +517,16 @@ public final class LdmlConverter {
// which is itself not in the set of written ICU files. An "indirect alias target".
icuData.setVersion(CldrDataSupplier.getCldrVersionString());
}
write(icuData, dir);
write(icuData, dir, false);
}
private void write(IcuData icuData, String dir) {
write(icuData, config.getOutputDir().resolve(dir));
write(icuData, config.getOutputDir().resolve(dir), false);
}
private void write(IcuData icuData, Path dir) {
private void write(IcuData icuData, Path dir, boolean allowOverwrite) {
createDirectory(dir);
IcuTextWriter.writeToFile(icuData, dir, fileHeader);
IcuTextWriter.writeToFile(icuData, dir, fileHeader, allowOverwrite);
}
private Path createDirectory(Path dir) {

View file

@ -42,7 +42,7 @@ public interface LdmlConverterConfig {
}
/** Returns the relative output directory name. */
String getOutputDir() {
public String getOutputDir() {
return dirName;
}
@ -51,7 +51,7 @@ public interface LdmlConverterConfig {
* the supported set of locales for the "service" provided by the data in that
* directory).
*/
// TODO: Document why there's a difference between directories for empty directories.
// TODO: Document why there's a difference between directories for empty files.
boolean includeEmpty() {
return includeEmpty;
}

View file

@ -0,0 +1,281 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.ant;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.ImmutableSet.toImmutableSet;
import static java.nio.file.LinkOption.NOFOLLOW_LINKS;
import static java.util.stream.Collectors.joining;
import static java.util.stream.Collectors.partitioningBy;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.tools.ant.BuildException;
import org.apache.tools.ant.Task;
import org.unicode.icu.tool.cldrtoicu.LdmlConverterConfig.IcuLocaleDir;
import com.google.common.base.CharMatcher;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.io.CharStreams;
// Note: Auto-magical Ant methods are listed as "unused" by IDEs, unless the warning is suppressed.
public final class CleanOutputDirectoryTask extends Task {
private static final ImmutableSet<String> ALLOWED_DIRECTORIES =
Stream
.concat(
Stream.of("misc", "translit"),
Arrays.stream(IcuLocaleDir.values()).map(IcuLocaleDir::getOutputDir))
.sorted()
.collect(toImmutableSet());
private static final CharMatcher NOT_WHITESPACE = CharMatcher.whitespace().negate();
private Path root = null;
private boolean forceDelete = false;
private final List<Dir> outputDirs = new ArrayList<>();
private final ImmutableList<String> headerLines;
public CleanOutputDirectoryTask() {
// TODO: Consider passing in header lines via Ant?
this.headerLines = readLinesFromResource("/ldml2icu_header.txt");
}
public static final class Retain extends Task {
private Path path = null;
// Don't use "Path" for the argument type because that always makes an absolute path (e.g.
// relative to the working directory for the Ant task). We want relative paths.
@SuppressWarnings("unused")
public void setPath(String path) {
Path p = Paths.get(path).normalize();
checkBuild(!p.isAbsolute() && !p.startsWith(".."), "invalid path: %s", path);
this.path = p;
}
@Override
public void init() throws BuildException {
checkBuild(path != null, "missing 'path' attribute");
}
}
public static final class Dir extends Task {
private String name;
private final Set<Path> retained = new HashSet<>();
@SuppressWarnings("unused")
public void setName(String name) {
checkBuild(ALLOWED_DIRECTORIES.contains(name),
"unknown directory name '%s'; allowed values: %s", name, ALLOWED_DIRECTORIES);
this.name = name;
}
@SuppressWarnings("unused")
public void addConfiguredRetain(Retain retain) {
retained.add(retain.path);
}
@Override
public void init() throws BuildException {
checkBuild(name != null, "missing 'name' attribute");
}
}
@SuppressWarnings("unused")
public void setRoot(Path root) {
this.root = root;
}
@SuppressWarnings("unused")
public void setForceDelete(boolean forceDelete) {
this.forceDelete = forceDelete;
}
@SuppressWarnings("unused")
public void addConfiguredDir(Dir dir) {
outputDirs.add(dir);
}
@Override
public void execute() throws BuildException {
checkBuild(root != null, "missing 'root' attribute");
checkBuild(!outputDirs.isEmpty(), "missing <dir> elements");
if (!Files.exists(root)) {
log("Root directory '" + root + "' does not exist (nothing to clean)");
return;
}
checkBuild(Files.isDirectory(root), "specified root '%s' is not a directory", root);
Set<Path> autogenFiles = new TreeSet<>();
Set<Path> unknownFiles = new TreeSet<>();
for (Dir dirInfo : outputDirs) {
Path dirPath = root.resolve(dirInfo.name);
if (!Files.exists(dirPath)) {
continue;
}
checkBuild(Files.isDirectory(dirPath), "'%s' is not a directory", dirPath);
// Note: For now we just walk the immediate contents of each output directory and don't
// attempt to recursively process things. Only a couple of output directories have
// sub-directories anyway, and we never write files into them anyway.
try (Stream<Path> files = Files.list(dirPath)) {
Map<Boolean, List<Path>> map = files
.filter(p -> couldDelete(p, dirPath, dirInfo))
.parallel()
.collect(partitioningBy(this::wasAutoGenerated));
unknownFiles.addAll(map.get(false));
autogenFiles.addAll(map.get(true));
} catch (IOException e) {
throw new BuildException("Error processing directory: " + dirPath, e);
}
}
if (!unknownFiles.isEmpty() && !forceDelete) {
// If there are NO safe files, then something weird is going on (perhaps a change in
// the header file).
if (autogenFiles.isEmpty()) {
log("Error determining 'safe' files for deletion (no auto-generated files found).");
log(unknownFiles.size() + " files would be deleted for 'clean' task");
logPartioned(unknownFiles);
log("Set '-DforceDelete=true' to delete all files not listed in"
+ " <outputDirectories>.");
} else {
// A mix of safe and unsafe files is weird, but in this case it should be a
// relatively small number of files (e.g. adding a new manually maintained file or
// accidental editing of header lines).
log("Unknown files exist which cannot be determined to be auto-generated");
log("Files:");
logPartioned(unknownFiles);
log(String.format("%d unknown files or directories found", unknownFiles.size()));
log("Set '-DforceDelete=true' to delete these files, or add them to"
+ " <outputDirectories>.");
}
throw new BuildException("Unsafe files cannot be deleted");
}
if (!unknownFiles.isEmpty()) {
checkState(forceDelete, "unexpected flag state (forceDelete should be true here)");
List<Path> filesToDelete =
unknownFiles.stream()
.filter(p -> !Files.isDirectory(p))
.collect(Collectors.toList());
log(String.format("Force deleting %,d files...\n", filesToDelete.size()));
deleteAllFiles(filesToDelete);
List<Path> unknownDirs =
unknownFiles.stream()
.filter(p -> Files.isDirectory(p))
.collect(Collectors.toList());
if (!unknownDirs.isEmpty()) {
log("Add the following directories to the <outputDirectories> task:");
logPartioned(unknownDirs);
throw new BuildException("Unsafe directories cannot be deleted");
}
}
if (!autogenFiles.isEmpty()) {
log(String.format("Deleting %,d auto-generated files...\n", autogenFiles.size()));
deleteAllFiles(autogenFiles);
}
}
private void logPartioned(Iterable<Path> files) {
Iterables.partition(files, 5)
.forEach(f -> log(
f.stream().map(p -> root.relativize(p).toString()).collect(joining(", "))));
}
private boolean couldDelete(Path path, Path dir, Dir dirInfo) {
return !dirInfo.retained.contains(dir.relativize(path));
}
private boolean wasAutoGenerated(Path path) {
if (!Files.isRegularFile(path, NOFOLLOW_LINKS)) {
// Directories, symbolic links, devices etc.
return false;
}
try (BufferedReader r = Files.newBufferedReader(path)) {
// A byte-order-mark (BOM) is added to ICU data files, but not JSON deps files, so just
// treat it as optional everywhere (it's not the important thing we check here).
r.mark(1);
int maybeByteOrderMark = r.read();
if (maybeByteOrderMark != '\uFEFF') {
// Also reset if the file was empty, but that should be harmless.
r.reset();
}
for (String headerLine : headerLines) {
String line = r.readLine();
if (line == null) {
return false;
}
int headerStart = skipComment(line);
if (headerStart < 0
|| !line.regionMatches(headerStart, headerLine, 0, headerLine.length())) {
return false;
}
}
return true;
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
private static int skipComment(String line) {
if (line.startsWith("#")) {
return toCommentStart(line, 1);
} else if (line.startsWith("//")) {
return toCommentStart(line, 2);
}
return -1;
}
// Not just "index-of" since a comment start followed by only whitespace is NOT a failure to
// find a comment (since the header might have an empty line in it, which should be okay).
private static int toCommentStart(String line, int offset) {
int index = NOT_WHITESPACE.indexIn(line, offset);
return index >= 0 ? index : line.length();
}
private static void deleteAllFiles(Iterable<Path> files) {
for (Path p : files) {
try {
// This is a code error, since only files should be passed here.
checkArgument(!Files.isDirectory(p), "Cannot delete directories: %s", p);
Files.deleteIfExists(p);
} catch (IOException e) {
throw new BuildException("Error deleting file: " + p, e);
}
}
}
private static void checkBuild(boolean condition, String message, Object... args) {
if (!condition) {
throw new BuildException(String.format(message, args));
}
}
private static ImmutableList<String> readLinesFromResource(String name) {
try (InputStream in = CleanOutputDirectoryTask.class.getResourceAsStream(name)) {
return ImmutableList.copyOf(CharStreams.readLines(new InputStreamReader(in)));
} catch (IOException e) {
throw new RuntimeException("cannot read resource: " + name, e);
}
}
}

View file

@ -4,8 +4,7 @@ package org.unicode.icu.tool.cldrtoicu.mapper;
import static com.google.common.base.CharMatcher.whitespace;
import static com.google.common.base.Preconditions.checkNotNull;
import static java.nio.file.StandardOpenOption.CREATE;
import static java.nio.file.StandardOpenOption.TRUNCATE_EXISTING;
import static java.nio.file.StandardOpenOption.CREATE_NEW;
import static org.unicode.cldr.api.AttributeKey.keyOf;
import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
import static org.unicode.cldr.api.CldrDataType.SUPPLEMENTAL;
@ -86,7 +85,8 @@ public final class TransformsMapper {
Function<Path, PrintWriter> fileWriterFn = p -> {
Path file = ruleFileOutputDir.resolve(p);
try {
return new PrintWriter(Files.newBufferedWriter(file, CREATE, TRUNCATE_EXISTING));
// Specify "CREATE_NEW" since we don't want to overwrite any existing files.
return new PrintWriter(Files.newBufferedWriter(file, CREATE_NEW));
} catch (IOException e) {
throw new RuntimeException("error opening file: " + file, e);
}