ICU-13326 gennorm2 --combined option to write the combined data of the input files; and gennorm2 minus operator to write the diffs of the combined data from two sets of input files

X-SVN-Rev: 40349
This commit is contained in:
Markus Scherer 2017-08-23 23:33:47 +00:00
parent d362b18924
commit d287dbbe30
4 changed files with 253 additions and 9 deletions

View file

@ -61,6 +61,7 @@ enum {
OUTPUT_FILENAME,
UNICODE_VERSION,
WRITE_C_SOURCE,
WRITE_COMBINED_DATA,
OPT_FAST
};
@ -73,6 +74,7 @@ static UOption options[]={
UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
UOPTION_DEF("csource", '\1', UOPT_NO_ARG),
UOPTION_DEF("combined", '\1', UOPT_NO_ARG),
UOPTION_DEF("fast", '\1', UOPT_NO_ARG)
};
@ -96,17 +98,22 @@ main(int argc, char* argv[]) {
if( argc<2 ||
options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
) {
/*
* Broken into chunks because the C89 standard says the minimum
* required supported string length is 509 bytes.
*/
fprintf(stderr,
"Usage: %s [-options] infiles+ -o outputfilename\n"
"\n"
"Reads the infiles with normalization data and\n"
"creates a binary or C source file (outputfilename) with the data.\n"
"creates a binary file, or a C source file (--csource), with the data,\n"
"or writes a data file with the combined data (--combined).\n"
"See http://userguide.icu-project.org/transforms/normalization#TOC-Data-File-Syntax\n"
"\n"
"Alternate usage: %s [-options] a.txt b.txt minus p.txt q.txt -o outputfilename\n"
"\n"
"Computes the difference of (a, b) minus (p, q) and writes the diff data\n"
"in input-file syntax to the outputfilename.\n"
"It is then possible to build (p, q, diff) to get the same data as (a, b).\n"
"(Useful for computing minimal incremental mapping data files.)\n"
"\n",
argv[0]);
argv[0], argv[0]);
fprintf(stderr,
"Options:\n"
"\t-h or -? or --help this usage text\n"
@ -116,7 +123,9 @@ main(int argc, char* argv[]) {
fprintf(stderr,
"\t-s or --sourcedir source directory, followed by the path\n"
"\t-o or --output output filename\n"
"\t --csource writes a C source file with initializers\n");
"\t --csource writes a C source file with initializers\n"
"\t --combined writes a .txt file (input-file syntax) with the\n"
"\t combined data from all of the input files\n");
fprintf(stderr,
"\t --fast optimize the data for fast normalization,\n"
"\t which might increase its size (Writes fully decomposed\n"
@ -144,7 +153,10 @@ main(int argc, char* argv[]) {
#else
LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode), errorCode);
LocalPointer<Normalizer2DataBuilder> b1(new Normalizer2DataBuilder(errorCode), errorCode);
LocalPointer<Normalizer2DataBuilder> b2;
LocalPointer<Normalizer2DataBuilder> diff;
Normalizer2DataBuilder *builder = b1.getAlias();
errorCode.assertSuccess();
if(options[UNICODE_VERSION].doesOccur) {
@ -166,8 +178,29 @@ main(int argc, char* argv[]) {
pathLength=filename.length();
}
bool doMinus = false;
for(int i=1; i<argc; ++i) {
printf("gennorm2: processing %s\n", argv[i]);
if(strcmp(argv[i], "minus") == 0) {
if(doMinus) {
fprintf(stderr, "gennorm2 error: only one 'minus' can be specified\n");
exit(U_ILLEGAL_ARGUMENT_ERROR);
}
// Data from previous input files has been collected in b1.
// Collect data from further input files in b2.
b2.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
diff.adoptInsteadAndCheckErrorCode(new Normalizer2DataBuilder(errorCode), errorCode);
errorCode.assertSuccess();
builder = b2.getAlias();
if(options[UNICODE_VERSION].doesOccur) {
builder->setUnicodeVersion(options[UNICODE_VERSION].value);
}
if(options[OPT_FAST].doesOccur) {
builder->setOptimization(Normalizer2DataBuilder::OPTIMIZE_FAST);
}
doMinus = true;
continue;
}
filename.append(argv[i], errorCode);
LocalStdioFilePointer f(fopen(filename.data(), "r"));
if(f==NULL) {
@ -179,7 +212,12 @@ main(int argc, char* argv[]) {
filename.truncate(pathLength);
}
if(options[WRITE_C_SOURCE].doesOccur) {
if(doMinus) {
Normalizer2DataBuilder::computeDiff(*b1, *b2, *diff);
diff->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ true);
} else if(options[WRITE_COMBINED_DATA].doesOccur) {
builder->writeDataFile(options[OUTPUT_FILENAME].value, /* writeRemoved= */ false);
} else if(options[WRITE_C_SOURCE].doesOccur) {
builder->writeCSourceFile(options[OUTPUT_FILENAME].value);
} else {
builder->writeBinaryFile(options[OUTPUT_FILENAME].value);

View file

@ -30,7 +30,9 @@
#include "unicode/localpointer.h"
#include "unicode/putil.h"
#include "unicode/udata.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/usetiter.h"
#include "unicode/ustring.h"
#include "charstr.h"
#include "extradata.h"
@ -146,6 +148,7 @@ void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
/**
 * Sets the canonical combining class value for one code point.
 * Also records the code point in norms.ccSet so that later passes
 * (writeDataFile(), computeDiff()) can enumerate the code points with a ccc entry.
 */
void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
    Norm *norm=norms.createNorm(c);
    norm->cc=cc;
    norms.ccSet.add(c);
}
static UBool isWellFormed(const UnicodeString &s) {
@ -166,6 +169,7 @@ void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m)
p->mapping=new UnicodeString(m);
p->mappingType=Norm::ONE_WAY;
p->setMappingCP();
norms.mappingSet.add(c);
}
void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
@ -195,12 +199,14 @@ void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString
p->mapping=new UnicodeString(m);
p->mappingType=Norm::ROUND_TRIP;
p->mappingCP=U_SENTINEL;
norms.mappingSet.add(c);
}
/**
 * Marks the code point as explicitly having no mapping (Norm::REMOVED)
 * and records it in norms.mappingSet.
 */
void Normalizer2DataBuilder::removeMapping(UChar32 c) {
    // Deliberately createNorm(c) rather than getNorm(c):
    // the non-mapping must be recorded, and conflicting data must be detectable.
    Norm *norm=norms.createNorm(c);
    norm=checkNormForMapping(norm, c);
    norm->mappingType=Norm::REMOVED;
    norms.mappingSet.add(c);
}
UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer) const {
@ -832,6 +838,198 @@ Normalizer2DataBuilder::writeCSourceFile(const char *filename) {
fclose(f);
}
namespace {
bool equalStrings(const UnicodeString *s1, const UnicodeString *s2) {
if(s1 == nullptr) {
return s2 == nullptr;
} else if(s2 == nullptr) {
return false;
} else {
return *s1 == *s2;
}
}
const char *typeChars = "?-=>";
void writeMapping(FILE *f, const UnicodeString *m) {
if(m != nullptr && !m->isEmpty()) {
int32_t i = 0;
UChar32 c = m->char32At(i);
fprintf(f, "%04lX", (long)c);
while((i += U16_LENGTH(c)) < m->length()) {
c = m->char32At(i);
fprintf(f, " %04lX", (long)c);
}
}
fputs("\n", f);
}
} // namespace
/**
 * Writes the data collected in this builder to a text file in the
 * gennorm2 input-file syntax, without calling processData() first.
 *
 * Output, in this order:
 * - "* Unicode <version>" if a Unicode version has been set (any field non-zero)
 * - one line per run of adjacent code points with the same non-zero ccc:
 *   "XXXX:cc" or "XXXX..YYYY:cc"
 * - one line per run of adjacent code points with the same mapping type and mapping:
 *   code point or range, the type delimiter from typeChars, then the mapping
 *
 * Exits the process with U_FILE_ACCESS_ERROR if the file cannot be created
 * or if writing/closing it fails.
 *
 * @param filename path of the output file
 * @param writeRemoved if true, entries of type Norm::REMOVED are written too
 *                     (needed for diff output); if false, only types greater than
 *                     Norm::REMOVED are written
 */
void
Normalizer2DataBuilder::writeDataFile(const char *filename, bool writeRemoved) const {
    // Do not processData() before writing the input-syntax data file.
    FILE *f = fopen(filename, "w");
    if(f == nullptr) {
        fprintf(stderr, "gennorm2/writeDataFile() error: unable to create the output file %s\n",
                filename);
        exit(U_FILE_ACCESS_ERROR);
    }

    if(unicodeVersion[0] != 0 || unicodeVersion[1] != 0 ||
            unicodeVersion[2] != 0 || unicodeVersion[3] != 0) {
        char uv[U_MAX_VERSION_STRING_LENGTH];
        u_versionToString(unicodeVersion, uv);
        fprintf(f, "* Unicode %s\n\n", uv);
    }

    // Pass 1: ccc values.
    // [start, end] is the current run of adjacent code points sharing prevCC.
    // A run is flushed when the next code point does not extend it.
    // NOTE(review): runs with ccc=0 are never written, even when writeRemoved is true.
    // computeDiff() can call diff.setCC(c, 0) when only b2 has a non-zero ccc;
    // such a reset would not appear in the output -- confirm whether that is intended.
    UnicodeSetIterator ccIter(norms.ccSet);
    UChar32 start = U_SENTINEL;
    UChar32 end = U_SENTINEL;
    uint8_t prevCC = 0;
    bool done = false;
    bool didWrite = false;
    do {
        UChar32 c;
        uint8_t cc;
        if(ccIter.next() && !ccIter.isString()) {
            c = ccIter.getCodepoint();
            cc = norms.getCC(c);
        } else {
            // Past the last code point: use an out-of-range sentinel with ccc=0
            // so that the final pending run gets flushed below.
            c = 0x110000;
            cc = 0;
            done = true;
        }
        if(cc == prevCC && c == (end + 1)) {
            end = c;
        } else {
            if(prevCC != 0) {
                if(start == end) {
                    fprintf(f, "%04lX:%d\n", (long)start, (int)prevCC);
                } else {
                    fprintf(f, "%04lX..%04lX:%d\n", (long)start, (long)end, (int)prevCC);
                }
                didWrite = true;
            }
            start = end = c;
            prevCC = cc;
        }
    } while(!done);
    if(didWrite) {
        // Blank line between the ccc section and the mappings section.
        fputs("\n", f);
    }

    // Pass 2: mappings, using the same run-merging scheme as above,
    // keyed on (mapping type, mapping string).
    UnicodeSetIterator mIter(norms.mappingSet);
    start = U_SENTINEL;
    end = U_SENTINEL;
    const UnicodeString *prevMapping = nullptr;
    Norm::MappingType prevType = Norm::NONE;
    done = false;
    do {
        UChar32 c;
        const Norm *norm;
        if(mIter.next() && !mIter.isString()) {
            c = mIter.getCodepoint();
            norm = norms.getNorm(c);
        } else {
            // Sentinel past the last code point, to flush the final pending run.
            c = 0x110000;
            norm = nullptr;
            done = true;
        }
        const UnicodeString *mapping;
        Norm::MappingType type;
        if(norm == nullptr) {
            mapping = nullptr;
            type = Norm::NONE;
        } else {
            type = norm->mappingType;
            if(type == Norm::NONE) {
                mapping = nullptr;
            } else {
                mapping = norm->mapping;
            }
        }
        if(type == prevType && equalStrings(mapping, prevMapping) && c == (end + 1)) {
            end = c;
        } else {
            // With writeRemoved, everything except NONE is written;
            // otherwise REMOVED entries are skipped as well.
            if(writeRemoved ? prevType != Norm::NONE : prevType > Norm::REMOVED) {
                if(start == end) {
                    fprintf(f, "%04lX%c", (long)start, typeChars[prevType]);
                } else {
                    fprintf(f, "%04lX..%04lX%c", (long)start, (long)end, typeChars[prevType]);
                }
                writeMapping(f, prevMapping);
            }
            start = end = c;
            prevMapping = mapping;
            prevType = type;
        }
    } while(!done);

    // Do not report success for a truncated file:
    // check the stream's error flag, and the result of flushing/closing it.
    bool writeFailed = ferror(f) != 0;
    if(fclose(f) != 0) {
        writeFailed = true;
    }
    if(writeFailed) {
        fprintf(stderr, "gennorm2/writeDataFile() error: unable to write the output file %s\n",
                filename);
        exit(U_FILE_ACCESS_ERROR);
    }
}
void
Normalizer2DataBuilder::computeDiff(const Normalizer2DataBuilder &b1,
const Normalizer2DataBuilder &b2,
Normalizer2DataBuilder &diff) {
// Compute diff = b1 - b2
// so that we should be able to get b1 = b2 + diff.
if(0 != memcmp(b1.unicodeVersion, b2.unicodeVersion, U_MAX_VERSION_LENGTH)) {
memcpy(diff.unicodeVersion, b1.unicodeVersion, U_MAX_VERSION_LENGTH);
}
UnicodeSet ccSet(b1.norms.ccSet);
ccSet.addAll(b2.norms.ccSet);
UnicodeSetIterator ccIter(ccSet);
while(ccIter.next() && !ccIter.isString()) {
UChar32 c = ccIter.getCodepoint();
uint8_t cc1 = b1.norms.getCC(c);
uint8_t cc2 = b2.norms.getCC(c);
if(cc1 != cc2) {
diff.setCC(c, cc1);
}
}
UnicodeSet mSet(b1.norms.mappingSet);
mSet.addAll(b2.norms.mappingSet);
UnicodeSetIterator mIter(mSet);
while(mIter.next() && !mIter.isString()) {
UChar32 c = mIter.getCodepoint();
const Norm *norm1 = b1.norms.getNorm(c);
const Norm *norm2 = b2.norms.getNorm(c);
const UnicodeString *mapping1;
Norm::MappingType type1;
if(norm1 == nullptr || !norm1->hasMapping()) {
mapping1 = nullptr;
type1 = Norm::NONE;
} else {
mapping1 = norm1->mapping;
type1 = norm1->mappingType;
}
const UnicodeString *mapping2;
Norm::MappingType type2;
if(norm2 == nullptr || !norm2->hasMapping()) {
mapping2 = nullptr;
type2 = Norm::NONE;
} else {
mapping2 = norm2->mapping;
type2 = norm2->mappingType;
}
if(type1 == type2 && equalStrings(mapping1, mapping2)) {
// Nothing to do.
} else if(type1 == Norm::NONE) {
diff.removeMapping(c);
} else if(type1 == Norm::ROUND_TRIP) {
diff.setRoundTripMapping(c, *mapping1);
} else if(type1 == Norm::ONE_WAY) {
diff.setOneWayMapping(c, *mapping1);
}
}
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_NORMALIZATION */

View file

@ -63,6 +63,11 @@ public:
void writeBinaryFile(const char *filename);
void writeCSourceFile(const char *filename);
/**
 * Writes the data collected so far to a text file in the tool's input-file syntax,
 * without processing the data first.
 * Exits the process if the file cannot be created.
 * @param filename path of the output file
 * @param writeRemoved if true, mappings marked as removed are written as well;
 *                     if false, only actual mappings are written
 */
void writeDataFile(const char *filename, bool writeRemoved) const;
/**
 * Computes diff = b1 - b2, so that b2 plus diff should yield the same data as b1.
 * ccc values and mappings of b1 that differ from those of b2 are set in diff;
 * where b2 has a mapping and b1 has none, a removal is recorded in diff.
 * diff also receives b1's Unicode version if it differs from b2's.
 */
static void computeDiff(const Normalizer2DataBuilder &b1,
const Normalizer2DataBuilder &b2,
Normalizer2DataBuilder &diff);
private:
friend class Norm16Writer;

View file

@ -15,6 +15,7 @@
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/errorcode.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/utf16.h"
#include "normalizer2impl.h"
@ -183,6 +184,8 @@ public:
void enumRanges(Enumerator &e);
// Code points for which the data builder has recorded input:
// ccSet gets each code point passed to Normalizer2DataBuilder::setCC();
// mappingSet gets each code point whose mapping was set or removed.
// NOTE(review): filled by Normalizer2DataBuilder, not by Norms itself, as far as visible here.
UnicodeSet ccSet, mappingSet;
private:
Norms(const Norms &other) = delete;
Norms &operator=(const Norms &other) = delete;