ICU-3969 initial code for gencase, generate ucase.icu containing only and all case mapping properties

X-SVN-Rev: 16214
This commit is contained in:
Markus Scherer 2004-08-30 22:13:08 +00:00
parent f415ea5e5a
commit d5631e445c
7 changed files with 2118 additions and 0 deletions

View file

@ -0,0 +1,100 @@
## Makefile.in for ICU - tools/gencase
## Copyright (c) 1999-2004, International Business Machines Corporation and
## others. All Rights Reserved.
## Steven R. Loomis
## Source directory information
## @srcdir@ and @top_srcdir@ are placeholders; the Makefile rule further down
## regenerates Makefile from this Makefile.in by running config.status.
srcdir = @srcdir@
top_srcdir = @top_srcdir@
top_builddir = ../..
## Shared definitions; variables used below but not assigned in this file
## (BINDIR, EXEEXT, RMV, INSTALL, ...) presumably come from here - confirm.
include $(top_builddir)/icudefs.mk
## Tool name, man page section, and the resulting man page file name
## (expands to gencase.8)
TARGET_STUB_NAME = gencase
SECTION = 8
MAN_FILES = $(TARGET_STUB_NAME).$(SECTION)
## Build directory information
## (path of this directory relative to top_builddir, passed to config.status)
subdir = tools/gencase
## Extra files to remove for 'make clean'
## DEPS is assigned further down; this works because '=' is expanded lazily,
## at the time CLEANFILES is used in the clean-local recipe.
CLEANFILES = *~ $(DEPS) $(MAN_FILES)
## Target information
TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common -I$(srcdir)/../toolutil
LIBS = $(LIBICUTOOLUTIL) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
OBJECTS = gencase.o store.o
## One .d dependency file per object file; included at the end of this file
DEPS = $(OBJECTS:.o=.d)
## List of phony targets
.PHONY : all all-local install install-local clean clean-local \
distclean distclean-local dist dist-local check \
check-local install-man
## Clear suffix list
## (an empty prerequisite list removes all known suffixes for suffix rules)
.SUFFIXES :
## List of standard targets
## Each standard target delegates to its "-local" counterpart defined below.
all: all-local
install: install-local
clean: clean-local
distclean : distclean-local
dist: dist-local
check: all check-local
## Build the tool executable and its man page
all-local: $(TARGET) $(MAN_FILES)
## Install the executable into $(sbindir) and the man page via install-man;
## DESTDIR is prepended to support staged installs.
install-local: all-local install-man
$(MKINSTALLDIRS) $(DESTDIR)$(sbindir)
$(INSTALL) $(TARGET) $(DESTDIR)$(sbindir)
# man page
# Note: $< is only the first prerequisite, so exactly one man page is installed.
install-man: $(MAN_FILES)
$(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION)
$(INSTALL_DATA) $< $(DESTDIR)$(mandir)/man$(SECTION)
# Generate <name>.8 from <name>.8.in by running config.status from the top
# build directory, restricted to this one output file through CONFIG_FILES.
%.$(SECTION): $(srcdir)/%.$(SECTION).in
cd $(top_builddir) \
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
# build postscript and pdf formats
#$(TARGET).ps: $(TARGET).$(SECTION)
# groff -man < $< > $@
#$(TARGET).pdf: $(TARGET).ps
# ps2pdf $< $@
# Nothing to do for dist in this directory
dist-local:
# Remove the extra files (only if CLEANFILES is non-empty), then the
# executable and the object files.
clean-local:
test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
$(RMV) $(TARGET) $(OBJECTS)
# distclean additionally removes the generated Makefile itself
distclean-local: clean-local
$(RMV) Makefile
# There are no tests here; "check" only makes sure that everything builds
check-local: all-local
# Regenerate this Makefile when Makefile.in or config.status changes
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
cd $(top_builddir) \
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
# Link the tool from its object files; the rules that compile the objects
# are not in this file (presumably provided by icudefs.mk - confirm).
$(TARGET) : $(OBJECTS)
$(LINK.cc) $(OUTOPT)$@ $^ $(LIBS)
# Include the dependency files when no goal was given, or when at least one
# goal does not end in "clean": patsubst removes every goal matching %clean,
# and anything left over means that a real build was requested.
# The leading '-' ignores dependency files that do not exist yet.
ifeq (,$(MAKECMDGOALS))
-include $(DEPS)
else
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
-include $(DEPS)
endif
endif

View file

@ -0,0 +1,126 @@
.\" Hey, Emacs! This is -*-nroff-*- you know...
.\"
.\" genprops.8: manual page for the genprops utility
.\"
.\" Copyright (C) 2000-2001 IBM, Inc. and others.
.\"
.TH GENPROPS 8 "16 January 2001" "ICU MANPAGE" "ICU @VERSION@ Manual"
.SH NAME
.B genprops
\- compile properties from the Unicode Character Database
.SH SYNOPSIS
.B genprops
[
.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
]
[
.BR "\-v\fP, \fB\-\-verbose"
]
[
.BI "\-u\fP, \fB\-\-unicode" " version"
]
[
.BI "\-c\fP, \fB\-\-copyright"
]
[
.BI "\-s\fP, \fB\-\-sourcedir" " source"
]
[
.BI "\-d\fP, \fB\-\-destdir" " destination"
]
[
.I suffix
]
.SH DESCRIPTION
.B genprops
reads some of the Unicode Character Database files and compiles their
information into a binary form.
The resulting file,
.BR icudata.dat ,
can then be read directly by ICU, or used by
.BR pkgdata (8)
for incorporation into a larger archive or library.
.LP
The files read by
.B genprops
are described in the
.B FILES
section. If
.I suffix
is passed on the command line, the names of these files will actually
be changed to include a dash followed by
.I suffix
in their basename. For example, the file
.B UnicodeData.txt
would be looked for under the name
.BR UnicodeData\-\fIsuffix\fP.txt .
.SH OPTIONS
.TP
.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
Print help about usage and exit.
.TP
.BR "\-v\fP, \fB\-\-verbose"
Display extra informative messages during execution.
.TP
.BI "\-u\fP, \fB\-\-unicode" " version"
Specify which
.I version
of Unicode the Unicode Character Database refers to.
Defaults to
.BR 3.0.0 .
.TP
.BI "\-c\fP, \fB\-\-copyright"
Include a copyright notice into the binary data.
.TP
.BI "\-s\fP, \fB\-\-sourcedir" " source"
Set the source directory to
.IR source .
The default source directory is specified by the environment variable
.BR ICU_DATA .
.TP
.BI "\-d\fP, \fB\-\-destdir" " destination"
Set the destination directory to
.IR destination .
The default destination directory is specified by the environment variable
.BR ICU_DATA .
.SH ENVIRONMENT
.TP 10
.B ICU_DATA
Specifies the directory containing ICU data. Defaults to
.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ .
Some tools in ICU depend on the presence of the trailing slash. It is thus
important to make sure that it is present if
.B ICU_DATA
is set.
.SH FILES
The following files are read by
.B genprops
and are looked for in the
.I source
directory.
.TP 20
.B UnicodeData.txt
The main file in the Unicode Character Database. Contains character
properties, combining classes information, decompositions, names,
etc.\|.\|..
.TP
.B BidiMirroring.txt
Properties for substituting characters in an implementation of
bidirectional mirroring.
.TP
.B SpecialCasing.txt
List of properties required for full case mapping.
.TP
.B CaseFolding.txt
Mapping from characters to their case-folded forms. (Note: this file
is derived from
.B UnicodeData.txt
and
.B SpecialCasing.txt
when generated by the Unicode Consortium.)
.SH VERSION
@VERSION@
.SH COPYRIGHT
Copyright (C) 2000-2002 IBM, Inc. and others.
.SH SEE ALSO
.BR pkgdata (8)

View file

@ -0,0 +1,776 @@
/*
*******************************************************************************
*
* Copyright (C) 2004, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: gencase.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2004aug28
* created by: Markus W. Scherer
*
* This program reads several of the Unicode character database text files,
* parses them, and extracts the case mapping properties for each character.
* It then writes a binary file containing the properties
* that is designed to be used directly for random-access to
* the properties of each Unicode character.
*/
#include <stdio.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/uset.h"
#include "unicode/putil.h"
#include "unicode/uclean.h"
#include "cmemory.h"
#include "cstring.h"
#include "uarrsort.h"
#include "unewdata.h"
#include "uoptions.h"
#include "uparse.h"
#include "uprops.h"
#include "propsvec.h"
#include "gencase.h"
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
/* data --------------------------------------------------------------------- */
/* NOTE(review): trie and pvCount are not referenced in the visible part of this file */
static UNewTrie *trie;
/*
* Properties vectors: opened in main() with upvec_open() and filled by
* binariesLineFn() via upvec_setValue().
* Not static - presumably also used by store.c; confirm.
*/
uint32_t *pv;
static int32_t pvCount;
/* command line flags; main() overwrites both from the -v and -c options */
UBool beVerbose=FALSE, haveCopyright=TRUE;
/*
* Unicode set collecting the case-sensitive characters;
* see uchar.h UCHAR_CASE_SENSITIVE.
* Add code points from case mappings/foldings in
* the root locale and with default options.
*/
static USet *caseSensitive;
/* prototypes --------------------------------------------------------------- */
/*
* Each parser reads one UCD text file; all of them do nothing if
* pErrorCode is NULL or already indicates a failure.
*/
static void
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);
static void
parseCaseFolding(const char *filename, UErrorCode *pErrorCode);
static void
parseDB(const char *filename, UErrorCode *pErrorCode);
/* parse files with multiple binary properties ------------------------------ */
/* TODO: more common code, move functions to uparse.h|c */
/* TODO: similar to genprops/props2.c but not the same */
/*
* One binary property to pick up from a UCD file:
* ranges listed with propName get vecValue stored under vecMask
* in word vecWord of the properties vectors (see binariesLineFn()).
*/
struct Binary {
const char *propName;
int32_t vecWord;
uint32_t vecValue, vecMask;
};
typedef struct Binary Binary;
/*
* A UCD file (base name without suffix and ".txt") together with
* the list of properties that are to be extracted from it.
*/
struct Binaries {
const char *ucdFile;
const Binary *binaries;
int32_t binariesCount;
};
typedef struct Binaries Binaries;
/* properties taken from PropList.txt */
static const Binary
propListNames[]={
{ "Soft_Dotted", 0, UCASE_SOFT_DOTTED, UCASE_DOT_MASK }
};
static const Binaries
propListBinaries={
"PropList", propListNames, LENGTHOF(propListNames)
};
/* properties taken from DerivedCoreProperties.txt; both share the case type bits */
static const Binary
derCorePropsNames[]={
{ "Lowercase", 0, UCASE_LOWER, UCASE_TYPE_MASK },
{ "Uppercase", 0, UCASE_UPPER, UCASE_TYPE_MASK }
};
static const Binaries
derCorePropsBinaries={
"DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
};
/*
 * Line callback for u_parseDelimitedFile() on a file with binary properties.
 * context points to the Binaries table for the file being read.
 * Field 0 is a code point or code point range, field 1 a property name.
 * Lines with a property name that is not in the table are skipped;
 * every error terminates the program.
 */
static void U_CALLCONV
binariesLineFn(void *context,
               char *fields[][2], int32_t fieldCount,
               UErrorCode *pErrorCode) {
    const Binaries *table=(const Binaries *)context;
    const Binary *prop=NULL;
    const char *name;
    uint32_t first, last;
    int32_t k;

    /* field 0: the range; "last" is inclusive as parsed */
    u_parseCodePointRange(fields[0][0], &first, &last, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", table->ucdFile, fields[0][0]);
        exit(*pErrorCode);
    }

    /* field 1: look up the binary property name in the table */
    name=u_skipWhitespace(fields[1][0]);
    for(k=0; k<table->binariesCount; ++k) {
        if(isToken(table->binaries[k].propName, name)) {
            prop=table->binaries+k;
            break;
        }
    }
    if(prop==NULL) {
        /* ignore unrecognized properties */
        return;
    }

    /* a zero mask would make the table entry a no-op: programming error */
    if(prop->vecMask==0) {
        fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n",
                (int)prop->vecMask, table->ucdFile, prop->propName);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }

    /* upvec_setValue() takes an exclusive limit, hence last+1 */
    if(!upvec_setValue(pv, first, last+1, prop->vecWord, prop->vecValue, prop->vecMask, pErrorCode)) {
        fprintf(stderr, "gencase error: unable to set %s, code: %s\n",
                prop->propName, u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
/*
 * Parse one UCD file with binary properties.
 * basename points into filename, just behind the source directory;
 * the file name proper is written there from bin->ucdFile and suffix.
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 */
static void
parseBinariesFile(char *filename, char *basename, const char *suffix,
                  const Binaries *bin,
                  UErrorCode *pErrorCode) {
    char *fields[2][2];

    if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) {
        writeUCDFilename(basename, bin->ucdFile, suffix);

        u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
        }
    }
}
/* -------------------------------------------------------------------------- */
/*
* Indexes into options[] below; main() uses them to read the parsed
* command line options, so the order here must match the array.
*/
enum
{
HELP_H,
HELP_QUESTION_MARK,
VERBOSE,
COPYRIGHT,
DESTDIR,
SOURCEDIR,
UNICODE_VERSION,
ICUDATADIR
};
/* Keep these values in sync with the above enums */
static UOption options[]={
UOPTION_HELP_H,
UOPTION_HELP_QUESTION_MARK,
UOPTION_VERBOSE,
UOPTION_COPYRIGHT,
UOPTION_DESTDIR,
UOPTION_SOURCEDIR,
/* --unicode/-u has no predefined UOPTION_ macro; it requires an argument */
{ "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
UOPTION_ICUDATADIR
};
extern int
main(int argc, char* argv[]) {
char filename[300];
const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
char *basename=NULL;
UErrorCode errorCode=U_ZERO_ERROR;
U_MAIN_INIT_ARGS(argc, argv);
/* preset then read command line options */
options[DESTDIR].value=u_getDataDirectory();
options[SOURCEDIR].value="";
options[UNICODE_VERSION].value="";
options[ICUDATADIR].value=u_getDataDirectory();
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
/* error handling, printing usage message */
if(argc<0) {
fprintf(stderr,
"error in command line argument \"%s\"\n",
argv[-argc]);
}
if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
/*
* Broken into chucks because the C89 standard says the minimum
* required supported string length is 509 bytes.
*/
fprintf(stderr,
"Usage: %s [-options] [suffix]\n"
"\n"
"read the UnicodeData.txt file and other Unicode properties files and\n"
"create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the character properties\n"
"\n",
argv[0]);
fprintf(stderr,
"Options:\n"
"\t-h or -? or --help this usage text\n"
"\t-v or --verbose verbose output\n"
"\t-c or --copyright include a copyright notice\n"
"\t-u or --unicode Unicode version, followed by the version like 3.0.0\n");
fprintf(stderr,
"\t-d or --destdir destination directory, followed by the path\n"
"\t-s or --sourcedir source directory, followed by the path\n"
"\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
"\t followed by path, defaults to %s\n"
"\tsuffix suffix that is to be appended with a '-'\n"
"\t to the source file basenames before opening;\n"
"\t 'gencase new' will read UnicodeData-new.txt etc.\n",
u_getDataDirectory());
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
}
/* get the options values */
beVerbose=options[VERBOSE].doesOccur;
haveCopyright=options[COPYRIGHT].doesOccur;
srcDir=options[SOURCEDIR].value;
destDir=options[DESTDIR].value;
if(argc>=2) {
suffix=argv[1];
} else {
suffix=NULL;
}
if(options[UNICODE_VERSION].doesOccur) {
setUnicodeVersion(options[UNICODE_VERSION].value);
}
/* else use the default dataVersion in store.c */
if (options[ICUDATADIR].doesOccur) {
u_setDataDirectory(options[ICUDATADIR].value);
}
/* prepare the filename beginning with the source dir */
uprv_strcpy(filename, srcDir);
basename=filename+uprv_strlen(filename);
if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
*basename++=U_FILE_SEP_CHAR;
}
/* initialize */
pv=upvec_open(1, 10000);
caseSensitive=uset_open(1, 0); /* empty set (start>end) */
/* process SpecialCasing.txt */
writeUCDFilename(basename, "SpecialCasing", suffix);
parseSpecialCasing(filename, &errorCode);
/* process CaseFolding.txt */
writeUCDFilename(basename, "CaseFolding", suffix);
parseCaseFolding(filename, &errorCode);
/* process additional properties files */
*basename=0;
parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode);
parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode);
/* process UnicodeData.txt */
writeUCDFilename(basename, "UnicodeData", suffix);
parseDB(filename, &errorCode);
/* process parsed data */
makeCaseClosure();
makeExceptions();
if(U_SUCCESS(errorCode)) {
/* write the properties data file */
generateData(destDir);
}
u_cleanup();
return errorCode;
}
/*
 * Write "<filename>[-<suffix>].txt" to basename.
 * suffix may be NULL, in which case no dash and suffix are inserted.
 * The caller must provide a large enough buffer; nothing is checked here.
 */
U_CFUNC void
writeUCDFilename(char *basename, const char *filename, const char *suffix) {
    char *out=basename;

    uprv_strcpy(out, filename);
    out+=uprv_strlen(filename);

    if(suffix!=NULL) {
        *out++='-';
        uprv_strcpy(out, suffix);
        out+=uprv_strlen(suffix);
    }

    uprv_strcpy(out, ".txt");
}
/* TODO: move to toolutil */
/*
 * Does s, after skipping leading white space, start with token,
 * followed only by optional white space and then either a ';'
 * or the end of the string?
 */
U_CFUNC UBool
isToken(const char *token, const char *s) {
    const char *rest;

    s=u_skipWhitespace(s);

    /* token must be a prefix of s; a shorter s fails on its terminating NUL */
    while(*token!=0) {
        if(*s!=*token) {
            return FALSE;
        }
        ++token;
        ++s;
    }

    /* the token must end here: only a field delimiter or the end may follow */
    rest=u_skipWhitespace(s);
    if(*rest==';' || *rest==0) {
        return TRUE;
    }
    return FALSE;
}
/*
 * Add every code point of the UTF-16 string s[0..length[ to the set.
 * length must not be negative.
 */
static void
_set_addAll(USet *set, const UChar *s, int32_t length) {
    int32_t pos=0;
    UChar32 cp;

    while(pos<length) {
        /* reads one code point and moves pos behind it */
        U16_NEXT(s, pos, length, cp);
        uset_add(set, cp);
    }
}
/* parser for SpecialCasing.txt --------------------------------------------- */
#define MAX_SPECIAL_CASING_COUNT 500
static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
static int32_t specialCasingCount=0;

/*
 * Line callback for SpecialCasing.txt: appends one entry to specialCasings[].
 * Fields: 0=code point, 1=lowercase, 2=titlecase, 3=uppercase, 4=condition.
 * An entry with a condition is only marked as complex and stores no mappings;
 * an unconditional entry stores its mappings (length in element 0) and
 * contributes all of its code points to the caseSensitive set.
 * Every error terminates the program.
 */
static void U_CALLCONV
specialCasingLineFn(void *context,
                    char *fields[][2], int32_t fieldCount,
                    UErrorCode *pErrorCode) {
    SpecialCasing *entry=specialCasings+specialCasingCount;
    const char *condition;
    char *end;

    /* get code point */
    entry->code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
    end=(char *)u_skipWhitespace(end);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* is this a complex mapping? */
    condition=u_skipWhitespace(fields[4][0]);
    if(*condition!=0 && *condition!=';' && *condition!='#') {
        /* there is some condition text in the fifth field */
        entry->isComplex=TRUE;

        /* do not store any actual mappings for this */
        entry->lowerCase[0]=0;
        entry->upperCase[0]=0;
        entry->titleCase[0]=0;
    } else {
        /* unconditional: clear the "complex" flag and get the case mappings */
        entry->isComplex=FALSE;
        entry->lowerCase[0]=
            (UChar)u_parseString(fields[1][0], entry->lowerCase+1, 31, NULL, pErrorCode);
        entry->upperCase[0]=
            (UChar)u_parseString(fields[3][0], entry->upperCase+1, 31, NULL, pErrorCode);
        entry->titleCase[0]=
            (UChar)u_parseString(fields[2][0], entry->titleCase+1, 31, NULL, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]);
            exit(*pErrorCode);
        }

        uset_add(caseSensitive, (UChar32)entry->code);
        _set_addAll(caseSensitive, entry->lowerCase+1, entry->lowerCase[0]);
        _set_addAll(caseSensitive, entry->upperCase+1, entry->upperCase[0]);
        _set_addAll(caseSensitive, entry->titleCase+1, entry->titleCase[0]);
    }

    /* the array must never become completely full: the next line needs a free slot */
    if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
        fprintf(stderr, "gencase: too many special casing mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}
/*
 * Comparison callback for uprv_sortArray(): orders SpecialCasing entries
 * by code point. context is not used.
 */
static int32_t U_CALLCONV
compareSpecialCasings(const void *context, const void *left, const void *right) {
    const SpecialCasing *a=(const SpecialCasing *)left;
    const SpecialCasing *b=(const SpecialCasing *)right;

    return a->code-b->code;
}
/*
* Read SpecialCasing.txt into specialCasings[], sorted by code point and
* with at most one entry per code point.
* Does nothing if pErrorCode is NULL or already indicates a failure.
*/
static void
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
char *fields[5][2];
int32_t i, j;
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return;
}
u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);
/* sort the special casing entries by code point */
if(specialCasingCount>0) {
uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
compareSpecialCasings, NULL, FALSE, pErrorCode);
}
if(U_FAILURE(*pErrorCode)) {
return;
}
/* replace multiple entries for any code point by one "complex" one */
/* j counts the entries that get marked for removal */
j=0;
for(i=1; i<specialCasingCount; ++i) {
if(specialCasings[i-1].code==specialCasings[i].code) {
/* there is a duplicate code point */
specialCasings[i-1].code=0x7fffffff; /* remove this entry in the following sorting */
specialCasings[i].isComplex=TRUE; /* make the following one complex */
specialCasings[i].lowerCase[0]=0;
specialCasings[i].upperCase[0]=0;
specialCasings[i].titleCase[0]=0;
++j;
}
}
/* if some entries just were removed, then re-sort */
/* the marked entries sort to the end and are cut off by reducing the count */
if(j>0) {
uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing),
compareSpecialCasings, NULL, FALSE, pErrorCode);
specialCasingCount-=j;
}
if(U_FAILURE(*pErrorCode)) {
return;
}
/*
* Add one complex mapping to caseSensitive that was filtered out above:
* Greek final Sigma has a conditional mapping but not locale-sensitive,
* and it is taken when lowercasing just U+03A3 alone.
* 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
*/
uset_add(caseSensitive, 0x3c2);
}
/* parser for CaseFolding.txt ----------------------------------------------- */
#define MAX_CASE_FOLDING_COUNT 2000
static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
static int32_t caseFoldingCount=0;
/*
* Line callback for CaseFolding.txt: fills caseFoldings[caseFoldingCount]
* from field 0 (code point), field 1 (status letter) and field 2 (mapping).
* Depending on the status, the new entry is dropped, merged into the previous
* entry for the same code point, or appended.
* The status letters presumably follow the CaseFolding.txt file format
* (C=common, S=simple, F=full, T=Turkic; L, E and I possibly from older
* versions of the file) - confirm against the UCD documentation.
* Every error terminates the program.
*/
static void U_CALLCONV
caseFoldingLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode) {
char *end;
/* code point of the last appended entry, for the ordering check; kept across calls */
static UChar32 prevCode=0;
int32_t count;
char status;
/* get code point */
caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
end=(char *)u_skipWhitespace(end);
if(end<=fields[0][0] || end!=fields[0][1]) {
fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
/* get the status of this mapping */
caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
/* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
if(status=='L') {
return;
}
/* get the mapping */
/* full[0] receives the length in UChars, simple the first code point of the mapping */
count=caseFoldings[caseFoldingCount].full[0]=
(UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
exit(*pErrorCode);
}
/* there is a simple mapping only if there is exactly one code point (count is in UChars) */
/* two UChars are one code point only if the first one is not a single (BMP) unit */
if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
caseFoldings[caseFoldingCount].simple=0;
}
/* update the case-sensitive set */
if(status!='T') {
uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
_set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
}
/* check the status */
/* returning without incrementing caseFoldingCount drops the current entry after a merge */
if(status=='S') {
/* check if there was a full mapping for this code point before */
if( caseFoldingCount>0 &&
caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
caseFoldings[caseFoldingCount-1].status=='F'
) {
/* merge the two entries */
caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
return;
}
} else if(status=='F') {
/* check if there was a simple mapping for this code point before */
if( caseFoldingCount>0 &&
caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
caseFoldings[caseFoldingCount-1].status=='S'
) {
/* merge the two entries */
uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
return;
}
} else if(status=='I' || status=='T') {
/* check if there was a default mapping for this code point before (remove it) */
while(caseFoldingCount>0 &&
caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
) {
/* reset prevCode so that the ordering check below accepts this code point again */
prevCode=0;
--caseFoldingCount;
}
/* store only a marker for special handling for cases like dotless i */
/* NOTE(review): after --caseFoldingCount this writes to the slot of the removed entry, which has the same code; its status is not updated to the current one - confirm that this is intended */
caseFoldings[caseFoldingCount].simple=0;
caseFoldings[caseFoldingCount].full[0]=0;
}
/* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
(unsigned long)caseFoldings[caseFoldingCount].code,
(unsigned long)prevCode);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
prevCode=caseFoldings[caseFoldingCount].code;
/* the array must never become completely full: the next line needs a free slot */
if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
fprintf(stderr, "gencase: too many case folding mappings\n");
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
exit(U_INDEX_OUTOFBOUNDS_ERROR);
}
}
/*
 * Read CaseFolding.txt into caseFoldings[] via caseFoldingLineFn().
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 */
static void
parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
    char *fields[3][2];

    if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) {
        u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
    }
}
/* parser for UnicodeData.txt ----------------------------------------------- */
/*
* Indexes of the next not yet consumed entries in specialCasings[] and
* caseFoldings[]; both arrays are in code point order, like UnicodeData.txt.
*/
static int32_t specialCasingIndex=0, caseFoldingIndex=0;
/*
* Line callback for UnicodeData.txt: collects the case-related properties
* of one code point in a Props structure and passes it to setProps()
* if at least one of them is set.
* Uses field 0 (code point), 2 (general category), 3 (combining class) and
* 12..14 (simple uppercase/lowercase/titlecase mappings), and attaches the
* matching entries from SpecialCasing.txt and CaseFolding.txt.
* Every error terminates the program.
*/
static void U_CALLCONV
unicodeDataLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode) {
Props p;
char *end;
/* code point of the previous line, for the ordering check; kept across calls */
static UChar32 prevCode=0;
UChar32 value;
/* becomes TRUE as soon as this code point has anything worth storing */
UBool something=FALSE;
/* reset the properties */
uprv_memset(&p, 0, sizeof(Props));
/* get the character code, field 0 */
p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16);
if(end<=fields[0][0] || end!=fields[0][1]) {
fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
/* get general category, field 2 */
if(isToken("Lt", fields[2][0])) {
p.isTitle=TRUE;
something=TRUE;
}
/* get canonical combining class, field 3 */
value=(UChar32)uprv_strtoul(fields[3][0], &end, 10);
if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
if(value>0) {
p.cc=(uint8_t)value;
something=TRUE;
}
/* get uppercase mapping, field 12 */
/* unlike fields 0 and 3, the mapping fields may be empty: only the end of the number is checked, and an empty field yields value 0 */
value=(UChar32)uprv_strtoul(fields[12][0], &end, 16);
if(end!=fields[12][1]) {
fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n",
(unsigned long)p.code);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
/* a mapping to 0 or to the code point itself counts as no mapping */
if(value!=0 && value!=p.code) {
p.upperCase=value;
uset_add(caseSensitive, p.code);
uset_add(caseSensitive, value);
something=TRUE;
}
/* get lowercase value, field 13 */
value=(UChar32)uprv_strtoul(fields[13][0], &end, 16);
if(end!=fields[13][1]) {
fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n",
(unsigned long)p.code);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
if(value!=0 && value!=p.code) {
p.lowerCase=value;
uset_add(caseSensitive, p.code);
uset_add(caseSensitive, value);
something=TRUE;
}
/* get titlecase value, field 14 */
value=(UChar32)uprv_strtoul(fields[14][0], &end, 16);
if(end!=fields[14][1]) {
fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n",
(unsigned long)p.code);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
if(value!=0 && value!=p.code) {
p.titleCase=value;
uset_add(caseSensitive, p.code);
uset_add(caseSensitive, value);
something=TRUE;
}
/* set additional properties from previously parsed files */
/* an entry is consumed only on an exact code point match; parseDB() reports entries that are left over */
if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
p.specialCasing=specialCasings+specialCasingIndex++;
something=TRUE;
} else {
p.specialCasing=NULL;
}
if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
p.caseFolding=caseFoldings+caseFoldingIndex++;
/* note: something stays TRUE even if the folding is dropped again just below */
something=TRUE;
/* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
if( p.caseFolding->status=='C' &&
p.caseFolding->simple==p.lowerCase
) {
p.caseFolding=NULL;
}
} else {
p.caseFolding=NULL;
}
/* check for non-character code points */
/* that is, the last two code points of each plane (xxFFFE, xxFFFF) and U+FDD0..U+FDEF */
if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n",
(unsigned long)p.code);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
/* check that the code points (p.code) are in ascending order */
if(p.code<=prevCode && p.code>0) {
fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
(unsigned long)p.code, (unsigned long)prevCode);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
/* properties for a single code point */
if(something) {
setProps(&p);
}
prevCode=p.code;
}
/*
 * Read UnicodeData.txt via unicodeDataLineFn(), verify that every entry
 * from SpecialCasing.txt and CaseFolding.txt was matched with a line of
 * UnicodeData.txt, and pass every range of the caseSensitive set
 * to addCaseSensitive().
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 * Terminates the program if the file cannot be parsed or if entries
 * from the other files are left over.
 */
static void
parseDB(const char *filename, UErrorCode *pErrorCode) {
    char *fields[15][2];
    UChar32 start, end;
    int32_t i;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        /*
         * Report the real cause (for example, the file could not be opened)
         * before the consistency checks below: if no line was processed then
         * they would only claim that code points are missing from UnicodeData.txt.
         */
        fprintf(stderr, "gencase: error parsing %s: %s\n", filename, u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }

    /* are all sub-properties consumed? */
    if(specialCasingIndex<specialCasingCount) {
        fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    if(caseFoldingIndex<caseFoldingCount) {
        fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /*
     * Enumerate the set items by index; uset_getItem() returns 0 for a
     * code point range, so the loop ends at the first index that does
     * not yield a range.
     */
    for(i=0;
        0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
        ++i
    ) {
        addCaseSensitive(start, end);
    }
    /* running past the last item is the normal end of the enumeration, not an error */
    if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
        *pErrorCode=U_ZERO_ERROR;
    }
}
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/

View file

@ -0,0 +1,194 @@
# Microsoft Developer Studio Project File - Name="gencase" - Package Owner=<4>
# Microsoft Developer Studio Generated Build File, Format Version 6.00
# ** DO NOT EDIT **
# TARGTYPE "Win32 (x86) Console Application" 0x0103
CFG=gencase - Win32 Debug
!MESSAGE This is not a valid makefile. To build this project using NMAKE,
!MESSAGE use the Export Makefile command and run
!MESSAGE
!MESSAGE NMAKE /f "gencase.mak".
!MESSAGE
!MESSAGE You can specify a configuration when running NMAKE
!MESSAGE by defining the macro CFG on the command line. For example:
!MESSAGE
!MESSAGE NMAKE /f "gencase.mak" CFG="gencase - Win32 Debug"
!MESSAGE
!MESSAGE Possible choices for configuration are:
!MESSAGE
!MESSAGE "gencase - Win32 Release" (based on "Win32 (x86) Console Application")
!MESSAGE "gencase - Win32 Debug" (based on "Win32 (x86) Console Application")
!MESSAGE "gencase - Win64 Release" (based on "Win32 (x86) Console Application")
!MESSAGE "gencase - Win64 Debug" (based on "Win32 (x86) Console Application")
!MESSAGE
# Begin Project
# PROP AllowPerConfigDependencies 0
# PROP Scc_ProjName ""
# PROP Scc_LocalPath ""
CPP=cl.exe
RSC=rc.exe
!IF "$(CFG)" == "gencase - Win32 Release"
# PROP BASE Use_MFC 0
# PROP BASE Use_Debug_Libraries 0
# PROP BASE Output_Dir "Release"
# PROP BASE Intermediate_Dir "Release"
# PROP BASE Target_Dir ""
# PROP Use_MFC 0
# PROP Use_Debug_Libraries 0
# PROP Output_Dir "Release"
# PROP Intermediate_Dir "Release"
# PROP Ignore_Export_Lib 0
# PROP Target_Dir ""
MTL=midl.exe
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
# ADD CPP /nologo /G6 /MD /Za /W3 /GX /O2 /I "..\toolutil" /I "..\..\common" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
# ADD BASE RSC /l 0x409 /d "NDEBUG"
# ADD RSC /l 0x409 /d "NDEBUG"
BSC32=bscmake.exe
# ADD BASE BSC32 /nologo
# ADD BSC32 /nologo
LINK32=link.exe
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
# ADD LINK32 icuuc.lib icutu.lib /nologo /subsystem:console /machine:I386 /libpath:"..\toolutil\Release" /libpath:"..\..\..\lib\Release" /libpath:"..\..\..\lib"
# Begin Custom Build
TargetPath=.\Release\gencase.exe
InputPath=.\Release\gencase.exe
InputName=gencase
SOURCE="$(InputPath)"
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy $(TargetPath) ..\..\..\bin
# End Custom Build
!ELSEIF "$(CFG)" == "gencase - Win32 Debug"
# PROP BASE Use_MFC 0
# PROP BASE Use_Debug_Libraries 1
# PROP BASE Output_Dir "Debug"
# PROP BASE Intermediate_Dir "Debug"
# PROP BASE Target_Dir ""
# PROP Use_MFC 0
# PROP Use_Debug_Libraries 1
# PROP Output_Dir "Debug"
# PROP Intermediate_Dir "Debug"
# PROP Ignore_Export_Lib 0
# PROP Target_Dir ""
MTL=midl.exe
# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FD /GZ /c
# ADD CPP /nologo /G6 /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\toolutil" /I "..\..\common" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FR /FD /GZ /c
# ADD BASE RSC /l 0x409 /d "_DEBUG"
# ADD RSC /l 0x409 /d "_DEBUG"
BSC32=bscmake.exe
# ADD BASE BSC32 /nologo
# ADD BSC32 /nologo
LINK32=link.exe
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
# ADD LINK32 icuucd.lib icutud.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /libpath:"..\toolutil\Debug" /libpath:"..\..\..\lib\Debug" /libpath:"..\..\..\lib"
# Begin Custom Build
TargetPath=.\Debug\gencase.exe
InputPath=.\Debug\gencase.exe
InputName=gencase
SOURCE="$(InputPath)"
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy $(TargetPath) ..\..\..\bin
# End Custom Build
!ELSEIF "$(CFG)" == "gencase - Win64 Release"
# PROP BASE Use_MFC 0
# PROP BASE Use_Debug_Libraries 0
# PROP BASE Output_Dir "Release"
# PROP BASE Intermediate_Dir "Release"
# PROP BASE Target_Dir ""
# PROP Use_MFC 0
# PROP Use_Debug_Libraries 0
# PROP Output_Dir "Release"
# PROP Intermediate_Dir "Release"
# PROP Ignore_Export_Lib 0
# PROP Target_Dir ""
MTL=midl.exe
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN64" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
# ADD CPP /nologo /MD /Za /W3 /GX /Zi /O2 /I "..\toolutil" /I "..\..\common" /D "WIN64" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /D "_IA64_" /D "WIN32" /D "_AFX_NO_DAO_SUPPORT" /FD /QIA64_fmaopt /Wp64 /Zm600 /c
# ADD BASE RSC /l 0x409 /d "NDEBUG"
# ADD RSC /l 0x409 /d "NDEBUG"
BSC32=bscmake.exe
# ADD BASE BSC32 /nologo
# ADD BSC32 /nologo
LINK32=link.exe
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:IX86 /machine:IA64
# ADD LINK32 icuuc.lib icutu.lib /nologo /subsystem:console /machine:IX86 /libpath:"..\toolutil\Release" /libpath:"..\..\..\lib\Release" /libpath:"..\..\..\lib" /machine:IA64
# Begin Custom Build
TargetPath=.\Release\gencase.exe
InputPath=.\Release\gencase.exe
InputName=gencase
SOURCE="$(InputPath)"
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy $(TargetPath) ..\..\..\bin
# End Custom Build
!ELSEIF "$(CFG)" == "gencase - Win64 Debug"
# PROP BASE Use_MFC 0
# PROP BASE Use_Debug_Libraries 1
# PROP BASE Output_Dir "Debug"
# PROP BASE Intermediate_Dir "Debug"
# PROP BASE Target_Dir ""
# PROP Use_MFC 0
# PROP Use_Debug_Libraries 1
# PROP Output_Dir "Debug"
# PROP Intermediate_Dir "Debug"
# PROP Ignore_Export_Lib 0
# PROP Target_Dir ""
MTL=midl.exe
# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN64" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FD /GZ /c
# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /Zi /Od /I "..\toolutil" /I "..\..\common" /D "WIN64" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /D "_IA64_" /D "WIN32" /D "_AFX_NO_DAO_SUPPORT" /FR /FD /GZ /QIA64_fmaopt /Wp64 /Zm600 /c
# ADD BASE RSC /l 0x409 /d "_DEBUG"
# ADD RSC /l 0x409 /d "_DEBUG"
BSC32=bscmake.exe
# ADD BASE BSC32 /nologo
# ADD BSC32 /nologo
LINK32=link.exe
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:IX86 /pdbtype:sept /machine:IA64
# ADD LINK32 icuucd.lib icutud.lib /nologo /subsystem:console /incremental:no /debug /machine:IX86 /pdbtype:sept /libpath:"..\toolutil\Debug" /libpath:"..\..\..\lib\Debug" /libpath:"..\..\..\lib" /machine:IA64
# Begin Custom Build
TargetPath=.\Debug\gencase.exe
InputPath=.\Debug\gencase.exe
InputName=gencase
SOURCE="$(InputPath)"
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy $(TargetPath) ..\..\..\bin
# End Custom Build
!ENDIF
# Begin Target
# Name "gencase - Win32 Release"
# Name "gencase - Win32 Debug"
# Name "gencase - Win64 Release"
# Name "gencase - Win64 Debug"
# Begin Source File
SOURCE=.\gencase.c
# End Source File
# Begin Source File
SOURCE=.\gencase.h
# End Source File
# Begin Source File
SOURCE=.\store.c
# End Source File
# End Target
# End Project

View file

@ -0,0 +1,187 @@
/*
*******************************************************************************
*
* Copyright (C) 2004, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: gencase.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2004aug28
* created by: Markus W. Scherer
*/
#ifndef __GENCASE_H__
#define __GENCASE_H__
#include "unicode/utypes.h"
#include "utrie.h"
U_CDECL_BEGIN
/* file definitions --------------------------------------------------------- */
#define UCASE_DATA_NAME "ucase"
#define UCASE_DATA_TYPE "icu"
/* format "cAsE" */
#define UCASE_FMT_0 0x63
#define UCASE_FMT_1 0x41
#define UCASE_FMT_2 0x53
#define UCASE_FMT_3 0x45
/*
 * indexes into indexes[]
 * (the int32_t array that store.c writes at the start of the data,
 * see the file format description in store.c)
 */
enum {
/* number of entries in indexes[]; store.c initializes this slot with UCASE_IX_TOP */
UCASE_IX_INDEX_TOP,
/* length in bytes of the data after the header, including indexes[] itself */
UCASE_IX_LENGTH,
/* size in bytes of the serialized trie */
UCASE_IX_TRIE_SIZE,
/* number of uint16_t units in the exceptions array */
UCASE_IX_EXC_LENGTH,
/* total number of slots in indexes[]; slots 4..15 have no named constant here */
UCASE_IX_TOP=16
};
/* definitions for 16-bit case properties word ------------------------------ */
/* 2-bit constants for types of cased characters */
#define UCASE_TYPE_MASK 3
enum {
UCASE_NONE,
UCASE_LOWER,
UCASE_UPPER,
UCASE_TITLE
};
#define UCASE_SENSITIVE 4
#define UCASE_EXCEPTION 8
#define UCASE_DOT_MASK 0x30
enum {
UCASE_NO_DOT=0,
UCASE_SOFT_DOTTED=0x10,
UCASE_ABOVE=0x20, /* "above" accents with cc=230 */
UCASE_OTHER_ACCENT=0x30 /* other character (0<cc!=230) */
};
/* no exception: bits 15..6 are a 10-bit signed case mapping delta */
#define UCASE_DELTA_SHIFT 6
#define UCASE_DELTA_MASK 0xffc0
#define UCASE_MAX_DELTA 0x1ff
#define UCASE_MIN_DELTA (-UCASE_MAX_DELTA-1)
#define UCASE_GET_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT)
/* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */
#define UCASE_EXC_SHIFT 4
#define UCASE_EXC_MASK 0xfff0
#define UCASE_MAX_EXCEPTIONS 0x1000
/* definitions for 16-bit main exceptions word ------------------------------ */
/* first 8 bits indicate values in optional slots */
enum {
UCASE_EXC_LOWER,
UCASE_EXC_FOLD,
UCASE_EXC_UPPER,
UCASE_EXC_TITLE,
UCASE_EXC_4, /* reserved */
UCASE_EXC_5, /* reserved */
UCASE_EXC_6, /* reserved */
UCASE_EXC_FULL_MAPPINGS
};
/* each slot is 2 uint16_t instead of 1 */
#define UCASE_EXC_DOUBLE_SLOTS 0x100
/* reserved: exception bits 11..9 */
/* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<<UCASE_EXC_DOT_SHIFT */
#define UCASE_EXC_DOT_SHIFT 8
/* normally stored in the main word, but pushed out for larger exception indexes */
#define UCASE_EXC_DOT_MASK 0x3000
enum {
UCASE_EXC_NO_DOT=0,
UCASE_EXC_SOFT_DOTTED=0x1000,
UCASE_EXC_ABOVE=0x2000, /* "above" accents with cc=230 */
UCASE_EXC_OTHER_ACCENT=0x3000 /* other character (0<cc!=230) */
};
/* complex/conditional mappings */
#define UCASE_EXC_CONDITIONAL_SPECIAL 0x4000
#define UCASE_EXC_CONDITIONAL_FOLD 0x8000
/* definitions for lengths word for full case mappings */
#define UCASE_FULL_LOWER 0xf
#define UCASE_FULL_FOLDING 0xf0
#define UCASE_FULL_UPPER 0xf00
#define UCASE_FULL_TITLE 0xf000
/* gencase ------------------------------------------------------------------ */
#define UGENCASE_EXC_SHIFT 16
#define UGENCASE_EXC_MASK 0xffff0000
/*
 * special casing data
 *
 * Each of the three string fields is length-prefixed: store.c reads element [0]
 * as the number of UChars and copies the text starting at element [1].
 * A length of 0 means that there is no such full mapping.
 */
typedef struct {
/* presumably the code point that this entry belongs to -- not read in store.c, confirm in gencase.c */
UChar32 code;
/* if set, store.c marks the character UCASE_EXC_CONDITIONAL_SPECIAL and stores none of the strings below */
UBool isComplex;
/* full (string) mappings, each as [0]=length, [1..length]=UTF-16 text */
UChar lowerCase[32], upperCase[32], titleCase[32];
} SpecialCasing;
/*
 * case folding data
 *
 * store.c treats an entry with simple==0 and full[0]==0 as a conditional
 * case folding (UCASE_EXC_CONDITIONAL_FOLD) and stores no folding for it.
 */
typedef struct {
/* code: presumably the code point being folded (not read in store.c); simple: simple (code point) folding */
UChar32 code, simple;
/* NOTE(review): looks like the status letter from CaseFolding.txt -- not read in store.c, confirm in gencase.c */
char status;
/* full (string) folding, length-prefixed: [0]=length, [1..length]=UTF-16 text */
UChar full[32];
} CaseFolding;
/*
 * case mapping properties
 * (one record per code point, passed to setProps() in store.c)
 */
typedef struct {
/*
 * code is the code point itself;
 * lowerCase/upperCase/titleCase are the simple mappings, where 0 means
 * "no mapping stored" (store.c tests them with !=0)
 */
UChar32 code, lowerCase, upperCase, titleCase;
/* NULL if there is no special casing data for this code point */
SpecialCasing *specialCasing;
/* NULL if there is no case folding data for this code point */
CaseFolding *caseFolding;
/* presumably the canonical combining class; store.c distinguishes 0, 230 ("above") and other values */
uint8_t cc;
/* if set, setProps() marks the character as UCASE_TITLE */
UBool isTitle;
} Props;
/* global flags */
extern UBool beVerbose, haveCopyright;
/* properties vectors in gencase.c */
extern uint32_t *pv;
/* prototypes */
U_CFUNC void
writeUCDFilename(char *basename, const char *filename, const char *suffix);
U_CFUNC UBool
isToken(const char *token, const char *s);
extern void
setUnicodeVersion(const char *v);
extern void
setProps(Props *p);
U_CFUNC uint32_t U_EXPORT2
getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset);
extern void
addCaseSensitive(UChar32 first, UChar32 last);
extern void
makeCaseClosure(void);
extern void
makeExceptions(void);
extern void
generateData(const char *dataDir);
U_CDECL_END
#endif

View file

@ -0,0 +1,166 @@
<?xml version="1.0" encoding="Windows-1252"?>
<VisualStudioProject
ProjectType="Visual C++"
Version="7.10"
Name="gencase"
SccProjectName=""
SccLocalPath="">
<Platforms>
<Platform
Name="Win32"/>
</Platforms>
<Configurations>
<Configuration
Name="Release|Win32"
OutputDirectory=".\Release"
IntermediateDirectory=".\Release"
ConfigurationType="1"
UseOfMFC="0"
ATLMinimizesCRunTimeLibraryUsage="FALSE"
CharacterSet="2">
<Tool
Name="VCCLCompilerTool"
InlineFunctionExpansion="2"
ImproveFloatingPointConsistency="TRUE"
AdditionalIncludeDirectories="..\toolutil,..\..\common"
PreprocessorDefinitions="WIN32,NDEBUG,_CONSOLE"
StringPooling="TRUE"
RuntimeLibrary="2"
EnableFunctionLevelLinking="TRUE"
DisableLanguageExtensions="TRUE"
PrecompiledHeaderFile=".\Release/gencase.pch"
AssemblerListingLocation=".\Release/"
ObjectFile=".\Release/"
ProgramDataBaseFileName=".\Release/"
WarningLevel="3"
SuppressStartupBanner="TRUE"
CompileAs="0"/>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(TargetPath)&quot; ..\..\..\bin
"
Outputs="..\..\..\bin\$(TargetFileName)"/>
<Tool
Name="VCLinkerTool"
OutputFile=".\Release/gencase.exe"
LinkIncremental="1"
SuppressStartupBanner="TRUE"
ProgramDatabaseFile=".\Release/gencase.pdb"
SubSystem="1"/>
<Tool
Name="VCMIDLTool"
TypeLibraryName=".\Release/gencase.tlb"/>
<Tool
Name="VCPostBuildEventTool"/>
<Tool
Name="VCPreBuildEventTool"/>
<Tool
Name="VCPreLinkEventTool"/>
<Tool
Name="VCResourceCompilerTool"
PreprocessorDefinitions="NDEBUG"
Culture="1033"/>
<Tool
Name="VCWebServiceProxyGeneratorTool"/>
<Tool
Name="VCXMLDataGeneratorTool"/>
<Tool
Name="VCWebDeploymentTool"/>
<Tool
Name="VCManagedWrapperGeneratorTool"/>
<Tool
Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
</Configuration>
<Configuration
Name="Debug|Win32"
OutputDirectory=".\Debug"
IntermediateDirectory=".\Debug"
ConfigurationType="1"
UseOfMFC="0"
ATLMinimizesCRunTimeLibraryUsage="FALSE"
CharacterSet="2">
<Tool
Name="VCCLCompilerTool"
Optimization="0"
ImproveFloatingPointConsistency="TRUE"
OptimizeForProcessor="2"
AdditionalIncludeDirectories="..\toolutil,..\..\common"
PreprocessorDefinitions="WIN32,_DEBUG,_CONSOLE"
BasicRuntimeChecks="3"
RuntimeLibrary="3"
DisableLanguageExtensions="TRUE"
PrecompiledHeaderFile=".\Debug/gencase.pch"
AssemblerListingLocation=".\Debug/"
ObjectFile=".\Debug/"
ProgramDataBaseFileName=".\Debug/"
BrowseInformation="1"
WarningLevel="3"
SuppressStartupBanner="TRUE"
DebugInformationFormat="4"
CompileAs="0"/>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(TargetPath)&quot; ..\..\..\bin
"
Outputs="..\..\..\bin\$(TargetFileName)"/>
<Tool
Name="VCLinkerTool"
OutputFile=".\Debug/gencase.exe"
LinkIncremental="2"
SuppressStartupBanner="TRUE"
GenerateDebugInformation="TRUE"
ProgramDatabaseFile=".\Debug/gencase.pdb"
SubSystem="1"/>
<Tool
Name="VCMIDLTool"
TypeLibraryName=".\Debug/gencase.tlb"/>
<Tool
Name="VCPostBuildEventTool"/>
<Tool
Name="VCPreBuildEventTool"/>
<Tool
Name="VCPreLinkEventTool"/>
<Tool
Name="VCResourceCompilerTool"
PreprocessorDefinitions="_DEBUG"
Culture="1033"/>
<Tool
Name="VCWebServiceProxyGeneratorTool"/>
<Tool
Name="VCXMLDataGeneratorTool"/>
<Tool
Name="VCWebDeploymentTool"/>
<Tool
Name="VCManagedWrapperGeneratorTool"/>
<Tool
Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
</Configuration>
</Configurations>
<References>
</References>
<Files>
<Filter
Name="Source Files"
Filter="c;cpp;rc">
<File
RelativePath=".\gencase.c">
</File>
<File
RelativePath=".\store.c">
</File>
</Filter>
<Filter
Name="Header Files"
Filter="h">
<File
RelativePath=".\gencase.h">
</File>
</Filter>
<Filter
Name="Resource Files"
Filter="ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe">
</Filter>
</Files>
<Globals>
</Globals>
</VisualStudioProject>

View file

@ -0,0 +1,569 @@
/*
*******************************************************************************
*
* Copyright (C) 2004, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: store.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2004aug28
* created by: Markus W. Scherer
*
* Store Unicode case mapping properties efficiently for
* random access.
*/
#include <stdio.h>
#include <stdlib.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"
#include "cmemory.h"
#include "cstring.h"
#include "filestrm.h"
#include "utrie.h"
#include "unicode/udata.h"
#include "unewdata.h"
#include "propsvec.h"
#include "gencase.h"
/* Unicode case mapping properties file format ---------------------------------
The file format prepared and written here contains several data
structures that store indexes or data.
Before the data contents described below, there are the headers required by
the udata API for loading ICU data. Especially, a UDataInfo structure
precedes the actual data. It contains platform properties values and the
file format version.
The following is a description of format version 1.
The file contains the following structures:
const int32_t indexes[i0] with values i0, i1, ...:
(see UCASE_IX_... constants for names of indexes)
i0 indexLength; -- length of indexes[] (UCASE_IX_TOP)
i1 dataLength; -- length in bytes of the post-header data (incl. indexes[])
i2 trieSize; -- size in bytes of the case mapping properties trie
i3 exceptionsLength; -- length in uint16_t of the exceptions array
i4..indexes[i0] reservedIndexes; -- reserved values; 0 for now
Serialized trie, see utrie.h;
const uint16_t exceptions[exceptionsLength];
Trie data word:
Bits
if(exception) {
15..4 unsigned exception index
} else {
if(not uncased) {
15..6 signed delta to simple case mapping code point
(add delta to input code point)
}
5..4 0 normal character with cc=0
1 soft-dotted character
2 cc=230
3 other cc
}
3 exception
2 case sensitive
1..0 0 uncased
1 lowercase
2 uppercase
3 titlecase
Exceptions:
A sub-array of the exceptions array is indexed by the exception index in a
trie word.
The sub-array consists of the following fields:
uint16_t excWord;
uint16_t optional values [];
UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase
excWord: (see UCASE_EXC_...)
Bits
15 conditional case folding
14 conditional special casing
13..12 same as non-exception trie data bits 5..4
moved here because the exception index needs more bits than the delta
0 normal character with cc=0
1 soft-dotted character
2 cc=230
3 other cc
11.. 9 reserved
8 if set, then for each optional-value slot there are 2 uint16_t values
(high and low parts of 32-bit values)
instead of single ones
7.. 0 bits for which optional value is present
Optional-value slots:
0 lowercase mapping (code point)
1 case folding (code point)
2 uppercase mapping (code point)
3 titlecase mapping (code point)
4..6 reserved
7 there is at least one full (string) case mapping
the length of each is encoded in a nibble of this optional value,
and the strings follow this optional value in the same order:
lower/fold/upper/title
For space saving, some values are not stored. Lookups are as follows:
- If special casing is conditional, then no full lower/upper/title mapping
strings are stored.
- If case folding is conditional, then no simple or full case foldings are
stored.
- Fall back in this order:
full (string) mapping -- if full mappings are used
simple (code point) mapping of the same type
simple fold->simple lower
simple title->simple upper
finally, the original code point (no mapping)
----------------------------------------------------------------------------- */
/*
 * UDataInfo cf. udata.h
 *
 * Header description that generateData() passes to udata_create().
 * NOTE(review): the field names in the comments below are taken from the
 * UDataInfo declaration in udata.h, which is not visible here -- confirm
 * against that header.
 */
static UDataInfo dataInfo={
sizeof(UDataInfo), /* size */
0, /* reservedWord */
U_IS_BIG_ENDIAN, /* isBigEndian */
U_CHARSET_FAMILY, /* charsetFamily */
U_SIZEOF_UCHAR, /* sizeofUChar */
0, /* reservedByte */
/* dataFormat="cAsE" */
{ UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 },
{ 1, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
{ 4, 0, 1, 0 } /* dataVersion; overwritten by setUnicodeVersion() */
};
enum {
/* maximum number of exceptions expected */
/* (capacity of excProps[], i.e. of code points that setProps() can flag as exceptions) */
MAX_EXC_COUNT=1000
};
/* exceptions values */
/*
 * Encoded exceptions array as written to the output file.
 * makeException() checks only the start index of a new entry against
 * UCASE_MAX_EXCEPTIONS, so the extra 100 units leave room for the entry
 * that starts at the last permitted index.
 */
static uint16_t exceptions[UCASE_MAX_EXCEPTIONS+100];
/* number of uint16_t units used in exceptions[] so far = index of the next entry */
static uint16_t exceptionsTop=0;
/* copies of the Props of all code points that setProps() flagged with UCASE_EXCEPTION */
static Props excProps[MAX_EXC_COUNT];
/* number of entries used in excProps[] */
static uint16_t exceptionsCount=0;
/* -------------------------------------------------------------------------- */
/*
 * Record the Unicode version of the input data:
 * parse the version string v and copy the four version bytes into the
 * dataVersion field of the output file header (dataInfo).
 */
extern void
setUnicodeVersion(const char *v) {
    UVersionInfo parsedVersion;

    u_versionFromString(parsedVersion, v);

    /* dataVersion holds exactly 4 bytes */
    uprv_memcpy(dataInfo.dataVersion, parsedVersion, 4);
}
/* store a character's properties ------------------------------------------- */
/*
 * Compute the case properties word for one code point and store it in
 * column 0 of the properties vectors (pv).
 *
 * The word starts from the bits that are already set for p->code in pv.
 * A character becomes an "exception" (UCASE_EXCEPTION) if its mappings cannot
 * be expressed as a single small delta: a mapping that does not fit the case
 * type, a titlecase mapping that differs from the uppercase mapping,
 * special casing or case folding data, or a delta outside
 * UCASE_MIN_DELTA..UCASE_MAX_DELTA.
 * For an exception, a copy of *p is kept in excProps[] and its index is
 * stored temporarily in the upper 16 bits of the word (UGENCASE_EXC_SHIFT);
 * makeExceptions() later replaces it with the final exceptions index.
 *
 * Exits the program if a soft-dotted character has cc!=0, if excProps[] is
 * full, or if the value cannot be set in pv.
 *
 * @param p properties of one code point; not modified here
 */
extern void
setProps(Props *p) {
    UErrorCode errorCode;
    uint32_t value;
    int32_t delta;

    /* start from the bits that are already set for this code point */
    value=upvec_getValue(pv, p->code, 0);
    delta=0;

    if(p->isTitle) {
        /* the Titlecase property is read late, from UnicodeData.txt */
        value|=UCASE_TITLE;
    }
    if(p->upperCase!=0) {
        /* uppercase mapping as delta if the character is lowercase */
        if((value&UCASE_TYPE_MASK)==UCASE_LOWER) {
            delta=p->upperCase-p->code;
        } else {
            value|=UCASE_EXCEPTION;
        }
    }
    if(p->lowerCase!=0) {
        /* lowercase mapping as delta if the character is uppercase or titlecase */
        if((value&UCASE_TYPE_MASK)==UCASE_UPPER || (value&UCASE_TYPE_MASK)==UCASE_TITLE) {
            delta=p->lowerCase-p->code;
        } else {
            value|=UCASE_EXCEPTION;
        }
    }
    if(p->upperCase!=p->titleCase) {
        /* a separate titlecase mapping needs its own slot */
        value|=UCASE_EXCEPTION;
    }
    if(p->specialCasing!=NULL) {
        value|=UCASE_EXCEPTION;
    }
    if(p->caseFolding!=NULL) {
        value|=UCASE_EXCEPTION;
    }
    if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
        /* the delta does not fit into the 10-bit signed field */
        value|=UCASE_EXCEPTION;
    }

    if(p->cc!=0) {
        /* the dot/accent field is shared: soft-dotted and cc!=0 exclude each other */
        if(value&UCASE_DOT_MASK) {
            fprintf(stderr, "gencase: a soft-dotted character has cc!=0\n");
            exit(U_INTERNAL_PROGRAM_ERROR);
        }
        if(p->cc==230) {
            value|=UCASE_ABOVE;
        } else {
            value|=UCASE_OTHER_ACCENT;
        }
    }

    /* handle exceptions */
    if(value&UCASE_EXCEPTION) {
        /* check the capacity before writing so that all MAX_EXC_COUNT entries are usable */
        if(exceptionsCount>=MAX_EXC_COUNT) {
            fprintf(stderr, "gencase: too many exceptions\n");
            exit(U_INDEX_OUTOFBOUNDS_ERROR);
        }

        /* simply store exceptions for later processing and encoding */
        value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT;
        uprv_memcpy(excProps+exceptionsCount, p, sizeof(*p));
        ++exceptionsCount;
    } else {
        /* store the simple case mapping delta */
        value|=((uint32_t)delta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK;
    }

    errorCode=U_ZERO_ERROR;
    if(!upvec_setValue(pv, p->code, p->code+1, 0, value, 0xffffffff, &errorCode)) {
        fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n",
                        u_errorName(errorCode));
        exit(errorCode);
    }
}
/*
 * Set the UCASE_SENSITIVE bit for all code points first..last (inclusive)
 * in column 0 of the properties vectors; other bits are left unchanged.
 * Exits the program if the value cannot be set.
 */
extern void
addCaseSensitive(UChar32 first, UChar32 last) {
    UErrorCode status=U_ZERO_ERROR;

    /* upvec_setValue() takes an exclusive limit, hence last+1 */
    if(upvec_setValue(pv, first, last+1, 0, UCASE_SENSITIVE, UCASE_SENSITIVE, &status)) {
        return;
    }

    fprintf(stderr, "gencase error: unable to set UCASE_SENSITIVE, code: %s\n",
                    u_errorName(status));
    exit(status);
}
/*
 * Placeholder: declared in gencase.h, but not implemented yet.
 * Currently does nothing.
 */
extern void
makeCaseClosure() {
/* TODO */
}
/* exceptions --------------------------------------------------------------- */
/*
 * Is a full (string) mapping just the same single code point as the simple mapping?
 *
 * @param s      length-prefixed string: s[0] is the length, the text starts at s[1]
 * @param simple simple mapping, or 0 if none is stored
 * @param c      original code point, used in place of simple if simple==0
 * @return TRUE if the string consists of exactly one code point which is equal
 *         to the (effective) simple mapping
 */
static UBool
fullMappingEqualsSimple(const UChar *s, UChar32 simple, UChar32 c) {
    const UChar *text=s+1;
    int32_t textLength=s[0];
    int32_t pos=0;
    UChar32 first;

    /* an empty string, or one that is too long for a single code point, cannot match */
    if(textLength<1 || U16_MAX_LENGTH<textLength) {
        return FALSE;
    }

    /* read the first code point; it must consume the whole string */
    U16_NEXT(text, pos, textLength, first);
    if(pos!=textLength) {
        return FALSE;
    }

    /* UCD has no simple mapping if it's the same as the code point itself */
    return (UBool)(first==(simple!=0 ? simple : c));
}
static uint16_t
makeException(uint32_t value, Props *p) {
uint32_t slots[8];
uint32_t slotBits;
uint16_t excWord, excIndex, excTop, i, count, length, fullLengths;
UBool doubleSlots;
/* excIndex will be returned for storing in the trie word */
excIndex=exceptionsTop;
if(excIndex>=UCASE_MAX_EXCEPTIONS) {
fprintf(stderr, "gencase error: too many exceptions words\n");
exit(U_BUFFER_OVERFLOW_ERROR);
}
excTop=excIndex+1; /* +1 for excWord which will be stored at excIndex */
/* copy and shift the soft-dotted bits */
excWord=((uint16_t)value&UCASE_DOT_MASK)<<UCASE_EXC_DOT_SHIFT;
/* set the bits for conditional mappings */
if(p->specialCasing!=NULL && p->specialCasing->isComplex) {
excWord|=UCASE_EXC_CONDITIONAL_SPECIAL;
p->specialCasing=NULL;
}
if(p->caseFolding!=NULL && p->caseFolding->simple==0 && p->caseFolding->full[0]==0) {
excWord|=UCASE_EXC_CONDITIONAL_FOLD;
p->caseFolding=NULL;
}
/*
* Note:
* UCD stores no simple mappings when they are the same as the code point itself.
* SpecialCasing and CaseFolding do store simple mappings even if they are
* the same as the code point itself.
* Comparisons between simple regular mappings and simple special/folding
* mappings need to compensate for the difference by comparing with the
* original code point if a simple UCD mapping is missing (0).
*/
/* remove redundant data */
if(p->specialCasing!=NULL) {
/* do not store full mappings if they are the same as the simple ones */
if(fullMappingEqualsSimple(p->specialCasing->lowerCase, p->lowerCase, p->code)) {
p->specialCasing->lowerCase[0]=0;
}
if(fullMappingEqualsSimple(p->specialCasing->upperCase, p->upperCase, p->code)) {
p->specialCasing->upperCase[0]=0;
}
if(fullMappingEqualsSimple(p->specialCasing->titleCase, p->titleCase, p->code)) {
p->specialCasing->titleCase[0]=0;
}
}
if( p->caseFolding!=NULL &&
fullMappingEqualsSimple(p->caseFolding->full, p->caseFolding->simple, p->code)
) {
p->caseFolding->full[0]=0;
}
/* write the optional slots */
slotBits=0;
count=0;
if(p->lowerCase!=0) {
slots[count]=(uint32_t)p->lowerCase;
slotBits|=slots[count];
++count;
excWord|=U_MASK(UCASE_EXC_LOWER);
}
if( p->caseFolding!=NULL &&
(p->lowerCase!=0 ?
p->caseFolding->simple!=p->lowerCase :
p->caseFolding->simple!=p->code)
) {
slots[count]=(uint32_t)p->caseFolding->simple;
slotBits|=slots[count];
++count;
excWord|=U_MASK(UCASE_EXC_FOLD);
}
if(p->upperCase!=0) {
slots[count]=(uint32_t)p->upperCase;
slotBits|=slots[count];
++count;
excWord|=U_MASK(UCASE_EXC_UPPER);
}
if(p->upperCase!=p->titleCase) {
if(p->titleCase!=0) {
slots[count]=(uint32_t)p->titleCase;
} else {
slots[count]=(uint32_t)p->code;
}
slotBits|=slots[count];
++count;
excWord|=U_MASK(UCASE_EXC_TITLE);
}
/* lengths of full case mapping strings, stored in the last slot */
fullLengths=0;
if(p->specialCasing!=NULL) {
fullLengths=p->specialCasing->lowerCase[0];
fullLengths|=p->specialCasing->upperCase[0]<<8;
fullLengths|=p->specialCasing->titleCase[0]<<12;
}
if(p->caseFolding!=NULL) {
fullLengths|=p->caseFolding->full[0]<<4;
}
if(fullLengths!=0) {
slots[count]=fullLengths;
slotBits|=slots[count];
++count;
excWord|=U_MASK(UCASE_EXC_FULL_MAPPINGS);
}
/* write slots */
doubleSlots=(UBool)(slotBits>0xffff);
if(!doubleSlots) {
for(i=0; i<count; ++i) {
exceptions[excTop++]=(uint16_t)slots[i];
}
} else {
excWord|=UCASE_EXC_DOUBLE_SLOTS;
for(i=0; i<count; ++i) {
exceptions[excTop++]=(uint16_t)(slots[i]>>16);
exceptions[excTop++]=(uint16_t)slots[i];
}
}
/* write the full case mapping strings */
if(p->specialCasing!=NULL) {
length=(uint16_t)p->specialCasing->lowerCase[0];
u_memcpy((UChar *)exceptions+excTop, p->specialCasing->lowerCase+1, length);
excTop+=length;
}
if(p->caseFolding!=NULL) {
length=(uint16_t)p->caseFolding->full[0];
u_memcpy((UChar *)exceptions+excTop, p->caseFolding->full+1, length);
excTop+=length;
}
if(p->specialCasing!=NULL) {
length=(uint16_t)p->specialCasing->upperCase[0];
u_memcpy((UChar *)exceptions+excTop, p->specialCasing->upperCase+1, length);
excTop+=length;
length=(uint16_t)p->specialCasing->titleCase[0];
u_memcpy((UChar *)exceptions+excTop, p->specialCasing->titleCase+1, length);
excTop+=length;
}
exceptionsTop=excTop;
/* write the main exceptions word */
exceptions[excIndex]=excWord;
return excIndex;
}
/*
 * Encode all exceptions: walk over all rows of the properties vectors and,
 * for each row whose word has UCASE_EXCEPTION set, build the exceptions entry
 * from the Props that setProps() saved in excProps[]. Then replace the
 * temporary excProps index (upper 16 bits) and the delta/dot field with the
 * final index into exceptions[].
 */
extern void
makeExceptions() {
    uint32_t *row;
    int32_t rowIndex;

    for(rowIndex=0; (row=upvec_getRow(pv, rowIndex, NULL, NULL))!=NULL; ++rowIndex) {
        uint32_t word=*row;
        uint16_t excIndex;

        if((word&UCASE_EXCEPTION)==0) {
            continue; /* regular word, nothing to do */
        }

        excIndex=makeException(word, excProps+(word>>UGENCASE_EXC_SHIFT));
        *row=(word&~(UGENCASE_EXC_MASK|UCASE_EXC_MASK))|(excIndex<<UCASE_EXC_SHIFT);
    }
}
/* generate output data ----------------------------------------------------- */
/* TODO: create/use default folding function?! */
/* folding value: just store the offset (16 bits) if there is any non-0 entry */
/*
 * Folding callback passed to utrie_serialize() in generateData():
 * look at the 0x400 code points beginning at start and return
 * offset|0x8000 if any one of them has a non-zero trie value, otherwise 0.
 */
U_CFUNC uint32_t U_EXPORT2
getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset) {
    const UChar32 end=start+0x400;
    UChar32 c;
    UBool isInBlockZero;

    for(c=start; c<end;) {
        uint32_t word=utrie_get32(trie, c, &isInBlockZero);

        if(isInBlockZero) {
            /* skip the rest of the data block */
            c+=UTRIE_DATA_BLOCK_LENGTH;
            continue;
        }
        if(word!=0) {
            return (uint32_t)(offset|0x8000);
        }
        ++c;
    }

    return 0;
}
/*
 * Build the trie from the properties vectors and write the output data file
 * (UCASE_DATA_NAME "." UCASE_DATA_TYPE) into dataDir.
 *
 * The data after the udata header is written in this order, which defines
 * the file layout: indexes[], serialized trie, exceptions[].
 *
 * Exits the program on any error.
 *
 * @param dataDir destination directory, passed through to udata_create()
 */
extern void
generateData(const char *dataDir) {
/* only slot 0 (UCASE_IX_INDEX_TOP) is initialized explicitly; the other slots start as 0 */
static int32_t indexes[UCASE_IX_TOP]={
UCASE_IX_TOP
};
/* buffer for the serialized trie */
static uint8_t trieBlock[40000];
const uint32_t *row;
UChar32 start, limit;
int32_t i;
UNewDataMemory *pData;
UNewTrie *pTrie;
UErrorCode errorCode=U_ZERO_ERROR;
int32_t trieSize;
long dataLength;
pTrie=utrie_open(NULL, NULL, 20000, 0, 0, TRUE);
if(pTrie==NULL) {
fprintf(stderr, "gencase error: unable to create a UNewTrie\n");
exit(U_MEMORY_ALLOCATION_ERROR);
}
/* copy the first word of each properties vectors row into the trie for the row's range */
for(i=0; (row=upvec_getRow(pv, i, &start, &limit))!=NULL; ++i) {
if(!utrie_setRange32(pTrie, start, limit, *row, TRUE)) {
fprintf(stderr, "gencase error: unable to set trie value (overflow)\n");
exit(U_BUFFER_OVERFLOW_ERROR);
}
}
trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), getFoldedPropsValue, TRUE, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
exit(errorCode);
}
/* fill in the indexes; exceptionsTop counts uint16_t units, hence 2* for bytes */
indexes[UCASE_IX_EXC_LENGTH]=exceptionsTop;
indexes[UCASE_IX_TRIE_SIZE]=trieSize;
indexes[UCASE_IX_LENGTH]=(int32_t)sizeof(indexes)+trieSize+2*exceptionsTop;
if(beVerbose) {
printf("trie size in bytes: %5d\n", (int)trieSize);
printf("number of code points with exceptions: %5d\n", exceptionsCount);
printf("size in bytes of exceptions: %5d\n", 2*exceptionsTop);
printf("data size: %5d\n", (int)indexes[UCASE_IX_LENGTH]);
}
/* write the data */
pData=udata_create(dataDir, UCASE_DATA_TYPE, UCASE_DATA_NAME, &dataInfo,
haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "gencase: unable to create data memory, %s\n", u_errorName(errorCode));
exit(errorCode);
}
/* the order of these blocks is the file format: indexes, trie, exceptions */
udata_writeBlock(pData, indexes, sizeof(indexes));
udata_writeBlock(pData, trieBlock, trieSize);
udata_writeBlock(pData, exceptions, 2*exceptionsTop);
/* finish up */
dataLength=udata_finish(pData, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "gencase: error %d writing the output file\n", errorCode);
exit(errorCode);
}
/* sanity check: the length reported by udata_finish() must match the length stored in indexes[] */
if(dataLength!=indexes[UCASE_IX_LENGTH]) {
fprintf(stderr, "gencase: data length %ld != calculated size %d\n",
dataLength, (int)indexes[UCASE_IX_LENGTH]);
exit(U_INTERNAL_PROGRAM_ERROR);
}
utrie_close(pTrie);
}
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/