diff --git a/icu4c/source/tools/gencase/Makefile.in b/icu4c/source/tools/gencase/Makefile.in new file mode 100644 index 00000000000..0125abd6f5b --- /dev/null +++ b/icu4c/source/tools/gencase/Makefile.in @@ -0,0 +1,100 @@ +## Makefile.in for ICU - tools/gencase +## Copyright (c) 1999-2004, International Business Machines Corporation and +## others. All Rights Reserved. +## Steven R. Loomis + +## Source directory information +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ + +top_builddir = ../.. + +include $(top_builddir)/icudefs.mk + +## + +TARGET_STUB_NAME = gencase + +SECTION = 8 + +MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) + +## Build directory information +subdir = tools/gencase + +## Extra files to remove for 'make clean' +CLEANFILES = *~ $(DEPS) $(MAN_FILES) + +## Target information +TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) + +CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common -I$(srcdir)/../toolutil +LIBS = $(LIBICUTOOLUTIL) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) + +OBJECTS = gencase.o store.o + +DEPS = $(OBJECTS:.o=.d) + +## List of phony targets +.PHONY : all all-local install install-local clean clean-local \ +distclean distclean-local dist dist-local check \ +check-local install-man + +## Clear suffix list +.SUFFIXES : + +## List of standard targets +all: all-local +install: install-local +clean: clean-local +distclean : distclean-local +dist: dist-local +check: all check-local + +all-local: $(TARGET) $(MAN_FILES) + +install-local: all-local install-man + $(MKINSTALLDIRS) $(DESTDIR)$(sbindir) + $(INSTALL) $(TARGET) $(DESTDIR)$(sbindir) + +# man page +install-man: $(MAN_FILES) + $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) + $(INSTALL_DATA) $< $(DESTDIR)$(mandir)/man$(SECTION) + +%.$(SECTION): $(srcdir)/%.$(SECTION).in + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +# build postscript and pdf formats +#$(TARGET).ps: $(TARGET).$(SECTION) +# groff -man < $< > $@ + +#$(TARGET).pdf: 
$(TARGET).ps +# ps2pdf $< $@ + +dist-local: + +clean-local: + test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) + $(RMV) $(TARGET) $(OBJECTS) + +distclean-local: clean-local + $(RMV) Makefile + +check-local: all-local + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +$(TARGET) : $(OBJECTS) + $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) + +ifeq (,$(MAKECMDGOALS)) +-include $(DEPS) +else +ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) +-include $(DEPS) +endif +endif diff --git a/icu4c/source/tools/gencase/gencase.8.in b/icu4c/source/tools/gencase/gencase.8.in new file mode 100644 index 00000000000..e161c877526 --- /dev/null +++ b/icu4c/source/tools/gencase/gencase.8.in @@ -0,0 +1,126 @@ +.\" Hey, Emacs! This is -*-nroff-*- you know... +.\" +.\" genprops.8: manual page for the genprops utility +.\" +.\" Copyright (C) 2000-2001 IBM, Inc. and others. +.\" +.TH GENPROPS 8 "16 January 2001" "ICU MANPAGE" "ICU @VERSION@ Manual" +.SH NAME +.B genprops +\- compile properties from the Unicode Character Database +.SH SYNOPSIS +.B genprops +[ +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +] +[ +.BR "\-v\fP, \fB\-\-verbose" +] +[ +.BI "\-u\fP, \fB\-\-unicode" " version" +] +[ +.BI "\-c\fP, \fB\-\-copyright" +] +[ +.BI "\-s\fP, \fB\-\-sourcedir" " source" +] +[ +.BI "\-d\fP, \fB\-\-destdir" " destination" +] +[ +.I suffix +] +.SH DESCRIPTION +.B genprops +reads some of the Unicode Character Database files and compiles their +information into a binary form. +The resulting file, +.BR icudata.dat , +can then be read directly by ICU, or used by +.BR pkgdata (8) +for incorporation into a larger archive or library. +.LP +The files read by +.B genprops +are described in the +.B FILES +section. If +.I suffix +is passed on the command line, the names of these files will actually +be changed to include a dash followed by +.I suffix +in their basename. 
For example, the file +.B UnicodeData.txt +would be looked for under the name +.BR UnicodeData\-\fIsuffix\fP.txt . +.SH OPTIONS +.TP +.BR "\-h\fP, \fB\-?\fP, \fB\-\-help" +Print help about usage and exit. +.TP +.BR "\-v\fP, \fB\-\-verbose" +Display extra informative messages during execution. +.TP +.BI "\-u\fP, \fB\-\-unicode" " version" +Specify which +.I version +of Unicode the Unicode Character Database refers to. +Defaults to +.BR 3.0.0 . +.TP +.BI "\-c\fP, \fB\-\-copyright" +Include a copyright notice into the binary data. +.TP +.BI "\-s\fP, \fB\-\-sourcedir" " source" +Set the source directory to +.IR source . +The default source directory is specified by the environment variable +.BR ICU_DATA . +.TP +.BI "\-d\fP, \fB\-\-destdir" " destination" +Set the destination directory to +.IR destination . +The default destination directory is specified by the environment variable +.BR ICU_DATA . +.SH ENVIRONMENT +.TP 10 +.B ICU_DATA +Specifies the directory containing ICU data. Defaults to +.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ . +Some tools in ICU depend on the presence of the trailing slash. It is thus +important to make sure that it is present if +.B ICU_DATA +is set. +.SH FILES +The following files are read by +.B genprops +and are looked for in the +.I source +directory. +.TP 20 +.B UnicodeData.txt +The main file in the Unicode Character Database. Contains character +properties, combining classes information, decompositions, names, +etc.\|.\|.. +.TP +.B BidiMirroring.txt +Properties for substituting characters in an implementation of +bidirectional mirroring. +.TP +.B SpecialCasing.txt +List of properties required for full case mapping. +.TP +.B CaseFolding.txt +Mapping from characters to their case-folded forms. (Note: this file +is derived from +.B UnicodeData.txt +and +.B SpecialCasing.txt +when generated by the Unicode Consortium.) +.SH VERSION +@VERSION@ +.SH COPYRIGHT +Copyright (C) 2000-2002 IBM, Inc. and others. 
+.SH SEE ALSO +.BR pkgdata (8) diff --git a/icu4c/source/tools/gencase/gencase.c b/icu4c/source/tools/gencase/gencase.c new file mode 100644 index 00000000000..bdf81361e36 --- /dev/null +++ b/icu4c/source/tools/gencase/gencase.c @@ -0,0 +1,776 @@ +/* +******************************************************************************* +* +* Copyright (C) 2004, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: gencase.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2004aug28 +* created by: Markus W. Scherer +* +* This program reads several of the Unicode character database text files, +* parses them, and extracts the case mapping properties for each character. +* It then writes a binary file containing the properties +* that is designed to be used directly for random-access to +* the properties of each Unicode character. +*/ + +#include +#include "unicode/utypes.h" +#include "unicode/uchar.h" +#include "unicode/uset.h" +#include "unicode/putil.h" +#include "unicode/uclean.h" +#include "cmemory.h" +#include "cstring.h" +#include "uarrsort.h" +#include "unewdata.h" +#include "uoptions.h" +#include "uparse.h" +#include "uprops.h" +#include "propsvec.h" +#include "gencase.h" + +#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) + +/* data --------------------------------------------------------------------- */ + +static UNewTrie *trie; +uint32_t *pv; +static int32_t pvCount; + +UBool beVerbose=FALSE, haveCopyright=TRUE; + +/* + * Unicode set collecting the case-sensitive characters; + * see uchar.h UCHAR_CASE_SENSITIVE. + * Add code points from case mappings/foldings in + * the root locale and with default options. 
+ */ +static USet *caseSensitive; + +/* prototypes --------------------------------------------------------------- */ + +static void +parseSpecialCasing(const char *filename, UErrorCode *pErrorCode); + +static void +parseCaseFolding(const char *filename, UErrorCode *pErrorCode); + +static void +parseDB(const char *filename, UErrorCode *pErrorCode); + +/* parse files with multiple binary properties ------------------------------ */ + +/* TODO: more common code, move functions to uparse.h|c */ + +/* TODO: similar to genprops/props2.c but not the same */ + +struct Binary { + const char *propName; + int32_t vecWord; + uint32_t vecValue, vecMask; +}; +typedef struct Binary Binary; + +struct Binaries { + const char *ucdFile; + const Binary *binaries; + int32_t binariesCount; +}; +typedef struct Binaries Binaries; + +static const Binary +propListNames[]={ + { "Soft_Dotted", 0, UCASE_SOFT_DOTTED, UCASE_DOT_MASK } +}; + +static const Binaries +propListBinaries={ + "PropList", propListNames, LENGTHOF(propListNames) +}; + +static const Binary +derCorePropsNames[]={ + { "Lowercase", 0, UCASE_LOWER, UCASE_TYPE_MASK }, + { "Uppercase", 0, UCASE_UPPER, UCASE_TYPE_MASK } +}; + +static const Binaries +derCorePropsBinaries={ + "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames) +}; + +static void U_CALLCONV +binariesLineFn(void *context, + char *fields[][2], int32_t fieldCount, + UErrorCode *pErrorCode) { + const Binaries *bin; + char *s; + uint32_t start, limit; + int32_t i; + + bin=(const Binaries *)context; + + u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]); + exit(*pErrorCode); + } + ++limit; + + /* parse binary property name */ + s=(char *)u_skipWhitespace(fields[1][0]); + for(i=0;; ++i) { + if(i==bin->binariesCount) { + /* ignore unrecognized properties */ + return; + } + if(isToken(bin->binaries[i].propName, s)) 
{ + break; + } + } + + if(bin->binaries[i].vecMask==0) { + fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n", + (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName); + exit(U_INTERNAL_PROGRAM_ERROR); + } + + if(!upvec_setValue(pv, start, limit, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode)) { + fprintf(stderr, "gencase error: unable to set %s, code: %s\n", + bin->binaries[i].propName, u_errorName(*pErrorCode)); + exit(*pErrorCode); + } +} + +static void +parseBinariesFile(char *filename, char *basename, const char *suffix, + const Binaries *bin, + UErrorCode *pErrorCode) { + char *fields[2][2]; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return; + } + + writeUCDFilename(basename, bin->ucdFile, suffix); + + u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode)); + } +} + +/* -------------------------------------------------------------------------- */ + +enum +{ + HELP_H, + HELP_QUESTION_MARK, + VERBOSE, + COPYRIGHT, + DESTDIR, + SOURCEDIR, + UNICODE_VERSION, + ICUDATADIR +}; + +/* Keep these values in sync with the above enums */ +static UOption options[]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_VERBOSE, + UOPTION_COPYRIGHT, + UOPTION_DESTDIR, + UOPTION_SOURCEDIR, + { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }, + UOPTION_ICUDATADIR +}; + +extern int +main(int argc, char* argv[]) { + char filename[300]; + const char *srcDir=NULL, *destDir=NULL, *suffix=NULL; + char *basename=NULL; + UErrorCode errorCode=U_ZERO_ERROR; + + U_MAIN_INIT_ARGS(argc, argv); + + /* preset then read command line options */ + options[DESTDIR].value=u_getDataDirectory(); + options[SOURCEDIR].value=""; + options[UNICODE_VERSION].value=""; + options[ICUDATADIR].value=u_getDataDirectory(); + argc=u_parseArgs(argc, argv, 
sizeof(options)/sizeof(options[0]), options); + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, + "error in command line argument \"%s\"\n", + argv[-argc]); + } + if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { + /* + * Broken into chunks because the C89 standard says the minimum + * required supported string length is 509 bytes. + */ + fprintf(stderr, + "Usage: %s [-options] [suffix]\n" + "\n" + "read the UnicodeData.txt file and other Unicode properties files and\n" + "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the character properties\n" + "\n", + argv[0]); + fprintf(stderr, + "Options:\n" + "\t-h or -? or --help this usage text\n" + "\t-v or --verbose verbose output\n" + "\t-c or --copyright include a copyright notice\n" + "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"); + fprintf(stderr, + "\t-d or --destdir destination directory, followed by the path\n" + "\t-s or --sourcedir source directory, followed by the path\n" + "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" + "\t followed by path, defaults to %s\n" + "\tsuffix suffix that is to be appended with a '-'\n" + "\t to the source file basenames before opening;\n" + "\t 'gencase new' will read UnicodeData-new.txt etc.\n", + u_getDataDirectory()); + return argc<0 ? 
U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + /* get the options values */ + beVerbose=options[VERBOSE].doesOccur; + haveCopyright=options[COPYRIGHT].doesOccur; + srcDir=options[SOURCEDIR].value; + destDir=options[DESTDIR].value; + + if(argc>=2) { + suffix=argv[1]; + } else { + suffix=NULL; + } + + if(options[UNICODE_VERSION].doesOccur) { + setUnicodeVersion(options[UNICODE_VERSION].value); + } + /* else use the default dataVersion in store.c */ + + if (options[ICUDATADIR].doesOccur) { + u_setDataDirectory(options[ICUDATADIR].value); + } + + /* prepare the filename beginning with the source dir */ + uprv_strcpy(filename, srcDir); + basename=filename+uprv_strlen(filename); + if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { + *basename++=U_FILE_SEP_CHAR; + } + + /* initialize */ + pv=upvec_open(1, 10000); + caseSensitive=uset_open(1, 0); /* empty set (start>end) */ + + /* process SpecialCasing.txt */ + writeUCDFilename(basename, "SpecialCasing", suffix); + parseSpecialCasing(filename, &errorCode); + + /* process CaseFolding.txt */ + writeUCDFilename(basename, "CaseFolding", suffix); + parseCaseFolding(filename, &errorCode); + + /* process additional properties files */ + *basename=0; + + parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode); + + parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode); + + /* process UnicodeData.txt */ + writeUCDFilename(basename, "UnicodeData", suffix); + parseDB(filename, &errorCode); + + /* process parsed data */ + makeCaseClosure(); + + makeExceptions(); + + if(U_SUCCESS(errorCode)) { + /* write the properties data file */ + generateData(destDir); + } + + u_cleanup(); + return errorCode; +} + +U_CFUNC void +writeUCDFilename(char *basename, const char *filename, const char *suffix) { + int32_t length=(int32_t)uprv_strlen(filename); + uprv_strcpy(basename, filename); + if(suffix!=NULL) { + basename[length++]='-'; + uprv_strcpy(basename+length, suffix); + 
length+=(int32_t)uprv_strlen(suffix); + } + uprv_strcpy(basename+length, ".txt"); +} + +/* TODO: move to toolutil */ +U_CFUNC UBool +isToken(const char *token, const char *s) { + const char *z; + int32_t j; + + s=u_skipWhitespace(s); + for(j=0;; ++j) { + if(token[j]!=0) { + if(s[j]!=token[j]) { + break; + } + } else { + z=u_skipWhitespace(s+j); + if(*z==';' || *z==0) { + return TRUE; + } else { + break; + } + } + } + + return FALSE; +} + +static void +_set_addAll(USet *set, const UChar *s, int32_t length) { + UChar32 c; + int32_t i; + + /* needs length>=0 */ + for(i=0; icode-((const SpecialCasing *)right)->code; +} + +static void +parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) { + char *fields[5][2]; + int32_t i, j; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return; + } + + u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode); + + /* sort the special casing entries by code point */ + if(specialCasingCount>0) { + uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing), + compareSpecialCasings, NULL, FALSE, pErrorCode); + } + if(U_FAILURE(*pErrorCode)) { + return; + } + + /* replace multiple entries for any code point by one "complex" one */ + j=0; + for(i=1; i0) { + uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing), + compareSpecialCasings, NULL, FALSE, pErrorCode); + specialCasingCount-=j; + } + if(U_FAILURE(*pErrorCode)) { + return; + } + + /* + * Add one complex mapping to caseSensitive that was filtered out above: + * Greek final Sigma has a conditional mapping but not locale-sensitive, + * and it is taken when lowercasing just U+03A3 alone. 
+ * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA + */ + uset_add(caseSensitive, 0x3c2); +} + +/* parser for CaseFolding.txt ----------------------------------------------- */ + +#define MAX_CASE_FOLDING_COUNT 2000 + +static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT]; +static int32_t caseFoldingCount=0; + +static void U_CALLCONV +caseFoldingLineFn(void *context, + char *fields[][2], int32_t fieldCount, + UErrorCode *pErrorCode) { + char *end; + static UChar32 prevCode=0; + int32_t count; + char status; + + /* get code point */ + caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16); + end=(char *)u_skipWhitespace(end); + if(end<=fields[0][0] || end!=fields[0][1]) { + fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + + /* get the status of this mapping */ + caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]); + if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') { + fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + + /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */ + if(status=='L') { + return; + } + + /* get the mapping */ + count=caseFoldings[caseFoldingCount].full[0]= + (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]); + exit(*pErrorCode); + } + + /* there is a simple mapping only if there is exactly one code point (count is in UChars) */ + if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) { + 
caseFoldings[caseFoldingCount].simple=0; + } + + /* update the case-sensitive set */ + if(status!='T') { + uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code); + _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]); + } + + /* check the status */ + if(status=='S') { + /* check if there was a full mapping for this code point before */ + if( caseFoldingCount>0 && + caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && + caseFoldings[caseFoldingCount-1].status=='F' + ) { + /* merge the two entries */ + caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple; + return; + } + } else if(status=='F') { + /* check if there was a simple mapping for this code point before */ + if( caseFoldingCount>0 && + caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && + caseFoldings[caseFoldingCount-1].status=='S' + ) { + /* merge the two entries */ + uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR); + return; + } + } else if(status=='I' || status=='T') { + /* check if there was a default mapping for this code point before (remove it) */ + while(caseFoldingCount>0 && + caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code + ) { + prevCode=0; + --caseFoldingCount; + } + /* store only a marker for special handling for cases like dotless i */ + caseFoldings[caseFoldingCount].simple=0; + caseFoldings[caseFoldingCount].full[0]=0; + } + + /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */ + if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) { + fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n", + (unsigned long)caseFoldings[caseFoldingCount].code, + (unsigned long)prevCode); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + 
prevCode=caseFoldings[caseFoldingCount].code; + + if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) { + fprintf(stderr, "gencase: too many case folding mappings\n"); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + exit(U_INDEX_OUTOFBOUNDS_ERROR); + } +} + +static void +parseCaseFolding(const char *filename, UErrorCode *pErrorCode) { + char *fields[3][2]; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return; + } + + u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode); +} + +/* parser for UnicodeData.txt ----------------------------------------------- */ + +static int32_t specialCasingIndex=0, caseFoldingIndex=0; + +static void U_CALLCONV +unicodeDataLineFn(void *context, + char *fields[][2], int32_t fieldCount, + UErrorCode *pErrorCode) { + Props p; + char *end; + static UChar32 prevCode=0; + UChar32 value; + UBool something=FALSE; + + /* reset the properties */ + uprv_memset(&p, 0, sizeof(Props)); + + /* get the character code, field 0 */ + p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16); + if(end<=fields[0][0] || end!=fields[0][1]) { + fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + + /* get general category, field 2 */ + if(isToken("Lt", fields[2][0])) { + p.isTitle=TRUE; + something=TRUE; + } + + /* get canonical combining class, field 3 */ + value=(UChar32)uprv_strtoul(fields[3][0], &end, 10); + if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) { + fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + if(value>0) { + p.cc=(uint8_t)value; + something=TRUE; + } + + /* get uppercase mapping, field 12 */ + value=(UChar32)uprv_strtoul(fields[12][0], &end, 16); + if(end!=fields[12][1]) { + fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n", + (unsigned long)p.code); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + if(value!=0 && 
value!=p.code) { + p.upperCase=value; + uset_add(caseSensitive, p.code); + uset_add(caseSensitive, value); + something=TRUE; + } + + /* get lowercase value, field 13 */ + value=(UChar32)uprv_strtoul(fields[13][0], &end, 16); + if(end!=fields[13][1]) { + fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n", + (unsigned long)p.code); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + if(value!=0 && value!=p.code) { + p.lowerCase=value; + uset_add(caseSensitive, p.code); + uset_add(caseSensitive, value); + something=TRUE; + } + + /* get titlecase value, field 14 */ + value=(UChar32)uprv_strtoul(fields[14][0], &end, 16); + if(end!=fields[14][1]) { + fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n", + (unsigned long)p.code); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + if(value!=0 && value!=p.code) { + p.titleCase=value; + uset_add(caseSensitive, p.code); + uset_add(caseSensitive, value); + something=TRUE; + } + + /* set additional properties from previously parsed files */ + if(specialCasingIndexstatus=='C' && + p.caseFolding->simple==p.lowerCase + ) { + p.caseFolding=NULL; + } + } else { + p.caseFolding=NULL; + } + + /* check for non-character code points */ + if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) { + fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n", + (unsigned long)p.code); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + + /* check that the code points (p.code) are in ascending order */ + if(p.code<=prevCode && p.code>0) { + fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n", + (unsigned long)p.code, (unsigned long)prevCode); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + + /* properties for a single code point */ + if(something) { + setProps(&p); + } + + prevCode=p.code; +} + +static void +parseDB(const char *filename, UErrorCode *pErrorCode) { + char *fields[15][2]; + UChar32 start, end; + 
int32_t i; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return; + } + + u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode); + + /* are all sub-properties consumed? */ + if(specialCasingIndex +# Microsoft Developer Studio Generated Build File, Format Version 6.00 +# ** DO NOT EDIT ** + +# TARGTYPE "Win32 (x86) Console Application" 0x0103 + +CFG=gencase - Win32 Debug +!MESSAGE This is not a valid makefile. To build this project using NMAKE, +!MESSAGE use the Export Makefile command and run +!MESSAGE +!MESSAGE NMAKE /f "gencase.mak". +!MESSAGE +!MESSAGE You can specify a configuration when running NMAKE +!MESSAGE by defining the macro CFG on the command line. For example: +!MESSAGE +!MESSAGE NMAKE /f "gencase.mak" CFG="gencase - Win32 Debug" +!MESSAGE +!MESSAGE Possible choices for configuration are: +!MESSAGE +!MESSAGE "gencase - Win32 Release" (based on "Win32 (x86) Console Application") +!MESSAGE "gencase - Win32 Debug" (based on "Win32 (x86) Console Application") +!MESSAGE "gencase - Win64 Release" (based on "Win32 (x86) Console Application") +!MESSAGE "gencase - Win64 Debug" (based on "Win32 (x86) Console Application") +!MESSAGE + +# Begin Project +# PROP AllowPerConfigDependencies 0 +# PROP Scc_ProjName "" +# PROP Scc_LocalPath "" +CPP=cl.exe +RSC=rc.exe + +!IF "$(CFG)" == "gencase - Win32 Release" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 0 +# PROP BASE Output_Dir "Release" +# PROP BASE Intermediate_Dir "Release" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 0 +# PROP Output_Dir "Release" +# PROP Intermediate_Dir "Release" +# PROP Ignore_Export_Lib 0 +# PROP Target_Dir "" +MTL=midl.exe +# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c +# ADD CPP /nologo /G6 /MD /Za /W3 /GX /O2 /I "..\toolutil" /I "..\..\common" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c +# ADD BASE RSC /l 0x409 /d "NDEBUG" +# ADD RSC /l 0x409 /d "NDEBUG" 
+BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LINK32=link.exe +# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 +# ADD LINK32 icuuc.lib icutu.lib /nologo /subsystem:console /machine:I386 /libpath:"..\toolutil\Release" /libpath:"..\..\..\lib\Release" /libpath:"..\..\..\lib" +# Begin Custom Build +TargetPath=.\Release\gencase.exe +InputPath=.\Release\gencase.exe +InputName=gencase +SOURCE="$(InputPath)" + +"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" + copy $(TargetPath) ..\..\..\bin + +# End Custom Build + +!ELSEIF "$(CFG)" == "gencase - Win32 Debug" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 1 +# PROP BASE Output_Dir "Debug" +# PROP BASE Intermediate_Dir "Debug" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 1 +# PROP Output_Dir "Debug" +# PROP Intermediate_Dir "Debug" +# PROP Ignore_Export_Lib 0 +# PROP Target_Dir "" +MTL=midl.exe +# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FD /GZ /c +# ADD CPP /nologo /G6 /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\toolutil" /I "..\..\common" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FR /FD /GZ /c +# ADD BASE RSC /l 0x409 /d "_DEBUG" +# ADD RSC /l 0x409 /d "_DEBUG" +BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LINK32=link.exe +# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept +# ADD 
LINK32 icuucd.lib icutud.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /libpath:"..\toolutil\Debug" /libpath:"..\..\..\lib\Debug" /libpath:"..\..\..\lib" +# Begin Custom Build +TargetPath=.\Debug\gencase.exe +InputPath=.\Debug\gencase.exe +InputName=gencase +SOURCE="$(InputPath)" + +"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" + copy $(TargetPath) ..\..\..\bin + +# End Custom Build + +!ELSEIF "$(CFG)" == "gencase - Win64 Release" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 0 +# PROP BASE Output_Dir "Release" +# PROP BASE Intermediate_Dir "Release" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 0 +# PROP Output_Dir "Release" +# PROP Intermediate_Dir "Release" +# PROP Ignore_Export_Lib 0 +# PROP Target_Dir "" +MTL=midl.exe +# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN64" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c +# ADD CPP /nologo /MD /Za /W3 /GX /Zi /O2 /I "..\toolutil" /I "..\..\common" /D "WIN64" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /D "_IA64_" /D "WIN32" /D "_AFX_NO_DAO_SUPPORT" /FD /QIA64_fmaopt /Wp64 /Zm600 /c +# ADD BASE RSC /l 0x409 /d "NDEBUG" +# ADD RSC /l 0x409 /d "NDEBUG" +BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LINK32=link.exe +# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:IX86 /machine:IA64 +# ADD LINK32 icuuc.lib icutu.lib /nologo /subsystem:console /machine:IX86 /libpath:"..\toolutil\Release" /libpath:"..\..\..\lib\Release" /libpath:"..\..\..\lib" /machine:IA64 +# Begin Custom Build +TargetPath=.\Release\gencase.exe +InputPath=.\Release\gencase.exe +InputName=gencase +SOURCE="$(InputPath)" + +"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" 
+ copy $(TargetPath) ..\..\..\bin + +# End Custom Build + +!ELSEIF "$(CFG)" == "gencase - Win64 Debug" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 1 +# PROP BASE Output_Dir "Debug" +# PROP BASE Intermediate_Dir "Debug" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 1 +# PROP Output_Dir "Debug" +# PROP Intermediate_Dir "Debug" +# PROP Ignore_Export_Lib 0 +# PROP Target_Dir "" +MTL=midl.exe +# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN64" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FD /GZ /c +# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /Zi /Od /I "..\toolutil" /I "..\..\common" /D "WIN64" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /D "_IA64_" /D "WIN32" /D "_AFX_NO_DAO_SUPPORT" /FR /FD /GZ /QIA64_fmaopt /Wp64 /Zm600 /c +# ADD BASE RSC /l 0x409 /d "_DEBUG" +# ADD RSC /l 0x409 /d "_DEBUG" +BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LINK32=link.exe +# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:IX86 /pdbtype:sept /machine:IA64 +# ADD LINK32 icuucd.lib icutud.lib /nologo /subsystem:console /incremental:no /debug /machine:IX86 /pdbtype:sept /libpath:"..\toolutil\Debug" /libpath:"..\..\..\lib\Debug" /libpath:"..\..\..\lib" /machine:IA64 +# Begin Custom Build +TargetPath=.\Debug\gencase.exe +InputPath=.\Debug\gencase.exe +InputName=gencase +SOURCE="$(InputPath)" + +"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" + copy $(TargetPath) ..\..\..\bin + +# End Custom Build + +!ENDIF + +# Begin Target + +# Name "gencase - Win32 Release" +# Name "gencase - Win32 Debug" +# Name "gencase - Win64 Release" +# Name "gencase - Win64 Debug" +# Begin Source File + +SOURCE=.\gencase.c +# End Source File +# Begin Source File + 
+SOURCE=.\gencase.h +# End Source File +# Begin Source File + +SOURCE=.\store.c +# End Source File +# End Target +# End Project diff --git a/icu4c/source/tools/gencase/gencase.h b/icu4c/source/tools/gencase/gencase.h new file mode 100644 index 00000000000..40902277848 --- /dev/null +++ b/icu4c/source/tools/gencase/gencase.h @@ -0,0 +1,187 @@ +/* +******************************************************************************* +* +* Copyright (C) 2004, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: genprops.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2004aug28 +* created by: Markus W. Scherer +*/ + +#ifndef __GENCASE_H__ +#define __GENCASE_H__ + +#include "unicode/utypes.h" +#include "utrie.h" + +U_CDECL_BEGIN + +/* file definitions --------------------------------------------------------- */ + +#define UCASE_DATA_NAME "ucase" +#define UCASE_DATA_TYPE "icu" + +/* format "cAsE" */ +#define UCASE_FMT_0 0x63 +#define UCASE_FMT_1 0x41 +#define UCASE_FMT_2 0x53 +#define UCASE_FMT_3 0x45 + +/* indexes into indexes[] */ +enum { + UCASE_IX_INDEX_TOP, + UCASE_IX_LENGTH, + UCASE_IX_TRIE_SIZE, + UCASE_IX_EXC_LENGTH, + + UCASE_IX_TOP=16 +}; + +/* definitions for 16-bit case properties word ------------------------------ */ + +/* 2-bit constants for types of cased characters */ +#define UCASE_TYPE_MASK 3 +enum { + UCASE_NONE, + UCASE_LOWER, + UCASE_UPPER, + UCASE_TITLE +}; + +#define UCASE_SENSITIVE 4 +#define UCASE_EXCEPTION 8 + +#define UCASE_DOT_MASK 0x30 +enum { + UCASE_NO_DOT=0, + UCASE_SOFT_DOTTED=0x10, + UCASE_ABOVE=0x20, /* "above" accents with cc=230 */ + UCASE_OTHER_ACCENT=0x30 /* other character (0>UCASE_DELTA_SHIFT) + +/* exception: bits 15..4 are an unsigned 12-bit index into the exceptions array */ +#define UCASE_EXC_SHIFT 4 +#define UCASE_EXC_MASK 0xfff0 +#define UCASE_MAX_EXCEPTIONS 0x1000 + +/* 
definitions for 16-bit main exceptions word ------------------------------ */ + +/* first 8 bits indicate values in optional slots */ +enum { + UCASE_EXC_LOWER, + UCASE_EXC_FOLD, + UCASE_EXC_UPPER, + UCASE_EXC_TITLE, + UCASE_EXC_4, /* reserved */ + UCASE_EXC_5, /* reserved */ + UCASE_EXC_6, /* reserved */ + UCASE_EXC_FULL_MAPPINGS +}; + +/* each slot is 2 uint16_t instead of 1 */ +#define UCASE_EXC_DOUBLE_SLOTS 0x100 + +/* reserved: exception bits 11..9 */ + +/* UCASE_EXC_DOT_MASK=UCASE_DOT_MASK< + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/icu4c/source/tools/gencase/store.c b/icu4c/source/tools/gencase/store.c new file mode 100644 index 00000000000..6f708534665 --- /dev/null +++ b/icu4c/source/tools/gencase/store.c @@ -0,0 +1,569 @@ +/* +******************************************************************************* +* +* Copyright (C) 2004, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: store.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2004aug28 +* created by: Markus W. Scherer +* +* Store Unicode case mapping properties efficiently for +* random access. +*/ + +#include +#include +#include "unicode/utypes.h" +#include "unicode/uchar.h" +#include "unicode/ustring.h" +#include "cmemory.h" +#include "cstring.h" +#include "filestrm.h" +#include "utrie.h" +#include "unicode/udata.h" +#include "unewdata.h" +#include "propsvec.h" +#include "gencase.h" + +/* Unicode case mapping properties file format --------------------------------- + +The file format prepared and written here contains several data +structures that store indexes or data. + +Before the data contents described below, there are the headers required by +the udata API for loading ICU data. Especially, a UDataInfo structure +precedes the actual data. 
It contains platform properties values and the +file format version. + +The following is a description of format version 1. + +The file contains the following structures: + + const int32_t indexes[i0] with values i0, i1, ...: + (see UCASE_IX_... constants for names of indexes) + + i0 indexLength; -- length of indexes[] (UCASE_IX_TOP) + i1 dataLength; -- length in bytes of the post-header data (incl. indexes[]) + i2 trieSize; -- size in bytes of the case mapping properties trie + i3 exceptionsLength; -- length in uint16_t of the exceptions array + + i4..indexes[i0] reservedIndexes; -- reserved values; 0 for now + + + Serialized trie, see utrie.h; + + const uint16_t exceptions[exceptionsLength]; + + +Trie data word: +Bits +if(exception) { + 15..4 unsigned exception index +} else { + if(not uncased) { + 15..6 signed delta to simple case mapping code point + (add delta to input code point) + } + 5..4 0 normal character with cc=0 + 1 soft-dotted character + 2 cc=230 + 3 other cc +} + 3 exception + 2 case sensitive + 1..0 0 uncased + 1 lowercase + 2 uppercase + 3 titlecase + + +Exceptions: +A sub-array of the exceptions array is indexed by the exception index in a +trie word. +The sub-array consists of the following fields: + uint16_t excWord; + uint16_t optional values []; + UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase + +excWord: (see UCASE_EXC_...) +Bits + 15 conditional case folding + 14 conditional special casing +13..12 same as non-exception trie data bits 5..4 + moved here because the exception index needs more bits than the delta + 0 normal character with cc=0 + 1 soft-dotted character + 2 cc=230 + 3 other cc +11.. 9 reserved + 8 if set, then for each optional-value slot there are 2 uint16_t values + (high and low parts of 32-bit values) + instead of single ones + 7..
0 bits for which optional value is present + +Optional-value slots: +0 lowercase mapping (code point) +1 case folding (code point) +2 uppercase mapping (code point) +3 titlecase mapping (code point) +4..6 reserved +7 there is at least one full (string) case mapping + the length of each is encoded in a nibble of this optional value, + and the strings follow this optional value in the same order: + lower/fold/upper/title + +For space saving, some values are not stored. Lookups are as follows: +- If special casing is conditional, then no full lower/upper/title mapping + strings are stored. +- If case folding is conditional, then no simple or full case foldings are + stored. +- Fall back in this order: + full (string) mapping -- if full mappings are used + simple (code point) mapping of the same type + simple fold->simple lower + simple title->simple upper + finally, the original code point (no mapping) + +----------------------------------------------------------------------------- */ + +/* UDataInfo cf. 
udata.h */ +static UDataInfo dataInfo={ + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + 0, + + /* dataFormat="cAsE" */ + { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 }, + { 1, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ + { 4, 0, 1, 0 } /* dataVersion */ +}; + +enum { + /* maximum number of exceptions expected */ + MAX_EXC_COUNT=1000 +}; + +/* exceptions values */ +static uint16_t exceptions[UCASE_MAX_EXCEPTIONS+100]; +static uint16_t exceptionsTop=0; +static Props excProps[MAX_EXC_COUNT]; +static uint16_t exceptionsCount=0; + +/* -------------------------------------------------------------------------- */ + +extern void +setUnicodeVersion(const char *v) { + UVersionInfo version; + u_versionFromString(version, v); + uprv_memcpy(dataInfo.dataVersion, version, 4); +} + +/* store a character's properties ------------------------------------------- */ + +extern void +setProps(Props *p) { + UErrorCode errorCode; + uint32_t value; + int32_t delta; + uint16_t count; + + /* count the case mappings and other values competing for the value bit field */ + value=upvec_getValue(pv, p->code, 0); + delta=0; + count=0; + + if(p->isTitle) { + /* the Titlecase property is read late, from UnicodeData.txt */ + value|=UCASE_TITLE; + } + + if(p->upperCase!=0) { + /* uppercase mapping as delta if the character is lowercase */ + if((value&UCASE_TYPE_MASK)==UCASE_LOWER) { + delta=p->upperCase-p->code; + } else { + value|=UCASE_EXCEPTION; + } + } + if(p->lowerCase!=0) { + /* lowercase mapping as delta if the character is uppercase or titlecase */ + if((value&UCASE_TYPE_MASK)==UCASE_UPPER || (value&UCASE_TYPE_MASK)==UCASE_TITLE) { + delta=p->lowerCase-p->code; + } else { + value|=UCASE_EXCEPTION; + } + } + if(p->upperCase!=p->titleCase) { + value|=UCASE_EXCEPTION; + } + if(p->specialCasing!=NULL) { + value|=UCASE_EXCEPTION; + } + if(p->caseFolding!=NULL) { + value|=UCASE_EXCEPTION; + } + + if(deltacc!=0) { + 
if(value&UCASE_DOT_MASK) { + fprintf(stderr, "gencase: a soft-dotted character has cc!=0\n"); + exit(U_INTERNAL_PROGRAM_ERROR); + } + if(p->cc==230) { + value|=UCASE_ABOVE; + } else { + value|=UCASE_OTHER_ACCENT; + } + } + + /* handle exceptions */ + if(value&UCASE_EXCEPTION) { + /* simply store exceptions for later processing and encoding */ + value|=(uint32_t)exceptionsCount<code, p->code+1, 0, value, 0xffffffff, &errorCode)) { + fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n", + u_errorName(errorCode)); + exit(errorCode); + } +} + +extern void +addCaseSensitive(UChar32 first, UChar32 last) { + UErrorCode errorCode=U_ZERO_ERROR; + if(!upvec_setValue(pv, first, last+1, 0, UCASE_SENSITIVE, UCASE_SENSITIVE, &errorCode)) { + fprintf(stderr, "gencase error: unable to set UCASE_SENSITIVE, code: %s\n", + u_errorName(errorCode)); + exit(errorCode); + } +} + +extern void +makeCaseClosure() { + /* TODO */ +} + +/* exceptions --------------------------------------------------------------- */ + +static UBool +fullMappingEqualsSimple(const UChar *s, UChar32 simple, UChar32 c) { + int32_t i, length; + UChar32 full; + + length=*s++; + if(length==0 || length>U16_MAX_LENGTH) { + return FALSE; + } + i=0; + U16_NEXT(s, i, length, full); + + if(simple==0) { + simple=c; /* UCD has no simple mapping if it's the same as the code point itself */ + } + return (UBool)(i==length && full==simple); +} + +static uint16_t +makeException(uint32_t value, Props *p) { + uint32_t slots[8]; + uint32_t slotBits; + uint16_t excWord, excIndex, excTop, i, count, length, fullLengths; + UBool doubleSlots; + + /* excIndex will be returned for storing in the trie word */ + excIndex=exceptionsTop; + if(excIndex>=UCASE_MAX_EXCEPTIONS) { + fprintf(stderr, "gencase error: too many exceptions words\n"); + exit(U_BUFFER_OVERFLOW_ERROR); + } + + excTop=excIndex+1; /* +1 for excWord which will be stored at excIndex */ + + /* copy and shift the soft-dotted bits */ + 
excWord=((uint16_t)value&UCASE_DOT_MASK)<specialCasing!=NULL && p->specialCasing->isComplex) { + excWord|=UCASE_EXC_CONDITIONAL_SPECIAL; + p->specialCasing=NULL; + } + if(p->caseFolding!=NULL && p->caseFolding->simple==0 && p->caseFolding->full[0]==0) { + excWord|=UCASE_EXC_CONDITIONAL_FOLD; + p->caseFolding=NULL; + } + + /* + * Note: + * UCD stores no simple mappings when they are the same as the code point itself. + * SpecialCasing and CaseFolding do store simple mappings even if they are + * the same as the code point itself. + * Comparisons between simple regular mappings and simple special/folding + * mappings need to compensate for the difference by comparing with the + * original code point if a simple UCD mapping is missing (0). + */ + + /* remove redundant data */ + if(p->specialCasing!=NULL) { + /* do not store full mappings if they are the same as the simple ones */ + if(fullMappingEqualsSimple(p->specialCasing->lowerCase, p->lowerCase, p->code)) { + p->specialCasing->lowerCase[0]=0; + } + if(fullMappingEqualsSimple(p->specialCasing->upperCase, p->upperCase, p->code)) { + p->specialCasing->upperCase[0]=0; + } + if(fullMappingEqualsSimple(p->specialCasing->titleCase, p->titleCase, p->code)) { + p->specialCasing->titleCase[0]=0; + } + } + if( p->caseFolding!=NULL && + fullMappingEqualsSimple(p->caseFolding->full, p->caseFolding->simple, p->code) + ) { + p->caseFolding->full[0]=0; + } + + /* write the optional slots */ + slotBits=0; + count=0; + + if(p->lowerCase!=0) { + slots[count]=(uint32_t)p->lowerCase; + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_LOWER); + } + if( p->caseFolding!=NULL && + (p->lowerCase!=0 ? 
+ p->caseFolding->simple!=p->lowerCase : + p->caseFolding->simple!=p->code) + ) { + slots[count]=(uint32_t)p->caseFolding->simple; + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_FOLD); + } + if(p->upperCase!=0) { + slots[count]=(uint32_t)p->upperCase; + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_UPPER); + } + if(p->upperCase!=p->titleCase) { + if(p->titleCase!=0) { + slots[count]=(uint32_t)p->titleCase; + } else { + slots[count]=(uint32_t)p->code; + } + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_TITLE); + } + + /* lengths of full case mapping strings, stored in the last slot */ + fullLengths=0; + if(p->specialCasing!=NULL) { + fullLengths=p->specialCasing->lowerCase[0]; + fullLengths|=p->specialCasing->upperCase[0]<<8; + fullLengths|=p->specialCasing->titleCase[0]<<12; + } + if(p->caseFolding!=NULL) { + fullLengths|=p->caseFolding->full[0]<<4; + } + if(fullLengths!=0) { + slots[count]=fullLengths; + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_FULL_MAPPINGS); + } + + /* write slots */ + doubleSlots=(UBool)(slotBits>0xffff); + if(!doubleSlots) { + for(i=0; i>16); + exceptions[excTop++]=(uint16_t)slots[i]; + } + } + + /* write the full case mapping strings */ + if(p->specialCasing!=NULL) { + length=(uint16_t)p->specialCasing->lowerCase[0]; + u_memcpy((UChar *)exceptions+excTop, p->specialCasing->lowerCase+1, length); + excTop+=length; + } + if(p->caseFolding!=NULL) { + length=(uint16_t)p->caseFolding->full[0]; + u_memcpy((UChar *)exceptions+excTop, p->caseFolding->full+1, length); + excTop+=length; + } + if(p->specialCasing!=NULL) { + length=(uint16_t)p->specialCasing->upperCase[0]; + u_memcpy((UChar *)exceptions+excTop, p->specialCasing->upperCase+1, length); + excTop+=length; + + length=(uint16_t)p->specialCasing->titleCase[0]; + u_memcpy((UChar *)exceptions+excTop, p->specialCasing->titleCase+1, length); + excTop+=length; + } + + exceptionsTop=excTop; + + /* write the main exceptions word 
*/ + exceptions[excIndex]=excWord; + + return excIndex; +} + +extern void +makeExceptions() { + uint32_t *row; + uint32_t value; + int32_t i; + uint16_t excIndex; + + i=0; + while((row=upvec_getRow(pv, i, NULL, NULL))!=NULL) { + value=*row; + if(value&UCASE_EXCEPTION) { + excIndex=makeException(value, excProps+(value>>UGENCASE_EXC_SHIFT)); + *row=(value&~(UGENCASE_EXC_MASK|UCASE_EXC_MASK))|(excIndex<