diff --git a/.gitignore b/.gitignore index ca57dfd1ab1..27144532fce 100644 --- a/.gitignore +++ b/.gitignore @@ -980,22 +980,6 @@ tools/unicode/c/gencase/gencase.vcproj.*.*.user tools/unicode/c/gencase/release tools/unicode/c/gencase/x64 tools/unicode/c/gencase/x86 -tools/unicode/c/gennames/*.d -tools/unicode/c/gennames/*.ncb -tools/unicode/c/gennames/*.o -tools/unicode/c/gennames/*.opt -tools/unicode/c/gennames/*.pdb -tools/unicode/c/gennames/*.plg -tools/unicode/c/gennames/Debug -tools/unicode/c/gennames/Makefile -tools/unicode/c/gennames/Release -tools/unicode/c/gennames/debug -tools/unicode/c/gennames/gennames -tools/unicode/c/gennames/gennames.[0-9] -tools/unicode/c/gennames/gennames.vcproj.*.*.user -tools/unicode/c/gennames/release -tools/unicode/c/gennames/x64 -tools/unicode/c/gennames/x86 tools/unicode/c/genprops/*.d tools/unicode/c/genprops/*.ncb tools/unicode/c/genprops/*.o diff --git a/tools/unicode/c/CMakeLists.txt b/tools/unicode/c/CMakeLists.txt index c7f143eaf97..155d631159f 100644 --- a/tools/unicode/c/CMakeLists.txt +++ b/tools/unicode/c/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2010-2011, International Business Machines +# Copyright (C) 2010-2012, International Business Machines # Corporation and others. All Rights Reserved. # # created on: 2010jun03 @@ -18,7 +18,6 @@ include_directories( ${ICU_SRC_DIR}/source/tools/toolutil) link_directories(${ICU_INST_DIR}/lib) add_subdirectory(gencase) -add_subdirectory(gennames) add_subdirectory(genprops) add_subdirectory(genuca) add_subdirectory(genuts46) diff --git a/tools/unicode/c/gennames/CMakeLists.txt b/tools/unicode/c/gennames/CMakeLists.txt deleted file mode 100644 index 843f0c7b4a6..00000000000 --- a/tools/unicode/c/gennames/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (C) 2010, International Business Machines -# Corporation and others. All Rights Reserved. -# -# created on: 2010jun03 -# created by: Markus W. Scherer -# edited on: 2010jul20 -# edited by: Stuart G. Gill -add_executable(gennames gennames.c) -target_link_libraries(gennames icuuc icutu) diff --git a/tools/unicode/c/gennames/Makefile.in b/tools/unicode/c/gennames/Makefile.in deleted file mode 100644 index ed2f88f3fd8..00000000000 --- a/tools/unicode/c/gennames/Makefile.in +++ /dev/null @@ -1,97 +0,0 @@ -## Makefile.in for ICU - tools/gennames -## Copyright (c) 1999-2005, International Business Machines Corporation and -## others. All Rights Reserved. -## Steven R. Loomis - -## Source directory information -srcdir = @srcdir@ -top_srcdir = @top_srcdir@ - -top_builddir = ../.. - -include $(top_builddir)/icudefs.mk - -## Build directory information -subdir = tools/gennames - -TARGET_STUB_NAME = gennames - -SECTION = 8 - -#MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) - - -## Extra files to remove for 'make clean' -CLEANFILES = *~ $(DEPS) $(MAN_FILES) - -## Target information -TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) - -ifneq ($(top_builddir),$(top_srcdir)) -CPPFLAGS += -I$(top_builddir)/common -endif -CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) - -OBJECTS = gennames.o - -DEPS = $(OBJECTS:.o=.d) - -## List of phony targets -.PHONY : all all-local install install-local clean clean-local \ -distclean distclean-local dist dist-local check check-local install-man - -## Clear suffix list -.SUFFIXES : - -## List of standard targets -all: all-local -install: install-local -clean: clean-local -distclean : distclean-local -dist: dist-local -check: all check-local - -all-local: $(TARGET) $(MAN_FILES) - -install-local: all-local install-man -# $(MKINSTALLDIRS) $(DESTDIR)$(sbindir) -# $(INSTALL) $(TARGET) $(DESTDIR)$(sbindir) - -install-man: $(MAN_FILES) -# $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) -# $(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION) - -dist-local: - -clean-local: - test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) - $(RMV) $(TARGET) $(OBJECTS) - -distclean-local: clean-local - $(RMV) Makefile - -check-local: all-local - -Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status - cd $(top_builddir) \ - && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status - -$(TARGET) : $(OBJECTS) - $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) - $(POST_BUILD_STEP) - - -%.$(SECTION): $(srcdir)/%.$(SECTION).in - cd $(top_builddir) \ - && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status - - -ifeq (,$(MAKECMDGOALS)) --include $(DEPS) -else -ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) --include $(DEPS) -endif -endif - diff --git a/tools/unicode/c/gennames/gennames.vcproj b/tools/unicode/c/gennames/gennames.vcproj deleted file mode 100644 index 0a1f05fd77e..00000000000 --- a/tools/unicode/c/gennames/gennames.vcproj +++ /dev/null @@ -1,399 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/tools/unicode/c/genprops/CMakeLists.txt b/tools/unicode/c/genprops/CMakeLists.txt index 0062d26c211..f6c7c90e25d 100644 --- a/tools/unicode/c/genprops/CMakeLists.txt +++ b/tools/unicode/c/genprops/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2010-2011, International Business Machines +# Copyright (C) 2010-2012, International Business Machines # Corporation and others. All Rights Reserved. # # created on: 2010jun03 @@ -7,5 +7,6 @@ # edited by: Stuart G. Gill add_executable(genprops genprops.cpp pnamesbuilder.cpp corepropsbuilder.cpp - bidipropsbuilder.cpp) + bidipropsbuilder.cpp + namespropsbuilder.cpp) target_link_libraries(genprops icuuc icutu) diff --git a/tools/unicode/c/genprops/bidipropsbuilder.cpp b/tools/unicode/c/genprops/bidipropsbuilder.cpp index 3ab7a75bd3f..04c88ca47db 100644 --- a/tools/unicode/c/genprops/bidipropsbuilder.cpp +++ b/tools/unicode/c/genprops/bidipropsbuilder.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2004-2011, International Business Machines +* Copyright (C) 2004-2012, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -33,8 +33,6 @@ #include "ubidi_props.h" #include "genprops.h" -#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) - /* Unicode bidi/shaping properties file format --------------------------------- The file format prepared and written here contains several data @@ -519,11 +517,11 @@ BiDiPropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorC PropsBuilder * createBiDiPropsBuilder(UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return NULL; } - PropsBuilder *pw=new BiDiPropsBuilder(errorCode); - if(pw==NULL) { + PropsBuilder *pb=new BiDiPropsBuilder(errorCode); + if(pb==NULL) { errorCode=U_MEMORY_ALLOCATION_ERROR; } - return pw; + return pb; } /* diff --git a/tools/unicode/c/genprops/corepropsbuilder.cpp b/tools/unicode/c/genprops/corepropsbuilder.cpp index db33e815268..73b5ea5a6bc 100644 --- a/tools/unicode/c/genprops/corepropsbuilder.cpp +++ b/tools/unicode/c/genprops/corepropsbuilder.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2011, International Business Machines +* Copyright (C) 1999-2012, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -236,8 +236,6 @@ Change from UTrie to UTrie2. ----------------------------------------------------------------------------- */ -#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) - U_NAMESPACE_USE /* UDataInfo cf. udata.h */ @@ -813,11 +811,11 @@ CorePropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorC PropsBuilder * createCorePropsBuilder(UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return NULL; } - PropsBuilder *pw=new CorePropsBuilder(errorCode); - if(pw==NULL) { + PropsBuilder *pb=new CorePropsBuilder(errorCode); + if(pb==NULL) { errorCode=U_MEMORY_ALLOCATION_ERROR; } - return pw; + return pb; } /* diff --git a/tools/unicode/c/genprops/genprops.cpp b/tools/unicode/c/genprops/genprops.cpp index 61e5009a66c..b198c22f7a7 100644 --- a/tools/unicode/c/genprops/genprops.cpp +++ b/tools/unicode/c/genprops/genprops.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2011, International Business Machines +* Copyright (C) 1999-2012, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -29,15 +29,16 @@ #include "toolutil.h" #include "uoptions.h" -#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) - U_NAMESPACE_USE UBool beVerbose=FALSE; +UBool beQuiet=FALSE; PropsBuilder::PropsBuilder() {} PropsBuilder::~PropsBuilder() {} void PropsBuilder::setUnicodeVersion(const UVersionInfo) {} +void PropsBuilder::setAlgNamesRange(UChar32, UChar32, + const char *, const char *, UErrorCode &) {} void PropsBuilder::setProps(const UniProps &, const UnicodeSet &, UErrorCode &) {} void PropsBuilder::build(UErrorCode &) {} void PropsBuilder::writeCSourceFile(const char *, UErrorCode &) {} @@ -47,6 +48,7 @@ enum { HELP_H, HELP_QUESTION_MARK, VERBOSE, + QUIET, COPYRIGHT }; @@ -55,6 +57,7 @@ static UOption options[]={ UOPTION_HELP_H, UOPTION_HELP_QUESTION_MARK, UOPTION_VERBOSE, + UOPTION_QUIET, UOPTION_COPYRIGHT }; @@ -86,18 +89,21 @@ main(int argc, char* argv[]) { "Options:\n" "\t-h or -? or --help this usage text\n" "\t-v or --verbose verbose output\n" + "\t-q or --quiet no output\n" "\t-c or --copyright include a copyright notice\n"); return argc<2 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; } /* get the options values */ beVerbose=options[VERBOSE].doesOccur; + beQuiet=options[QUIET].doesOccur; /* initialize */ IcuToolErrorCode errorCode("genprops"); LocalPointer pnamesBuilder(createPNamesBuilder(errorCode)); LocalPointer corePropsBuilder(createCorePropsBuilder(errorCode)); LocalPointer bidiPropsBuilder(createBiDiPropsBuilder(errorCode)); + LocalPointer namesPropsBuilder(createNamesPropsBuilder(errorCode)); if(errorCode.isFailure()) { fprintf(stderr, "genprops: unable to create PropsBuilders - %s\n", errorCode.errorName()); return errorCode.reset(); @@ -138,10 +144,19 @@ main(int argc, char* argv[]) { const UniProps *props=ppucd.getProps(newValues, errorCode); corePropsBuilder->setProps(*props, newValues, errorCode); bidiPropsBuilder->setProps(*props, newValues, errorCode); + namesPropsBuilder->setProps(*props, newValues, errorCode); } else if(lineType==PreparsedUCD::UNICODE_VERSION_LINE) { const UVersionInfo &version=ppucd.getUnicodeVersion(); corePropsBuilder->setUnicodeVersion(version); bidiPropsBuilder->setUnicodeVersion(version); + namesPropsBuilder->setUnicodeVersion(version); + } else if(lineType==PreparsedUCD::ALG_NAMES_RANGE_LINE) { + UChar32 start, end; + if(ppucd.getRangeForAlgNames(start, end, errorCode)) { + const char *type=ppucd.nextField(); + const char *prefix=ppucd.nextField(); // NULL if type==hangul + namesPropsBuilder->setAlgNamesRange(start, end, type, prefix, errorCode); + } } if(errorCode.isFailure()) { fprintf(stderr, @@ -153,6 +168,7 @@ main(int argc, char* argv[]) { corePropsBuilder->build(errorCode); bidiPropsBuilder->build(errorCode); + namesPropsBuilder->build(errorCode); if(errorCode.isFailure()) { fprintf(stderr, "genprops error: failure finalizing the data - %s\n", errorCode.errorName()); @@ -174,6 +190,7 @@ main(int argc, char* argv[]) { corePropsBuilder->writeBinaryData(sourceDataIn.data(), withCopyright, errorCode); bidiPropsBuilder->writeCSourceFile(sourceCommon.data(), errorCode); bidiPropsBuilder->writeBinaryData(sourceDataIn.data(), withCopyright, errorCode); + namesPropsBuilder->writeBinaryData(sourceDataIn.data(), withCopyright, errorCode); return errorCode; } diff --git a/tools/unicode/c/genprops/genprops.h b/tools/unicode/c/genprops/genprops.h index a50f759d81e..7eede9563dc 100644 --- a/tools/unicode/c/genprops/genprops.h +++ b/tools/unicode/c/genprops/genprops.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2011, International Business Machines +* Copyright (C) 1999-2012, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -23,11 +23,15 @@ #include "ppucd.h" #include "unewdata.h" +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) + class PropsBuilder { public: PropsBuilder(); virtual ~PropsBuilder(); virtual void setUnicodeVersion(const UVersionInfo version); + virtual void setAlgNamesRange(UChar32 start, UChar32 end, + const char *type, const char *prefix, UErrorCode &errorCode); virtual void setProps(const icu::UniProps &props, const icu::UnicodeSet &newValues, UErrorCode &errorCode); virtual void build(UErrorCode &errorCode); virtual void writeCSourceFile(const char *path, UErrorCode &errorCode); @@ -42,8 +46,10 @@ public: PNamesBuilder *createPNamesBuilder(UErrorCode &errorCode); PropsBuilder *createCorePropsBuilder(UErrorCode &errorCode); PropsBuilder *createBiDiPropsBuilder(UErrorCode &errorCode); +PropsBuilder *createNamesPropsBuilder(UErrorCode &errorCode); /* global flags */ -U_CFUNC UBool beVerbose; +extern UBool beVerbose; +extern UBool beQuiet; #endif diff --git a/tools/unicode/c/gennames/gennames.c b/tools/unicode/c/genprops/namespropsbuilder.cpp similarity index 57% rename from tools/unicode/c/gennames/gennames.c rename to tools/unicode/c/genprops/namespropsbuilder.cpp index 2d67443bf44..60d37b43189 100644 --- a/tools/unicode/c/gennames/gennames.c +++ b/tools/unicode/c/genprops/namespropsbuilder.cpp @@ -1,11 +1,11 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2011, International Business Machines +* Copyright (C) 1999-2012, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* -* file name: gennames.c +* file name: namespropsbuilder.cpp (was gennames/gennames.c) * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 @@ -13,11 +13,8 @@ * created on: 1999sep30 * created by: Markus W. Scherer * -* This program reads the Unicode character database text file, -* parses it, and extracts the character code, -* the "modern" character name, and optionally the -* Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment. -* It then tokenizes and compresses the names and builds +* This builder reads Unicode character names and aliases, +* tokenizes and compresses them, and builds * compact binary tables for random-access lookup * in a u_charName() API function. * @@ -121,16 +118,16 @@ #include #include "unicode/utypes.h" #include "unicode/putil.h" -#include "unicode/uclean.h" #include "unicode/udata.h" +#include "charstr.h" #include "cmemory.h" #include "cstring.h" +#include "genprops.h" +#include "ppucd.h" #include "uarrsort.h" +#include "uassert.h" #include "unewdata.h" #include "uoptions.h" -#include "uparse.h" - -#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) #define STRING_STORE_SIZE 1000000 #define GROUP_STORE_SIZE 5000 @@ -143,66 +140,8 @@ #define MAX_WORD_COUNT 20000 #define MAX_GROUP_COUNT 5000 -#define DATA_NAME "unames" -#define DATA_TYPE "icu" -#define VERSION_STRING "unam" #define NAME_SEPARATOR_CHAR ';' -#define ISO_DATA_NAME "ucomment" - -/* Unicode versions --------------------------------------------------------- */ - -enum { - UNI_1_0, - UNI_1_1, - UNI_2_0, - UNI_3_0, - UNI_3_1, - UNI_3_2, - UNI_4_0, - UNI_4_0_1, - UNI_4_1, - UNI_5_0, - UNI_5_1, - UNI_5_2, - UNI_6_0, - UNI_6_1, - UNI_VER_COUNT -}; - -static const UVersionInfo -unicodeVersions[]={ - { 1, 0, 0, 0 }, - { 1, 1, 0, 0 }, - { 2, 0, 0, 0 }, - { 3, 0, 0, 0 }, - { 3, 1, 0, 0 }, - { 3, 2, 0, 0 }, - { 4, 0, 0, 0 }, - { 4, 0, 1, 0 }, - { 4, 1, 0, 0 }, - { 5, 0, 0, 0 }, - { 5, 1, 0, 0 }, - { 5, 2, 0, 0 }, - { 6, 0, 0, 0 }, - { 6, 1, 0, 0 } -}; - -static int32_t ucdVersion=UNI_6_1; - -static int32_t -findUnicodeVersion(const UVersionInfo version) { - int32_t i; - - for(i=0; /* while(version>unicodeVersions[i]) {} */ - i0; - ++i) {} - if(0=ucdVersion comparisons */ - } - return i; /* version>=unicodeVersions[i] && version=3) { - parseNameAliases(argv[2], &moreOptions); - } - parseDB(argc>=2 ? argv[1] : "-", &moreOptions); - compress(); - generateData(options[DESTDIR].value, &moreOptions); - - u_cleanup(); - return 0; -} - -static void -init() { - int i; - - for(i=0; i<256; ++i) { - tokens[i]=0; - } -} - /* parsing ------------------------------------------------------------------ */ -/* get a name, strip leading and trailing whitespace */ -static int16_t -getName(char **pStart, char *limit) { - /* strip leading whitespace */ - char *start=(char *)u_skipWhitespace(*pStart); - - /* strip trailing whitespace */ - while(start=sizeof(cpNameAliases[cpNameAliasesTop].nameAlias)) { - fprintf(stderr, "gennames: error - name alias %s empty or too long for code point U+%04lx\n", - name, (unsigned long)code); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - /* check for non-character code points */ - if(!U_IS_UNICODE_CHAR(code)) { - fprintf(stderr, "gennames: error - name alias for non-character code point U+%04lx\n", - (unsigned long)code); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - /* - * Only use "correction" aliases for now, from Unicode 6.1 NameAliases.txt with 3 fields per line. - * TODO: Work on ticket #8963 to deal with multiple type:alias pairs per character. - */ - fields[2][1]=0; - if(0!=uprv_strcmp("correction", fields[2][0])) { +void +NamesPropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues, + UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return; } + if(!newValues.contains(UCHAR_NAME) && !newValues.contains(PPUCD_NAME_ALIAS)) { return; } - /* check that the code points (code) are in ascending order */ - if(code<=prevCode && code>0) { - fprintf(stderr, "gennames: error - NameAliases entries out of order, U+%04lx after U+%04lx\n", - (unsigned long)code, (unsigned long)prevCode); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - prevCode=code; + U_ASSERT(props.start==props.end); - if(cpNameAliasesTop>=LENGTHOF(cpNameAliases)) { - fprintf(stderr, "gennames: error - too many name aliases\n"); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - cpNameAliases[cpNameAliasesTop].code=code; - uprv_memcpy(cpNameAliases[cpNameAliasesTop].nameAlias, name, length); - cpNameAliases[cpNameAliasesTop].nameAlias[length]=0; - ++cpNameAliasesTop; - - parseName(name, length); -} - -static void U_CALLCONV -lineFn(void *context, - char *fields[][2], int32_t fieldCount, - UErrorCode *pErrorCode) { - Options *storeOptions=(Options *)context; - char *names[4]; + const char *names[4]={ NULL, NULL, NULL, NULL }; int16_t lengths[4]={ 0, 0, 0, 0 }; - static uint32_t prevCode=0; - uint32_t code=0; - - if(U_FAILURE(*pErrorCode)) { - return; - } - /* get the character code */ - code=uprv_strtoul(fields[0][0], NULL, 16); /* get the character name */ - if(storeOptions->storeNames) { - names[0]=fields[1][0]; - lengths[0]=getName(names+0, fields[1][1]); - if(names[0][0]=='<') { - /* do not store pseudo-names in <> brackets */ - lengths[0]=0; + if(props.name!=NULL) { + names[0]=props.name; + lengths[0]=(int16_t)uprv_strlen(props.name); + parseName(names[0], lengths[0]); + } + + CharString buffer; + if(props.nameAlias!=NULL) { + /* + * Only use "correction" aliases for now, from Unicode 6.1 NameAliases.txt with 3 fields per line. + * TODO: Work on ticket #8963 to deal with multiple type:alias pairs per character. + */ + const char *corr=uprv_strstr(props.nameAlias, "correction="); + if(corr!=NULL) { + corr+=11; // skip "correction=" + const char *limit=uprv_strchr(corr, ','); + if(limit!=NULL) { + buffer.append(corr, limit-corr, errorCode); + names[3]=buffer.data(); + lengths[3]=(int16_t)(limit-corr); + } else { + names[3]=corr; + lengths[3]=(int16_t)uprv_strlen(corr); + } + parseName(names[3], lengths[3]); } } - /* store 1.0 names */ - /* get the second character name, the one from Unicode 1.0 */ - if(storeOptions->store10Names) { - names[1]=fields[10][0]; - lengths[1]=getName(names+1, fields[10][1]); - if(names[1][0]=='<') { - /* do not store pseudo-names in <> brackets */ - lengths[1]=0; - } - } - - /* get the ISO 10646 comment */ - if(storeOptions->storeISOComments) { - names[2]=fields[11][0]; - lengths[2]=getName(names+2, fields[11][1]); - } - - if(lengths[0]+lengths[1]+lengths[2]==0) { - return; - } - - /* check for non-character code points */ - if(!U_IS_UNICODE_CHAR(code)) { - fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n", - (unsigned long)code); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - /* check that the code points (code) are in ascending order */ - if(code<=prevCode && code>0) { - fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n", - (unsigned long)code, (unsigned long)prevCode); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - prevCode=code; - - parseName(names[0], lengths[0]); - parseName(names[1], lengths[1]); - parseName(names[2], lengths[2]); - - if(cpNameAliasesIndex=cpNameAliases[cpNameAliasesIndex].code) { - if(code==cpNameAliases[cpNameAliasesIndex].code) { - names[3]=cpNameAliases[cpNameAliasesIndex].nameAlias; - lengths[3]=(int16_t)uprv_strlen(cpNameAliases[cpNameAliasesIndex].nameAlias); - ++cpNameAliasesIndex; - } else { - fprintf(stderr, "gennames: error - NameAlias but no UnicodeData entry for U+%04lx\n", - (unsigned long)code); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - } - - /* - * set the count argument to - * 1: only store regular names, or only store ISO 10646 comments - * 2: store regular and 1.0 names - * 3: store names and ISO 10646 comment - * 4: also store name alias - * - * addLine() will ignore empty trailing names - */ - if(storeOptions->storeNames) { - /* store names and comments as parsed according to storeOptions */ - addLine(code, names, lengths, LENGTHOF(names)); - } else { - /* store only ISO 10646 comments */ - addLine(code, names+2, lengths+2, 1); - } + addLine(props.start, names, lengths, LENGTHOF(names)); } static void -parseNameAliases(const char *filename, Options *storeOptions) { - char *fields[3][2]; - UErrorCode errorCode=U_ZERO_ERROR; - - if(!storeOptions->storeNames) { - return; - } - /* - * This works only for Unicode 6.1 NameAliases.txt with 3 fields per line. - * TODO: Work on ticket #8963 to deal with multiple type:alias pairs per character. - */ - u_parseDelimitedFile(filename, ';', fields, 3, nameAliasesLineFn, NULL, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode)); - exit(errorCode); - } - - if(!beQuiet) { - printf("number of name aliases: %lu\n", (unsigned long)cpNameAliasesTop); - } -} - -static void -parseDB(const char *filename, Options *storeOptions) { - char *fields[15][2]; - UErrorCode errorCode=U_ZERO_ERROR; - - u_parseDelimitedFile(filename, ';', fields, 15, lineFn, storeOptions, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode)); - exit(errorCode); - } - if(cpNameAliasesIndex0 && words[wordCount-1].weight<1) { @@ -826,11 +516,13 @@ compress() { for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) { if(tokens[i]!=-1) { tokens[i]=wordNumber; +#ifdef DEBUG_NAMES if(beVerbose) { printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", (int)i, (long)words[wordNumber].weight, words[wordNumber].length, words[wordNumber].s); } +#endif ++wordNumber; } } @@ -880,11 +572,13 @@ compress() { /* set token 0 to word 0 */ tokens[0]=0; +#ifdef DEBUG_NAMES if(beVerbose) { printf("tokens[0x000]: word%8ld \"%.*s\"\n", (long)words[0].weight, words[0].length, words[0].s); } +#endif wordNumber=1; /* set the lead byte tokens */ @@ -897,11 +591,13 @@ compress() { /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */ if(tokens[i]!=-1) { tokens[i]=wordNumber; +#ifdef DEBUG_NAMES if(beVerbose) { printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", (int)i, (long)words[wordNumber].weight, words[wordNumber].length, words[wordNumber].s); } +#endif ++wordNumber; } } @@ -912,11 +608,13 @@ compress() { tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */ } else { tokens[i]=wordNumber; +#ifdef DEBUG_NAMES if(beVerbose) { printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", (int)i, (long)words[wordNumber].weight, words[wordNumber].length, words[wordNumber].s); } +#endif ++wordNumber; } } @@ -1051,27 +749,42 @@ compareWords(const void *context, const void *word1, const void *word2) { return ((Word *)word2)->weight-((Word *)word1)->weight; } +void +NamesPropsBuilder::build(UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return; } + + if(!beQuiet) { + puts("* unames.icu stats *"); + printf("size of all names in the database: %lu\n", + (unsigned long)lineTop); + printf("number of named Unicode characters: %lu\n", + (unsigned long)lineCount); + printf("number of words in the dictionary from these names: %lu\n", + (unsigned long)wordCount); + } + compress(errorCode); +} + /* generate output data ----------------------------------------------------- */ -static void -generateData(const char *dataDir, Options *storeOptions) { - UNewDataMemory *pData; - UErrorCode errorCode=U_ZERO_ERROR; +void +NamesPropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return; } + + UNewDataMemory *pData=udata_create(path, "icu", "unames", &dataInfo, + withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "genprops: udata_create(%s, unames.icu) failed - %s\n", + path, u_errorName(errorCode)); + return; + } + uint16_t groupWords[3]; - uint32_t i, groupTop=lineTop, offset, size, + uint32_t i, groupTop=lineTop, size, tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset; long dataLength; int16_t token; - pData=udata_create(dataDir, - DATA_TYPE, storeOptions->storeNames ? DATA_NAME : ISO_DATA_NAME, - &dataInfo, - haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode); - exit(errorCode); - } - /* first, see how much space we need, and prepare the token strings */ for(i=0; i>16); groupWords[2]=(uint16_t)(offset); udata_writeBlock(pData, groupWords, 6); @@ -1187,7 +899,8 @@ generateData(const char *dataDir, Options *storeOptions) { /* 4-align the algorithmic names data */ udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom))); - generateAlgorithmicData(pData, storeOptions); + udata_write32(pData, countAlgRanges); + udata_writeBlock(pData, algRanges.data(), algRanges.length()); /* finish up */ dataLength=udata_finish(pData, &errorCode); @@ -1203,205 +916,6 @@ dataLength, (unsigned long)size); } } -/* the structure for algorithmic names needs to be 4-aligned */ -typedef struct AlgorithmicRange { - uint32_t rangeStart, rangeEnd; - uint8_t algorithmType, algorithmVariant; - uint16_t rangeSize; -} AlgorithmicRange; - -static uint32_t -generateAlgorithmicData(UNewDataMemory *pData, Options *storeOptions) { - static char prefix[] = "CJK UNIFIED IDEOGRAPH-"; -# define PREFIX_LENGTH 23 -# define PREFIX_LENGTH_4 24 - uint32_t countAlgRanges; - - static AlgorithmicRange cjkExtA={ - 0x3400, 0x4db5, - 0, 4, - sizeof(AlgorithmicRange)+PREFIX_LENGTH_4 - }; - static AlgorithmicRange cjk={ - 0x4e00, 0x9fa5, - 0, 4, - sizeof(AlgorithmicRange)+PREFIX_LENGTH_4 - }; - static AlgorithmicRange cjkExtB={ - 0x20000, 0x2a6d6, - 0, 5, - sizeof(AlgorithmicRange)+PREFIX_LENGTH_4 - }; - static AlgorithmicRange cjkExtC={ - 0x2a700, 0x2b734, - 0, 5, - sizeof(AlgorithmicRange)+PREFIX_LENGTH_4 - }; - static AlgorithmicRange cjkExtD={ - 0x2b740, 0x2b81d, - 0, 5, - sizeof(AlgorithmicRange)+PREFIX_LENGTH_4 - }; - - static char jamo[]= - "HANGUL SYLLABLE \0" - - "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0" - "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0" - - "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0" - "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0" - "YU\0EU\0YI\0I\0" - - "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0" - "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0" - "S\0SS\0NG\0J\0C\0K\0T\0P\0H" - ; - - static AlgorithmicRange hangul={ - 0xac00, 0xd7a3, - 1, 3, - sizeof(AlgorithmicRange)+6+sizeof(jamo) - }; - - /* modulo factors, maximum 8 */ - /* 3 factors: 19, 21, 28, most-to-least-significant */ - static uint16_t hangulFactors[3]={ - 19, 21, 28 - }; - - uint32_t size; - - size=0; - - if(ucdVersion>=UNI_6_1) { - /* Unicode 6.1 and up has a longer CJK Unihan range than before */ - cjk.rangeEnd=0x9FCC; - } else if(ucdVersion>=UNI_5_2) { - /* Unicode 5.2 and up has a longer CJK Unihan range than before */ - cjk.rangeEnd=0x9FCB; - } else if(ucdVersion>=UNI_5_1) { - /* Unicode 5.1 and up has a longer CJK Unihan range than before */ - cjk.rangeEnd=0x9FC3; - } else if(ucdVersion>=UNI_4_1) { - /* Unicode 4.1 and up has a longer CJK Unihan range than before */ - cjk.rangeEnd=0x9FBB; - } - - /* number of ranges of algorithmic names */ - if(!storeOptions->storeNames) { - countAlgRanges=0; - } else if(ucdVersion>=UNI_6_0) { - /* Unicode 6.0 and up has 6 ranges including CJK Extension D */ - countAlgRanges=6; - } else if(ucdVersion>=UNI_5_2) { - /* Unicode 5.2 and up has 5 ranges including CJK Extension C */ - countAlgRanges=5; - } else if(ucdVersion>=UNI_3_1) { - /* Unicode 3.1 and up has 4 ranges including CJK Extension B */ - countAlgRanges=4; - } else if(ucdVersion>=UNI_3_0) { - /* Unicode 3.0 has 3 ranges including CJK Extension A */ - countAlgRanges=3; - } else { - /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */ - countAlgRanges=2; - } - - if(pData!=NULL) { - udata_write32(pData, countAlgRanges); - } else { - size+=4; - } - if(countAlgRanges==0) { - return size; - } - - /* - * each range: - * uint32_t rangeStart - * uint32_t rangeEnd - * uint8_t algorithmType - * uint8_t algorithmVariant - * uint16_t size of range data - * uint8_t[size] data - */ - - /* range 0: cjk extension a */ - if(countAlgRanges>=3) { - if(pData!=NULL) { - udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange)); - udata_writeString(pData, prefix, PREFIX_LENGTH); - if(PREFIX_LENGTH=4) { - if(pData!=NULL) { - udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange)); - udata_writeString(pData, prefix, PREFIX_LENGTH); - if(PREFIX_LENGTH=5) { - if(pData!=NULL) { - udata_writeBlock(pData, &cjkExtC, sizeof(AlgorithmicRange)); - udata_writeString(pData, prefix, PREFIX_LENGTH); - if(PREFIX_LENGTH=6) { - if(pData!=NULL) { - udata_writeBlock(pData, &cjkExtD, sizeof(AlgorithmicRange)); - udata_writeString(pData, prefix, PREFIX_LENGTH); - if(PREFIX_LENGTH