diff --git a/.gitignore b/.gitignore index 007bc151029..290de65af16 100644 --- a/.gitignore +++ b/.gitignore @@ -964,20 +964,6 @@ tools/trac/IcuCodeTools/0.11/icucodetools/*.pyc tools/trac/IcuCodeTools/0.12/*.egg-info tools/trac/IcuCodeTools/0.12/build tools/trac/IcuCodeTools/0.12/icucodetools/*.pyc -tools/unicode/c/genbidi/*.d -tools/unicode/c/genbidi/*.o -tools/unicode/c/genbidi/*.pdb -tools/unicode/c/genbidi/*.plg -tools/unicode/c/genbidi/Debug -tools/unicode/c/genbidi/Makefile -tools/unicode/c/genbidi/Release -tools/unicode/c/genbidi/debug -tools/unicode/c/genbidi/genbidi -tools/unicode/c/genbidi/genbidi.[0-9] -tools/unicode/c/genbidi/genbidi.vcproj.*.*.user -tools/unicode/c/genbidi/release -tools/unicode/c/genbidi/x64 -tools/unicode/c/genbidi/x86 tools/unicode/c/gencase/*.d tools/unicode/c/gencase/*.ncb tools/unicode/c/gencase/*.o diff --git a/tools/unicode/c/CMakeLists.txt b/tools/unicode/c/CMakeLists.txt index ab726672a7a..5d9b9ef38a0 100644 --- a/tools/unicode/c/CMakeLists.txt +++ b/tools/unicode/c/CMakeLists.txt @@ -17,7 +17,6 @@ include_directories( ${ICU_SRC_DIR}/source/i18n ${ICU_SRC_DIR}/source/tools/toolutil) link_directories(${ICU_INST_DIR}/lib) -add_subdirectory(genbidi) add_subdirectory(gencase) add_subdirectory(gennames) add_subdirectory(gennorm) diff --git a/tools/unicode/c/genbidi/CMakeLists.txt b/tools/unicode/c/genbidi/CMakeLists.txt deleted file mode 100644 index 56aaf3aab58..00000000000 --- a/tools/unicode/c/genbidi/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (C) 2010, International Business Machines -# Corporation and others. All Rights Reserved. -# -# created on: 2010jun03 -# created by: Markus W. Scherer -# edited on: 2010jul20 -# edited by: Stuart G. Gill -add_executable(genbidi genbidi.c store.c) -target_link_libraries(genbidi icuuc icutu) diff --git a/tools/unicode/c/genbidi/Makefile.in b/tools/unicode/c/genbidi/Makefile.in deleted file mode 100644 index 2f93e7d6179..00000000000 --- a/tools/unicode/c/genbidi/Makefile.in +++ /dev/null @@ -1,95 +0,0 @@ -## Makefile.in for ICU - tools/genbidi -## Copyright (c) 1999-2005, International Business Machines Corporation and -## others. All Rights Reserved. -## Steven R. Loomis - -## Source directory information -srcdir = @srcdir@ -top_srcdir = @top_srcdir@ - -top_builddir = ../.. - -include $(top_builddir)/icudefs.mk - -## Build directory information -subdir = tools/genbidi - -TARGET_STUB_NAME = genbidi - -SECTION = 8 - -#MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) - -## Extra files to remove for 'make clean' -CLEANFILES = *~ $(DEPS) $(MAN_FILES) - -## Target information -TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) - -ifneq ($(top_builddir),$(top_srcdir)) -CPPFLAGS += -I$(top_builddir)/common -endif -CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) - -OBJECTS = genbidi.o store.o - -DEPS = $(OBJECTS:.o=.d) - -## List of phony targets -.PHONY : all all-local install install-local clean clean-local \ -distclean distclean-local dist dist-local check check-local install-man - -## Clear suffix list -.SUFFIXES : - -## List of standard targets -all: all-local -install: install-local -clean: clean-local -distclean : distclean-local -dist: dist-local -check: all check-local - -all-local: $(TARGET) $(MAN_FILES) - -install-local: all-local install-man - -install-man: $(MAN_FILES) -# $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) -# $(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION) - - -dist-local: - -clean-local: - test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) - $(RMV) $(TARGET) $(OBJECTS) - -distclean-local: clean-local - $(RMV) Makefile - -check-local: all-local - -Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status - cd $(top_builddir) \ - && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status - -$(TARGET) : $(OBJECTS) - $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) - $(POST_BUILD_STEP) - - -%.$(SECTION): $(srcdir)/%.$(SECTION).in - cd $(top_builddir) \ - && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status - - -ifeq (,$(MAKECMDGOALS)) --include $(DEPS) -else -ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) --include $(DEPS) -endif -endif - diff --git a/tools/unicode/c/genbidi/genbidi.c b/tools/unicode/c/genbidi/genbidi.c deleted file mode 100644 index 0855699f2c4..00000000000 --- a/tools/unicode/c/genbidi/genbidi.c +++ /dev/null @@ -1,643 +0,0 @@ -/* -******************************************************************************* -* -* Copyright (C) 2004-2011, International Business Machines -* Corporation and others. All Rights Reserved. -* -******************************************************************************* -* file name: genbidi.c -* encoding: US-ASCII -* tab size: 8 (not used) -* indentation:4 -* -* created on: 2004dec30 -* created by: Markus W. Scherer -* -* This program reads several of the Unicode character database text files, -* parses them, and extracts the bidi/shaping properties for each character. -* It then writes a binary file containing the properties -* that is designed to be used directly for random-access to -* the properties of each Unicode character. -*/ - -#include -#include "unicode/utypes.h" -#include "unicode/uchar.h" -#include "unicode/putil.h" -#include "unicode/uclean.h" -#include "cmemory.h" -#include "cstring.h" -#include "uarrsort.h" -#include "unewdata.h" -#include "uoptions.h" -#include "uparse.h" -#include "propsvec.h" -#include "ubidi_props.h" -#include "genbidi.h" - -#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) - -/* data --------------------------------------------------------------------- */ - -UPropsVectors *pv; - -UBool beVerbose=FALSE, haveCopyright=TRUE; - -/* prototypes --------------------------------------------------------------- */ - -static UBool -isToken(const char *token, const char *s); - -static void -parseBidiMirroring(const char *filename, UErrorCode *pErrorCode); - -static void -parseDB(const char *filename, UErrorCode *pErrorCode); - -/* miscellaneous ------------------------------------------------------------ */ - -/* TODO: more common code, move functions to uparse.h|c */ - -static char * -trimTerminateField(char *s, char *limit) { - /* trim leading whitespace */ - s=(char *)u_skipWhitespace(s); - - /* trim trailing whitespace */ - while(sucdFile, fields[0][0]); - exit(*pErrorCode); - } - - /* parse property alias */ - s=trimTerminateField(fields[1][0], fields[1][1]); - value=u_getPropertyValueEnum(sen->prop, s); - if(value<0) { - if(sen->prop==UCHAR_BLOCK) { - if(isToken("Greek", s)) { - value=UBLOCK_GREEK; /* Unicode 3.2 renames this to "Greek and Coptic" */ - } else if(isToken("Combining Marks for Symbols", s)) { - value=UBLOCK_COMBINING_MARKS_FOR_SYMBOLS; /* Unicode 3.2 renames this to "Combining Diacritical Marks for Symbols" */ - } else if(isToken("Private Use", s)) { - value=UBLOCK_PRIVATE_USE; /* Unicode 3.2 renames this to "Private Use Area" */ - } - } - } - if(value<0) { - fprintf(stderr, "genbidi error: unknown %s name in %s.txt field 1 at %s\n", - sen->propName, sen->ucdFile, s); - exit(U_PARSE_ERROR); - } - - uv=(uint32_t)(value<vecShift); - if((uv&sen->vecMask)!=uv) { - fprintf(stderr, "genbidi error: %s value overflow (0x%x) at %s\n", - sen->propName, (int)uv, s); - exit(U_INTERNAL_PROGRAM_ERROR); - } - - upvec_setValue(pv, start, end, sen->vecWord, uv, sen->vecMask, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - fprintf(stderr, "genbidi error: unable to set %s code: %s\n", - sen->propName, u_errorName(*pErrorCode)); - exit(*pErrorCode); - } -} - -static void -parseSingleEnumFile(char *filename, char *basename, const char *suffix, - const SingleEnum *sen, - UErrorCode *pErrorCode) { - char *fields[2][2]; - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return; - } - - writeUCDFilename(basename, sen->ucdFile, suffix); - - u_parseDelimitedFile(filename, ';', fields, 2, singleEnumLineFn, (void *)sen, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - fprintf(stderr, "error parsing %s.txt: %s\n", sen->ucdFile, u_errorName(*pErrorCode)); - } -} - -/* parse files with multiple binary properties ------------------------------ */ - -/* TODO: more common code, move functions to uparse.h|c */ - -/* TODO: similar to genbidi/props2.c but not the same; same as in gencase/gencase.c */ - -struct Binary { - const char *propName; - int32_t vecWord; - uint32_t vecValue, vecMask; -}; -typedef struct Binary Binary; - -struct Binaries { - const char *ucdFile; - const Binary *binaries; - int32_t binariesCount; -}; -typedef struct Binaries Binaries; - -static const Binary -propListNames[]={ - { "Bidi_Control", 0, U_MASK(UBIDI_BIDI_CONTROL_SHIFT), U_MASK(UBIDI_BIDI_CONTROL_SHIFT) }, - { "Join_Control", 0, U_MASK(UBIDI_JOIN_CONTROL_SHIFT), U_MASK(UBIDI_JOIN_CONTROL_SHIFT) } -}; - -static const Binaries -propListBinaries={ - "PropList", propListNames, LENGTHOF(propListNames) -}; - -static void U_CALLCONV -binariesLineFn(void *context, - char *fields[][2], int32_t fieldCount, - UErrorCode *pErrorCode) { - const Binaries *bin; - char *s; - uint32_t start, end; - int32_t i; - - bin=(const Binaries *)context; - - u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - fprintf(stderr, "genbidi: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]); - exit(*pErrorCode); - } - - /* parse binary property name */ - s=(char *)u_skipWhitespace(fields[1][0]); - for(i=0;; ++i) { - if(i==bin->binariesCount) { - /* ignore unrecognized properties */ - return; - } - if(isToken(bin->binaries[i].propName, s)) { - break; - } - } - - if(bin->binaries[i].vecMask==0) { - fprintf(stderr, "genbidi error: mask value %d==0 for %s %s\n", - (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName); - exit(U_INTERNAL_PROGRAM_ERROR); - } - - upvec_setValue(pv, start, end, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - fprintf(stderr, "genbidi error: unable to set %s, code: %s\n", - bin->binaries[i].propName, u_errorName(*pErrorCode)); - exit(*pErrorCode); - } -} - -static void -parseBinariesFile(char *filename, char *basename, const char *suffix, - const Binaries *bin, - UErrorCode *pErrorCode) { - char *fields[2][2]; - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return; - } - - writeUCDFilename(basename, bin->ucdFile, suffix); - - u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode)); - } -} - -/* -------------------------------------------------------------------------- */ - -enum { - HELP_H, - HELP_QUESTION_MARK, - VERBOSE, - COPYRIGHT, - DESTDIR, - SOURCEDIR, - UNICODE_VERSION, - ICUDATADIR, - CSOURCE -}; - -/* Keep these values in sync with the above enums */ -static UOption options[]={ - UOPTION_HELP_H, - UOPTION_HELP_QUESTION_MARK, - UOPTION_VERBOSE, - UOPTION_COPYRIGHT, - UOPTION_DESTDIR, - UOPTION_SOURCEDIR, - UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), - UOPTION_ICUDATADIR, - UOPTION_DEF("csource", 'C', UOPT_NO_ARG) -}; - -extern int -main(int argc, char* argv[]) { - char filename[300]; - const char *srcDir=NULL, *destDir=NULL, *suffix=NULL; - char *basename=NULL; - UErrorCode errorCode=U_ZERO_ERROR; - - U_MAIN_INIT_ARGS(argc, argv); - - /* preset then read command line options */ - options[DESTDIR].value=u_getDataDirectory(); - options[SOURCEDIR].value=""; - options[UNICODE_VERSION].value=""; - options[ICUDATADIR].value=u_getDataDirectory(); - argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); - - /* error handling, printing usage message */ - if(argc<0) { - fprintf(stderr, - "error in command line argument \"%s\"\n", - argv[-argc]); - } - if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { - /* - * Broken into chucks because the C89 standard says the minimum - * required supported string length is 509 bytes. - */ - fprintf(stderr, - "Usage: %s [-options] [suffix]\n" - "\n" - "read the UnicodeData.txt file and other Unicode properties files and\n" - "create a binary file " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE " with the bidi/shaping properties\n" - "\n", - argv[0]); - fprintf(stderr, - "Options:\n" - "\t-h or -? or --help this usage text\n" - "\t-v or --verbose verbose output\n" - "\t-c or --copyright include a copyright notice\n" - "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n" - "\t-C or --csource generate a .c source file rather than the .icu binary\n"); - fprintf(stderr, - "\t-d or --destdir destination directory, followed by the path\n" - "\t-s or --sourcedir source directory, followed by the path\n" - "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" - "\t followed by path, defaults to %s\n" - "\tsuffix suffix that is to be appended with a '-'\n" - "\t to the source file basenames before opening;\n" - "\t 'genbidi new' will read UnicodeData-new.txt etc.\n", - u_getDataDirectory()); - return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; - } - - /* get the options values */ - beVerbose=options[VERBOSE].doesOccur; - haveCopyright=options[COPYRIGHT].doesOccur; - srcDir=options[SOURCEDIR].value; - destDir=options[DESTDIR].value; - - if(argc>=2) { - suffix=argv[1]; - } else { - suffix=NULL; - } - - if(options[UNICODE_VERSION].doesOccur) { - setUnicodeVersion(options[UNICODE_VERSION].value); - } - /* else use the default dataVersion in store.c */ - - if (options[ICUDATADIR].doesOccur) { - u_setDataDirectory(options[ICUDATADIR].value); - } - - /* prepare the filename beginning with the source dir */ - uprv_strcpy(filename, srcDir); - basename=filename+uprv_strlen(filename); - if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { - *basename++=U_FILE_SEP_CHAR; - } - - /* initialize */ - pv=upvec_open(2, &errorCode); - - /* process BidiMirroring.txt */ - writeUCDFilename(basename, "BidiMirroring", suffix); - parseBidiMirroring(filename, &errorCode); - - /* process additional properties files */ - *basename=0; - - parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode); - - parseSingleEnumFile(filename, basename, suffix, &jtSingleEnum, &errorCode); - - parseSingleEnumFile(filename, basename, suffix, &jgSingleEnum, &errorCode); - - /* process UnicodeData.txt */ - writeUCDFilename(basename, "UnicodeData", suffix); - parseDB(filename, &errorCode); - - /* set proper bidi class for unassigned code points (Cn) */ - parseTwoFieldFile(filename, basename, "DerivedBidiClass", suffix, bidiClassLineFn, &errorCode); - - /* process parsed data */ - if(U_SUCCESS(errorCode)) { - /* write the properties data file */ - generateData(destDir, options[CSOURCE].doesOccur); - } - - u_cleanup(); - return errorCode; -} - -U_CFUNC void -writeUCDFilename(char *basename, const char *filename, const char *suffix) { - int32_t length=(int32_t)uprv_strlen(filename); - uprv_strcpy(basename, filename); - if(suffix!=NULL) { - basename[length++]='-'; - uprv_strcpy(basename+length, suffix); - length+=(int32_t)uprv_strlen(suffix); - } - uprv_strcpy(basename+length, ".txt"); -} - -/* TODO: move to toolutil */ -static UBool -isToken(const char *token, const char *s) { - const char *z; - int32_t j; - - s=u_skipWhitespace(s); - for(j=0;; ++j) { - if(token[j]!=0) { - if(s[j]!=token[j]) { - break; - } - } else { - z=u_skipWhitespace(s+j); - if(*z==';' || *z==0) { - return TRUE; - } else { - break; - } - } - } - - return FALSE; -} - -/* parser for BidiMirroring.txt --------------------------------------------- */ - -static void U_CALLCONV -mirrorLineFn(void *context, - char *fields[][2], int32_t fieldCount, - UErrorCode *pErrorCode) { - const char *s; - char *end; - UChar32 src, mirror; - - /* ignore "" which is on the @missing line */ - s=u_skipWhitespace(fields[1][0]); - if(0==uprv_strncmp(s, "", 12)) { - return; - } - - src=(UChar32)uprv_strtoul(fields[0][0], &end, 16); - if(end<=fields[0][0] || end!=fields[0][1]) { - fprintf(stderr, "genbidi: syntax error in BidiMirroring.txt field 0 at %s\n", fields[0][0]); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - mirror=(UChar32)uprv_strtoul(s, &end, 16); - if(end<=s || end!=fields[1][1]) { - fprintf(stderr, "genbidi: syntax error in BidiMirroring.txt field 1 at %s\n", fields[1][0]); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - addMirror(src, mirror); -} - -static void -parseBidiMirroring(const char *filename, UErrorCode *pErrorCode) { - char *fields[2][2]; - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return; - } - - u_parseDelimitedFile(filename, ';', fields, 2, mirrorLineFn, NULL, pErrorCode); -} - -/* parser for UnicodeData.txt ----------------------------------------------- */ - -static void U_CALLCONV -unicodeDataLineFn(void *context, - char *fields[][2], int32_t fieldCount, - UErrorCode *pErrorCode) { - char *end; - UErrorCode errorCode; - UChar32 c; - - errorCode=U_ZERO_ERROR; - - /* get the character code, field 0 */ - c=(UChar32)uprv_strtoul(fields[0][0], &end, 16); - if(end<=fields[0][0] || end!=fields[0][1]) { - fprintf(stderr, "genbidi: syntax error in field 0 at %s\n", fields[0][0]); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - /* get Mirrored flag, field 9 */ - if(*fields[9][0]=='Y') { - upvec_setValue(pv, c, c, 0, U_MASK(UBIDI_IS_MIRRORED_SHIFT), U_MASK(UBIDI_IS_MIRRORED_SHIFT), &errorCode); - if(U_FAILURE(*pErrorCode)) { - fprintf(stderr, "genbidi error: unable to set 'is mirrored' for U+%04lx, code: %s\n", - (long)c, u_errorName(errorCode)); - exit(errorCode); - } - } else if(fields[9][1]-fields[9][0]!=1 || *fields[9][0]!='N') { - fprintf(stderr, "genbidi: syntax error in field 9 at U+%04lx\n", - (long)c); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } -} - -static void -parseDB(const char *filename, UErrorCode *pErrorCode) { - /* default Bidi classes for unassigned code points */ - static const UChar32 defaultBidi[][3]={ /* { start, end, class } */ - /* R: U+0590..U+05FF, U+07C0..U+08FF, U+FB1D..U+FB4F, U+10800..U+10FFF */ - { 0x0590, 0x05FF, U_RIGHT_TO_LEFT }, - { 0x07C0, 0x08FF, U_RIGHT_TO_LEFT }, - { 0xFB1D, 0xFB4F, U_RIGHT_TO_LEFT }, - { 0x10800, 0x10FFF, U_RIGHT_TO_LEFT }, - - /* AL: U+0600..U+07BF, U+FB50..U+FDCF, U+FDF0..U+FDFF, U+FE70..U+FEFE */ - { 0x0600, 0x07BF, U_RIGHT_TO_LEFT_ARABIC }, - { 0xFB50, 0xFDCF, U_RIGHT_TO_LEFT_ARABIC }, - { 0xFDF0, 0xFDFF, U_RIGHT_TO_LEFT_ARABIC }, - { 0xFE70, 0xFEFE, U_RIGHT_TO_LEFT_ARABIC } - - /* L otherwise */ - }; - - char *fields[15][2]; - UChar32 start, end; - int32_t i; - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return; - } - - /* - * Set default Bidi classes for unassigned code points. - * See the documentation for Bidi_Class in UCD.html in the Unicode data. - * http://www.unicode.org/Public/ - * - * Starting with Unicode 5.0, DerivedBidiClass.txt should (re)set - * the Bidi_Class values for all code points including unassigned ones - * and including L values for these. - * This code becomes unnecesary but harmless. Leave it for now in case - * someone uses genbidi on pre-Unicode 5.0 data. - */ - for(i=0; i - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/tools/unicode/c/genbidi/store.c b/tools/unicode/c/genbidi/store.c deleted file mode 100644 index 23901ded92d..00000000000 --- a/tools/unicode/c/genbidi/store.c +++ /dev/null @@ -1,466 +0,0 @@ -/* -******************************************************************************* -* -* Copyright (C) 2004-2011, International Business Machines -* Corporation and others. All Rights Reserved. -* -******************************************************************************* -* file name: store.c -* encoding: US-ASCII -* tab size: 8 (not used) -* indentation:4 -* -* created on: 2004dec30 -* created by: Markus W. Scherer -* -* Store Unicode bidi/shaping properties efficiently for -* random access. -*/ - -#include -#include -#include "unicode/utypes.h" -#include "unicode/uchar.h" -#include "cmemory.h" -#include "cstring.h" -#include "utrie2.h" -#include "uarrsort.h" -#include "unicode/udata.h" -#include "unewdata.h" -#include "propsvec.h" -#include "writesrc.h" -#include "ubidi_props.h" -#include "genbidi.h" - -#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) - -/* Unicode bidi/shaping properties file format --------------------------------- - -The file format prepared and written here contains several data -structures that store indexes or data. - -Before the data contents described below, there are the headers required by -the udata API for loading ICU data. Especially, a UDataInfo structure -precedes the actual data. It contains platform properties values and the -file format version. - -The following is a description of format version 2.0 . - -The file contains the following structures: - - const int32_t indexes[i0] with values i0, i1, ...: - (see UBIDI_IX_... constants for names of indexes) - - i0 indexLength; -- length of indexes[] (UBIDI_IX_TOP) - i1 dataLength; -- length in bytes of the post-header data (incl. indexes[]) - i2 trieSize; -- size in bytes of the bidi/shaping properties trie - i3 mirrorLength; -- length in uint32_t of the bidi mirroring array - - i4 jgStart; -- first code point with Joining_Group data - i5 jgLimit; -- limit code point for Joining_Group data - - i6..i14 reservedIndexes; -- reserved values; 0 for now - - i15 maxValues; -- maximum code values for enumerated properties - bits 23..16 contain the max value for Joining_Group, - otherwise the bits are used like enum fields in the trie word - - Serialized trie, see utrie2.h; - - const uint32_t mirrors[mirrorLength]; - - const uint8_t jgArray[i5-i4]; -- (i5-i4) is always a multiple of 4 - -Trie data word: -Bits -15..13 signed delta to bidi mirroring code point - (add delta to input code point) - 0 no such code point (source maps to itself) - -3..-1, 1..3 delta - -4 look in mirrors table - 12 is mirrored - 11 Bidi_Control - 10 Join_Control - 9.. 8 reserved (set to 0) - 7.. 5 Joining_Type - 4.. 0 BiDi category - - -Mirrors: -Stores some of the bidi mirroring data, where each code point maps to -at most one other. -Most code points do not have a mirroring code point; most that do have a signed -delta stored in the trie data value. Only those where the delta does not fit -into the trie data are stored in this table. - -Logically, this is a two-column table with source and mirror code points. - -Physically, the table is compressed by taking advantage of the fact that each -mirror code point is also a source code point -(each of them is a mirror of the other). -Therefore, both logical columns contain the same set of code points, which needs -to be stored only once. - -The table stores source code points, and also for each the index of its mirror -code point in the same table, in a simple array of uint32_t. -Bits -31..21 index to mirror code point (unsigned) -20.. 0 source code point - -The table is sorted by source code points. - - -Joining_Group array: -The Joining_Group values do not fit into the 16-bit trie, but the data is also -limited to a small range of code points (Arabic and Syriac) and not -well compressible. - -The start and limit code points for the range are stored in the indexes[] -array, and the jgArray[] stores a byte for each of these code points, -containing the Joining_Group value. - -All code points outside of this range have No_Joining_Group (0). - ---- Changes in format version 2 --- - -Change from UTrie to UTrie2. - ------------------------------------------------------------------------------ */ - -/* UDataInfo cf. udata.h */ -static UDataInfo dataInfo={ - sizeof(UDataInfo), - 0, - - U_IS_BIG_ENDIAN, - U_CHARSET_FAMILY, - U_SIZEOF_UCHAR, - 0, - - /* dataFormat="BiDi" */ - { UBIDI_FMT_0, UBIDI_FMT_1, UBIDI_FMT_2, UBIDI_FMT_3 }, - { 2, 0, 0, 0 }, /* formatVersion */ - { 6, 0, 0, 0 } /* dataVersion */ -}; - -/* exceptions values */ -static uint32_t mirrors[UBIDI_MAX_MIRROR_INDEX+1][2]; -static uint16_t mirrorTop=0; - -/* -------------------------------------------------------------------------- */ - -extern void -setUnicodeVersion(const char *v) { - UVersionInfo version; - u_versionFromString(version, v); - uprv_memcpy(dataInfo.dataVersion, version, 4); -} - -/* bidi mirroring table ----------------------------------------------------- */ - -extern void -addMirror(UChar32 src, UChar32 mirror) { - UErrorCode errorCode; - int32_t delta; - - delta=mirror-src; - if(delta==0) { - return; /* mapping to self=no mapping */ - } - - if(delta0x1fffff) { - continue; /* this entry already has an index */ - } - - /* search for the mirror code point in the source column */ - if(c%04lx->?\n", - (long)mirrors[i][0], (long)mirrors[i][1]); - errorCode=U_ILLEGAL_ARGUMENT_ERROR; - } - if(c==mirrors[j][0]) { - /* - * found the mirror code point c in the source column, - * set both entries' indexes to each other - */ - if(UBIDI_GET_MIRROR_CODE_POINT(mirrors[i][0])!=UBIDI_GET_MIRROR_CODE_POINT(mirrors[j][1])) { - /* roundtrip check fails */ - fprintf(stderr, - "genbidi error: bidi mirrors do not roundtrip - %04lx->%04lx->%04lx\n", - (long)mirrors[i][0], (long)mirrors[i][1], (long)mirrors[j][1]); - errorCode=U_ILLEGAL_ARGUMENT_ERROR; - } else { - mirrors[i][1]|=(uint32_t)j< +#include +#include "unicode/utypes.h" +#include "unicode/uchar.h" +#include "unicode/uniset.h" +#include "cmemory.h" +#include "cstring.h" +#include "ppucd.h" +#include "uarrsort.h" +#include "unicode/udata.h" +#include "unewdata.h" +#include "utrie2.h" +#include "writesrc.h" +#include "ubidi_props.h" +#include "genprops.h" + +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) + +/* Unicode bidi/shaping properties file format --------------------------------- + +The file format prepared and written here contains several data +structures that store indexes or data. + +Before the data contents described below, there are the headers required by +the udata API for loading ICU data. Especially, a UDataInfo structure +precedes the actual data. It contains platform properties values and the +file format version. + +The following is a description of format version 2.0 . + +The file contains the following structures: + + const int32_t indexes[i0] with values i0, i1, ...: + (see UBIDI_IX_... constants for names of indexes) + + i0 indexLength; -- length of indexes[] (UBIDI_IX_TOP) + i1 dataLength; -- length in bytes of the post-header data (incl. indexes[]) + i2 trieSize; -- size in bytes of the bidi/shaping properties trie + i3 mirrorLength; -- length in uint32_t of the bidi mirroring array + + i4 jgStart; -- first code point with Joining_Group data + i5 jgLimit; -- limit code point for Joining_Group data + + i6..i14 reservedIndexes; -- reserved values; 0 for now + + i15 maxValues; -- maximum code values for enumerated properties + bits 23..16 contain the max value for Joining_Group, + otherwise the bits are used like enum fields in the trie word + + Serialized trie, see utrie2.h; + + const uint32_t mirrors[mirrorLength]; + + const uint8_t jgArray[i5-i4]; -- (i5-i4) is always a multiple of 4 + +Trie data word: +Bits +15..13 signed delta to bidi mirroring code point + (add delta to input code point) + 0 no such code point (source maps to itself) + -3..-1, 1..3 delta + -4 look in mirrors table + 12 is mirrored + 11 Bidi_Control + 10 Join_Control + 9.. 8 reserved (set to 0) + 7.. 5 Joining_Type + 4.. 0 BiDi category + + +Mirrors: +Stores some of the bidi mirroring data, where each code point maps to +at most one other. +Most code points do not have a mirroring code point; most that do have a signed +delta stored in the trie data value. Only those where the delta does not fit +into the trie data are stored in this table. + +Logically, this is a two-column table with source and mirror code points. + +Physically, the table is compressed by taking advantage of the fact that each +mirror code point is also a source code point +(each of them is a mirror of the other). +Therefore, both logical columns contain the same set of code points, which needs +to be stored only once. + +The table stores source code points, and also for each the index of its mirror +code point in the same table, in a simple array of uint32_t. +Bits +31..21 index to mirror code point (unsigned) +20.. 0 source code point + +The table is sorted by source code points. + + +Joining_Group array: +The Joining_Group values do not fit into the 16-bit trie, but the data is also +limited to a small range of code points (Arabic and Syriac) and not +well compressible. + +The start and limit code points for the range are stored in the indexes[] +array, and the jgArray[] stores a byte for each of these code points, +containing the Joining_Group value. + +All code points outside of this range have No_Joining_Group (0). + +--- Changes in format version 2 --- + +Change from UTrie to UTrie2. + +----------------------------------------------------------------------------- */ + +U_NAMESPACE_USE + +/* UDataInfo cf. udata.h */ +static UDataInfo dataInfo={ + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + 0, + + /* dataFormat="BiDi" */ + { UBIDI_FMT_0, UBIDI_FMT_1, UBIDI_FMT_2, UBIDI_FMT_3 }, + { 2, 0, 0, 0 }, /* formatVersion */ + { 6, 0, 0, 0 } /* dataVersion */ +}; + +/* -------------------------------------------------------------------------- */ + +class BiDiPropsBuilder : public PropsBuilder { +public: + BiDiPropsBuilder(UErrorCode &errorCode); + virtual ~BiDiPropsBuilder(); + + virtual void setUnicodeVersion(const UVersionInfo version); + virtual void setProps(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode); + virtual void build(UErrorCode &errorCode); + virtual void writeCSourceFile(const char *path, UErrorCode &errorCode); + virtual void writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode); + +private: + int32_t encodeBidiMirroringGlyph(UChar32 src, UChar32 end, UChar32 mirror, UErrorCode &errorCode); + void makeMirror(UErrorCode &errorCode); + + UnicodeSet relevantProps; + UTrie2 *pTrie; + uint8_t jgArray[0x300]; /* at most for U+0600..U+08FF */ + UChar32 jgStart; // First code point with a Joining_Group. + UChar32 jgLimit; // One past the last one. + uint32_t mirrors[UBIDI_MAX_MIRROR_INDEX+1][2]; + int32_t mirrorTop; +}; + +BiDiPropsBuilder::BiDiPropsBuilder(UErrorCode &errorCode) + : pTrie(NULL), + jgStart(0), jgLimit(0), + mirrorTop(0) { + // This builder encodes the following properties. + relevantProps. + add(UCHAR_BIDI_CONTROL). + add(UCHAR_BIDI_MIRRORED). + add(UCHAR_BIDI_CLASS). + add(UCHAR_BIDI_MIRRORING_GLYPH). + add(UCHAR_JOIN_CONTROL). + add(UCHAR_JOINING_GROUP). + add(UCHAR_JOINING_TYPE); + pTrie=utrie2_open(0, 0, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "genprops error: bidipropsbuilder utrie2_open() failed - %s\n", + u_errorName(errorCode)); + } + uprv_memset(jgArray, U_JG_NO_JOINING_GROUP, sizeof(jgArray)); +} + +BiDiPropsBuilder::~BiDiPropsBuilder() { + utrie2_close(pTrie); +} + +void +BiDiPropsBuilder::setUnicodeVersion(const UVersionInfo version) { + uprv_memcpy(dataInfo.dataVersion, version, 4); +} + +/* bidi mirroring table ----------------------------------------------------- */ + +int32_t +BiDiPropsBuilder::encodeBidiMirroringGlyph(UChar32 src, UChar32 end, UChar32 mirror, + UErrorCode &errorCode) { + if(U_FAILURE(errorCode) || mirror<0) { + return 0; + } + if(src!=end) { + fprintf(stderr, + "genprops error: range U+%04lX..U+%04lX all with the same " + "Bidi_Mirroring_Glyph U+%04lX\n", + (long)src, (long)end, (long)mirror); + errorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + int32_t delta=mirror-src; + if(delta==0) { + return 0; /* mapping to self=no mapping */ + } + + if(delta=jgLimit || jgLimit==0) { jgLimit=end+1; } + } + // On the off-chance that a block is defined with a Joining_Group + // that is then overridden by No_Joining_Group, + // we set that too, but only inside U+0600..U+08FF. + if(end>=0x600 && start<=0x8ff) { + if(start<0x600) { start=0x600; } + if(end>0x8ff) { end=0x8ff; } + + /* set Joining_Group value for start..end */ + for(UChar32 c=start; c<=end; ++c) { + jgArray[c-0x600]=(uint8_t)jg; + } + } +} + +/* generate output data ----------------------------------------------------- */ + +static int32_t U_CALLCONV +compareMirror(const void *context, const void *left, const void *right) { + UChar32 l, r; + + l=UBIDI_GET_MIRROR_CODE_POINT(((const uint32_t *)left)[0]); + r=UBIDI_GET_MIRROR_CODE_POINT(((const uint32_t *)right)[0]); + return l-r; +} + +void +BiDiPropsBuilder::makeMirror(UErrorCode &errorCode) { + /* sort the mirroring table by source code points */ + uprv_sortArray(mirrors, mirrorTop, 8, + compareMirror, NULL, FALSE, &errorCode); + if(U_FAILURE(errorCode)) { return; } + + /* + * reduce the 2-column table to a single column + * by putting the index to the mirror entry into the source entry + * + * first: + * find each mirror code point in the source column and set each other's indexes + * + * second: + * reduce the table, combine the source code points with their indexes + * and store as a simple array of uint32_t + */ + for(int32_t i=0; i0x1fffff) { + continue; /* this entry already has an index */ + } + + /* search for the mirror code point in the source column */ + int32_t start, limit, step; + if(c%04lx->?\n", + (long)mirrors[i][0], (long)mirrors[i][1]); + errorCode=U_ILLEGAL_ARGUMENT_ERROR; + } + if(c==mirrors[j][0]) { + /* + * found the mirror code point c in the source column, + * set both entries' indexes to each other + */ + if(UBIDI_GET_MIRROR_CODE_POINT(mirrors[i][0])!=UBIDI_GET_MIRROR_CODE_POINT(mirrors[j][1])) { + /* roundtrip check fails */ + fprintf(stderr, + "genprops error: bidi mirrors do not roundtrip - %04lx->%04lx->%04lx\n", + (long)mirrors[i][0], (long)mirrors[i][1], (long)mirrors[j][1]); + errorCode=U_ILLEGAL_ARGUMENT_ERROR; + } else { + mirrors[i][1]|=(uint32_t)j< pnamesBuilder(createPNamesBuilder(errorCode)); LocalPointer corePropsBuilder(createCorePropsBuilder(errorCode)); + LocalPointer bidiPropsBuilder(createBiDiPropsBuilder(errorCode)); if(errorCode.isFailure()) { fprintf(stderr, "genprops: unable to create PropsBuilders - %s\n", errorCode.errorName()); return errorCode.reset(); @@ -136,9 +137,11 @@ main(int argc, char* argv[]) { if(ppucd.lineHasPropertyValues()) { const UniProps *props=ppucd.getProps(newValues, errorCode); corePropsBuilder->setProps(*props, newValues, errorCode); + bidiPropsBuilder->setProps(*props, newValues, errorCode); } else if(lineType==PreparsedUCD::UNICODE_VERSION_LINE) { const UVersionInfo &version=ppucd.getUnicodeVersion(); corePropsBuilder->setUnicodeVersion(version); + bidiPropsBuilder->setUnicodeVersion(version); } if(errorCode.isFailure()) { fprintf(stderr, @@ -149,6 +152,7 @@ main(int argc, char* argv[]) { } corePropsBuilder->build(errorCode); + bidiPropsBuilder->build(errorCode); if(errorCode.isFailure()) { fprintf(stderr, "genprops error: failure finalizing the data - %s\n", errorCode.errorName()); @@ -168,6 +172,8 @@ main(int argc, char* argv[]) { pnamesBuilder->writeBinaryData(sourceDataIn.data(), withCopyright, errorCode); corePropsBuilder->writeCSourceFile(sourceCommon.data(), errorCode); corePropsBuilder->writeBinaryData(sourceDataIn.data(), withCopyright, errorCode); + bidiPropsBuilder->writeCSourceFile(sourceCommon.data(), errorCode); + bidiPropsBuilder->writeBinaryData(sourceDataIn.data(), withCopyright, errorCode); return errorCode; } diff --git a/tools/unicode/c/genprops/genprops.h b/tools/unicode/c/genprops/genprops.h index 7e27dd9607e..a50f759d81e 100644 --- a/tools/unicode/c/genprops/genprops.h +++ b/tools/unicode/c/genprops/genprops.h @@ -41,6 +41,7 @@ public: PNamesBuilder *createPNamesBuilder(UErrorCode &errorCode); PropsBuilder *createCorePropsBuilder(UErrorCode &errorCode); +PropsBuilder *createBiDiPropsBuilder(UErrorCode &errorCode); /* global flags */ U_CFUNC UBool beVerbose;