diff --git a/.gitignore b/.gitignore
index ca57dfd1ab1..27144532fce 100644
--- a/.gitignore
+++ b/.gitignore
@@ -980,22 +980,6 @@ tools/unicode/c/gencase/gencase.vcproj.*.*.user
tools/unicode/c/gencase/release
tools/unicode/c/gencase/x64
tools/unicode/c/gencase/x86
-tools/unicode/c/gennames/*.d
-tools/unicode/c/gennames/*.ncb
-tools/unicode/c/gennames/*.o
-tools/unicode/c/gennames/*.opt
-tools/unicode/c/gennames/*.pdb
-tools/unicode/c/gennames/*.plg
-tools/unicode/c/gennames/Debug
-tools/unicode/c/gennames/Makefile
-tools/unicode/c/gennames/Release
-tools/unicode/c/gennames/debug
-tools/unicode/c/gennames/gennames
-tools/unicode/c/gennames/gennames.[0-9]
-tools/unicode/c/gennames/gennames.vcproj.*.*.user
-tools/unicode/c/gennames/release
-tools/unicode/c/gennames/x64
-tools/unicode/c/gennames/x86
tools/unicode/c/genprops/*.d
tools/unicode/c/genprops/*.ncb
tools/unicode/c/genprops/*.o
diff --git a/tools/unicode/c/CMakeLists.txt b/tools/unicode/c/CMakeLists.txt
index c7f143eaf97..155d631159f 100644
--- a/tools/unicode/c/CMakeLists.txt
+++ b/tools/unicode/c/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (C) 2010-2011, International Business Machines
+# Copyright (C) 2010-2012, International Business Machines
# Corporation and others. All Rights Reserved.
#
# created on: 2010jun03
@@ -18,7 +18,6 @@ include_directories(
${ICU_SRC_DIR}/source/tools/toolutil)
link_directories(${ICU_INST_DIR}/lib)
add_subdirectory(gencase)
-add_subdirectory(gennames)
add_subdirectory(genprops)
add_subdirectory(genuca)
add_subdirectory(genuts46)
diff --git a/tools/unicode/c/gennames/CMakeLists.txt b/tools/unicode/c/gennames/CMakeLists.txt
deleted file mode 100644
index 843f0c7b4a6..00000000000
--- a/tools/unicode/c/gennames/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-# Copyright (C) 2010, International Business Machines
-# Corporation and others. All Rights Reserved.
-#
-# created on: 2010jun03
-# created by: Markus W. Scherer
-# edited on: 2010jul20
-# edited by: Stuart G. Gill
-add_executable(gennames gennames.c)
-target_link_libraries(gennames icuuc icutu)
diff --git a/tools/unicode/c/gennames/Makefile.in b/tools/unicode/c/gennames/Makefile.in
deleted file mode 100644
index ed2f88f3fd8..00000000000
--- a/tools/unicode/c/gennames/Makefile.in
+++ /dev/null
@@ -1,97 +0,0 @@
-## Makefile.in for ICU - tools/gennames
-## Copyright (c) 1999-2005, International Business Machines Corporation and
-## others. All Rights Reserved.
-## Steven R. Loomis
-
-## Source directory information
-srcdir = @srcdir@
-top_srcdir = @top_srcdir@
-
-top_builddir = ../..
-
-include $(top_builddir)/icudefs.mk
-
-## Build directory information
-subdir = tools/gennames
-
-TARGET_STUB_NAME = gennames
-
-SECTION = 8
-
-#MAN_FILES = $(TARGET_STUB_NAME).$(SECTION)
-
-
-## Extra files to remove for 'make clean'
-CLEANFILES = *~ $(DEPS) $(MAN_FILES)
-
-## Target information
-TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
-
-ifneq ($(top_builddir),$(top_srcdir))
-CPPFLAGS += -I$(top_builddir)/common
-endif
-CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil
-LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
-
-OBJECTS = gennames.o
-
-DEPS = $(OBJECTS:.o=.d)
-
-## List of phony targets
-.PHONY : all all-local install install-local clean clean-local \
-distclean distclean-local dist dist-local check check-local install-man
-
-## Clear suffix list
-.SUFFIXES :
-
-## List of standard targets
-all: all-local
-install: install-local
-clean: clean-local
-distclean : distclean-local
-dist: dist-local
-check: all check-local
-
-all-local: $(TARGET) $(MAN_FILES)
-
-install-local: all-local install-man
-# $(MKINSTALLDIRS) $(DESTDIR)$(sbindir)
-# $(INSTALL) $(TARGET) $(DESTDIR)$(sbindir)
-
-install-man: $(MAN_FILES)
-# $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION)
-# $(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION)
-
-dist-local:
-
-clean-local:
- test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
- $(RMV) $(TARGET) $(OBJECTS)
-
-distclean-local: clean-local
- $(RMV) Makefile
-
-check-local: all-local
-
-Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
- cd $(top_builddir) \
- && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
-
-$(TARGET) : $(OBJECTS)
- $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS)
- $(POST_BUILD_STEP)
-
-
-%.$(SECTION): $(srcdir)/%.$(SECTION).in
- cd $(top_builddir) \
- && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
-
-
-ifeq (,$(MAKECMDGOALS))
--include $(DEPS)
-else
-ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
--include $(DEPS)
-endif
-endif
-
diff --git a/tools/unicode/c/gennames/gennames.vcproj b/tools/unicode/c/gennames/gennames.vcproj
deleted file mode 100644
index 0a1f05fd77e..00000000000
--- a/tools/unicode/c/gennames/gennames.vcproj
+++ /dev/null
@@ -1,399 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/tools/unicode/c/genprops/CMakeLists.txt b/tools/unicode/c/genprops/CMakeLists.txt
index 0062d26c211..f6c7c90e25d 100644
--- a/tools/unicode/c/genprops/CMakeLists.txt
+++ b/tools/unicode/c/genprops/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (C) 2010-2011, International Business Machines
+# Copyright (C) 2010-2012, International Business Machines
# Corporation and others. All Rights Reserved.
#
# created on: 2010jun03
@@ -7,5 +7,6 @@
# edited by: Stuart G. Gill
add_executable(genprops genprops.cpp
pnamesbuilder.cpp corepropsbuilder.cpp
- bidipropsbuilder.cpp)
+ bidipropsbuilder.cpp
+ namespropsbuilder.cpp)
target_link_libraries(genprops icuuc icutu)
diff --git a/tools/unicode/c/genprops/bidipropsbuilder.cpp b/tools/unicode/c/genprops/bidipropsbuilder.cpp
index 3ab7a75bd3f..04c88ca47db 100644
--- a/tools/unicode/c/genprops/bidipropsbuilder.cpp
+++ b/tools/unicode/c/genprops/bidipropsbuilder.cpp
@@ -1,7 +1,7 @@
/*
*******************************************************************************
*
-* Copyright (C) 2004-2011, International Business Machines
+* Copyright (C) 2004-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@@ -33,8 +33,6 @@
#include "ubidi_props.h"
#include "genprops.h"
-#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
-
/* Unicode bidi/shaping properties file format ---------------------------------
The file format prepared and written here contains several data
@@ -519,11 +517,11 @@ BiDiPropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorC
PropsBuilder *
createBiDiPropsBuilder(UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return NULL; }
- PropsBuilder *pw=new BiDiPropsBuilder(errorCode);
- if(pw==NULL) {
+ PropsBuilder *pb=new BiDiPropsBuilder(errorCode);
+ if(pb==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
}
- return pw;
+ return pb;
}
/*
diff --git a/tools/unicode/c/genprops/corepropsbuilder.cpp b/tools/unicode/c/genprops/corepropsbuilder.cpp
index db33e815268..73b5ea5a6bc 100644
--- a/tools/unicode/c/genprops/corepropsbuilder.cpp
+++ b/tools/unicode/c/genprops/corepropsbuilder.cpp
@@ -1,7 +1,7 @@
/*
*******************************************************************************
*
-* Copyright (C) 1999-2011, International Business Machines
+* Copyright (C) 1999-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@@ -236,8 +236,6 @@ Change from UTrie to UTrie2.
----------------------------------------------------------------------------- */
-#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
-
U_NAMESPACE_USE
/* UDataInfo cf. udata.h */
@@ -813,11 +811,11 @@ CorePropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorC
PropsBuilder *
createCorePropsBuilder(UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return NULL; }
- PropsBuilder *pw=new CorePropsBuilder(errorCode);
- if(pw==NULL) {
+ PropsBuilder *pb=new CorePropsBuilder(errorCode);
+ if(pb==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
}
- return pw;
+ return pb;
}
/*
diff --git a/tools/unicode/c/genprops/genprops.cpp b/tools/unicode/c/genprops/genprops.cpp
index 61e5009a66c..b198c22f7a7 100644
--- a/tools/unicode/c/genprops/genprops.cpp
+++ b/tools/unicode/c/genprops/genprops.cpp
@@ -1,7 +1,7 @@
/*
*******************************************************************************
*
-* Copyright (C) 1999-2011, International Business Machines
+* Copyright (C) 1999-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@@ -29,15 +29,16 @@
#include "toolutil.h"
#include "uoptions.h"
-#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
-
U_NAMESPACE_USE
UBool beVerbose=FALSE;
+UBool beQuiet=FALSE;
PropsBuilder::PropsBuilder() {}
PropsBuilder::~PropsBuilder() {}
void PropsBuilder::setUnicodeVersion(const UVersionInfo) {}
+void PropsBuilder::setAlgNamesRange(UChar32, UChar32,
+ const char *, const char *, UErrorCode &) {}
void PropsBuilder::setProps(const UniProps &, const UnicodeSet &, UErrorCode &) {}
void PropsBuilder::build(UErrorCode &) {}
void PropsBuilder::writeCSourceFile(const char *, UErrorCode &) {}
@@ -47,6 +48,7 @@ enum {
HELP_H,
HELP_QUESTION_MARK,
VERBOSE,
+ QUIET,
COPYRIGHT
};
@@ -55,6 +57,7 @@ static UOption options[]={
UOPTION_HELP_H,
UOPTION_HELP_QUESTION_MARK,
UOPTION_VERBOSE,
+ UOPTION_QUIET,
UOPTION_COPYRIGHT
};
@@ -86,18 +89,21 @@ main(int argc, char* argv[]) {
"Options:\n"
"\t-h or -? or --help this usage text\n"
"\t-v or --verbose verbose output\n"
+ "\t-q or --quiet no output\n"
"\t-c or --copyright include a copyright notice\n");
return argc<2 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
}
/* get the options values */
beVerbose=options[VERBOSE].doesOccur;
+ beQuiet=options[QUIET].doesOccur;
/* initialize */
IcuToolErrorCode errorCode("genprops");
LocalPointer pnamesBuilder(createPNamesBuilder(errorCode));
LocalPointer corePropsBuilder(createCorePropsBuilder(errorCode));
LocalPointer bidiPropsBuilder(createBiDiPropsBuilder(errorCode));
+ LocalPointer namesPropsBuilder(createNamesPropsBuilder(errorCode));
if(errorCode.isFailure()) {
fprintf(stderr, "genprops: unable to create PropsBuilders - %s\n", errorCode.errorName());
return errorCode.reset();
@@ -138,10 +144,19 @@ main(int argc, char* argv[]) {
const UniProps *props=ppucd.getProps(newValues, errorCode);
corePropsBuilder->setProps(*props, newValues, errorCode);
bidiPropsBuilder->setProps(*props, newValues, errorCode);
+ namesPropsBuilder->setProps(*props, newValues, errorCode);
} else if(lineType==PreparsedUCD::UNICODE_VERSION_LINE) {
const UVersionInfo &version=ppucd.getUnicodeVersion();
corePropsBuilder->setUnicodeVersion(version);
bidiPropsBuilder->setUnicodeVersion(version);
+ namesPropsBuilder->setUnicodeVersion(version);
+ } else if(lineType==PreparsedUCD::ALG_NAMES_RANGE_LINE) {
+ UChar32 start, end;
+ if(ppucd.getRangeForAlgNames(start, end, errorCode)) {
+ const char *type=ppucd.nextField();
+ const char *prefix=ppucd.nextField(); // NULL if type==hangul
+ namesPropsBuilder->setAlgNamesRange(start, end, type, prefix, errorCode);
+ }
}
if(errorCode.isFailure()) {
fprintf(stderr,
@@ -153,6 +168,7 @@ main(int argc, char* argv[]) {
corePropsBuilder->build(errorCode);
bidiPropsBuilder->build(errorCode);
+ namesPropsBuilder->build(errorCode);
if(errorCode.isFailure()) {
fprintf(stderr, "genprops error: failure finalizing the data - %s\n",
errorCode.errorName());
@@ -174,6 +190,7 @@ main(int argc, char* argv[]) {
corePropsBuilder->writeBinaryData(sourceDataIn.data(), withCopyright, errorCode);
bidiPropsBuilder->writeCSourceFile(sourceCommon.data(), errorCode);
bidiPropsBuilder->writeBinaryData(sourceDataIn.data(), withCopyright, errorCode);
+ namesPropsBuilder->writeBinaryData(sourceDataIn.data(), withCopyright, errorCode);
return errorCode;
}
diff --git a/tools/unicode/c/genprops/genprops.h b/tools/unicode/c/genprops/genprops.h
index a50f759d81e..7eede9563dc 100644
--- a/tools/unicode/c/genprops/genprops.h
+++ b/tools/unicode/c/genprops/genprops.h
@@ -1,7 +1,7 @@
/*
*******************************************************************************
*
-* Copyright (C) 1999-2011, International Business Machines
+* Copyright (C) 1999-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@@ -23,11 +23,15 @@
#include "ppucd.h"
#include "unewdata.h"
+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+
class PropsBuilder {
public:
PropsBuilder();
virtual ~PropsBuilder();
virtual void setUnicodeVersion(const UVersionInfo version);
+ virtual void setAlgNamesRange(UChar32 start, UChar32 end,
+ const char *type, const char *prefix, UErrorCode &errorCode);
virtual void setProps(const icu::UniProps &props, const icu::UnicodeSet &newValues, UErrorCode &errorCode);
virtual void build(UErrorCode &errorCode);
virtual void writeCSourceFile(const char *path, UErrorCode &errorCode);
@@ -42,8 +46,10 @@ public:
PNamesBuilder *createPNamesBuilder(UErrorCode &errorCode);
PropsBuilder *createCorePropsBuilder(UErrorCode &errorCode);
PropsBuilder *createBiDiPropsBuilder(UErrorCode &errorCode);
+PropsBuilder *createNamesPropsBuilder(UErrorCode &errorCode);
/* global flags */
-U_CFUNC UBool beVerbose;
+extern UBool beVerbose;
+extern UBool beQuiet;
#endif
diff --git a/tools/unicode/c/gennames/gennames.c b/tools/unicode/c/genprops/namespropsbuilder.cpp
similarity index 57%
rename from tools/unicode/c/gennames/gennames.c
rename to tools/unicode/c/genprops/namespropsbuilder.cpp
index 2d67443bf44..60d37b43189 100644
--- a/tools/unicode/c/gennames/gennames.c
+++ b/tools/unicode/c/genprops/namespropsbuilder.cpp
@@ -1,11 +1,11 @@
/*
*******************************************************************************
*
-* Copyright (C) 1999-2011, International Business Machines
+* Copyright (C) 1999-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
-* file name: gennames.c
+* file name: namespropsbuilder.cpp (was gennames/gennames.c)
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
@@ -13,11 +13,8 @@
* created on: 1999sep30
* created by: Markus W. Scherer
*
-* This program reads the Unicode character database text file,
-* parses it, and extracts the character code,
-* the "modern" character name, and optionally the
-* Unicode 1.0 character name, and (starting with ICU 2.2) the ISO 10646 comment.
-* It then tokenizes and compresses the names and builds
+* This builder reads Unicode character names and aliases,
+* tokenizes and compresses them, and builds
* compact binary tables for random-access lookup
* in a u_charName() API function.
*
@@ -121,16 +118,16 @@
#include
#include "unicode/utypes.h"
#include "unicode/putil.h"
-#include "unicode/uclean.h"
#include "unicode/udata.h"
+#include "charstr.h"
#include "cmemory.h"
#include "cstring.h"
+#include "genprops.h"
+#include "ppucd.h"
#include "uarrsort.h"
+#include "uassert.h"
#include "unewdata.h"
#include "uoptions.h"
-#include "uparse.h"
-
-#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
#define STRING_STORE_SIZE 1000000
#define GROUP_STORE_SIZE 5000
@@ -143,66 +140,8 @@
#define MAX_WORD_COUNT 20000
#define MAX_GROUP_COUNT 5000
-#define DATA_NAME "unames"
-#define DATA_TYPE "icu"
-#define VERSION_STRING "unam"
#define NAME_SEPARATOR_CHAR ';'
-#define ISO_DATA_NAME "ucomment"
-
-/* Unicode versions --------------------------------------------------------- */
-
-enum {
- UNI_1_0,
- UNI_1_1,
- UNI_2_0,
- UNI_3_0,
- UNI_3_1,
- UNI_3_2,
- UNI_4_0,
- UNI_4_0_1,
- UNI_4_1,
- UNI_5_0,
- UNI_5_1,
- UNI_5_2,
- UNI_6_0,
- UNI_6_1,
- UNI_VER_COUNT
-};
-
-static const UVersionInfo
-unicodeVersions[]={
- { 1, 0, 0, 0 },
- { 1, 1, 0, 0 },
- { 2, 0, 0, 0 },
- { 3, 0, 0, 0 },
- { 3, 1, 0, 0 },
- { 3, 2, 0, 0 },
- { 4, 0, 0, 0 },
- { 4, 0, 1, 0 },
- { 4, 1, 0, 0 },
- { 5, 0, 0, 0 },
- { 5, 1, 0, 0 },
- { 5, 2, 0, 0 },
- { 6, 0, 0, 0 },
- { 6, 1, 0, 0 }
-};
-
-static int32_t ucdVersion=UNI_6_1;
-
-static int32_t
-findUnicodeVersion(const UVersionInfo version) {
- int32_t i;
-
- for(i=0; /* while(version>unicodeVersions[i]) {} */
- i0;
- ++i) {}
- if(0=ucdVersion comparisons */
- }
- return i; /* version>=unicodeVersions[i] && version=3) {
- parseNameAliases(argv[2], &moreOptions);
- }
- parseDB(argc>=2 ? argv[1] : "-", &moreOptions);
- compress();
- generateData(options[DESTDIR].value, &moreOptions);
-
- u_cleanup();
- return 0;
-}
-
-static void
-init() {
- int i;
-
- for(i=0; i<256; ++i) {
- tokens[i]=0;
- }
-}
-
/* parsing ------------------------------------------------------------------ */
-/* get a name, strip leading and trailing whitespace */
-static int16_t
-getName(char **pStart, char *limit) {
- /* strip leading whitespace */
- char *start=(char *)u_skipWhitespace(*pStart);
-
- /* strip trailing whitespace */
- while(start=sizeof(cpNameAliases[cpNameAliasesTop].nameAlias)) {
- fprintf(stderr, "gennames: error - name alias %s empty or too long for code point U+%04lx\n",
- name, (unsigned long)code);
- *pErrorCode=U_PARSE_ERROR;
- exit(U_PARSE_ERROR);
- }
-
- /* check for non-character code points */
- if(!U_IS_UNICODE_CHAR(code)) {
- fprintf(stderr, "gennames: error - name alias for non-character code point U+%04lx\n",
- (unsigned long)code);
- *pErrorCode=U_PARSE_ERROR;
- exit(U_PARSE_ERROR);
- }
-
- /*
- * Only use "correction" aliases for now, from Unicode 6.1 NameAliases.txt with 3 fields per line.
- * TODO: Work on ticket #8963 to deal with multiple type:alias pairs per character.
- */
- fields[2][1]=0;
- if(0!=uprv_strcmp("correction", fields[2][0])) {
+void
+NamesPropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
+ UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return; }
+ if(!newValues.contains(UCHAR_NAME) && !newValues.contains(PPUCD_NAME_ALIAS)) {
return;
}
- /* check that the code points (code) are in ascending order */
- if(code<=prevCode && code>0) {
- fprintf(stderr, "gennames: error - NameAliases entries out of order, U+%04lx after U+%04lx\n",
- (unsigned long)code, (unsigned long)prevCode);
- *pErrorCode=U_PARSE_ERROR;
- exit(U_PARSE_ERROR);
- }
- prevCode=code;
+ U_ASSERT(props.start==props.end);
- if(cpNameAliasesTop>=LENGTHOF(cpNameAliases)) {
- fprintf(stderr, "gennames: error - too many name aliases\n");
- *pErrorCode=U_PARSE_ERROR;
- exit(U_PARSE_ERROR);
- }
- cpNameAliases[cpNameAliasesTop].code=code;
- uprv_memcpy(cpNameAliases[cpNameAliasesTop].nameAlias, name, length);
- cpNameAliases[cpNameAliasesTop].nameAlias[length]=0;
- ++cpNameAliasesTop;
-
- parseName(name, length);
-}
-
-static void U_CALLCONV
-lineFn(void *context,
- char *fields[][2], int32_t fieldCount,
- UErrorCode *pErrorCode) {
- Options *storeOptions=(Options *)context;
- char *names[4];
+ const char *names[4]={ NULL, NULL, NULL, NULL };
int16_t lengths[4]={ 0, 0, 0, 0 };
- static uint32_t prevCode=0;
- uint32_t code=0;
-
- if(U_FAILURE(*pErrorCode)) {
- return;
- }
- /* get the character code */
- code=uprv_strtoul(fields[0][0], NULL, 16);
/* get the character name */
- if(storeOptions->storeNames) {
- names[0]=fields[1][0];
- lengths[0]=getName(names+0, fields[1][1]);
- if(names[0][0]=='<') {
- /* do not store pseudo-names in <> brackets */
- lengths[0]=0;
+ if(props.name!=NULL) {
+ names[0]=props.name;
+ lengths[0]=(int16_t)uprv_strlen(props.name);
+ parseName(names[0], lengths[0]);
+ }
+
+    CharString buffer;  // backing storage for a correction alias that is followed by more pairs
+    if(props.nameAlias!=NULL) {
+        // Only use "correction" aliases for now, from Unicode 6.1 NameAliases.txt with 3 fields per line.
+        // TODO: Work on ticket #8963 to deal with multiple type:alias pairs per character.
+        const char *corr=uprv_strstr(props.nameAlias, "correction=");
+        if(corr!=NULL) {
+            corr+=11;  // skip "correction="
+            const char *limit=uprv_strchr(corr, ',');
+            if(limit!=NULL) {
+                buffer.append(corr, (int32_t)(limit-corr), errorCode);
+                // After a failed append the buffer cannot be assumed to hold limit-corr bytes.
+                if(U_FAILURE(errorCode)) { return; }
+                names[3]=buffer.data();
+                lengths[3]=(int16_t)(limit-corr);
+            } else {
+                names[3]=corr;
+                lengths[3]=(int16_t)uprv_strlen(corr);
+            }
+            parseName(names[3], lengths[3]);
         }
     }
- /* store 1.0 names */
- /* get the second character name, the one from Unicode 1.0 */
- if(storeOptions->store10Names) {
- names[1]=fields[10][0];
- lengths[1]=getName(names+1, fields[10][1]);
- if(names[1][0]=='<') {
- /* do not store pseudo-names in <> brackets */
- lengths[1]=0;
- }
- }
-
- /* get the ISO 10646 comment */
- if(storeOptions->storeISOComments) {
- names[2]=fields[11][0];
- lengths[2]=getName(names+2, fields[11][1]);
- }
-
- if(lengths[0]+lengths[1]+lengths[2]==0) {
- return;
- }
-
- /* check for non-character code points */
- if(!U_IS_UNICODE_CHAR(code)) {
- fprintf(stderr, "gennames: error - properties for non-character code point U+%04lx\n",
- (unsigned long)code);
- *pErrorCode=U_PARSE_ERROR;
- exit(U_PARSE_ERROR);
- }
-
- /* check that the code points (code) are in ascending order */
- if(code<=prevCode && code>0) {
- fprintf(stderr, "gennames: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
- (unsigned long)code, (unsigned long)prevCode);
- *pErrorCode=U_PARSE_ERROR;
- exit(U_PARSE_ERROR);
- }
- prevCode=code;
-
- parseName(names[0], lengths[0]);
- parseName(names[1], lengths[1]);
- parseName(names[2], lengths[2]);
-
- if(cpNameAliasesIndex=cpNameAliases[cpNameAliasesIndex].code) {
- if(code==cpNameAliases[cpNameAliasesIndex].code) {
- names[3]=cpNameAliases[cpNameAliasesIndex].nameAlias;
- lengths[3]=(int16_t)uprv_strlen(cpNameAliases[cpNameAliasesIndex].nameAlias);
- ++cpNameAliasesIndex;
- } else {
- fprintf(stderr, "gennames: error - NameAlias but no UnicodeData entry for U+%04lx\n",
- (unsigned long)code);
- *pErrorCode=U_PARSE_ERROR;
- exit(U_PARSE_ERROR);
- }
- }
-
- /*
- * set the count argument to
- * 1: only store regular names, or only store ISO 10646 comments
- * 2: store regular and 1.0 names
- * 3: store names and ISO 10646 comment
- * 4: also store name alias
- *
- * addLine() will ignore empty trailing names
- */
- if(storeOptions->storeNames) {
- /* store names and comments as parsed according to storeOptions */
- addLine(code, names, lengths, LENGTHOF(names));
- } else {
- /* store only ISO 10646 comments */
- addLine(code, names+2, lengths+2, 1);
- }
+ addLine(props.start, names, lengths, LENGTHOF(names));
}
static void
-parseNameAliases(const char *filename, Options *storeOptions) {
- char *fields[3][2];
- UErrorCode errorCode=U_ZERO_ERROR;
-
- if(!storeOptions->storeNames) {
- return;
- }
- /*
- * This works only for Unicode 6.1 NameAliases.txt with 3 fields per line.
- * TODO: Work on ticket #8963 to deal with multiple type:alias pairs per character.
- */
- u_parseDelimitedFile(filename, ';', fields, 3, nameAliasesLineFn, NULL, &errorCode);
- if(U_FAILURE(errorCode)) {
- fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
- exit(errorCode);
- }
-
- if(!beQuiet) {
- printf("number of name aliases: %lu\n", (unsigned long)cpNameAliasesTop);
- }
-}
-
-static void
-parseDB(const char *filename, Options *storeOptions) {
- char *fields[15][2];
- UErrorCode errorCode=U_ZERO_ERROR;
-
- u_parseDelimitedFile(filename, ';', fields, 15, lineFn, storeOptions, &errorCode);
- if(U_FAILURE(errorCode)) {
- fprintf(stderr, "gennames parse error: %s\n", u_errorName(errorCode));
- exit(errorCode);
- }
- if(cpNameAliasesIndex0 && words[wordCount-1].weight<1) {
@@ -826,11 +516,13 @@ compress() {
for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
if(tokens[i]!=-1) {
tokens[i]=wordNumber;
+#ifdef DEBUG_NAMES
if(beVerbose) {
printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
(int)i, (long)words[wordNumber].weight,
words[wordNumber].length, words[wordNumber].s);
}
+#endif
++wordNumber;
}
}
@@ -880,11 +572,13 @@ compress() {
/* set token 0 to word 0 */
tokens[0]=0;
+#ifdef DEBUG_NAMES
if(beVerbose) {
printf("tokens[0x000]: word%8ld \"%.*s\"\n",
(long)words[0].weight,
words[0].length, words[0].s);
}
+#endif
wordNumber=1;
/* set the lead byte tokens */
@@ -897,11 +591,13 @@ compress() {
/* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
if(tokens[i]!=-1) {
tokens[i]=wordNumber;
+#ifdef DEBUG_NAMES
if(beVerbose) {
printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
(int)i, (long)words[wordNumber].weight,
words[wordNumber].length, words[wordNumber].s);
}
+#endif
++wordNumber;
}
}
@@ -912,11 +608,13 @@ compress() {
tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
} else {
tokens[i]=wordNumber;
+#ifdef DEBUG_NAMES
if(beVerbose) {
printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
(int)i, (long)words[wordNumber].weight,
words[wordNumber].length, words[wordNumber].s);
}
+#endif
++wordNumber;
}
}
@@ -1051,27 +749,42 @@ compareWords(const void *context, const void *word1, const void *word2) {
return ((Word *)word2)->weight-((Word *)word1)->weight;
}
+void
+NamesPropsBuilder::build(UErrorCode &errorCode) {
+    // Report the collected name statistics (unless quiet), then compress; no-op after a failure.
+    if(U_FAILURE(errorCode)) { return; }
+    if(!beQuiet) {
+        const unsigned long namesSize=(unsigned long)lineTop;
+        const unsigned long namedChars=(unsigned long)lineCount;
+        const unsigned long dictWords=(unsigned long)wordCount;
+        puts("* unames.icu stats *");
+        printf("size of all names in the database: %lu\n", namesSize);
+        printf("number of named Unicode characters: %lu\n", namedChars);
+        printf("number of words in the dictionary from these names: %lu\n", dictWords);
+    }
+    compress(errorCode);  // the numbers printed above are taken before compression runs
+}
+
/* generate output data ----------------------------------------------------- */
-static void
-generateData(const char *dataDir, Options *storeOptions) {
- UNewDataMemory *pData;
- UErrorCode errorCode=U_ZERO_ERROR;
+void
+NamesPropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode) {
+ if(U_FAILURE(errorCode)) { return; }
+
+ UNewDataMemory *pData=udata_create(path, "icu", "unames", &dataInfo,
+ withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
+ if(U_FAILURE(errorCode)) {
+ fprintf(stderr, "genprops: udata_create(%s, unames.icu) failed - %s\n",
+ path, u_errorName(errorCode));
+ return;
+ }
+
uint16_t groupWords[3];
- uint32_t i, groupTop=lineTop, offset, size,
+ uint32_t i, groupTop=lineTop, size,
tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
long dataLength;
int16_t token;
- pData=udata_create(dataDir,
- DATA_TYPE, storeOptions->storeNames ? DATA_NAME : ISO_DATA_NAME,
- &dataInfo,
- haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
- if(U_FAILURE(errorCode)) {
- fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
- exit(errorCode);
- }
-
/* first, see how much space we need, and prepare the token strings */
for(i=0; i>16);
groupWords[2]=(uint16_t)(offset);
udata_writeBlock(pData, groupWords, 6);
@@ -1187,7 +899,8 @@ generateData(const char *dataDir, Options *storeOptions) {
/* 4-align the algorithmic names data */
udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom)));
- generateAlgorithmicData(pData, storeOptions);
+ udata_write32(pData, countAlgRanges);
+ udata_writeBlock(pData, algRanges.data(), algRanges.length());
/* finish up */
dataLength=udata_finish(pData, &errorCode);
@@ -1203,205 +916,6 @@ dataLength, (unsigned long)size);
}
}
-/* the structure for algorithmic names needs to be 4-aligned */
-typedef struct AlgorithmicRange {
- uint32_t rangeStart, rangeEnd;
- uint8_t algorithmType, algorithmVariant;
- uint16_t rangeSize;
-} AlgorithmicRange;
-
-static uint32_t
-generateAlgorithmicData(UNewDataMemory *pData, Options *storeOptions) {
- static char prefix[] = "CJK UNIFIED IDEOGRAPH-";
-# define PREFIX_LENGTH 23
-# define PREFIX_LENGTH_4 24
- uint32_t countAlgRanges;
-
- static AlgorithmicRange cjkExtA={
- 0x3400, 0x4db5,
- 0, 4,
- sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
- };
- static AlgorithmicRange cjk={
- 0x4e00, 0x9fa5,
- 0, 4,
- sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
- };
- static AlgorithmicRange cjkExtB={
- 0x20000, 0x2a6d6,
- 0, 5,
- sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
- };
- static AlgorithmicRange cjkExtC={
- 0x2a700, 0x2b734,
- 0, 5,
- sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
- };
- static AlgorithmicRange cjkExtD={
- 0x2b740, 0x2b81d,
- 0, 5,
- sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
- };
-
- static char jamo[]=
- "HANGUL SYLLABLE \0"
-
- "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
- "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"
-
- "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
- "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
- "YU\0EU\0YI\0I\0"
-
- "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
- "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
- "S\0SS\0NG\0J\0C\0K\0T\0P\0H"
- ;
-
- static AlgorithmicRange hangul={
- 0xac00, 0xd7a3,
- 1, 3,
- sizeof(AlgorithmicRange)+6+sizeof(jamo)
- };
-
- /* modulo factors, maximum 8 */
- /* 3 factors: 19, 21, 28, most-to-least-significant */
- static uint16_t hangulFactors[3]={
- 19, 21, 28
- };
-
- uint32_t size;
-
- size=0;
-
- if(ucdVersion>=UNI_6_1) {
- /* Unicode 6.1 and up has a longer CJK Unihan range than before */
- cjk.rangeEnd=0x9FCC;
- } else if(ucdVersion>=UNI_5_2) {
- /* Unicode 5.2 and up has a longer CJK Unihan range than before */
- cjk.rangeEnd=0x9FCB;
- } else if(ucdVersion>=UNI_5_1) {
- /* Unicode 5.1 and up has a longer CJK Unihan range than before */
- cjk.rangeEnd=0x9FC3;
- } else if(ucdVersion>=UNI_4_1) {
- /* Unicode 4.1 and up has a longer CJK Unihan range than before */
- cjk.rangeEnd=0x9FBB;
- }
-
- /* number of ranges of algorithmic names */
- if(!storeOptions->storeNames) {
- countAlgRanges=0;
- } else if(ucdVersion>=UNI_6_0) {
- /* Unicode 6.0 and up has 6 ranges including CJK Extension D */
- countAlgRanges=6;
- } else if(ucdVersion>=UNI_5_2) {
- /* Unicode 5.2 and up has 5 ranges including CJK Extension C */
- countAlgRanges=5;
- } else if(ucdVersion>=UNI_3_1) {
- /* Unicode 3.1 and up has 4 ranges including CJK Extension B */
- countAlgRanges=4;
- } else if(ucdVersion>=UNI_3_0) {
- /* Unicode 3.0 has 3 ranges including CJK Extension A */
- countAlgRanges=3;
- } else {
- /* Unicode 2.0 has 2 ranges including Hangul and CJK Unihan */
- countAlgRanges=2;
- }
-
- if(pData!=NULL) {
- udata_write32(pData, countAlgRanges);
- } else {
- size+=4;
- }
- if(countAlgRanges==0) {
- return size;
- }
-
- /*
- * each range:
- * uint32_t rangeStart
- * uint32_t rangeEnd
- * uint8_t algorithmType
- * uint8_t algorithmVariant
- * uint16_t size of range data
- * uint8_t[size] data
- */
-
- /* range 0: cjk extension a */
- if(countAlgRanges>=3) {
- if(pData!=NULL) {
- udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange));
- udata_writeString(pData, prefix, PREFIX_LENGTH);
- if(PREFIX_LENGTH=4) {
- if(pData!=NULL) {
- udata_writeBlock(pData, &cjkExtB, sizeof(AlgorithmicRange));
- udata_writeString(pData, prefix, PREFIX_LENGTH);
- if(PREFIX_LENGTH=5) {
- if(pData!=NULL) {
- udata_writeBlock(pData, &cjkExtC, sizeof(AlgorithmicRange));
- udata_writeString(pData, prefix, PREFIX_LENGTH);
- if(PREFIX_LENGTH=6) {
- if(pData!=NULL) {
- udata_writeBlock(pData, &cjkExtD, sizeof(AlgorithmicRange));
- udata_writeString(pData, prefix, PREFIX_LENGTH);
- if(PREFIX_LENGTH