diff --git a/tools/unicode/c/genprops/CMakeLists.txt b/tools/unicode/c/genprops/CMakeLists.txt index eb5f724ddc0..bbf48545a40 100644 --- a/tools/unicode/c/genprops/CMakeLists.txt +++ b/tools/unicode/c/genprops/CMakeLists.txt @@ -5,5 +5,5 @@ # created by: Markus W. Scherer # edited on: 2010jul20 # edited by: Stuart G. Gill -add_executable(genprops genprops.c props2.c store.c) +add_executable(genprops genprops.cpp props2.cpp store.c) target_link_libraries(genprops icuuc icutu) diff --git a/tools/unicode/c/genprops/genprops.c b/tools/unicode/c/genprops/genprops.cpp similarity index 99% rename from tools/unicode/c/genprops/genprops.c rename to tools/unicode/c/genprops/genprops.cpp index 119ed006d06..b72dd6a5b20 100644 --- a/tools/unicode/c/genprops/genprops.c +++ b/tools/unicode/c/genprops/genprops.cpp @@ -1,11 +1,11 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2008, International Business Machines +* Copyright (C) 1999-2010, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* -* file name: genprops.c +* file name: genprops.cpp * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 @@ -40,6 +40,8 @@ U_CDECL_END #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) +U_NAMESPACE_USE + UBool beVerbose=FALSE, haveCopyright=TRUE; /* prototypes --------------------------------------------------------------- */ diff --git a/tools/unicode/c/genprops/genprops.h b/tools/unicode/c/genprops/genprops.h index b50a1037625..79ab229e594 100644 --- a/tools/unicode/c/genprops/genprops.h +++ b/tools/unicode/c/genprops/genprops.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2008, International Business Machines +* Copyright (C) 1999-2010, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -34,13 +34,13 @@ typedef struct { } Props; /* global flags */ -extern UBool beVerbose, haveCopyright; +U_CFUNC UBool beVerbose, haveCopyright; -extern const char *const +U_CFUNC const char *const genCategoryNames[]; -/* properties vectors in props2.c */ -extern UPropsVectors *pv; +/* properties vectors in props2.cpp */ +U_CFUNC UPropsVectors *pv; /* prototypes */ U_CFUNC void @@ -52,28 +52,28 @@ isToken(const char *token, const char *s); U_CFUNC int32_t getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s); -extern void +U_CFUNC void setUnicodeVersion(const char *v); -extern void +U_CFUNC void initStore(void); -extern void +U_CFUNC void exitStore(void); -extern uint32_t +U_CFUNC uint32_t makeProps(Props *p); -extern void +U_CFUNC void addProps(uint32_t c, uint32_t props); -extern uint32_t +U_CFUNC uint32_t getProps(uint32_t c); -extern void +U_CFUNC void repeatProps(uint32_t first, uint32_t last, uint32_t props); -extern void +U_CFUNC void generateData(const char *dataDir, UBool csource); /* props2.c */ diff --git a/tools/unicode/c/genprops/props2.c b/tools/unicode/c/genprops/props2.cpp similarity index 77% rename from tools/unicode/c/genprops/props2.c rename to tools/unicode/c/genprops/props2.cpp index d1201279679..61a90f939c7 100644 --- a/tools/unicode/c/genprops/props2.c +++ b/tools/unicode/c/genprops/props2.cpp @@ -1,11 +1,11 @@ /* ******************************************************************************* * -* Copyright (C) 2002-2009, International Business Machines +* Copyright (C) 2002-2010, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* -* file name: props2.c +* file name: props2.cpp * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 @@ -20,6 +20,7 @@ #include #include "unicode/utypes.h" #include "unicode/uchar.h" +#include "unicode/unistr.h" #include "unicode/uscript.h" #include "cstring.h" #include "cmemory.h" @@ -32,11 +33,15 @@ #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) +U_NAMESPACE_USE + /* data --------------------------------------------------------------------- */ static UNewTrie *newTrie; UPropsVectors *pv; +static UnicodeString *scriptExtensions; + /* miscellaneous ------------------------------------------------------------ */ static char * @@ -45,7 +50,7 @@ trimTerminateField(char *s, char *limit) { s=(char *)u_skipWhitespace(s); /* trim trailing whitespace */ - while(scodes[i] + } else { + codes.append((UChar)value); + break; + } + } + if(c==0 || c==';') { + // the token ended at a terminator + break; + } else { + // the token ended at U_IS_INV_WHITESPACE(c), continue after c + s=limit+1; + } + } + int32_t length=codes.length(); + if(length==0) { + fprintf(stderr, + "genprops: missing values in ScriptExtensions.txt field 1 " + "for U+%04lx..U+%04lx\n", + (long)start, (long)end); + exit(U_INVALID_FORMAT_ERROR); + } + // Set bit 15 on the last script code, for termination. + codes.setCharAt(length-1, (UChar)(codes[length-1]|0x8000)); + // Find this list of codes in the Script_Extensions data so far, or add this list. + int32_t index=scriptExtensions->indexOf(codes); + if(index<0) { + index=scriptExtensions->length(); + scriptExtensions->append(codes); + } + // Modify the Script data for each of the start..end code points + // to include the Script_Extensions index. + do { + uint32_t scriptX=upvec_getValue(pv, (UChar32)start, 0)&UPROPS_SCRIPT_X_MASK; + // Find the next code point that has a different script value. 
+ // We want to add the Script_Extensions index to the code point range start..next-1. + UChar32 next; + for(next=(UChar32)start+1; + next<=(UChar32)end && scriptX==(upvec_getValue(pv, next, 0)&UPROPS_SCRIPT_X_MASK); + ++next) {} + if(scriptX>=UPROPS_SCRIPT_X_WITH_COMMON) { + fprintf(stderr, + "genprops: ScriptExtensions.txt has values for U+%04lx..U+%04lx " + "which overlaps with a range including U+%04lx..U+%04lx\n", + (long)start, (long)end, (long)start, (long)(next-1)); + exit(U_INVALID_FORMAT_ERROR); + } + // Encode the (Script, Script_Extensions index) pair. + if(scriptX==USCRIPT_COMMON) { + scriptX=UPROPS_SCRIPT_X_WITH_COMMON|(uint32_t)index; + } else if(scriptX==USCRIPT_INHERITED) { + scriptX=UPROPS_SCRIPT_X_WITH_INHERITED|(uint32_t)index; + } else { + // Store an additional pair of 16-bit units for an unusual main Script code + // together with the Script_Extensions index. + UnicodeString codeIndexPair; + codeIndexPair.append((UChar)scriptX).append((UChar)index); + index=scriptExtensions->indexOf(codeIndexPair); + if(index<0) { + index=scriptExtensions->length(); + scriptExtensions->append(codeIndexPair); + } + scriptX=UPROPS_SCRIPT_X_WITH_OTHER|(uint32_t)index; + } + if(index>UPROPS_SCRIPT_MASK) { + fprintf(stderr, "genprops: Script_Extensions indexes overflow bit field\n"); + exit(U_BUFFER_OVERFLOW_ERROR); + } + // Write the (Script, Script_Extensions index) pair into + // the properties vector for start..next-1. 
+ upvec_setValue(pv, (UChar32)start, (UChar32)(next-1), + 0, scriptX, UPROPS_SCRIPT_X_MASK, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "genprops error: unable to set Script_Extensions: %s\n", u_errorName(*pErrorCode)); + exit(*pErrorCode); + } + start=next; + } while(start<=end); +} + /* DerivedNumericValues.txt ------------------------------------------------- */ static void U_CALLCONV @@ -719,7 +848,36 @@ writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROP fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode)); exit(errorCode); } - if(p!=NULL) { + + /* round up scriptExtensions to multiple of 4 bytes */ + if(scriptExtensions->length()&1) { + scriptExtensions->append((UChar)0); + } + + /* set indexes */ + indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]= + indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4; + indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS; + indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]= + indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount; + indexes[UPROPS_RESERVED_INDEX_7]= + indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]+scriptExtensions->length()/2; + indexes[UPROPS_RESERVED_INDEX_8]=indexes[UPROPS_RESERVED_INDEX_7]; + indexes[UPROPS_DATA_TOP_INDEX]=indexes[UPROPS_RESERVED_INDEX_8]; + + indexes[UPROPS_MAX_VALUES_INDEX]= + (((int32_t)U_EA_COUNT-1)<getBuffer(), 16, scriptExtensions->length(), + "};\n\n"); } else { - uprv_memcpy(p, pvArray, pvCount*4); + p+=length; + length=pvCount*4; + uprv_memcpy(p, pvArray, length); + + p+=length; + length=scriptExtensions->length()*2; + uprv_memcpy(p, scriptExtensions->getBuffer(), length); } if(beVerbose) { printf("number of additional props vectors: %5u\n", (int)pvRows); printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS); + printf("number of 16-bit scriptExtensions: %5u\n", (int)scriptExtensions->length()); } } - length+=pvCount*4; - return length; + return additionalPropsSize; } diff 
--git a/tools/unicode/c/genprops/store.c b/tools/unicode/c/genprops/store.c index 4499073a956..1874441ac16 100644 --- a/tools/unicode/c/genprops/store.c +++ b/tools/unicode/c/genprops/store.c @@ -41,7 +41,7 @@ the udata API for loading ICU data. Especially, a UDataInfo structure precedes the actual data. It contains platform properties values and the file format version. -The following is a description of format version 6 . +The following is a description of format version 7. Data contents: @@ -74,8 +74,10 @@ Formally, the file contains the following structures: i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors i5 additionalVectorsColumns; -- number of 32-bit words per properties vector - i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table - i7..i9 reservedIndexes; -- reserved values; 0 for now + i6 scriptExtensionsIndex; -- 32-bit unit index to the Script_Extensions data + i7 reservedIndex7; -- 32-bit unit index to the top of the Script_Extensions data + i8 reservedIndex8; -- for now: i7, i8 and i9 have the same values + i9 dataTopIndex; -- size of the data file (number of 32-bit units after the header) i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+) i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2) @@ -92,6 +94,20 @@ Formally, the file contains the following structures: AT serialized trie for additional properties (byte size: 4*(i4-i3)) PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4]; + SCX const uint16_t scriptExtensions[2*(i7-i6)]; + + SCX contains Script_Extensions lists and (Script code, Script_Extensions index) pairs. + A Script_Extensions list is a sequence of UScriptCode values in ascending order, + with the last code having bit 15 set for termination. 
+ A (Script code, Script_Extensions index) pair is the main UScriptCode (Script value) + followed by the index of the Script_Extensions list. + If the propsVectors[] column 0 value indicates that there are Script_Extensions, + then the UPROPS_SCRIPT_MASK bit field is an index to either a list or a pair in SCX, + rather than the Script itself. The high bits in the UPROPS_SCRIPT_X_MASK fields + indicate whether the main Script value is Common or Inherited (and the index is to a list) + vs. another value (and the index is to a pair). + (See UPROPS_SCRIPT_X_WITH_COMMON etc. in uprops.h.) + Trie lookup and properties: In order to condense the data for the 21-bit code space, several properties of @@ -206,6 +222,12 @@ Format version 6 became necessary because Unicode 5.2 adds fractions with denominators 9, 10 and 16, and it was easier to redesign the encoding of numeric types and values rather than add another variant to the previous format. +--- Changes in format version 7 --- + +Unicode 6.0 adds Script_Extensions. For characters with script extensions data, +the script code bits are an index into the new Script_Extensions array rather +than a script code. + ----------------------------------------------------------------------------- */ /* UDataInfo cf. 
udata.h */ @@ -227,14 +249,14 @@ static UNewTrie *pTrie=NULL; /* -------------------------------------------------------------------------- */ -extern void +U_CFUNC void setUnicodeVersion(const char *v) { UVersionInfo version; u_versionFromString(version, v); uprv_memcpy(dataInfo.dataVersion, version, 4); } -extern void +U_CFUNC void initStore() { pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE); if(pTrie==NULL) { @@ -245,7 +267,7 @@ initStore() { initAdditionalProperties(); } -extern void +U_CFUNC void exitStore() { utrie_close(pTrie); exitAdditionalProperties(); @@ -253,7 +275,7 @@ exitStore() { /* store a character's properties ------------------------------------------- */ -extern uint32_t +U_CFUNC uint32_t makeProps(Props *p) { uint32_t den; int32_t type, value, exp, ntv; @@ -327,7 +349,7 @@ makeProps(Props *p) { (ntv<