diff --git a/.gitignore b/.gitignore index a97abb29039..0f7d4a8bdb5 100644 --- a/.gitignore +++ b/.gitignore @@ -212,6 +212,14 @@ icu4c/source/tools/gennames/Makefile icu4c/source/tools/gennames/Release icu4c/source/tools/gennames/gennames icu4c/source/tools/gennames/tmp +icu4c/source/tools/gennorm/*.d +icu4c/source/tools/gennorm/*.pdb +icu4c/source/tools/gennorm/*.plg +icu4c/source/tools/gennorm/Debug +icu4c/source/tools/gennorm/Makefile +icu4c/source/tools/gennorm/Release +icu4c/source/tools/gennorm/gennorm +icu4c/source/tools/gennorm/tmp icu4c/source/tools/genprops/*.d icu4c/source/tools/genprops/*.pdb icu4c/source/tools/genprops/Debug diff --git a/icu4c/source/allinone/allinone.dsw b/icu4c/source/allinone/allinone.dsw index f2ad6711170..e5ec3ff740c 100644 --- a/icu4c/source/allinone/allinone.dsw +++ b/icu4c/source/allinone/allinone.dsw @@ -74,6 +74,18 @@ Package=<4> Begin Project Dependency Project_Dep_Name decmn End Project Dependency + Begin Project Dependency + Project_Dep_Name genfchk + End Project Dependency + Begin Project Dependency + Project_Dep_Name gennorm + End Project Dependency + Begin Project Dependency + Project_Dep_Name genqchk + End Project Dependency + Begin Project Dependency + Project_Dep_Name genuca + End Project Dependency }}} ############################################################################### @@ -96,16 +108,10 @@ Package=<4> Project_Dep_Name i18n End Project Dependency Begin Project Dependency - Project_Dep_Name makeconv - End Project Dependency - Begin Project Dependency Project_Dep_Name gencol End Project Dependency Begin Project Dependency - Project_Dep_Name genrb - End Project Dependency - Begin Project Dependency - Project_Dep_Name gentest + Project_Dep_Name toolutil End Project Dependency }}} @@ -270,6 +276,21 @@ Package=<4> ############################################################################### +Project: "gennorm"=..\tools\gennorm\gennorm.dsp - Package Owner=<4> + +Package=<5> +{{{ +}}} + +Package=<4> +{{{ + 
Begin Project Dependency + Project_Dep_Name common + End Project Dependency +}}} + +############################################################################### + Project: "genprops"=..\tools\genprops\genprops.dsp - Package Owner=<4> Package=<5> @@ -432,30 +453,9 @@ Package=<4> Project_Dep_Name i18n End Project Dependency Begin Project Dependency - Project_Dep_Name makeconv - End Project Dependency - Begin Project Dependency Project_Dep_Name gencol End Project Dependency Begin Project Dependency - Project_Dep_Name genrb - End Project Dependency - Begin Project Dependency - Project_Dep_Name genccode - End Project Dependency - Begin Project Dependency - Project_Dep_Name gencmn - End Project Dependency - Begin Project Dependency - Project_Dep_Name gencnval - End Project Dependency - Begin Project Dependency - Project_Dep_Name gennames - End Project Dependency - Begin Project Dependency - Project_Dep_Name gentz - End Project Dependency - Begin Project Dependency Project_Dep_Name toolutil End Project Dependency }}} @@ -548,6 +548,15 @@ Package=<4> Begin Project Dependency Project_Dep_Name genqchk End Project Dependency + Begin Project Dependency + Project_Dep_Name common + End Project Dependency + Begin Project Dependency + Project_Dep_Name gennorm + End Project Dependency + Begin Project Dependency + Project_Dep_Name i18n + End Project Dependency }}} ############################################################################### diff --git a/icu4c/source/common/common.dsp b/icu4c/source/common/common.dsp index 1c8b4af8025..fa9d12ff5b5 100644 --- a/icu4c/source/common/common.dsp +++ b/icu4c/source/common/common.dsp @@ -1268,6 +1268,10 @@ InputPath=.\unicode\unorm.h # End Source File # Begin Source File +SOURCE=.\unormimp.h +# End Source File +# Begin Source File + SOURCE=.\unicode\urep.h !IF "$(CFG)" == "common - Win32 Release" diff --git a/icu4c/source/common/normlzr.cpp b/icu4c/source/common/normlzr.cpp index d61176a9bf2..843b3bc090d 100644 --- 
a/icu4c/source/common/normlzr.cpp +++ b/icu4c/source/common/normlzr.cpp @@ -29,6 +29,11 @@ #include "unicode/unicode.h" #include "mutex.h" +/* ### TODO: new implementation */ +#include "unormimp.h" + + + #define ARRAY_LENGTH(array) (sizeof (array) / sizeof (*array)) /** @@ -666,6 +671,25 @@ Normalizer::decompose(const UnicodeString& source, UnicodeString& result, UErrorCode &status) { + /* ### TODO: begin new implementation */ + if(unorm_usesNewImplementation()) { + if(source.isBogus()) { + result.setToBogus(); + } else { + /* make sure that we do not operate on the same buffer in source and result */ + result.cloneArrayIfNeeded(-1, source.length()+20, FALSE); + result.fLength=unorm_decompose(result.fArray, result.fCapacity, + source.fArray, source.fLength, + compat, (options&IGNORE_HANGUL)!=0, + UnicodeString::growBuffer, &result, + &status); + if(U_FAILURE(status)) { + result.setToBogus(); + } + } + return; + } + /* ### end new implementation */ if (U_FAILURE(status)) { return; } diff --git a/icu4c/source/common/unorm.cpp b/icu4c/source/common/unorm.cpp index 64641e35fa4..46fb719ccd6 100644 --- a/icu4c/source/common/unorm.cpp +++ b/icu4c/source/common/unorm.cpp @@ -18,6 +18,7 @@ * mode NFC. 
*/ +#include "unicode/utypes.h" #include "unicode/unorm.h" #include "unicode/normlzr.h" #include "unicode/ustring.h" @@ -25,11 +26,949 @@ #include "cpputils.h" #include "ustr_imp.h" #include "umutex.h" +#include "unormimp.h" -/* added by synwee */ +/* added by synwee ### TODO: remove once the new implementation is finished */ #include "unicode/uchar.h" #include "unicode/utf16.h" +/* ### TODO: remove this once the new implementation is finished */ +static UBool useNewImplementation=FALSE; + +U_CAPI void U_EXPORT2 +unorm_setNewImplementation(UBool useNew) { + useNewImplementation=useNew; +} + +U_CAPI UBool U_EXPORT2 +unorm_usesNewImplementation() { + return useNewImplementation; +} + +/* new implementation ------------------------------------------------------- */ + +/* Korean Hangul and Jamo constants */ +enum { + JAMO_L_BASE=0x1100, /* "lead" jamo */ + JAMO_V_BASE=0x1161, /* "vowel" jamo */ + JAMO_T_BASE=0x11a7, /* "trail" jamo */ + + HANGUL_BASE=0xac00, + + JAMO_L_COUNT=19, + JAMO_V_COUNT=21, + JAMO_T_COUNT=28 +}; + +/* load unorm.dat ----------------------------------------------------------- */ + +/* for a description of the file format, see icu/source/tools/gennorm/store.c */ +#define DATA_NAME "unorm" +#define DATA_TYPE "dat" + +static UDataMemory *normData=NULL; +static UErrorCode dataErrorCode=U_ZERO_ERROR; +static int8_t haveNormData=0; + +/* + * pointers into the memory-mapped unorm.dat + */ +static const uint16_t *indexes=NULL, + *normTrieIndex=NULL, *extraData=NULL, + *combiningTable=NULL, + *fcdTrieIndex=NULL; + +/* + * note that there is no uint32_t *normTrieData: + * the indexes in the trie are adjusted so that they point to the data based on + * (uint32_t *)normTrieIndex - this saves one variable at runtime + */ +#define normTrieData ((uint32_t *)normTrieIndex) + +/* similarly for the FCD trie index and data - but both are uint16_t * */ + +/* the Unicode version of the normalization data */ +static UVersionInfo dataVersion={ 3, 1, 0, 0 }; + +static 
UBool U_CALLCONV
+isAcceptable(void *context,
+             const char *type, const char *name,
+             const UDataInfo *pInfo) {
+    /*
+     * Callback handed to udata_openChoice() by loadNormData().
+     * Accepts a data file only if its header matches this build:
+     * same endianness and charset family, dataFormat "Norm",
+     * formatVersion[0]==1 and formatVersion[3]==_NORM_TRIE_SHIFT.
+     * context, type and name are not examined.
+     * Side effect: on acceptance the file's dataVersion is copied into
+     * the file-static dataVersion.
+     */
+    if(
+        pInfo->size>=20 &&
+        pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
+        pInfo->charsetFamily==U_CHARSET_FAMILY &&
+        pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Norm" */
+        pInfo->dataFormat[1]==0x6f &&
+        pInfo->dataFormat[2]==0x72 &&
+        pInfo->dataFormat[3]==0x6d &&
+        pInfo->formatVersion[0]==1 &&
+        pInfo->formatVersion[3]==_NORM_TRIE_SHIFT
+    ) {
+        uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
+        return TRUE;
+    } else {
+        return FALSE;
+    }
+}
+
+/*
+ * Lazily load unorm.dat and set up the file-static pointers into it.
+ *
+ * @param errorCode in/out ICU error code; nothing is done if it already
+ *                  indicates a failure
+ * @return the new value of haveNormData:
+ *         1 if the data is available,
+ *         -1 if opening the data failed (the failure is cached in dataErrorCode),
+ *         0 if errorCode indicated a failure on entry
+ */
+static int8_t
+loadNormData(UErrorCode &errorCode) {
+    /* load Unicode normalization data from file */
+    if(haveNormData==0) {
+        UDataMemory *data;
+        const uint16_t *p=NULL;
+
+        /*
+         * errorCode is a C++ reference: its address cannot be NULL in a
+         * well-formed program, so only the failure state is tested
+         */
+        if(U_FAILURE(errorCode)) {
+            return 0;
+        }
+
+        /* open the data outside the mutex block */
+        data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
+        dataErrorCode=errorCode;
+        if(U_FAILURE(errorCode)) {
+            return haveNormData=-1;
+        }
+
+        p=(const uint16_t *)udata_getMemory(data);
+
+        /* in the mutex block, set the data for this process */
+        umtx_lock(NULL);
+        if(normData==NULL) {
+            normData=data;
+            data=NULL;
+            indexes=p;
+            p=NULL;
+
+            /*
+             * initialize the derived pointers while still holding the mutex,
+             * so that only the thread that installed normData writes them
+             */
+            normTrieIndex=indexes+indexes[_NORM_INDEX_COUNT];
+            extraData=normTrieIndex+indexes[_NORM_INDEX_TRIE_INDEX_COUNT]+2*indexes[_NORM_INDEX_TRIE_DATA_COUNT];
+            combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT];
+            fcdTrieIndex=combiningTable+indexes[_NORM_INDEX_COMBINE_DATA_COUNT];
+        }
+        /* set the flag last, after all of the pointers above are in place */
+        haveNormData=1;
+        umtx_unlock(NULL);
+
+        /* if a different thread set it first, then close the extra data */
+        if(data!=NULL) {
+            udata_close(data); /* NULL if it was set correctly */
+        }
+    }
+
+    return haveNormData;
+}
+
+/*
+ * Fast check for whether the normalization data is available;
+ * triggers loadNormData() on first use.
+ *
+ * @param errorCode in/out ICU error code; if it indicates a failure on entry,
+ *                  it is left untouched and FALSE is returned;
+ *                  otherwise it is set to the cached result of the load
+ * @return TRUE if the data is loaded and usable
+ */
+inline UBool
+_haveData(UErrorCode &errorCode) {
+    if(U_FAILURE(errorCode)) {
+        /* do not overwrite the caller's failure code with the cached dataErrorCode */
+        return FALSE;
+    } else if(haveNormData!=0) {
+        errorCode=dataErrorCode;
+        return (UBool)(haveNormData>0);
+    } else {
+        return (UBool)(loadNormData(errorCode)>0);
+    }
+}
+
+U_CAPI UBool U_EXPORT2
+unorm_haveData(UErrorCode *pErrorCode) { + return _haveData(*pErrorCode); +} + +/* data access primitives --------------------------------------------------- */ + +inline uint32_t +_getNorm32(UChar c) { + return + normTrieData[ + normTrieIndex[ + c>>_NORM_TRIE_SHIFT + ]+ + (c&_NORM_STAGE_2_MASK) + ]; +} + +inline uint32_t +_getNorm32FromSurrogatePair(uint32_t norm32, UChar c2) { + /* the surrogate index in norm32 is an offset over the BMP top of stage 1 */ + uint32_t c= + ((norm32>>(_NORM_EXTRA_SHIFT-10))&0xffc00)| + (c2&0x3ff); + return + normTrieData[ + normTrieIndex[ + _NORM_STAGE_1_BMP_COUNT+ + (c>>_NORM_TRIE_SHIFT) + ]+ + (c&_NORM_STAGE_2_MASK) + ]; +} + +inline uint16_t +_getFCD16(UChar c) { + return + fcdTrieIndex[ + fcdTrieIndex[ + c>>_NORM_TRIE_SHIFT + ]+ + (c&_NORM_STAGE_2_MASK) + ]; +} + +inline uint16_t +_getFCD16FromSurrogatePair(uint16_t fcd16, UChar c2) { + /* the surrogate index in fcd16 is an absolute offset over the start of stage 1 */ + uint32_t c= + ((uint32_t)fcd16<<10)| + (c2&0x3ff); + return + fcdTrieIndex[ + fcdTrieIndex[ + c>>_NORM_TRIE_SHIFT + ]+ + (c&_NORM_STAGE_2_MASK) + ]; +} + +inline const uint16_t * +_getExtraData(uint32_t norm32) { + return extraData+(norm32>>_NORM_EXTRA_SHIFT); +} + +/* + * get the combining class of (c, c2)=*p++ + * before: p>_NORM_CC_SHIFT); + } +} + +/* + * get the combining class of (c, c2)=*--p + * before: start

>_NORM_CC_SHIFT); + } else if(UTF_IS_SURROGATE_FIRST(c)) { + /* unpaired first surrogate */ + return 0; + } else if(p!=start && (c2=*(p-1), UTF_IS_FIRST_SURROGATE(c2))) { + --p; + norm32=_getNorm32(c2); + if((norm32&_NORM_CC_MASK)==0) { + /* all surrogate pairs with this lead surrogate have cc==0 */ + return 0; + } else { + /* norm32 must be a surrogate special */ + return (uint8_t)(_getNorm32FromSurrogatePair(norm32, c)>>_NORM_CC_SHIFT); + } + } else { + /* unpaired second surrogate */ + return 0; + } +} + +/* reorder UTF-16 in-place -------------------------------------------------- */ + +/* + * merge two parts of a UTF-16 string in-place + * to canonically order (order by combining classes) their concatenation + * + * before: [start..p[ is already ordered, and + * [p..limit[ is ordered in itself, but + * not in relation to [start..p[ + * after: [start..limit[ is ordered + * + * the algorithm is a simple bubble-sort that takes the characters from *p++ + * and inserts them in correct combining class order into the preceding part + * of the string + * + * returns the trailing combining class + */ +static uint8_t +_mergeOrdered(const UChar *start, UChar *p, const UChar *limit) { + const UChar *pBack, *pPreBack; + UChar *pSplit, *q; + UChar c, c2; + uint8_t cc, prevCC, trailCC=0; + + if(start==p) { + /* nothing to do */ + if(start!=limit) { + return _getPrevCC(start, limit); + } else { + return 0; + } + } + + while(p=prevCC */ + pPreBack=pBack=pSplit; + prevCC=_getPrevCC(start, pPreBack); + if(cc>=prevCC) { + /* does not bubble back */ + trailCC=cc; + break; + } else { + /* this will be the last code point, so keep its cc */ + trailCC=prevCC; + pBack=pPreBack; + while(start=prevCC) { + break; + } + pBack=pPreBack; + } + + /* + * this is where we are right now with all these pointers: + * [start..pPreBack[ 0..? 
code points that we can ignore + * [pPreBack..pBack[ 0..1 code points with prevCC<=cc + * [pBack..pSplit[ 0..n code points with >cc, move up to insert (c, c2) + * [pSplit..p[ 1 code point (c, c2) with cc + * [p..limit[ 0..? code points yet to be bubbled in + */ + + /* move the code units in between up */ + q=p; + do { + *--q=*--pSplit; + } while(pBack!=pSplit); + + /* insert (c, c2) */ + *pSplit=c; + if(c2!=0) { + *(pSplit+1)=c2; + } + + /* we know that the new part is ordered in itself, so we can move start up */ + start=q; /* set it to after where (c, c2) were inserted */ + } + } + } + + if(p==limit) { + /* we know the cc of the last code point */ + return trailCC; + } else { + return _getPrevCC(start, limit); + } +} + +/* + * simpler, more efficient version of _mergeOrdered() - + * inserts only one code point into the preceding string + * assume that (c, c2) has not yet inserted at [pSplit..p[ + */ +static uint8_t +_insertOrdered(const UChar *start, UChar *pSplit, UChar *p, + UChar c, UChar c2, uint8_t cc) { + const UChar *pBack, *pPreBack; + UChar *q; + uint8_t prevCC, trailCC=cc; + + if(start=prevCC */ + pPreBack=pBack=pSplit; + prevCC=_getPrevCC(start, pPreBack); + if(cc=prevCC) { + break; + } + pBack=pPreBack; + } + + /* + * this is where we are right now with all these pointers: + * [start..pPreBack[ 0..? 
code points that we can ignore + * [pPreBack..pBack[ 0..1 code points with prevCC<=cc + * [pBack..pSplit[ 0..n code points with >cc, move up to insert (c, c2) + * [pSplit..p[ 1 code point (c, c2) with cc + */ + + /* move the code units in between up */ + q=p; + do { + *--q=*--pSplit; + } while(pBack!=pSplit); + } + } + + /* insert (c, c2) */ + *pSplit=c; + if(c2!=0) { + *(pSplit+1)=c2; + } + + /* we know the cc of the last code point */ + return trailCC; +} + +/* quick check functions ---------------------------------------------------- */ + +static UBool +unorm_checkFCD(const UChar *src, + int32_t srcLength, + UErrorCode *pErrorCode) { + const UChar *limit; + UChar c, c2; + uint16_t fcd16; + int16_t prevCC, cc; + + /* check arguments */ + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return FALSE; + } + + if(src==NULL || srcLength<-1) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return FALSE; + } + + if(!_haveData(*pErrorCode)) { + return FALSE; + } + + /* initialize */ + prevCC=0; + + if(srcLength>=0) { + /* string with length */ + limit=src+srcLength; + } else /* srcLength==-1 */ { + /* zero-terminated string */ + limit=NULL; + } + + U_ALIGN_CODE(16); + + for(;;) { + /* skip a run of code units below the minimum or with irrelevant data for the FCD check */ + if(limit==NULL) { + for(;;) { + c=*src++; + if(c<_NORM_MIN_WITH_LEAD_CC) { + if(c==0) { + return TRUE; + } + prevCC=-(int16_t)c; + } else if((fcd16=_getFCD16(c))==0) { + prevCC=0; + } else { + break; + } + } + } else { + for(;;) { + if(src==limit) { + return TRUE; + } else if((c=*src++)<_NORM_MIN_WITH_LEAD_CC) { + prevCC=-(int16_t)c; + } else if((fcd16=_getFCD16(c))==0) { + prevCC=0; + } else { + break; + } + } + } + + /* check one above-minimum, relevant code unit */ + if(UTF_IS_FIRST_SURROGATE(c)) { + /* c is a lead surrogate, get the real fcd16 */ + if((limit==NULL || src!=limit) && (c2=*src, UTF_IS_SECOND_SURROGATE(c2))) { + ++src; + fcd16=_getFCD16FromSurrogatePair(fcd16, c2); + } else { + fcd16=0; 
+ } + } + + /* + * prevCC has values from the following ranges: + * 0..0xff - the previous trail combining class + * <0 - the negative value of the previous code unit; + * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16() + * was deferred so that average text is checked faster + */ + + /* check the combining order */ + cc=(int16_t)(fcd16>>8); + if(cc!=0) { + if(prevCC<0) { + /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */ + prevCC=(int16_t)_getFCD16((UChar)-prevCC)&0xff; + } + + if(cc=0) { + /* string with length */ + limit=src+srcLength; + } else /* srcLength==-1 */ { + /* zero-terminated string */ + limit=NULL; + } + + U_ALIGN_CODE(16); + + for(;;) { + /* skip a run of code units below the minimum or with irrelevant data for the quick check */ + if(limit==NULL) { + for(;;) { + c=*src++; + if(c=minNoMaybe && ((norm32=_getNorm32(c))&ccOrQCMask)!=0) { + break; + } + prevCC=0; + } + } + + /* check one above-minimum, relevant code unit */ + if(_NORM_MIN_SPECIAL<=norm32 && norm32<_NORM_SURROGATES_TOP) { + /* c is a lead surrogate, get the real norm32 */ + if((limit==NULL || src!=limit) && (c2=*src, UTF_IS_SECOND_SURROGATE(c2))) { + ++src; + norm32=_getNorm32FromSurrogatePair(norm32, c2); + } else { + norm32=0; + } + } + + /* check the combining order */ + cc=(uint8_t)(norm32>>_NORM_CC_SHIFT); + if(cc!=0 && cc=0) { + /* string with length */ + limit=src+srcLength; + } else /* srcLength==-1 */ { + /* zero-terminated string */ + limit=NULL; + } + + U_ALIGN_CODE(16); + + for(;;) { + /* count code units below the minimum or with irrelevant data for the quick check */ + prevSrc=src; + if(limit==NULL) { + while((c=*src)=_NORM_MIN_HANGUL) { + if(ignoreHangul) { + c2=0; + p=NULL; + length=1; + } else { + /* Hangul syllable: decompose algorithmically */ + p=buffer; + cc=trailCC=0; + + c-=HANGUL_BASE; + + c2=(UChar)(c%JAMO_T_COUNT); + c/=JAMO_T_COUNT; + if(c2>0) { + buffer[2]=(UChar)(JAMO_T_BASE+c2); + length=3; + } else { + 
length=2; + } + + buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); + buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); + } + } else { + if(norm32<_NORM_MIN_SPECIAL) { + c2=0; + length=1; + } else { + /* c is a lead surrogate, get the real norm32 */ + if((limit==NULL || src!=limit) && (c2=*src, UTF_IS_SECOND_SURROGATE(c2))) { + ++src; + length=2; + norm32=_getNorm32FromSurrogatePair(norm32, c2); + } else { + c2=0; + length=1; + norm32=0; + } + } + + /* get the decomposition and the lead and trail cc's */ + if((norm32&qcMask)==0) { + /* c does not decompose */ + cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT); + p=NULL; + } else { + /* c decomposes, get everything from the variable-length extra data */ + p=(const UChar *)_getExtraData(norm32); + length=*p++; + + if((norm32&qcMask&_NORM_QC_NFKD)!=0 && length>=0x100) { + /* use compatibility decomposition, skip canonical data */ + p+=((length>>7)&1)+(length&0x7f); + length>>=8; + } + + if(length&0x80) { + /* get the lead and trail cc's */ + UChar bothCCs=*p++; + cc=(uint8_t)(bothCCs>>8); + trailCC=(uint8_t)bothCCs; + } else { + /* lead and trail cc's are both 0 */ + cc=trailCC=0; + } + + length&=0x7f; + if(length==1) { + /* fastpath a single code unit from decomposition */ + c=*p; + c2=0; + p=NULL; + } + } + } + + /* append the decomposition to the destination buffer, assume length>0 */ + if( (destIndex+length)<=destCapacity || + /* attempt to grow the buffer */ + (canGrow && (canGrow=growBuffer(context, &dest, &destCapacity, + limit==NULL ? + 2*(destCapacity)+length+20 : + destCapacity+length+2*(limit-src)+20, + destIndex))) + ) { + UChar *reorderSplit=dest+destIndex; + if(p==NULL) { + /* fastpath: single code point */ + if(cc!=0 && cc0); + + if(cc!=0 && ccdestCapacity) { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } +#else + /* ### TODO: this looks slightly to much more reasonable but fails some tests, esp. 
/tscoll/cmsccoll/TestIncrementalNormalize */ + if(limit==NULL) { + /* assume that we must NUL-terminate */ + if(destIndexdestCapacity) { + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } + } +#endif + + return destIndex; +} + +/* make NFC & NFKC ---------------------------------------------------------- */ + +U_CFUNC int32_t +unorm_compose(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UBool compat, UBool ignoreHangul, + GrowBuffer *growBuffer, void *context, + UErrorCode *pErrorCode) { + /* ### TODO: for now, this is just basically the same as the old unorm_normalize() */ + if(U_FAILURE(*pErrorCode)) return -1; + + /* synwee : removed hard coded conversion */ + Normalizer::EMode normMode = compat ? Normalizer::COMPOSE_COMPAT : Normalizer::COMPOSE; + if (U_FAILURE(*pErrorCode)) { + return -1; + } + + int32_t len = (srcLength == -1 ? u_strlen(src) : srcLength); + const UnicodeString source(srcLength == -1, src, len); + UnicodeString dst(dest, 0, destCapacity); + /* synwee : note quickcheck is added in C ++ normalize method */ + Normalizer::normalize(source, normMode, ignoreHangul ? 
Normalizer::IGNORE_HANGUL : 0, dst, *pErrorCode); + return uprv_fillOutputString(dst, dest, destCapacity, pErrorCode); +} + + + + + + + + + + + + + + + + + + + + +/* old implementation ------------------------------------------------------- */ + /* added by synwee for trie manipulation*/ #define STAGE_1_SHIFT_ 10 #define STAGE_2_SHIFT_ 4 @@ -134,29 +1073,87 @@ static const uint16_t *FCHK_STAGE_2_; static const uint16_t *FCHK_STAGE_3_; U_CAPI int32_t -unorm_normalize(const UChar* source, - int32_t sourceLength, +unorm_normalize(const UChar* src, + int32_t srcLength, UNormalizationMode mode, int32_t option, - UChar* result, - int32_t resultLength, - UErrorCode* status) + UChar* dest, + int32_t destCapacity, + UErrorCode* pErrorCode) { - if(U_FAILURE(*status)) return -1; + if(useNewImplementation) { + UBool ignoreHangul; + + /* check argument values */ + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return 0; + } + + if( destCapacity<0 || (dest==NULL && destCapacity>0) || + src==NULL || srcLength<-1 + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + /* check for overlapping src and destination */ + /* ### TODO: real API may provide a temp buffer */ + if( (src>=dest && src<(dest+destCapacity)) || + (srcLength>0 && dest>=src && dest<(src+srcLength)) + ) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + ignoreHangul= (option&UNORM_IGNORE_HANGUL)!=0; + + switch(mode) { + case UNORM_NFD: + return unorm_decompose(dest, destCapacity, + src, srcLength, + FALSE, ignoreHangul, + NULL, NULL, + pErrorCode); + case UNORM_NFKD: + return unorm_decompose(dest, destCapacity, + src, srcLength, + TRUE, ignoreHangul, + NULL, NULL, + pErrorCode); + case UNORM_NFC: + return unorm_compose(dest, destCapacity, + src, srcLength, + FALSE, ignoreHangul, + NULL, NULL, + pErrorCode); + case UNORM_NFKC: + return unorm_compose(dest, destCapacity, + src, srcLength, + TRUE, ignoreHangul, + NULL, NULL, + pErrorCode); + /* ### TODO: case UNORM_FCD: return unorm_makeFCD(); 
*/ + default: + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + } + + if(U_FAILURE(*pErrorCode)) return -1; /* synwee : removed hard coded conversion */ - Normalizer::EMode normMode = Normalizer::getNormalizerEMode(mode, *status); - if (U_FAILURE(*status)) + Normalizer::EMode normMode = Normalizer::getNormalizerEMode(mode, *pErrorCode); + if (U_FAILURE(*pErrorCode)) return -1; - int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength); - const UnicodeString src(sourceLength == -1, source, len); - UnicodeString dst(result, 0, resultLength); + int32_t len = (srcLength == -1 ? u_strlen(src) : srcLength); + const UnicodeString source(srcLength == -1, src, len); + UnicodeString dst(dest, 0, destCapacity); /* synwee : note quickcheck is added in C ++ normalize method */ if ((option & UNORM_IGNORE_HANGUL) != 0) option = Normalizer::IGNORE_HANGUL; - Normalizer::normalize(src, normMode, option, dst, *status); - return uprv_fillOutputString(dst, result, resultLength, status); + Normalizer::normalize(source, normMode, option, dst, *pErrorCode); + return uprv_fillOutputString(dst, dest, destCapacity, pErrorCode); } static UBool U_CALLCONV @@ -260,6 +1257,10 @@ unorm_quickCheck(const UChar *source, const UChar *psource; const UChar *pend = 0; + if(useNewImplementation) { + return _unorm_quickCheck(source, sourcelength, mode, status); + } + if (!loadQuickCheckData(status) || U_FAILURE(*status)) { return UNORM_MAYBE; } @@ -502,6 +1503,10 @@ U_CAPI const uint16_t * getFCHK_STAGE_3_(UErrorCode *error) U_CAPI UBool checkFCD(const UChar* source, int32_t sourcelength, UErrorCode* status) { + if(useNewImplementation) { + return unorm_checkFCD(source, sourcelength, status); + } + UChar32 codepoint; const UChar *psource; const UChar *pend = 0; diff --git a/icu4c/source/common/unormimp.h b/icu4c/source/common/unormimp.h new file mode 100644 index 00000000000..a50a88d682f --- /dev/null +++ b/icu4c/source/common/unormimp.h @@ -0,0 +1,164 @@ +/* 
+******************************************************************************* +* +* Copyright (C) 2001, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: unormimp.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2001may25 +* created by: Markus W. Scherer +*/ + +#ifndef __UNORMIMP_H__ +#define __UNORMIMP_H__ + +#include "unicode/utypes.h" +#include "ustr_imp.h" + +/* trie constants */ +enum { + /* + * must be <=10: + * above 10, a lead surrogate's block is smaller than a stage 2 block + */ + _NORM_TRIE_SHIFT=5, + + _NORM_STAGE_2_BLOCK_COUNT=1<<_NORM_TRIE_SHIFT, + _NORM_STAGE_2_MASK=_NORM_STAGE_2_BLOCK_COUNT-1, + + _NORM_STAGE_1_BMP_COUNT=(1<<(16-_NORM_TRIE_SHIFT)), + + _NORM_SURROGATE_BLOCK_BITS=10-_NORM_TRIE_SHIFT, + _NORM_SURROGATE_BLOCK_COUNT=(1<<_NORM_SURROGATE_BLOCK_BITS) +}; + +/* this may be >0xffff and may not work as an enum */ +#define _NORM_STAGE_1_MAX_COUNT (0x110000>>_NORM_TRIE_SHIFT) + +/* value constants */ +enum { + /* quick check flags 0..3 set mean "no" for their forms */ + _NORM_QC_NFC=0x11, /* no|maybe */ + _NORM_QC_NFKC=0x22, /* no|maybe */ + _NORM_QC_NFD=4, /* no */ + _NORM_QC_NFKD=8, /* no */ + + _NORM_QC_ANY_NO=0xf, + + /* quick check flags 4..5 mean "maybe" for their forms; test flags>=_NORM_QC_MAYBE */ + _NORM_QC_MAYBE=0x10, + _NORM_QC_ANY_MAYBE=0x30, + + _NORM_COMBINES_FWD=0x40, + _NORM_COMBINES_BACK=0x80, + _NORM_COMBINES_ANY=0xc0, + +#if 0 + _NORM_CC_TYPE_MASK=0xc0, + _NORM_CC_TYPE_NONE=0, /* no cc - lead and trail cc are 0 */ + _NORM_CC_TYPE_SAME=0x40, /* lead and trail cc are same, non-zero, and in value */ + _NORM_CC_TYPE_TRAIL=0x80, /* lead cc=0, trail cc in value */ + _NORM_CC_TYPE_TWO=0xc0, /* 0 != lead cc < trail cc, lead cc in value, trail cc in extra data */ + + _NORM_CC_HAS_LEAD=0x40, /* side effect of the above flags: if and only if bit 6 is 0, then lead cc is 0 
*/ + _NORM_CC_HAS_LEAD_HAS_TRAIL=0x80, /* if(has lead) then one can check for (has trail) instead of (&cc mask==same/two) */ +#endif + + _NORM_CC_SHIFT=8, /* UnicodeData.txt combining class in bits 15..8 */ + _NORM_CC_MASK=0xff00, + + _NORM_EXTRA_SHIFT=16, /* 16 bits for the index to UChars and other extra data */ + _NORM_EXTRA_INDEX_TOP=0xfc00, /* start of surrogate specials after shift */ + + _NORM_EXTRA_SURROGATE_MASK=0x3ff, + _NORM_EXTRA_SURROGATE_TOP=0x3f0, /* hangul etc. */ + + _NORM_EXTRA_HANGUL=_NORM_EXTRA_SURROGATE_TOP, + _NORM_EXTRA_JAMO_1, /* ### not used */ + _NORM_EXTRA_JAMO_2, + _NORM_EXTRA_JAMO_3 +}; + +/* value constants using >16 bits */ +#define _NORM_MIN_SPECIAL 0xfc000000 +#define _NORM_SURROGATES_TOP 0xfff00000 +#define _NORM_MIN_HANGUL 0xfff00000 +#define _NORM_MIN_JAMO2 0xfff20000 +#define _NORM_JAMO2_TOP 0xfff30000 + + +/* index values */ +enum { + _NORM_INDEX_COUNT, + _NORM_INDEX_TRIE_SHIFT, + _NORM_INDEX_TRIE_INDEX_COUNT, + _NORM_INDEX_TRIE_DATA_COUNT, + _NORM_INDEX_UCHAR_COUNT, + + _NORM_INDEX_COMBINE_DATA_COUNT, + _NORM_INDEX_COMBINE_FWD_COUNT, + _NORM_INDEX_COMBINE_BOTH_COUNT, + _NORM_INDEX_COMBINE_BACK_COUNT, + + _NORM_INDEX_MIN_NFC_NO_MAYBE, + _NORM_INDEX_MIN_NFKC_NO_MAYBE, + _NORM_INDEX_MIN_NFD_NO_MAYBE, + _NORM_INDEX_MIN_NFKD_NO_MAYBE, + + _NORM_INDEX_FCD_TRIE_INDEX_COUNT, + _NORM_INDEX_FCD_TRIE_DATA_COUNT, + + _NORM_INDEX_TOP=16 +}; + +enum { + /* FCD check: everything below this code point is known to have a 0 lead combining class */ + _NORM_MIN_WITH_LEAD_CC=0x300 +}; + +/** + * Is the normalizer data loaded? + * This is used internally before other internal normalizer functions + * are called. + * It saves this check in each of many normalization calls that + * are made for, e.g., collation. 
+ * + * @param pErrorCode as usual + * @return boolean value for whether the normalization data is loaded + * + * @internal + */ +U_CAPI UBool U_EXPORT2 +unorm_haveData(UErrorCode *pErrorCode); + +/** + * internal API, used by normlzr.cpp + * @internal + */ +U_CFUNC int32_t +unorm_decompose(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + UBool compat, UBool ignoreHangul, + GrowBuffer *growBuffer, void *context, + UErrorCode *pErrorCode); + +/** + * internal API, but used by tests + * @internal + */ +U_CAPI void U_EXPORT2 +unorm_setNewImplementation(UBool useNew); + +/** + * internal API, but used by tests + * @internal + */ +U_CAPI UBool U_EXPORT2 +unorm_usesNewImplementation(); + +#endif diff --git a/icu4c/source/data/build/Makefile.in b/icu4c/source/data/build/Makefile.in index 5827fe73131..f167d79926b 100644 --- a/icu4c/source/data/build/Makefile.in +++ b/icu4c/source/data/build/Makefile.in @@ -53,7 +53,7 @@ all-local: thaidict.brk build-local ##### Define all the data files. the build rule that depends on them is below. ## DAT files - Misc. data files. -DAT_FILES=qchk.dat fchk.dat uprops.dat unames.dat cnvalias.dat tz.dat ucadata.dat invuca.dat +DAT_FILES=qchk.dat fchk.dat uprops.dat unames.dat unorm.dat cnvalias.dat tz.dat ucadata.dat invuca.dat TEST_DAT_FILES=$(TESTOBJDATADIR)/test.dat ## BRK files @@ -150,6 +150,11 @@ uprops.dat: $(UNICODEDATADIR)/UnicodeData.txt $(UNICODEDATADIR)/Mirror.txt $(TOO @echo Creating uprops.dat @ICU_DATA=. $(INVOKE) $(TOOLDIR)/genprops/genprops -s $(UNICODEDATADIR) -d . -u $(UNICODE_VERSION) +# unorm.dat +unorm.dat: $(UNICODEDATADIR)/UnicodeData.txt $(UNICODEDATADIR)/DerivedNormalizationProperties.txt $(UNICODEDATADIR)/Mirror.txt $(TOOLDIR)/gennorm/gennorm + @echo Creating unorm.dat + @ICU_DATA=. $(INVOKE) $(TOOLDIR)/gennorm/gennorm -s $(UNICODEDATADIR) -d . 
-u $(UNICODE_VERSION) + # ucadata.dat ucadata.dat: $(UNICODEDATADIR)/FractionalUCA.txt $(TOOLDIR)/genuca/genuca @echo Creating ucadata.dat and invuca.dat @@ -205,7 +210,7 @@ endif $(TESTOBJDATADIR)/%.res: $(TESTSRCDATADIR)/%.txt $(TOOLDIR)/genrb/genrb @ICU_DATA=. $(INVOKE) $(TOOLDIR)/genrb/genrb -s $(TESTSRCDATADIR) -d $(TESTOBJDATADIR) $( +#include +#include "unicode/utypes.h" +#include "unicode/uchar.h" +#include "unicode/putil.h" +#include "cmemory.h" +#include "cstring.h" +#include "unicode/udata.h" +#include "unewdata.h" +#include "uoptions.h" +#include "uparse.h" +#include "unormimp.h" + +U_CDECL_BEGIN +#include "gennorm.h" +U_CDECL_END + +UBool beVerbose=FALSE, haveCopyright=TRUE; + +/* prototypes --------------------------------------------------------------- */ + +static void +parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode); + +static void +parseDB(const char *filename, UErrorCode *pErrorCode); + +/* -------------------------------------------------------------------------- */ + +static UOption options[]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_VERBOSE, + UOPTION_COPYRIGHT, + UOPTION_DESTDIR, + UOPTION_SOURCEDIR, + { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 } +}; + +extern int +main(int argc, char* argv[]) { + char filename[300]; + const char *srcDir=NULL, *destDir=NULL, *suffix=NULL; + char *basename=NULL; + UErrorCode errorCode=U_ZERO_ERROR; + + /* preset then read command line options */ + options[4].value=u_getDataDirectory(); + options[5].value=""; + options[6].value="3.0.0"; + argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, + "error in command line argument \"%s\"\n", + argv[-argc]); + } + if(argc<0 || options[0].doesOccur || options[1].doesOccur) { + fprintf(stderr, + "usage: %s [-options] [suffix]\n" + "\tread the UnicodeData.txt file and other Unicode properties files and\n" 
+ "\tcreate a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n" + "\toptions:\n" + "\t\t-h or -? or --help this usage text\n" + "\t\t-v or --verbose verbose output\n" + "\t\t-c or --copyright include a copyright notice\n" + "\t\t-d or --destdir destination directory, followed by the path\n" + "\t\t-s or --sourcedir source directory, followed by the path\n" + "\t\t-u or --unicode Unicode version, followed by the version like 3.0.0\n" + "\t\tsuffix suffix that is to be appended with a '-'\n" + "\t\t to the source file basenames before opening;\n" + "\t\t 'gennorm new' will read UnicodeData-new.txt etc.\n", + argv[0]); + return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + /* get the options values */ + beVerbose=options[2].doesOccur; + haveCopyright=options[3].doesOccur; + srcDir=options[5].value; + destDir=options[4].value; + + if(argc>=2) { + suffix=argv[1]; + } else { + suffix=NULL; + } + + setUnicodeVersion(options[6].value); + + /* prepare the filename beginning with the source dir */ + uprv_strcpy(filename, srcDir); + basename=filename+uprv_strlen(filename); + if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { + *basename++=U_FILE_SEP_CHAR; + } + + /* initialize */ + init(); + + /* process DerivedNormalizationProperties.txt (quick check flags) */ + if(suffix==NULL) { + uprv_strcpy(basename, "DerivedNormalizationProperties.txt"); + } else { + uprv_strcpy(basename, "DerivedNormalizationProperties"); + basename[30]='-'; + uprv_strcpy(basename+31, suffix); + uprv_strcat(basename+31, ".txt"); + } + parseDerivedNormalizationProperties(filename, &errorCode); + + /* process UnicodeData.txt */ + if(suffix==NULL) { + uprv_strcpy(basename, "UnicodeData.txt"); + } else { + uprv_strcpy(basename, "UnicodeData"); + basename[11]='-'; + uprv_strcpy(basename+12, suffix); + uprv_strcat(basename+12, ".txt"); + } + parseDB(filename, &errorCode); + + /* process parsed data */ + if(U_SUCCESS(errorCode)) { + processData(); + + /* write the 
properties data file */ + generateData(destDir); + } + + return errorCode; +} + +/* parsing helpers ---------------------------------------------------------- */ + +static const char * +skipWhitespace(const char *s) { + while(*s==' ' || *s=='\t') { + ++s; + } + return s; +} + +/* + * parse a list of hexadecimal code points, terminated by ';' or the end of the string + * store them as UTF-32 code units in dest[0..destCapacity-1] + * on a syntax error or overflow, set *pErrorCode and return -1 + * otherwise return the number of code points + */ +static int32_t +parseCodePoints(const char *s, + uint32_t *dest, int32_t destCapacity, + UErrorCode *pErrorCode) { + char *end; + uint32_t value; + int32_t count; + + count=0; + for(;;) { + s=skipWhitespace(s); + if(*s==';' || *s==0) { + return count; + } + + /* read one code point */ + value=(uint32_t)uprv_strtoul(s, &end, 16); + if(end<=s || (*end!=' ' && *end!='\t' && *end!=';') || value>=0x110000) { + fprintf(stderr, "gennorm: syntax error parsing code point at %s\n", s); + *pErrorCode=U_PARSE_ERROR; + return -1; + } + + /* overflow? */ + if(count>=destCapacity) { + fprintf(stderr, "gennorm: code point sequence too long at %s\n", s); + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return -1; + } + + /* append it to the destination array */ + dest[count++]=value; + + /* go to the following characters */ + s=end; + } +} + +/* read a range like start or start..end */ +static int32_t +parseCodePointRange(const char *s, + uint32_t *pStart, uint32_t *pEnd, + UErrorCode *pErrorCode) { + char *end; + uint32_t value; + + s=skipWhitespace(s); + if(*s==';' || *s==0) { + fprintf(stderr, "gennorm: syntax error parsing range at %s - empty field\n", s); + *pErrorCode=U_PARSE_ERROR; + return -1; + } + + /* read the start code point */ + value=(uint32_t)uprv_strtoul(s, &end, 16); + if(end<=s || (*end!=' ' && *end!='\t' && *end!='.'
&& *end!=';') || value>=0x110000) { + fprintf(stderr, "gennorm: syntax error parsing range start code point at %s\n", s); + *pErrorCode=U_PARSE_ERROR; + return -1; + } + *pStart=*pEnd=value; + + /* is there a "..end"? */ + s=skipWhitespace(end); + if(*s==';' || *s==0) { + return 1; + } + + if(*s!='.' || s[1]!='.') { + fprintf(stderr, "gennorm: syntax error parsing range at %s\n", s); + *pErrorCode=U_PARSE_ERROR; + return -1; + } + s+=2; + + /* read the end code point */ + value=(uint32_t)uprv_strtoul(s, &end, 16); + if(end<=s || (*end!=' ' && *end!='\t' && *end!=';') || value>=0x110000) { + fprintf(stderr, "gennorm: syntax error parsing range end code point at %s\n", s); + *pErrorCode=U_PARSE_ERROR; + return -1; + } + *pEnd=value; + + /* is this a valid range? */ + if(value<*pStart) { + fprintf(stderr, "gennorm: syntax error parsing range at %s - not a valid range\n", s); + *pErrorCode=U_PARSE_ERROR; + return -1; + } + + /* no garbage after that? */ + s=skipWhitespace(end); + if(*s==';' || *s==0) { + return value-*pStart+1; + } else { + fprintf(stderr, "gennorm: syntax error parsing range at %s\n", s); + *pErrorCode=U_PARSE_ERROR; + return -1; + } +} + +/* parser for DerivedNormalizationProperties.txt ---------------------------- */ + +static void +derivedNormalizationPropertiesLineFn(void *context, + char *fields[][2], int32_t fieldCount, + UErrorCode *pErrorCode) { + char *s; + uint32_t start, end; + int32_t count; + uint8_t qcFlags; + + /* get code point range */ + count=parseCodePointRange(fields[0][0], &start, &end, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]); + exit(*pErrorCode); + } + + /* ignore hangul - handle explicitly */ + if(start==0xac00) { + return; + } + + /* get property - ignore unrecognized ones */ + s=(char *)skipWhitespace(fields[1][0]); + if(*s=='N' && s[1]=='F') { + qcFlags=0x11; + s+=2; + if(*s=='K') { + qcFlags<<=1; + ++s; + } + + 
if(*s=='C' && s[1]=='_') { + s+=2; + } else if(*s=='D' && s[1]=='_') { + qcFlags<<=2; + s+=2; + } else { + return; + } + + if(0==uprv_memcmp(s, "NO", 2)) { + qcFlags&=0xf; + } else if(0==uprv_memcmp(s, "MAYBE", 5)) { + qcFlags&=0x30; + } else { + return; + } + + /* set this flag for all code points in this range */ + while(start<=end) { + setQCFlags(start++, qcFlags); + } + } else if(0==uprv_memcmp(s, "Comp_Ex", 7)) { + while(start<=end) { + setCompositionExclusion(start++); + } + } +} + +static void +parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode) { + char *fields[2][2]; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return; + } + + u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode); +} + +/* parser for UnicodeData.txt ----------------------------------------------- */ + +static void +unicodeDataLineFn(void *context, + char *fields[][2], int32_t fieldCount, + UErrorCode *pErrorCode) { + uint32_t decomp[40]; + Norm norm; + const char *s; + char *end; + uint32_t code, value; + int32_t length; + UBool isCompat, something=FALSE; + + /* ignore First and Last entries for ranges */ + if( *fields[1][0]=='<' && + (length=(fields[1][1]-fields[1][0]))>=9 && + (0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) + ) { + return; + } + + /* reset the properties */ + uprv_memset(&norm, 0, sizeof(Norm)); + + /* get the character code, field 0 */ + code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16); + if(end<=fields[0][0] || end!=fields[0][1]) { + fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + + /* get canonical combining class, field 3 */ + value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10); + if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) { + fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]); + *pErrorCode=U_PARSE_ERROR; + 
exit(U_PARSE_ERROR); + } + if(value>0) { + norm.udataCC=(uint8_t)value; + something=TRUE; + } + + /* get the decomposition, field 5 */ + if(fields[5][0]' */ + fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + } while(*s++!='>'); + } else { + isCompat=FALSE; + } + + /* parse the decomposition string */ + length=parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + exit(*pErrorCode); + } + + /* store the string */ + if(length>0) { + something=TRUE; + if(isCompat) { + norm.lenNFKD=(uint8_t)length; + norm.nfkd=decomp; + } else { + if(length>2) { + fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n", + code, length); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + norm.lenNFD=(uint8_t)length; + norm.nfd=decomp; + } + } + } + + /* check for non-character code points */ + if((code&0xfffe)==0xfffe || (uint32_t)(code-0xfdd0)<0x20 || code>0x10ffff) { + fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n", + code); + *pErrorCode=U_PARSE_ERROR; + exit(U_PARSE_ERROR); + } + + if(something) { + /* there are normalization values, so store them */ + if(beVerbose) { + printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n", + code, norm.udataCC, norm.lenNFD, norm.lenNFKD); + } + storeNorm(code, &norm); + } +} + +static void +parseDB(const char *filename, UErrorCode *pErrorCode) { + char *fields[15][2]; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return; + } + + u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode); +} + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */ diff --git a/icu4c/source/tools/gennorm/gennorm.dsp b/icu4c/source/tools/gennorm/gennorm.dsp new file mode 100644 index 00000000000..578611f26da --- /dev/null +++ 
b/icu4c/source/tools/gennorm/gennorm.dsp @@ -0,0 +1,128 @@ +# Microsoft Developer Studio Project File - Name="gennorm" - Package Owner=<4> +# Microsoft Developer Studio Generated Build File, Format Version 6.00 +# ** DO NOT EDIT ** + +# TARGTYPE "Win32 (x86) Console Application" 0x0103 + +CFG=gennorm - Win32 Debug +!MESSAGE This is not a valid makefile. To build this project using NMAKE, +!MESSAGE use the Export Makefile command and run +!MESSAGE +!MESSAGE NMAKE /f "gennorm.mak". +!MESSAGE +!MESSAGE You can specify a configuration when running NMAKE +!MESSAGE by defining the macro CFG on the command line. For example: +!MESSAGE +!MESSAGE NMAKE /f "gennorm.mak" CFG="gennorm - Win32 Debug" +!MESSAGE +!MESSAGE Possible choices for configuration are: +!MESSAGE +!MESSAGE "gennorm - Win32 Release" (based on "Win32 (x86) Console Application") +!MESSAGE "gennorm - Win32 Debug" (based on "Win32 (x86) Console Application") +!MESSAGE + +# Begin Project +# PROP AllowPerConfigDependencies 0 +# PROP Scc_ProjName "" +# PROP Scc_LocalPath "" +CPP=cl.exe +RSC=rc.exe + +!IF "$(CFG)" == "gennorm - Win32 Release" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 0 +# PROP BASE Output_Dir "Release" +# PROP BASE Intermediate_Dir "Release" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 0 +# PROP Output_Dir "Release" +# PROP Intermediate_Dir "Release" +# PROP Ignore_Export_Lib 0 +# PROP Target_Dir "" +# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c +# ADD CPP /nologo /MD /Za /W3 /GX /O2 /I "..\toolutil" /I "..\..\common" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c +# ADD BASE RSC /l 0x409 /d "NDEBUG" +# ADD RSC /l 0x409 /d "NDEBUG" +BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LINK32=link.exe +# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib 
gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 +# ADD LINK32 icutu.lib /nologo /subsystem:console /machine:I386 /libpath:"..\toolutil\Release" /libpath:"..\..\..\lib\Release" /libpath:"..\..\..\lib" +# Begin Custom Build +InputPath=.\Release\gennorm.exe +InputName=gennorm +SOURCE="$(InputPath)" + +"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" + copy $(InputPath) ..\..\..\bin + +# End Custom Build + +!ELSEIF "$(CFG)" == "gennorm - Win32 Debug" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 1 +# PROP BASE Output_Dir "Debug" +# PROP BASE Intermediate_Dir "Debug" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 1 +# PROP Output_Dir "Debug" +# PROP Intermediate_Dir "Debug" +# PROP Ignore_Export_Lib 0 +# PROP Target_Dir "" +# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c +# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\toolutil" /I "..\..\common" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c +# ADD BASE RSC /l 0x409 /d "_DEBUG" +# ADD RSC /l 0x409 /d "_DEBUG" +BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LINK32=link.exe +# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept +# ADD LINK32 icutud.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /libpath:"..\toolutil\Debug" /libpath:"..\..\..\lib\Debug" /libpath:"..\..\..\lib" +# Begin Custom Build +InputPath=.\Debug\gennorm.exe +InputName=gennorm +SOURCE="$(InputPath)" + +"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)" 
+ copy $(InputPath) ..\..\..\bin + +# End Custom Build + +!ENDIF + +# Begin Target + +# Name "gennorm - Win32 Release" +# Name "gennorm - Win32 Debug" +# Begin Group "Source Files" + +# PROP Default_Filter "c;cpp;rc" +# Begin Source File + +SOURCE=.\gennorm.c +# End Source File +# Begin Source File + +SOURCE=.\store.c +# End Source File +# End Group +# Begin Group "Header Files" + +# PROP Default_Filter "h" +# Begin Source File + +SOURCE=.\gennorm.h +# End Source File +# End Group +# Begin Group "Resource Files" + +# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe" +# End Group +# End Target +# End Project diff --git a/icu4c/source/tools/gennorm/gennorm.h b/icu4c/source/tools/gennorm/gennorm.h new file mode 100644 index 00000000000..e15caaf2d71 --- /dev/null +++ b/icu4c/source/tools/gennorm/gennorm.h @@ -0,0 +1,63 @@ +/* +******************************************************************************* +* +* Copyright (C) 1999-2001, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: genprops.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999dec13 +* created by: Markus W. 
Scherer +*/ + +#ifndef __GENPROPS_H__ +#define __GENPROPS_H__ + +#include "unicode/utypes.h" + +/* file definitions */ +#define DATA_NAME "unorm" +#define DATA_TYPE "dat" + +/* + * data structure that holds the normalization properties for one or more + * code point(s) at build time + */ +typedef struct Norm { + uint8_t udataCC, lenNFD, lenNFKD; + uint8_t qcFlags, combiningFlags; + uint16_t canonBothCCs, compatBothCCs, combiningIndex, specialTag; + uint32_t *nfd, *nfkd; +} Norm; + +/* global flags */ +extern UBool beVerbose, haveCopyright; + +/* prototypes */ +extern void +setUnicodeVersion(const char *v); + +extern void +init(void); + +extern void +storeNorm(uint32_t code, Norm *norm); + +extern void +setQCFlags(uint32_t code, uint8_t qcFlags); + +extern void +setCompositionExclusion(uint32_t code); + +extern void +processData(void); + +extern void +generateData(const char *dataDir); + +#endif + diff --git a/icu4c/source/tools/gennorm/store.c b/icu4c/source/tools/gennorm/store.c new file mode 100644 index 00000000000..2b42998b21f --- /dev/null +++ b/icu4c/source/tools/gennorm/store.c @@ -0,0 +1,1428 @@ +/* +******************************************************************************* +* +* Copyright (C) 1999-2001, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: store.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2001may25 +* created by: Markus W. Scherer +* +* Store Unicode normalization data in a memory-mappable file. +*/ + +#include +#include +#include "unicode/utypes.h" +#include "unicode/uchar.h" +#include "cmemory.h" +#include "cstring.h" +#include "filestrm.h" +#include "unicode/udata.h" +#include "unewdata.h" +#include "unormimp.h" +#include "gennorm.h" + +#define DO_DEBUG_OUT 0 + +/* file data ---------------------------------------------------------------- */ + +/* UDataInfo cf. 
udata.h */ +static UDataInfo dataInfo={ + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + 0, + + { 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */ + {1, 0, 0, _NORM_TRIE_SHIFT}, /* formatVersion - [3] contains the trie shift! */ + {3, 1, 0, 0} /* dataVersion (Unicode version) */ +}; + +extern void +setUnicodeVersion(const char *v) { + UVersionInfo version; + u_versionFromString(version, v); + uprv_memcpy(dataInfo.dataVersion, version, 4); +} + +static uint16_t indexes[_NORM_INDEX_TOP]={ 0 }; + +/* tool memory helper ------------------------------------------------------- */ + +typedef struct UToolMemory { + char name[64]; + uint32_t count, size, index; + uint32_t array[1]; +} UToolMemory; + +static UToolMemory * +utm_open(const char *name, uint32_t count, uint32_t size) { + UToolMemory *mem=(UToolMemory *)uprv_malloc(sizeof(UToolMemory)+count*size); + if(mem==NULL) { + fprintf(stderr, "error: %s - out of memory\n", name); + exit(U_MEMORY_ALLOCATION_ERROR); + } + uprv_strcpy(mem->name, name); + mem->count=count; + mem->size=size; + mem->index=0; + return mem; +} + +static void +utm_close(UToolMemory *mem) { + if(mem!=NULL) { + uprv_free(mem); + } +} + +static void * +utm_getStart(UToolMemory *mem) { + return (char *)mem->array; +} + +static void * +utm_alloc(UToolMemory *mem) { + char *p=(char *)mem->array+mem->index*mem->size; + if(++mem->index<=mem->count) { + uprv_memset(p, 0, mem->size); + return p; + } else { + fprintf(stderr, "error: %s - trying to use more than %ld preallocated units\n", + mem->name, (long)mem->count); /* cast: %ld expects long, count is uint32_t */ + exit(U_MEMORY_ALLOCATION_ERROR); + } +} + +static void * +utm_allocN(UToolMemory *mem, int32_t n) { + char *p=(char *)mem->array+mem->index*mem->size; + if((mem->index+=(uint32_t)n)<=mem->count) { + uprv_memset(p, 0, n*mem->size); + return p; + } else { + fprintf(stderr, "error: %s - trying to use more than %ld preallocated units\n", + mem->name, (long)mem->count); /* cast: %ld expects long, count is uint32_t */ + exit(U_MEMORY_ALLOCATION_ERROR); + } +} + +/*
builder data ------------------------------------------------------------- */ + +typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm); + +static UToolMemory *stage2Mem, *normMem, *utf32Mem, *extraMem, *combiningTriplesMem; + +static uint16_t stage1[_NORM_STAGE_1_MAX_COUNT], fcdStage1[_NORM_STAGE_1_MAX_COUNT]; +static uint16_t *stage2; + +static Norm *norms; + +/* + * set a flag for each code point that was seen in decompositions - + * avoid to decompose ones that have not been used before + */ +static uint32_t haveSeenFlags[256]; + +static uint32_t combiningCPs[2000]; +static uint16_t combiningIndexes[2000]; +static uint16_t combineFwdTop=0, combineBothTop=0, combineBackTop=0; + +typedef struct CombiningTriple { + uint16_t leadIndex, trailIndex; + uint32_t lead, trail, combined; +} CombiningTriple; + +/* 15b in the combining index -> <=0x8000 pairs of uint16_t in the combining table */ +static uint16_t combiningTable[2*0x8000]; +static uint16_t combiningTableTop=0; + +/* stage 2 table after turning Norm structs into 32-bit words */ +static uint32_t *norm32Table=NULL, *fcdTable=NULL; + +/* number of units used in stage 1 and norm32Table, and same for FCD */ +static uint16_t stage1Top, fcdStage1Top, + norm32TableTop, fcdTableTop; + +extern void +init() { + /* reset stage 1 of the trie */ + uprv_memset(stage1, 0, sizeof(stage1)); + + /* allocate stage 2 of the trie and reset the first block */ + stage2Mem=utm_open("gennorm trie stage 2", 30000, sizeof(*stage2)); + stage2=utm_allocN(stage2Mem, _NORM_STAGE_2_BLOCK_COUNT); + + /* allocate Norm structures and reset the first one */ + normMem=utm_open("gennorm normalization structs", 20000, sizeof(Norm)); + norms=utm_alloc(normMem); + + /* allocate UTF-32 string memory */ + utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 4); + + /* reset all "have seen" flags */ + uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags)); + + /* allocate extra data memory for UTF-16 decomposition strings and other values */ + 
extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, 2); + + /* allocate temporary memory for combining triples */ + combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, sizeof(CombiningTriple)); + + /* set the minimum code points for no/maybe quick check values to the end of the BMP */ + indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff; + indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=0xffff; + indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=0xffff; + indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=0xffff; +} + +/* get or create a block in stage 2 of the trie */ +static uint16_t +createStage2Block(uint32_t code) { + uint32_t i; + uint16_t j; + + i=code>>_NORM_TRIE_SHIFT; + j=stage1[i]; + if(j==0) { + /* allocate a stage 2 block */ + uint16_t *p; + + p=(uint16_t *)utm_allocN(stage2Mem, _NORM_STAGE_2_BLOCK_COUNT); + stage1[i]=j=p-stage2; + } + return j; +} + +/* + * get or create a Norm unit; + * get or create the intermediate trie entries for it as well + */ +static Norm * +createNorm(uint32_t code) { + Norm *p; + uint16_t stage2Block, k; + + stage2Block=createStage2Block(code); + k=(uint16_t)(stage2Block+(code&_NORM_STAGE_2_MASK)); + if(stage2[k]==0) { + /* allocate Norm */ + p=(Norm *)utm_alloc(normMem); + stage2[k]=p-norms; + } else { + p=norms+stage2[k]; + } + return p; +} + +/* get an existing Norm unit */ +static Norm * +getNorm(uint32_t code) { + uint32_t i; + uint16_t j; + + /* access stage 1 and get the stage 2 block start index */ + i=code>>_NORM_TRIE_SHIFT; + j=stage1[i]; + if(j==0) { + return NULL; + } + + /* access stage 2 and get the Norm unit */ + i=(uint16_t)(j+(code&_NORM_STAGE_2_MASK)); + j=stage2[i]; + if(j==0) { + return NULL; + } else { + return norms+j; + } +} + +/* get the canonical combining class of a character */ +static uint8_t +getCCFromCP(uint32_t code) { + Norm *norm=getNorm(code); + if(norm==NULL) { + return 0; + } else { + return norm->udataCC; + } +} + +/* + * enumerate all code points with their Norm structs and call a function for each + * return
the number of code points with data + */ +static uint32_t +enumTrie(EnumTrieFn *fn, void *context) { + uint32_t code, count, i; + uint16_t j, k, l; + + code=count=0; /* count is returned as the number of code points with data, so it must start at 0 */ + for(i=0; i<_NORM_STAGE_1_MAX_COUNT; ++i) { + j=stage1[i]; + if(j!=0) { + for(k=0; k<_NORM_STAGE_2_BLOCK_COUNT; ++k) { + l=stage2[j+k]; + if(l!=0) { + fn(context, code, norms+l); + ++count; + } + ++code; + } + } else { + code+=_NORM_STAGE_2_BLOCK_COUNT; + } + } + return count; +} + +static void +setHaveSeenString(const uint32_t *s, int32_t length) { + uint32_t c; + + while(length>0) { + c=*s++; + haveSeenFlags[(c>>5)&0xff]|=(1<<(c&0x1f)); + --length; + } +} + +#define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&(1<<((c)&0x1f))) + +/* handle combining data ---------------------------------------------------- */ + +static void +addCombiningCP(uint32_t code, uint8_t flags) { + uint32_t newEntry; + uint16_t i; + + newEntry=code|((uint32_t)flags<<24); + + /* search for this code point */ + for(i=0; i=sizeof(combiningCPs)/4) { + fprintf(stderr, "error: gennorm combining code points - trying to use more than %ld units\n", + sizeof(combiningCPs)/4); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + /* set i to the insertion point */ + flags=(uint8_t)(newEntry>>24); + if(flags==1) { + i=combineFwdTop++; + ++combineBothTop; + } else if(flags==3) { + i=combineBothTop++; + } else /* flags==2 */ { + i=combineBackTop; + } + + /* move the following code points up one and insert newEntry at i */ + if(icombiningFlags|=1; /* combines forward */ + createNorm(trail)->combiningFlags|=2; /* combines backward */ + + addCombiningCP(lead, 1); + addCombiningCP(trail, 2); + + triple=(CombiningTriple *)utm_alloc(combiningTriplesMem); + triple->lead=lead; + triple->trail=trail; + triple->combined=combined; +} + +static int +compareTriples(const void *l, const void *r) { + int diff; + diff=(int)((CombiningTriple *)l)->leadIndex- + (int)((CombiningTriple *)r)->leadIndex; + if(diff==0) { + diff=(int)((CombiningTriple *)l)->trailIndex- +
(int)((CombiningTriple *)r)->trailIndex; + } + return diff; +} + +static void +processCombining() { + CombiningTriple *triples; + uint16_t *p; + uint32_t combined; + uint16_t i, j, count, tableTop, finalIndex; + + triples=utm_getStart(combiningTriplesMem); + + /* add lead and trail indexes to the triples for sorting */ + count=(uint16_t)combiningTriplesMem->index; + for(i=0; icombiningIndex=combiningIndexes[i]=tableTop; + + /* calculate the length of the combining data for this lead code point in the combiningTable */ + while(jcombiningIndex=combiningIndexes[i]=finalIndex++; + } + + /* it must be tableTop<0x7fff because bit 15 is used in combiningTable as an end-for-this-lead marker */ + if(tableTop>=sizeof(combiningTable)/4) { + fprintf(stderr, "error: gennorm combining table - trying to use %u units, more than the %ld units available\n", + tableTop, sizeof(combiningTable)/4); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + combiningTableTop=tableTop; + + /* store the combining data in the combiningTable, with the final indexes from above */ + p=combiningTable; + j=0; /* triples counter */ + + /* + * this is essentially the same loop as above, but + * it writes the table data instead of calculating and setting the final indexes; + * it is necessary to have two passes so that all the final indexes are known before + * they are written into the table + */ + for(i=0; i>10)); + *p++=finalIndex; + *p++=(uint16_t)(0xdc00|(combined&0x3ff)); + } + ++j; + } + + /* set a marker on the last final trail index in this lead's table */ + *(p-2)|=0x8000; + } + + /* post condition: tableTop==(p-combiningTable) */ +} + +/* processing incoming normalization data ----------------------------------- */ + +/* + * decompose the one decomposition further, may generate two decompositions + * apply all previous characters' decompositions to this one + */ +static void +decompStoreNewNF(uint32_t code, Norm *norm) { + uint32_t nfd[40], nfkd[40]; + uint32_t *s32; + Norm *p; + uint32_t c; + int32_t 
i, length; + uint8_t lenNFD=0, lenNFKD=0; + UBool changedNFD=FALSE, changedNFKD=FALSE; + + if((length=norm->lenNFD)!=0) { + /* always allocate the original string */ + changedNFD=TRUE; + s32=norm->nfd; + } else if((length=norm->lenNFKD)!=0) { + /* always allocate the original string */ + changedNFKD=TRUE; + s32=norm->nfkd; + } else { + /* no decomposition here, nothing to do */ + return; + } + + /* decompose each code point */ + for(i=0; ilenNFD!=0) { + uprv_memcpy(nfd+lenNFD, p->nfd, p->lenNFD*4); + lenNFD+=p->lenNFD; + } else { + nfd[lenNFD++]=c; + } + } + + /* compatibility-decompose c */ + if(p->lenNFKD!=0) { + uprv_memcpy(nfkd+lenNFKD, p->nfkd, p->lenNFKD*4); + lenNFKD+=p->lenNFKD; + changedNFKD=TRUE; + } else if(p->lenNFD!=0) { + uprv_memcpy(nfkd+lenNFKD, p->nfd, p->lenNFD*4); + lenNFKD+=p->lenNFD; + changedNFKD=TRUE; + } else { + nfkd[lenNFKD++]=c; + } + } + + /* assume that norm->lenNFD==1 or ==2 */ + if(norm->lenNFD==2 && !(norm->combiningFlags&0x80)) { + addCombiningTriple(s32[0], s32[1], code); + } + + if(changedNFD) { + if(lenNFD!=0) { + s32=utm_allocN(utf32Mem, lenNFD); + uprv_memcpy(s32, nfd, lenNFD*4); + } else { + s32=NULL; + } + norm->lenNFD=lenNFD; + norm->nfd=s32; + setHaveSeenString(nfd, lenNFD); + } + if(changedNFKD) { + if(lenNFKD!=0) { + s32=utm_allocN(utf32Mem, lenNFKD); + uprv_memcpy(s32, nfkd, lenNFKD*4); + } else { + s32=NULL; + } + norm->lenNFKD=lenNFKD; + norm->nfkd=s32; + setHaveSeenString(nfkd, lenNFKD); + } +} + +typedef struct DecompSingle { + uint32_t c; + Norm *norm; +} DecompSingle; + +/* + * apply this one character's decompositions (there is at least one!) 
to + * all previous characters' decompositions to decompose them further + */ +static void +decompWithSingleFn(void *context, uint32_t code, Norm *norm) { + uint32_t nfd[40], nfkd[40]; + uint32_t *s32; + DecompSingle *me=(DecompSingle *)context; + uint32_t c, myC; + int32_t i, length; + uint8_t lenNFD, lenNFKD, myLenNFD, myLenNFKD; + UBool changedNFD=FALSE, changedNFKD=FALSE; + + /* get the new character's data */ + myC=me->c; + myLenNFD=me->norm->lenNFD; + myLenNFKD=me->norm->lenNFKD; + /* assume that myC has at least one decomposition */ + + if((length=norm->lenNFD)!=0 && myLenNFD!=0) { + /* apply NFD(myC) to norm->nfd */ + s32=norm->nfd; + lenNFD=0; + for(i=0; inorm->nfd, myLenNFD*4); + lenNFD+=myLenNFD; + changedNFD=TRUE; + } else { + nfd[lenNFD++]=c; + } + } + } + + if((length=norm->lenNFKD)!=0) { + /* apply NFD(myC) and NFKD(myC) to norm->nfkd */ + s32=norm->nfkd; + lenNFKD=0; + for(i=0; inorm->nfkd, myLenNFKD*4); + lenNFKD+=myLenNFKD; + } else /* assume myLenNFD!=0 */ { + uprv_memcpy(nfkd+lenNFKD, me->norm->nfd, myLenNFD*4); + lenNFKD+=myLenNFD; + } + changedNFKD=TRUE; + } else { + nfkd[lenNFKD++]=c; + } + } + } else if((length=norm->lenNFD)!=0 && myLenNFKD!=0) { + /* apply NFKD(myC) to norm->nfd, forming a new norm->nfkd */ + s32=norm->nfd; + lenNFKD=0; + for(i=0; inorm->nfkd, myLenNFKD*4); + lenNFKD+=myLenNFKD; + changedNFKD=TRUE; + } else { + nfkd[lenNFKD++]=c; + } + } + } + + /* set the new decompositions, forget the old ones */ + if(changedNFD) { + if(lenNFD!=0) { + if(lenNFD>norm->lenNFD) { + s32=utm_allocN(utf32Mem, lenNFD); + } else { + s32=norm->nfd; + } + uprv_memcpy(s32, nfd, lenNFD*4); + } else { + s32=NULL; + } + norm->lenNFD=lenNFD; + norm->nfd=s32; + } + if(changedNFKD) { + if(lenNFKD!=0) { + if(lenNFKD>norm->lenNFKD) { + s32=utm_allocN(utf32Mem, lenNFKD); + } else { + s32=norm->nfkd; + } + uprv_memcpy(s32, nfkd, lenNFKD*4); + } else { + s32=NULL; + } + norm->lenNFKD=lenNFKD; + norm->nfkd=s32; + } +} + +/* + * process the data for one code 
point listed in UnicodeData; + * UnicodeData itself never maps a code point to both NFD and NFKD + */ +extern void +storeNorm(uint32_t code, Norm *norm) { + DecompSingle decompSingle; + Norm *p; + + /* copy existing derived normalization properties */ + p=createNorm(code); + norm->qcFlags=p->qcFlags; + norm->combiningFlags=p->combiningFlags; + + /* process the decomposition if if there is at one here */ + if((norm->lenNFD|norm->lenNFKD)!=0) { + /* decompose this one decomposition further, may generate two decompositions */ + decompStoreNewNF(code, norm); + + /* has this code point been used in previous decompositions? */ + if(HAVE_SEEN(code)) { + /* use this decomposition to decompose other decompositions further */ + decompSingle.c=code; + decompSingle.norm=norm; + enumTrie(decompWithSingleFn, &decompSingle); + } + } + + /* store the data */ + uprv_memcpy(p, norm, sizeof(Norm)); +} + +extern void +setQCFlags(uint32_t code, uint8_t qcFlags) { + createNorm(code)->qcFlags|=qcFlags; + + /* adjust the minimum code point for quick check no/maybe */ + if(code<0xffff) { + if((qcFlags&_NORM_QC_NFC) && (uint16_t)codecombiningFlags|=0x80; +} + +static void +setHangulJamoSpecials() { + Norm *norm; + uint16_t *pStage2Block; + uint32_t c; + uint16_t i; + + /* + * Hangul syllables are algorithmically decomposed into Jamos, + * and Jamos are algorithmically composed into Hangul syllables. + * The quick check flags are parsed, except for Hangul. 
+ */ + +#if 0 + /* set Jamo 1 specials */ + for(c=0x1100; c<=0x1112; ++c) { + norm=createNorm(c); + norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_1; + norm->combiningFlags=1; + } +#endif + + /* set Jamo 2 specials */ + for(c=0x1161; c<=0x1175; ++c) { + norm=createNorm(c); + norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_2; + norm->combiningFlags=3; + } + + /* set Jamo 3 specials */ + for(c=0x11a8; c<=0x11c2; ++c) { + norm=createNorm(c); + norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_3; + norm->combiningFlags=2; + } + + /* set Hangul specials, precompacted */ + norm=(Norm *)utm_alloc(normMem); + norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL; + norm->qcFlags=_NORM_QC_NFD|_NORM_QC_NFKD; + + /* set one complete stage 2 block with this Hangul information */ + pStage2Block=(uint16_t *)utm_allocN(stage2Mem, _NORM_STAGE_2_BLOCK_COUNT); + for(i=0; i<_NORM_STAGE_2_BLOCK_COUNT; ++i) { + pStage2Block[i]=norm-norms; + } + + /* set these data for U+ac00..U+d7a3 */ + c=0xac00; + + /* set a partial stage 2 block before pStage2Block can be repeated */ + if(c&_NORM_STAGE_2_MASK) { + i=createStage2Block(c)+(uint16_t)(c&_NORM_STAGE_2_MASK); + do { + stage2[i++]=norm-norms; + } while(++c&_NORM_STAGE_2_MASK); + } + + /* set full stage 1 blocks to the common stage 2 block */ + while(c<(0xd7a3&~_NORM_STAGE_2_MASK)) { + stage1[c>>_NORM_TRIE_SHIFT]=pStage2Block-stage2; + c+=_NORM_STAGE_2_BLOCK_COUNT; + } + + /* set a partial stage 2 block after the repetition */ + i=createStage2Block(c); + while(c<=0xd7a3) { + stage2[i++]=norm-norms; + ++c; + } +} + +/* build runtime structures ------------------------------------------------- */ + +/* canonically reorder a UTF-32 string; return { leadCC, trailCC } */ +static uint16_t +reorderString(uint32_t *s, int32_t length) { + uint8_t ccs[40]; + uint32_t c; + int32_t i, j; + uint8_t cc, prevCC; + + if(length<=0) { + return 0; + } + + for(i=0; ilenNFD; + if(length>0) { + 
norm->canonBothCCs=reorderString(norm->nfd, length); + } + + /* canonically reorder the NFKD */ + length=norm->lenNFKD; + if(length>0) { + norm->compatBothCCs=reorderString(norm->nfkd, length); + } + + /* verify that code has a decomposition if and only if the quick check flags say "no" on NF(K)D */ + if((norm->lenNFD!=0) != ((norm->qcFlags&_NORM_QC_NFD)!=0)) { + printf("U+%04lx has NFD[%d] but quick check 0x%02x\n", code, norm->lenNFD, norm->qcFlags); + } + if(((norm->lenNFD|norm->lenNFKD)!=0) != ((norm->qcFlags&(_NORM_QC_NFD|_NORM_QC_NFKD))!=0)) { + printf("U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", code, norm->lenNFD, norm->lenNFKD, norm->qcFlags); + } + + /* ### see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */ + combineAndQC[(norm->qcFlags&0x33)|((norm->combiningFlags&3)<<2)]=1; + + if(norm->combiningFlags&1) { + if(norm->udataCC!=0) { + /* illegal - data-derivable composition exclusion */ + printf("U+%04lx combines forward but udataCC==%u\n", code, norm->udataCC); + } + } + if(norm->combiningFlags&2) { + if((norm->qcFlags&0x11)==0) { + printf("U+%04lx combines backward but qcNF?C==0\n", code); + } +#if 0 + /* occurs sometimes */ + if(norm->udataCC==0) { + printf("U+%04lx combines backward but udataCC==0\n", code); + } +#endif + } + if((norm->combiningFlags&3)==3) { + printf("U+%04lx combines both ways\n", code); + } +} + +/* ### debug */ +static uint32_t countCCSame=0, countCCTrail=0, countCCTwo=0; + +static uint32_t +make32BitNorm(Norm *norm) { + UChar extra[100]; + uint32_t word; + int32_t i, length, beforeZero=0, count, start; + + /* reset the 32-bit word and set the quick check flags */ + word=norm->qcFlags; + + /* set the UnicodeData combining class */ + word|=(uint32_t)norm->udataCC<<_NORM_CC_SHIFT; + + /* set the combining flag and index */ + if(norm->combiningFlags&3) { + word|=(uint32_t)(norm->combiningFlags&3)<<6; + } + + /* set the combining index value into the extra data */ + if(norm->combiningIndex!=0) { 
+ extra[0]=norm->combiningIndex; + beforeZero=1; + } + + count=beforeZero; + + /* write the decompositions */ + if((norm->lenNFD|norm->lenNFKD)!=0) { + extra[count++]=0; /* set the pieces when available, into extra[beforeZero] */ + + length=norm->lenNFD; + if(length>0) { + if(norm->canonBothCCs!=0) { + extra[beforeZero]|=0x80; + extra[count++]=norm->canonBothCCs; + } + start=count; + for(i=0; infd[i]); + } + extra[beforeZero]|=(UChar)(count-start); /* set the decomp length as the number of UTF-16 code units */ + } + + length=norm->lenNFKD; + if(length>0) { + if(norm->compatBothCCs!=0) { + extra[beforeZero]|=0x8000; + extra[count++]=norm->compatBothCCs; + } + start=count; + for(i=0; infkd[i]); + } + extra[beforeZero]|=(UChar)((count-start)<<8); /* set the decomp length as the number of UTF-16 code units */ + } + } + + /* allocate and copy the extra data */ + if(count!=0) { + UChar *p; + + if(norm->specialTag!=0) { + fprintf(stderr, "error: gennorm - illegal to have both extra data and a special tag (0x%x)\n", norm->specialTag); + exit(U_ILLEGAL_ARGUMENT_ERROR); + } + + p=(UChar *)utm_allocN(extraMem, count); + uprv_memcpy(p, extra, count*2); + + /* set the extra index, offset by beforeZero */ + word|=(uint32_t)(beforeZero+(p-(UChar *)utm_getStart(extraMem)))<<_NORM_EXTRA_SHIFT; + } else if(norm->specialTag!=0) { + /* set a special tag instead of an extra index */ + word|=(uint32_t)norm->specialTag<<_NORM_EXTRA_SHIFT; + } + + return word; +} + +/* turn all Norm structs into corresponding 32-bit norm values */ +static void +makeAll32() { + uint16_t i, count; + + /* + * allocate and fill the table of 32-bit normalization data + * leave space for data for the up to 1024 lead surrogates + */ + norm32TableTop=(uint16_t)stage2Mem->index; + norm32Table=(uint32_t *)uprv_malloc((norm32TableTop+1024)*4); + if(norm32Table==NULL) { + fprintf(stderr, "error: gennorm - unable to allocate %ld 32-bit words for norm32Table\n", + norm32TableTop+1024); + 
exit(U_MEMORY_ALLOCATION_ERROR); + } + + /* reset all entries */ + uprv_memset(norm32Table, 0, (norm32TableTop+1024)*4); + + count=0; + + /* skip the first, all-empty block */ + for(i=_NORM_STAGE_2_BLOCK_COUNT; iindex); + printf("count of (uncompacted) non-zero 32-bit words: %lu\n", count); + printf("count CC frequencies: same %lu trail %lu two %lu\n", countCCSame, countCCTrail, countCCTwo); +} + +/* + * extract all Norm.canonBothCCs into the FCD table + * set 32-bit values to use the common fold and compact functions + */ +static void +makeFCD() { + static uint16_t map[0x10000>>_NORM_TRIE_SHIFT]; + Norm *norm; + uint32_t i, oredValues; + uint16_t bothCCs, delta; + + /* + * allocate and fill the table of 32-bit normalization data + * leave space for data for the up to 1024 lead surrogates + */ + fcdTableTop=(uint16_t)stage2Mem->index; + fcdTable=(uint32_t *)uprv_malloc((fcdTableTop+1024)*4); + if(fcdTable==NULL) { + fprintf(stderr, "error: gennorm - unable to allocate %ld 32-bit words for fcdTable\n", + fcdTableTop+1024); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + /* reset all entries */ + uprv_memset(fcdTable, 0, (fcdTableTop+1024)*4); + + /* compact out the all-zero stage 2 blocks */ + map[0]=0; + delta=0; + + /* oredValues detects all-zero stage 2 blocks that will be removed from fcdStage1 */ + oredValues=0; + + /* skip the first, all-empty block */ + for(i=_NORM_STAGE_2_BLOCK_COUNT; icanonBothCCs; + if(bothCCs==0) { + /* if there are no decomposition cc's then use the udataCC twice */ + bothCCs=norm->udataCC; + bothCCs|=bothCCs<<8; + } + oredValues|=fcdTable[i-delta]=bothCCs; + } + + if((i&_NORM_STAGE_2_MASK)==_NORM_STAGE_2_MASK) { + /* at the end of a stage 2 block, check if there are any non-zero entries */ + if(oredValues==0) { + /* all zero: skip this block */ + delta+=_NORM_STAGE_2_BLOCK_COUNT; + map[i>>_NORM_TRIE_SHIFT]=(uint16_t)0; + } else { + /* keep this block */ + map[i>>_NORM_TRIE_SHIFT]=(uint16_t)(i&~_NORM_STAGE_2_MASK)-delta; + oredValues=0; + } + 
} + } + + /* now adjust stage 1 */ + for(i=0; i<_NORM_STAGE_1_MAX_COUNT; ++i) { + fcdStage1[i]=map[fcdStage1[i]>>_NORM_TRIE_SHIFT]; + } + + printf("FCD: omitted %u stage 2 entries in all-zero blocks\n", delta); + + /* adjust the table top */ + fcdTableTop-=delta; +} + +/* + * Fold the supplementary code point data for one lead surrogate. + */ +static uint16_t +foldLeadSurrogate(uint16_t *parent, uint16_t parentCount, + uint32_t *stage, uint16_t *pStageCount, + uint32_t base, + UBool isNorm32) { + uint32_t leadNorm32=0; + uint32_t i, j, s2; + uint32_t leadSurrogate=0xd7c0+(base>>10); + + printf("supplementary data for lead surrogate U+%04lx\n", leadSurrogate); + + /* calculate the 32-bit data word for the lead surrogate */ + for(i=0; i<_NORM_SURROGATE_BLOCK_COUNT; ++i) { + s2=parent[(base>>_NORM_TRIE_SHIFT)+i]; + if(s2!=0) { + for(j=0; j<_NORM_STAGE_2_BLOCK_COUNT; ++j) { + /* basically, or all 32-bit data into the one for the lead surrogate */ + leadNorm32|=stage[s2+j]; + } + } + } + + if(isNorm32) { + /* turn multi-bit fields into the worst-case value */ + if(leadNorm32&_NORM_CC_MASK) { + leadNorm32|=_NORM_CC_MASK; + } + + /* clean up unnecessarily ored bit fields */ + leadNorm32&=~((uint32_t)0xffffffff<<_NORM_EXTRA_SHIFT); + + if(leadNorm32==0) { + /* nothing to do (only composition exclusions?) 
*/ + return 0; + } + + /* add the extra surrogate index, offset by the BMP top, for the new stage 1 location */ + leadNorm32|=( + (uint32_t)_NORM_EXTRA_INDEX_TOP+ + (uint32_t)((parentCount-_NORM_STAGE_1_BMP_COUNT)>>_NORM_SURROGATE_BLOCK_BITS) + )<<_NORM_EXTRA_SHIFT; + } else { + if(leadNorm32==0) { + /* FCD: nothing to do */ + return 0; + } + + /* + * For FCD, replace the entire combined value by the surrogate index + * and make sure that it is not 0 (by not offsetting it by the BMP top, + * since here we have enough bits for this); + * lead surrogates are tested at runtime on the character code itself + * instead on special values of the trie data - + * this is because 16 bits in the FCD trie data do not allow for anything + * but the two leading and trailing combining classes of the canonical decomposition. + */ + leadNorm32=parentCount>>_NORM_SURROGATE_BLOCK_BITS; + } + + /* enter the lead surrogate's data */ + s2=parent[leadSurrogate>>_NORM_TRIE_SHIFT]; + if(s2==0) { + /* allocate a new stage 2 block in stage (the memory is there from makeAll32()/makeFCD()) */ + s2=parent[leadSurrogate>>_NORM_TRIE_SHIFT]=*pStageCount; + *pStageCount+=_NORM_STAGE_2_BLOCK_COUNT; + } + stage[s2+(leadSurrogate&_NORM_STAGE_2_MASK)]=leadNorm32; + + /* move the actual stage 1 indexes from the supplementary position to the new one */ + uprv_memmove(parent+parentCount, parent+(base>>_NORM_TRIE_SHIFT), _NORM_SURROGATE_BLOCK_COUNT*2); + + /* increment stage 1 top */ + return _NORM_SURROGATE_BLOCK_COUNT; +} + +/* + * Fold the normalization data for supplementary code points into + * a compact area on top of the BMP-part of the trie index, + * with the lead surrogates indexing this compact area. + * + * Use after makeAll32(). 
+ */ +static uint16_t +foldSupplementary(uint16_t *parent, uint16_t parentCount, + uint32_t *stage, uint16_t *pStageCount, + UBool isNorm32) { + uint32_t c; + uint16_t i; + + /* search for any stage 1 entries for supplementary code points */ + for(c=0x10000; c<0x110000;) { + i=parent[c>>_NORM_TRIE_SHIFT]; + if(i!=0) { + /* there is data, treat the full block for a lead surrogate */ + c&=~0x3ff; + parentCount+=foldLeadSurrogate(parent, parentCount, stage, pStageCount, c, isNorm32); + c+=0x400; + } else { + c+=_NORM_STAGE_2_BLOCK_COUNT; + } + } + + printf("trie index count: BMP %u all Unicode %lu folded %u\n", + _NORM_STAGE_1_BMP_COUNT, _NORM_STAGE_1_MAX_COUNT, parentCount); + return parentCount; +} + +static uint16_t +compact(uint16_t *parent, uint16_t parentCount, + uint32_t *stage, uint16_t stageCount) { + /* + * This function is the common implementation for compacting + * the stage 2 tables of 32-bit values. + * It is a copy of genprops/store.c's compactStage() adapted for the 32-bit stage 2 tables. 
+ */ + static uint16_t map[0x10000>>_NORM_TRIE_SHIFT]; + uint32_t x; + uint16_t i, start, prevEnd, newStart; + + map[0]=0; + newStart=_NORM_STAGE_2_BLOCK_COUNT; + for(start=newStart; start>_NORM_TRIE_SHIFT]=(uint16_t)(newStart-i); + + /* move the non-overlapping indexes to their new positions */ + start+=i; + for(i=(uint16_t)(_NORM_STAGE_2_BLOCK_COUNT-i); i>0; --i) { + stage[newStart++]=stage[start++]; + } + } else if(newStart>_NORM_TRIE_SHIFT]=newStart; + for(i=_NORM_STAGE_2_BLOCK_COUNT; i>0; --i) { + stage[newStart++]=stage[start++]; + } + } else /* no overlap && newStart==start */ { + map[start>>_NORM_TRIE_SHIFT]=start; + newStart+=_NORM_STAGE_2_BLOCK_COUNT; + start=newStart; + } + } + + /* now adjust the parent table */ + for(i=0; i>_NORM_TRIE_SHIFT]; + } + + /* we saved some space */ + printf("compacting trie: count of 32-bit words %lu->%lu\n", stageCount, newStart); + return newStart; +} + +extern void +processData() { +#if 0 + uint16_t i; +#endif + + processCombining(); + + /* canonically reorder decompositions and assign combining classes for decompositions */ + enumTrie(postParseFn, NULL); + +#if 0 + for(i=1; i<64; ++i) { + if(combineAndQC[i]) { + printf("combiningFlags==0x%02x qcFlags(NF?C)==0x%02x\n", (i&0xc)>>2, i&0x33); + } + } +#endif + + /* add hangul/jamo specials */ + setHangulJamoSpecials(); + + /* copy stage 1 for the FCD trie */ + uprv_memcpy(fcdStage1, stage1, sizeof(stage1)); + + /* --- finalize data for quick checks & normalization: stage1/norm32Table --- */ + + /* turn the Norm structs (stage2, norms) into 32-bit data words (norm32Table) */ + makeAll32(); + + /* fold supplementary code points into lead surrogates */ + stage1Top=foldSupplementary(stage1, _NORM_STAGE_1_BMP_COUNT, norm32Table, &norm32TableTop, TRUE); + + /* compact stage 2 */ + norm32TableTop=compact(stage1, stage1Top, norm32Table, norm32TableTop); + + /* --- finalize data for FCD checks: fcdStage1/fcdTable --- */ + + /* FCD data: take Norm.canonBothCCs and store them in the 
FCD table */ + makeFCD(); + + /* FCD: fold supplementary code points into lead surrogates */ + fcdStage1Top=foldSupplementary(fcdStage1, _NORM_STAGE_1_BMP_COUNT, fcdTable, &fcdTableTop, FALSE); + + /* FCD: compact stage 2 */ + fcdTableTop=compact(fcdStage1, fcdStage1Top, fcdTable, fcdTableTop); + + /* ### debug output */ +#if 0 + printf("number of stage 2 entries: %ld\n", stage2Mem->index); + printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT*2+stage2Mem->index*4+extraMem->index*2); +#endif + printf("combining CPs tops: fwd %u both %u back %u\n", combineFwdTop, combineBothTop, combineBackTop); + printf("combining table count: %u\n", combiningTableTop); +} + +extern void +generateData(const char *dataDir) { + UNewDataMemory *pData; + uint16_t *p16; + UErrorCode errorCode=U_ZERO_ERROR; + uint32_t size, dataLength; + uint16_t i; + + size= + _NORM_INDEX_TOP*2+ + stage1Top*2+ + norm32TableTop*4+ + extraMem->index*2+ + combiningTableTop*2+ + fcdStage1Top*2+ + fcdTableTop*2; + + printf("size of " DATA_NAME "." 
DATA_TYPE " contents: %lu bytes\n", size); + + indexes[_NORM_INDEX_COUNT]=_NORM_INDEX_TOP; + indexes[_NORM_INDEX_TRIE_SHIFT]=_NORM_TRIE_SHIFT; + indexes[_NORM_INDEX_TRIE_INDEX_COUNT]=stage1Top; + indexes[_NORM_INDEX_TRIE_DATA_COUNT]=norm32TableTop; + indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)extraMem->index; + + indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop; + indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop; + indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=combineBothTop-combineFwdTop; + indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=combineBackTop-combineBothTop; + + indexes[_NORM_INDEX_FCD_TRIE_INDEX_COUNT]=fcdStage1Top; + indexes[_NORM_INDEX_FCD_TRIE_DATA_COUNT]=fcdTableTop; + + /* adjust the stage 1 indexes to offset stage 2 from the beginning of stage 1 */ + + /* stage1/norm32Table */ + for(i=0; iindex*2); + udata_writeBlock(pData, combiningTable, combiningTableTop*2); + udata_writeBlock(pData, fcdStage1, fcdStage1Top*2); + udata_writeBlock(pData, fcdTable, fcdTableTop*2); + + /* finish up */ + dataLength=udata_finish(pData, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "gennorm: error %d writing the output file\n", errorCode); + exit(errorCode); + } + + if(dataLength!=size) { + fprintf(stderr, "gennorm: data length %lu != calculated size %lu\n", + dataLength, size); + exit(U_INTERNAL_PROGRAM_ERROR); + } +} + +/* + * Hey, Emacs, please set the following: + * + * Local Variables: + * indent-tabs-mode: nil + * End: + * + */