mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 05:55:35 +00:00
ICU-1007 initial new normalization code
X-SVN-Rev: 5035
This commit is contained in:
parent
a93780d01d
commit
42c2ec7dc8
16 changed files with 3485 additions and 58 deletions
8
.gitignore
vendored
8
.gitignore
vendored
|
@ -212,6 +212,14 @@ icu4c/source/tools/gennames/Makefile
|
|||
icu4c/source/tools/gennames/Release
|
||||
icu4c/source/tools/gennames/gennames
|
||||
icu4c/source/tools/gennames/tmp
|
||||
icu4c/source/tools/gennorm/*.d
|
||||
icu4c/source/tools/gennorm/*.pdb
|
||||
icu4c/source/tools/gennorm/*.plg
|
||||
icu4c/source/tools/gennorm/Debug
|
||||
icu4c/source/tools/gennorm/Makefile
|
||||
icu4c/source/tools/gennorm/Release
|
||||
icu4c/source/tools/gennorm/gennorm
|
||||
icu4c/source/tools/gennorm/tmp
|
||||
icu4c/source/tools/genprops/*.d
|
||||
icu4c/source/tools/genprops/*.pdb
|
||||
icu4c/source/tools/genprops/Debug
|
||||
|
|
|
@ -74,6 +74,18 @@ Package=<4>
|
|||
Begin Project Dependency
|
||||
Project_Dep_Name decmn
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name genfchk
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name gennorm
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name genqchk
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name genuca
|
||||
End Project Dependency
|
||||
}}}
|
||||
|
||||
###############################################################################
|
||||
|
@ -96,16 +108,10 @@ Package=<4>
|
|||
Project_Dep_Name i18n
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name makeconv
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name gencol
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name genrb
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name gentest
|
||||
Project_Dep_Name toolutil
|
||||
End Project Dependency
|
||||
}}}
|
||||
|
||||
|
@ -270,6 +276,21 @@ Package=<4>
|
|||
|
||||
###############################################################################
|
||||
|
||||
Project: "gennorm"=..\tools\gennorm\gennorm.dsp - Package Owner=<4>
|
||||
|
||||
Package=<5>
|
||||
{{{
|
||||
}}}
|
||||
|
||||
Package=<4>
|
||||
{{{
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name common
|
||||
End Project Dependency
|
||||
}}}
|
||||
|
||||
###############################################################################
|
||||
|
||||
Project: "genprops"=..\tools\genprops\genprops.dsp - Package Owner=<4>
|
||||
|
||||
Package=<5>
|
||||
|
@ -432,30 +453,9 @@ Package=<4>
|
|||
Project_Dep_Name i18n
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name makeconv
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name gencol
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name genrb
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name genccode
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name gencmn
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name gencnval
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name gennames
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name gentz
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name toolutil
|
||||
End Project Dependency
|
||||
}}}
|
||||
|
@ -548,6 +548,15 @@ Package=<4>
|
|||
Begin Project Dependency
|
||||
Project_Dep_Name genqchk
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name common
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name gennorm
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name i18n
|
||||
End Project Dependency
|
||||
}}}
|
||||
|
||||
###############################################################################
|
||||
|
|
|
@ -1268,6 +1268,10 @@ InputPath=.\unicode\unorm.h
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unormimp.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unicode\urep.h
|
||||
|
||||
!IF "$(CFG)" == "common - Win32 Release"
|
||||
|
|
|
@ -29,6 +29,11 @@
|
|||
#include "unicode/unicode.h"
|
||||
#include "mutex.h"
|
||||
|
||||
/* ### TODO: new implementation */
|
||||
#include "unormimp.h"
|
||||
|
||||
|
||||
|
||||
|
||||
#define ARRAY_LENGTH(array) (sizeof (array) / sizeof (*array))
|
||||
/**
|
||||
|
@ -666,6 +671,25 @@ Normalizer::decompose(const UnicodeString& source,
|
|||
UnicodeString& result,
|
||||
UErrorCode &status)
|
||||
{
|
||||
/* ### TODO: begin new implementation */
|
||||
if(unorm_usesNewImplementation()) {
|
||||
if(source.isBogus()) {
|
||||
result.setToBogus();
|
||||
} else {
|
||||
/* make sure that we do not operate on the same buffer in source and result */
|
||||
result.cloneArrayIfNeeded(-1, source.length()+20, FALSE);
|
||||
result.fLength=unorm_decompose(result.fArray, result.fCapacity,
|
||||
source.fArray, source.fLength,
|
||||
compat, (options&IGNORE_HANGUL)!=0,
|
||||
UnicodeString::growBuffer, &result,
|
||||
&status);
|
||||
if(U_FAILURE(status)) {
|
||||
result.setToBogus();
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
/* ### end new implementation */
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load diff
164
icu4c/source/common/unormimp.h
Normal file
164
icu4c/source/common/unormimp.h
Normal file
|
@ -0,0 +1,164 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: unormimp.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2001may25
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __UNORMIMP_H__
|
||||
#define __UNORMIMP_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
/* trie constants */
|
||||
enum {
    /*
     * Number of low code point bits that index into a stage 2 block.
     *
     * must be <=10:
     * above 10, a lead surrogate's block is smaller than a stage 2 block
     */
    _NORM_TRIE_SHIFT=5,

    /* number of entries per stage 2 block (32) and the matching index mask (0x1f) */
    _NORM_STAGE_2_BLOCK_COUNT=1<<_NORM_TRIE_SHIFT,
    _NORM_STAGE_2_MASK=_NORM_STAGE_2_BLOCK_COUNT-1,

    /* number of stage 1 entries for the 16-bit range 0..0xffff (2048) */
    _NORM_STAGE_1_BMP_COUNT=(1<<(16-_NORM_TRIE_SHIFT)),

    /* 10 bits of a surrogate minus the shift: 5 bits, i.e., 32 blocks */
    _NORM_SURROGATE_BLOCK_BITS=10-_NORM_TRIE_SHIFT,
    _NORM_SURROGATE_BLOCK_COUNT=(1<<_NORM_SURROGATE_BLOCK_BITS)
};
|
||||
|
||||
/* this may be >0xffff and may not work as an enum */
|
||||
#define _NORM_STAGE_1_MAX_COUNT (0x110000>>_NORM_TRIE_SHIFT)
|
||||
|
||||
/* value constants */
|
||||
enum {
    /*
     * quick check flags 0..3 set mean "no" for their forms;
     * the NFC and NFKC constants additionally include their "maybe" bits 4 and 5
     */
    _NORM_QC_NFC=0x11,          /* no|maybe */
    _NORM_QC_NFKC=0x22,         /* no|maybe */
    _NORM_QC_NFD=4,             /* no */
    _NORM_QC_NFKD=8,            /* no */

    /* all four "no" bits */
    _NORM_QC_ANY_NO=0xf,

    /* quick check flags 4..5 mean "maybe" for their forms; test flags>=_NORM_QC_MAYBE */
    _NORM_QC_MAYBE=0x10,
    _NORM_QC_ANY_MAYBE=0x30,

    /* bits 6 and 7: combining flags, and both of them together */
    _NORM_COMBINES_FWD=0x40,
    _NORM_COMBINES_BACK=0x80,
    _NORM_COMBINES_ANY=0xc0,

#if 0
    _NORM_CC_TYPE_MASK=0xc0,
    _NORM_CC_TYPE_NONE=0,       /* no cc - lead and trail cc are 0 */
    _NORM_CC_TYPE_SAME=0x40,    /* lead and trail cc are same, non-zero, and in value */
    _NORM_CC_TYPE_TRAIL=0x80,   /* lead cc=0, trail cc in value */
    _NORM_CC_TYPE_TWO=0xc0,     /* 0 != lead cc < trail cc, lead cc in value, trail cc in extra data */

    _NORM_CC_HAS_LEAD=0x40,     /* side effect of the above flags: if and only if bit 6 is 0, then lead cc is 0 */
    _NORM_CC_HAS_LEAD_HAS_TRAIL=0x80, /* if(has lead) then one can check for (has trail) instead of (&cc mask==same/two) */
#endif

    _NORM_CC_SHIFT=8,           /* UnicodeData.txt combining class in bits 15..8 */
    _NORM_CC_MASK=0xff00,

    _NORM_EXTRA_SHIFT=16,       /* 16 bits for the index to UChars and other extra data */
    _NORM_EXTRA_INDEX_TOP=0xfc00, /* start of surrogate specials after shift */

    _NORM_EXTRA_SURROGATE_MASK=0x3ff,
    _NORM_EXTRA_SURROGATE_TOP=0x3f0, /* hangul etc. */

    /* the following four values are consecutive: 0x3f0..0x3f3 */
    _NORM_EXTRA_HANGUL=_NORM_EXTRA_SURROGATE_TOP,
    _NORM_EXTRA_JAMO_1,         /* ### not used */
    _NORM_EXTRA_JAMO_2,
    _NORM_EXTRA_JAMO_3
};
|
||||
|
||||
/* value constants using >16 bits */
|
||||
#define _NORM_MIN_SPECIAL 0xfc000000
|
||||
#define _NORM_SURROGATES_TOP 0xfff00000
|
||||
#define _NORM_MIN_HANGUL 0xfff00000
|
||||
#define _NORM_MIN_JAMO2 0xfff20000
|
||||
#define _NORM_JAMO2_TOP 0xfff30000
|
||||
|
||||
|
||||
/* index values */
|
||||
enum {
    /* enumerators are numbered implicitly from 0; do not reorder */
    _NORM_INDEX_COUNT,                  /* 0 */
    _NORM_INDEX_TRIE_SHIFT,             /* 1 */
    _NORM_INDEX_TRIE_INDEX_COUNT,       /* 2 */
    _NORM_INDEX_TRIE_DATA_COUNT,        /* 3 */
    _NORM_INDEX_UCHAR_COUNT,            /* 4 */

    _NORM_INDEX_COMBINE_DATA_COUNT,     /* 5 */
    _NORM_INDEX_COMBINE_FWD_COUNT,      /* 6 */
    _NORM_INDEX_COMBINE_BOTH_COUNT,     /* 7 */
    _NORM_INDEX_COMBINE_BACK_COUNT,     /* 8 */

    _NORM_INDEX_MIN_NFC_NO_MAYBE,       /* 9 */
    _NORM_INDEX_MIN_NFKC_NO_MAYBE,      /* 10 */
    _NORM_INDEX_MIN_NFD_NO_MAYBE,       /* 11 */
    _NORM_INDEX_MIN_NFKD_NO_MAYBE,      /* 12 */

    _NORM_INDEX_FCD_TRIE_INDEX_COUNT,   /* 13 */
    _NORM_INDEX_FCD_TRIE_DATA_COUNT,    /* 14 */

    /* explicit upper bound; index 15 is not assigned above */
    _NORM_INDEX_TOP=16
};
|
||||
|
||||
enum {
    /* FCD check: everything below this code point is known to have a 0 lead combining class */
    _NORM_MIN_WITH_LEAD_CC=0x300
};
|
||||
|
||||
/**
|
||||
* Is the normalizer data loaded?
|
||||
* This is used internally before other internal normalizer functions
|
||||
* are called.
|
||||
* It saves this check in each of many normalization calls that
|
||||
* are made for, e.g., collation.
|
||||
*
|
||||
* @param pErrorCode as usual
|
||||
* @return boolean value for whether the normalization data is loaded
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
unorm_haveData(UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* internal API, used by normlzr.cpp
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
unorm_decompose(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UBool compat, UBool ignoreHangul,
|
||||
GrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* internal API, but used by tests
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
unorm_setNewImplementation(UBool useNew);
|
||||
|
||||
/**
|
||||
* internal API, but used by tests
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
unorm_usesNewImplementation();
|
||||
|
||||
#endif
|
|
@ -53,7 +53,7 @@ all-local: thaidict.brk build-local
|
|||
##### Define all the data files. the build rule that depends on them is below.
|
||||
|
||||
## DAT files - Misc. data files.
|
||||
DAT_FILES=qchk.dat fchk.dat uprops.dat unames.dat cnvalias.dat tz.dat ucadata.dat invuca.dat
|
||||
DAT_FILES=qchk.dat fchk.dat uprops.dat unames.dat unorm.dat cnvalias.dat tz.dat ucadata.dat invuca.dat
|
||||
TEST_DAT_FILES=$(TESTOBJDATADIR)/test.dat
|
||||
|
||||
## BRK files
|
||||
|
@ -150,6 +150,11 @@ uprops.dat: $(UNICODEDATADIR)/UnicodeData.txt $(UNICODEDATADIR)/Mirror.txt $(TOO
|
|||
@echo Creating uprops.dat
|
||||
@ICU_DATA=. $(INVOKE) $(TOOLDIR)/genprops/genprops -s $(UNICODEDATADIR) -d . -u $(UNICODE_VERSION)
|
||||
|
||||
# unorm.dat
|
||||
unorm.dat: $(UNICODEDATADIR)/UnicodeData.txt $(UNICODEDATADIR)/DerivedNormalizationProperties.txt $(UNICODEDATADIR)/Mirror.txt $(TOOLDIR)/gennorm/gennorm
|
||||
@echo Creating unorm.dat
|
||||
@ICU_DATA=. $(INVOKE) $(TOOLDIR)/gennorm/gennorm -s $(UNICODEDATADIR) -d . -u $(UNICODE_VERSION)
|
||||
|
||||
# ucadata.dat
|
||||
ucadata.dat: $(UNICODEDATADIR)/FractionalUCA.txt $(TOOLDIR)/genuca/genuca
|
||||
@echo Creating ucadata.dat and invuca.dat
|
||||
|
@ -205,7 +210,7 @@ endif
|
|||
$(TESTOBJDATADIR)/%.res: $(TESTSRCDATADIR)/%.txt $(TOOLDIR)/genrb/genrb
|
||||
@ICU_DATA=. $(INVOKE) $(TOOLDIR)/genrb/genrb -s $(TESTSRCDATADIR) -d $(TESTOBJDATADIR) $(<F)
|
||||
|
||||
%.res: $(SRCDATADIR)/%.txt $(TOOLDIR)/genrb/genrb ucadata.dat uprops.dat
|
||||
%.res: $(SRCDATADIR)/%.txt $(TOOLDIR)/genrb/genrb ucadata.dat uprops.dat unorm.dat
|
||||
@ICU_DATA=. $(INVOKE) $(TOOLDIR)/genrb/genrb -s $(SRCDATADIR) -d . $(<F)
|
||||
|
||||
|
||||
|
@ -219,7 +224,7 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
|||
###########
|
||||
########### 390 support
|
||||
UCMFILES390=ebcdic-xml-us.ucm ibm-37-s390.ucm ibm-1047-s390.ucm ibm-4909.ucm
|
||||
ALLFILES390=qchk.dat fchk.dat uprops.dat cnvalias.dat $(UCMFILES390:.ucm=.cnv)
|
||||
ALLFILES390=qchk.dat fchk.dat uprops.dat unorm.dat cnvalias.dat $(UCMFILES390:.ucm=.cnv)
|
||||
|
||||
icudata390.lst: $(SRCLISTDEPS)
|
||||
@echo Generating $@ list of 390 data files
|
||||
|
|
|
@ -137,7 +137,7 @@ testdata: ucadata.dat $(RB_FILES) {"$(ICUTOOLS)\genrb\$(CFG)"}genrb.exe
|
|||
BRK_FILES = "$(ICUDBLD)\sent.brk" "$(ICUDBLD)\char.brk" "$(ICUDBLD)\line.brk" "$(ICUDBLD)\word.brk" "$(ICUDBLD)\line_th.brk" "$(ICUDBLD)\word_th.brk"
|
||||
|
||||
#invoke pkgdata
|
||||
"$(DLL_OUTPUT)\$(U_ICUDATA_NAME).dll" : $(CNV_FILES) $(BRK_FILES) qchk.dat fchk.dat uprops.dat unames.dat cnvalias.dat tz.dat ucadata.dat invuca.dat $(ALL_RES) icudata.res
|
||||
"$(DLL_OUTPUT)\$(U_ICUDATA_NAME).dll" : $(CNV_FILES) $(BRK_FILES) qchk.dat fchk.dat uprops.dat unames.dat unorm.dat cnvalias.dat tz.dat ucadata.dat invuca.dat $(ALL_RES) icudata.res
|
||||
@echo Building icu data
|
||||
@cd "$(ICUDBLD)"
|
||||
"$(ICUTOOLS)\pkgdata\$(CFG)\pkgdata" -e icudata -v -T . -m dll -c -p $(U_ICUDATA_NAME) -O "$(PKGOPT)" -d "$(DLL_OUTPUT)" -s . <<pkgdatain.txt
|
||||
|
@ -145,6 +145,7 @@ qchk.dat
|
|||
fchk.dat
|
||||
uprops.dat
|
||||
unames.dat
|
||||
unorm.dat
|
||||
cnvalias.dat
|
||||
tz.dat
|
||||
ucadata.dat
|
||||
|
@ -196,6 +197,7 @@ CLEAN :
|
|||
-@erase "fchk*.*"
|
||||
-@erase "uprops*.*"
|
||||
-@erase "unames*.*"
|
||||
-@erase "unorm*.*"
|
||||
-@erase "cnvalias*.*"
|
||||
-@erase "tz*.*"
|
||||
-@erase "ibm*_cnv.c"
|
||||
|
@ -266,7 +268,7 @@ fchk.dat: "$(ICUDATA)\unidata\FCDCheck.txt" "$(ICUTOOLS)\genfchk\$(CFG)\genfchk.
|
|||
unames.dat: {"$(ICUDATA)"}\unidata\UnicodeData.txt "$(ICUTOOLS)\gennames\$(CFG)\gennames.exe"
|
||||
@echo Creating data file for Unicode Names
|
||||
@set ICU_DATA=$(ICUDBLD)
|
||||
@"$(ICUTOOLS)\gennames\$(CFG)\gennames" $(ICUDATA)\unidata\UnicodeData.txt
|
||||
@"$(ICUTOOLS)\gennames\$(CFG)\gennames" -1 $(ICUDATA)\unidata\UnicodeData.txt
|
||||
|
||||
# Targets for uprops.dat
|
||||
uprops.dat: "$(ICUDATA)\unidata\UnicodeData.txt" "$(ICUTOOLS)\genprops\$(CFG)\genprops.exe"
|
||||
|
@ -274,6 +276,12 @@ uprops.dat: "$(ICUDATA)\unidata\UnicodeData.txt" "$(ICUTOOLS)\genprops\$(CFG)\ge
|
|||
@set ICU_DATA=$(ICUDBLD)
|
||||
@"$(ICUTOOLS)\genprops\$(CFG)\genprops" -s "$(ICUDATA)\unidata"
|
||||
|
||||
# Targets for unorm.dat
|
||||
unorm.dat: "$(ICUDATA)\unidata\UnicodeData.txt" "$(ICUDATA)\unidata\DerivedNormalizationProperties.txt" "$(ICUTOOLS)\gennorm\$(CFG)\gennorm.exe"
|
||||
@echo Creating data file for Unicode Normalization
|
||||
@set ICU_DATA=$(ICUDBLD)
|
||||
@"$(ICUTOOLS)\gennorm\$(CFG)\gennorm" -s "$(ICUDATA)\unidata"
|
||||
|
||||
# Targets for converters
|
||||
cnvalias.dat : {"$(ICUDATA)"}\convrtrs.txt "$(ICUTOOLS)\gencnval\$(CFG)\gencnval.exe"
|
||||
@echo Creating data file for Converter Aliases
|
||||
|
@ -294,18 +302,18 @@ ucadata.dat: "$(ICUDATA)\unidata\FractionalUCA.txt" "$(ICUTOOLS)\genuca\$(CFG)\g
|
|||
|
||||
invuca.dat: ucadata.dat
|
||||
|
||||
{"$(ICUTOOLS)\genrb\$(CFG)"}genrb.exe : ucadata.dat qchk.dat fchk.dat uprops.dat
|
||||
{"$(ICUTOOLS)\genrb\$(CFG)"}genrb.exe : ucadata.dat qchk.dat fchk.dat uprops.dat unorm.dat
|
||||
|
||||
ucadata.dat : uprops.dat qchk.dat fchk.dat uprops.dat
|
||||
ucadata.dat : uprops.dat qchk.dat fchk.dat unorm.dat
|
||||
|
||||
# Dependencies on the tools
|
||||
convrtrs.txt : {"$(ICUTOOLS)\gencnval\$(CFG)"}gencnval.exe
|
||||
|
||||
tz.txt : {"$(ICUTOOLS)\gentz\$(CFG)"}gentz.exe
|
||||
|
||||
uprops.dat unames.dat cnvalias.dat tz.dat ucadata.dat invuca.dat: {"$(ICUTOOLS)\genccode\$(CFG)"}genccode.exe
|
||||
uprops.dat unames.dat unorm.dat cnvalias.dat tz.dat ucadata.dat invuca.dat: {"$(ICUTOOLS)\genccode\$(CFG)"}genccode.exe
|
||||
|
||||
|
||||
$(TRANSLIT_SOURCE) $(GENRB_SOURCE) : {"$(ICUTOOLS)\genrb\$(CFG)"}genrb.exe ucadata.dat qchk.dat fchk.dat uprops.dat
|
||||
$(TRANSLIT_SOURCE) $(GENRB_SOURCE) : {"$(ICUTOOLS)\genrb\$(CFG)"}genrb.exe ucadata.dat qchk.dat fchk.dat uprops.dat unorm.dat
|
||||
|
||||
$(UCM_SOURCE) : {"$(ICUTOOLS)\makeconv\$(CFG)"}makeconv.exe {"$(ICUTOOLS)\genccode\$(CFG)"}genccode.exe
|
||||
|
|
|
@ -137,7 +137,7 @@ testdata: ucadata.dat $(RB_FILES) {"$(ICUTOOLS)\genrb\$(CFG)"}genrb.exe
|
|||
BRK_FILES = "$(ICUDBLD)\sent.brk" "$(ICUDBLD)\char.brk" "$(ICUDBLD)\line.brk" "$(ICUDBLD)\word.brk" "$(ICUDBLD)\line_th.brk" "$(ICUDBLD)\word_th.brk"
|
||||
|
||||
#invoke pkgdata
|
||||
"$(DLL_OUTPUT)\$(U_ICUDATA_NAME).dll" : $(CNV_FILES) $(BRK_FILES) qchk.dat fchk.dat uprops.dat unames.dat cnvalias.dat tz.dat ucadata.dat invuca.dat $(ALL_RES) icudata.res
|
||||
"$(DLL_OUTPUT)\$(U_ICUDATA_NAME).dll" : $(CNV_FILES) $(BRK_FILES) qchk.dat fchk.dat uprops.dat unames.dat unorm.dat cnvalias.dat tz.dat ucadata.dat invuca.dat $(ALL_RES) icudata.res
|
||||
@echo Building icu data
|
||||
@cd "$(ICUDBLD)"
|
||||
"$(ICUTOOLS)\pkgdata\$(CFG)\pkgdata" -e icudata -v -T . -m dll -c -p $(U_ICUDATA_NAME) -O "$(PKGOPT)" -d "$(DLL_OUTPUT)" -s . <<pkgdatain.txt
|
||||
|
@ -145,6 +145,7 @@ qchk.dat
|
|||
fchk.dat
|
||||
uprops.dat
|
||||
unames.dat
|
||||
unorm.dat
|
||||
cnvalias.dat
|
||||
tz.dat
|
||||
ucadata.dat
|
||||
|
@ -196,6 +197,7 @@ CLEAN :
|
|||
-@erase "fchk*.*"
|
||||
-@erase "uprops*.*"
|
||||
-@erase "unames*.*"
|
||||
-@erase "unorm*.*"
|
||||
-@erase "cnvalias*.*"
|
||||
-@erase "tz*.*"
|
||||
-@erase "ibm*_cnv.c"
|
||||
|
@ -266,7 +268,7 @@ fchk.dat: "$(ICUDATA)\unidata\FCDCheck.txt" "$(ICUTOOLS)\genfchk\$(CFG)\genfchk.
|
|||
unames.dat: {"$(ICUDATA)"}\unidata\UnicodeData.txt "$(ICUTOOLS)\gennames\$(CFG)\gennames.exe"
|
||||
@echo Creating data file for Unicode Names
|
||||
@set ICU_DATA=$(ICUDBLD)
|
||||
@"$(ICUTOOLS)\gennames\$(CFG)\gennames" $(ICUDATA)\unidata\UnicodeData.txt
|
||||
@"$(ICUTOOLS)\gennames\$(CFG)\gennames" -1 $(ICUDATA)\unidata\UnicodeData.txt
|
||||
|
||||
# Targets for uprops.dat
|
||||
uprops.dat: "$(ICUDATA)\unidata\UnicodeData.txt" "$(ICUTOOLS)\genprops\$(CFG)\genprops.exe"
|
||||
|
@ -274,6 +276,12 @@ uprops.dat: "$(ICUDATA)\unidata\UnicodeData.txt" "$(ICUTOOLS)\genprops\$(CFG)\ge
|
|||
@set ICU_DATA=$(ICUDBLD)
|
||||
@"$(ICUTOOLS)\genprops\$(CFG)\genprops" -s "$(ICUDATA)\unidata"
|
||||
|
||||
# Targets for unorm.dat
|
||||
unorm.dat: "$(ICUDATA)\unidata\UnicodeData.txt" "$(ICUDATA)\unidata\DerivedNormalizationProperties.txt" "$(ICUTOOLS)\gennorm\$(CFG)\gennorm.exe"
|
||||
@echo Creating data file for Unicode Normalization
|
||||
@set ICU_DATA=$(ICUDBLD)
|
||||
@"$(ICUTOOLS)\gennorm\$(CFG)\gennorm" -s "$(ICUDATA)\unidata"
|
||||
|
||||
# Targets for converters
|
||||
cnvalias.dat : {"$(ICUDATA)"}\convrtrs.txt "$(ICUTOOLS)\gencnval\$(CFG)\gencnval.exe"
|
||||
@echo Creating data file for Converter Aliases
|
||||
|
@ -294,18 +302,18 @@ ucadata.dat: "$(ICUDATA)\unidata\FractionalUCA.txt" "$(ICUTOOLS)\genuca\$(CFG)\g
|
|||
|
||||
invuca.dat: ucadata.dat
|
||||
|
||||
{"$(ICUTOOLS)\genrb\$(CFG)"}genrb.exe : ucadata.dat qchk.dat fchk.dat uprops.dat
|
||||
{"$(ICUTOOLS)\genrb\$(CFG)"}genrb.exe : ucadata.dat qchk.dat fchk.dat uprops.dat unorm.dat
|
||||
|
||||
ucadata.dat : uprops.dat qchk.dat fchk.dat uprops.dat
|
||||
ucadata.dat : uprops.dat qchk.dat fchk.dat unorm.dat
|
||||
|
||||
# Dependencies on the tools
|
||||
convrtrs.txt : {"$(ICUTOOLS)\gencnval\$(CFG)"}gencnval.exe
|
||||
|
||||
tz.txt : {"$(ICUTOOLS)\gentz\$(CFG)"}gentz.exe
|
||||
|
||||
uprops.dat unames.dat cnvalias.dat tz.dat ucadata.dat invuca.dat: {"$(ICUTOOLS)\genccode\$(CFG)"}genccode.exe
|
||||
uprops.dat unames.dat unorm.dat cnvalias.dat tz.dat ucadata.dat invuca.dat: {"$(ICUTOOLS)\genccode\$(CFG)"}genccode.exe
|
||||
|
||||
|
||||
$(TRANSLIT_SOURCE) $(GENRB_SOURCE) : {"$(ICUTOOLS)\genrb\$(CFG)"}genrb.exe ucadata.dat qchk.dat fchk.dat uprops.dat
|
||||
$(TRANSLIT_SOURCE) $(GENRB_SOURCE) : {"$(ICUTOOLS)\genrb\$(CFG)"}genrb.exe ucadata.dat qchk.dat fchk.dat uprops.dat unorm.dat
|
||||
|
||||
$(UCM_SOURCE) : {"$(ICUTOOLS)\makeconv\$(CFG)"}makeconv.exe {"$(ICUTOOLS)\genccode\$(CFG)"}genccode.exe
|
||||
|
|
|
@ -59,7 +59,7 @@ VERSION = @VERSION@
|
|||
|
||||
|
||||
SUBDIRS = ctestfw toolutil makeconv genrb genuca \
|
||||
genccode genqchk genfchk genprops gennames gencmn gencnval gentz gentest pkgdata
|
||||
genccode genqchk genfchk genprops gennames gennorm gencmn gencnval gentz gentest pkgdata
|
||||
|
||||
## List of phony targets
|
||||
.PHONY : all all-local all-recursive install install-local install-files install-dlls build-cmnfile build-dll \
|
||||
|
|
8
icu4c/source/tools/gennorm/.cvsignore
Normal file
8
icu4c/source/tools/gennorm/.cvsignore
Normal file
|
@ -0,0 +1,8 @@
|
|||
tmp
|
||||
Debug
|
||||
Release
|
||||
Makefile
|
||||
*.d
|
||||
*.pdb
|
||||
*.plg
|
||||
gennorm
|
94
icu4c/source/tools/gennorm/Makefile.in
Normal file
94
icu4c/source/tools/gennorm/Makefile.in
Normal file
|
@ -0,0 +1,94 @@
|
|||
## Makefile.in for ICU - tools/gennorm
|
||||
## Copyright (c) 2001, International Business Machines Corporation and
|
||||
## others. All Rights Reserved.
|
||||
## Steven R. Loomis/Markus W. Scherer
|
||||
|
||||
## Source directory information
|
||||
srcdir = @srcdir@
|
||||
top_srcdir = @top_srcdir@
|
||||
|
||||
top_builddir = ../..
|
||||
|
||||
include $(top_builddir)/icudefs.mk
|
||||
|
||||
## Platform-specific setup
|
||||
include @platform_make_fragment@
|
||||
|
||||
##
|
||||
|
||||
## Build directory information
|
||||
subdir = tools/gennorm
|
||||
|
||||
ICUDATADIR=$(top_builddir)/data
|
||||
UNICODEDATADIR=$(top_srcdir)/../data/unidata
|
||||
|
||||
## Extra files to remove for 'make clean'
|
||||
CLEANFILES = *~ $(DEPS) $(RES_FILES) $(TEST_FILES)
|
||||
|
||||
## Target information
|
||||
TARGET = gennorm
|
||||
|
||||
DEFS = @DEFS@
|
||||
CPPFLAGS = @CPPFLAGS@ -I$(top_builddir)/common -I$(top_srcdir)/common -I$(srcdir)/../toolutil
|
||||
CFLAGS = @CFLAGS@
|
||||
CXXFLAGS = @CXXFLAGS@
|
||||
ENABLE_RPATH = @ENABLE_RPATH@
|
||||
ifeq ($(ENABLE_RPATH),YES)
|
||||
RPATHLDFLAGS = $(LD_RPATH)$(LD_RPATH_PRE)$(libdir)
|
||||
endif
|
||||
LDFLAGS = @LDFLAGS@ $(RPATHLDFLAGS)
|
||||
INVOKE = $(LDLIBRARYPATH_ENVVAR)=$(top_builddir)/common:$(top_builddir)/tools/toolutil:$$$(LDLIBRARYPATH_ENVVAR)
|
||||
LIBS = $(LIBICUTOOLUTIL) @LIBS@ @LIB_M@
|
||||
|
||||
OBJECTS = gennorm.o store.o
|
||||
|
||||
DEPS = $(OBJECTS:.o=.d)
|
||||
|
||||
## List of phony targets
|
||||
.PHONY : all all-local install install-local clean clean-local \
|
||||
distclean distclean-local dist dist-local check \
|
||||
check-local build-data
|
||||
|
||||
## Clear suffix list
|
||||
.SUFFIXES :
|
||||
|
||||
## List of standard targets
|
||||
all: all-local
|
||||
install: install-local
|
||||
clean: clean-local
|
||||
distclean : distclean-local
|
||||
dist: dist-local
|
||||
check: all check-local
|
||||
|
||||
all-local: $(TARGET) build-data
|
||||
|
||||
install-local: all-local
|
||||
$(MKINSTALLDIRS) $(DESTDIR)$(sbindir)
|
||||
$(INSTALL) $(TARGET) $(DESTDIR)$(sbindir)/$(TARGET)
|
||||
|
||||
dist-local:
|
||||
|
||||
clean-local:
|
||||
test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
|
||||
$(RMV) $(TARGET) $(OBJECTS)
|
||||
|
||||
distclean-local: clean-local
|
||||
$(RMV) Makefile
|
||||
|
||||
check-local: all-local
|
||||
|
||||
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
cd $(top_builddir) \
|
||||
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
|
||||
|
||||
$(TARGET) : $(OBJECTS)
|
||||
$(LINK.cc) -o $@ $^ $(LIBS)
|
||||
|
||||
ifeq (,$(MAKECMDGOALS))
|
||||
-include $(DEPS)
|
||||
else
|
||||
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
|
||||
-include $(DEPS)
|
||||
endif
|
||||
endif
|
||||
|
471
icu4c/source/tools/gennorm/gennorm.c
Normal file
471
icu4c/source/tools/gennorm/gennorm.c
Normal file
|
@ -0,0 +1,471 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: gennorm.c
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2001may25
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* This program reads the Unicode character database text file,
|
||||
* parses it, and extracts the data for normalization.
|
||||
* It then preprocesses it and writes a binary file for efficient use
|
||||
* in various Unicode text normalization processes.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unewdata.h"
|
||||
#include "uoptions.h"
|
||||
#include "uparse.h"
|
||||
#include "unormimp.h"
|
||||
|
||||
U_CDECL_BEGIN
|
||||
#include "gennorm.h"
|
||||
U_CDECL_END
|
||||
|
||||
UBool beVerbose=FALSE, haveCopyright=TRUE;
|
||||
|
||||
/* prototypes --------------------------------------------------------------- */
|
||||
|
||||
static void
|
||||
parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode);
|
||||
|
||||
static void
|
||||
parseDB(const char *filename, UErrorCode *pErrorCode);
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
static UOption options[]={
|
||||
UOPTION_HELP_H,
|
||||
UOPTION_HELP_QUESTION_MARK,
|
||||
UOPTION_VERBOSE,
|
||||
UOPTION_COPYRIGHT,
|
||||
UOPTION_DESTDIR,
|
||||
UOPTION_SOURCEDIR,
|
||||
{ "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }
|
||||
};
|
||||
|
||||
extern int
|
||||
main(int argc, char* argv[]) {
|
||||
char filename[300];
|
||||
const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
|
||||
char *basename=NULL;
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
|
||||
/* preset then read command line options */
|
||||
options[4].value=u_getDataDirectory();
|
||||
options[5].value="";
|
||||
options[6].value="3.0.0";
|
||||
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
|
||||
|
||||
/* error handling, printing usage message */
|
||||
if(argc<0) {
|
||||
fprintf(stderr,
|
||||
"error in command line argument \"%s\"\n",
|
||||
argv[-argc]);
|
||||
}
|
||||
if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
|
||||
fprintf(stderr,
|
||||
"usage: %s [-options] [suffix]\n"
|
||||
"\tread the UnicodeData.txt file and other Unicode properties files and\n"
|
||||
"\tcreate a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n"
|
||||
"\toptions:\n"
|
||||
"\t\t-h or -? or --help this usage text\n"
|
||||
"\t\t-v or --verbose verbose output\n"
|
||||
"\t\t-c or --copyright include a copyright notice\n"
|
||||
"\t\t-d or --destdir destination directory, followed by the path\n"
|
||||
"\t\t-s or --sourcedir source directory, followed by the path\n"
|
||||
"\t\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
|
||||
"\t\tsuffix suffix that is to be appended with a '-'\n"
|
||||
"\t\t to the source file basenames before opening;\n"
|
||||
"\t\t 'gennorm new' will read UnicodeData-new.txt etc.\n",
|
||||
argv[0]);
|
||||
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
|
||||
}
|
||||
|
||||
/* get the options values */
|
||||
beVerbose=options[2].doesOccur;
|
||||
haveCopyright=options[3].doesOccur;
|
||||
srcDir=options[5].value;
|
||||
destDir=options[4].value;
|
||||
|
||||
if(argc>=2) {
|
||||
suffix=argv[1];
|
||||
} else {
|
||||
suffix=NULL;
|
||||
}
|
||||
|
||||
setUnicodeVersion(options[6].value);
|
||||
|
||||
/* prepare the filename beginning with the source dir */
|
||||
uprv_strcpy(filename, srcDir);
|
||||
basename=filename+uprv_strlen(filename);
|
||||
if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
|
||||
*basename++=U_FILE_SEP_CHAR;
|
||||
}
|
||||
|
||||
/* initialize */
|
||||
init();
|
||||
|
||||
/* process DerivedNormalizationProperties.txt (quick check flags) */
|
||||
if(suffix==NULL) {
|
||||
uprv_strcpy(basename, "DerivedNormalizationProperties.txt");
|
||||
} else {
|
||||
uprv_strcpy(basename, "DerivedNormalizationProperties");
|
||||
basename[30]='-';
|
||||
uprv_strcpy(basename+31, suffix);
|
||||
uprv_strcat(basename+31, ".txt");
|
||||
}
|
||||
parseDerivedNormalizationProperties(filename, &errorCode);
|
||||
|
||||
/* process UnicodeData.txt */
|
||||
if(suffix==NULL) {
|
||||
uprv_strcpy(basename, "UnicodeData.txt");
|
||||
} else {
|
||||
uprv_strcpy(basename, "UnicodeData");
|
||||
basename[11]='-';
|
||||
uprv_strcpy(basename+12, suffix);
|
||||
uprv_strcat(basename+12, ".txt");
|
||||
}
|
||||
parseDB(filename, &errorCode);
|
||||
|
||||
/* process parsed data */
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
processData();
|
||||
|
||||
/* write the properties data file */
|
||||
generateData(destDir);
|
||||
}
|
||||
|
||||
return errorCode;
|
||||
}
|
||||
|
||||
/* parsing helpers ---------------------------------------------------------- */
|
||||
|
||||
/*
 * Skip leading spaces and horizontal tabs.
 *
 * @param s NUL-terminated string
 * @return pointer to the first character in s that is neither ' ' nor '\t'
 *         (this may be the terminating NUL)
 */
static const char *
skipWhitespace(const char *s) {
    for(; *s==' ' || *s=='\t'; ++s) {
        /* nothing to do: just advance */
    }
    return s;
}
|
||||
|
||||
/*
|
||||
* parse a list of code points
|
||||
* store them as a UTF-32 string in dest[destCapacity] with the string length in dest[0]
|
||||
* set the first code point in *pFirst
|
||||
* return the number of code points
|
||||
*/
|
||||
static int32_t
|
||||
parseCodePoints(const char *s,
|
||||
uint32_t *dest, int32_t destCapacity,
|
||||
UErrorCode *pErrorCode) {
|
||||
char *end;
|
||||
uint32_t value;
|
||||
int32_t count;
|
||||
|
||||
count=0;
|
||||
for(;;) {
|
||||
s=skipWhitespace(s);
|
||||
if(*s==';' || *s==0) {
|
||||
return count;
|
||||
}
|
||||
|
||||
/* read one code point */
|
||||
value=(uint32_t)uprv_strtoul(s, &end, 16);
|
||||
if(end<=s || (*end!=' ' && *end!='\t' && *end!=';') || value>=0x110000) {
|
||||
fprintf(stderr, "gennorm: syntax error parsing code point at %s\n", s);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* overflow? */
|
||||
if(count>=destCapacity) {
|
||||
fprintf(stderr, "gennorm: code point sequence too long at at %s\n", s);
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* append it to the destination array */
|
||||
dest[count++]=value;
|
||||
|
||||
/* go to the following characters */
|
||||
s=end;
|
||||
}
|
||||
}
|
||||
|
||||
/* read a range like start or start..end */
|
||||
static int32_t
|
||||
parseCodePointRange(const char *s,
|
||||
uint32_t *pStart, uint32_t *pEnd,
|
||||
UErrorCode *pErrorCode) {
|
||||
char *end;
|
||||
uint32_t value;
|
||||
|
||||
s=skipWhitespace(s);
|
||||
if(*s==';' || *s==0) {
|
||||
fprintf(stderr, "gennorm: syntax error parsing range at %s - empty field\n", s);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* read the start code point */
|
||||
value=(uint32_t)uprv_strtoul(s, &end, 16);
|
||||
if(end<=s || (*end!=' ' && *end!='\t' && *end!='.' && *end!=';') || value>=0x110000) {
|
||||
fprintf(stderr, "gennorm: syntax error parsing range start code point at %s\n", s);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
return -1;
|
||||
}
|
||||
*pStart=*pEnd=value;
|
||||
|
||||
/* is there a "..end"? */
|
||||
s=skipWhitespace(end);
|
||||
if(*s==';' || *s==0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if(*s!='.' || s[1]!='.') {
|
||||
fprintf(stderr, "gennorm: syntax error parsing range at %s\n", s);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
return -1;
|
||||
}
|
||||
s+=2;
|
||||
|
||||
/* read the end code point */
|
||||
value=(uint32_t)uprv_strtoul(s, &end, 16);
|
||||
if(end<=s || (*end!=' ' && *end!='\t' && *end!=';') || value>=0x110000) {
|
||||
fprintf(stderr, "gennorm: syntax error parsing range end code point at %s\n", s);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
return -1;
|
||||
}
|
||||
*pEnd=value;
|
||||
|
||||
/* is this a valid range? */
|
||||
if(value<*pStart) {
|
||||
fprintf(stderr, "gennorm: syntax error parsing range at %s - not a valid range\n", s);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* no garbage after that? */
|
||||
s=skipWhitespace(end);
|
||||
if(*s==';' || *s==0) {
|
||||
return value-*pStart+1;
|
||||
} else {
|
||||
fprintf(stderr, "gennorm: syntax error parsing range at %s\n", s);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/* parser for DerivedNormalizationProperties.txt ---------------------------- */
|
||||
|
||||
static void
|
||||
derivedNormalizationPropertiesLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
char *s;
|
||||
uint32_t start, end;
|
||||
int32_t count;
|
||||
uint8_t qcFlags;
|
||||
|
||||
/* get code point range */
|
||||
count=parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]);
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
|
||||
/* ignore hangul - handle explicitly */
|
||||
if(start==0xac00) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* get property - ignore unrecognized ones */
|
||||
s=(char *)skipWhitespace(fields[1][0]);
|
||||
if(*s=='N' && s[1]=='F') {
|
||||
qcFlags=0x11;
|
||||
s+=2;
|
||||
if(*s=='K') {
|
||||
qcFlags<<=1;
|
||||
++s;
|
||||
}
|
||||
|
||||
if(*s=='C' && s[1]=='_') {
|
||||
s+=2;
|
||||
} else if(*s=='D' && s[1]=='_') {
|
||||
qcFlags<<=2;
|
||||
s+=2;
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
|
||||
if(0==uprv_memcmp(s, "NO", 2)) {
|
||||
qcFlags&=0xf;
|
||||
} else if(0==uprv_memcmp(s, "MAYBE", 5)) {
|
||||
qcFlags&=0x30;
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
|
||||
/* set this flag for all code points in this range */
|
||||
while(start<=end) {
|
||||
setQCFlags(start++, qcFlags);
|
||||
}
|
||||
} else if(0==uprv_memcmp(s, "Comp_Ex", 7)) {
|
||||
while(start<=end) {
|
||||
setCompositionExclusion(start++);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode) {
|
||||
char *fields[2][2];
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode);
|
||||
}
|
||||
|
||||
/* parser for UnicodeData.txt ----------------------------------------------- */
|
||||
|
||||
static void
|
||||
unicodeDataLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
uint32_t decomp[40];
|
||||
Norm norm;
|
||||
const char *s;
|
||||
char *end;
|
||||
uint32_t code, value;
|
||||
int32_t length;
|
||||
UBool isCompat, something=FALSE;
|
||||
|
||||
/* ignore First and Last entries for ranges */
|
||||
if( *fields[1][0]=='<' &&
|
||||
(length=(fields[1][1]-fields[1][0]))>=9 &&
|
||||
(0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
|
||||
) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* reset the properties */
|
||||
uprv_memset(&norm, 0, sizeof(Norm));
|
||||
|
||||
/* get the character code, field 0 */
|
||||
code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
|
||||
if(end<=fields[0][0] || end!=fields[0][1]) {
|
||||
fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
/* get canonical combining class, field 3 */
|
||||
value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10);
|
||||
if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
|
||||
fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
if(value>0) {
|
||||
norm.udataCC=(uint8_t)value;
|
||||
something=TRUE;
|
||||
}
|
||||
|
||||
/* get the decomposition, field 5 */
|
||||
if(fields[5][0]<fields[5][1]) {
|
||||
if(*(s=fields[5][0])=='<') {
|
||||
++s;
|
||||
isCompat=TRUE;
|
||||
|
||||
/* skip and ignore the compatibility type name */
|
||||
do {
|
||||
if(s==fields[5][1]) {
|
||||
/* missing '>' */
|
||||
fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
} while(*s++!='>');
|
||||
} else {
|
||||
isCompat=FALSE;
|
||||
}
|
||||
|
||||
/* parse the decomposition string */
|
||||
length=parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
|
||||
/* store the string */
|
||||
if(length>0) {
|
||||
something=TRUE;
|
||||
if(isCompat) {
|
||||
norm.lenNFKD=(uint8_t)length;
|
||||
norm.nfkd=decomp;
|
||||
} else {
|
||||
if(length>2) {
|
||||
fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n",
|
||||
code, length);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
norm.lenNFD=(uint8_t)length;
|
||||
norm.nfd=decomp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* check for non-character code points */
|
||||
if((code&0xfffe)==0xfffe || (uint32_t)(code-0xfdd0)<0x20 || code>0x10ffff) {
|
||||
fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n",
|
||||
code);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
if(something) {
|
||||
/* there are normalization values, so store them */
|
||||
if(beVerbose) {
|
||||
printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n",
|
||||
code, norm.udataCC, norm.lenNFD, norm.lenNFKD);
|
||||
}
|
||||
storeNorm(code, &norm);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
parseDB(const char *filename, UErrorCode *pErrorCode) {
|
||||
char *fields[15][2];
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
|
||||
}
|
||||
|
||||
/*
|
||||
* Hey, Emacs, please set the following:
|
||||
*
|
||||
* Local Variables:
|
||||
* indent-tabs-mode: nil
|
||||
* End:
|
||||
*
|
||||
*/
|
128
icu4c/source/tools/gennorm/gennorm.dsp
Normal file
128
icu4c/source/tools/gennorm/gennorm.dsp
Normal file
|
@ -0,0 +1,128 @@
|
|||
# Microsoft Developer Studio Project File - Name="gennorm" - Package Owner=<4>
|
||||
# Microsoft Developer Studio Generated Build File, Format Version 6.00
|
||||
# ** DO NOT EDIT **
|
||||
|
||||
# TARGTYPE "Win32 (x86) Console Application" 0x0103
|
||||
|
||||
CFG=gennorm - Win32 Debug
|
||||
!MESSAGE This is not a valid makefile. To build this project using NMAKE,
|
||||
!MESSAGE use the Export Makefile command and run
|
||||
!MESSAGE
|
||||
!MESSAGE NMAKE /f "gennorm.mak".
|
||||
!MESSAGE
|
||||
!MESSAGE You can specify a configuration when running NMAKE
|
||||
!MESSAGE by defining the macro CFG on the command line. For example:
|
||||
!MESSAGE
|
||||
!MESSAGE NMAKE /f "gennorm.mak" CFG="gennorm - Win32 Debug"
|
||||
!MESSAGE
|
||||
!MESSAGE Possible choices for configuration are:
|
||||
!MESSAGE
|
||||
!MESSAGE "gennorm - Win32 Release" (based on "Win32 (x86) Console Application")
|
||||
!MESSAGE "gennorm - Win32 Debug" (based on "Win32 (x86) Console Application")
|
||||
!MESSAGE
|
||||
|
||||
# Begin Project
|
||||
# PROP AllowPerConfigDependencies 0
|
||||
# PROP Scc_ProjName ""
|
||||
# PROP Scc_LocalPath ""
|
||||
CPP=cl.exe
|
||||
RSC=rc.exe
|
||||
|
||||
!IF "$(CFG)" == "gennorm - Win32 Release"
|
||||
|
||||
# PROP BASE Use_MFC 0
|
||||
# PROP BASE Use_Debug_Libraries 0
|
||||
# PROP BASE Output_Dir "Release"
|
||||
# PROP BASE Intermediate_Dir "Release"
|
||||
# PROP BASE Target_Dir ""
|
||||
# PROP Use_MFC 0
|
||||
# PROP Use_Debug_Libraries 0
|
||||
# PROP Output_Dir "Release"
|
||||
# PROP Intermediate_Dir "Release"
|
||||
# PROP Ignore_Export_Lib 0
|
||||
# PROP Target_Dir ""
|
||||
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
|
||||
# ADD CPP /nologo /MD /Za /W3 /GX /O2 /I "..\toolutil" /I "..\..\common" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
|
||||
# ADD BASE RSC /l 0x409 /d "NDEBUG"
|
||||
# ADD RSC /l 0x409 /d "NDEBUG"
|
||||
BSC32=bscmake.exe
|
||||
# ADD BASE BSC32 /nologo
|
||||
# ADD BSC32 /nologo
|
||||
LINK32=link.exe
|
||||
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
|
||||
# ADD LINK32 icutu.lib /nologo /subsystem:console /machine:I386 /libpath:"..\toolutil\Release" /libpath:"..\..\..\lib\Release" /libpath:"..\..\..\lib"
|
||||
# Begin Custom Build
|
||||
InputPath=.\Release\gennorm.exe
|
||||
InputName=gennorm
|
||||
SOURCE="$(InputPath)"
|
||||
|
||||
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy $(InputPath) ..\..\..\bin
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ELSEIF "$(CFG)" == "gennorm - Win32 Debug"
|
||||
|
||||
# PROP BASE Use_MFC 0
|
||||
# PROP BASE Use_Debug_Libraries 1
|
||||
# PROP BASE Output_Dir "Debug"
|
||||
# PROP BASE Intermediate_Dir "Debug"
|
||||
# PROP BASE Target_Dir ""
|
||||
# PROP Use_MFC 0
|
||||
# PROP Use_Debug_Libraries 1
|
||||
# PROP Output_Dir "Debug"
|
||||
# PROP Intermediate_Dir "Debug"
|
||||
# PROP Ignore_Export_Lib 0
|
||||
# PROP Target_Dir ""
|
||||
# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
|
||||
# ADD CPP /nologo /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\toolutil" /I "..\..\common" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
|
||||
# ADD BASE RSC /l 0x409 /d "_DEBUG"
|
||||
# ADD RSC /l 0x409 /d "_DEBUG"
|
||||
BSC32=bscmake.exe
|
||||
# ADD BASE BSC32 /nologo
|
||||
# ADD BSC32 /nologo
|
||||
LINK32=link.exe
|
||||
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
|
||||
# ADD LINK32 icutud.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /libpath:"..\toolutil\Debug" /libpath:"..\..\..\lib\Debug" /libpath:"..\..\..\lib"
|
||||
# Begin Custom Build
|
||||
InputPath=.\Debug\gennorm.exe
|
||||
InputName=gennorm
|
||||
SOURCE="$(InputPath)"
|
||||
|
||||
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy $(InputPath) ..\..\..\bin
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ENDIF
|
||||
|
||||
# Begin Target
|
||||
|
||||
# Name "gennorm - Win32 Release"
|
||||
# Name "gennorm - Win32 Debug"
|
||||
# Begin Group "Source Files"
|
||||
|
||||
# PROP Default_Filter "c;cpp;rc"
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\gennorm.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\store.c
|
||||
# End Source File
|
||||
# End Group
|
||||
# Begin Group "Header Files"
|
||||
|
||||
# PROP Default_Filter "h"
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\gennorm.h
|
||||
# End Source File
|
||||
# End Group
|
||||
# Begin Group "Resource Files"
|
||||
|
||||
# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
|
||||
# End Group
|
||||
# End Target
|
||||
# End Project
|
63
icu4c/source/tools/gennorm/gennorm.h
Normal file
63
icu4c/source/tools/gennorm/gennorm.h
Normal file
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
*   file name:  gennorm.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 1999dec13
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
/*
 * Use a guard macro of its own: the guard name copied from genprops.h would
 * make this header expand to nothing if both headers were ever included in
 * the same translation unit.
 */
#ifndef __GENNORM_H__
#define __GENNORM_H__

#include "unicode/utypes.h"

/* file definitions */
#define DATA_NAME "unorm"
#define DATA_TYPE "dat"

/*
 * data structure that holds the normalization properties for one or more
 * code point(s) at build time
 *
 * The UnicodeData.txt parser fills in:
 *   udataCC  - canonical combining class (0..255)
 *   lenNFD   - length of the canonical decomposition (at most 2), 0 if none
 *   lenNFKD  - length of the compatibility decomposition, 0 if none
 *   nfd/nfkd - the decomposition as an array of code points, or NULL
 * The remaining fields are left zeroed by the parser;
 * NOTE(review): presumably they are computed in store.c - confirm.
 */
typedef struct Norm {
    uint8_t udataCC, lenNFD, lenNFKD;
    uint8_t qcFlags, combiningFlags;
    uint16_t canonBothCCs, compatBothCCs, combiningIndex, specialTag;
    uint32_t *nfd, *nfkd;
} Norm;

/* global flags */
extern UBool beVerbose, haveCopyright;

/* prototypes */
extern void
setUnicodeVersion(const char *v);

extern void
init(void);

/* store the normalization values that were parsed for one code point */
extern void
storeNorm(uint32_t code, Norm *norm);

/* set quick check flags for one code point (from DerivedNormalizationProperties.txt) */
extern void
setQCFlags(uint32_t code, uint8_t qcFlags);

/* mark one code point as a composition exclusion (Comp_Ex) */
extern void
setCompositionExclusion(uint32_t code);

/* process the parsed data; called after parsing and before generateData() */
extern void
processData(void);

/* write the data file */
extern void
generateData(const char *dataDir);

#endif
|
||||
|
1428
icu4c/source/tools/gennorm/store.c
Normal file
1428
icu4c/source/tools/gennorm/store.c
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Reference in a new issue