From 58d21ee915b1168923ebd3ba083dcd3ec4d9a026 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Fri, 23 Jul 2010 23:51:14 +0000 Subject: [PATCH] ICU-7264 scripts for data file generation X-SVN-Rev: 28364 --- tools/unicode/c/genprops/store.c | 6 +++-- tools/unicode/makedefs.sh | 16 ++++++++++++++ tools/unicode/makeprops.sh | 38 ++++++++++++++++++++++++++++++++ tools/unicode/makeuca.sh | 15 +++++++++++++ tools/unicode/py/idna2nrm.py | 15 +++++++------ 5 files changed, 81 insertions(+), 9 deletions(-) create mode 100755 tools/unicode/makedefs.sh create mode 100755 tools/unicode/makeprops.sh create mode 100755 tools/unicode/makeuca.sh diff --git a/tools/unicode/c/genprops/store.c b/tools/unicode/c/genprops/store.c index 7adc9e75325..4499073a956 100644 --- a/tools/unicode/c/genprops/store.c +++ b/tools/unicode/c/genprops/store.c @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2009, International Business Machines +* Copyright (C) 1999-2010, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -466,7 +466,9 @@ generateData(const char *dataDir, UBool csource) { pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); if(U_FAILURE(errorCode)) { - fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode)); + fprintf(stderr, "genprops: udata_create(%s, %s.%s) failed - %s\n", + dataDir, DATA_NAME, DATA_TYPE, + u_errorName(errorCode)); exit(errorCode); } diff --git a/tools/unicode/makedefs.sh b/tools/unicode/makedefs.sh new file mode 100755 index 00000000000..9496164d634 --- /dev/null +++ b/tools/unicode/makedefs.sh @@ -0,0 +1,16 @@ +# Copyright (C) 2010, International Business Machines +# Corporation and others. All Rights Reserved. +# +# Basic definitions for building ICU Unicode data. 
+# Sourced from makeprops.sh for example. +UNICODE_VERSION=6.0 +# Assume that there are parallel src & bld trees with the Unicode tools +# source files and the out-of-source build files. +# Assume that the current folder is some/path/src/unicode +UNITOOLS_BLD=../../bld/unicode +# The sourcing script must define ICU_SRC and ICU_BLD for the ICU library +# source files and the out-of-source build files. +UNIDATA=$ICU_SRC/source/data/unidata +COMMON=$ICU_SRC/source/common +SRC_DATA_IN=$ICU_SRC/source/data/in +BLD_DATA_FILES=$ICU_BLD/data/out/build/icudt45l diff --git a/tools/unicode/makeprops.sh b/tools/unicode/makeprops.sh new file mode 100755 index 00000000000..acb62cbbaa2 --- /dev/null +++ b/tools/unicode/makeprops.sh @@ -0,0 +1,38 @@ +# Copyright (C) 2010, International Business Machines +# Corporation and others. All Rights Reserved. +# +# Parses Unicode Character Database files and builds ICU core properties files. +# +# Invoke as +# ./makeprops.sh path/to/ICU/src/tree path/to/ICU/build/tree +ICU_SRC=$1 +ICU_BLD=$2 +source ./makedefs.sh + +# uprops.icu +$UNITOOLS_BLD/c/genprops/genprops -d $SRC_DATA_IN -s $UNIDATA -i $BLD_DATA_FILES -u $UNICODE_VERSION +$UNITOOLS_BLD/c/genprops/genprops -d $COMMON --csource -s $UNIDATA -i $BLD_DATA_FILES -u $UNICODE_VERSION + +# ubidi.icu +$UNITOOLS_BLD/c/genbidi/genbidi -d $SRC_DATA_IN -s $UNIDATA -i $BLD_DATA_FILES -u $UNICODE_VERSION +$UNITOOLS_BLD/c/genbidi/genbidi -d $COMMON --csource -s $UNIDATA -i $BLD_DATA_FILES -u $UNICODE_VERSION + +# ucase.icu +$UNITOOLS_BLD/c/gencase/gencase -d $SRC_DATA_IN -s $UNIDATA -i $BLD_DATA_FILES -u $UNICODE_VERSION +$UNITOOLS_BLD/c/gencase/gencase -d $COMMON --csource -s $UNIDATA -i $BLD_DATA_FILES -u $UNICODE_VERSION + +# unames.icu +$UNITOOLS_BLD/c/gennames/gennames -d $SRC_DATA_IN -1 -q $UNIDATA/UnicodeData.txt $UNIDATA/NameAliases.txt -u $UNICODE_VERSION + +# unidata/norm2/*.txt +$UNITOOLS_BLD/c/gennorm/gennorm -d $UNIDATA/norm2 -s $UNIDATA -i $BLD_DATA_FILES + +# *.nrm +export 
LD_LIBRARY_PATH=$ICU_BLD/lib +$ICU_BLD/bin/gennorm2 -o $SRC_DATA_IN/nfc.nrm -s $UNIDATA/norm2 nfc.txt -u $UNICODE_VERSION +$ICU_BLD/bin/gennorm2 -o $SRC_DATA_IN/nfkc.nrm -s $UNIDATA/norm2 nfkc.txt -u $UNICODE_VERSION +$ICU_BLD/bin/gennorm2 -o $SRC_DATA_IN/nfkc_cf.nrm -s $UNIDATA/norm2 nfkc.txt nfkc_cf.txt -u $UNICODE_VERSION +$ICU_BLD/bin/gennorm2 -o $SRC_DATA_IN/uts46.nrm -s $UNIDATA/norm2 nfc.txt uts46.txt -u $UNICODE_VERSION + +# UCA +$UNITOOLS_BLD/c/genuca/genuca -d $SRC_DATA_IN/coll -s $UNIDATA -i $BLD_DATA_FILES diff --git a/tools/unicode/makeuca.sh b/tools/unicode/makeuca.sh new file mode 100755 index 00000000000..575361ebf16 --- /dev/null +++ b/tools/unicode/makeuca.sh @@ -0,0 +1,15 @@ +# Copyright (C) 2010, International Business Machines +# Corporation and others. All Rights Reserved. +# +# Parses Unicode Character Database files and builds ICU UCA data files. +# +# Requires: 1. run makeprops.sh 2. rebuild ICU & Unicode tools +# See (ICU)/source/data/unidata/changes.txt +# +# Invoke as +# ./makeuca.sh path/to/ICU/src/tree path/to/ICU/build/tree +ICU_SRC=$1 +ICU_BLD=$2 +source ./makedefs.sh + +$UNITOOLS_BLD/c/genuca/genuca -d $SRC_DATA_IN/coll -s $UNIDATA -i $BLD_DATA_FILES diff --git a/tools/unicode/py/idna2nrm.py b/tools/unicode/py/idna2nrm.py index 6d1eac03041..f8af47e7535 100755 --- a/tools/unicode/py/idna2nrm.py +++ b/tools/unicode/py/idna2nrm.py @@ -24,17 +24,18 @@ replacements = [ (re.compile(r"005B..0060 ; disallowed"), "# 005B..0060 (allow ASCII)"), (re.compile(r"007B..00A0 ; disallowed #"), "0080..00A0 >FFFD # (allow ASCII)"), + # Several versions of avoiding circular FFFD>FFFD mappings, + # depending on the version of the input file. + (re.compile(r"FFFD ; disallowed"), "# FFFD (avoid circular mapping)"), + (re.compile(r"\.\.FFFD"), "..FFFC"), + (re.compile(r"(FFF[^E])\.\.FFFF"), r"\1..FFFC"), # Normal transformations. 
(re.compile(r"; disallowed "), ">FFFD"), (re.compile(r"; ignored "), ">"), (re.compile(r"^([^;]+) ; valid"), r"# \1valid"), (re.compile(r"; mapped ; "), ">"), (re.compile(r"^([^;]+) ; deviation"), r"# \1deviation"), - (re.compile(r" +(\# [^\#]+)$"), r" \1"), - # Two versions of avoiding circular FFFD>FFFD mappings, - # depending on the version of the input file. - (re.compile(r"\.\.FFFD"), "..FFFC"), - (re.compile(r"(FFF[^E])\.\.FFFF"), "\1..FFFC") + (re.compile(r" +(\# [^\#]+)$"), r" \1") ] in_file = open("IdnaMappingTable.txt", "r") @@ -61,8 +62,8 @@ for line in in_file: # they are handled in code. # Deviation characters are also handled in code. # -# A circular mapping FFFD>FFFD is avoided by rewriting the line that contains -# ..FFFD to contain ..FFFC instead. +# A circular mapping FFFD>FFFD is avoided by +# rewriting the line that contains FFFD. # # Use this file as the second gennorm2 input file after nfc.txt. # ================================================