ICU-7264 scripts for data file generation

X-SVN-Rev: 28364
This commit is contained in:
Markus Scherer 2010-07-23 23:51:14 +00:00
parent a2605b9c83
commit 58d21ee915
5 changed files with 81 additions and 9 deletions

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2009, International Business Machines
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -466,7 +466,9 @@ generateData(const char *dataDir, UBool csource) {
pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode));
fprintf(stderr, "genprops: udata_create(%s, %s.%s) failed - %s\n",
dataDir, DATA_NAME, DATA_TYPE,
u_errorName(errorCode));
exit(errorCode);
}

16
tools/unicode/makedefs.sh Executable file
View file

@ -0,0 +1,16 @@
# Copyright (C) 2010, International Business Machines
# Corporation and others. All Rights Reserved.
#
# Shared settings for the scripts that build ICU Unicode data.
# This file is meant to be sourced (for example from makeprops.sh).
#
# The sourcing script must define ICU_SRC and ICU_BLD beforehand:
# the ICU library source tree and its out-of-source build tree.

UNICODE_VERSION="6.0"

# The Unicode tools are assumed to live in parallel src & bld trees,
# with the current folder being some/path/src/unicode.
UNITOOLS_BLD="../../bld/unicode"

# Folders inside the ICU source tree.
UNIDATA="${ICU_SRC}/source/data/unidata"
COMMON="${ICU_SRC}/source/common"
SRC_DATA_IN="${ICU_SRC}/source/data/in"

# Folder inside the ICU build tree.
BLD_DATA_FILES="${ICU_BLD}/data/out/build/icudt45l"

38
tools/unicode/makeprops.sh Executable file
View file

@ -0,0 +1,38 @@
# Copyright (C) 2010, International Business Machines
# Corporation and others. All Rights Reserved.
#
# Parses Unicode Character Database files and builds ICU core properties files.
#
# Invoke as
#   ./makeprops.sh path/to/ICU/src/tree path/to/ICU/build/tree
#
# Run it from the folder that contains makedefs.sh; that file is read
# via a path relative to the current folder.

# Both tree paths are required; without them every path below would be
# rooted at "/".
if [ "$#" -lt 2 ]; then
    echo "usage: $0 path/to/ICU/src/tree path/to/ICU/build/tree" >&2
    exit 1
fi
ICU_SRC=$1
ICU_BLD=$2

# "." is the portable spelling of "source": this file has no #! line, so it
# may be run by a plain POSIX sh that does not provide "source".
. ./makedefs.sh || exit 1

# All expansions are quoted so that tree paths containing whitespace or
# glob characters are passed to the tools as single arguments.

# uprops.icu
"$UNITOOLS_BLD/c/genprops/genprops" -d "$SRC_DATA_IN" -s "$UNIDATA" -i "$BLD_DATA_FILES" -u "$UNICODE_VERSION"
"$UNITOOLS_BLD/c/genprops/genprops" -d "$COMMON" --csource -s "$UNIDATA" -i "$BLD_DATA_FILES" -u "$UNICODE_VERSION"
# ubidi.icu
"$UNITOOLS_BLD/c/genbidi/genbidi" -d "$SRC_DATA_IN" -s "$UNIDATA" -i "$BLD_DATA_FILES" -u "$UNICODE_VERSION"
"$UNITOOLS_BLD/c/genbidi/genbidi" -d "$COMMON" --csource -s "$UNIDATA" -i "$BLD_DATA_FILES" -u "$UNICODE_VERSION"
# ucase.icu
"$UNITOOLS_BLD/c/gencase/gencase" -d "$SRC_DATA_IN" -s "$UNIDATA" -i "$BLD_DATA_FILES" -u "$UNICODE_VERSION"
"$UNITOOLS_BLD/c/gencase/gencase" -d "$COMMON" --csource -s "$UNIDATA" -i "$BLD_DATA_FILES" -u "$UNICODE_VERSION"
# unames.icu
"$UNITOOLS_BLD/c/gennames/gennames" -d "$SRC_DATA_IN" -1 -q "$UNIDATA/UnicodeData.txt" "$UNIDATA/NameAliases.txt" -u "$UNICODE_VERSION"
# unidata/norm2/*.txt
"$UNITOOLS_BLD/c/gennorm/gennorm" -d "$UNIDATA/norm2" -s "$UNIDATA" -i "$BLD_DATA_FILES"
# *.nrm
# gennorm2 comes from the ICU build tree; point the dynamic loader at
# the ICU libraries built there.
export LD_LIBRARY_PATH="$ICU_BLD/lib"
"$ICU_BLD/bin/gennorm2" -o "$SRC_DATA_IN/nfc.nrm" -s "$UNIDATA/norm2" nfc.txt -u "$UNICODE_VERSION"
"$ICU_BLD/bin/gennorm2" -o "$SRC_DATA_IN/nfkc.nrm" -s "$UNIDATA/norm2" nfkc.txt -u "$UNICODE_VERSION"
"$ICU_BLD/bin/gennorm2" -o "$SRC_DATA_IN/nfkc_cf.nrm" -s "$UNIDATA/norm2" nfkc.txt nfkc_cf.txt -u "$UNICODE_VERSION"
"$ICU_BLD/bin/gennorm2" -o "$SRC_DATA_IN/uts46.nrm" -s "$UNIDATA/norm2" nfc.txt uts46.txt -u "$UNICODE_VERSION"
# UCA
"$UNITOOLS_BLD/c/genuca/genuca" -d "$SRC_DATA_IN/coll" -s "$UNIDATA" -i "$BLD_DATA_FILES"

15
tools/unicode/makeuca.sh Executable file
View file

@ -0,0 +1,15 @@
# Copyright (C) 2010, International Business Machines
# Corporation and others. All Rights Reserved.
#
# Parses Unicode Character Database files and builds ICU UCA data files.
#
# Requires: 1. run makeprops.sh 2. rebuild ICU & Unicode tools
# See (ICU)/source/data/unidata/changes.txt
#
# Invoke as
#   ./makeuca.sh path/to/ICU/src/tree path/to/ICU/build/tree
#
# Run it from the folder that contains makedefs.sh; that file is read
# via a path relative to the current folder.

# Both tree paths are required; without them every path below would be
# rooted at "/".
if [ "$#" -lt 2 ]; then
    echo "usage: $0 path/to/ICU/src/tree path/to/ICU/build/tree" >&2
    exit 1
fi
ICU_SRC=$1
ICU_BLD=$2

# "." is the portable spelling of "source": this file has no #! line, so it
# may be run by a plain POSIX sh that does not provide "source".
. ./makedefs.sh || exit 1

# Expansions are quoted so that tree paths containing whitespace or glob
# characters reach genuca as single arguments.
"$UNITOOLS_BLD/c/genuca/genuca" -d "$SRC_DATA_IN/coll" -s "$UNIDATA" -i "$BLD_DATA_FILES"

View file

@ -24,17 +24,18 @@ replacements = [
(re.compile(r"005B..0060 ; disallowed"), "# 005B..0060 (allow ASCII)"),
(re.compile(r"007B..00A0 ; disallowed #"),
"0080..00A0 >FFFD # (allow ASCII)"),
# Several versions of avoiding circular FFFD>FFFD mappings,
# depending on the version of the input file.
(re.compile(r"FFFD ; disallowed"), "# FFFD (avoid circular mapping)"),
(re.compile(r"\.\.FFFD"), "..FFFC"),
(re.compile(r"(FFF[^E])\.\.FFFF"), "\1..FFFC"),
# Normal transformations.
(re.compile(r"; disallowed "), ">FFFD"),
(re.compile(r"; ignored "), ">"),
(re.compile(r"^([^;]+) ; valid"), r"# \1valid"),
(re.compile(r"; mapped ; "), ">"),
(re.compile(r"^([^;]+) ; deviation"), r"# \1deviation"),
(re.compile(r" +(\# [^\#]+)$"), r" \1"),
# Two versions of avoiding circular FFFD>FFFD mappings,
# depending on the version of the input file.
(re.compile(r"\.\.FFFD"), "..FFFC"),
(re.compile(r"(FFF[^E])\.\.FFFF"), "\1..FFFC")
(re.compile(r" +(\# [^\#]+)$"), r" \1")
]
in_file = open("IdnaMappingTable.txt", "r")
@ -61,8 +62,8 @@ for line in in_file:
# they are handled in code.
# Deviation characters are also handled in code.
#
# A circular mapping FFFD>FFFD is avoided by rewriting the line that contains
# ..FFFD to contain ..FFFC instead.
# A circular mapping FFFD>FFFD is avoided by
# rewriting the line that contains FFFD.
#
# Use this file as the second gennorm2 input file after nfc.txt.
# ================================================