From 58d21ee915b1168923ebd3ba083dcd3ec4d9a026 Mon Sep 17 00:00:00 2001
From: Markus Scherer <markus.icu@gmail.com>
Date: Fri, 23 Jul 2010 23:51:14 +0000
Subject: [PATCH] ICU-7264 scripts for data file generation

X-SVN-Rev: 28364
---
 tools/unicode/c/genprops/store.c |  6 +++--
 tools/unicode/makedefs.sh        | 16 ++++++++++++++
 tools/unicode/makeprops.sh       | 38 ++++++++++++++++++++++++++++++++
 tools/unicode/makeuca.sh         | 15 +++++++++++++
 tools/unicode/py/idna2nrm.py     | 15 +++++++------
 5 files changed, 81 insertions(+), 9 deletions(-)
 create mode 100755 tools/unicode/makedefs.sh
 create mode 100755 tools/unicode/makeprops.sh
 create mode 100755 tools/unicode/makeuca.sh

diff --git a/tools/unicode/c/genprops/store.c b/tools/unicode/c/genprops/store.c
index 7adc9e75325..4499073a956 100644
--- a/tools/unicode/c/genprops/store.c
+++ b/tools/unicode/c/genprops/store.c
@@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 1999-2009, International Business Machines
+*   Copyright (C) 1999-2010, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@@ -466,7 +466,9 @@ generateData(const char *dataDir, UBool csource) {
         pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
                         haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
         if(U_FAILURE(errorCode)) {
-            fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode));
+            fprintf(stderr, "genprops: udata_create(%s, %s.%s) failed - %s\n",
+                    dataDir, DATA_NAME, DATA_TYPE,
+                    u_errorName(errorCode));
             exit(errorCode);
         }
 
diff --git a/tools/unicode/makedefs.sh b/tools/unicode/makedefs.sh
new file mode 100755
index 00000000000..9496164d634
--- /dev/null
+++ b/tools/unicode/makedefs.sh
@@ -0,0 +1,16 @@
+# Copyright (C) 2010, International Business Machines
+# Corporation and others.  All Rights Reserved.
+#
+# Basic definitions for building ICU Unicode data.
+# Sourced by makeprops.sh and makeuca.sh.
+UNICODE_VERSION=6.0
+# Assume that there are parallel src & bld trees with the Unicode tools
+# source files and the out-of-source build files.
+# Assume that the current folder is some/path/src/unicode
+UNITOOLS_BLD=../../bld/unicode
+# The sourcing script must define ICU_SRC and ICU_BLD for the ICU library
+# source files and the out-of-source build files; abort if either is missing.
+UNIDATA=${ICU_SRC:?ICU_SRC must be set by the sourcing script}/source/data/unidata
+COMMON=$ICU_SRC/source/common
+SRC_DATA_IN=$ICU_SRC/source/data/in
+BLD_DATA_FILES=${ICU_BLD:?ICU_BLD must be set by the sourcing script}/data/out/build/icudt45l
diff --git a/tools/unicode/makeprops.sh b/tools/unicode/makeprops.sh
new file mode 100755
index 00000000000..acb62cbbaa2
--- /dev/null
+++ b/tools/unicode/makeprops.sh
@@ -0,0 +1,38 @@
+# Copyright (C) 2010, International Business Machines
+# Corporation and others.  All Rights Reserved.
+#
+# Parses Unicode Character Database files and builds ICU core properties files.
+#
+# Invoke from the folder that contains makedefs.sh as
+#   ./makeprops.sh path/to/ICU/src/tree path/to/ICU/build/tree
+ICU_SRC=${1:?usage: ./makeprops.sh path/to/ICU/src/tree path/to/ICU/build/tree}
+ICU_BLD=${2:?usage: ./makeprops.sh path/to/ICU/src/tree path/to/ICU/build/tree}
+. ./makedefs.sh
+
+# uprops.icu
+"$UNITOOLS_BLD/c/genprops/genprops" -d "$SRC_DATA_IN"      -s "$UNIDATA" -i "$BLD_DATA_FILES" -u "$UNICODE_VERSION"
+"$UNITOOLS_BLD/c/genprops/genprops" -d "$COMMON" --csource -s "$UNIDATA" -i "$BLD_DATA_FILES" -u "$UNICODE_VERSION"
+
+# ubidi.icu
+"$UNITOOLS_BLD/c/genbidi/genbidi" -d "$SRC_DATA_IN"      -s "$UNIDATA" -i "$BLD_DATA_FILES" -u "$UNICODE_VERSION"
+"$UNITOOLS_BLD/c/genbidi/genbidi" -d "$COMMON" --csource -s "$UNIDATA" -i "$BLD_DATA_FILES" -u "$UNICODE_VERSION"
+
+# ucase.icu
+"$UNITOOLS_BLD/c/gencase/gencase" -d "$SRC_DATA_IN"      -s "$UNIDATA" -i "$BLD_DATA_FILES" -u "$UNICODE_VERSION"
+"$UNITOOLS_BLD/c/gencase/gencase" -d "$COMMON" --csource -s "$UNIDATA" -i "$BLD_DATA_FILES" -u "$UNICODE_VERSION"
+
+# unames.icu
+"$UNITOOLS_BLD/c/gennames/gennames" -d "$SRC_DATA_IN" -1 -q "$UNIDATA/UnicodeData.txt" "$UNIDATA/NameAliases.txt" -u "$UNICODE_VERSION"
+
+# unidata/norm2/*.txt
+"$UNITOOLS_BLD/c/gennorm/gennorm" -d "$UNIDATA/norm2" -s "$UNIDATA" -i "$BLD_DATA_FILES"
+
+# *.nrm (gennorm2 comes from the ICU build tree and needs its shared libraries)
+export LD_LIBRARY_PATH="$ICU_BLD/lib"
+"$ICU_BLD/bin/gennorm2" -o "$SRC_DATA_IN/nfc.nrm"     -s "$UNIDATA/norm2" nfc.txt              -u "$UNICODE_VERSION"
+"$ICU_BLD/bin/gennorm2" -o "$SRC_DATA_IN/nfkc.nrm"    -s "$UNIDATA/norm2" nfkc.txt             -u "$UNICODE_VERSION"
+"$ICU_BLD/bin/gennorm2" -o "$SRC_DATA_IN/nfkc_cf.nrm" -s "$UNIDATA/norm2" nfkc.txt nfkc_cf.txt -u "$UNICODE_VERSION"
+"$ICU_BLD/bin/gennorm2" -o "$SRC_DATA_IN/uts46.nrm"   -s "$UNIDATA/norm2" nfc.txt uts46.txt    -u "$UNICODE_VERSION"
+
+# UCA
+"$UNITOOLS_BLD/c/genuca/genuca" -d "$SRC_DATA_IN/coll" -s "$UNIDATA" -i "$BLD_DATA_FILES"
diff --git a/tools/unicode/makeuca.sh b/tools/unicode/makeuca.sh
new file mode 100755
index 00000000000..575361ebf16
--- /dev/null
+++ b/tools/unicode/makeuca.sh
@@ -0,0 +1,15 @@
+# Copyright (C) 2010, International Business Machines
+# Corporation and others.  All Rights Reserved.
+#
+# Parses Unicode Character Database files and builds ICU UCA data files.
+#
+# Requires: 1. run makeprops.sh  2. rebuild ICU & Unicode tools
+# See (ICU)/source/data/unidata/changes.txt
+#
+# Invoke from the folder that contains makedefs.sh as
+#   ./makeuca.sh path/to/ICU/src/tree path/to/ICU/build/tree
+ICU_SRC=${1:?usage: ./makeuca.sh path/to/ICU/src/tree path/to/ICU/build/tree}
+ICU_BLD=${2:?usage: ./makeuca.sh path/to/ICU/src/tree path/to/ICU/build/tree}
+. ./makedefs.sh
+
+"$UNITOOLS_BLD/c/genuca/genuca" -d "$SRC_DATA_IN/coll" -s "$UNIDATA" -i "$BLD_DATA_FILES"
diff --git a/tools/unicode/py/idna2nrm.py b/tools/unicode/py/idna2nrm.py
index 6d1eac03041..f8af47e7535 100755
--- a/tools/unicode/py/idna2nrm.py
+++ b/tools/unicode/py/idna2nrm.py
@@ -24,17 +24,18 @@ replacements = [
   (re.compile(r"005B..0060    ; disallowed"), "# 005B..0060 (allow ASCII)"),
   (re.compile(r"007B..00A0    ; disallowed                                 #"),
    "0080..00A0    >FFFD  # (allow ASCII)"),
+  # Several versions of avoiding circular FFFD>FFFD mappings,
+  # depending on the version of the input file.
+  (re.compile(r"FFFD          ; disallowed"), "# FFFD (avoid circular mapping)"),
+  (re.compile(r"\.\.FFFD"), "..FFFC"),
+  (re.compile(r"(FFF[^E])\.\.FFFF"), r"\1..FFFC"),  # raw string: \1 must stay a group reference
   # Normal transformations.
   (re.compile(r"; disallowed   "), ">FFFD"),
   (re.compile(r"; ignored      "), ">"),
   (re.compile(r"^([^;]+)  ; valid"), r"# \1valid"),
   (re.compile(r"; mapped     ; "), ">"),
   (re.compile(r"^([^;]+)  ; deviation"), r"# \1deviation"),
-  (re.compile(r"   +(\#  [^\#]+)$"), r"  \1"),
-  # Two versions of avoiding circular FFFD>FFFD mappings,
-  # depending on the version of the input file.
-  (re.compile(r"\.\.FFFD"), "..FFFC"),
-  (re.compile(r"(FFF[^E])\.\.FFFF"), "\1..FFFC")
+  (re.compile(r"   +(\#  [^\#]+)$"), r"  \1")
 ]
 
 in_file = open("IdnaMappingTable.txt", "r")
@@ -61,8 +62,8 @@ for line in in_file:
 # they are handled in code.
 # Deviation characters are also handled in code.
 #
-# A circular mapping FFFD>FFFD is avoided by rewriting the line that contains
-# ..FFFD to contain ..FFFC instead.
+# A circular mapping FFFD>FFFD is avoided by
+# rewriting the line that contains FFFD.
 #
 # Use this file as the second gennorm2 input file after nfc.txt.
 # ================================================