From 99c4dfa56503a6e4ad83befb7052c9c95a254d23 Mon Sep 17 00:00:00 2001
From: Markus Scherer <markus.icu@gmail.com>
Date: Thu, 16 Apr 2015 23:42:50 +0000
Subject: [PATCH] ICU-11574 Unicode 8 updates

X-SVN-Rev: 37353
---
 tools/unicode/c/genprops/corepropsbuilder.cpp |  4 +-
 .../unicode/c/genprops/namespropsbuilder.cpp  |  8 +--
 tools/unicode/c/genprops/pnames_data.h        | 30 +++++++----
 tools/unicode/py/preparseucd.py               | 52 +++++++++++++------
 4 files changed, 61 insertions(+), 33 deletions(-)

diff --git a/tools/unicode/c/genprops/corepropsbuilder.cpp b/tools/unicode/c/genprops/corepropsbuilder.cpp
index 20d75c53ccf..531595c3e74 100644
--- a/tools/unicode/c/genprops/corepropsbuilder.cpp
+++ b/tools/unicode/c/genprops/corepropsbuilder.cpp
@@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 1999-2012, International Business Machines
+*   Copyright (C) 1999-2015, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@@ -431,7 +431,7 @@ CorePropsBuilder::setGcAndNumeric(const UniProps &props, const UnicodeSet &newVa
     }
 
     int32_t ntv=UPROPS_NTV_NONE;  // numeric type & value
-    if(nvString!=NULL) {
+    if(nvString!=NULL && uprv_strcmp(nvString, "NaN")!=0) {
         int32_t digitValue=props.digitValue;
         if( type<=U_NT_NONE || U_NT_NUMERIC<type ||
             ((type==U_NT_DECIMAL || type==U_NT_DIGIT) && digitValue<0)
diff --git a/tools/unicode/c/genprops/namespropsbuilder.cpp b/tools/unicode/c/genprops/namespropsbuilder.cpp
index 73658efa1cf..968dd3e5a6d 100644
--- a/tools/unicode/c/genprops/namespropsbuilder.cpp
+++ b/tools/unicode/c/genprops/namespropsbuilder.cpp
@@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 1999-2012, International Business Machines
+*   Copyright (C) 1999-2015, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@@ -129,7 +129,7 @@
 #include "unewdata.h"
 #include "uoptions.h"
 
-#define STRING_STORE_SIZE 1000000
+#define STRING_STORE_SIZE 2000000
 #define GROUP_STORE_SIZE 5000
 
 #define GROUP_SHIFT 5
@@ -1097,7 +1097,7 @@ allocLine(int32_t length) {
     uint8_t *p;
 
     if(top>wordBottom) {
-        fprintf(stderr, "gennames: out of memory\n");
+        fprintf(stderr, "gennames allocLine(): out of memory\n");
         exit(U_MEMORY_ALLOCATION_ERROR);
     }
     p=stringStore+lineTop;
@@ -1110,7 +1110,7 @@ allocWord(uint32_t length) {
     uint32_t bottom=wordBottom-length;
 
     if(lineTop>bottom) {
-        fprintf(stderr, "gennames: out of memory\n");
+        fprintf(stderr, "gennames allocWord(): out of memory\n");
         exit(U_MEMORY_ALLOCATION_ERROR);
     }
     wordBottom=bottom;
diff --git a/tools/unicode/c/genprops/pnames_data.h b/tools/unicode/c/genprops/pnames_data.h
index 267bdd67dac..e89e782b0eb 100644
--- a/tools/unicode/c/genprops/pnames_data.h
+++ b/tools/unicode/c/genprops/pnames_data.h
@@ -1,11 +1,11 @@
 /**
- * Copyright (C) 2002-2014, International Business Machines Corporation and
+ * Copyright (C) 2002-2015, International Business Machines Corporation and
  * others. All Rights Reserved.
  *
  * machine-generated by: icu/tools/unicode/py/preparseucd.py
  */
 
-#define UNICODE_VERSION { 7, 0, 0, 0 }
+#define UNICODE_VERSION { 8, 0, 0, 0 }
 
 static const Value VALUES_binprop[2] = {
     Value(0, "N No F False"),
@@ -38,7 +38,7 @@ static const Value VALUES_bc[23] = {
     Value(U_POP_DIRECTIONAL_ISOLATE, "PDI Pop_Directional_Isolate"),
 };
 
-static const Value VALUES_blk[253] = {
+static const Value VALUES_blk[263] = {
     Value(UBLOCK_NO_BLOCK, "NB No_Block"),
     Value(UBLOCK_BASIC_LATIN, "ASCII Basic_Latin"),
     Value(UBLOCK_LATIN_1_SUPPLEMENT, "Latin_1_Sup Latin_1_Supplement Latin_1"),
@@ -292,6 +292,16 @@ static const Value VALUES_blk[253] = {
     Value(UBLOCK_SUPPLEMENTAL_ARROWS_C, "Sup_Arrows_C Supplemental_Arrows_C"),
     Value(UBLOCK_TIRHUTA, "Tirhuta Tirhuta"),
     Value(UBLOCK_WARANG_CITI, "Warang_Citi Warang_Citi"),
+    Value(UBLOCK_AHOM, "Ahom Ahom"),
+    Value(UBLOCK_ANATOLIAN_HIEROGLYPHS, "Anatolian_Hieroglyphs Anatolian_Hieroglyphs"),
+    Value(UBLOCK_CHEROKEE_SUPPLEMENT, "Cherokee_Sup Cherokee_Supplement"),
+    Value(UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E, "CJK_Ext_E CJK_Unified_Ideographs_Extension_E"),
+    Value(UBLOCK_EARLY_DYNASTIC_CUNEIFORM, "Early_Dynastic_Cuneiform Early_Dynastic_Cuneiform"),
+    Value(UBLOCK_HATRAN, "Hatran Hatran"),
+    Value(UBLOCK_MULTANI, "Multani Multani"),
+    Value(UBLOCK_OLD_HUNGARIAN, "Old_Hungarian Old_Hungarian"),
+    Value(UBLOCK_SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS, "Sup_Symbols_And_Pictographs Supplemental_Symbols_And_Pictographs"),
+    Value(UBLOCK_SUTTON_SIGNWRITING, "Sutton_SignWriting Sutton_SignWriting"),
 };
 
 static const Value VALUES_ccc[57] = {
@@ -642,7 +652,7 @@ static const Value VALUES_sc[167] = {
     Value(USCRIPT_SIMPLIFIED_HAN, "Hans Hans"),
     Value(USCRIPT_TRADITIONAL_HAN, "Hant Hant"),
     Value(USCRIPT_PAHAWH_HMONG, "Hmng Pahawh_Hmong"),
-    Value(USCRIPT_OLD_HUNGARIAN, "Hung Hung"),
+    Value(USCRIPT_OLD_HUNGARIAN, "Hung Old_Hungarian"),
     Value(USCRIPT_HARAPPAN_INDUS, "Inds Inds"),
     Value(USCRIPT_JAVANESE, "Java Javanese"),
     Value(USCRIPT_KAYAH_LI, "Kali Kayah_Li"),
@@ -678,7 +688,7 @@ static const Value VALUES_sc[167] = {
     Value(USCRIPT_OL_CHIKI, "Olck Ol_Chiki"),
     Value(USCRIPT_REJANG, "Rjng Rejang"),
     Value(USCRIPT_SAURASHTRA, "Saur Saurashtra"),
-    Value(USCRIPT_SIGN_WRITING, "Sgnw Sgnw"),
+    Value(USCRIPT_SIGN_WRITING, "Sgnw SignWriting"),
     Value(USCRIPT_SUNDANESE, "Sund Sundanese"),
     Value(USCRIPT_MOON, "Moon Moon"),
     Value(USCRIPT_MEITEI_MAYEK, "Mtei Meetei_Mayek"),
@@ -701,7 +711,7 @@ static const Value VALUES_sc[167] = {
     Value(USCRIPT_NAKHI_GEBA, "Nkgb Nkgb"),
     Value(USCRIPT_OLD_SOUTH_ARABIAN, "Sarb Old_South_Arabian"),
     Value(USCRIPT_BASSA_VAH, "Bass Bassa_Vah"),
-    Value(USCRIPT_DUPLOYAN_SHORTAND, "Dupl Duployan"),
+    Value(USCRIPT_DUPLOYAN, "Dupl Duployan"),
     Value(USCRIPT_ELBASAN, "Elba Elbasan"),
     Value(USCRIPT_GRANTHA, "Gran Grantha"),
     Value(USCRIPT_KPELLE, "Kpel Kpel"),
@@ -722,15 +732,15 @@ static const Value VALUES_sc[167] = {
     Value(USCRIPT_TAKRI, "Takr Takri"),
     Value(USCRIPT_TANGUT, "Tang Tang"),
     Value(USCRIPT_WOLEAI, "Wole Wole"),
-    Value(USCRIPT_ANATOLIAN_HIEROGLYPHS, "Hluw Hluw"),
+    Value(USCRIPT_ANATOLIAN_HIEROGLYPHS, "Hluw Anatolian_Hieroglyphs"),
     Value(USCRIPT_KHOJKI, "Khoj Khojki"),
     Value(USCRIPT_TIRHUTA, "Tirh Tirhuta"),
     Value(USCRIPT_CAUCASIAN_ALBANIAN, "Aghb Caucasian_Albanian"),
     Value(USCRIPT_MAHAJANI, "Mahj Mahajani"),
     Value(USCRIPT_AHOM, "Ahom Ahom"),
-    Value(USCRIPT_HATRAN, "Hatr Hatr"),
+    Value(USCRIPT_HATRAN, "Hatr Hatran"),
     Value(USCRIPT_MODI, "Modi Modi"),
-    Value(USCRIPT_MULTANI, "Mult Mult"),
+    Value(USCRIPT_MULTANI, "Mult Multani"),
     Value(USCRIPT_PAU_CIN_HAU, "Pauc Pau_Cin_Hau"),
     Value(USCRIPT_SIDDHAM, "Sidd Siddham"),
 };
@@ -1046,7 +1056,7 @@ static const Property PROPERTIES[96] = {
     Property(UCHAR_CHANGES_WHEN_CASEMAPPED, "CWCM Changes_When_Casemapped"),
     Property(UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED, "CWKCF Changes_When_NFKC_Casefolded"),
     Property(UCHAR_BIDI_CLASS, "bc Bidi_Class", VALUES_bc, 23),
-    Property(UCHAR_BLOCK, "blk Block", VALUES_blk, 253),
+    Property(UCHAR_BLOCK, "blk Block", VALUES_blk, 263),
     Property(UCHAR_CANONICAL_COMBINING_CLASS, "ccc Canonical_Combining_Class", VALUES_ccc, 57),
     Property(UCHAR_DECOMPOSITION_TYPE, "dt Decomposition_Type", VALUES_dt, 18),
     Property(UCHAR_EAST_ASIAN_WIDTH, "ea East_Asian_Width", VALUES_ea, 6),
diff --git a/tools/unicode/py/preparseucd.py b/tools/unicode/py/preparseucd.py
index 05d08791856..e68d5bd1f5b 100755
--- a/tools/unicode/py/preparseucd.py
+++ b/tools/unicode/py/preparseucd.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
-# Copyright (c) 2009-2014 International Business Machines
+# Copyright (c) 2009-2015 International Business Machines
 # Corporation and others. All Rights Reserved.
 #
 #   file name:  preparseucd.py
@@ -47,12 +47,12 @@ _current_year = datetime.date.today().strftime("%Y")
 # Script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html
 # that are not yet in the UCD.
 _scripts_only_in_iso15924 = (
-    "Afak", "Ahom", "Blis", "Cirt", "Cyrs",
+    "Afak", "Blis", "Cirt", "Cyrs",
     "Egyd", "Egyh", "Geok",
-    "Hans", "Hant", "Hatr", "Hluw", "Hung",
+    "Hans", "Hant",
     "Inds", "Jpan", "Jurc", "Kore", "Kpel", "Latf", "Latg", "Loma",
-    "Maya", "Moon", "Mult", "Nkgb", "Nshu", "Phlv", "Roro",
-    "Sara", "Sgnw", "Syre", "Syrj", "Syrn",
+    "Maya", "Moon", "Nkgb", "Nshu", "Phlv", "Roro",
+    "Sara", "Syre", "Syrj", "Syrn",
     "Tang", "Teng", "Visp", "Wole", "Zmth", "Zsym", "Zxxx"
 )
 
@@ -680,6 +680,23 @@ def ParseUnicodeData(in_file):
     if (decimal and decimal != nv) or (digit and digit != nv):
       raise SyntaxError("error: numeric values differ at\n  %s\n" % line)
     if nv:
+      # Map unreduced fractions to their lowest terms.
+      # U+109F7 MEROITIC CURSIVE FRACTION TWO TWELFTHS
+      # .. U+109FF MEROITIC CURSIVE FRACTION TEN TWELFTHS
+      if nv == "2/12":
+        nv = "1/6"
+      elif nv == "3/12":
+        nv = "1/4"
+      elif nv == "4/12":
+        nv = "1/3"
+      elif nv == "6/12":
+        nv = "1/2"
+      elif nv == "8/12":
+        nv = "2/3"
+      elif nv == "9/12":
+        nv = "3/4"
+      elif nv == "10/12":
+        nv = "5/6"
       props["nv"] = nv
       props["nt"] = "De" if decimal else "Di" if digit else "Nu"
     if fields[9] == "Y": props["Bidi_M"] = True
@@ -773,7 +790,7 @@ def ParseDerivedJoiningGroup(in_file): ParseOneProperty(in_file, "jg")
 def ParseDerivedJoiningType(in_file): ParseOneProperty(in_file, "jt")
 def ParseEastAsianWidth(in_file): ParseOneProperty(in_file, "ea")
 def ParseGraphemeBreakProperty(in_file): ParseOneProperty(in_file, "GCB")
-def ParseIndicMatraCategory(in_file): ParseOneProperty(in_file, "InMC")
+def ParseIndicPositionalCategory(in_file): ParseOneProperty(in_file, "InPC")
 def ParseIndicSyllabicCategory(in_file): ParseOneProperty(in_file, "InSC")
 def ParseLineBreak(in_file): ParseOneProperty(in_file, "lb")
 def ParseScripts(in_file): ParseOneProperty(in_file, "sc")
@@ -824,8 +841,8 @@ def NeedToSetNumericValue(nv, start, end, c_props):
     assert "nt" not in c_props
     return True
   if nv != c_nv:
-    raise ValueError("UnicodeData.txt has nv=%s for %04lX..%04lX " +
-                     "but DerivedNumericValues.txt has nv=%s" %
+    raise ValueError(("UnicodeData.txt has nv=%s for %04lX..%04lX " +
+                     "but DerivedNumericValues.txt has nv=%s") %
                      (c_nv, start, end, nv))
   return False
 
@@ -920,31 +937,32 @@ def CompactBlock(b, i):
   assert b[0] == _starts[i]
   orig_i = i
   # Count the number of occurrences of each property's value in this block.
-  num_cp_so_far = 0
+  # To minimize the output, count the number of ranges,
+  # not the number of code points.
+  num_ranges_so_far = 0
   prop_counters = {}
   while True:
     start = _starts[i]
     if start > b[1]: break
-    num_cp_in_this_range = _starts[i + 1] - start
     props = _props[i]
     for (pname, value) in props.iteritems():
       if pname in prop_counters:
         counter = prop_counters[pname]
       else:
-        counter = {_null_or_defaults[pname]: num_cp_so_far}
+        counter = {_null_or_defaults[pname]: num_ranges_so_far}
         prop_counters[pname] = counter
       if value in counter:
-        counter[value] += num_cp_in_this_range
+        counter[value] += 1
       else:
-        counter[value] = num_cp_in_this_range
+        counter[value] = 1
     # Also count default values for properties that do not occur in a range.
     for pname in prop_counters:
       if pname not in props:
         counter = prop_counters[pname]
         value = _null_or_defaults[pname]
-        counter[value] += num_cp_in_this_range
-    num_cp_so_far += num_cp_in_this_range
-    # Invariant: For each counter, the sum of counts must equal num_cp_so_far.
+        counter[value] += 1
+    num_ranges_so_far += 1
+    # Invariant: For each counter, the sum of counts must equal num_ranges_so_far.
     i += 1
   # For each property that occurs within this block,
   # set the most common value as a block property value.
@@ -1519,7 +1537,7 @@ _files = {
   "EastAsianWidth.txt": (DontCopy, ParseEastAsianWidth),
   "GraphemeBreakProperty.txt": (DontCopy, ParseGraphemeBreakProperty),
   "GraphemeBreakTest.txt": (PrependBOM, "testdata"),
-  "IndicMatraCategory.txt": (DontCopy, ParseIndicMatraCategory),
+  "IndicPositionalCategory.txt": (DontCopy, ParseIndicPositionalCategory),
   "IndicSyllabicCategory.txt": (DontCopy, ParseIndicSyllabicCategory),
   "LineBreak.txt": (DontCopy, ParseLineBreak),
   "LineBreakTest.txt": (PrependBOM, "testdata"),