ICU-11574 Unicode 8 updates

X-SVN-Rev: 37353
2025-04-07 14:31:31 +00:00 · 2015-04-16 23:42:50 +00:00 · 2015-04-16 23:42:50 +00:00 · 99c4dfa565
commit 99c4dfa565
parent 90b2bf6959
4 changed files with 61 additions and 33 deletions
--- a/tools/unicode/c/genprops/corepropsbuilder.cpp
+++ b/tools/unicode/c/genprops/corepropsbuilder.cpp
@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 1999-2012, International Business Machines
+*   Copyright (C) 1999-2015, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@ -431,7 +431,7 @@ CorePropsBuilder::setGcAndNumeric(const UniProps &props, const UnicodeSet &newVa
    }

    int32_t ntv=UPROPS_NTV_NONE;  // numeric type & value
-    if(nvString!=NULL) {
+    if(nvString!=NULL && uprv_strcmp(nvString, "NaN")!=0) {
        int32_t digitValue=props.digitValue;
        if( type<=U_NT_NONE || U_NT_NUMERIC<type ||
            ((type==U_NT_DECIMAL || type==U_NT_DIGIT) && digitValue<0)
--- a/tools/unicode/c/genprops/namespropsbuilder.cpp
+++ b/tools/unicode/c/genprops/namespropsbuilder.cpp
@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 1999-2012, International Business Machines
+*   Copyright (C) 1999-2015, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@ -129,7 +129,7 @@
 #include "unewdata.h"
 #include "uoptions.h"

-#define STRING_STORE_SIZE 1000000
+#define STRING_STORE_SIZE 2000000
 #define GROUP_STORE_SIZE 5000

 #define GROUP_SHIFT 5
@ -1097,7 +1097,7 @@ allocLine(int32_t length) {
    uint8_t *p;

    if(top>wordBottom) {
-        fprintf(stderr, "gennames: out of memory\n");
+        fprintf(stderr, "gennames allocLine(): out of memory\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }
    p=stringStore+lineTop;
@ -1110,7 +1110,7 @@ allocWord(uint32_t length) {
    uint32_t bottom=wordBottom-length;

    if(lineTop>bottom) {
-        fprintf(stderr, "gennames: out of memory\n");
+        fprintf(stderr, "gennames allocWord(): out of memory\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }
    wordBottom=bottom;
--- a/tools/unicode/c/genprops/pnames_data.h
+++ b/tools/unicode/c/genprops/pnames_data.h
@ -1,11 +1,11 @@
 /**
- * Copyright (C) 2002-2014, International Business Machines Corporation and
+ * Copyright (C) 2002-2015, International Business Machines Corporation and
 * others. All Rights Reserved.
 *
 * machine-generated by: icu/tools/unicode/py/preparseucd.py
 */

-#define UNICODE_VERSION { 7, 0, 0, 0 }
+#define UNICODE_VERSION { 8, 0, 0, 0 }

 static const Value VALUES_binprop[2] = {
    Value(0, "N No F False"),
@ -38,7 +38,7 @@ static const Value VALUES_bc[23] = {
    Value(U_POP_DIRECTIONAL_ISOLATE, "PDI Pop_Directional_Isolate"),
 };

-static const Value VALUES_blk[253] = {
+static const Value VALUES_blk[263] = {
    Value(UBLOCK_NO_BLOCK, "NB No_Block"),
    Value(UBLOCK_BASIC_LATIN, "ASCII Basic_Latin"),
    Value(UBLOCK_LATIN_1_SUPPLEMENT, "Latin_1_Sup Latin_1_Supplement Latin_1"),
@ -292,6 +292,16 @@ static const Value VALUES_blk[253] = {
    Value(UBLOCK_SUPPLEMENTAL_ARROWS_C, "Sup_Arrows_C Supplemental_Arrows_C"),
    Value(UBLOCK_TIRHUTA, "Tirhuta Tirhuta"),
    Value(UBLOCK_WARANG_CITI, "Warang_Citi Warang_Citi"),
+    Value(UBLOCK_AHOM, "Ahom Ahom"),
+    Value(UBLOCK_ANATOLIAN_HIEROGLYPHS, "Anatolian_Hieroglyphs Anatolian_Hieroglyphs"),
+    Value(UBLOCK_CHEROKEE_SUPPLEMENT, "Cherokee_Sup Cherokee_Supplement"),
+    Value(UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E, "CJK_Ext_E CJK_Unified_Ideographs_Extension_E"),
+    Value(UBLOCK_EARLY_DYNASTIC_CUNEIFORM, "Early_Dynastic_Cuneiform Early_Dynastic_Cuneiform"),
+    Value(UBLOCK_HATRAN, "Hatran Hatran"),
+    Value(UBLOCK_MULTANI, "Multani Multani"),
+    Value(UBLOCK_OLD_HUNGARIAN, "Old_Hungarian Old_Hungarian"),
+    Value(UBLOCK_SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS, "Sup_Symbols_And_Pictographs Supplemental_Symbols_And_Pictographs"),
+    Value(UBLOCK_SUTTON_SIGNWRITING, "Sutton_SignWriting Sutton_SignWriting"),
 };

 static const Value VALUES_ccc[57] = {
@ -642,7 +652,7 @@ static const Value VALUES_sc[167] = {
    Value(USCRIPT_SIMPLIFIED_HAN, "Hans Hans"),
    Value(USCRIPT_TRADITIONAL_HAN, "Hant Hant"),
    Value(USCRIPT_PAHAWH_HMONG, "Hmng Pahawh_Hmong"),
-    Value(USCRIPT_OLD_HUNGARIAN, "Hung Hung"),
+    Value(USCRIPT_OLD_HUNGARIAN, "Hung Old_Hungarian"),
    Value(USCRIPT_HARAPPAN_INDUS, "Inds Inds"),
    Value(USCRIPT_JAVANESE, "Java Javanese"),
    Value(USCRIPT_KAYAH_LI, "Kali Kayah_Li"),
@ -678,7 +688,7 @@ static const Value VALUES_sc[167] = {
    Value(USCRIPT_OL_CHIKI, "Olck Ol_Chiki"),
    Value(USCRIPT_REJANG, "Rjng Rejang"),
    Value(USCRIPT_SAURASHTRA, "Saur Saurashtra"),
-    Value(USCRIPT_SIGN_WRITING, "Sgnw Sgnw"),
+    Value(USCRIPT_SIGN_WRITING, "Sgnw SignWriting"),
    Value(USCRIPT_SUNDANESE, "Sund Sundanese"),
    Value(USCRIPT_MOON, "Moon Moon"),
    Value(USCRIPT_MEITEI_MAYEK, "Mtei Meetei_Mayek"),
@ -701,7 +711,7 @@ static const Value VALUES_sc[167] = {
    Value(USCRIPT_NAKHI_GEBA, "Nkgb Nkgb"),
    Value(USCRIPT_OLD_SOUTH_ARABIAN, "Sarb Old_South_Arabian"),
    Value(USCRIPT_BASSA_VAH, "Bass Bassa_Vah"),
-    Value(USCRIPT_DUPLOYAN_SHORTAND, "Dupl Duployan"),
+    Value(USCRIPT_DUPLOYAN, "Dupl Duployan"),
    Value(USCRIPT_ELBASAN, "Elba Elbasan"),
    Value(USCRIPT_GRANTHA, "Gran Grantha"),
    Value(USCRIPT_KPELLE, "Kpel Kpel"),
@ -722,15 +732,15 @@ static const Value VALUES_sc[167] = {
    Value(USCRIPT_TAKRI, "Takr Takri"),
    Value(USCRIPT_TANGUT, "Tang Tang"),
    Value(USCRIPT_WOLEAI, "Wole Wole"),
-    Value(USCRIPT_ANATOLIAN_HIEROGLYPHS, "Hluw Hluw"),
+    Value(USCRIPT_ANATOLIAN_HIEROGLYPHS, "Hluw Anatolian_Hieroglyphs"),
    Value(USCRIPT_KHOJKI, "Khoj Khojki"),
    Value(USCRIPT_TIRHUTA, "Tirh Tirhuta"),
    Value(USCRIPT_CAUCASIAN_ALBANIAN, "Aghb Caucasian_Albanian"),
    Value(USCRIPT_MAHAJANI, "Mahj Mahajani"),
    Value(USCRIPT_AHOM, "Ahom Ahom"),
-    Value(USCRIPT_HATRAN, "Hatr Hatr"),
+    Value(USCRIPT_HATRAN, "Hatr Hatran"),
    Value(USCRIPT_MODI, "Modi Modi"),
-    Value(USCRIPT_MULTANI, "Mult Mult"),
+    Value(USCRIPT_MULTANI, "Mult Multani"),
    Value(USCRIPT_PAU_CIN_HAU, "Pauc Pau_Cin_Hau"),
    Value(USCRIPT_SIDDHAM, "Sidd Siddham"),
 };
@ -1046,7 +1056,7 @@ static const Property PROPERTIES[96] = {
    Property(UCHAR_CHANGES_WHEN_CASEMAPPED, "CWCM Changes_When_Casemapped"),
    Property(UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED, "CWKCF Changes_When_NFKC_Casefolded"),
    Property(UCHAR_BIDI_CLASS, "bc Bidi_Class", VALUES_bc, 23),
-    Property(UCHAR_BLOCK, "blk Block", VALUES_blk, 253),
+    Property(UCHAR_BLOCK, "blk Block", VALUES_blk, 263),
    Property(UCHAR_CANONICAL_COMBINING_CLASS, "ccc Canonical_Combining_Class", VALUES_ccc, 57),
    Property(UCHAR_DECOMPOSITION_TYPE, "dt Decomposition_Type", VALUES_dt, 18),
    Property(UCHAR_EAST_ASIAN_WIDTH, "ea East_Asian_Width", VALUES_ea, 6),
--- a/tools/unicode/py/preparseucd.py
+++ b/tools/unicode/py/preparseucd.py
@ -1,6 +1,6 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
-# Copyright (c) 2009-2014 International Business Machines
+# Copyright (c) 2009-2015 International Business Machines
 # Corporation and others. All Rights Reserved.
 #
 #   file name:  preparseucd.py
@ -47,12 +47,12 @@ _current_year = datetime.date.today().strftime("%Y")
 # Script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html
 # that are not yet in the UCD.
 _scripts_only_in_iso15924 = (
-    "Afak", "Ahom", "Blis", "Cirt", "Cyrs",
+    "Afak", "Blis", "Cirt", "Cyrs",
    "Egyd", "Egyh", "Geok",
-    "Hans", "Hant", "Hatr", "Hluw", "Hung",
+    "Hans", "Hant",
    "Inds", "Jpan", "Jurc", "Kore", "Kpel", "Latf", "Latg", "Loma",
-    "Maya", "Moon", "Mult", "Nkgb", "Nshu", "Phlv", "Roro",
-    "Sara", "Sgnw", "Syre", "Syrj", "Syrn",
+    "Maya", "Moon", "Nkgb", "Nshu", "Phlv", "Roro",
+    "Sara", "Syre", "Syrj", "Syrn",
    "Tang", "Teng", "Visp", "Wole", "Zmth", "Zsym", "Zxxx"
 )

@ -680,6 +680,23 @@ def ParseUnicodeData(in_file):
    if (decimal and decimal != nv) or (digit and digit != nv):
      raise SyntaxError("error: numeric values differ at\n  %s\n" % line)
    if nv:
+      # Reduce unsimplified fractions (n/12) to lowest terms.
+      # U+109F7 MEROITIC CURSIVE FRACTION TWO TWELFTHS
+      # .. U+109FF MEROITIC CURSIVE FRACTION TEN TWELFTHS
+      if nv == "2/12":
+        nv = "1/6"
+      elif nv == "3/12":
+        nv = "1/4"
+      elif nv == "4/12":
+        nv = "1/3"
+      elif nv == "6/12":
+        nv = "1/2"
+      elif nv == "8/12":
+        nv = "2/3"
+      elif nv == "9/12":
+        nv = "3/4"
+      elif nv == "10/12":
+        nv = "5/6"
      props["nv"] = nv
      props["nt"] = "De" if decimal else "Di" if digit else "Nu"
    if fields[9] == "Y": props["Bidi_M"] = True
@ -773,7 +790,7 @@ def ParseDerivedJoiningGroup(in_file): ParseOneProperty(in_file, "jg")
 def ParseDerivedJoiningType(in_file): ParseOneProperty(in_file, "jt")
 def ParseEastAsianWidth(in_file): ParseOneProperty(in_file, "ea")
 def ParseGraphemeBreakProperty(in_file): ParseOneProperty(in_file, "GCB")
-def ParseIndicMatraCategory(in_file): ParseOneProperty(in_file, "InMC")
+def ParseIndicPositionalCategory(in_file): ParseOneProperty(in_file, "InPC")
 def ParseIndicSyllabicCategory(in_file): ParseOneProperty(in_file, "InSC")
 def ParseLineBreak(in_file): ParseOneProperty(in_file, "lb")
 def ParseScripts(in_file): ParseOneProperty(in_file, "sc")
@ -824,8 +841,8 @@ def NeedToSetNumericValue(nv, start, end, c_props):
    assert "nt" not in c_props
    return True
  if nv != c_nv:
-    raise ValueError("UnicodeData.txt has nv=%s for %04lX..%04lX " +
-                     "but DerivedNumericValues.txt has nv=%s" %
+    raise ValueError(("UnicodeData.txt has nv=%s for %04lX..%04lX " +
+                     "but DerivedNumericValues.txt has nv=%s") %
                     (c_nv, start, end, nv))
  return False

@ -920,31 +937,32 @@ def CompactBlock(b, i):
  assert b[0] == _starts[i]
  orig_i = i
  # Count the number of occurrences of each property's value in this block.
-  num_cp_so_far = 0
+  # To minimize the output, count the number of ranges,
+  # not the number of code points.
+  num_ranges_so_far = 0
  prop_counters = {}
  while True:
    start = _starts[i]
    if start > b[1]: break
-    num_cp_in_this_range = _starts[i + 1] - start
    props = _props[i]
    for (pname, value) in props.iteritems():
      if pname in prop_counters:
        counter = prop_counters[pname]
      else:
-        counter = {_null_or_defaults[pname]: num_cp_so_far}
+        counter = {_null_or_defaults[pname]: num_ranges_so_far}
        prop_counters[pname] = counter
      if value in counter:
-        counter[value] += num_cp_in_this_range
+        counter[value] += 1
      else:
-        counter[value] = num_cp_in_this_range
+        counter[value] = 1
    # Also count default values for properties that do not occur in a range.
    for pname in prop_counters:
      if pname not in props:
        counter = prop_counters[pname]
        value = _null_or_defaults[pname]
-        counter[value] += num_cp_in_this_range
-    num_cp_so_far += num_cp_in_this_range
-    # Invariant: For each counter, the sum of counts must equal num_cp_so_far.
+        counter[value] += 1
+    num_ranges_so_far += 1
+    # Invariant: For each counter, the sum of counts must equal num_ranges_so_far.
    i += 1
  # For each property that occurs within this block,
  # set the most common value as a block property value.
@ -1519,7 +1537,7 @@ _files = {
  "EastAsianWidth.txt": (DontCopy, ParseEastAsianWidth),
  "GraphemeBreakProperty.txt": (DontCopy, ParseGraphemeBreakProperty),
  "GraphemeBreakTest.txt": (PrependBOM, "testdata"),
-  "IndicMatraCategory.txt": (DontCopy, ParseIndicMatraCategory),
+  "IndicPositionalCategory.txt": (DontCopy, ParseIndicPositionalCategory),
  "IndicSyllabicCategory.txt": (DontCopy, ParseIndicSyllabicCategory),
  "LineBreak.txt": (DontCopy, ParseLineBreak),
  "LineBreakTest.txt": (PrependBOM, "testdata"),