From 99c4dfa56503a6e4ad83befb7052c9c95a254d23 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 16 Apr 2015 23:42:50 +0000 Subject: [PATCH] ICU-11574 Unicode 8 updates X-SVN-Rev: 37353 --- tools/unicode/c/genprops/corepropsbuilder.cpp | 4 +- .../unicode/c/genprops/namespropsbuilder.cpp | 8 +-- tools/unicode/c/genprops/pnames_data.h | 30 +++++++---- tools/unicode/py/preparseucd.py | 52 +++++++++++++------ 4 files changed, 61 insertions(+), 33 deletions(-) diff --git a/tools/unicode/c/genprops/corepropsbuilder.cpp b/tools/unicode/c/genprops/corepropsbuilder.cpp index 20d75c53ccf..531595c3e74 100644 --- a/tools/unicode/c/genprops/corepropsbuilder.cpp +++ b/tools/unicode/c/genprops/corepropsbuilder.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2012, International Business Machines +* Copyright (C) 1999-2015, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -431,7 +431,7 @@ CorePropsBuilder::setGcAndNumeric(const UniProps &props, const UnicodeSet &newVa } int32_t ntv=UPROPS_NTV_NONE; // numeric type & value - if(nvString!=NULL) { + if(nvString!=NULL && uprv_strcmp(nvString, "NaN")!=0) { int32_t digitValue=props.digitValue; if( type<=U_NT_NONE || U_NT_NUMERICwordBottom) { - fprintf(stderr, "gennames: out of memory\n"); + fprintf(stderr, "gennames allocLine(): out of memory\n"); exit(U_MEMORY_ALLOCATION_ERROR); } p=stringStore+lineTop; @@ -1110,7 +1110,7 @@ allocWord(uint32_t length) { uint32_t bottom=wordBottom-length; if(lineTop>bottom) { - fprintf(stderr, "gennames: out of memory\n"); + fprintf(stderr, "gennames allocWord(): out of memory\n"); exit(U_MEMORY_ALLOCATION_ERROR); } wordBottom=bottom; diff --git a/tools/unicode/c/genprops/pnames_data.h b/tools/unicode/c/genprops/pnames_data.h index 267bdd67dac..e89e782b0eb 100644 --- a/tools/unicode/c/genprops/pnames_data.h +++ b/tools/unicode/c/genprops/pnames_data.h @@ -1,11 +1,11 @@ /** - * Copyright (C) 2002-2014, International Business Machines Corporation and + * Copyright (C) 2002-2015, International Business Machines Corporation and * others. All Rights Reserved. * * machine-generated by: icu/tools/unicode/py/preparseucd.py */ -#define UNICODE_VERSION { 7, 0, 0, 0 } +#define UNICODE_VERSION { 8, 0, 0, 0 } static const Value VALUES_binprop[2] = { Value(0, "N No F False"), @@ -38,7 +38,7 @@ static const Value VALUES_bc[23] = { Value(U_POP_DIRECTIONAL_ISOLATE, "PDI Pop_Directional_Isolate"), }; -static const Value VALUES_blk[253] = { +static const Value VALUES_blk[263] = { Value(UBLOCK_NO_BLOCK, "NB No_Block"), Value(UBLOCK_BASIC_LATIN, "ASCII Basic_Latin"), Value(UBLOCK_LATIN_1_SUPPLEMENT, "Latin_1_Sup Latin_1_Supplement Latin_1"), @@ -292,6 +292,16 @@ static const Value VALUES_blk[253] = { Value(UBLOCK_SUPPLEMENTAL_ARROWS_C, "Sup_Arrows_C Supplemental_Arrows_C"), Value(UBLOCK_TIRHUTA, "Tirhuta Tirhuta"), Value(UBLOCK_WARANG_CITI, "Warang_Citi Warang_Citi"), + Value(UBLOCK_AHOM, "Ahom Ahom"), + Value(UBLOCK_ANATOLIAN_HIEROGLYPHS, "Anatolian_Hieroglyphs Anatolian_Hieroglyphs"), + Value(UBLOCK_CHEROKEE_SUPPLEMENT, "Cherokee_Sup Cherokee_Supplement"), + Value(UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E, "CJK_Ext_E CJK_Unified_Ideographs_Extension_E"), + Value(UBLOCK_EARLY_DYNASTIC_CUNEIFORM, "Early_Dynastic_Cuneiform Early_Dynastic_Cuneiform"), + Value(UBLOCK_HATRAN, "Hatran Hatran"), + Value(UBLOCK_MULTANI, "Multani Multani"), + Value(UBLOCK_OLD_HUNGARIAN, "Old_Hungarian Old_Hungarian"), + Value(UBLOCK_SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS, "Sup_Symbols_And_Pictographs Supplemental_Symbols_And_Pictographs"), + Value(UBLOCK_SUTTON_SIGNWRITING, "Sutton_SignWriting Sutton_SignWriting"), }; static const Value VALUES_ccc[57] = { @@ -642,7 +652,7 @@ static const Value VALUES_sc[167] = { Value(USCRIPT_SIMPLIFIED_HAN, "Hans Hans"), Value(USCRIPT_TRADITIONAL_HAN, "Hant Hant"), Value(USCRIPT_PAHAWH_HMONG, "Hmng Pahawh_Hmong"), - Value(USCRIPT_OLD_HUNGARIAN, "Hung Hung"), + Value(USCRIPT_OLD_HUNGARIAN, "Hung Old_Hungarian"), Value(USCRIPT_HARAPPAN_INDUS, "Inds Inds"), Value(USCRIPT_JAVANESE, "Java Javanese"), Value(USCRIPT_KAYAH_LI, "Kali Kayah_Li"), @@ -678,7 +688,7 @@ static const Value VALUES_sc[167] = { Value(USCRIPT_OL_CHIKI, "Olck Ol_Chiki"), Value(USCRIPT_REJANG, "Rjng Rejang"), Value(USCRIPT_SAURASHTRA, "Saur Saurashtra"), - Value(USCRIPT_SIGN_WRITING, "Sgnw Sgnw"), + Value(USCRIPT_SIGN_WRITING, "Sgnw SignWriting"), Value(USCRIPT_SUNDANESE, "Sund Sundanese"), Value(USCRIPT_MOON, "Moon Moon"), Value(USCRIPT_MEITEI_MAYEK, "Mtei Meetei_Mayek"), @@ -701,7 +711,7 @@ static const Value VALUES_sc[167] = { Value(USCRIPT_NAKHI_GEBA, "Nkgb Nkgb"), Value(USCRIPT_OLD_SOUTH_ARABIAN, "Sarb Old_South_Arabian"), Value(USCRIPT_BASSA_VAH, "Bass Bassa_Vah"), - Value(USCRIPT_DUPLOYAN_SHORTAND, "Dupl Duployan"), + Value(USCRIPT_DUPLOYAN, "Dupl Duployan"), Value(USCRIPT_ELBASAN, "Elba Elbasan"), Value(USCRIPT_GRANTHA, "Gran Grantha"), Value(USCRIPT_KPELLE, "Kpel Kpel"), @@ -722,15 +732,15 @@ static const Value VALUES_sc[167] = { Value(USCRIPT_TAKRI, "Takr Takri"), Value(USCRIPT_TANGUT, "Tang Tang"), Value(USCRIPT_WOLEAI, "Wole Wole"), - Value(USCRIPT_ANATOLIAN_HIEROGLYPHS, "Hluw Hluw"), + Value(USCRIPT_ANATOLIAN_HIEROGLYPHS, "Hluw Anatolian_Hieroglyphs"), Value(USCRIPT_KHOJKI, "Khoj Khojki"), Value(USCRIPT_TIRHUTA, "Tirh Tirhuta"), Value(USCRIPT_CAUCASIAN_ALBANIAN, "Aghb Caucasian_Albanian"), Value(USCRIPT_MAHAJANI, "Mahj Mahajani"), Value(USCRIPT_AHOM, "Ahom Ahom"), - Value(USCRIPT_HATRAN, "Hatr Hatr"), + Value(USCRIPT_HATRAN, "Hatr Hatran"), Value(USCRIPT_MODI, "Modi Modi"), - Value(USCRIPT_MULTANI, "Mult Mult"), + Value(USCRIPT_MULTANI, "Mult Multani"), Value(USCRIPT_PAU_CIN_HAU, "Pauc Pau_Cin_Hau"), Value(USCRIPT_SIDDHAM, "Sidd Siddham"), }; @@ -1046,7 +1056,7 @@ static const Property PROPERTIES[96] = { Property(UCHAR_CHANGES_WHEN_CASEMAPPED, "CWCM Changes_When_Casemapped"), Property(UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED, "CWKCF Changes_When_NFKC_Casefolded"), Property(UCHAR_BIDI_CLASS, "bc Bidi_Class", VALUES_bc, 23), - Property(UCHAR_BLOCK, "blk Block", VALUES_blk, 253), + Property(UCHAR_BLOCK, "blk Block", VALUES_blk, 263), Property(UCHAR_CANONICAL_COMBINING_CLASS, "ccc Canonical_Combining_Class", VALUES_ccc, 57), Property(UCHAR_DECOMPOSITION_TYPE, "dt Decomposition_Type", VALUES_dt, 18), Property(UCHAR_EAST_ASIAN_WIDTH, "ea East_Asian_Width", VALUES_ea, 6), diff --git a/tools/unicode/py/preparseucd.py b/tools/unicode/py/preparseucd.py index 05d08791856..e68d5bd1f5b 100755 --- a/tools/unicode/py/preparseucd.py +++ b/tools/unicode/py/preparseucd.py @@ -1,6 +1,6 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -# Copyright (c) 2009-2014 International Business Machines +# Copyright (c) 2009-2015 International Business Machines # Corporation and others. All Rights Reserved. # # file name: preparseucd.py @@ -47,12 +47,12 @@ _current_year = datetime.date.today().strftime("%Y") # Script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html # that are not yet in the UCD. _scripts_only_in_iso15924 = ( - "Afak", "Ahom", "Blis", "Cirt", "Cyrs", + "Afak", "Blis", "Cirt", "Cyrs", "Egyd", "Egyh", "Geok", - "Hans", "Hant", "Hatr", "Hluw", "Hung", + "Hans", "Hant", "Inds", "Jpan", "Jurc", "Kore", "Kpel", "Latf", "Latg", "Loma", - "Maya", "Moon", "Mult", "Nkgb", "Nshu", "Phlv", "Roro", - "Sara", "Sgnw", "Syre", "Syrj", "Syrn", + "Maya", "Moon", "Nkgb", "Nshu", "Phlv", "Roro", + "Sara", "Syre", "Syrj", "Syrn", "Tang", "Teng", "Visp", "Wole", "Zmth", "Zsym", "Zxxx" ) @@ -680,6 +680,23 @@ def ParseUnicodeData(in_file): if (decimal and decimal != nv) or (digit and digit != nv): raise SyntaxError("error: numeric values differ at\n %s\n" % line) if nv: + # Map improper fractions to proper ones. + # U+109F7 MEROITIC CURSIVE FRACTION TWO TWELFTHS + # .. U+109FF MEROITIC CURSIVE FRACTION TEN TWELFTHS + if nv == "2/12": + nv = "1/6" + elif nv == "3/12": + nv = "1/4" + elif nv == "4/12": + nv = "1/3" + elif nv == "6/12": + nv = "1/2" + elif nv == "8/12": + nv = "2/3" + elif nv == "9/12": + nv = "3/4" + elif nv == "10/12": + nv = "5/6" props["nv"] = nv props["nt"] = "De" if decimal else "Di" if digit else "Nu" if fields[9] == "Y": props["Bidi_M"] = True @@ -773,7 +790,7 @@ def ParseDerivedJoiningGroup(in_file): ParseOneProperty(in_file, "jg") def ParseDerivedJoiningType(in_file): ParseOneProperty(in_file, "jt") def ParseEastAsianWidth(in_file): ParseOneProperty(in_file, "ea") def ParseGraphemeBreakProperty(in_file): ParseOneProperty(in_file, "GCB") -def ParseIndicMatraCategory(in_file): ParseOneProperty(in_file, "InMC") +def ParseIndicPositionalCategory(in_file): ParseOneProperty(in_file, "InPC") def ParseIndicSyllabicCategory(in_file): ParseOneProperty(in_file, "InSC") def ParseLineBreak(in_file): ParseOneProperty(in_file, "lb") def ParseScripts(in_file): ParseOneProperty(in_file, "sc") @@ -824,8 +841,8 @@ def NeedToSetNumericValue(nv, start, end, c_props): assert "nt" not in c_props return True if nv != c_nv: - raise ValueError("UnicodeData.txt has nv=%s for %04lX..%04lX " + - "but DerivedNumericValues.txt has nv=%s" % + raise ValueError(("UnicodeData.txt has nv=%s for %04lX..%04lX " + + "but DerivedNumericValues.txt has nv=%s") % (c_nv, start, end, nv)) return False @@ -920,31 +937,32 @@ def CompactBlock(b, i): assert b[0] == _starts[i] orig_i = i # Count the number of occurrences of each property's value in this block. - num_cp_so_far = 0 + # To minimize the output, count the number of ranges, + # not the number of code points. + num_ranges_so_far = 0 prop_counters = {} while True: start = _starts[i] if start > b[1]: break - num_cp_in_this_range = _starts[i + 1] - start props = _props[i] for (pname, value) in props.iteritems(): if pname in prop_counters: counter = prop_counters[pname] else: - counter = {_null_or_defaults[pname]: num_cp_so_far} + counter = {_null_or_defaults[pname]: num_ranges_so_far} prop_counters[pname] = counter if value in counter: - counter[value] += num_cp_in_this_range + counter[value] += 1 else: - counter[value] = num_cp_in_this_range + counter[value] = 1 # Also count default values for properties that do not occur in a range. for pname in prop_counters: if pname not in props: counter = prop_counters[pname] value = _null_or_defaults[pname] - counter[value] += num_cp_in_this_range - num_cp_so_far += num_cp_in_this_range - # Invariant: For each counter, the sum of counts must equal num_cp_so_far. + counter[value] += 1 + num_ranges_so_far += 1 + # Invariant: For each counter, the sum of counts must equal num_ranges_so_far. i += 1 # For each property that occurs within this block, # set the most common value as a block property value. @@ -1519,7 +1537,7 @@ _files = { "EastAsianWidth.txt": (DontCopy, ParseEastAsianWidth), "GraphemeBreakProperty.txt": (DontCopy, ParseGraphemeBreakProperty), "GraphemeBreakTest.txt": (PrependBOM, "testdata"), - "IndicMatraCategory.txt": (DontCopy, ParseIndicMatraCategory), + "IndicPositionalCategory.txt": (DontCopy, ParseIndicPositionalCategory), "IndicSyllabicCategory.txt": (DontCopy, ParseIndicSyllabicCategory), "LineBreak.txt": (DontCopy, ParseLineBreak), "LineBreakTest.txt": (PrependBOM, "testdata"),