ICU-11574 Unicode 8 updates

X-SVN-Rev: 37353
This commit is contained in:
Markus Scherer 2015-04-16 23:42:50 +00:00
parent 90b2bf6959
commit 99c4dfa565
4 changed files with 61 additions and 33 deletions

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2012, International Business Machines
* Copyright (C) 1999-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -431,7 +431,7 @@ CorePropsBuilder::setGcAndNumeric(const UniProps &props, const UnicodeSet &newVa
}
int32_t ntv=UPROPS_NTV_NONE; // numeric type & value
if(nvString!=NULL) {
if(nvString!=NULL && uprv_strcmp(nvString, "NaN")!=0) {
int32_t digitValue=props.digitValue;
if( type<=U_NT_NONE || U_NT_NUMERIC<type ||
((type==U_NT_DECIMAL || type==U_NT_DIGIT) && digitValue<0)

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2012, International Business Machines
* Copyright (C) 1999-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -129,7 +129,7 @@
#include "unewdata.h"
#include "uoptions.h"
#define STRING_STORE_SIZE 1000000
#define STRING_STORE_SIZE 2000000
#define GROUP_STORE_SIZE 5000
#define GROUP_SHIFT 5
@ -1097,7 +1097,7 @@ allocLine(int32_t length) {
uint8_t *p;
if(top>wordBottom) {
fprintf(stderr, "gennames: out of memory\n");
fprintf(stderr, "gennames allocLine(): out of memory\n");
exit(U_MEMORY_ALLOCATION_ERROR);
}
p=stringStore+lineTop;
@ -1110,7 +1110,7 @@ allocWord(uint32_t length) {
uint32_t bottom=wordBottom-length;
if(lineTop>bottom) {
fprintf(stderr, "gennames: out of memory\n");
fprintf(stderr, "gennames allocWord(): out of memory\n");
exit(U_MEMORY_ALLOCATION_ERROR);
}
wordBottom=bottom;

View file

@ -1,11 +1,11 @@
/**
* Copyright (C) 2002-2014, International Business Machines Corporation and
* Copyright (C) 2002-2015, International Business Machines Corporation and
* others. All Rights Reserved.
*
* machine-generated by: icu/tools/unicode/py/preparseucd.py
*/
#define UNICODE_VERSION { 7, 0, 0, 0 }
#define UNICODE_VERSION { 8, 0, 0, 0 }
static const Value VALUES_binprop[2] = {
Value(0, "N No F False"),
@ -38,7 +38,7 @@ static const Value VALUES_bc[23] = {
Value(U_POP_DIRECTIONAL_ISOLATE, "PDI Pop_Directional_Isolate"),
};
static const Value VALUES_blk[253] = {
static const Value VALUES_blk[263] = {
Value(UBLOCK_NO_BLOCK, "NB No_Block"),
Value(UBLOCK_BASIC_LATIN, "ASCII Basic_Latin"),
Value(UBLOCK_LATIN_1_SUPPLEMENT, "Latin_1_Sup Latin_1_Supplement Latin_1"),
@ -292,6 +292,16 @@ static const Value VALUES_blk[253] = {
Value(UBLOCK_SUPPLEMENTAL_ARROWS_C, "Sup_Arrows_C Supplemental_Arrows_C"),
Value(UBLOCK_TIRHUTA, "Tirhuta Tirhuta"),
Value(UBLOCK_WARANG_CITI, "Warang_Citi Warang_Citi"),
Value(UBLOCK_AHOM, "Ahom Ahom"),
Value(UBLOCK_ANATOLIAN_HIEROGLYPHS, "Anatolian_Hieroglyphs Anatolian_Hieroglyphs"),
Value(UBLOCK_CHEROKEE_SUPPLEMENT, "Cherokee_Sup Cherokee_Supplement"),
Value(UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E, "CJK_Ext_E CJK_Unified_Ideographs_Extension_E"),
Value(UBLOCK_EARLY_DYNASTIC_CUNEIFORM, "Early_Dynastic_Cuneiform Early_Dynastic_Cuneiform"),
Value(UBLOCK_HATRAN, "Hatran Hatran"),
Value(UBLOCK_MULTANI, "Multani Multani"),
Value(UBLOCK_OLD_HUNGARIAN, "Old_Hungarian Old_Hungarian"),
Value(UBLOCK_SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS, "Sup_Symbols_And_Pictographs Supplemental_Symbols_And_Pictographs"),
Value(UBLOCK_SUTTON_SIGNWRITING, "Sutton_SignWriting Sutton_SignWriting"),
};
static const Value VALUES_ccc[57] = {
@ -642,7 +652,7 @@ static const Value VALUES_sc[167] = {
Value(USCRIPT_SIMPLIFIED_HAN, "Hans Hans"),
Value(USCRIPT_TRADITIONAL_HAN, "Hant Hant"),
Value(USCRIPT_PAHAWH_HMONG, "Hmng Pahawh_Hmong"),
Value(USCRIPT_OLD_HUNGARIAN, "Hung Hung"),
Value(USCRIPT_OLD_HUNGARIAN, "Hung Old_Hungarian"),
Value(USCRIPT_HARAPPAN_INDUS, "Inds Inds"),
Value(USCRIPT_JAVANESE, "Java Javanese"),
Value(USCRIPT_KAYAH_LI, "Kali Kayah_Li"),
@ -678,7 +688,7 @@ static const Value VALUES_sc[167] = {
Value(USCRIPT_OL_CHIKI, "Olck Ol_Chiki"),
Value(USCRIPT_REJANG, "Rjng Rejang"),
Value(USCRIPT_SAURASHTRA, "Saur Saurashtra"),
Value(USCRIPT_SIGN_WRITING, "Sgnw Sgnw"),
Value(USCRIPT_SIGN_WRITING, "Sgnw SignWriting"),
Value(USCRIPT_SUNDANESE, "Sund Sundanese"),
Value(USCRIPT_MOON, "Moon Moon"),
Value(USCRIPT_MEITEI_MAYEK, "Mtei Meetei_Mayek"),
@ -701,7 +711,7 @@ static const Value VALUES_sc[167] = {
Value(USCRIPT_NAKHI_GEBA, "Nkgb Nkgb"),
Value(USCRIPT_OLD_SOUTH_ARABIAN, "Sarb Old_South_Arabian"),
Value(USCRIPT_BASSA_VAH, "Bass Bassa_Vah"),
Value(USCRIPT_DUPLOYAN_SHORTAND, "Dupl Duployan"),
Value(USCRIPT_DUPLOYAN, "Dupl Duployan"),
Value(USCRIPT_ELBASAN, "Elba Elbasan"),
Value(USCRIPT_GRANTHA, "Gran Grantha"),
Value(USCRIPT_KPELLE, "Kpel Kpel"),
@ -722,15 +732,15 @@ static const Value VALUES_sc[167] = {
Value(USCRIPT_TAKRI, "Takr Takri"),
Value(USCRIPT_TANGUT, "Tang Tang"),
Value(USCRIPT_WOLEAI, "Wole Wole"),
Value(USCRIPT_ANATOLIAN_HIEROGLYPHS, "Hluw Hluw"),
Value(USCRIPT_ANATOLIAN_HIEROGLYPHS, "Hluw Anatolian_Hieroglyphs"),
Value(USCRIPT_KHOJKI, "Khoj Khojki"),
Value(USCRIPT_TIRHUTA, "Tirh Tirhuta"),
Value(USCRIPT_CAUCASIAN_ALBANIAN, "Aghb Caucasian_Albanian"),
Value(USCRIPT_MAHAJANI, "Mahj Mahajani"),
Value(USCRIPT_AHOM, "Ahom Ahom"),
Value(USCRIPT_HATRAN, "Hatr Hatr"),
Value(USCRIPT_HATRAN, "Hatr Hatran"),
Value(USCRIPT_MODI, "Modi Modi"),
Value(USCRIPT_MULTANI, "Mult Mult"),
Value(USCRIPT_MULTANI, "Mult Multani"),
Value(USCRIPT_PAU_CIN_HAU, "Pauc Pau_Cin_Hau"),
Value(USCRIPT_SIDDHAM, "Sidd Siddham"),
};
@ -1046,7 +1056,7 @@ static const Property PROPERTIES[96] = {
Property(UCHAR_CHANGES_WHEN_CASEMAPPED, "CWCM Changes_When_Casemapped"),
Property(UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED, "CWKCF Changes_When_NFKC_Casefolded"),
Property(UCHAR_BIDI_CLASS, "bc Bidi_Class", VALUES_bc, 23),
Property(UCHAR_BLOCK, "blk Block", VALUES_blk, 253),
Property(UCHAR_BLOCK, "blk Block", VALUES_blk, 263),
Property(UCHAR_CANONICAL_COMBINING_CLASS, "ccc Canonical_Combining_Class", VALUES_ccc, 57),
Property(UCHAR_DECOMPOSITION_TYPE, "dt Decomposition_Type", VALUES_dt, 18),
Property(UCHAR_EAST_ASIAN_WIDTH, "ea East_Asian_Width", VALUES_ea, 6),

View file

@ -1,6 +1,6 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2009-2014 International Business Machines
# Copyright (c) 2009-2015 International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: preparseucd.py
@ -47,12 +47,12 @@ _current_year = datetime.date.today().strftime("%Y")
# Script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html
# that are not yet in the UCD.
_scripts_only_in_iso15924 = (
"Afak", "Ahom", "Blis", "Cirt", "Cyrs",
"Afak", "Blis", "Cirt", "Cyrs",
"Egyd", "Egyh", "Geok",
"Hans", "Hant", "Hatr", "Hluw", "Hung",
"Hans", "Hant",
"Inds", "Jpan", "Jurc", "Kore", "Kpel", "Latf", "Latg", "Loma",
"Maya", "Moon", "Mult", "Nkgb", "Nshu", "Phlv", "Roro",
"Sara", "Sgnw", "Syre", "Syrj", "Syrn",
"Maya", "Moon", "Nkgb", "Nshu", "Phlv", "Roro",
"Sara", "Syre", "Syrj", "Syrn",
"Tang", "Teng", "Visp", "Wole", "Zmth", "Zsym", "Zxxx"
)
@ -680,6 +680,23 @@ def ParseUnicodeData(in_file):
if (decimal and decimal != nv) or (digit and digit != nv):
raise SyntaxError("error: numeric values differ at\n %s\n" % line)
if nv:
# Reduce unreduced fractions to lowest terms (e.g. 2/12 -> 1/6).
# U+109F7 MEROITIC CURSIVE FRACTION TWO TWELFTHS
# .. U+109FF MEROITIC CURSIVE FRACTION TEN TWELFTHS
if nv == "2/12":
nv = "1/6"
elif nv == "3/12":
nv = "1/4"
elif nv == "4/12":
nv = "1/3"
elif nv == "6/12":
nv = "1/2"
elif nv == "8/12":
nv = "2/3"
elif nv == "9/12":
nv = "3/4"
elif nv == "10/12":
nv = "5/6"
props["nv"] = nv
props["nt"] = "De" if decimal else "Di" if digit else "Nu"
if fields[9] == "Y": props["Bidi_M"] = True
@ -773,7 +790,7 @@ def ParseDerivedJoiningGroup(in_file): ParseOneProperty(in_file, "jg")
def ParseDerivedJoiningType(in_file): ParseOneProperty(in_file, "jt")
def ParseEastAsianWidth(in_file): ParseOneProperty(in_file, "ea")
def ParseGraphemeBreakProperty(in_file): ParseOneProperty(in_file, "GCB")
def ParseIndicMatraCategory(in_file): ParseOneProperty(in_file, "InMC")
def ParseIndicPositionalCategory(in_file): ParseOneProperty(in_file, "InPC")
def ParseIndicSyllabicCategory(in_file): ParseOneProperty(in_file, "InSC")
def ParseLineBreak(in_file): ParseOneProperty(in_file, "lb")
def ParseScripts(in_file): ParseOneProperty(in_file, "sc")
@ -824,8 +841,8 @@ def NeedToSetNumericValue(nv, start, end, c_props):
assert "nt" not in c_props
return True
if nv != c_nv:
raise ValueError("UnicodeData.txt has nv=%s for %04lX..%04lX " +
"but DerivedNumericValues.txt has nv=%s" %
raise ValueError(("UnicodeData.txt has nv=%s for %04lX..%04lX " +
"but DerivedNumericValues.txt has nv=%s") %
(c_nv, start, end, nv))
return False
@ -920,31 +937,32 @@ def CompactBlock(b, i):
assert b[0] == _starts[i]
orig_i = i
# Count the number of occurrences of each property's value in this block.
num_cp_so_far = 0
# To minimize the output, count the number of ranges,
# not the number of code points.
num_ranges_so_far = 0
prop_counters = {}
while True:
start = _starts[i]
if start > b[1]: break
num_cp_in_this_range = _starts[i + 1] - start
props = _props[i]
for (pname, value) in props.iteritems():
if pname in prop_counters:
counter = prop_counters[pname]
else:
counter = {_null_or_defaults[pname]: num_cp_so_far}
counter = {_null_or_defaults[pname]: num_ranges_so_far}
prop_counters[pname] = counter
if value in counter:
counter[value] += num_cp_in_this_range
counter[value] += 1
else:
counter[value] = num_cp_in_this_range
counter[value] = 1
# Also count default values for properties that do not occur in a range.
for pname in prop_counters:
if pname not in props:
counter = prop_counters[pname]
value = _null_or_defaults[pname]
counter[value] += num_cp_in_this_range
num_cp_so_far += num_cp_in_this_range
# Invariant: For each counter, the sum of counts must equal num_cp_so_far.
counter[value] += 1
num_ranges_so_far += 1
# Invariant: For each counter, the sum of counts must equal num_ranges_so_far.
i += 1
# For each property that occurs within this block,
# set the most common value as a block property value.
@ -1519,7 +1537,7 @@ _files = {
"EastAsianWidth.txt": (DontCopy, ParseEastAsianWidth),
"GraphemeBreakProperty.txt": (DontCopy, ParseGraphemeBreakProperty),
"GraphemeBreakTest.txt": (PrependBOM, "testdata"),
"IndicMatraCategory.txt": (DontCopy, ParseIndicMatraCategory),
"IndicPositionalCategory.txt": (DontCopy, ParseIndicPositionalCategory),
"IndicSyllabicCategory.txt": (DontCopy, ParseIndicSyllabicCategory),
"LineBreak.txt": (DontCopy, ParseLineBreak),
"LineBreakTest.txt": (PrependBOM, "testdata"),