diff --git a/icu4c/source/common/unicode/uchar.h b/icu4c/source/common/unicode/uchar.h index 7f48181822c..d0c979b3e5a 100644 --- a/icu4c/source/common/unicode/uchar.h +++ b/icu4c/source/common/unicode/uchar.h @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 1997-2003, International Business Machines +* Copyright (C) 1997-2004, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * @@ -286,6 +286,17 @@ typedef enum UProperty { mapping or _in_ the target of a case mapping. Not the same as the general category Cased_Letter. @draft ICU 2.6 */ UCHAR_CASE_SENSITIVE, + /** Binary property STerm (new in Unicode 4.0.1). + Sentence Terminal. Used in UAX #29: Text Boundaries + (http://www.unicode.org/reports/tr29/) + @draft ICU 3.0 */ + UCHAR_S_TERM, + /** Binary property Variation_Selector (new in Unicode 4.0.1). + Indicates all those characters that qualify as Variation Selectors. + For details on the behavior of these characters, + see StandardizedVariants.html and 15.6 Variation Selectors. + @draft ICU 3.0 */ + UCHAR_VARIATION_SELECTOR, /** One more than the last constant for binary Unicode properties. @stable ICU 2.1 */ UCHAR_BINARY_LIMIT, @@ -956,6 +967,11 @@ enum UBlockCode { /** @stable ICU 2.2 */ UBLOCK_CYRILLIC_SUPPLEMENTARY = 97, /*[0500]*/ + /** + * Unicode 4.0.1 renames the "Cyrillic Supplementary" block to "Cyrillic Supplement". 
+ * @draft ICU 3.0 + */ + UBLOCK_CYRILLIC_SUPPLEMENT = 97, /*[0500]*/ /** @stable ICU 2.2 */ UBLOCK_TAGALOG = 98, /*[1700]*/ /** @stable ICU 2.2 */ @@ -1215,6 +1231,8 @@ typedef enum ULineBreak { U_LB_HYPHEN, /*[HY]*/ U_LB_IDEOGRAPHIC, /*[ID]*/ U_LB_INSEPERABLE, /*[IN]*/ + /** Renamed from the misspelled "inseperable" in Unicode 4.0.1/ICU 3.0 @draft ICU 3.0 */ + U_LB_INSEPARABLE=U_LB_INSEPERABLE,/*[IN]*/ U_LB_INFIX_NUMERIC, /*[IS]*/ U_LB_LINE_FEED, /*[LF]*/ U_LB_NONSTARTER, /*[NS]*/ diff --git a/icu4c/source/common/unicode/uscript.h b/icu4c/source/common/unicode/uscript.h index fea971b7cff..11f118fc6c2 100644 --- a/icu4c/source/common/unicode/uscript.h +++ b/icu4c/source/common/unicode/uscript.h @@ -1,6 +1,6 @@ /* ********************************************************************** -* Copyright (C) 1997-2003, International Business Machines +* Copyright (C) 1997-2004, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * @@ -83,6 +83,9 @@ typedef enum UScriptCode { USCRIPT_TAI_LE, /* Tale */ USCRIPT_UGARITIC, /* Ugar */ + /** New script code in Unicode 4.0.1 @draft ICU 3.0 */ + USCRIPT_KATAKANA_OR_HIRAGANA,/*Hrkt */ + USCRIPT_CODE_LIMIT } UScriptCode; diff --git a/icu4c/source/common/uprops.c b/icu4c/source/common/uprops.c index 012c46cc8f2..ab3c1033235 100644 --- a/icu4c/source/common/uprops.c +++ b/icu4c/source/common/uprops.c @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2002-2003, International Business Machines +* Copyright (C) 2002-2004, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -214,7 +214,9 @@ static const struct { { 1, U_MASK(UPROPS_WHITE_SPACE) }, { 1, U_MASK(UPROPS_XID_CONTINUE) }, { 1, U_MASK(UPROPS_XID_START) }, - { -1, U_MASK(UPROPS_CASE_SENSITIVE_SHIFT) } + { -1, U_MASK(UPROPS_CASE_SENSITIVE_SHIFT) }, + { 2, U_MASK(UPROPS_V2_S_TERM) }, + { 2, U_MASK(UPROPS_V2_VARIATION_SELECTOR) } }; U_CAPI UBool U_EXPORT2 diff --git a/icu4c/source/common/uprops.h b/icu4c/source/common/uprops.h index ab6907ae6e0..588aad61db2 100644 --- a/icu4c/source/common/uprops.h +++ b/icu4c/source/common/uprops.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2002-2003, International Business Machines +* Copyright (C) 2002-2004, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -164,6 +164,7 @@ enum { /* * Properties in vector word 2 * Bits + * 31..24 More binary properties * 13..11 Joining Type * 10.. 5 Joining Group * 4.. 0 Decomposition Type @@ -176,6 +177,12 @@ enum { #define UPROPS_DT_MASK 0x0000001f +enum { + UPROPS_V2_S_TERM=24, /* new in ICU 3.0 and Unicode 4.0.1 */ + UPROPS_V2_VARIATION_SELECTOR, /* =25; binary property Variation_Selector, new in ICU 3.0 and Unicode 4.0.1 */ + UPROPS_V2_TOP /* one more than the last binary-property bit index assigned in vector word 2; must be <=32 */ +}; + /** * Get a properties vector word for a code point. * Implemented in uchar.c for uprops.c. diff --git a/icu4c/source/tools/gennorm/gennorm.c b/icu4c/source/tools/gennorm/gennorm.c index fc706e230e5..f75537b7010 100644 --- a/icu4c/source/tools/gennorm/gennorm.c +++ b/icu4c/source/tools/gennorm/gennorm.c @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2001-2003, International Business Machines +* Copyright (C) 2001-2004, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -246,8 +246,22 @@ derivedNormalizationPropertiesLineFn(void *context, qcFlags&=0xf; } else if(0==uprv_memcmp(s, "MAYBE", 5)) { qcFlags&=0x30; + } else if(0==uprv_memcmp(s, "QC", 2) && *(s=(char *)u_skipWhitespace(s+2))==';') { + /* + * Unicode 4.0.1: + * changes single field "NFD_NO" -> two fields "NFD_QC; N" etc. + */ + /* start of the field */ + s=(char *)u_skipWhitespace(s+1); + if(*s=='N') { + qcFlags&=0xf; + } else if(*s=='M') { + qcFlags&=0x30; + } else { + return; /* do nothing for "Yes" because it's the default value */ + } } else { - return; + return; /* do nothing for "Yes" because it's the default value */ } /* set this flag for all code points in this range */ @@ -259,7 +273,10 @@ derivedNormalizationPropertiesLineFn(void *context, while(start<=end) { setCompositionExclusion(start++); } - } else if(0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') { + } else if( /* accept old "FNC" and new "FC_NFKC"; skip each name by its own length (3 or 7) so that s lands on the ';' */ + (0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') || + (0==uprv_memcmp(s, "FC_NFKC", 7) && *(s=(char *)u_skipWhitespace(s+7))==';') + ) { /* FC_NFKC_Closure, parse field 2 to get the string */ char *t; diff --git a/icu4c/source/tools/genprops/props2.c b/icu4c/source/tools/genprops/props2.c index 0e98acea023..78ea4ca6ed1 100644 --- a/icu4c/source/tools/genprops/props2.c +++ b/icu4c/source/tools/genprops/props2.c @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2002-2003, International Business Machines +* Copyright (C) 2002-2004, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -265,7 +265,11 @@ propListNames[]={ { "Unified_Ideograph", 1, UPROPS_UNIFIED_IDEOGRAPH }, { "Deprecated", 1, UPROPS_DEPRECATED }, { "Soft_Dotted", 1, UPROPS_SOFT_DOTTED }, - { "Logical_Order_Exception", 1, UPROPS_LOGICAL_ORDER_EXCEPTION } + { "Logical_Order_Exception", 1, UPROPS_LOGICAL_ORDER_EXCEPTION }, + + /* new properties in Unicode 4.0.1 */ + { "STerm", 2, UPROPS_V2_S_TERM }, + { "Variation_Selector", 2, UPROPS_V2_VARIATION_SELECTOR } }; static const Binaries @@ -399,7 +403,7 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr /* process various UCD .txt files */ /* add Han numeric types & values */ - parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 3, numericLineFn, pErrorCode); + parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode); /* set proper bidi class for unassigned code points (Cn) */ parseTwoFieldFile(filename, basename, "DerivedBidiClass", suffix, bidiClassLineFn, pErrorCode); @@ -537,7 +541,7 @@ numericLineFn(void *context, Props newProps; char *s, *end; uint32_t start, limit, value, oldProps32; - int32_t type, oldType; + int32_t oldType; char c; UBool isFraction; @@ -586,26 +590,22 @@ numericLineFn(void *context, } } - /* parse numeric type */ - s=trimTerminateField(fields[2][0], fields[2][1]); - type=u_getPropertyValueEnum(UCHAR_NUMERIC_TYPE, s); - if(type<=0) { - fprintf(stderr, "genprops error: unknown numeric type in DerivedNumericValues.txt field 1 at %s\n", s); - exit(U_PARSE_ERROR); - } + /* + * Unicode 4.0.1 removes the third column that used to list the numeric type. + * Assume that either the data is the same as in UnicodeData.txt, + * or else that the numeric type is "numeric". 
+ * This should work because we only expect to add numeric values for + * Han characters; for those, UnicodeData.txt lists only ranges without + * specific properties for single characters. + */ for(; start