ICU-9437 update UCD tools for Unicode 6.2, especially new numeric values

X-SVN-Rev: 32193
This commit is contained in:
Markus Scherer 2012-08-17 19:18:06 +00:00
parent 68f6a941a2
commit a2f3849b0d
2 changed files with 63 additions and 31 deletions

View file

@ -45,7 +45,7 @@ the udata API for loading ICU data. Especially, a UDataInfo structure
precedes the actual data. It contains platform properties values and the
file format version.
The following is a description of format version 7 .
The following is a description of format version 7.1 .
Data contents:
@ -155,6 +155,8 @@ Encoding of numeric type and value in the 10-bit ntv field:
0xb0..0x1df fraction ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16
0x1e0..0x2ff large int ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
(only one significant decimal digit)
0x300..0x323 base-60 (sexagesimal) integer (new in format version 7.1)
((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
--- Additional properties (new in format version 2.1) ---
@ -234,6 +236,14 @@ than a script code.
Change from UTrie to UTrie2.
--- Changes in format version 7.1 ---
Unicode 6.2 adds sexagesimal (base-60) numeric values:
cp;12432;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH;nv=216000
cp;12433;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN;nv=432000
The encoding of numeric values was extended to handle such values.
----------------------------------------------------------------------------- */
U_NAMESPACE_USE
@ -249,8 +259,8 @@ static UDataInfo dataInfo={
0,
{ 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */
{ 7, 0, 0, 0 }, /* formatVersion */
{ 6, 0, 0, 0 } /* dataVersion */
{ 7, 1, 0, 0 }, /* formatVersion */
{ 6, 2, 0, 0 } /* dataVersion */
};
class CorePropsBuilder : public PropsBuilder {
@ -301,6 +311,7 @@ CorePropsBuilder::setUnicodeVersion(const UVersionInfo version) {
// For nt=U_NT_NUMERIC.
static int32_t
encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) {
const char *original;
/* get a possible minus sign */
UBool isNegative;
if(*s=='-') {
@ -322,15 +333,16 @@ encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) {
} else {
/* normal number parsing */
unsigned long ul=uprv_strtoul(s, &numberLimit, 10);
if(ul>0x7fffffff) {
if(s==numberLimit || (*numberLimit!=0 && *numberLimit!='/') || ul>0x7fffffff) {
ntv=-1;
} else {
value=(int32_t)ul;
}
if(s<numberLimit && *numberLimit=='/') {
if(ntv>=0 && *numberLimit=='/') {
/* fractional value, get the denominator */
ul=uprv_strtoul(numberLimit+1, &numberLimit, 10);
if(ul==0 || ul>0x7fffffff) {
s=numberLimit+1;
ul=uprv_strtoul(s, &numberLimit, 10);
if(s==numberLimit || *numberLimit!=0 || ul==0 || ul>0x7fffffff) {
ntv=-1;
} else {
den=(int32_t)ul;
@ -359,22 +371,41 @@ encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) {
mant/=10;
++exp;
}
// Note: value<=0x7fffffff guarantees exp<=33
if(mant<=9) {
ntv=((mant+14)<<5)+(exp-2);
} else {
// Try sexagesimal (base 60) numbers.
mant=value;
exp=0;
while((mant%60)==0) {
mant/=60;
++exp;
}
if(mant<=9 && exp<=4) {
ntv=((mant+0xbf)<<2)+(exp-1);
} else {
ntv=-1;
}
}
}
} else if(2<=exp && exp<=33 && 1<=value && value<=9) {
/* large, single-significant-digit integer */
ntv=((value+14)<<5)+(exp-2);
} else {
ntv=-1;
}
} else if(exp==0) {
if(-1<=value && value<=17 && 1<=den && den<=16) {
/* fraction */
ntv=((value+12)<<4)+(den-1);
}
} else if(exp==0 && -1<=value && value<=17 && 1<=den && den<=16) {
/* fraction */
ntv=((value+12)<<4)+(den-1);
} else if(exp==0 && value==-1 && den==0) {
/* -1 parsed with den=0, encoded as pseudo-fraction -1/1 */
ntv=((value+12)<<4);
} else {
ntv=-1;
}
if(ntv<0 || *numberLimit!=0) {
fprintf(stderr, "genprops error: unable to encode numeric value nv=%s\n", s);
fprintf(stderr, "genprops error: unable to encode numeric value nv=%s\n", original);
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
}
return ntv;

View file

@ -258,7 +258,7 @@ static const Value VALUES_blk[221] = {
Value(UBLOCK_TAKRI, "Takri Takri"),
};
static const Value VALUES_ccc[56] = {
static const Value VALUES_ccc[57] = {
Value(0, "NR Not_Reordered"),
Value(1, "OV Overlay"),
Value(7, "NK Nukta"),
@ -299,7 +299,8 @@ static const Value VALUES_ccc[56] = {
Value(122, "CCC122 CCC122"),
Value(129, "CCC129 CCC129"),
Value(130, "CCC130 CCC130"),
Value(132, "CCC133 CCC133"),
Value(132, "CCC132 CCC132"),
Value(133, "CCC133 CCC133"),
Value(200, "ATBL Attached_Below_Left"),
Value(202, "ATB Attached_Below"),
Value(214, "ATA Attached_Above"),
@ -490,7 +491,7 @@ static const Value VALUES_lb[40] = {
Value(U_LB_CLOSE_PARENTHESIS, "CP Close_Parenthesis"),
Value(U_LB_CONDITIONAL_JAPANESE_STARTER, "CJ Conditional_Japanese_Starter"),
Value(U_LB_HEBREW_LETTER, "HL Hebrew_Letter"),
Value(U_LB_ZERO_WIDTH_JOINER, "ZJ Zero_Width_Joiner"),
Value(U_LB_REGIONAL_INDICATOR, "RI Regional_Indicator"),
};
static const Value VALUES_nt[4] = {
@ -693,7 +694,7 @@ static const Value VALUES_NFKC_QC[3] = {
Value(UNORM_MAYBE, "M Maybe"),
};
static const Value VALUES_lccc[56] = {
static const Value VALUES_lccc[57] = {
Value(0, "NR Not_Reordered"),
Value(1, "OV Overlay"),
Value(7, "NK Nukta"),
@ -734,7 +735,8 @@ static const Value VALUES_lccc[56] = {
Value(122, "CCC122 CCC122"),
Value(129, "CCC129 CCC129"),
Value(130, "CCC130 CCC130"),
Value(132, "CCC133 CCC133"),
Value(132, "CCC132 CCC132"),
Value(133, "CCC133 CCC133"),
Value(200, "ATBL Attached_Below_Left"),
Value(202, "ATB Attached_Below"),
Value(214, "ATA Attached_Above"),
@ -752,7 +754,7 @@ static const Value VALUES_lccc[56] = {
Value(240, "IS Iota_Subscript"),
};
static const Value VALUES_tccc[56] = {
static const Value VALUES_tccc[57] = {
Value(0, "NR Not_Reordered"),
Value(1, "OV Overlay"),
Value(7, "NK Nukta"),
@ -793,7 +795,8 @@ static const Value VALUES_tccc[56] = {
Value(122, "CCC122 CCC122"),
Value(129, "CCC129 CCC129"),
Value(130, "CCC130 CCC130"),
Value(132, "CCC133 CCC133"),
Value(132, "CCC132 CCC132"),
Value(133, "CCC133 CCC133"),
Value(200, "ATBL Attached_Below_Left"),
Value(202, "ATB Attached_Below"),
Value(214, "ATA Attached_Above"),
@ -811,7 +814,7 @@ static const Value VALUES_tccc[56] = {
Value(240, "IS Iota_Subscript"),
};
static const Value VALUES_GCB[14] = {
static const Value VALUES_GCB[13] = {
Value(U_GCB_OTHER, "XX Other"),
Value(U_GCB_CONTROL, "CN Control"),
Value(U_GCB_CR, "CR CR"),
@ -824,8 +827,7 @@ static const Value VALUES_GCB[14] = {
Value(U_GCB_V, "V V"),
Value(U_GCB_SPACING_MARK, "SM SpacingMark"),
Value(U_GCB_PREPEND, "PP Prepend"),
Value(U_GCB_AFTER_JOINER, "AJ After_Joiner"),
Value(U_GCB_JOINER, "J Joiner"),
Value(U_GCB_REGIONAL_INDICATOR, "RI Regional_Indicator"),
};
static const Value VALUES_SB[15] = {
@ -846,7 +848,7 @@ static const Value VALUES_SB[15] = {
Value(U_SB_SCONTINUE, "SC SContinue"),
};
static const Value VALUES_WB[15] = {
static const Value VALUES_WB[14] = {
Value(U_WB_OTHER, "XX Other"),
Value(U_WB_ALETTER, "LE ALetter"),
Value(U_WB_FORMAT, "FO Format"),
@ -860,8 +862,7 @@ static const Value VALUES_WB[15] = {
Value(U_WB_LF, "LF LF"),
Value(U_WB_MIDNUMLET, "MB MidNumLet"),
Value(U_WB_NEWLINE, "NL Newline"),
Value(U_WB_AFTER_JOINER, "AJ After_Joiner"),
Value(U_WB_JOINER, "J Joiner"),
Value(U_WB_REGIONAL_INDICATOR, "RI Regional_Indicator"),
};
static const Value VALUES_gcm[38] = {
@ -965,7 +966,7 @@ static const Property PROPERTIES[94] = {
Property(UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED, "CWKCF Changes_When_NFKC_Casefolded"),
Property(UCHAR_BIDI_CLASS, "bc Bidi_Class", VALUES_bc, 19),
Property(UCHAR_BLOCK, "blk Block", VALUES_blk, 221),
Property(UCHAR_CANONICAL_COMBINING_CLASS, "ccc Canonical_Combining_Class", VALUES_ccc, 56),
Property(UCHAR_CANONICAL_COMBINING_CLASS, "ccc Canonical_Combining_Class", VALUES_ccc, 57),
Property(UCHAR_DECOMPOSITION_TYPE, "dt Decomposition_Type", VALUES_dt, 18),
Property(UCHAR_EAST_ASIAN_WIDTH, "ea East_Asian_Width", VALUES_ea, 6),
Property(UCHAR_GENERAL_CATEGORY, "gc General_Category", VALUES_gc, 30),
@ -979,11 +980,11 @@ static const Property PROPERTIES[94] = {
Property(UCHAR_NFKD_QUICK_CHECK, "NFKD_QC NFKD_Quick_Check", VALUES_NFKD_QC, 2),
Property(UCHAR_NFC_QUICK_CHECK, "NFC_QC NFC_Quick_Check", VALUES_NFC_QC, 3),
Property(UCHAR_NFKC_QUICK_CHECK, "NFKC_QC NFKC_Quick_Check", VALUES_NFKC_QC, 3),
Property(UCHAR_LEAD_CANONICAL_COMBINING_CLASS, "lccc Lead_Canonical_Combining_Class", VALUES_lccc, 56),
Property(UCHAR_TRAIL_CANONICAL_COMBINING_CLASS, "tccc Trail_Canonical_Combining_Class", VALUES_tccc, 56),
Property(UCHAR_GRAPHEME_CLUSTER_BREAK, "GCB Grapheme_Cluster_Break", VALUES_GCB, 14),
Property(UCHAR_LEAD_CANONICAL_COMBINING_CLASS, "lccc Lead_Canonical_Combining_Class", VALUES_lccc, 57),
Property(UCHAR_TRAIL_CANONICAL_COMBINING_CLASS, "tccc Trail_Canonical_Combining_Class", VALUES_tccc, 57),
Property(UCHAR_GRAPHEME_CLUSTER_BREAK, "GCB Grapheme_Cluster_Break", VALUES_GCB, 13),
Property(UCHAR_SENTENCE_BREAK, "SB Sentence_Break", VALUES_SB, 15),
Property(UCHAR_WORD_BREAK, "WB Word_Break", VALUES_WB, 15),
Property(UCHAR_WORD_BREAK, "WB Word_Break", VALUES_WB, 14),
Property(UCHAR_GENERAL_CATEGORY_MASK, "gcm General_Category_Mask", VALUES_gcm, 38),
Property(UCHAR_NUMERIC_VALUE, "nv Numeric_Value"),
Property(UCHAR_AGE, "age Age"),