mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 14:31:31 +00:00
ICU-9437 update UCD tools for Unicode 6.2, especially new numeric values
X-SVN-Rev: 32193
This commit is contained in:
parent
68f6a941a2
commit
a2f3849b0d
2 changed files with 63 additions and 31 deletions
|
@ -45,7 +45,7 @@ the udata API for loading ICU data. Especially, a UDataInfo structure
|
|||
precedes the actual data. It contains platform properties values and the
|
||||
file format version.
|
||||
|
||||
The following is a description of format version 7 .
|
||||
The following is a description of format version 7.1.
|
||||
|
||||
Data contents:
|
||||
|
||||
|
@ -155,6 +155,8 @@ Encoding of numeric type and value in the 10-bit ntv field:
|
|||
0xb0..0x1df fraction ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16
|
||||
0x1e0..0x2ff large int ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
|
||||
(only one significant decimal digit)
|
||||
0x300..0x323 base-60 (sexagesimal) integer (new in format version 7.1)
|
||||
((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
|
||||
|
||||
--- Additional properties (new in format version 2.1) ---
|
||||
|
||||
|
@ -234,6 +236,14 @@ than a script code.
|
|||
|
||||
Change from UTrie to UTrie2.
|
||||
|
||||
--- Changes in format version 7.1 ---
|
||||
|
||||
Unicode 6.2 adds sexagesimal (base-60) numeric values:
|
||||
cp;12432;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH;nv=216000
|
||||
cp;12433;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN;nv=432000
|
||||
|
||||
The encoding of numeric values was extended with a base-60 (sexagesimal) integer range, ntv 0x300..0x323, to handle such values.
|
||||
|
||||
----------------------------------------------------------------------------- */
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
@ -249,8 +259,8 @@ static UDataInfo dataInfo={
|
|||
0,
|
||||
|
||||
{ 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */
|
||||
{ 7, 0, 0, 0 }, /* formatVersion */
|
||||
{ 6, 0, 0, 0 } /* dataVersion */
|
||||
{ 7, 1, 0, 0 }, /* formatVersion */
|
||||
{ 6, 2, 0, 0 } /* dataVersion */
|
||||
};
|
||||
|
||||
class CorePropsBuilder : public PropsBuilder {
|
||||
|
@ -301,6 +311,7 @@ CorePropsBuilder::setUnicodeVersion(const UVersionInfo version) {
|
|||
// For nt=U_NT_NUMERIC.
|
||||
static int32_t
|
||||
encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) {
|
||||
const char *original;
|
||||
/* get a possible minus sign */
|
||||
UBool isNegative;
|
||||
if(*s=='-') {
|
||||
|
@ -322,15 +333,16 @@ encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) {
|
|||
} else {
|
||||
/* normal number parsing */
|
||||
unsigned long ul=uprv_strtoul(s, &numberLimit, 10);
|
||||
if(ul>0x7fffffff) {
|
||||
if(s==numberLimit || (*numberLimit!=0 && *numberLimit!='/') || ul>0x7fffffff) {
|
||||
ntv=-1;
|
||||
} else {
|
||||
value=(int32_t)ul;
|
||||
}
|
||||
if(s<numberLimit && *numberLimit=='/') {
|
||||
if(ntv>=0 && *numberLimit=='/') {
|
||||
/* fractional value, get the denominator */
|
||||
ul=uprv_strtoul(numberLimit+1, &numberLimit, 10);
|
||||
if(ul==0 || ul>0x7fffffff) {
|
||||
s=numberLimit+1;
|
||||
ul=uprv_strtoul(s, &numberLimit, 10);
|
||||
if(s==numberLimit || *numberLimit!=0 || ul==0 || ul>0x7fffffff) {
|
||||
ntv=-1;
|
||||
} else {
|
||||
den=(int32_t)ul;
|
||||
|
@ -359,22 +371,41 @@ encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) {
|
|||
mant/=10;
|
||||
++exp;
|
||||
}
|
||||
// Note: value<=0x7fffffff guarantees exp<=33
|
||||
if(mant<=9) {
|
||||
ntv=((mant+14)<<5)+(exp-2);
|
||||
} else {
|
||||
// Try sexagesimal (base 60) numbers.
|
||||
mant=value;
|
||||
exp=0;
|
||||
while((mant%60)==0) {
|
||||
mant/=60;
|
||||
++exp;
|
||||
}
|
||||
if(mant<=9 && exp<=4) {
|
||||
ntv=((mant+0xbf)<<2)+(exp-1);
|
||||
} else {
|
||||
ntv=-1;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if(2<=exp && exp<=33 && 1<=value && value<=9) {
|
||||
/* large, single-significant-digit integer */
|
||||
ntv=((value+14)<<5)+(exp-2);
|
||||
} else {
|
||||
ntv=-1;
|
||||
}
|
||||
} else if(exp==0) {
|
||||
if(-1<=value && value<=17 && 1<=den && den<=16) {
|
||||
/* fraction */
|
||||
ntv=((value+12)<<4)+(den-1);
|
||||
}
|
||||
} else if(exp==0 && -1<=value && value<=17 && 1<=den && den<=16) {
|
||||
/* fraction */
|
||||
ntv=((value+12)<<4)+(den-1);
|
||||
} else if(exp==0 && value==-1 && den==0) {
|
||||
/* -1 parsed with den=0, encoded as pseudo-fraction -1/1 */
|
||||
ntv=((value+12)<<4);
|
||||
} else {
|
||||
ntv=-1;
|
||||
}
|
||||
if(ntv<0 || *numberLimit!=0) {
|
||||
fprintf(stderr, "genprops error: unable to encode numeric value nv=%s\n", s);
|
||||
fprintf(stderr, "genprops error: unable to encode numeric value nv=%s\n", original);
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
return ntv;
|
||||
|
|
|
@ -258,7 +258,7 @@ static const Value VALUES_blk[221] = {
|
|||
Value(UBLOCK_TAKRI, "Takri Takri"),
|
||||
};
|
||||
|
||||
static const Value VALUES_ccc[56] = {
|
||||
static const Value VALUES_ccc[57] = {
|
||||
Value(0, "NR Not_Reordered"),
|
||||
Value(1, "OV Overlay"),
|
||||
Value(7, "NK Nukta"),
|
||||
|
@ -299,7 +299,8 @@ static const Value VALUES_ccc[56] = {
|
|||
Value(122, "CCC122 CCC122"),
|
||||
Value(129, "CCC129 CCC129"),
|
||||
Value(130, "CCC130 CCC130"),
|
||||
Value(132, "CCC133 CCC133"),
|
||||
Value(132, "CCC132 CCC132"),
|
||||
Value(133, "CCC133 CCC133"),
|
||||
Value(200, "ATBL Attached_Below_Left"),
|
||||
Value(202, "ATB Attached_Below"),
|
||||
Value(214, "ATA Attached_Above"),
|
||||
|
@ -490,7 +491,7 @@ static const Value VALUES_lb[40] = {
|
|||
Value(U_LB_CLOSE_PARENTHESIS, "CP Close_Parenthesis"),
|
||||
Value(U_LB_CONDITIONAL_JAPANESE_STARTER, "CJ Conditional_Japanese_Starter"),
|
||||
Value(U_LB_HEBREW_LETTER, "HL Hebrew_Letter"),
|
||||
Value(U_LB_ZERO_WIDTH_JOINER, "ZJ Zero_Width_Joiner"),
|
||||
Value(U_LB_REGIONAL_INDICATOR, "RI Regional_Indicator"),
|
||||
};
|
||||
|
||||
static const Value VALUES_nt[4] = {
|
||||
|
@ -693,7 +694,7 @@ static const Value VALUES_NFKC_QC[3] = {
|
|||
Value(UNORM_MAYBE, "M Maybe"),
|
||||
};
|
||||
|
||||
static const Value VALUES_lccc[56] = {
|
||||
static const Value VALUES_lccc[57] = {
|
||||
Value(0, "NR Not_Reordered"),
|
||||
Value(1, "OV Overlay"),
|
||||
Value(7, "NK Nukta"),
|
||||
|
@ -734,7 +735,8 @@ static const Value VALUES_lccc[56] = {
|
|||
Value(122, "CCC122 CCC122"),
|
||||
Value(129, "CCC129 CCC129"),
|
||||
Value(130, "CCC130 CCC130"),
|
||||
Value(132, "CCC133 CCC133"),
|
||||
Value(132, "CCC132 CCC132"),
|
||||
Value(133, "CCC133 CCC133"),
|
||||
Value(200, "ATBL Attached_Below_Left"),
|
||||
Value(202, "ATB Attached_Below"),
|
||||
Value(214, "ATA Attached_Above"),
|
||||
|
@ -752,7 +754,7 @@ static const Value VALUES_lccc[56] = {
|
|||
Value(240, "IS Iota_Subscript"),
|
||||
};
|
||||
|
||||
static const Value VALUES_tccc[56] = {
|
||||
static const Value VALUES_tccc[57] = {
|
||||
Value(0, "NR Not_Reordered"),
|
||||
Value(1, "OV Overlay"),
|
||||
Value(7, "NK Nukta"),
|
||||
|
@ -793,7 +795,8 @@ static const Value VALUES_tccc[56] = {
|
|||
Value(122, "CCC122 CCC122"),
|
||||
Value(129, "CCC129 CCC129"),
|
||||
Value(130, "CCC130 CCC130"),
|
||||
Value(132, "CCC133 CCC133"),
|
||||
Value(132, "CCC132 CCC132"),
|
||||
Value(133, "CCC133 CCC133"),
|
||||
Value(200, "ATBL Attached_Below_Left"),
|
||||
Value(202, "ATB Attached_Below"),
|
||||
Value(214, "ATA Attached_Above"),
|
||||
|
@ -811,7 +814,7 @@ static const Value VALUES_tccc[56] = {
|
|||
Value(240, "IS Iota_Subscript"),
|
||||
};
|
||||
|
||||
static const Value VALUES_GCB[14] = {
|
||||
static const Value VALUES_GCB[13] = {
|
||||
Value(U_GCB_OTHER, "XX Other"),
|
||||
Value(U_GCB_CONTROL, "CN Control"),
|
||||
Value(U_GCB_CR, "CR CR"),
|
||||
|
@ -824,8 +827,7 @@ static const Value VALUES_GCB[14] = {
|
|||
Value(U_GCB_V, "V V"),
|
||||
Value(U_GCB_SPACING_MARK, "SM SpacingMark"),
|
||||
Value(U_GCB_PREPEND, "PP Prepend"),
|
||||
Value(U_GCB_AFTER_JOINER, "AJ After_Joiner"),
|
||||
Value(U_GCB_JOINER, "J Joiner"),
|
||||
Value(U_GCB_REGIONAL_INDICATOR, "RI Regional_Indicator"),
|
||||
};
|
||||
|
||||
static const Value VALUES_SB[15] = {
|
||||
|
@ -846,7 +848,7 @@ static const Value VALUES_SB[15] = {
|
|||
Value(U_SB_SCONTINUE, "SC SContinue"),
|
||||
};
|
||||
|
||||
static const Value VALUES_WB[15] = {
|
||||
static const Value VALUES_WB[14] = {
|
||||
Value(U_WB_OTHER, "XX Other"),
|
||||
Value(U_WB_ALETTER, "LE ALetter"),
|
||||
Value(U_WB_FORMAT, "FO Format"),
|
||||
|
@ -860,8 +862,7 @@ static const Value VALUES_WB[15] = {
|
|||
Value(U_WB_LF, "LF LF"),
|
||||
Value(U_WB_MIDNUMLET, "MB MidNumLet"),
|
||||
Value(U_WB_NEWLINE, "NL Newline"),
|
||||
Value(U_WB_AFTER_JOINER, "AJ After_Joiner"),
|
||||
Value(U_WB_JOINER, "J Joiner"),
|
||||
Value(U_WB_REGIONAL_INDICATOR, "RI Regional_Indicator"),
|
||||
};
|
||||
|
||||
static const Value VALUES_gcm[38] = {
|
||||
|
@ -965,7 +966,7 @@ static const Property PROPERTIES[94] = {
|
|||
Property(UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED, "CWKCF Changes_When_NFKC_Casefolded"),
|
||||
Property(UCHAR_BIDI_CLASS, "bc Bidi_Class", VALUES_bc, 19),
|
||||
Property(UCHAR_BLOCK, "blk Block", VALUES_blk, 221),
|
||||
Property(UCHAR_CANONICAL_COMBINING_CLASS, "ccc Canonical_Combining_Class", VALUES_ccc, 56),
|
||||
Property(UCHAR_CANONICAL_COMBINING_CLASS, "ccc Canonical_Combining_Class", VALUES_ccc, 57),
|
||||
Property(UCHAR_DECOMPOSITION_TYPE, "dt Decomposition_Type", VALUES_dt, 18),
|
||||
Property(UCHAR_EAST_ASIAN_WIDTH, "ea East_Asian_Width", VALUES_ea, 6),
|
||||
Property(UCHAR_GENERAL_CATEGORY, "gc General_Category", VALUES_gc, 30),
|
||||
|
@ -979,11 +980,11 @@ static const Property PROPERTIES[94] = {
|
|||
Property(UCHAR_NFKD_QUICK_CHECK, "NFKD_QC NFKD_Quick_Check", VALUES_NFKD_QC, 2),
|
||||
Property(UCHAR_NFC_QUICK_CHECK, "NFC_QC NFC_Quick_Check", VALUES_NFC_QC, 3),
|
||||
Property(UCHAR_NFKC_QUICK_CHECK, "NFKC_QC NFKC_Quick_Check", VALUES_NFKC_QC, 3),
|
||||
Property(UCHAR_LEAD_CANONICAL_COMBINING_CLASS, "lccc Lead_Canonical_Combining_Class", VALUES_lccc, 56),
|
||||
Property(UCHAR_TRAIL_CANONICAL_COMBINING_CLASS, "tccc Trail_Canonical_Combining_Class", VALUES_tccc, 56),
|
||||
Property(UCHAR_GRAPHEME_CLUSTER_BREAK, "GCB Grapheme_Cluster_Break", VALUES_GCB, 14),
|
||||
Property(UCHAR_LEAD_CANONICAL_COMBINING_CLASS, "lccc Lead_Canonical_Combining_Class", VALUES_lccc, 57),
|
||||
Property(UCHAR_TRAIL_CANONICAL_COMBINING_CLASS, "tccc Trail_Canonical_Combining_Class", VALUES_tccc, 57),
|
||||
Property(UCHAR_GRAPHEME_CLUSTER_BREAK, "GCB Grapheme_Cluster_Break", VALUES_GCB, 13),
|
||||
Property(UCHAR_SENTENCE_BREAK, "SB Sentence_Break", VALUES_SB, 15),
|
||||
Property(UCHAR_WORD_BREAK, "WB Word_Break", VALUES_WB, 15),
|
||||
Property(UCHAR_WORD_BREAK, "WB Word_Break", VALUES_WB, 14),
|
||||
Property(UCHAR_GENERAL_CATEGORY_MASK, "gcm General_Category_Mask", VALUES_gcm, 38),
|
||||
Property(UCHAR_NUMERIC_VALUE, "nv Numeric_Value"),
|
||||
Property(UCHAR_AGE, "age Age"),
|
||||
|
|
Loading…
Add table
Reference in a new issue