ICU-12526 uprops.icu formatVersion 7.3: support new fraction numeric values like 3/80; ppucd.txt mostly no block compression for String/Misc properties; minor bug fixes

X-SVN-Rev: 38706
This commit is contained in:
Markus Scherer 2016-05-05 22:51:18 +00:00
parent 6e907382d0
commit 3e5578f3bf
4 changed files with 79 additions and 15 deletions

View file

@ -1053,7 +1053,7 @@ static int32_t indexes[UCASE_IX_TOP]={
0, 0, 0, 0
};
static uint8_t trieBlock[40000];
static uint8_t trieBlock[100000];
static int32_t trieSize;
void
@ -1083,15 +1083,20 @@ CasePropsBuilder::build(UErrorCode &errorCode) {
}
}
if(U_FAILURE(errorCode)) {
fprintf(stderr, "genprops error: unable to set UCASE_SENSITIVE: %s\n",
fprintf(stderr, "genprops/case error: unable to set UCASE_SENSITIVE: %s\n",
u_errorName(errorCode));
return;
}
utrie2_freeze(pTrie, UTRIE2_16_VALUE_BITS, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "genprops/case error: utrie2_freeze() failed: %s\n",
u_errorName(errorCode));
return;
}
trieSize=utrie2_serialize(pTrie, trieBlock, sizeof(trieBlock), &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "genprops error: utrie2_freeze()+utrie2_serialize() failed: %s (length %ld)\n",
fprintf(stderr, "genprops/case error: utrie2_serialize() failed: %s (length %ld)\n",
u_errorName(errorCode), (long)trieSize);
return;
}

View file

@ -45,7 +45,7 @@ the udata API for loading ICU data. Especially, a UDataInfo structure
precedes the actual data. It contains platform properties values and the
file format version.
The following is a description of format version 7.2 .
The following is a description of format version 7.3 .
Data contents:
@ -146,8 +146,7 @@ Encoding of numeric type and value in the 10-bit ntv field:
0 U_NT_NONE 0
1..10 U_NT_DECIMAL 0..9
11..20 U_NT_DIGIT 0..9
21..0x2ff U_NT_NUMERIC see below
0x300..0x3ff reserved
21..0x3ff U_NT_NUMERIC see below
For U_NT_NUMERIC:
ntv value
@ -157,6 +156,11 @@ Encoding of numeric type and value in the 10-bit ntv field:
(only one significant decimal digit)
0x300..0x323 base-60 (sexagesimal) integer (new in format version 7.1)
((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
0x324..0x33b fraction-20 (new in format version 7.3)
frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640
numerator: num = 2*(frac20&3)+1
denominator: den = 20<<(frac20>>2)
0x33c..0x3ff reserved
--- Additional properties (new in format version 2.1) ---
@ -250,6 +254,10 @@ ICU 57 adds 4 Emoji properties to vector word 2.
http://bugs.icu-project.org/trac/ticket/11802
http://www.unicode.org/reports/tr51/#Emoji_Properties
--- Changes in format version 7.3 ---
ICU 58 adds fraction-20 numeric values for new Unicode 9 Malayalam fraction characters.
----------------------------------------------------------------------------- */
U_NAMESPACE_USE
@ -265,7 +273,7 @@ static UDataInfo dataInfo={
0,
{ 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */
{ 7, 2, 0, 0 }, /* formatVersion */
{ 7, 3, 0, 0 }, /* formatVersion */
{ 8, 0, 0, 0 } /* dataVersion */
};
@ -314,10 +322,29 @@ CorePropsBuilder::setUnicodeVersion(const UVersionInfo version) {
uprv_memcpy(dataInfo.dataVersion, version, 4);
}
/**
 * Encodes the fraction value/den in the "fraction-20" numeric value format.
 *
 * Only numerators 1, 3, 5, 7 and denominators 20, 40, 80, 160, 320, 640
 * (20 times 2^0..2^5) are representable.
 * The result is UPROPS_NTV_FRACTION20_START+frac20, where
 * bits 1..0 of frac20 hold value/2 (0..3) and
 * bits 4..2 hold the power of 2 by which 20 is multiplied (0..5).
 *
 * @param value the numerator
 * @param den the denominator
 * @return the encoded value, or -1 if value/den does not fit this format
 */
static int32_t encodeFractional20(int32_t value, int32_t den) {
    if(den<20 || 640<den) { return -1; }
    int32_t frac20;
    if(value==1 || value==3 || value==5 || value==7) {
        frac20=value/2;
    } else {
        return -1;
    }
    // Denominator: 20 times which power of 2: 0..5 into bits 4..2
    do {
        if(den==20) {
            return UPROPS_NTV_FRACTION20_START+frac20;
        }
        // An odd denominator is not 20 times a power of 2.
        // Without this check the truncating division below would silently
        // turn e.g. 41 into 20 and encode 1/41 as if it were 1/40.
        if((den&1)!=0) {
            return -1;
        }
        den/=2;
        frac20+=4;
    } while(den>=20);
    return -1;
}
// For nt=U_NT_NUMERIC.
static int32_t
encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) {
const char *original;
const char *original=s;
/* get a possible minus sign */
UBool isNegative;
if(*s=='-') {
@ -407,6 +434,8 @@ encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) {
} else if(exp==0 && value==-1 && den==0) {
/* -1 parsed with den=0, encoded as pseudo-fraction -1/1 */
ntv=((value+12)<<4);
} else if(exp==0 && (ntv=encodeFractional20(value, den))>=0) {
// fits into fractional-20 format
} else {
ntv=-1;
}
@ -669,7 +698,7 @@ static int32_t indexes[UPROPS_INDEX_COUNT]={
0, 0, 0, 0
};
static uint8_t trieBlock[40000];
static uint8_t trieBlock[100000];
static int32_t trieSize;
static uint8_t props2TrieBlock[100000];
static int32_t props2TrieSize;
@ -681,18 +710,23 @@ CorePropsBuilder::build(UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return; }
utrie2_freeze(pTrie, UTRIE2_16_VALUE_BITS, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr,
"genprops/core error: utrie2_freeze(main trie) failed: %s\n",
u_errorName(errorCode));
return;
}
trieSize=utrie2_serialize(pTrie, trieBlock, sizeof(trieBlock), &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr,
"genprops error: utrie2_freeze(main trie)+utrie2_serialize() "
"failed: %s (length %ld)\n",
"genprops/core error: utrie2_serialize(main trie) failed: %s (length %ld)\n",
u_errorName(errorCode), (long)trieSize);
return;
}
props2Trie=upvec_compactToUTrie2WithRowIndexes(pv, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n",
fprintf(stderr, "genprops/core error: unable to build trie for additional properties: %s\n",
u_errorName(errorCode));
return;
}
@ -702,7 +736,8 @@ CorePropsBuilder::build(UErrorCode &errorCode) {
&errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr,
"genprops error: utrie2_freeze(additional properties)+utrie2_serialize() failed: %s\n",
"genprops/core error: utrie2_freeze(additional properties)+utrie2_serialize() "
"failed: %s\n",
u_errorName(errorCode));
return;
}

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2000-2015, International Business Machines
* Copyright (C) 2000-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -656,6 +656,13 @@ readAnElement(FILE *data,
buffer[--buflen] = 0;
}
if(buflen >= 3 && buffer[0] == (char)0xef &&
buffer[1] == (char)0xbb && buffer[2] == (char)0xbf) {
// U+FEFF UTF-8 signature byte sequence.
// Ignore, assuming it is at the start of the file.
buflen -= 3;
uprv_memmove(buffer, buffer + 3, buflen + 1); // +1: including NUL terminator
}
if(buffer[0] == 0 || buffer[0] == '#') {
return FALSE; // just a comment, skip whole line
}

View file

@ -58,6 +58,8 @@ _scripts_only_in_iso15924 = (
# Properties --------------------------------------------------------------- ***
# Properties that we do not want to store in ppucd.txt.
# Not a frozenset so that we can add aliases for simpler subsequent testing.
_ignored_properties = set((
# Other_Xyz only contribute to Xyz, store only the latter.
"OAlpha",
@ -95,6 +97,16 @@ _ignored_properties = set((
"cjkRSUnicode"
))
# These properties (short names) map code points to
# strings or other unusual values (property types String or Miscellaneous)
# that cannot be block-compressed (or would be confusing).
_uncompressible_props = frozenset((
"bmg", "bpb", "cf", "Conditional_Case_Mappings", "dm", "FC_NFKC",
"isc", "lc", "na", "na1", "Name_Alias", "NFKC_CF",
# scx is block-compressible.
"scf", "slc", "stc", "suc", "tc", "Turkic_Case_Folding", "uc"
))
# Dictionary of properties.
# Keyed by normalized property names and aliases.
# Each value is a tuple with
@ -985,7 +997,12 @@ def CompactBlock(b, i):
if count == 1: num_unique += 1
if max_value != _null_or_defaults[pname]:
# Avoid picking randomly among several unique values.
if (max_count > 1 or num_unique == 1):
# Do not compress uncompressible properties,
# with an exception for many empty-string values in a block
# (NFKC_CF='' for tags and variation selectors).
if ((max_count > 1 or num_unique == 1) and
((pname not in _uncompressible_props) or
(max_value == '' and max_count >= 12))):
b_props[pname] = max_value
# For each range and property, remove the default+block value
# but set the default value if that property was not set