ICU-8972 redo writing of pnames_data.h: write properties & their values in uchar.h order, write strings rather than double indirection through name groups & string table; stable output with minor, proportional diffs in version upgrades, and much simpler code

X-SVN-Rev: 31164
This commit is contained in:
Markus Scherer 2011-12-21 07:51:56 +00:00
parent cec5a252fd
commit 05d42d4ed3

View file

@ -110,8 +110,7 @@ _ignored_properties = set((
# 0: Type of property (binary, enum, ...)
# 1: List of aliases; short & long name followed by other aliases.
# The short name is "" if it is listed as "n/a" in PropertyValueAliases.txt.
# 2: Dictionary, maps short property value names
# initially to None, later to ICU4C API enum constants.
# 2: Set of short property value names.
# 3: Dictionary of property values.
# For Catalog & Enumerated properties,
# maps each value name to a list of aliases.
@ -140,11 +139,17 @@ _defaults = {}
# Initialized after parsing is done.
_null_or_defaults = {}
# Dictionary of short property names mapped to ICU4C UProperty enum constants.
_property_name_to_enum = {}
# List of properties with an ICU UProperty enum.
# Each item is an (enum, pname, values) tuple.
# - enum: the ICU enum UProperty constant string
# - pname: the UCD short property name
# - values: list of (enum, vname) pairs per property value
# - enum: the ICU property value's enum constant string
# - vname: the UCD short property value name
_icu_properties = []
# Dictionary of short gc value names mapped to UCharCategory enum constants.
_gc_vname_to_enum = {}
# Dictionary of short property names mapped to _icu_properties items.
_pname_to_icu_prop = {}
_non_alnum_re = re.compile("[^a-zA-Z0-9]")
@ -179,8 +184,7 @@ def GetShortPropertyName(pname):
if pname in _null_values: return pname # pname is already the short name.
prop = GetProperty(pname)
if not prop: return "" # For ignored properties.
name = prop[1][0]
return name if name else prop[1][1] # Long name if no short name.
return prop[1][0] or prop[1][1] # Long name if no short name.
def GetShortPropertyValueName(prop, vname):
@ -194,8 +198,7 @@ def GetShortPropertyValueName(prop, vname):
raise NameError("unknown value name %s for property %s\n" %
(vname, prop[1][0]))
values[vname] = aliases
short_name = aliases[0]
return short_name if short_name else aliases[1] # Long name if no short name.
return aliases[0] or aliases[1] # Long name if no short name.
def NormalizePropertyValue(prop, vname):
@ -456,7 +459,7 @@ def ParsePropertyAliases(in_file):
_null_values[name] = 0
else:
_null_values[name] = null_value
prop = (prop_type, aliases, {}, {})
prop = (prop_type, aliases, set(), {})
for alias in aliases:
_properties[alias] = prop
_properties[NormPropName(alias)] = prop
@ -470,13 +473,13 @@ def ParsePropertyAliases(in_file):
# Turkic_Case_Folding: The 'T' mappings in CaseFolding.txt.
name = "Turkic_Case_Folding"
_null_values[name] = ""
prop = ("String", [name, name], {}, {})
prop = ("String", [name, name], set(), {})
_properties[name] = prop
_properties[NormPropName(name)] = prop
# Conditional_Case_Mappings: SpecialCasing.txt lines with conditions.
name = "Conditional_Case_Mappings"
_null_values[name] = ""
prop = ("Miscellaneous", [name, name], {}, {})
prop = ("Miscellaneous", [name, name], set(), {})
_properties[name] = prop
_properties[NormPropName(name)] = prop
# lccc = ccc of first cp in canonical decomposition.
@ -497,7 +500,7 @@ def ParsePropertyAliases(in_file):
# Script_Extensions
if "scx" not in _properties:
_null_values["scx"] = ""
prop = ("Miscellaneous", ["scx", "Script_Extensions"], {}, {})
prop = ("Miscellaneous", ["scx", "Script_Extensions"], set(), {})
_properties["scx"] = prop
_properties["Script_Extensions"] = prop
_properties["scriptextensions"] = prop
@ -541,7 +544,7 @@ def ParsePropertyValueAliases(in_file):
if short_name == "n/a": # no short name
fields[0] = ""
short_name = fields[1]
prop[2][short_name] = None
prop[2].add(short_name)
values = prop[3]
for alias in fields:
if alias:
@ -568,14 +571,14 @@ def ParsePropertyValueAliases(in_file):
# Add ISO 15924-only script codes.
# Only for the ICU script code API, not necessary for parsing the UCD.
script_prop = _properties["sc"]
short_script_names = script_prop[2] # dict
short_script_names = script_prop[2] # set
script_values = script_prop[3] # dict
remove_scripts = []
for script in _scripts_only_in_iso15924:
if script in short_script_names:
remove_scripts.append(script)
else:
short_script_names[script] = None
short_script_names.add(script)
# Do not invent a Unicode long script name before the UCD adds the script.
script_list = [script, script] # [short, long]
script_values[script] = script_list
@ -1470,30 +1473,6 @@ def PrintNameStats():
# ICU API ------------------------------------------------------------------ ***
# Sample line to match:
# USCRIPT_LOMA = 139,/* Loma */
_uscript_re = re.compile(
" *(USCRIPT_[A-Z_]+) *= *[0-9]+ *, */\* *([A-Z][a-z]{3}) *\*/")
def ParseUScriptHeader(icu_src_root):
uscript_path = os.path.join(icu_src_root, "source",
"common", "unicode", "uscript.h")
short_script_name_to_enum = _properties["sc"][2]
scripts_not_in_ucd = set()
with open(uscript_path, "r") as uscript_file:
for line in uscript_file:
match = _uscript_re.match(line)
if match:
(script_enum, script_code) = match.group(1, 2)
if script_code not in short_script_name_to_enum:
scripts_not_in_ucd.add(script_code)
else:
short_script_name_to_enum[script_code] = script_enum
if scripts_not_in_ucd:
raise ValueError("uscript.h has UScript constants for scripts "
"not in the UCD nor in ISO 15924: %s" % scripts_not_in_ucd)
# Sample line to match:
# UCHAR_UNIFIED_IDEOGRAPH=29,
_uchar_re = re.compile(
@ -1501,7 +1480,7 @@ _uchar_re = re.compile(
# Sample line to match:
# /** Zs @stable ICU 2.0 */
_gc_comment_re = re.compile(" */\*\* *([A-Z][a-z]) *")
_gc_comment_re = re.compile(" */\*\* *([A-Z][a-z]) ")
# Sample line to match:
# U_SPACE_SEPARATOR = 12,
@ -1509,7 +1488,7 @@ _gc_re = re.compile(" *(U_[A-Z_]+) *= *[0-9]+,")
# Sample line to match:
# /** L @stable ICU 2.0 */
_bc_comment_re = re.compile(" */\*\* *([A-Z]{1,3}) *")
_bc_comment_re = re.compile(" */\*\* *([A-Z]{1,3}) ")
# Sample line to match:
# U_LEFT_TO_RIGHT = 0,
@ -1533,21 +1512,18 @@ def ParseUCharHeader(icu_src_root):
uchar_path = os.path.join(icu_src_root, "source",
"common", "unicode", "uchar.h")
with open(uchar_path, "r") as uchar_file:
mode = ""
prop = None
comment_value = "??"
mode = "" # Mode string (=pname) during context-sensitive parsing.
comment_value = "" # Property value from a comment preceding an enum.
# Note: The enum UProperty is first in uchar.h, before the enums for values.
for line in uchar_file:
# Parse some enums via context-sensitive "modes".
# Necessary because the enum constant names do not contain
# enough information.
if "enum UCharCategory" in line:
mode = "gc"
prop = _properties["gc"]
comment_value = ""
continue
if mode == "gc":
# Leave the normal short-to-enum map which is shared between gc & gcm
# with enums like U_GC_ZS_MASK.
# For writing gc enums to pnames_data.h use _gc_vname_to_enum.
if line.startswith("}"):
mode = ""
continue
@ -1556,15 +1532,17 @@ def ParseUCharHeader(icu_src_root):
comment_value = match.group(1)
continue
match = _gc_re.match(line)
if match:
if match and comment_value:
gc_enum = match.group(1)
prop = _properties["gc"]
vname = GetShortPropertyValueName(prop, comment_value)
_gc_vname_to_enum[vname] = gc_enum
icu_values = _pname_to_icu_prop["gc"][2]
icu_values.append((gc_enum, vname))
comment_value = ""
continue
if "enum UCharDirection {" in line:
mode = "bc"
prop = _properties["bc"]
comment_value = "??"
comment_value = ""
continue
if mode == "bc":
if line.startswith("}"):
@ -1575,10 +1553,13 @@ def ParseUCharHeader(icu_src_root):
comment_value = match.group(1)
continue
match = _bc_re.match(line)
if match:
if match and comment_value:
bc_enum = match.group(1)
prop = _properties["bc"]
vname = GetShortPropertyValueName(prop, comment_value)
prop[2][vname] = bc_enum
icu_values = _pname_to_icu_prop["bc"][2]
icu_values.append((bc_enum, vname))
comment_value = ""
continue
# No mode, parse enum constants whose names contain
# enough information to parse without requiring context.
@ -1589,7 +1570,9 @@ def ParseUCharHeader(icu_src_root):
# Ignore "UCHAR_BINARY_LIMIT=57," etc.
continue
pname = GetShortPropertyName(prop_enum[6:])
_property_name_to_enum[pname] = prop_enum
icu_prop = (prop_enum, pname, [])
_icu_properties.append(icu_prop)
_pname_to_icu_prop[pname] = icu_prop
continue
match = _ublock_re.match(line)
if match:
@ -1598,107 +1581,93 @@ def ParseUCharHeader(icu_src_root):
continue
prop = _properties["blk"]
vname = GetShortPropertyValueName(prop, prop_enum[7:])
prop[2][vname] = prop_enum
icu_values = _pname_to_icu_prop["blk"][2]
icu_values.append((prop_enum, vname))
continue
match = _prop_and_value_re.match(line)
if match:
(prop_enum, vname) = match.group(1, 3)
if vname == "COUNT" or _prop_and_alias_re.match(line):
continue
prop = GetProperty(match.group(2))
pname = GetShortPropertyName(match.group(2))
prop = _properties[pname]
vname = GetShortPropertyValueName(prop, vname)
prop[2][vname] = prop_enum
icu_values = _pname_to_icu_prop[pname][2]
icu_values.append((prop_enum, vname))
# ccc, lccc, tccc use their numeric values as "enum" values.
# In the UCD data, these numeric values are the first value names,
# followed by the short & long value names.
# List the ccc values in numeric order.
prop = _properties["ccc"]
icu_values = _pname_to_icu_prop["ccc"][2]
for ccc in sorted([int(name) for name in prop[2]]):
icu_values.append((ccc, str(ccc)))
_pname_to_icu_prop["lccc"][2].extend(icu_values) # Copy ccc -> lccc.
_pname_to_icu_prop["tccc"][2].extend(icu_values) # Copy ccc -> tccc.
# No need to parse predictable General_Category_Mask enum constants.
short_gcm_name_to_enum = _properties["gcm"][2]
for value in short_gcm_name_to_enum:
short_gcm_name_to_enum[value] = "U_GC_" + value.upper() + "_MASK"
# Just define them in ASCII order.
prop = _properties["gcm"]
icu_values = _pname_to_icu_prop["gcm"][2]
for vname in sorted(prop[2]):
icu_values.append(("U_GC_" + vname.upper() + "_MASK", vname))
# Hardcode known values for the normalization quick check properties,
# see unorm2.h for the UNormalizationCheckResult enum.
short_name_to_enum = _properties["NFC_QC"][2]
short_name_to_enum["N"] = "UNORM_NO"
short_name_to_enum["Y"] = "UNORM_YES"
short_name_to_enum["M"] = "UNORM_MAYBE"
short_name_to_enum = _properties["NFKC_QC"][2]
short_name_to_enum["N"] = "UNORM_NO"
short_name_to_enum["Y"] = "UNORM_YES"
short_name_to_enum["M"] = "UNORM_MAYBE"
icu_values = _pname_to_icu_prop["NFC_QC"][2]
icu_values.append(("UNORM_NO", "N"))
icu_values.append(("UNORM_YES", "Y"))
icu_values.append(("UNORM_MAYBE", "M"))
_pname_to_icu_prop["NFKC_QC"][2].extend(icu_values) # Copy NFC -> NFKC.
# No "maybe" values for NF[K]D.
short_name_to_enum = _properties["NFD_QC"][2]
short_name_to_enum["N"] = "UNORM_NO"
short_name_to_enum["Y"] = "UNORM_YES"
short_name_to_enum = _properties["NFKD_QC"][2]
short_name_to_enum["N"] = "UNORM_NO"
short_name_to_enum["Y"] = "UNORM_YES"
icu_values = _pname_to_icu_prop["NFD_QC"][2]
icu_values.append(("UNORM_NO", "N"))
icu_values.append(("UNORM_YES", "Y"))
_pname_to_icu_prop["NFKD_QC"][2].extend(icu_values) # Copy NFD -> NFKD.
def WritePNamesDataHeader(out_path):
# Build a sorted list of (key0, enum, aliases) tuples
# to emulate the output order of the old genpname/preparse.pl.
# key0 is either a preparse.pl property type string (for property names)
# or a Unicode short property name (for property value names).
# enum is the ICU4C enum constant name.
# aliases is the tuple of the property's or value's names and aliases.
# (We use a tuple, not the original list,
# so that we can use it as a dict key.)
# TODO: once we are sure this works, simplify the order;
# for example, change all "_bp" etc. to just ""
# (outputs property names first in enum order),
# and sorting ccc by numbers not strings
# TODO: simplify further, to make pnames_data.h more stable;
# try not to print string or group index numbers
# TODO: wiki/ReviewTicket8972 with diff links
prop_type_to_old_type = {
"Binary": "_bp",
"Bitmask": "_op",
"Catalog": "_ep",
"Enumerated": "_ep",
"Miscellaneous": "_sp",
"Numeric": "_dp",
"String": "_sp"
}
pnames_data = [
("binprop", "0", tuple(_binary_values["N"])),
("binprop", "1", tuple(_binary_values["Y"]))
]
# Sample line to match:
# USCRIPT_LOMA = 139,/* Loma */
_uscript_re = re.compile(
" *(USCRIPT_[A-Z_]+) *= *[0-9]+ *, */\* *([A-Z][a-z]{3}) *\*/")
def ParseUScriptHeader(icu_src_root):
uscript_path = os.path.join(icu_src_root, "source",
"common", "unicode", "uscript.h")
icu_values = _pname_to_icu_prop["sc"][2]
with open(uscript_path, "r") as uscript_file:
for line in uscript_file:
match = _uscript_re.match(line)
if match:
(script_enum, script_code) = match.group(1, 2)
icu_values.append((script_enum, script_code))
def CheckPNamesData():
"""Checks that every ICU property has a full set of value enum constants,
and that the _icu_properties value names map back to the UCD."""
missing_enums = []
# Only properties that have ICU API.
for (pname, prop_enum) in _property_name_to_enum.iteritems():
for (p_enum, pname, values) in _icu_properties:
prop = _properties[pname]
# Sometimes the uchar.h UProperty type differs
# from the PropertyAliases.txt type.
if pname == "age":
type = "_sp"
elif pname in ("gcm", "scx"):
type = "_op"
else:
type = prop_type_to_old_type[prop[0]]
pnames_data.append((type, prop_enum, tuple(prop[1])))
if type != "_bp" and pname != "age":
short_name_to_enum = prop[2]
if pname.endswith("ccc"):
# ccc, lccc, tccc use the string forms of their numeric values
# as "enum" values.
# In the UCD data, these numeric strings are the first value names,
# followed by the short & long value names.
# Omit the numeric strings from the aliases as well.
for name in short_name_to_enum:
pnames_data.append((pname, name, tuple(prop[3][name][1:])))
else:
if pname == "gc":
# See comment about _gc_vname_to_enum in ParseUCharHeader().
short_name_to_enum = _gc_vname_to_enum
for (name, enum) in short_name_to_enum.iteritems():
if enum:
pnames_data.append((pname, enum, tuple(prop[3][name])))
else:
missing_enums.append((pname, name))
vnames = set(prop[2]) # Modifiable copy of the set of short value names.
for (v_enum, vname) in values:
if vname not in vnames:
raise ValueError("%s = %s (uchar.h %s) not in the UCD\n" %
(pname, vname, v_enum))
vnames.remove(vname)
# Exceptions to the all-values check:
# - ICU does not have specific enum values for binary No/Yes.
# - ICU represents Age values via UVersionInfo rather than enum constants.
# - gc: ICU enum UCharCategory only has the single-category values.
# (ICU's gcm property has all of the UCD gc property values.)
if vnames and not (prop[0] == "Binary" or pname in ("age", "gc")):
missing_enums.append((pname, vnames))
if missing_enums:
raise ValueError(
"missing uchar.h enum constants for some property values: %s" %
missing_enums)
pnames_data.sort()
# Write pnames_data.h.
def WritePNamesDataHeader(out_path):
year = datetime.date.today().strftime("%Y")
with open(out_path, "w") as out_file:
out_file.write("""/**
@ -1707,133 +1676,53 @@ def WritePNamesDataHeader(out_path):
* others. All Rights Reserved.
*
* machine-generated by: icu/tools/unicode/py/preparseucd.py
*
* Unicode """ + _ucd_version + """
*/
""")
out_file.write("/* Unicode version %s */\n" % _ucd_version)
v = _ucd_version.split(".")
while len(v) < 4: v.append("0")
for i in xrange(4):
out_file.write("const uint8_t VERSION_%d = %s;\n" % (i, v[i]))
out_file.write("\n")
# Write the string table with all of the names and aliases
# of all of the properties and their values.
# Unique strings in ASCII order.
# The first string must be the empty string.
strings_set = set([""])
for (key0, enum, aliases) in pnames_data:
for s in aliases: strings_set.add(s)
strings = sorted(strings_set)
out_file.write("const int32_t STRING_COUNT = %d;\n\n" % len(strings))
# While printing, create a mapping from string table entry to index.
string_to_id = {}
out_file.write("const AliasName STRING_TABLE[] = {\n")
for s in strings:
i = len(string_to_id)
out_file.write(' AliasName("%s", %d),\n' % (s, i))
string_to_id[s] = i
# Write an array of "binprop" Value object initializers
# with the value aliases shared among all binary properties.
out_file.write("const int32_t VALUES_binprop_COUNT = 2;\n\n")
out_file.write("const Value VALUES_binprop[] = {\n")
out_file.write(' Value(0, "%s"),\n' % " ".join(_binary_values["N"]))
out_file.write(' Value(1, "%s"),\n' % " ".join(_binary_values["Y"]))
out_file.write("};\n\n")
# Emit the name group table.
# [A table of name groups. A name group is the list of names and aliases
# for a property or property value.
# The name group table looks like this:
#
# 114, -115, 116, -117, 0, -118, 65, -64, ...
# [0] [2] [4] [6]
#
# The entry at [0] consists of 2 strings, 114 and 115.
# The entry at [2] consists of 116 and 117, etc.
# The last entry is negative.
# Build the name group list with nameGroup indices.
name_groups = []
# Count the total number of values, not just the groups.
name_groups_total_length = 0
# Check for duplicate name groups, and reuse the first of a kind.
group_to_int = {}
for (key0, enum, aliases) in pnames_data:
index = group_to_int.get(aliases)
if index == None:
index = name_groups_total_length
group_to_int[aliases] = index
name_groups.append([string_to_id[s] for s in aliases])
name_groups_total_length += len(aliases)
out_file.write("const int32_t NAME_GROUP_COUNT = %d;\n\n" %
name_groups_total_length)
out_file.write("int32_t NAME_GROUP[] = {\n")
# Emit one group per line, with annotations.
max_names = 0 # Maximum number of names & aliases per item.
start = 0
for group in name_groups:
line = " "
aliases = [] # For comments.
for i in group[:-1]:
line += "%d, " % i
aliases.append('"%s"' % strings[i])
# Negative entry terminates the group.
i = group[-1]
line += "%d, " % -i
aliases.append('"%s"' % strings[i])
out_file.write(
line + " " * (24 - len(line)) +
"/* %3d: " % start + ", ".join(aliases) + " */\n")
length = len(group)
if length > max_names: max_names = length
start += length
out_file.write("};\n\n")
out_file.write("#define MAX_NAMES_PER_GROUP %d\n\n" % max_names)
# Emit the enumerated property values.
i = 0
while i < len(pnames_data):
pname = pnames_data[i][0]
if pname.startswith("_"):
i += 1
continue
if pname == "binprop":
count = 2
elif pname == "gc":
count = len(_gc_vname_to_enum)
else:
count = len(_properties[pname][2])
# For each property with named values, write an array of
# Value object initializers with the value enum and the aliases.
for (p_enum, pname, values) in _icu_properties:
if not values: continue
out_file.write("const int32_t VALUES_%s_COUNT = %d;\n\n" %
(pname, count))
out_file.write("const Alias VALUES_%s[] = {\n" % pname)
limit = i + count
while i < limit:
(pname, enum, aliases) = pnames_data[i]
out_file.write(" Alias((int32_t) %s, %d),\n" %
(enum, group_to_int[aliases]))
i += 1
(pname, len(values)))
out_file.write("const Value VALUES_%s[] = {\n" % pname)
for (v_enum, vname) in values:
aliases = _properties[pname][3][vname]
# ccc, lccc, tccc: Omit the numeric strings from the aliases.
# (See the comment about ccc in the PropertyValueAliases.txt header.)
if pname.endswith("ccc"): aliases = aliases[1:]
cast = "(int32_t)" if pname == "gcm" else ""
out_file.write(' Value(%s%s, "%s"),\n' %
(cast, v_enum, " ".join(aliases)))
out_file.write("};\n\n")
# Emit the top-level properties (binary, enumerated, etc.).
out_file.write("const int32_t PROPERTY_COUNT = %d;\n\n" %
len(_property_name_to_enum))
out_file.write("const Property PROPERTY[] = {\n")
for (pname, enum, aliases) in pnames_data:
if not pname.startswith("_"): continue
pname = aliases[0]
if not pname: pname = aliases[1] # Long name if no short name.
# For each property, write a Property object initializer
# with the property enum, its aliases, and a reference to its values.
out_file.write("const int32_t PROPERTIES_COUNT = %d;\n\n" %
len(_icu_properties))
out_file.write("const Property PROPERTIES[] = {\n")
for (enum, pname, values) in _icu_properties:
prop = _properties[pname]
group_index = group_to_int[aliases]
if prop[2] and pname != "age": # Property with named values.
aliases = prop[1]
if values: # Property with named values.
if prop[0] == "Binary": pname = "binprop"
out_file.write(
" Property((int32_t) %s, %d, VALUES_%s_COUNT, VALUES_%s),\n" %
(enum, group_index, pname, pname))
' Property(%s, "%s", VALUES_%s, VALUES_%s_COUNT),\n' %
(enum, " ".join(aliases), pname, pname))
else:
out_file.write(" Property((int32_t) %s, %d, 0, NULL),\n" %
(enum, group_index))
out_file.write(' Property(%s, "%s", NULL, 0),\n' %
(enum, " ".join(aliases)))
out_file.write("};\n")
# main() ------------------------------------------------------------------- ***
@ -1878,10 +1767,13 @@ def main():
with open(out_path, "w") as out_file:
WritePreparsedUCD(out_file)
out_file.flush()
# TODO: PrintNameStats()
# ICU data for property & value names API
ParseUScriptHeader(icu_src_root)
ParseUCharHeader(icu_src_root)
ParseUScriptHeader(icu_src_root)
CheckPNamesData()
genprops_path = os.path.join(icu_tools_root, "unicode", "c", "genprops")
if not os.path.exists(genprops_path): os.makedirs(genprops_path)
out_path = os.path.join(genprops_path, "pnames_data.h")