mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-8972 finish preparseucd.py parsing uchar.h, prepare data in order of old genpname/preparse.pl
X-SVN-Rev: 31158
This commit is contained in:
parent
5597c3ef96
commit
410c2b4580
1 changed file with 128 additions and 11 deletions
|
@ -142,6 +142,9 @@ _null_or_defaults = {}
|
|||
# Dictionary of short property names mapped to ICU4C UProperty enum constants.
|
||||
_property_name_to_enum = {}
|
||||
|
||||
# Dictionary of short gc value names mapped to UCharCategory enum constants.
|
||||
_gc_vname_to_enum = {}
|
||||
|
||||
_non_alnum_re = re.compile("[^a-zA-Z0-9]")
|
||||
|
||||
def NormPropName(pname):
|
||||
|
@ -1473,8 +1476,7 @@ def ParseUScriptHeader(icu_src_root):
|
|||
for line in uscript_file:
|
||||
match = _uscript_re.match(line)
|
||||
if match:
|
||||
script_enum = match.group(1)
|
||||
script_code = match.group(2)
|
||||
(script_enum, script_code) = match.group(1, 2)
|
||||
if script_code not in short_script_name_to_enum:
|
||||
scripts_not_in_ucd.add(script_code)
|
||||
else:
|
||||
|
@ -1489,9 +1491,17 @@ def ParseUScriptHeader(icu_src_root):
|
|||
_uchar_re = re.compile(
|
||||
" *(UCHAR_[0-9A-Z_]+) *= *(?:[0-9]+|0x[0-9a-fA-F]+),")
|
||||
|
||||
# The patterns below are raw strings so that "\*" reaches the regex engine
# as an escaped asterisk instead of being an invalid string escape.

# Sample line to match:
#    /** Zs @stable ICU 2.0 */
# Group 1 is the two-letter value name (one uppercase, one lowercase letter)
# at the start of the doc comment.
_gc_comment_re = re.compile(r" */\*\* *([A-Z][a-z]) *")

# Sample line to match:
#    U_SPACE_SEPARATOR         = 12,
# Group 1 is the enum constant name.
_gc_re = re.compile(r" *(U_[A-Z_]+) *= *[0-9]+,")

# Sample line to match:
#    /** L @stable ICU 2.0 */
# Group 1 is the leading run of one to three uppercase letters
# at the start of the doc comment.
_bc_comment_re = re.compile(r" */\*\* *([A-Z]{1,3}) *")
|
||||
|
||||
# Sample line to match:
|
||||
# U_LEFT_TO_RIGHT = 0,
|
||||
|
@ -1519,12 +1529,36 @@ def ParseUCharHeader(icu_src_root):
|
|||
prop = None
|
||||
comment_value = "??"
|
||||
for line in uchar_file:
|
||||
# Parse some enums via context-sensitive "modes".
|
||||
# Necessary because the enum constant names do not contain
|
||||
# enough information.
|
||||
if "enum UCharCategory" in line:
|
||||
mode = "gc"
|
||||
prop = _properties["gc"]
|
||||
continue
|
||||
if mode == "gc":
|
||||
# Leave the normal short-to-enum map which is shared between gc & gcm
|
||||
# with enums like U_GC_ZS_MASK.
|
||||
# For writing gc enums to pnames_data.h use _gc_vname_to_enum.
|
||||
if line.startswith("}"):
|
||||
mode = ""
|
||||
continue
|
||||
match = _gc_comment_re.match(line)
|
||||
if match:
|
||||
comment_value = match.group(1)
|
||||
continue
|
||||
match = _gc_re.match(line)
|
||||
if match:
|
||||
gc_enum = match.group(1)
|
||||
vname = GetShortPropertyValueName(prop, comment_value)
|
||||
_gc_vname_to_enum[vname] = gc_enum
|
||||
continue
|
||||
if "enum UCharDirection {" in line:
|
||||
mode = "UCharDirection"
|
||||
mode = "bc"
|
||||
prop = _properties["bc"]
|
||||
comment_value = "??"
|
||||
continue
|
||||
if mode == "UCharDirection":
|
||||
if mode == "bc":
|
||||
if line.startswith("}"):
|
||||
mode = ""
|
||||
continue
|
||||
|
@ -1538,6 +1572,8 @@ def ParseUCharHeader(icu_src_root):
|
|||
vname = GetShortPropertyValueName(prop, comment_value)
|
||||
prop[2][vname] = bc_enum
|
||||
continue
|
||||
# No mode, parse enum constants whose names contain
|
||||
# enough information to parse without requiring context.
|
||||
match = _uchar_re.match(line)
|
||||
if match:
|
||||
prop_enum = match.group(1)
|
||||
|
@ -1558,8 +1594,7 @@ def ParseUCharHeader(icu_src_root):
|
|||
continue
|
||||
match = _prop_and_value_re.match(line)
|
||||
if match:
|
||||
prop_enum = match.group(1)
|
||||
vname = match.group(3)
|
||||
(prop_enum, vname) = match.group(1, 3)
|
||||
if vname == "COUNT" or _prop_and_alias_re.match(line):
|
||||
continue
|
||||
prop = GetProperty(match.group(2))
|
||||
|
@ -1569,14 +1604,92 @@ def ParseUCharHeader(icu_src_root):
|
|||
short_gcm_name_to_enum = _properties["gcm"][2]
|
||||
for value in short_gcm_name_to_enum:
|
||||
short_gcm_name_to_enum[value] = "U_GC_" + value.upper() + "_MASK"
|
||||
# Hardcode known values for the normalization quick check properties,
|
||||
# see unorm2.h for the UNormalizationCheckResult enum.
|
||||
short_name_to_enum = _properties["NFC_QC"][2]
|
||||
short_name_to_enum["N"] = "UNORM_NO"
|
||||
short_name_to_enum["Y"] = "UNORM_YES"
|
||||
short_name_to_enum["M"] = "UNORM_MAYBE"
|
||||
short_name_to_enum = _properties["NFKC_QC"][2]
|
||||
short_name_to_enum["N"] = "UNORM_NO"
|
||||
short_name_to_enum["Y"] = "UNORM_YES"
|
||||
short_name_to_enum["M"] = "UNORM_MAYBE"
|
||||
# No "maybe" values for NF[K]D.
|
||||
short_name_to_enum = _properties["NFD_QC"][2]
|
||||
short_name_to_enum["N"] = "UNORM_NO"
|
||||
short_name_to_enum["Y"] = "UNORM_YES"
|
||||
short_name_to_enum = _properties["NFKD_QC"][2]
|
||||
short_name_to_enum["N"] = "UNORM_NO"
|
||||
short_name_to_enum["Y"] = "UNORM_YES"
|
||||
|
||||
|
||||
def WritePNamesDataHeader(icu_tools_root):
|
||||
def WritePNamesDataHeader(out_path):
    """Collects and prints the property and property-value enum names.

    Builds a list of (key0, enum) tuples from the module-level tables
    _property_name_to_enum, _properties and _gc_vname_to_enum, sorts it
    to emulate the output order of the old genpname/preparse.pl,
    and prints each tuple to stdout.

    key0 is either a preparse.pl property type string (for property names)
    or a Unicode short property name (for property value names).
    enum is the ICU4C enum constant name.

    Args:
      out_path: Path handed in by the caller for the pnames_data.h output.
          Not read yet: this version only prints the collected tuples.

    Raises:
      ValueError: If any property value maps to an empty/None enum constant.
          The message lists all offending (property, value) pairs.
    """
    # TODO: rename prop to not collide with usual properties[x]
    # TODO: once we are sure this works, simplify the order;
    #   for example, change all "_bp" etc. to just ""
    #   (outputs property names first in enum order),
    #   and sorting ccc by numbers not strings
    # TODO: simplify further, to make pnames_data.h more stable;
    #   try not to print string or group index numbers
    # TODO: wiki/ReviewTicket8972 with diff links
    prop_type_to_old_type = {
        "Binary": "_bp",
        "Bitmask": "_op",
        "Catalog": "_ep",
        "Enumerated": "_ep",
        "Miscellaneous": "_sp",
        "Numeric": "_dp",
        "String": "_sp",
    }
    pnames_data = [("binprop", "0"), ("binprop", "1")]
    # Only properties that have ICU API.
    missing_enums = []
    for (pname, prop_enum) in _property_name_to_enum.items():
        prop = _properties[pname]
        # Sometimes the uchar.h UProperty type differs
        # from the PropertyAliases.txt type.
        if pname == "age":
            old_type = "_sp"
        elif pname in ("gcm", "scx"):
            old_type = "_op"
        else:
            old_type = prop_type_to_old_type[prop[0]]
        pnames_data.append((old_type, prop_enum))
        # Binary properties and "age" contribute no value names.
        if old_type == "_bp" or pname == "age":
            continue
        short_name_to_enum = prop[2]
        if pname.endswith("ccc"):
            # ccc, lccc, tccc use the string forms of their numeric values
            # as "enum" values.
            # In the UCD data, these numeric strings are the first value names,
            # followed by the short & long value names.
            for name in short_name_to_enum:
                pnames_data.append((pname, name))
            continue
        if pname == "gc":
            # See comment about _gc_vname_to_enum in ParseUCharHeader().
            short_name_to_enum = _gc_vname_to_enum
        for (name, value_enum) in short_name_to_enum.items():
            if value_enum:
                pnames_data.append((pname, value_enum))
            else:
                # Collect every gap so that one run reports them all.
                missing_enums.append((pname, name))
    if missing_enums:
        raise ValueError(
            "missing uchar.h enum constants for some property values: %s" %
            missing_enums)
    pnames_data.sort()
    for item in pnames_data:
        # Single-argument print(...) prints the tuple identically
        # as a Python 2 statement and as a Python 3 function call.
        print(item)
|
||||
|
||||
# main() ------------------------------------------------------------------- ***
|
||||
|
||||
|
@ -1621,9 +1734,13 @@ def main():
|
|||
WritePreparsedUCD(out_file)
|
||||
out_file.flush()
|
||||
# TODO: PrintNameStats()
|
||||
# ICU data for property & value names API
|
||||
ParseUScriptHeader(icu_src_root)
|
||||
ParseUCharHeader(icu_src_root)
|
||||
WritePNamesDataHeader(icu_tools_root)
|
||||
genprops_path = os.path.join(icu_tools_root, "unicode", "c", "genprops")
|
||||
if not os.path.exists(genprops_path): os.makedirs(genprops_path)
|
||||
out_path = os.path.join(genprops_path, "pnames_data.h")
|
||||
WritePNamesDataHeader(out_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
Loading…
Add table
Reference in a new issue