ICU-9023 reduce norm2/nfkc.txt to a delta over nfc.txt

X-SVN-Rev: 31200
This commit is contained in:
Markus Scherer 2012-01-12 01:02:38 +00:00
parent 1481912039
commit f72bdf2ffb

View file

@ -1116,7 +1116,7 @@ def WritePreparsedUCD(out_file):
WriteFieldsRangeProps(["cp"], start, end, props, out_file)
# Write Normalizer2 input files -------------------------------------------- ***
# Ported from genprops/store.c.
# Ported from gennorm/store.c.
def WriteAllCC(out_file):
out_file.write("# Canonical_Combining_Class (ccc) values\n");
@ -1145,7 +1145,7 @@ def HasMapping(c):
return dt and dt != "None"
def HasOneWayMapping(c, with_compat):
def HasOneWayMapping(c):
while True:
props = GetProps(c)
dt = props.get("dt")
@ -1167,43 +1167,76 @@ def HasOneWayMapping(c, with_compat):
c = int(nfd[0], 16) # continue
else:
# c has a compatibility mapping.
return with_compat
return True
def WriteAllMappings(out_file, with_compat):
    """Writes the decomposition-mappings section to out_file.

    A heading comment is written first; its wording depends on with_compat.
    Then, for every entry of the module-level _starts/_props tables that has
    a "dm" value not starting with '<', one line of the form
    XXXX>mapping or XXXX=mapping is written.

    Args:
      out_file: Writable file-like object that receives the text.
      with_compat: If true, every such mapping is written; otherwise only
          entries whose "dt" property equals "Can".
    """
    if with_compat:
        heading = "\n# Canonical and compatibility decomposition mappings\n"
    else:
        heading = "\n# Canonical decomposition mappings\n"
    out_file.write(heading)

    for index in xrange(len(_starts) - 1):
        first = _starts[index]
        last = _starts[index + 1] - 1
        entry = _props[index]
        mapping = entry.get("dm")
        # Skip entries without a mapping, and values that start with '<'
        # (presumably placeholder values -- confirm against the parser).
        if not mapping or mapping[0] == '<':
            continue
        # Without with_compat, only canonical decompositions are written.
        if not with_compat and entry["dt"] != "Can":
            continue
        # A real mapping is expected to apply to exactly one code point.
        assert first == last
        # '>' marks a one-way mapping, '=' the other case, as decided by
        # HasOneWayMapping().
        if HasOneWayMapping(first, with_compat):
            relation = '>'
        else:
            relation = '='
        out_file.write("%04X%s%s\n" % (first, relation, mapping))
def WriteNorm2TextFile(path, filename, with_compat):
def WriteNorm2NFCTextFile(path):
year = datetime.date.today().strftime("%Y")
with open(os.path.join(path, filename), "w") as out_file:
with open(os.path.join(path, "nfc.txt"), "w") as out_file:
out_file.write(
"""# Copyright (C) 1999-""" + year +
""", International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: """ + filename + """
# file name: nfc.txt
#
# machine-generated by ICU preparseucd.py
#
# Complete data for Unicode NFC normalization.
* Unicode """ + _ucd_version + """
""")
type = "NFKC" if with_compat else "NFC"
out_file.write("# Complete data for Unicode " + type + " normalization.\n\n")
out_file.write("* Unicode " + _ucd_version + "\n\n")
WriteAllCC(out_file)
WriteAllMappings(out_file, with_compat)
out_file.write("\n# Canonical decomposition mappings\n")
for i in xrange(len(_starts) - 1):
start = _starts[i]
end = _starts[i + 1] - 1
props = _props[i]
dm = props.get("dm")
if dm and dm[0] != '<' and props["dt"] == "Can":
assert start == end
# The Comp_Ex=Full_Composition_Exclusion property tells us
# whether the canonical decomposition round-trips.
separator = '>' if props.get("Comp_Ex") else '='
out_file.write("%04X%s%s\n" % (start, separator, dm))
def WriteNorm2NFKCTextFile(path):
    """Writes the file nfkc.txt into the directory path.

    The file starts with a fixed header comment (copyright line with the
    current year, file name, usage notes, and the Unicode version taken from
    the module-level _ucd_version). After the header, one line
    XXXX>mapping is written for each entry of the module-level
    _starts/_props tables that has a "dm" value not starting with '<' and
    that is either
      - not a canonical decomposition ("dt" differs from "Can"), or
      - a canonical decomposition without "Comp_Ex" for which
        HasOneWayMapping() returns true; these lines carry a trailing
        "# NFC round-trip, NFKC one-way" comment.

    Args:
      path: Directory in which nfkc.txt is created or overwritten.
    """
    this_year = datetime.date.today().strftime("%Y")
    target = os.path.join(path, "nfkc.txt")
    with open(target, "w") as out_file:
        header = (
"""# Copyright (C) 1999-""" + this_year +
""", International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfkc.txt
#
# machine-generated by ICU preparseucd.py
#
# Data for Unicode NFKC normalization.
# This file contains only compatibility decomposition mappings,
# plus those canonical decompositions that change from NFC round-trip mappings
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.
* Unicode """ + _ucd_version + """
""")
        out_file.write(header)

        for index in xrange(len(_starts) - 1):
            code_point = _starts[index]
            range_end = _starts[index + 1] - 1
            entry = _props[index]
            mapping = entry.get("dm")
            # Skip entries without a mapping, and values that start with '<'
            # (presumably placeholder values -- confirm against the parser).
            if not mapping or mapping[0] == '<':
                continue
            # A real mapping is expected to apply to exactly one code point.
            assert code_point == range_end
            if entry["dt"] != "Can":
                # Compatibility decomposition.
                line = "%04X>%s\n" % (code_point, mapping)
            elif not entry.get("Comp_Ex") and HasOneWayMapping(code_point):
                # NFC round-trip mapping turns into NFKC one-way mapping.
                line = ("%04X>%s # NFC round-trip, NFKC one-way\n" %
                        (code_point, mapping))
            else:
                # Canonical mapping that needs no NFKC-specific line.
                continue
            out_file.write(line)
def WriteNorm2NFKC_CFTextFile(path):
@ -1222,7 +1255,7 @@ def WriteNorm2NFKC_CFTextFile(path):
# This file contains the Unicode NFKC_CF mappings,
# extracted from the UCD file DerivedNormalizationProps.txt,
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the second gennorm2 input file after nfkc.txt.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
""")
out_file.write("* Unicode " + _ucd_version + "\n\n")
@ -1234,30 +1267,25 @@ def WriteNorm2NFKC_CFTextFile(path):
end = _starts[i + 1] - 1
props = _props[i]
nfkc_cf = props.get("NFKC_CF")
if nfkc_cf != None and (not nfkc_cf or nfkc_cf[0] != '<'):
# Merge with the previous range if possible,
# or remember this range for merging.
if nfkc_cf == prev_nfkc_cf and (prev_end + 1) == start:
prev_end = end
else:
if prev_nfkc_cf != None:
if prev_start == prev_end:
out_file.write("%04X>%s\n" % (prev_start, prev_nfkc_cf))
else:
out_file.write("%04X..%04X>%s\n" % (prev_start, prev_end, prev_nfkc_cf))
prev_start = start
prev_end = end
prev_nfkc_cf = nfkc_cf
if prev_nfkc_cf != None:
if prev_start == prev_end:
out_file.write("%04X>%s\n" % (prev_start, prev_nfkc_cf))
# Merge with the previous range if possible,
# or remember this range for merging.
if nfkc_cf == prev_nfkc_cf and (prev_end + 1) == start:
prev_end = end
else:
out_file.write("%04X..%04X>%s\n" % (prev_start, prev_end, prev_nfkc_cf))
if prev_nfkc_cf != None and (not prev_nfkc_cf or prev_nfkc_cf[0] != '<'):
if prev_start == prev_end:
out_file.write("%04X>%s\n" % (prev_start, prev_nfkc_cf))
else:
out_file.write("%04X..%04X>%s\n" %
(prev_start, prev_end, prev_nfkc_cf))
prev_start = start
prev_end = end
prev_nfkc_cf = nfkc_cf
def WriteNorm2(path):
WriteNorm2TextFile(path, "nfc.txt", False)
WriteNorm2TextFile(path, "nfkc.txt", True)
WriteNorm2NFCTextFile(path)
WriteNorm2NFKCTextFile(path)
WriteNorm2NFKC_CFTextFile(path)
# Preprocessing ------------------------------------------------------------ ***