mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 23:10:40 +00:00
ICU-9023 reduce norm2/nfkc.txt to a delta over nfc.txt
X-SVN-Rev: 31200
This commit is contained in:
parent
1481912039
commit
f72bdf2ffb
1 changed file with 75 additions and 47 deletions
|
@ -1116,7 +1116,7 @@ def WritePreparsedUCD(out_file):
|
|||
WriteFieldsRangeProps(["cp"], start, end, props, out_file)
|
||||
|
||||
# Write Normalizer2 input files -------------------------------------------- ***
|
||||
# Ported from genprops/store.c.
|
||||
# Ported from gennorm/store.c.
|
||||
|
||||
def WriteAllCC(out_file):
|
||||
out_file.write("# Canonical_Combining_Class (ccc) values\n");
|
||||
|
@ -1145,7 +1145,7 @@ def HasMapping(c):
|
|||
return dt and dt != "None"
|
||||
|
||||
|
||||
def HasOneWayMapping(c, with_compat):
|
||||
def HasOneWayMapping(c):
|
||||
while True:
|
||||
props = GetProps(c)
|
||||
dt = props.get("dt")
|
||||
|
@ -1167,43 +1167,76 @@ def HasOneWayMapping(c, with_compat):
|
|||
c = int(nfd[0], 16) # continue
|
||||
else:
|
||||
# c has a compatibility mapping.
|
||||
return with_compat
|
||||
return True
|
||||
|
||||
|
||||
def WriteAllMappings(out_file, with_compat):
    """Write a heading and one mapping line per decomposable code point.

    Walks the module-level _starts/_props range tables and, for every range
    whose "dm" property is set and does not start with '<', writes
    "<code point><separator><dm>" to out_file.

    Args:
      out_file: writable text file object that receives the output.
      with_compat: if true, mappings of every decomposition type are written;
          otherwise only those whose "dt" property equals "Can".
    """
    if with_compat:
        heading = "\n# Canonical and compatibility decomposition mappings\n"
    else:
        heading = "\n# Canonical decomposition mappings\n"
    out_file.write(heading)

    for index in xrange(len(_starts) - 1):
        first = _starts[index]
        last = _starts[index + 1] - 1
        range_props = _props[index]
        mapping = range_props.get("dm")
        # Skip ranges without a mapping, and values starting with '<'.
        if not mapping or mapping[0] == '<':
            continue
        # Without with_compat, only canonical decompositions are written.
        if not with_compat and range_props["dt"] != "Can":
            continue
        # A range that carries a mapping must be a single code point.
        assert first == last
        # '>' when HasOneWayMapping() reports a one-way mapping, else '='.
        if HasOneWayMapping(first, with_compat):
            separator = '>'
        else:
            separator = '='
        out_file.write("%04X%s%s\n" % (first, separator, mapping))
|
||||
|
||||
|
||||
def WriteNorm2TextFile(path, filename, with_compat):
|
||||
def WriteNorm2NFCTextFile(path):
|
||||
year = datetime.date.today().strftime("%Y")
|
||||
with open(os.path.join(path, filename), "w") as out_file:
|
||||
with open(os.path.join(path, "nfc.txt"), "w") as out_file:
|
||||
out_file.write(
|
||||
"""# Copyright (C) 1999-""" + year +
|
||||
""", International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file name: """ + filename + """
|
||||
# file name: nfc.txt
|
||||
#
|
||||
# machine-generated by ICU preparseucd.py
|
||||
#
|
||||
# Complete data for Unicode NFC normalization.
|
||||
|
||||
* Unicode """ + _ucd_version + """
|
||||
|
||||
""")
|
||||
type = "NFKC" if with_compat else "NFC"
|
||||
out_file.write("# Complete data for Unicode " + type + " normalization.\n\n")
|
||||
out_file.write("* Unicode " + _ucd_version + "\n\n")
|
||||
WriteAllCC(out_file)
|
||||
WriteAllMappings(out_file, with_compat)
|
||||
out_file.write("\n# Canonical decomposition mappings\n")
|
||||
for i in xrange(len(_starts) - 1):
|
||||
start = _starts[i]
|
||||
end = _starts[i + 1] - 1
|
||||
props = _props[i]
|
||||
dm = props.get("dm")
|
||||
if dm and dm[0] != '<' and props["dt"] == "Can":
|
||||
assert start == end
|
||||
# The Comp_Ex=Full_Composition_Exclusion property tells us
|
||||
# whether the canonical decomposition round-trips.
|
||||
separator = '>' if props.get("Comp_Ex") else '='
|
||||
out_file.write("%04X%s%s\n" % (start, separator, dm))
|
||||
|
||||
|
||||
def WriteNorm2NFKCTextFile(path):
    """Generate nfkc.txt in the directory `path`.

    The file starts with a copyright/header comment and the Unicode version
    line, followed by one-way ('>') mapping lines taken from the module-level
    _starts/_props range tables:
      - every mapping whose "dt" property is not "Can", and
      - mappings with dt "Can" for which Comp_Ex is not set and
        HasOneWayMapping() returns true; these lines carry a trailing comment.

    Args:
      path: directory in which nfkc.txt is created (overwritten if present).
    """
    year = datetime.date.today().strftime("%Y")
    with open(os.path.join(path, "nfkc.txt"), "w") as out_file:
        header = (
"""# Copyright (C) 1999-""" + year +
""", International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfkc.txt
#
# machine-generated by ICU preparseucd.py
#
# Data for Unicode NFKC normalization.
# This file contains only compatibility decomposition mappings,
# plus those canonical decompositions that change from NFC round-trip mappings
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.

* Unicode """ + _ucd_version + """

""")
        out_file.write(header)

        for index in xrange(len(_starts) - 1):
            first = _starts[index]
            last = _starts[index + 1] - 1
            range_props = _props[index]
            mapping = range_props.get("dm")
            # Skip ranges without a mapping, and values starting with '<'.
            if not mapping or mapping[0] == '<':
                continue
            # A range that carries a mapping must be a single code point.
            assert first == last
            if range_props["dt"] != "Can":
                # Compatibility decomposition.
                out_file.write("%04X>%s\n" % (first, mapping))
                continue
            # Canonical decomposition: nothing to write when Comp_Ex is set.
            if range_props.get("Comp_Ex"):
                continue
            if HasOneWayMapping(first):
                # NFC round-trip mapping turns into NFKC one-way mapping.
                out_file.write("%04X>%s # NFC round-trip, NFKC one-way\n" %
                               (first, mapping))
|
||||
|
||||
|
||||
def WriteNorm2NFKC_CFTextFile(path):
|
||||
|
@ -1222,7 +1255,7 @@ def WriteNorm2NFKC_CFTextFile(path):
|
|||
# This file contains the Unicode NFKC_CF mappings,
|
||||
# extracted from the UCD file DerivedNormalizationProps.txt,
|
||||
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
|
||||
# Use this file as the second gennorm2 input file after nfkc.txt.
|
||||
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
|
||||
|
||||
""")
|
||||
out_file.write("* Unicode " + _ucd_version + "\n\n")
|
||||
|
@ -1234,30 +1267,25 @@ def WriteNorm2NFKC_CFTextFile(path):
|
|||
end = _starts[i + 1] - 1
|
||||
props = _props[i]
|
||||
nfkc_cf = props.get("NFKC_CF")
|
||||
if nfkc_cf != None and (not nfkc_cf or nfkc_cf[0] != '<'):
|
||||
# Merge with the previous range if possible,
|
||||
# or remember this range for merging.
|
||||
if nfkc_cf == prev_nfkc_cf and (prev_end + 1) == start:
|
||||
prev_end = end
|
||||
else:
|
||||
if prev_nfkc_cf != None:
|
||||
if prev_start == prev_end:
|
||||
out_file.write("%04X>%s\n" % (prev_start, prev_nfkc_cf))
|
||||
else:
|
||||
out_file.write("%04X..%04X>%s\n" % (prev_start, prev_end, prev_nfkc_cf))
|
||||
prev_start = start
|
||||
prev_end = end
|
||||
prev_nfkc_cf = nfkc_cf
|
||||
if prev_nfkc_cf != None:
|
||||
if prev_start == prev_end:
|
||||
out_file.write("%04X>%s\n" % (prev_start, prev_nfkc_cf))
|
||||
# Merge with the previous range if possible,
|
||||
# or remember this range for merging.
|
||||
if nfkc_cf == prev_nfkc_cf and (prev_end + 1) == start:
|
||||
prev_end = end
|
||||
else:
|
||||
out_file.write("%04X..%04X>%s\n" % (prev_start, prev_end, prev_nfkc_cf))
|
||||
if prev_nfkc_cf != None and (not prev_nfkc_cf or prev_nfkc_cf[0] != '<'):
|
||||
if prev_start == prev_end:
|
||||
out_file.write("%04X>%s\n" % (prev_start, prev_nfkc_cf))
|
||||
else:
|
||||
out_file.write("%04X..%04X>%s\n" %
|
||||
(prev_start, prev_end, prev_nfkc_cf))
|
||||
prev_start = start
|
||||
prev_end = end
|
||||
prev_nfkc_cf = nfkc_cf
|
||||
|
||||
|
||||
def WriteNorm2(path):
|
||||
WriteNorm2TextFile(path, "nfc.txt", False)
|
||||
WriteNorm2TextFile(path, "nfkc.txt", True)
|
||||
WriteNorm2NFCTextFile(path)
|
||||
WriteNorm2NFKCTextFile(path)
|
||||
WriteNorm2NFKC_CFTextFile(path)
|
||||
|
||||
# Preprocessing ------------------------------------------------------------ ***
|
||||
|
|
Loading…
Add table
Reference in a new issue