ICU-9023 reduce norm2/nfkc.txt to a delta over nfc.txt

X-SVN-Rev: 31200
2025-04-08 23:10:40 +00:00 · 2012-01-12 01:02:38 +00:00 · 2012-01-12 01:02:38 +00:00 · f72bdf2ffb
commit f72bdf2ffb
parent 1481912039
1 changed files with 75 additions and 47 deletions
--- a/tools/unicode/py/preparseucd.py
+++ b/tools/unicode/py/preparseucd.py
@@ -1116,7 +1116,7 @@ def WritePreparsedUCD(out_file):
      WriteFieldsRangeProps(["cp"], start, end, props, out_file)

 # Write Normalizer2 input files -------------------------------------------- ***
-# Ported from genprops/store.c.
+# Ported from gennorm/store.c.

 def WriteAllCC(out_file):
  out_file.write("# Canonical_Combining_Class (ccc) values\n");
@@ -1145,7 +1145,7 @@ def HasMapping(c):
  return dt and dt != "None"


-def HasOneWayMapping(c, with_compat):
+def HasOneWayMapping(c):
  while True:
    props = GetProps(c)
    dt = props.get("dt")
@@ -1167,43 +1167,76 @@ def HasOneWayMapping(c, with_compat):
      c = int(nfd[0], 16)  # continue
    else:
      # c has a compatibility mapping.
-      return with_compat
+      return True


-def WriteAllMappings(out_file, with_compat):
-  if with_compat:
-    out_file.write("\n# Canonical and compatibility decomposition mappings\n")
-  else:
-    out_file.write("\n# Canonical decomposition mappings\n")
-  for i in xrange(len(_starts) - 1):
-    start = _starts[i]
-    end = _starts[i + 1] - 1
-    props = _props[i]
-    dm = props.get("dm")
-    if dm and dm[0] != '<' and (with_compat or props["dt"] == "Can"):
-      assert start == end
-      separator = '>' if HasOneWayMapping(start, with_compat) else '='
-      out_file.write("%04X%s%s\n" % (start, separator, dm))
-
-
-def WriteNorm2TextFile(path, filename, with_compat):
+def WriteNorm2NFCTextFile(path):
  year = datetime.date.today().strftime("%Y")
-  with open(os.path.join(path, filename), "w") as out_file:
+  with open(os.path.join(path, "nfc.txt"), "w") as out_file:
    out_file.write(
        """# Copyright (C) 1999-""" + year +
        """, International Business Machines
 # Corporation and others.  All Rights Reserved.
 #
-# file name: """ + filename + """
+# file name: nfc.txt
 #
 # machine-generated by ICU preparseucd.py
 #
+# Complete data for Unicode NFC normalization.
+
+* Unicode """ + _ucd_version + """
+
 """)
-    type = "NFKC" if with_compat else "NFC"
-    out_file.write("# Complete data for Unicode " + type + " normalization.\n\n")
-    out_file.write("* Unicode " + _ucd_version + "\n\n")
    WriteAllCC(out_file)
-    WriteAllMappings(out_file, with_compat)
+    out_file.write("\n# Canonical decomposition mappings\n")
+    for i in xrange(len(_starts) - 1):
+      start = _starts[i]
+      end = _starts[i + 1] - 1
+      props = _props[i]
+      dm = props.get("dm")
+      if dm and dm[0] != '<' and props["dt"] == "Can":
+        assert start == end
+        # The Comp_Ex=Full_Composition_Exclusion property tells us
+        # whether the canonical decomposition round-trips.
+        separator = '>' if props.get("Comp_Ex") else '='
+        out_file.write("%04X%s%s\n" % (start, separator, dm))
+
+
+def WriteNorm2NFKCTextFile(path):
+  year = datetime.date.today().strftime("%Y")
+  with open(os.path.join(path, "nfkc.txt"), "w") as out_file:
+    out_file.write(
+        """# Copyright (C) 1999-""" + year +
+        """, International Business Machines
+# Corporation and others.  All Rights Reserved.
+#
+# file name: nfkc.txt
+#
+# machine-generated by ICU preparseucd.py
+#
+# Data for Unicode NFKC normalization.
+# This file contains only compatibility decomposition mappings,
+# plus those canonical decompositions that change from NFC round-trip mappings
+# to NFKC one-way mappings.
+# Use this file as the second gennorm2 input file after nfc.txt.
+
+* Unicode """ + _ucd_version + """
+
+""")
+    for i in xrange(len(_starts) - 1):
+      start = _starts[i]
+      end = _starts[i + 1] - 1
+      props = _props[i]
+      dm = props.get("dm")
+      if dm and dm[0] != '<':
+        assert start == end
+        if props["dt"] != "Can":
+          # Compatibility decomposition.
+          out_file.write("%04X>%s\n" % (start, dm))
+        elif not props.get("Comp_Ex") and HasOneWayMapping(start):
+          # NFC round-trip mapping turns into NFKC one-way mapping.
+          out_file.write("%04X>%s  # NFC round-trip, NFKC one-way\n" %
+                         (start, dm))


 def WriteNorm2NFKC_CFTextFile(path):
@@ -1222,7 +1255,7 @@ def WriteNorm2NFKC_CFTextFile(path):
 # This file contains the Unicode NFKC_CF mappings,
 # extracted from the UCD file DerivedNormalizationProps.txt,
 # and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
-# Use this file as the second gennorm2 input file after nfkc.txt.
+# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.

 """)
    out_file.write("* Unicode " + _ucd_version + "\n\n")
@@ -1234,30 +1267,25 @@ def WriteNorm2NFKC_CFTextFile(path):
      end = _starts[i + 1] - 1
      props = _props[i]
      nfkc_cf = props.get("NFKC_CF")
-      if nfkc_cf != None and (not nfkc_cf or nfkc_cf[0] != '<'):
-        # Merge with the previous range if possible,
-        # or remember this range for merging.
-        if nfkc_cf == prev_nfkc_cf and (prev_end + 1) == start:
-          prev_end = end
-        else:
-          if prev_nfkc_cf != None:
-            if prev_start == prev_end:
-              out_file.write("%04X>%s\n" % (prev_start, prev_nfkc_cf))
-            else:
-              out_file.write("%04X..%04X>%s\n" % (prev_start, prev_end, prev_nfkc_cf))
-          prev_start = start
-          prev_end = end
-          prev_nfkc_cf = nfkc_cf
-    if prev_nfkc_cf != None:
-      if prev_start == prev_end:
-        out_file.write("%04X>%s\n" % (prev_start, prev_nfkc_cf))
+      # Merge with the previous range if possible,
+      # or remember this range for merging.
+      if nfkc_cf == prev_nfkc_cf and (prev_end + 1) == start:
+        prev_end = end
      else:
-        out_file.write("%04X..%04X>%s\n" % (prev_start, prev_end, prev_nfkc_cf))
+        if prev_nfkc_cf != None and (not prev_nfkc_cf or prev_nfkc_cf[0] != '<'):
+          if prev_start == prev_end:
+            out_file.write("%04X>%s\n" % (prev_start, prev_nfkc_cf))
+          else:
+            out_file.write("%04X..%04X>%s\n" %
+                           (prev_start, prev_end, prev_nfkc_cf))
+        prev_start = start
+        prev_end = end
+        prev_nfkc_cf = nfkc_cf


 def WriteNorm2(path):
-  WriteNorm2TextFile(path, "nfc.txt", False)
-  WriteNorm2TextFile(path, "nfkc.txt", True)
+  WriteNorm2NFCTextFile(path)
+  WriteNorm2NFKCTextFile(path)
  WriteNorm2NFKC_CFTextFile(path)

 # Preprocessing ------------------------------------------------------------ ***