From 804e4e91fa97657fda401623cb92a4fecf1faa75 Mon Sep 17 00:00:00 2001
From: Mark Davis <mark@macchiato.com>
Date: Thu, 25 Oct 2001 20:37:09 +0000
Subject: [PATCH] additional derived properties, some cleanup

X-SVN-Rev: 6438
---
 .../text/UCD/GenerateHanTransliterator.java   | 186 ++++++++++++++++++
 .../com/ibm/text/UCD/PropertyAliasHeader.txt  |  47 +++++
 .../ibm/text/UCD/UnifiedBinaryProperty.java   | 180 +++++++++++++++++
 3 files changed, 413 insertions(+)
 create mode 100644 tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
 create mode 100644 tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt
 create mode 100644 tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java

diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
new file mode 100644
index 00000000000..5ece2bdb6d2
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
@@ -0,0 +1,186 @@
+/**
+*******************************************************************************
+* Copyright (C) 1996-2001, International Business Machines Corporation and    *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
+* $Date: 2001/10/25 20:37:09 $
+* $Revision: 1.1 $
+*
+*******************************************************************************
+*/
+
+package com.ibm.text.UCD;
+import java.io.*;
+import com.ibm.text.utility.*;
+import com.ibm.text.UTF16;
+import java.util.*;
+
+
+public final class GenerateHanTransliterator {
+    
+    static final boolean TESTING = false;
+    static int type;
+    
+    public static void main() { // no-argument entry point: resets type to 0 and runs generate()
+        try {
+            type = 0;
+            System.out.println("Starting");
+            generate();
+        } catch (Exception failure) {
+            System.out.println("Exception: " + failure); // reported on stdout only; not rethrown
+        }
+    }
+    
+    static PrintWriter out;
+    static PrintWriter err;
+    
+    static int count;
+    static int oldLine;
+  
+    /**
+     * Reads the Unihan data file and writes one transliteration rule per character
+     * that has both the selected key field and (when a filter is set) the filter field.
+     * The static {@code type} selects the key: 0 = kDefinition, 1 = kJapaneseOn,
+     * 2 = kMandarin. All streams are closed even if reading fails part-way.
+     * @throws java.io.IOException if a file cannot be opened or read
+     */
+    static void generate() throws java.io.IOException {
+        String name = "$Han$English"; // chosen per type, but not yet used further down
+        String key = "kDefinition"; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn
+        String filter = "kJis0";
+        switch (type) {
+            default: break;
+            case 1: name = "$Han$OnRomaji";
+                key = "kJapaneseOn";
+                filter = "kJis0";
+                break;
+            case 2: name = "$Han$Pinyin";
+                key = "kMandarin";
+                filter = null;
+                break;
+        }
+        BufferedReader in = null;
+        try {
+            out = Utility.openPrintWriter("Transliterate_Han_English.txt");
+            err = Utility.openPrintWriter("Transliterate_Han_English.log.txt");
+            in = Utility.openUnicodeFile("Unihan", "3.2.0");
+            int defCount = 0;    // rules written so far (no longer shadows the static 'count')
+            String oldCode = ""; // code of the character whose lines are being scanned
+            String keyLine = ""; // line holding the key field for oldCode
+            int oldStart = 0;    // index in keyLine just past the key name
+            boolean foundFilter = (filter == null);
+            boolean foundKey = false;
+            int lineCounter = 0;
+            while (true) {
+                Utility.dot(++lineCounter);
+                String line = in.readLine();
+                if (line == null) break;
+                if (line.length() < 6) continue;
+                if (line.charAt(0) == '#') continue;
+                // NOTE(review): takes characters 2..5 as the code; a longer code point would be truncated - confirm
+                String code = line.substring(2,6);
+                if (!code.equals(oldCode)) {
+                    // a new character starts: flush the previous one if it qualified
+                    if (foundKey && foundFilter) {
+                        defCount++;
+                        if (defCount == 1 || (defCount % 100) == 0) {
+                            System.out.println(defCount + ": " + keyLine);
+                        }
+                        printDef(out, oldCode, keyLine, oldStart);
+                    }
+                    if (TESTING) if (defCount > 1000) break;
+                    oldCode = code;
+                    foundKey = false;
+                    foundFilter = (filter == null);
+                }
+
+                // detect key, filter. Must be on different lines
+                if (!foundFilter && line.indexOf(filter) >= 0) {
+                    foundFilter = true;
+                } else if (!foundKey && (oldStart = line.indexOf(key)) >= 0) {
+                    foundKey = true;
+                    keyLine = line;
+                    oldStart += key.length();
+                }
+            }
+            if (foundKey && foundFilter) printDef(out, oldCode, keyLine, oldStart);
+        } finally {
+            if (out != null) out.close();
+            if (err != null) err.close();
+            if (in != null) in.close(); // last: the only close() here that can throw
+        }
+    }
+    
+    /**
+     * Writes one rule mapping the character {@code code} to the first definition on {@code line}.
+     * The first character seen with a given definition gets a two-way rule ({@code <>}), later ones one-way.
+     * @param start index in {@code line} just past the key name
+     */
+    static void printDef(PrintWriter out, String code, String line, int start) {
+        if (code.length() == 0) return;
+        // skip spaces & numbers at start
+        for (;start < line.length(); ++start) {
+            char ch = line.charAt(start);
+            if (ch != ' ' && ch != '\t' && (ch < '0' || ch > '9')) break;
+        }
+        // go up to comma or semicolon, whichever is earlier
+        int end = line.indexOf(";", start);
+        if (end < 0) end = line.length();
+        int end2 = line.indexOf(",", start);
+        if (end2 < 0) end2 = line.length();
+        if (end > end2) end = end2;
+        if (type != 0) { // for the non-definition types, also stop at the first space
+            end2 = line.indexOf(" ", start);
+            if (end2 < 0) end2 = line.length();
+            if (end > end2) end = end2;
+        }
+        String definition = line.substring(start,end);
+        if (type == 2) definition = handlePinyin(definition, line);
+        definition = definition.trim(); // Strings are immutable: the trimmed result must be reassigned
+        String cp = UTF16.valueOf(Integer.parseInt(code, 16));
+        String key = (String) definitionMap.get(definition);
+        if (key == null) {
+            definitionMap.put(definition, cp);
+        }
+        out.println(cp + (key == null ? " <> " : " > ") + "'[" + definition + "]';");
+        if (TESTING) System.out.println("# " + code + " > " + definition);
+    }
+    
+    static Map definitionMap = new HashMap();
+    
+    static StringBuffer handlePinyinTemp = new StringBuffer();
+    
+    /**
+     * Rewrites a syllable ending in a tone digit '1'..'5' as lowercase text in which the last
+     * uppercase vowel (A, E, I, O, U, Ü) is replaced by that digit's entry in the vowel's table.
+     * With no such vowel, digits '2'..'5' append a combining mark and the line is logged to err.
+     * Input that does not end in '1'..'5' is logged as a bad line and returned unchanged.
+     */
+    static String handlePinyin(String source, String debugLine) {
+        int num = source.length() == 0 ? -1 : source.charAt(source.length()-1) - '1';
+        if (num < 0 || num > 4) { // every table below has exactly 5 entries (indexes 0..4)
+            err.println("Bad line: " + debugLine);
+            return source;
+        }
+        handlePinyinTemp.setLength(0);
+        boolean gotIt = false;
+        for (int i = source.length()-2; i >= 0; --i) { // right to left, skipping the digit
+            char ch = source.charAt(i);
+            if (!gotIt) switch (ch) {
+                case 'A': ch = "AÁ\u0102À\u0100".charAt(num); gotIt = true; break;
+                case 'E': ch = "EÉ\u0114È\u0112".charAt(num); gotIt = true; break;
+                case 'I': ch = "IÍ\u012CÌ\u012A".charAt(num); gotIt = true; break;
+                case 'O': ch = "OÓ\u014EÒ\u014C".charAt(num); gotIt = true; break;
+                case 'U': ch = "UÚ\u016CÙ\u016A".charAt(num); gotIt = true; break;
+                case 'Ü': ch = "Ü\u01D7\u01D9\u01DB\u01D5".charAt(num); gotIt = true; break;
+            }
+            handlePinyinTemp.insert(0,ch);
+        }
+        if (!gotIt && num > 0) {
+            handlePinyinTemp.append(" \u0301\u0306\u0300\u0304".charAt(num));
+            err.println("Missing vowel?: " + debugLine + " -> " + handlePinyinTemp.toString());
+        }
+        // Fixed locale: under a Turkish default locale toLowerCase() would turn 'I' into dotless i.
+        return handlePinyinTemp.toString().toLowerCase(Locale.ENGLISH);
+    }
+}
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt b/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt
new file mode 100644
index 00000000000..569a890a502
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt
@@ -0,0 +1,47 @@
+# PropertyAliases-3.2.0.txt
+#
+# This file contains aliases for properties and property values used in the UCD.
+# These names can be used for XML formats of UCD data, for regular-expression
+# property tests, and other programmatic textual descriptions of Unicode data.
+# The names are not normative, except where they correspond to normative values
+# in the UCD.
+#
+# The names may be translated in appropriate environments, and additional
+# aliases may be useful.
+#
+# FORMAT
+#
+# Each line has three fields. If the first field is AA, BB, or ZZ, the line
+# describes a property name:
+# AA - non-enumerated properties
+# BB - enumerated, non-binary properties
+# ZZ - binary properties and quick-check properties
+#
+# (The values AA, BB, and ZZ are arbitrary -- they were simply chosen to distinguish
+# the different types.)
+#
+# Where the first field is not one of the above, the line describes a
+# property value name. The first field describes the property for which that
+# property value name is used. There are two special properties:
+#
+# xx stands for any binary property
+# qc stands for any quick-check property
+#
+# With loose matching of property names, case distinctions, whitespace,
+# and '_' are ignored.
+#
+# NOTE: the property value names are NOT unique across properties, especially
+# with loose matches. For example,
+# AL means Arabic Letter for the Bidi_Class property, and
+# AL means Above_Left for the Combining_Class property, and
+# AL means Alphabetic for the Line_Break property.
+#
+# In addition, some property names may be the same as some property value names:
+# cc means Combining_Class property, and
+# cc means the General_Category property value Control (cc)
+#
+# The combination of property value and property name is, however, unique.
+# For more information, see UTR #18: Regular Expression Guidelines
+# ================================================
+
+
diff --git a/tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java b/tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java
new file mode 100644
index 00000000000..511f5e133d2
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java
@@ -0,0 +1,180 @@
+/**
+*******************************************************************************
+* Copyright (C) 1996-2001, International Business Machines Corporation and    *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java,v $
+* $Date: 2001/10/25 20:37:09 $
+* $Revision: 1.1 $
+*
+*******************************************************************************
+*/
+
+package com.ibm.text.UCD;
+import java.io.*;
+
+import com.ibm.text.utility.*;
+
+final class UnifiedBinaryProperty implements UCD_Types {
+    UCD ucd;
+    DerivedProperty dp;
+    
+    UnifiedBinaryProperty(UCD ucdin) { // keeps the UCD and builds a DerivedProperty over the same instance
+        this.ucd = ucdin;
+        this.dp = new DerivedProperty(ucdin);
+    }
+    
+    public String getPropertyName(int propMask, byte style) {
+        if (style < LONG) return UCD_Names.ABB_UNIFIED_PROPERTIES[propMask>>8];
+        else return UCD_Names.SHORT_UNIFIED_PROPERTIES[propMask>>8];
+    }
+    
+    public boolean isTest(int propMask) {
+        int enum = propMask >> 8;
+        propMask &= 0xFF;
+        if (enum != (DERIVED>>8)) return false;
+        return dp.isTest(propMask);
+    }
+    
+    /**
+     * @return unified property number
+     */
+    public boolean isDefined(int propMask) {
+        int enum = propMask >> 8;
+        propMask &= 0xFF;
+        switch (enum) {
+          case CATEGORY>>8: return propMask != UNUSED_CATEGORY && propMask < LIMIT_CATEGORY;
+          case COMBINING_CLASS>>8: return true;
+          // ucd.isCombiningClassUsed((byte)propMask) 
+          //  || !ucd.getCombiningID_fromIndex ((byte)propMask, SHORT).startsWith("Fixed");
+          case BIDI_CLASS>>8: return propMask != BIDI_UNUSED && propMask < LIMIT_BIDI_CLASS;
+          case DECOMPOSITION_TYPE>>8: return propMask < LIMIT_DECOMPOSITION_TYPE;
+          case NUMERIC_TYPE>>8: return propMask < LIMIT_NUMERIC_TYPE;
+          case EAST_ASIAN_WIDTH>>8: return propMask < LIMIT_EAST_ASIAN_WIDTH;
+          case LINE_BREAK>>8: return propMask < LIMIT_LINE_BREAK;
+          case JOINING_TYPE>>8: return propMask < LIMIT_JOINING_TYPE;
+          case JOINING_GROUP>>8: return propMask < LIMIT_JOINING_GROUP;
+          case BINARY_PROPERTIES>>8: return propMask < LIMIT_BINARY_PROPERTIES;
+          case SCRIPT>>8: return propMask != UNUSED_SCRIPT && propMask < LIMIT_SCRIPT;
+          case AGE>>8: return propMask < LIMIT_AGE;
+          case DERIVED>>8: return dp.isDefined(propMask);
+          default: return false;
+        }
+    }
+
+    public boolean get(int cp, int propMask) {
+        int enum = propMask >> 8;
+        propMask &= 0xFF;
+        switch (enum) {
+          case CATEGORY>>8: if (propMask >= LIMIT_CATEGORY) break;
+            return ucd.getCategory(cp) == propMask;
+          case COMBINING_CLASS>>8: if (propMask >= LIMIT_COMBINING_CLASS) break;
+            return ucd.getCombiningClass(cp) == propMask;
+          case BIDI_CLASS>>8: if (propMask >= LIMIT_BIDI_CLASS) break;
+            return ucd.getBidiClass(cp) == propMask;
+          case DECOMPOSITION_TYPE>>8: if (propMask >= LIMIT_DECOMPOSITION_TYPE) break;
+            return ucd.getDecompositionType(cp) == propMask;
+          case NUMERIC_TYPE>>8: if (propMask >= LIMIT_NUMERIC_TYPE) break;
+            return ucd.getNumericType(cp) == propMask;
+          case EAST_ASIAN_WIDTH>>8: if (propMask >= LIMIT_EAST_ASIAN_WIDTH) break;
+            return ucd.getEastAsianWidth(cp) == propMask;
+          case LINE_BREAK>>8:  if (propMask >= LIMIT_LINE_BREAK) break;
+            return ucd.getLineBreak(cp) == propMask;
+          case JOINING_TYPE>>8: if (propMask >= LIMIT_JOINING_TYPE) break;
+            return ucd.getJoiningType(cp) == propMask;
+          case JOINING_GROUP>>8: if (propMask >= LIMIT_JOINING_GROUP) break;
+            return ucd.getJoiningGroup(cp) == propMask;
+          case BINARY_PROPERTIES>>8: if (propMask >= LIMIT_BINARY_PROPERTIES) break;
+            return ucd.getBinaryProperty(cp, propMask);
+          case SCRIPT>>8: if (propMask >= LIMIT_SCRIPT) break;
+            return ucd.getScript(cp) == propMask;
+          case AGE>>8: if (propMask >= LIMIT_AGE) break;
+            return ucd.getAge(cp) == propMask;
+          case DERIVED>>8: if (!dp.isDefined(propMask)) break;
+            return dp.hasProperty(cp, propMask);
+        }
+        throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
+    }
+
+    public String getID(int unifiedPropMask) { // convenience overload: same as getID(unifiedPropMask, NORMAL)
+        return this.getID(unifiedPropMask, NORMAL);
+    }
+/*
+    public static String getID(UCD ucd, int unifiedPropMask) {
+        String longOne = getID(ucd, unifiedPropMask, LONG);
+        String shortOne = getID(ucd, unifiedPropMask, SHORT);
+        if (longOne.equals(shortOne)) return longOne;
+        return shortOne + "(" + longOne + ")";
+    }
+*/
+    public String getFullID(int unifiedPropMask, byte style) {
+        String pre = "";
+        if ((unifiedPropMask & 0xFF00) != BINARY_PROPERTIES) {
+            String preShort = UCD_Names.ABB_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "=";
+            String preLong = UCD_Names.SHORT_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "=";
+            if (style < LONG) pre = preShort;
+            else if (style == LONG || preShort.equals(preLong)) pre = preLong;
+            else pre = preShort + "(" + preLong + ")";
+        }
+        String shortOne = getID(unifiedPropMask, SHORT);
+        if (shortOne.length() == 0) shortOne = "xx";
+        String longOne = getID(unifiedPropMask, LONG);
+        if (longOne.length() == 0) longOne = "none";
+
+        String post;
+        if (style < LONG) post = shortOne;
+        else if (style == LONG || shortOne.equals(longOne)) post = longOne;
+        else post = shortOne + "(" + longOne + ")";
+
+        if (pre.length() == 0) {
+            pre = post + "=";
+            post = "T";
+        }
+
+        return pre + post;
+    }
+
+    public String getID(int unifiedPropMask, byte style) {
+        int enum = unifiedPropMask >> 8;
+        byte propMask = (byte)unifiedPropMask;
+        switch (enum) {
+          case CATEGORY>>8: if (propMask >= LIMIT_CATEGORY) break;
+            if (style != LONG) return ucd.getCategoryID_fromIndex(propMask);
+            return UCD_Names.LONG_GC[propMask];
+          case COMBINING_CLASS>>8: if (propMask >= LIMIT_COMBINING_CLASS) break;
+            return UCD.getCombiningID_fromIndex((short)(unifiedPropMask & 0xFF), style);
+          case BIDI_CLASS>>8: if (propMask >= LIMIT_BIDI_CLASS) break;
+            if (style != LONG) return ucd.getBidiClassID_fromIndex(propMask);
+            return UCD_Names.LONG_BC[propMask];
+          case DECOMPOSITION_TYPE>>8: if (propMask >= LIMIT_DECOMPOSITION_TYPE) break;
+            if (style != SHORT) return ucd.getDecompositionTypeID_fromIndex(propMask);
+            return UCD_Names.SHORT_DT[propMask];
+          case NUMERIC_TYPE>>8: if (propMask >= LIMIT_NUMERIC_TYPE) break;
+            if (style != SHORT) return ucd.getNumericTypeID_fromIndex(propMask);
+            return UCD_Names.SHORT_NT[propMask];
+          case EAST_ASIAN_WIDTH>>8: if (propMask >= LIMIT_EAST_ASIAN_WIDTH) break;
+            if (style != LONG) return ucd.getEastAsianWidthID_fromIndex(propMask);
+            return UCD_Names.SHORT_EA[propMask];
+          case LINE_BREAK>>8:  if (propMask >= LIMIT_LINE_BREAK) break;
+            if (style != LONG) return ucd.getLineBreakID_fromIndex(propMask);
+            return UCD_Names.LONG_LB[propMask];
+          case JOINING_TYPE>>8: if (propMask >= LIMIT_JOINING_TYPE) break;
+            if (style != LONG) return ucd.getJoiningTypeID_fromIndex(propMask);
+            return UCD_Names.LONG_JOINING_TYPE[propMask];
+          case JOINING_GROUP>>8: if (propMask >= LIMIT_JOINING_GROUP) break;
+            return ucd.getJoiningGroupID_fromIndex(propMask);
+          case BINARY_PROPERTIES>>8: if (propMask >= LIMIT_BINARY_PROPERTIES) break;
+            if (style != SHORT) return ucd.getBinaryPropertiesID_fromIndex(propMask);
+            return UCD_Names.SHORT_BP[propMask];
+          case SCRIPT>>8: if (propMask >= LIMIT_SCRIPT) break;
+            if (style != SHORT) return ucd.getScriptID_fromIndex(propMask);
+            return UCD_Names.ABB_SCRIPT[propMask];
+          case AGE>>8: if (propMask >= LIMIT_AGE) break;
+            return ucd.getAgeID_fromIndex(propMask);
+          case DERIVED>>8: if (!dp.isDefined(propMask)) break;
+            return dp.getName(propMask, style);
+        }
+        throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
+    }
+}
\ No newline at end of file