From 804e4e91fa97657fda401623cb92a4fecf1faa75 Mon Sep 17 00:00:00 2001
From: Mark Davis
Date: Thu, 25 Oct 2001 20:37:09 +0000
Subject: [PATCH] additional derived properties, some cleanup

X-SVN-Rev: 6438
---
 .../text/UCD/GenerateHanTransliterator.java   | 186 ++++++++++++++++++
 .../com/ibm/text/UCD/PropertyAliasHeader.txt  |  47 +++++
 .../ibm/text/UCD/UnifiedBinaryProperty.java   | 180 +++++++++++++++++
 3 files changed, 413 insertions(+)
 create mode 100644 tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
 create mode 100644 tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt
 create mode 100644 tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java

diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
new file mode 100644
index 00000000000..5ece2bdb6d2
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java
@@ -0,0 +1,251 @@
+/**
+*******************************************************************************
+* Copyright (C) 1996-2001, International Business Machines Corporation and    *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
+* $Date: 2001/10/25 20:37:09 $
+* $Revision: 1.1 $
+*
+*******************************************************************************
+*/
+
+package com.ibm.text.UCD;
+import java.io.*;
+import com.ibm.text.utility.*;
+import com.ibm.text.UTF16;
+import java.util.*;
+
+
+/**
+ * Generates transliterator rules for Han characters from the Unihan data file.
+ * The static type field selects the Unihan field that supplies
+ * the rule text: 0 = kDefinition, 1 = kJapaneseOn, 2 = kMandarin.
+ */
+public final class GenerateHanTransliterator {
+
+    static final boolean TESTING = false;
+
+    /** Selects the Unihan field to extract; see the switch in generate(). */
+    static int type;
+
+    static PrintWriter out;
+    static PrintWriter err;
+
+    // Not read anywhere in this class; generate() uses locals of the same names.
+    static int count;
+    static int oldLine;
+
+    /** Maps each definition already written to the first character that had it. */
+    static Map definitionMap = new HashMap();
+
+    /** Scratch buffer reused by every handlePinyin call. */
+    static StringBuffer handlePinyinTemp = new StringBuffer();
+
+    // Tone tables, indexed by (tone digit - '1'). Non-ASCII letters are written
+    // as Unicode escapes so the result does not depend on the source encoding.
+    private static final String TONES_A = "A\u00C1\u0102\u00C0\u0100";
+    private static final String TONES_E = "E\u00C9\u0114\u00C8\u0112";
+    private static final String TONES_I = "I\u00CD\u012C\u00CC\u012A";
+    private static final String TONES_O = "O\u00D3\u014E\u00D2\u014C";
+    private static final String TONES_U = "U\u00DA\u016C\u00D9\u016A";
+    private static final String TONES_U_DIAERESIS = "\u00DC\u01D7\u01D9\u01DB\u01D5";
+
+    /** Combining marks appended when no vowel is found; index 0 is never used. */
+    private static final String TONE_MARKS = " \u0301\u0306\u0300\u0304";
+
+    /** Runs the generator for type 0 and reports any exception on System.out. */
+    public static void main() {
+        try {
+            type = 0;
+            System.out.println("Starting");
+            generate();
+        } catch (Exception e) {
+            System.out.println("Exception: " + e);
+        }
+    }
+
+    /**
+     * Reads the Unihan file and writes one rule for each character that has the
+     * selected key field and, when a filter is set, the filter field as well.
+     *
+     * @throws java.io.IOException if opening, reading or closing a file fails
+     */
+    static void generate() throws java.io.IOException {
+        String name = "$Han$English"; // set per type, but not read below
+        String key = "kDefinition"; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn
+        String filter = "kJis0";
+        switch (type) {
+            default: break;
+            case 1: name = "$Han$OnRomaji";
+                key = "kJapaneseOn";
+                filter = "kJis0";
+                break;
+            case 2: name = "$Han$Pinyin";
+                key = "kMandarin";
+                filter = null;
+                break;
+        }
+
+        BufferedReader in = null;
+        try {
+            out = Utility.openPrintWriter("Transliterate_Han_English.txt");
+            err = Utility.openPrintWriter("Transliterate_Han_English.log.txt");
+            in = Utility.openUnicodeFile("Unihan", "3.2.0");
+
+            int count = 0;
+            String oldCode = "";
+            String oldLine = "";
+            int oldStart = 0;
+            boolean foundFilter = (filter == null);
+            boolean foundKey = false;
+            int lineCounter = 0;
+
+            while (true) {
+                Utility.dot(++lineCounter);
+
+                String line = in.readLine();
+                if (line == null) break;
+                if (line.length() < 6) continue;
+                if (line.charAt(0) == '#') continue;
+                String code = line.substring(2,6); // parsed as hex by printDef
+                if (!code.equals(oldCode)) {
+                    // A new character starts: write out the previous one if complete.
+                    if (foundKey && foundFilter) {
+                        count++;
+                        if (count == 1 || (count % 100) == 0) {
+                            System.out.println(count + ": " + oldLine);
+                        }
+                        printDef(out, oldCode, oldLine, oldStart);
+                    }
+                    if (TESTING) if (count > 1000) break;
+                    oldCode = code;
+                    foundKey = false;
+                    foundFilter = (filter == null);
+                }
+
+                // detect key, filter. Must be on different lines
+                if (!foundFilter && line.indexOf(filter) >= 0) {
+                    foundFilter = true;
+                } else if (!foundKey && (oldStart = line.indexOf(key)) >= 0) {
+                    foundKey = true;
+                    oldLine = line;
+                    oldStart += key.length();
+                }
+            }
+            if (foundKey && foundFilter) printDef(out, oldCode, oldLine, oldStart);
+        } finally {
+            // Release every stream that was opened, even if reading failed.
+            try {
+                if (in != null) in.close();
+            } finally {
+                if (out != null) out.close();
+                if (err != null) err.close();
+            }
+        }
+    }
+
+    /**
+     * Writes the rule for one character.
+     *
+     * @param out destination for the rule
+     * @param code hexadecimal code point; nothing is written when it is empty
+     * @param line line containing the field value
+     * @param start index in line just after the field name
+     */
+    static void printDef(PrintWriter out, String code, String line, int start) {
+        if (code.length() == 0) return;
+
+        // skip spaces & numbers at start
+        for (;start < line.length(); ++start) {
+            char ch = line.charAt(start);
+            if (ch != ' ' && ch != '\t' && (ch < '0' || ch > '9')) break;
+        }
+
+        // go up to comma or semicolon, whichever is earlier
+        int end = line.indexOf(";", start);
+        if (end < 0) end = line.length();
+
+        int end2 = line.indexOf(",", start);
+        if (end2 < 0) end2 = line.length();
+        if (end > end2) end = end2;
+
+        if (type != 0) { // for the other types, also stop at the first space
+            end2 = line.indexOf(" ", start);
+            if (end2 < 0) end2 = line.length();
+            if (end > end2) end = end2;
+        }
+
+        String definition = line.substring(start,end);
+        if (type == 2) definition = handlePinyin(definition, line);
+        definition = definition.trim(); // Strings are immutable: keep the result
+        String cp = UTF16.valueOf(Integer.parseInt(code, 16));
+        String key = (String) definitionMap.get(definition);
+        if (key == null) {
+            definitionMap.put(definition, cp);
+        }
+        // "<>" for the first character with this definition, ">" for repeats.
+        out.println(cp + (key == null ? " <> " : " > ") + "'[" + definition + "]';");
+        if (TESTING) System.out.println("# " + code + " > " + definition);
+    }
+
+    /**
+     * Converts a reading ending in a tone digit into lowercase text with the tone
+     * marked on a vowel. The last uppercase A, E, I, O, U or U-with-diaeresis
+     * before the digit is replaced from its tone table and the digit is dropped.
+     * If there is no such vowel and the tone index is not 0, a combining mark is
+     * appended instead and the case is reported on err.
+     *
+     * @param source reading whose last character is a digit from '1' to '5'
+     * @param debugLine text quoted in log messages
+     * @return the converted reading; or source unchanged, after
+     *         logging "Bad line", when it does not end in such a digit
+     */
+    static String handlePinyin(String source, String debugLine) {
+        // Each tone table has five entries, so the only valid indices are 0..4.
+        int num = -1;
+        if (source != null && source.length() > 0) {
+            num = source.charAt(source.length()-1) - '1';
+        }
+        if (num < 0 || num >= TONE_MARKS.length()) {
+            err.println("Bad line: " + debugLine);
+            return source;
+        }
+
+        handlePinyinTemp.setLength(0);
+        boolean gotIt = false;
+        // Walk backwards so that the vowel nearest the tone digit gets the accent.
+        for (int i = source.length()-2; i >= 0; --i) {
+            char ch = source.charAt(i);
+            if (!gotIt) {
+                String tones = toneTable(ch);
+                if (tones != null) {
+                    ch = tones.charAt(num);
+                    gotIt = true;
+                }
+            }
+            handlePinyinTemp.insert(0,ch);
+        }
+        if (!gotIt && num > 0) {
+            handlePinyinTemp.append(TONE_MARKS.charAt(num));
+            err.println("Missing vowel?: " + debugLine + " -> " + handlePinyinTemp
+                .toString());
+        }
+        // A fixed locale keeps the output the same under every default locale.
+        return handlePinyinTemp.toString().toLowerCase(Locale.ENGLISH);
+    }
+
+    /** Returns the tone table for an uppercase vowel, or null for other characters. */
+    private static String toneTable(char vowel) {
+        switch (vowel) {
+            case 'A': return TONES_A;
+            case 'E': return TONES_E;
+            case 'I': return TONES_I;
+            case 'O': return TONES_O;
+            case 'U': return TONES_U;
+            case '\u00DC': return TONES_U_DIAERESIS;
+            default: return null;
+        }
+    }
+}
\ No newline at end of file
diff --git a/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt b/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt
new file mode 100644
index 00000000000..569a890a502
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt
@@ -0,0 +1,47 @@
+# PropertyAliases-3.2.0.txt
+#
+# This file contains aliases for properties and property values used in the UCD.
+# These names can be used for XML formats of UCD data, for regular-expression
+# property tests, and other programmatic textual descriptions of Unicode data.
+# The names are not normative, except where they correspond to normative values
+# in the UCD.
+#
+# The names may be translated in appropriate environments, and additional
+# aliases may be useful.
+#
+# FORMAT
+#
+# Each line has three fields. When the first field is AA, BB, or ZZ,
+# the line describes a property name.
+#   AA - non-enumerated properties
+#   BB - enumerated, non-binary properties
+#   ZZ - binary properties and quick-check properties
+#
+# (The values AA, BB, and ZZ are arbitrary -- they were simply chosen to distinguish
+# the different types.)
+#
+# When the first field is not one of the above, the line describes a
+# property value name. The first field identifies the property for which that
+# property value name is used. There are two special properties:
+#
+#   xx stands for any binary property
+#   qc stands for any quick-check property
+#
+# With loose matching of property names, case distinctions, whitespace,
+# and '_' are ignored.
+#
+# NOTE: the property value names are NOT unique across properties, especially
+# with loose matches. For example,
+#   AL means Arabic Letter for the Bidi_Class property, and
+#   AL means Above_Left for the Combining_Class property, and
+#   AL means Alphabetic for the Line_Break property.
+#
+# In addition, some property names may be the same as some property value names:
+#   cc means the Combining_Class property, and
+#   cc means the General_Category property value Control (cc)
+#
+# The combination of property value and property name is, however, unique.
+# For more information, see UTR #24: Regular Expression Guidelines
+# ================================================
+
+
diff --git a/tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java b/tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java
new file mode 100644
index 00000000000..511f5e133d2
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java
@@ -0,0 +1,214 @@
+/**
+*******************************************************************************
+* Copyright (C) 1996-2001, International Business Machines Corporation and    *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java,v $
+* $Date: 2001/10/25 20:37:09 $
+* $Revision: 1.1 $
+*
+*******************************************************************************
+*/
+
+package com.ibm.text.UCD;
+import java.io.*;
+
+import com.ibm.text.utility.*;
+
+/**
+ * Presents each value of the enumerated UCD properties, each binary property
+ * and each derived property as one boolean test on a code point.
+ * A unified property mask carries the property group in the bits above the low
+ * byte (matched against constants such as CATEGORY shifted right by 8) and the
+ * value or property index within that group in the low byte.
+ */
+final class UnifiedBinaryProperty implements UCD_Types {
+    UCD ucd;
+    DerivedProperty dp;
+
+    UnifiedBinaryProperty(UCD ucdin) {
+        ucd = ucdin;
+        dp = new DerivedProperty(ucd);
+    }
+
+    /**
+     * Returns the name of the property group of propMask: the entry from
+     * ABB_UNIFIED_PROPERTIES when style is below LONG, otherwise the entry
+     * from SHORT_UNIFIED_PROPERTIES.
+     */
+    public String getPropertyName(int propMask, byte style) {
+        if (style < LONG) return UCD_Names.ABB_UNIFIED_PROPERTIES[propMask>>8];
+        else return UCD_Names.SHORT_UNIFIED_PROPERTIES[propMask>>8];
+    }
+
+    /**
+     * Asks the derived-property object whether the property is a test.
+     * @return false for every group other than DERIVED
+     */
+    public boolean isTest(int propMask) {
+        int propGroup = propMask >> 8; // "enum" is a reserved word since Java 5
+        propMask &= 0xFF;
+        if (propGroup != (DERIVED>>8)) return false;
+        return dp.isTest(propMask);
+    }
+
+    /**
+     * Checks whether a unified property mask refers to an existing value.
+     * @return true if the group is known and the low byte is below that group's
+     *         limit and is not one of the UNUSED values
+     */
+    public boolean isDefined(int propMask) {
+        int propGroup = propMask >> 8;
+        propMask &= 0xFF;
+        switch (propGroup) {
+            case CATEGORY>>8: return propMask != UNUSED_CATEGORY && propMask < LIMIT_CATEGORY;
+            case COMBINING_CLASS>>8: return true;
+                // ucd.isCombiningClassUsed((byte)propMask)
+                // || !ucd.getCombiningID_fromIndex ((byte)propMask, SHORT).startsWith("Fixed");
+            case BIDI_CLASS>>8: return propMask != BIDI_UNUSED && propMask < LIMIT_BIDI_CLASS;
+            case DECOMPOSITION_TYPE>>8: return propMask < LIMIT_DECOMPOSITION_TYPE;
+            case NUMERIC_TYPE>>8: return propMask < LIMIT_NUMERIC_TYPE;
+            case EAST_ASIAN_WIDTH>>8: return propMask < LIMIT_EAST_ASIAN_WIDTH;
+            case LINE_BREAK>>8: return propMask < LIMIT_LINE_BREAK;
+            case JOINING_TYPE>>8: return propMask < LIMIT_JOINING_TYPE;
+            case JOINING_GROUP>>8: return propMask < LIMIT_JOINING_GROUP;
+            case BINARY_PROPERTIES>>8: return propMask < LIMIT_BINARY_PROPERTIES;
+            case SCRIPT>>8: return propMask != UNUSED_SCRIPT && propMask < LIMIT_SCRIPT;
+            case AGE>>8: return propMask < LIMIT_AGE;
+            case DERIVED>>8: return dp.isDefined(propMask);
+            default: return false;
+        }
+    }
+
+    /**
+     * Tests whether a code point has the given property value.
+     * @param cp code point to test
+     * @param propMask unified property mask
+     * @throws ChainException if the group is unknown or the low byte is out of
+     *         range for its group
+     */
+    public boolean get(int cp, int propMask) {
+        int propGroup = propMask >> 8;
+        propMask &= 0xFF;
+        switch (propGroup) {
+            case CATEGORY>>8: if (propMask >= LIMIT_CATEGORY) break;
+                return ucd.getCategory(cp) == propMask;
+            case COMBINING_CLASS>>8: if (propMask >= LIMIT_COMBINING_CLASS) break;
+                return ucd.getCombiningClass(cp) == propMask;
+            case BIDI_CLASS>>8: if (propMask >= LIMIT_BIDI_CLASS) break;
+                return ucd.getBidiClass(cp) == propMask;
+            case DECOMPOSITION_TYPE>>8: if (propMask >= LIMIT_DECOMPOSITION_TYPE) break;
+                return ucd.getDecompositionType(cp) == propMask;
+            case NUMERIC_TYPE>>8: if (propMask >= LIMIT_NUMERIC_TYPE) break;
+                return ucd.getNumericType(cp) == propMask;
+            case EAST_ASIAN_WIDTH>>8: if (propMask >= LIMIT_EAST_ASIAN_WIDTH) break;
+                return ucd.getEastAsianWidth(cp) == propMask;
+            case LINE_BREAK>>8: if (propMask >= LIMIT_LINE_BREAK) break;
+                return ucd.getLineBreak(cp) == propMask;
+            case JOINING_TYPE>>8: if (propMask >= LIMIT_JOINING_TYPE) break;
+                return ucd.getJoiningType(cp) == propMask;
+            case JOINING_GROUP>>8: if (propMask >= LIMIT_JOINING_GROUP) break;
+                return ucd.getJoiningGroup(cp) == propMask;
+            case BINARY_PROPERTIES>>8: if (propMask >= LIMIT_BINARY_PROPERTIES) break;
+                return ucd.getBinaryProperty(cp, propMask);
+            case SCRIPT>>8: if (propMask >= LIMIT_SCRIPT) break;
+                return ucd.getScript(cp) == propMask;
+            case AGE>>8: if (propMask >= LIMIT_AGE) break;
+                return ucd.getAge(cp) == propMask;
+            case DERIVED>>8: if (!dp.isDefined(propMask)) break;
+                return dp.hasProperty(cp, propMask);
+        }
+        throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
+    }
+
+    /** Returns the ID of the property value in the NORMAL style. */
+    public String getID(int unifiedPropMask) {
+        return getID(unifiedPropMask, NORMAL);
+    }
+
+    /**
+     * Builds a label made of a property prefix and a value ID. For the value,
+     * styles below LONG give the short ID, LONG gives the long ID, and any other
+     * style gives short(long) when the two differ; the prefix is chosen by the
+     * same rule. Binary properties get no prefix: their own ID plus "=T" is used.
+     */
+    public String getFullID(int unifiedPropMask, byte style) {
+        String pre = "";
+        if ((unifiedPropMask & 0xFF00) != BINARY_PROPERTIES) {
+            String preShort = UCD_Names.ABB_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "=";
+            String preLong = UCD_Names.SHORT_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "=";
+            if (style < LONG) pre = preShort;
+            else if (style == LONG || preShort.equals(preLong)) pre = preLong;
+            else pre = preShort + "(" + preLong + ")";
+        }
+        String shortOne = getID(unifiedPropMask, SHORT);
+        if (shortOne.length() == 0) shortOne = "xx";
+        String longOne = getID(unifiedPropMask, LONG);
+        if (longOne.length() == 0) longOne = "none";
+
+        String post;
+        if (style < LONG) post = shortOne;
+        else if (style == LONG || shortOne.equals(longOne)) post = longOne;
+        else post = shortOne + "(" + longOne + ")";
+
+        if (pre.length() == 0) {
+            pre = post + "=";
+            post = "T";
+        }
+
+        return pre + post;
+    }
+
+    /**
+     * Returns the ID of a property value in the requested style.
+     * @throws ChainException if the group is unknown or the low byte is out of
+     *         range for its group
+     */
+    public String getID(int unifiedPropMask, byte style) {
+        int propGroup = unifiedPropMask >> 8;
+        // Range checks use the unsigned low byte: the byte cast below makes
+        // values above 127 negative, and those would pass a "below limit" test.
+        int index = unifiedPropMask & 0xFF;
+        byte propMask = (byte)unifiedPropMask;
+        switch (propGroup) {
+            case CATEGORY>>8: if (index >= LIMIT_CATEGORY) break;
+                if (style != LONG) return ucd.getCategoryID_fromIndex(propMask);
+                return UCD_Names.LONG_GC[propMask];
+            case COMBINING_CLASS>>8: if (index >= LIMIT_COMBINING_CLASS) break;
+                return UCD.getCombiningID_fromIndex((short)(unifiedPropMask & 0xFF), style);
+            case BIDI_CLASS>>8: if (index >= LIMIT_BIDI_CLASS) break;
+                if (style != LONG) return ucd.getBidiClassID_fromIndex(propMask);
+                return UCD_Names.LONG_BC[propMask];
+            case DECOMPOSITION_TYPE>>8: if (index >= LIMIT_DECOMPOSITION_TYPE) break;
+                if (style != SHORT) return ucd.getDecompositionTypeID_fromIndex(propMask);
+                return UCD_Names.SHORT_DT[propMask];
+            case NUMERIC_TYPE>>8: if (index >= LIMIT_NUMERIC_TYPE) break;
+                if (style != SHORT) return ucd.getNumericTypeID_fromIndex(propMask);
+                return UCD_Names.SHORT_NT[propMask];
+            case EAST_ASIAN_WIDTH>>8: if (index >= LIMIT_EAST_ASIAN_WIDTH) break;
+                // NOTE(review): LONG style returns SHORT_EA here, unlike the LONG_* tables of nearby cases -- confirm.
+                if (style != LONG) return ucd.getEastAsianWidthID_fromIndex(propMask);
+                return UCD_Names.SHORT_EA[propMask];
+            case LINE_BREAK>>8: if (index >= LIMIT_LINE_BREAK) break;
+                if (style != LONG) return ucd.getLineBreakID_fromIndex(propMask);
+                return UCD_Names.LONG_LB[propMask];
+            case JOINING_TYPE>>8: if (index >= LIMIT_JOINING_TYPE) break;
+                if (style != LONG) return ucd.getJoiningTypeID_fromIndex(propMask);
+                return UCD_Names.LONG_JOINING_TYPE[propMask];
+            case JOINING_GROUP>>8: if (index >= LIMIT_JOINING_GROUP) break;
+                return ucd.getJoiningGroupID_fromIndex(propMask);
+            case BINARY_PROPERTIES>>8: if (index >= LIMIT_BINARY_PROPERTIES) break;
+                if (style != SHORT) return ucd.getBinaryPropertiesID_fromIndex(propMask);
+                return UCD_Names.SHORT_BP[propMask];
+            case SCRIPT>>8: if (index >= LIMIT_SCRIPT) break;
+                if (style != SHORT) return ucd.getScriptID_fromIndex(propMask);
+                return UCD_Names.ABB_SCRIPT[propMask];
+            case AGE>>8: if (index >= LIMIT_AGE) break;
+                return ucd.getAgeID_fromIndex(propMask);
+            case DERIVED>>8: if (!dp.isDefined(propMask)) break;
+                return dp.getName(propMask, style);
+        }
+        throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(index)});
+    }
+}
\ No newline at end of file