additional derived properties, some cleanup

X-SVN-Rev: 6438
This commit is contained in:
Mark Davis 2001-10-25 20:37:09 +00:00
parent 899b56f176
commit 804e4e91fa
3 changed files with 413 additions and 0 deletions

View file

@ -0,0 +1,186 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
* $Date: 2001/10/25 20:37:09 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.*;
import com.ibm.text.utility.*;
import com.ibm.text.UTF16;
import java.util.*;
/**
 * Generates transliteration rules for Han characters from the Unihan data file.
 *
 * <p>The Unihan file is scanned line by line. Consecutive lines that share a
 * code point are treated as one group; a group produces a rule when it contains
 * a line mentioning the selected key field and (if a filter field is
 * configured) a different line mentioning the filter field. Rules are written
 * to {@code Transliterate_Han_English.txt}; diagnostics are written to
 * {@code Transliterate_Han_English.log.txt}.
 */
public final class GenerateHanTransliterator {
    /** When true, stops after roughly 1000 rules and echoes each rule to stdout. */
    static final boolean TESTING = false;

    /**
     * Selects what {@link #generate()} extracts: 0 (or any other value) uses
     * kDefinition, 1 uses kJapaneseOn, 2 uses kMandarin with tone marks.
     */
    static int type;

    /**
     * Runs the generator for type 0. Any exception is reported on stdout
     * rather than propagated.
     */
    public static void main() {
        try {
            type = 0;
            System.out.println("Starting");
            generate();
        } catch (Exception e) {
            System.out.println("Exception: " + e);
        }
    }

    /** Rule output; assigned by {@link #generate()}. */
    static PrintWriter out;

    /** Diagnostic log; assigned by {@link #generate()} and used by {@link #handlePinyin}. */
    static PrintWriter err;

    // The two fields below are not read or written anywhere in this class.
    // They are kept only so that any other code in the package that might
    // refer to them keeps compiling.
    static int count;
    static int oldLine;

    /** Number of entries in each tone table used by {@link #handlePinyin}. */
    private static final int TONE_COUNT = 5;

    /**
     * Reads the Unihan file and writes one rule per qualifying code point.
     *
     * @throws java.io.IOException if the input cannot be read or the outputs
     *         cannot be opened
     */
    static void generate() throws java.io.IOException {
        // NOTE(review): 'name' is selected per type but never used; the output
        // file names below are fixed regardless of type.
        String name = "$Han$English";
        String key = "kDefinition"; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn
        String filter = "kJis0";
        switch (type) {
            default: break;
            case 1: name = "$Han$OnRomaji";
                key = "kJapaneseOn";
                filter = "kJis0";
                break;
            case 2: name = "$Han$Pinyin";
                key = "kMandarin";
                filter = null;
                break;
        }

        BufferedReader in = null;
        out = null;
        err = null;
        try {
            out = Utility.openPrintWriter("Transliterate_Han_English.txt");
            err = Utility.openPrintWriter("Transliterate_Han_English.log.txt");
            in = Utility.openUnicodeFile("Unihan", "3.2.0");

            int emitted = 0;          // number of rules written so far
            String oldCode = "";      // code point of the group being collected
            String keyLine = "";      // line of the current group that holds the key field
            int oldStart = 0;         // offset in keyLine just past the key field name
            boolean foundFilter = (filter == null);
            boolean foundKey = false;
            int lineCounter = 0;

            while (true) {
                Utility.dot(++lineCounter);
                String line = in.readLine();
                if (line == null) break;
                if (line.length() < 6) continue;
                if (line.charAt(0) == '#') continue;
                String code = extractCode(line);
                /* if (code.compareTo("9FA0") >= 0) {
                System.out.println("? " + line);
                }*/
                if (!code.equals(oldCode)) {
                    // A new code point starts: flush the previous group if it qualified.
                    if (foundKey && foundFilter) {
                        emitted++;
                        /*if (true) { //*/
                        if (emitted == 1 || (emitted % 100) == 0) {
                            System.out.println(emitted + ": " + keyLine);
                        }
                        printDef(out, oldCode, keyLine, oldStart);
                    }
                    if (TESTING) if (emitted > 1000) break;
                    oldCode = code;
                    foundKey = false;
                    foundFilter = (filter == null);
                }
                // detect key, filter. Must be on different lines
                if (!foundFilter && line.indexOf(filter) >= 0) {
                    foundFilter = true;
                } else if (!foundKey && (oldStart = line.indexOf(key)) >= 0) {
                    foundKey = true;
                    keyLine = line;
                    oldStart += key.length();
                }
            }
            // Flush the final group, which has no following line to trigger it.
            if (foundKey && foundFilter) printDef(out, oldCode, keyLine, oldStart);
        } finally {
            // Release every stream even when reading or rule generation fails.
            try {
                if (in != null) in.close();
            } finally {
                if (out != null) out.close();
                if (err != null) err.close();
            }
        }
    }

    /**
     * Extracts the hexadecimal code point that follows the two-character
     * prefix of a data line. Reads all consecutive hex digits so that code
     * points with more than four digits are not truncated; if fewer than four
     * hex digits are present, falls back to the fixed slice of characters 2..5.
     *
     * @param line a data line of at least 6 characters
     * @return the code point field as a hex string
     */
    private static String extractCode(String line) {
        int end = 2;
        while (end < line.length() && Character.digit(line.charAt(end), 16) >= 0) {
            ++end;
        }
        if (end - 2 < 4) return line.substring(2, 6);
        return line.substring(2, end);
    }

    /**
     * Writes one transliteration rule for a code point.
     *
     * <p>The definition is the text of {@code line} from {@code start} (after
     * skipping leading spaces, tabs and digits) up to the first semicolon or
     * comma; for non-zero {@link #type} it also stops at the first space. The
     * first code point seen for a given definition gets a two-way rule
     * ({@code <>}); later code points with the same definition get a one-way
     * rule ({@code >}).
     *
     * @param out   destination for the rule
     * @param code  code point in hex; nothing is written when empty
     * @param line  the data line holding the key field
     * @param start offset in {@code line} just past the key field name
     */
    static void printDef(PrintWriter out, String code, String line, int start) {
        if (code.length() == 0) return;

        // skip spaces & numbers at start
        for (; start < line.length(); ++start) {
            char ch = line.charAt(start);
            if (ch != ' ' && ch != '\t' && (ch < '0' || ch > '9')) break;
        }

        // go up to comma or semicolon, whichever is earlier
        int end = line.indexOf(";", start);
        if (end < 0) end = line.length();
        int end2 = line.indexOf(",", start);
        if (end2 < 0) end2 = line.length();
        if (end > end2) end = end2;
        if (type != 0) {
            end2 = line.indexOf(" ", start);
            if (end2 < 0) end2 = line.length();
            if (end > end2) end = end2;
        }

        String definition = line.substring(start, end);
        if (type == 2) definition = handlePinyin(definition, line);
        // String.trim() returns a new string; the result must be kept.
        definition = definition.trim();

        String cp = UTF16.valueOf(Integer.parseInt(code, 16));
        String key = (String) definitionMap.get(definition);
        if (key == null) {
            definitionMap.put(definition, cp);
        }
        out.println(cp + (key == null ? " <> " : " > ") + "'[" + definition + "]';");
        if (TESTING) System.out.println("# " + code + " > " + definition);
    }

    /** Maps each definition already emitted to the first code point that used it. */
    static Map definitionMap = new HashMap();

    /** Scratch buffer reused by {@link #handlePinyin}; not safe for concurrent use. */
    static StringBuffer handlePinyinTemp = new StringBuffer();

    /**
     * Converts a syllable ending in a tone digit into lowercase text with the
     * tone shown as an accent.
     *
     * <p>The last character must be a digit '1'..'5'; it selects entry 0..4 of
     * a per-vowel table ordered plain, acute, breve (caron for U-diaeresis),
     * grave, macron. The right-most A, E, I, O, U or U-diaeresis is replaced
     * by its table entry. When no such vowel exists and the entry index is
     * non-zero, the matching combining mark is appended instead and a
     * "Missing vowel?" message is logged.
     *
     * @param source    the syllable, expected in upper case with a trailing tone digit
     * @param debugLine the full input line, used only in log messages
     * @return the converted syllable, or {@code source} unchanged (after
     *         logging "Bad line") when it is empty or has no valid tone digit
     */
    static String handlePinyin(String source, String debugLine) {
        int num = (source.length() == 0) ? -1 : source.charAt(source.length() - 1) - '1';
        if (num < 0 || num >= TONE_COUNT) {
            err.println("Bad line: " + debugLine);
            return source;
        }

        handlePinyinTemp.setLength(0);
        boolean gotIt = false;
        boolean messageIfNoGotIt = true;
        // Walk right to left, skipping the tone digit, so the right-most vowel wins.
        for (int i = source.length() - 2; i >= 0; --i) {
            char ch = source.charAt(i);
            // Every table has TONE_COUNT entries in the same order as the
            // combining-mark table below, so any accepted tone index is valid.
            // Unicode escapes keep the tables independent of source encoding.
            if (!gotIt) switch (ch) {
                case 'A': ch = "A\u00C1\u0102\u00C0\u0100".charAt(num); gotIt = true; break;
                case 'E': ch = "E\u00C9\u0114\u00C8\u0112".charAt(num); gotIt = true; break;
                case 'I': ch = "I\u00CD\u012C\u00CC\u012A".charAt(num); gotIt = true; break;
                case 'O': ch = "O\u00D3\u014E\u00D2\u014C".charAt(num); gotIt = true; break;
                case 'U': ch = "U\u00DA\u016C\u00D9\u016A".charAt(num); gotIt = true; break;
                case '\u00DC': ch = "\u00DC\u01D7\u01D9\u01DB\u01D5".charAt(num); gotIt = true; break;
            }
            handlePinyinTemp.insert(0, ch);
        }

        if (!gotIt && num > 0) {
            handlePinyinTemp.append(" \u0301\u0306\u0300\u0304".charAt(num));
            if (messageIfNoGotIt) {
                err.println("Missing vowel?: " + debugLine + " -> " + handlePinyinTemp
                    .toString());
            }
        }
        // Fixed locale: the default-locale overload would turn 'I' into a
        // dotless i under a Turkish default locale.
        return handlePinyinTemp.toString().toLowerCase(Locale.ENGLISH);
    }
}

View file

@ -0,0 +1,47 @@
# PropertyAliases-3.2.0.txt
#
# This file contains aliases for properties and property values used in the UCD.
# These names can be used for XML formats of UCD data, for regular-expression
# property tests, and other programmatic textual descriptions of Unicode data.
# The names are not normative, except where they correspond to normative values
# in the UCD.
#
# The names may be translated in appropriate environments, and additional
# aliases may be useful.
#
# FORMAT
#
# Each line has three fields. If the first field is AA, BB, or ZZ, the line
# describes a property name:
# AA - non-enumerated properties
# BB - enumerated, non-binary properties
# ZZ - binary properties and quick-check properties
#
# (The values AA, BB, and ZZ are arbitrary -- they were simply chosen to distinguish
# the different types.)
#
# Where the first field is not one of the above, the line describes a
# property value name. The first field describes the property for which that
# property value name is used. There are two special properties:
#
# xx stands for any binary property
# qc stands for any quick-check property
#
# With loose matching of property names, case distinctions, whitespace,
# and '_' are ignored.
#
# NOTE: the property value names are NOT unique across properties, especially
# with loose matches. For example,
# AL means Arabic Letter for the Bidi_Class property, and
# AL means Alpha_Left for the Combining_Class property, and
# AL means Alphabetic for the Line_Break property.
#
# In addition, some property names may be the same as some property value names:
# cc means Combining_Class property, and
# cc means the General_Category property value Control (cc)
#
# The combination of property value and property name is, however, unique.
# For more information, see UTR #18: Unicode Regular Expression Guidelines
# ================================================

View file

@ -0,0 +1,180 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java,v $
* $Date: 2001/10/25 20:37:09 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.*;
import com.ibm.text.utility.*;
final class UnifiedBinaryProperty implements UCD_Types {
UCD ucd;
DerivedProperty dp;
UnifiedBinaryProperty(UCD ucdin) {
ucd = ucdin;
dp = new DerivedProperty(ucd);
}
public String getPropertyName(int propMask, byte style) {
if (style < LONG) return UCD_Names.ABB_UNIFIED_PROPERTIES[propMask>>8];
else return UCD_Names.SHORT_UNIFIED_PROPERTIES[propMask>>8];
}
public boolean isTest(int propMask) {
int enum = propMask >> 8;
propMask &= 0xFF;
if (enum != (DERIVED>>8)) return false;
return dp.isTest(propMask);
}
/**
* @return unified property number
*/
public boolean isDefined(int propMask) {
int enum = propMask >> 8;
propMask &= 0xFF;
switch (enum) {
case CATEGORY>>8: return propMask != UNUSED_CATEGORY && propMask < LIMIT_CATEGORY;
case COMBINING_CLASS>>8: return true;
// ucd.isCombiningClassUsed((byte)propMask)
// || !ucd.getCombiningID_fromIndex ((byte)propMask, SHORT).startsWith("Fixed");
case BIDI_CLASS>>8: return propMask != BIDI_UNUSED && propMask < LIMIT_BIDI_CLASS;
case DECOMPOSITION_TYPE>>8: return propMask < LIMIT_DECOMPOSITION_TYPE;
case NUMERIC_TYPE>>8: return propMask < LIMIT_NUMERIC_TYPE;
case EAST_ASIAN_WIDTH>>8: return propMask < LIMIT_EAST_ASIAN_WIDTH;
case LINE_BREAK>>8: return propMask < LIMIT_LINE_BREAK;
case JOINING_TYPE>>8: return propMask < LIMIT_JOINING_TYPE;
case JOINING_GROUP>>8: return propMask < LIMIT_JOINING_GROUP;
case BINARY_PROPERTIES>>8: return propMask < LIMIT_BINARY_PROPERTIES;
case SCRIPT>>8: return propMask != UNUSED_SCRIPT && propMask < LIMIT_SCRIPT;
case AGE>>8: return propMask < LIMIT_AGE;
case DERIVED>>8: return dp.isDefined(propMask);
default: return false;
}
}
public boolean get(int cp, int propMask) {
int enum = propMask >> 8;
propMask &= 0xFF;
switch (enum) {
case CATEGORY>>8: if (propMask >= LIMIT_CATEGORY) break;
return ucd.getCategory(cp) == propMask;
case COMBINING_CLASS>>8: if (propMask >= LIMIT_COMBINING_CLASS) break;
return ucd.getCombiningClass(cp) == propMask;
case BIDI_CLASS>>8: if (propMask >= LIMIT_BIDI_CLASS) break;
return ucd.getBidiClass(cp) == propMask;
case DECOMPOSITION_TYPE>>8: if (propMask >= LIMIT_DECOMPOSITION_TYPE) break;
return ucd.getDecompositionType(cp) == propMask;
case NUMERIC_TYPE>>8: if (propMask >= LIMIT_NUMERIC_TYPE) break;
return ucd.getNumericType(cp) == propMask;
case EAST_ASIAN_WIDTH>>8: if (propMask >= LIMIT_EAST_ASIAN_WIDTH) break;
return ucd.getEastAsianWidth(cp) == propMask;
case LINE_BREAK>>8: if (propMask >= LIMIT_LINE_BREAK) break;
return ucd.getLineBreak(cp) == propMask;
case JOINING_TYPE>>8: if (propMask >= LIMIT_JOINING_TYPE) break;
return ucd.getJoiningType(cp) == propMask;
case JOINING_GROUP>>8: if (propMask >= LIMIT_JOINING_GROUP) break;
return ucd.getJoiningGroup(cp) == propMask;
case BINARY_PROPERTIES>>8: if (propMask >= LIMIT_BINARY_PROPERTIES) break;
return ucd.getBinaryProperty(cp, propMask);
case SCRIPT>>8: if (propMask >= LIMIT_SCRIPT) break;
return ucd.getScript(cp) == propMask;
case AGE>>8: if (propMask >= LIMIT_AGE) break;
return ucd.getAge(cp) == propMask;
case DERIVED>>8: if (!dp.isDefined(propMask)) break;
return dp.hasProperty(cp, propMask);
}
throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
}
public String getID(int unifiedPropMask) {
return getID(unifiedPropMask, NORMAL);
}
/*
public static String getID(UCD ucd, int unifiedPropMask) {
String longOne = getID(ucd, unifiedPropMask, LONG);
String shortOne = getID(ucd, unifiedPropMask, SHORT);
if (longOne.equals(shortOne)) return longOne;
return shortOne + "(" + longOne + ")";
}
*/
public String getFullID(int unifiedPropMask, byte style) {
String pre = "";
if ((unifiedPropMask & 0xFF00) != BINARY_PROPERTIES) {
String preShort = UCD_Names.ABB_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "=";
String preLong = UCD_Names.SHORT_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "=";
if (style < LONG) pre = preShort;
else if (style == LONG || preShort.equals(preLong)) pre = preLong;
else pre = preShort + "(" + preLong + ")";
}
String shortOne = getID(unifiedPropMask, SHORT);
if (shortOne.length() == 0) shortOne = "xx";
String longOne = getID(unifiedPropMask, LONG);
if (longOne.length() == 0) longOne = "none";
String post;
if (style < LONG) post = shortOne;
else if (style == LONG || shortOne.equals(longOne)) post = longOne;
else post = shortOne + "(" + longOne + ")";
if (pre.length() == 0) {
pre = post + "=";
post = "T";
}
return pre + post;
}
public String getID(int unifiedPropMask, byte style) {
int enum = unifiedPropMask >> 8;
byte propMask = (byte)unifiedPropMask;
switch (enum) {
case CATEGORY>>8: if (propMask >= LIMIT_CATEGORY) break;
if (style != LONG) return ucd.getCategoryID_fromIndex(propMask);
return UCD_Names.LONG_GC[propMask];
case COMBINING_CLASS>>8: if (propMask >= LIMIT_COMBINING_CLASS) break;
return UCD.getCombiningID_fromIndex((short)(unifiedPropMask & 0xFF), style);
case BIDI_CLASS>>8: if (propMask >= LIMIT_BIDI_CLASS) break;
if (style != LONG) return ucd.getBidiClassID_fromIndex(propMask);
return UCD_Names.LONG_BC[propMask];
case DECOMPOSITION_TYPE>>8: if (propMask >= LIMIT_DECOMPOSITION_TYPE) break;
if (style != SHORT) return ucd.getDecompositionTypeID_fromIndex(propMask);
return UCD_Names.SHORT_DT[propMask];
case NUMERIC_TYPE>>8: if (propMask >= LIMIT_NUMERIC_TYPE) break;
if (style != SHORT) return ucd.getNumericTypeID_fromIndex(propMask);
return UCD_Names.SHORT_NT[propMask];
case EAST_ASIAN_WIDTH>>8: if (propMask >= LIMIT_EAST_ASIAN_WIDTH) break;
if (style != LONG) return ucd.getEastAsianWidthID_fromIndex(propMask);
return UCD_Names.SHORT_EA[propMask];
case LINE_BREAK>>8: if (propMask >= LIMIT_LINE_BREAK) break;
if (style != LONG) return ucd.getLineBreakID_fromIndex(propMask);
return UCD_Names.LONG_LB[propMask];
case JOINING_TYPE>>8: if (propMask >= LIMIT_JOINING_TYPE) break;
if (style != LONG) return ucd.getJoiningTypeID_fromIndex(propMask);
return UCD_Names.LONG_JOINING_TYPE[propMask];
case JOINING_GROUP>>8: if (propMask >= LIMIT_JOINING_GROUP) break;
return ucd.getJoiningGroupID_fromIndex(propMask);
case BINARY_PROPERTIES>>8: if (propMask >= LIMIT_BINARY_PROPERTIES) break;
if (style != SHORT) return ucd.getBinaryPropertiesID_fromIndex(propMask);
return UCD_Names.SHORT_BP[propMask];
case SCRIPT>>8: if (propMask >= LIMIT_SCRIPT) break;
if (style != SHORT) return ucd.getScriptID_fromIndex(propMask);
return UCD_Names.ABB_SCRIPT[propMask];
case AGE>>8: if (propMask >= LIMIT_AGE) break;
return ucd.getAgeID_fromIndex(propMask);
case DERIVED>>8: if (!dp.isDefined(propMask)) break;
return dp.getName(propMask, style);
}
throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
}
}