mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
additional derived properties, some cleanup
X-SVN-Rev: 6438
This commit is contained in:
parent
899b56f176
commit
804e4e91fa
3 changed files with 413 additions and 0 deletions
|
@ -0,0 +1,186 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
|
||||
* $Date: 2001/10/25 20:37:09 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.text.UTF16;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public final class GenerateHanTransliterator {
|
||||
|
||||
static final boolean TESTING = false;
|
||||
static int type;
|
||||
|
||||
public static void main() {
|
||||
try {
|
||||
type = 0;
|
||||
System.out.println("Starting");
|
||||
generate();
|
||||
} catch (Exception e) {
|
||||
System.out.println("Exception: " + e);
|
||||
}
|
||||
}
|
||||
|
||||
static PrintWriter out;
|
||||
static PrintWriter err;
|
||||
|
||||
static int count;
|
||||
static int oldLine;
|
||||
|
||||
static void generate() throws java.io.IOException {
|
||||
String name = "$Han$English";
|
||||
String key = "kDefinition"; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn
|
||||
String filter = "kJis0";
|
||||
switch (type) {
|
||||
default: break;
|
||||
case 1: name = "$Han$OnRomaji";
|
||||
key = "kJapaneseOn";
|
||||
filter = "kJis0";
|
||||
break;
|
||||
case 2: name = "$Han$Pinyin";
|
||||
key = "kMandarin";
|
||||
filter = null;
|
||||
break;
|
||||
}
|
||||
|
||||
out = Utility.openPrintWriter("Transliterate_Han_English.txt");
|
||||
err = Utility.openPrintWriter("Transliterate_Han_English.log.txt");
|
||||
|
||||
BufferedReader in = Utility.openUnicodeFile("Unihan", "3.2.0");
|
||||
|
||||
int count = 0;
|
||||
String oldCode = "";
|
||||
String oldLine = "";
|
||||
int oldStart = 0;
|
||||
boolean foundFilter = (filter == null);
|
||||
boolean foundKey = false;
|
||||
|
||||
int lineCounter = 0;
|
||||
|
||||
while (true) {
|
||||
Utility.dot(++lineCounter);
|
||||
|
||||
String line = in.readLine();
|
||||
if (line == null) break;
|
||||
if (line.length() < 6) continue;
|
||||
if (line.charAt(0) == '#') continue;
|
||||
String code = line.substring(2,6);
|
||||
/* if (code.compareTo("9FA0") >= 0) {
|
||||
System.out.println("? " + line);
|
||||
}*/
|
||||
if (!code.equals(oldCode)) {
|
||||
if (foundKey && foundFilter) {
|
||||
count++;
|
||||
/*if (true) { //*/
|
||||
if (count == 1 || (count % 100) == 0) {
|
||||
System.out.println(count + ": " + oldLine);
|
||||
}
|
||||
printDef(out, oldCode, oldLine, oldStart);
|
||||
}
|
||||
if (TESTING) if (count > 1000) break;
|
||||
oldCode = code;
|
||||
foundKey = false;
|
||||
foundFilter = (filter == null);
|
||||
}
|
||||
|
||||
// detect key, filter. Must be on different lines
|
||||
if (!foundFilter && line.indexOf(filter) >= 0) {
|
||||
foundFilter = true;
|
||||
} else if (!foundKey && (oldStart = line.indexOf(key)) >= 0) {
|
||||
foundKey = true;
|
||||
oldLine = line;
|
||||
oldStart += key.length();
|
||||
}
|
||||
}
|
||||
if (foundKey && foundFilter) printDef(out, oldCode, oldLine, oldStart);
|
||||
|
||||
in.close();
|
||||
out.close();
|
||||
err.close();
|
||||
}
|
||||
|
||||
static void printDef(PrintWriter out, String code, String line, int start) {
|
||||
if (code.length() == 0) return;
|
||||
|
||||
// skip spaces & numbers at start
|
||||
for (;start < line.length(); ++start) {
|
||||
char ch = line.charAt(start);
|
||||
if (ch != ' ' && ch != '\t' && (ch < '0' || ch > '9')) break;
|
||||
}
|
||||
|
||||
// go up to comma or semicolon, whichever is earlier
|
||||
int end = line.indexOf(";", start);
|
||||
if (end < 0) end = line.length();
|
||||
|
||||
int end2 = line.indexOf(",", start);
|
||||
if (end2 < 0) end2 = line.length();
|
||||
if (end > end2) end = end2;
|
||||
|
||||
if (type != 0) {
|
||||
end2 = line.indexOf(" ", start);
|
||||
if (end2 < 0) end2 = line.length();
|
||||
if (end > end2) end = end2;
|
||||
}
|
||||
|
||||
String definition = line.substring(start,end);
|
||||
if (type == 2) definition = handlePinyin(definition, line);
|
||||
definition.trim();
|
||||
String cp = UTF16.valueOf(Integer.parseInt(code, 16));
|
||||
String key = (String) definitionMap.get(definition);
|
||||
if (key == null) {
|
||||
definitionMap.put(definition, cp);
|
||||
}
|
||||
out.println(cp + (key == null ? " <> " : " > ") + "'[" + definition + "]';");
|
||||
if (TESTING) System.out.println("# " + code + " > " + definition);
|
||||
}
|
||||
|
||||
static Map definitionMap = new HashMap();
|
||||
|
||||
static StringBuffer handlePinyinTemp = new StringBuffer();
|
||||
|
||||
static String handlePinyin(String source, String debugLine) {
|
||||
try {
|
||||
char ch = source.charAt(source.length()-1);
|
||||
int num = (int)(ch-'1');
|
||||
if (num < 0 || num > 5) throw new Exception("none");
|
||||
handlePinyinTemp.setLength(0);
|
||||
boolean gotIt = false;
|
||||
boolean messageIfNoGotIt = true;
|
||||
for (int i = source.length()-2; i >= 0; --i) {
|
||||
ch = source.charAt(i);
|
||||
if (!gotIt) switch (ch) {
|
||||
case 'A': ch = "AÁ\u0102À\u0100".charAt(num); gotIt = true; break;
|
||||
case 'E': ch = "EÉ\u0114È\u0112".charAt(num); gotIt = true; break;
|
||||
case 'I': ch = "IÍ\u012CÌ\u012A".charAt(num); gotIt = true; break;
|
||||
case 'O': ch = "OÓ\u014EÒ\u014C".charAt(num); gotIt = true; break;
|
||||
case 'U': ch = "UÚ\u016CÙ\u016A".charAt(num); gotIt = true; break;
|
||||
case 'Ü': ch = "Ü\u01D7\u01D9\u01DB\u01D5".charAt(num); gotIt = true; break;
|
||||
}
|
||||
handlePinyinTemp.insert(0,ch);
|
||||
}
|
||||
if (!gotIt && num > 0) {
|
||||
handlePinyinTemp.append(" \u0301\u0306\u0300\u0304".charAt(num));
|
||||
if (messageIfNoGotIt) {
|
||||
err.println("Missing vowel?: " + debugLine + " -> " + handlePinyinTemp
|
||||
.toString());
|
||||
}
|
||||
}
|
||||
source = handlePinyinTemp.toString().toLowerCase();
|
||||
} catch (Exception e) {
|
||||
err.println("Bad line: " + debugLine);
|
||||
}
|
||||
return source;
|
||||
}
|
||||
}
|
47
tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt
Normal file
47
tools/unicodetools/com/ibm/text/UCD/PropertyAliasHeader.txt
Normal file
|
@ -0,0 +1,47 @@
|
|||
# PropertyAliases-3.2.0.txt
|
||||
#
|
||||
# This file contains aliases for properties and property values used in the UCD.
|
||||
# These names can be used for XML formats of UCD data, for regular-expression
|
||||
# property tests, and other programmatic textual descriptions of Unicode data.
|
||||
# The names are not normative, except where they correspond to normative values
|
||||
# in the UCD.
|
||||
#
|
||||
# The names may be translated in appropriate environments, and additional
|
||||
# aliases may be useful.
|
||||
#
|
||||
# FORMAT
|
||||
#
|
||||
# Each line has three fields. Where the first field is AA, BB, or ZZ,
|
||||
# the line describes a property name.
|
||||
# AA - non-enumerated properties
|
||||
# BB - enumerated, non-binary properties
|
||||
# ZZ - binary properties and quick-check properties
|
||||
#
|
||||
# (The values AA, BB, and ZZ are arbitrary -- they were simply chosen to distinguish
|
||||
# the different types.)
|
||||
#
|
||||
# Where the first field is not one of the above, the line describes a
|
||||
# property value name. The first field describes the property for which that
|
||||
# property value name is used. There are two special properties:
|
||||
#
|
||||
# xx stands for any binary property
|
||||
# qc stands for any quick-check property
|
||||
#
|
||||
# With loose matching of property names, case distinctions, whitespace,
|
||||
# and '_' are ignored.
|
||||
#
|
||||
# NOTE: the property value names are NOT unique across properties, especially
|
||||
# with loose matches. For example,
|
||||
# AL means Arabic Letter for the Bidi_Class property, and
|
||||
# AL means Alpha_Left for the Combining_Class property, and
|
||||
# AL means Alphabetic for the Line_Break property.
|
||||
#
|
||||
# In addition, some property names may be the same as some property value names:
|
||||
# cc means Combining_Class property, and
|
||||
# cc means the General_Category property value Control (cc)
|
||||
#
|
||||
# The combination of property value and property name is, however, unique.
|
||||
# For more information, see UTR #18: Unicode Regular Expression Guidelines
|
||||
# ================================================
|
||||
|
||||
|
180
tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java
Normal file
180
tools/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java
Normal file
|
@ -0,0 +1,180 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java,v $
|
||||
* $Date: 2001/10/25 20:37:09 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
final class UnifiedBinaryProperty implements UCD_Types {
|
||||
UCD ucd;
|
||||
DerivedProperty dp;
|
||||
|
||||
UnifiedBinaryProperty(UCD ucdin) {
|
||||
ucd = ucdin;
|
||||
dp = new DerivedProperty(ucd);
|
||||
}
|
||||
|
||||
public String getPropertyName(int propMask, byte style) {
|
||||
if (style < LONG) return UCD_Names.ABB_UNIFIED_PROPERTIES[propMask>>8];
|
||||
else return UCD_Names.SHORT_UNIFIED_PROPERTIES[propMask>>8];
|
||||
}
|
||||
|
||||
public boolean isTest(int propMask) {
|
||||
int enum = propMask >> 8;
|
||||
propMask &= 0xFF;
|
||||
if (enum != (DERIVED>>8)) return false;
|
||||
return dp.isTest(propMask);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return unified property number
|
||||
*/
|
||||
public boolean isDefined(int propMask) {
|
||||
int enum = propMask >> 8;
|
||||
propMask &= 0xFF;
|
||||
switch (enum) {
|
||||
case CATEGORY>>8: return propMask != UNUSED_CATEGORY && propMask < LIMIT_CATEGORY;
|
||||
case COMBINING_CLASS>>8: return true;
|
||||
// ucd.isCombiningClassUsed((byte)propMask)
|
||||
// || !ucd.getCombiningID_fromIndex ((byte)propMask, SHORT).startsWith("Fixed");
|
||||
case BIDI_CLASS>>8: return propMask != BIDI_UNUSED && propMask < LIMIT_BIDI_CLASS;
|
||||
case DECOMPOSITION_TYPE>>8: return propMask < LIMIT_DECOMPOSITION_TYPE;
|
||||
case NUMERIC_TYPE>>8: return propMask < LIMIT_NUMERIC_TYPE;
|
||||
case EAST_ASIAN_WIDTH>>8: return propMask < LIMIT_EAST_ASIAN_WIDTH;
|
||||
case LINE_BREAK>>8: return propMask < LIMIT_LINE_BREAK;
|
||||
case JOINING_TYPE>>8: return propMask < LIMIT_JOINING_TYPE;
|
||||
case JOINING_GROUP>>8: return propMask < LIMIT_JOINING_GROUP;
|
||||
case BINARY_PROPERTIES>>8: return propMask < LIMIT_BINARY_PROPERTIES;
|
||||
case SCRIPT>>8: return propMask != UNUSED_SCRIPT && propMask < LIMIT_SCRIPT;
|
||||
case AGE>>8: return propMask < LIMIT_AGE;
|
||||
case DERIVED>>8: return dp.isDefined(propMask);
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean get(int cp, int propMask) {
|
||||
int enum = propMask >> 8;
|
||||
propMask &= 0xFF;
|
||||
switch (enum) {
|
||||
case CATEGORY>>8: if (propMask >= LIMIT_CATEGORY) break;
|
||||
return ucd.getCategory(cp) == propMask;
|
||||
case COMBINING_CLASS>>8: if (propMask >= LIMIT_COMBINING_CLASS) break;
|
||||
return ucd.getCombiningClass(cp) == propMask;
|
||||
case BIDI_CLASS>>8: if (propMask >= LIMIT_BIDI_CLASS) break;
|
||||
return ucd.getBidiClass(cp) == propMask;
|
||||
case DECOMPOSITION_TYPE>>8: if (propMask >= LIMIT_DECOMPOSITION_TYPE) break;
|
||||
return ucd.getDecompositionType(cp) == propMask;
|
||||
case NUMERIC_TYPE>>8: if (propMask >= LIMIT_NUMERIC_TYPE) break;
|
||||
return ucd.getNumericType(cp) == propMask;
|
||||
case EAST_ASIAN_WIDTH>>8: if (propMask >= LIMIT_EAST_ASIAN_WIDTH) break;
|
||||
return ucd.getEastAsianWidth(cp) == propMask;
|
||||
case LINE_BREAK>>8: if (propMask >= LIMIT_LINE_BREAK) break;
|
||||
return ucd.getLineBreak(cp) == propMask;
|
||||
case JOINING_TYPE>>8: if (propMask >= LIMIT_JOINING_TYPE) break;
|
||||
return ucd.getJoiningType(cp) == propMask;
|
||||
case JOINING_GROUP>>8: if (propMask >= LIMIT_JOINING_GROUP) break;
|
||||
return ucd.getJoiningGroup(cp) == propMask;
|
||||
case BINARY_PROPERTIES>>8: if (propMask >= LIMIT_BINARY_PROPERTIES) break;
|
||||
return ucd.getBinaryProperty(cp, propMask);
|
||||
case SCRIPT>>8: if (propMask >= LIMIT_SCRIPT) break;
|
||||
return ucd.getScript(cp) == propMask;
|
||||
case AGE>>8: if (propMask >= LIMIT_AGE) break;
|
||||
return ucd.getAge(cp) == propMask;
|
||||
case DERIVED>>8: if (!dp.isDefined(propMask)) break;
|
||||
return dp.hasProperty(cp, propMask);
|
||||
}
|
||||
throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
|
||||
}
|
||||
|
||||
public String getID(int unifiedPropMask) {
|
||||
return getID(unifiedPropMask, NORMAL);
|
||||
}
|
||||
/*
|
||||
public static String getID(UCD ucd, int unifiedPropMask) {
|
||||
String longOne = getID(ucd, unifiedPropMask, LONG);
|
||||
String shortOne = getID(ucd, unifiedPropMask, SHORT);
|
||||
if (longOne.equals(shortOne)) return longOne;
|
||||
return shortOne + "(" + longOne + ")";
|
||||
}
|
||||
*/
|
||||
public String getFullID(int unifiedPropMask, byte style) {
|
||||
String pre = "";
|
||||
if ((unifiedPropMask & 0xFF00) != BINARY_PROPERTIES) {
|
||||
String preShort = UCD_Names.ABB_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "=";
|
||||
String preLong = UCD_Names.SHORT_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "=";
|
||||
if (style < LONG) pre = preShort;
|
||||
else if (style == LONG || preShort.equals(preLong)) pre = preLong;
|
||||
else pre = preShort + "(" + preLong + ")";
|
||||
}
|
||||
String shortOne = getID(unifiedPropMask, SHORT);
|
||||
if (shortOne.length() == 0) shortOne = "xx";
|
||||
String longOne = getID(unifiedPropMask, LONG);
|
||||
if (longOne.length() == 0) longOne = "none";
|
||||
|
||||
String post;
|
||||
if (style < LONG) post = shortOne;
|
||||
else if (style == LONG || shortOne.equals(longOne)) post = longOne;
|
||||
else post = shortOne + "(" + longOne + ")";
|
||||
|
||||
if (pre.length() == 0) {
|
||||
pre = post + "=";
|
||||
post = "T";
|
||||
}
|
||||
|
||||
return pre + post;
|
||||
}
|
||||
|
||||
public String getID(int unifiedPropMask, byte style) {
|
||||
int enum = unifiedPropMask >> 8;
|
||||
byte propMask = (byte)unifiedPropMask;
|
||||
switch (enum) {
|
||||
case CATEGORY>>8: if (propMask >= LIMIT_CATEGORY) break;
|
||||
if (style != LONG) return ucd.getCategoryID_fromIndex(propMask);
|
||||
return UCD_Names.LONG_GC[propMask];
|
||||
case COMBINING_CLASS>>8: if (propMask >= LIMIT_COMBINING_CLASS) break;
|
||||
return UCD.getCombiningID_fromIndex((short)(unifiedPropMask & 0xFF), style);
|
||||
case BIDI_CLASS>>8: if (propMask >= LIMIT_BIDI_CLASS) break;
|
||||
if (style != LONG) return ucd.getBidiClassID_fromIndex(propMask);
|
||||
return UCD_Names.LONG_BC[propMask];
|
||||
case DECOMPOSITION_TYPE>>8: if (propMask >= LIMIT_DECOMPOSITION_TYPE) break;
|
||||
if (style != SHORT) return ucd.getDecompositionTypeID_fromIndex(propMask);
|
||||
return UCD_Names.SHORT_DT[propMask];
|
||||
case NUMERIC_TYPE>>8: if (propMask >= LIMIT_NUMERIC_TYPE) break;
|
||||
if (style != SHORT) return ucd.getNumericTypeID_fromIndex(propMask);
|
||||
return UCD_Names.SHORT_NT[propMask];
|
||||
case EAST_ASIAN_WIDTH>>8: if (propMask >= LIMIT_EAST_ASIAN_WIDTH) break;
|
||||
if (style != LONG) return ucd.getEastAsianWidthID_fromIndex(propMask);
|
||||
return UCD_Names.SHORT_EA[propMask];
|
||||
case LINE_BREAK>>8: if (propMask >= LIMIT_LINE_BREAK) break;
|
||||
if (style != LONG) return ucd.getLineBreakID_fromIndex(propMask);
|
||||
return UCD_Names.LONG_LB[propMask];
|
||||
case JOINING_TYPE>>8: if (propMask >= LIMIT_JOINING_TYPE) break;
|
||||
if (style != LONG) return ucd.getJoiningTypeID_fromIndex(propMask);
|
||||
return UCD_Names.LONG_JOINING_TYPE[propMask];
|
||||
case JOINING_GROUP>>8: if (propMask >= LIMIT_JOINING_GROUP) break;
|
||||
return ucd.getJoiningGroupID_fromIndex(propMask);
|
||||
case BINARY_PROPERTIES>>8: if (propMask >= LIMIT_BINARY_PROPERTIES) break;
|
||||
if (style != SHORT) return ucd.getBinaryPropertiesID_fromIndex(propMask);
|
||||
return UCD_Names.SHORT_BP[propMask];
|
||||
case SCRIPT>>8: if (propMask >= LIMIT_SCRIPT) break;
|
||||
if (style != SHORT) return ucd.getScriptID_fromIndex(propMask);
|
||||
return UCD_Names.ABB_SCRIPT[propMask];
|
||||
case AGE>>8: if (propMask >= LIMIT_AGE) break;
|
||||
return ucd.getAgeID_fromIndex(propMask);
|
||||
case DERIVED>>8: if (!dp.isDefined(propMask)) break;
|
||||
return dp.getName(propMask, style);
|
||||
}
|
||||
throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
|
||||
}
|
||||
}
|
Loading…
Add table
Reference in a new issue