mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
linebreak testing
X-SVN-Rev: 9616
This commit is contained in:
parent
ad9daf070c
commit
5a41a000bd
3 changed files with 1810 additions and 0 deletions
1446
tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java
Normal file
1446
tools/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java
Normal file
File diff suppressed because it is too large
Load diff
177
tools/unicodetools/com/ibm/text/UCD/IANANames.java
Normal file
177
tools/unicodetools/com/ibm/text/UCD/IANANames.java
Normal file
|
@ -0,0 +1,177 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/IANANames.java,v $
|
||||
* $Date: 2002/08/08 15:38:16 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
|
||||
|
||||
import java.util.*;
|
||||
import java.text.NumberFormat;
|
||||
import java.io.*;
|
||||
|
||||
public class IANANames implements UCD_Types {
|
||||
private Map aliasToBase = new TreeMap();
|
||||
private Map aliasToComment = new TreeMap();
|
||||
private Map aliasToLine = new TreeMap();
|
||||
|
||||
public static void testSensitivity() throws IOException {
|
||||
IANANames iNames = new IANANames();
|
||||
Map m = new HashMap();
|
||||
Iterator it = iNames.getIterator();
|
||||
UnicodeSet removed = new UnicodeSet();
|
||||
int maxLength = 0;
|
||||
while (it.hasNext()) {
|
||||
String alias = (String) it.next();
|
||||
if (maxLength < alias.length()) maxLength = alias.length();
|
||||
if (alias.length() > 40) System.out.println("Name >40: " + alias);
|
||||
if (alias.indexOf(')') >= 0 || alias.indexOf('(') >= 0) System.out.println("Illegal tag: " + alias);
|
||||
String skeleton = removeNonAlphanumeric(alias, removed);
|
||||
String other = (String) m.get(skeleton);
|
||||
if (other != null) {
|
||||
String base = iNames.getBase(alias);
|
||||
String otherBase = iNames.getBase(other);
|
||||
if (!base.equals(otherBase)) {
|
||||
System.out.println("Collision between: " + alias + " (" + base + ") and "
|
||||
+ other + " (" + otherBase + ")");
|
||||
} else {
|
||||
System.out.println("Alias Variant: " + alias + " and " + other + " (" + base + ")");
|
||||
}
|
||||
} else {
|
||||
m.put(skeleton, alias);
|
||||
}
|
||||
}
|
||||
System.out.println("Max Length: " + maxLength);
|
||||
|
||||
System.out.println("Characters removed: ");
|
||||
UnicodeSetIterator usi = new UnicodeSetIterator(removed);
|
||||
while (usi.next()) {
|
||||
char c = (char) usi.codepoint; // safe, can't be supplementary
|
||||
System.out.println("0x" + usi.codepoint + "\t'" + c + "'\t" + UCharacter.getName(usi.codepoint));
|
||||
}
|
||||
}
|
||||
|
||||
public IANANames() throws IOException {
|
||||
BufferedReader in = Utility.openReadFile(BASE_DIR + "IANA\\character-sets.txt", false);
|
||||
try {
|
||||
boolean atStart = true;
|
||||
String lastName = "";
|
||||
int counter = 0;
|
||||
while (true) {
|
||||
String line = in.readLine();
|
||||
if (line == null) break;
|
||||
counter++;
|
||||
if (atStart) {
|
||||
if (line.startsWith("-------------")) atStart = false;
|
||||
continue;
|
||||
}
|
||||
if (line.trim().length() == 0) continue;
|
||||
|
||||
if (line.startsWith("Name:") || line.startsWith("Alias:")) {
|
||||
lastName = add(line, lastName, counter);
|
||||
} else if (line.startsWith("Source:") || line.startsWith("MIBenum:")
|
||||
|| line.startsWith(" ")) {
|
||||
continue;
|
||||
} else if (line.equals("REFERENCES")) {
|
||||
break;
|
||||
} else {
|
||||
System.out.println("Unknown Line: " + line);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
in.close();
|
||||
}
|
||||
}
|
||||
|
||||
private String add(String line, String baseName, int counter) {
|
||||
// extract the alias, doing a little validity check
|
||||
int pos = line.indexOf(": ");
|
||||
if (pos < 0) throw new IllegalArgumentException("Bad line: " + counter + " '" + line + "'");
|
||||
String alias = line.substring(pos+2).trim();
|
||||
|
||||
// get comment
|
||||
String comment = null;
|
||||
pos = alias.indexOf(' ');
|
||||
if (pos >= 0) {
|
||||
comment = alias.substring(pos).trim();
|
||||
alias = alias.substring(0, pos);
|
||||
}
|
||||
|
||||
// reset the baseName if we are a name
|
||||
if (line.startsWith("Name:")) {
|
||||
baseName = alias;
|
||||
}
|
||||
|
||||
// store
|
||||
if (!alias.equals("None")) {
|
||||
if (false) {
|
||||
if (baseName.equals(alias)) System.out.println();
|
||||
System.out.println("Adding " + alias + "\t=> " + baseName + (comment != null ? "\t(" + comment + ")" : ""));
|
||||
}
|
||||
// check if it is stored already
|
||||
String oldbaseName = (String) aliasToBase.get(alias);
|
||||
if (oldbaseName != null) {
|
||||
System.out.println("Duplicate alias (" + alias + ", " + oldbaseName + ", " + baseName + "): "
|
||||
+ counter + " '" + line + "'");
|
||||
}
|
||||
aliasToBase.put(alias, baseName);
|
||||
if (comment != null) aliasToComment.put(alias, comment);
|
||||
aliasToLine.put(alias, comment);
|
||||
}
|
||||
return baseName;
|
||||
}
|
||||
|
||||
public Iterator getIterator() {
|
||||
return aliasToBase.keySet().iterator();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the name for this alias, or "" if there is none
|
||||
*/
|
||||
public String getBase(String alias) {
|
||||
return (String) aliasToBase.get(alias);
|
||||
}
|
||||
|
||||
public static String removeNonAlphanumeric(String s, UnicodeSet removed) {
|
||||
s = s.toUpperCase(Locale.ENGLISH); // can't have Turkish!
|
||||
StringBuffer result = new StringBuffer();
|
||||
boolean removedZero = false;
|
||||
for (int i = 0; i < s.length(); ++i) {
|
||||
char c = s.charAt(i);
|
||||
if (c == '0') {
|
||||
char cLast = result.length() > 0 ? result.charAt(result.length() - 1) : '0';
|
||||
if ('0' <= cLast && cLast <= '9') {
|
||||
result.append(c);
|
||||
} else {
|
||||
if (!removed.contains(c)) {
|
||||
System.out.println("Removed '" + c + "' from " + s + " => " + result);
|
||||
removed.add(c);
|
||||
}
|
||||
removedZero = true;
|
||||
}
|
||||
} else if (('A' <= c && c <= 'Z') || ('0' <= c && c <= '9')) {
|
||||
result.append(c);
|
||||
} else {
|
||||
if (!removed.contains(c)) {
|
||||
System.out.println("Removed '" + c + "' from " + s + " => " + result);
|
||||
removed.add(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
//if (removedZero) System.out.println("Removed 0 from " + s + " => " + result);
|
||||
return result.toString();
|
||||
}
|
||||
}
|
187
tools/unicodetools/com/ibm/text/UCD/UnifiedProperty.java
Normal file
187
tools/unicodetools/com/ibm/text/UCD/UnifiedProperty.java
Normal file
|
@ -0,0 +1,187 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedProperty.java,v $
|
||||
* $Date: 2002/08/08 15:38:16 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
public final class UnifiedProperty extends UnicodeProperty {
|
||||
int majorProp;
|
||||
// DerivedProperty dp;
|
||||
|
||||
public static UnicodeProperty make(int propMask) {
|
||||
return make(propMask, Default.ucd);
|
||||
}
|
||||
|
||||
public static UnicodeProperty make(int propMask, UCD ucd) {
|
||||
if ((propMask & 0xFF00) == (BINARY_PROPERTIES & 0xFF00)) {
|
||||
return UnifiedBinaryProperty.make(propMask, ucd);
|
||||
}
|
||||
if ((propMask & 0xFF00) == DERIVED) {
|
||||
return DerivedProperty.make(propMask & 0xFF, ucd);
|
||||
}
|
||||
if (!isDefined(propMask, ucd)) return null;
|
||||
return getCached(propMask, ucd);
|
||||
}
|
||||
|
||||
public static UnicodeProperty make(String propID, UCD ucd) {
|
||||
return make(getPropmask(propID, ucd), ucd);
|
||||
}
|
||||
|
||||
public static UnicodeSet getSet(int propMask, UCD ucd) {
|
||||
UnicodeProperty up = make(propMask, ucd);
|
||||
return up.getSet();
|
||||
}
|
||||
|
||||
public static UnicodeSet getSet(String propID, UCD ucd) {
|
||||
return getSet(getPropmask(propID, ucd), ucd);
|
||||
}
|
||||
|
||||
private static Map propNameCache = null;
|
||||
|
||||
public static int getPropmask(String propID, UCD ucd) {
|
||||
|
||||
// cache the names
|
||||
if (propNameCache == null) {
|
||||
System.out.println("Caching Property Names");
|
||||
propNameCache = new HashMap();
|
||||
|
||||
for (int i = 0; i < LIMIT_ENUM; ++i) {
|
||||
UnicodeProperty up = UnifiedProperty.make(i, ucd);
|
||||
if (up == null) continue;
|
||||
if (!up.isStandard()) continue;
|
||||
if (up.getValueType() < BINARY) continue;
|
||||
String shortName = Utility.getSkeleton(up.getProperty(SHORT));
|
||||
String longName = Utility.getSkeleton(up.getProperty(LONG));
|
||||
Integer result = new Integer(i);
|
||||
propNameCache.put(longName, result);
|
||||
propNameCache.put(shortName, result);
|
||||
}
|
||||
System.out.println("Done Caching");
|
||||
}
|
||||
|
||||
propID = Utility.getSkeleton(propID);
|
||||
Integer indexObj = (Integer) propNameCache.get(propID);
|
||||
if (indexObj == null) {
|
||||
throw new IllegalArgumentException("No property found for " + propID);
|
||||
}
|
||||
return indexObj.intValue();
|
||||
}
|
||||
|
||||
static Map cache = new HashMap();
|
||||
static UCD lastUCD = null;
|
||||
static int lastPropMask = -1;
|
||||
static UnifiedProperty lastValue = null;
|
||||
static Clump probeClump = new Clump();
|
||||
|
||||
static class Clump {
|
||||
int prop;
|
||||
UCD ucd;
|
||||
public boolean equals(Object other) {
|
||||
Clump that = (Clump) other;
|
||||
return (that.prop != prop || !ucd.equals(that));
|
||||
}
|
||||
}
|
||||
|
||||
private static UnifiedProperty getCached(int propMask, UCD ucd) {
|
||||
System.out.println(ucd);
|
||||
if (ucd.equals(lastUCD) && propMask == lastPropMask) return lastValue;
|
||||
probeClump.prop = propMask;
|
||||
probeClump.ucd = ucd;
|
||||
UnifiedProperty dp = (UnifiedProperty) cache.get(probeClump);
|
||||
if (dp == null) {
|
||||
dp = new UnifiedProperty(propMask, ucd);
|
||||
cache.put(probeClump, dp);
|
||||
probeClump = new Clump();
|
||||
}
|
||||
lastUCD = ucd;
|
||||
lastValue = dp;
|
||||
lastPropMask = propMask;
|
||||
return dp;
|
||||
}
|
||||
|
||||
/////////////////////////////////
|
||||
|
||||
private UnifiedProperty(int propMask, UCD ucdin) {
|
||||
ucd = ucdin;
|
||||
majorProp = propMask >> 8;
|
||||
|
||||
//System.out.println("A: " + getValueType());
|
||||
if (majorProp <= (JOINING_GROUP>>8) || majorProp == SCRIPT>>8) setValueType(FLATTENED_BINARY);
|
||||
//System.out.println("B: " + getValueType());
|
||||
|
||||
header = UCD_Names.UNIFIED_PROPERTY_HEADERS[majorProp];
|
||||
name = UCD_Names.UNIFIED_PROPERTIES[majorProp];
|
||||
shortName = UCD_Names.SHORT_UNIFIED_PROPERTIES[majorProp];
|
||||
}
|
||||
|
||||
static private boolean isDefined(int propMask, UCD ucd) {
|
||||
int majorProp = propMask >> 8;
|
||||
switch (majorProp) {
|
||||
case CATEGORY>>8:
|
||||
case COMBINING_CLASS>>8:
|
||||
case BIDI_CLASS>>8:
|
||||
case DECOMPOSITION_TYPE>>8:
|
||||
case NUMERIC_TYPE>>8:
|
||||
case EAST_ASIAN_WIDTH>>8:
|
||||
case LINE_BREAK>>8:
|
||||
case JOINING_TYPE>>8:
|
||||
case JOINING_GROUP>>8:
|
||||
case SCRIPT>>8:
|
||||
case AGE>>8:
|
||||
return true;
|
||||
/*
|
||||
case DERIVED>>8:
|
||||
UnicodeProperty up = DerivedProperty.make(propValue, ucd);
|
||||
if (up == null) break;
|
||||
return up.hasValue(cp);
|
||||
*/
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean hasValue(int cp) {
|
||||
throw new ChainException("Can't call 'hasValue' on non-binary property {0}", new Object[]{
|
||||
new Integer(majorProp)});
|
||||
}
|
||||
|
||||
public String getFullName(byte style) {
|
||||
String pre = "";
|
||||
String preShort = getProperty(SHORT);
|
||||
String preLong = getProperty(LONG);
|
||||
if (style < LONG) pre = preShort;
|
||||
else if (style == LONG || preShort.equals(preLong)) pre = preLong;
|
||||
else pre = preShort + "(" + preLong + ")";
|
||||
return pre;
|
||||
}
|
||||
|
||||
public String getValue(int cp, byte style) {
|
||||
switch (majorProp) {
|
||||
case CATEGORY>>8: return ucd.getCategoryID_fromIndex(ucd.getCategory(cp), style);
|
||||
case COMBINING_CLASS>>8: return ucd.getCombiningClassID_fromIndex(ucd.getCombiningClass(cp), style);
|
||||
case BIDI_CLASS>>8: return ucd.getBidiClassID_fromIndex(ucd.getBidiClass(cp), style);
|
||||
case DECOMPOSITION_TYPE>>8: return ucd.getDecompositionTypeID_fromIndex(ucd.getDecompositionType(cp), style);
|
||||
case NUMERIC_TYPE>>8: return ucd.getNumericTypeID_fromIndex(ucd.getNumericType(cp), style);
|
||||
case EAST_ASIAN_WIDTH>>8: return ucd.getEastAsianWidthID_fromIndex(ucd.getEastAsianWidth(cp));
|
||||
case LINE_BREAK>>8: return ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp));
|
||||
case JOINING_TYPE>>8: return ucd.getJoiningTypeID_fromIndex(ucd.getJoiningType(cp));
|
||||
case JOINING_GROUP>>8: return ucd.getJoiningGroupID_fromIndex(ucd.getJoiningGroup(cp));
|
||||
case SCRIPT>>8: return ucd.getScriptID_fromIndex(ucd.getScript(cp));
|
||||
case AGE>>8: return ucd.getAgeID_fromIndex(ucd.getAge(cp));
|
||||
default: throw new IllegalArgumentException("Internal Error");
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Add table
Reference in a new issue