ICU-0 security tools

X-SVN-Rev: 18133
This commit is contained in:
Mark Davis 2005-07-02 01:42:51 +00:00
parent b79aaf70aa
commit ed99ae9728
2 changed files with 325 additions and 85 deletions
tools/unicodetools/com/ibm/text/UCD

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $
* $Date: 2005/07/01 22:10:00 $
* $Revision: 1.4 $
* $Date: 2005/07/02 01:42:51 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -87,7 +87,34 @@ public class GenerateConfusables {
um.put(0x34E4, "2.1");
um.put(0x3007, "2.1");
_Non_IICore.removeAll(um.getSet("2.1"));
// add Chinese?
UnicodeSet cjk_nic = new UnicodeSet();
String line = null;
try {
BufferedReader br = BagFormatter.openUTF8Reader(indir, "cjk_nic.txt");
while (true) {
line = Utility.readDataLine(br);
if (line == null) break;
if (line.length() == 0) continue;
String[] pieces = Utility.split(line, ';');
// part 0 is range
String range = pieces[0].trim();
int rangeDivider = range.indexOf("..");
int start, end;
if (rangeDivider < 0) {
start = end = Integer.parseInt(range, 16);
} else {
start = Integer.parseInt(range.substring(0, rangeDivider), 16);
end = Integer.parseInt(range.substring(rangeDivider+2), 16);
}
cjk_nic.add(start, end);
}
br.close();
} catch (Exception e) {
throw (RuntimeException) new RuntimeException("Failure on line " + line).initCause(e);
}
_Non_IICore.removeAll(cjk_nic);
}
return _Non_IICore;
// for (Iterator it = um.getAvailableValues().iterator(); it.hasNext();) {
@ -401,8 +428,8 @@ public class GenerateConfusables {
//reviews.putAll(UNASSIGNED, "");
out.print("\uFEFF");
out.println("# Review List for IDN");
out.println("# $Revision: 1.4 $");
out.println("# $Date: 2005/07/01 22:10:00 $");
out.println("# $Revision: 1.5 $");
out.println("# $Date: 2005/07/02 01:42:51 $");
out.println("");
UnicodeSet fullSet = reviews.getSet("").complement();
@ -457,8 +484,8 @@ public class GenerateConfusables {
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "idnchars.txt");
out.println("# Recommended Identifier Profiles for IDN");
out.println("# $Revision: 1.4 $");
out.println("# $Date: 2005/07/01 22:10:00 $");
out.println("# $Revision: 1.5 $");
out.println("# $Date: 2005/07/02 01:42:51 $");
out.println("");
out.println("# Output Characters");
@ -527,8 +554,8 @@ public class GenerateConfusables {
"xidmodifications.txt");
out.println("# Security Profile for General Identifiers");
out.println("# $Revision: 1.4 $");
out.println("# $Date: 2005/07/01 22:10:00 $");
out.println("# $Revision: 1.5 $");
out.println("# $Date: 2005/07/02 01:42:51 $");
out.println("");
out.println("# Characters restricted");
@ -584,8 +611,8 @@ public class GenerateConfusables {
//someRemovals = removals;
out = BagFormatter.openUTF8Writer(outdir, "draft-restrictions.txt");
out.println("# Characters restricted in domain names");
out.println("# $Revision: 1.4 $");
out.println("# $Date: 2005/07/01 22:10:00 $");
out.println("# $Revision: 1.5 $");
out.println("# $Date: 2005/07/02 01:42:51 $");
out.println("#");
out.println("# This file contains a draft list of characters for use in");
out.println("# UTR #36: Unicode Security Considerations");
@ -763,7 +790,7 @@ public class GenerateConfusables {
/*
* Returns UScript.INVALID_CODE if mixed script, otherwise the script
*/
private static int getSingleScript(String source) {
public static int getSingleScript(String source) {
int lastScript = UScript.INVALID_CODE;
int cp;
for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
@ -1119,8 +1146,8 @@ public class GenerateConfusables {
public void writeSource(String directory, String filename) throws IOException {
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
out.println("# Source File for IDN Confusables");
out.println("# $Revision: 1.4 $");
out.println("# $Date: 2005/07/01 22:10:00 $");
out.println("# $Revision: 1.5 $");
out.println("# $Date: 2005/07/02 01:42:51 $");
out.println("");
dataMixedAnycase.writeSource(out);
out.close();
@ -1130,8 +1157,8 @@ public class GenerateConfusables {
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
out.print('\uFEFF');
out.println("# Recommended confusable mapping for IDN");
out.println("# $Revision: 1.4 $");
out.println("# $Date: 2005/07/01 22:10:00 $");
out.println("# $Revision: 1.5 $");
out.println("# $Date: 2005/07/02 01:42:51 $");
out.println("");
if (appendFile) {
@ -1339,8 +1366,8 @@ public class GenerateConfusables {
UnicodeSet representable = new UnicodeSet();
out.print('\uFEFF');
out.println("# Summary: Recommended confusable mapping for IDN");
out.println("# $Revision: 1.4 $");
out.println("# $Date: 2005/07/01 22:10:00 $");
out.println("# $Revision: 1.5 $");
out.println("# $Date: 2005/07/02 01:42:51 $");
out.println("");
MyEquivalenceClass data = dataMixedAnycase;
Set items = data.getOrderedExplicitItems();
@ -1446,8 +1473,8 @@ public class GenerateConfusables {
PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
out.print('\uFEFF');
out.println("# Summary: Whole-Script Confusables");
out.println("# $Revision: 1.4 $");
out.println("# $Date: 2005/07/01 22:10:00 $");
out.println("# $Revision: 1.5 $");
out.println("# $Date: 2005/07/02 01:42:51 $");
out.println("");
out.println("# Lowercase Only");
out.println("");
@ -1516,7 +1543,7 @@ public class GenerateConfusables {
}
}
static class UnicodeSetToScript {
public static class UnicodeSetToScript {
public int getScript() {
return script;
}
@ -1535,8 +1562,8 @@ public class GenerateConfusables {
private int script;
}
UnicodeSetToScript[][] scriptToUnicodeSetToScript;
UnicodeSet[] fastReject;
UnicodeSetToScript[][] scriptToUnicodeSetToScript = new UnicodeSetToScript[UScript.CODE_LIMIT][];
UnicodeSet[] fastReject = new UnicodeSet[UScript.CODE_LIMIT];
boolean finished = false;
void finish() {
@ -1562,15 +1589,13 @@ public class GenerateConfusables {
}
void write(PrintWriter out) throws IOException {
finish();
for (int j = 0; j < UScript.CODE_LIMIT; ++j) {
if (j == UScript.COMMON || j == UScript.INHERITED) continue;
if (script_representables[j].size() == 0) continue;
for (int k = 0; k < UScript.CODE_LIMIT; ++k) {
if (k == UScript.COMMON || k == UScript.INHERITED) continue;
if (script_representables[k].size() == 0) continue;
if (script_set[j].containsNone(script_representables[k])) continue;
UnicodeSet items = new UnicodeSet(script_set[j]).retainAll(script_representables[k]);
if (scriptToUnicodeSetToScript[j] == null) continue;
for (int q = 0; q < scriptToUnicodeSetToScript[j].length; ++q) {
UnicodeSetToScript uss = scriptToUnicodeSetToScript[j][q];
int k = uss.getScript();
UnicodeSet items = uss.getSet();
String sname = UScript.getShortName(j) + "; " + UScript.getShortName(k) + "; " + label;
String name = UScript.getName(j) + "; " + UScript.getName(k);
out.println("# " + name + ": " + items.toPattern(false));
@ -1581,61 +1606,6 @@ public class GenerateConfusables {
}
}
}
/*
 * Determines the single script of givenString, strips Common/Inherited
 * characters from it, and delegates to the (script, set) overload, which
 * records matching scripts in resultingScripts.
 * Throws IllegalArgumentException when getSingleScript reports INVALID_CODE.
 */
boolean hasWholeScriptConfusable(String givenString, BitSet resultingScripts) {
    final int singleScript = getSingleScript(givenString);
    if (UScript.INVALID_CODE == singleScript) {
        throw new IllegalArgumentException("Not single script string");
    }
    // Build the set of script-specific characters step by step.
    final UnicodeSet scriptSpecific = new UnicodeSet();
    scriptSpecific.addAll(givenString);
    scriptSpecific.removeAll(commonAndInherited);
    return hasWholeScriptConfusable(singleScript, scriptSpecific, resultingScripts);
}
/**
 * Records in resultingScripts every script whose confusable set contains all
 * of givenSet, looking only at the entries registered for givenScript.
 *
 * resultingScripts is always cleared first, so on return it holds only the
 * scripts found by this call.
 *
 * @param givenScript script code used to index fastReject and scriptToUnicodeSetToScript
 * @param givenSet characters to test
 * @param resultingScripts out-parameter receiving the matching script codes
 * @return true if at least one matching script was recorded
 */
private boolean hasWholeScriptConfusable(int givenScript, UnicodeSet givenSet, BitSet resultingScripts) {
    resultingScripts.clear();
    UnicodeSet reject = fastReject[givenScript];
    UnicodeSetToScript[] possibles = scriptToUnicodeSetToScript[givenScript];
    // Both tables are allocated with null entries; a script without data has no confusables.
    if (reject == null || possibles == null) {
        return false;
    }
    if (reject.containsSome(givenSet)) {
        return false;
    }
    for (int i = 0; i < possibles.length; ++i) {
        if (possibles[i].set.containsAll(givenSet)) {
            resultingScripts.set(possibles[i].script);
        }
    }
    // The original returned isEmpty(), i.e. true exactly when nothing was found.
    return !resultingScripts.isEmpty();
}
/*
 * Placeholder for the mixed-script confusable check.
 * Nothing is computed here: the body is a commented-out sketch and the
 * method returns true for every input, regardless of givenString.
 */
boolean hasMixedScriptConfusable(String givenString) {
// --- unfinished sketch, kept for reference ---
// UnicodeSet givenSet = new UnicodeSet()
// .addAll(givenString)
// .removeAll(commonAndInherited);
// BitSet givenScripts = getScriptsIn(givenString);
// for (int i = 0; i < givenScripts.length(); ++i) {
// ;
// }
// UnicodeSet givenSet = new UnicodeSet()
// .addAll(givenString)
// .removeAll(commonAndInherited);
// resultingScripts.clear();
// if (fastReject[givenScript].containsSome(givenSet)) return false;
// UnicodeSetToScript[] possibles = scriptToUnicodeSets[givenScript];
// for (int i = 0; i < possibles.length; ++i) {
// if (possibles[i].set.containsAll(givenSet)) {
// resultingScripts.set(possibles[i].script);
// }
// }
// return resultingScripts.isEmpty();
// Stub result: unconditionally reports a mixed-script confusable.
return true;
}
}

View file

@ -0,0 +1,270 @@
package com.ibm.text.UCD;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.text.utility.Utility;
import com.ibm.icu.lang.UCharacter;
public class TestIdentifiers {
/**
 * Runs the confusable checks over a fixed list of sample strings and prints
 * the results to standard output.
 *
 * Each sample is checked with the "L" checker. When case folding changes the
 * sample, the original is also checked with the "A" checker and the folded
 * form is checked with the "L" checker.
 *
 * @param args unused
 * @throws IOException propagated from constructing the checkers
 */
public static void main(String[] args) throws IOException {
    final String[] samples = { "MOPE", "VOP", "scope", "ibm", "vop", "toys-я-us", "1iνе", "back" };
    final TestIdentifiers checkerL = new TestIdentifiers("L");
    final TestIdentifiers checkerA = new TestIdentifiers("A");
    for (int index = 0; index < samples.length; ++index) {
        final String sample = samples[index];
        System.out.print(sample);
        checkerL.testItem(sample);

        final String caseFolded = UCharacter.foldCase(sample, true);
        if (caseFolded.equals(sample)) {
            continue; // folding changed nothing, no extra checks
        }
        System.out.print("\t");
        checkerA.testItem(sample);
        System.out.print(caseFolded);
        checkerL.testItem(caseFolded);
    }
}
/**
 * Prints this checker's case type followed by the confusable findings for
 * one string: the whole-script confusable scripts (if any), a mixed-script
 * note (if any), or "no confusable" when neither check fires.
 *
 * @param test string to check
 */
void testItem(String test) {
    System.out.print("\t" + caseType + "\t");

    final BitSet confusableScripts = new BitSet();
    final boolean wholeScript = hasWholeScriptConfusable(test, confusableScripts);
    if (wholeScript) {
        System.out.print("whole-script confusables: ");
        // Walk only the set bits, in ascending script-code order.
        for (int code = confusableScripts.nextSetBit(0); code >= 0;
                code = confusableScripts.nextSetBit(code + 1)) {
            System.out.print(UScript.getName(code) + " ");
        }
        System.out.println();
    }

    final boolean mixedScript = hasMixedScriptConfusable(test);
    if (mixedScript) {
        System.out.println("mixed-script confusable");
    }

    if (!wholeScript && !mixedScript) {
        System.out.println("no confusable");
    }
}
// Directory passed to BagFormatter.openUTF8Reader in loadFile.
// NOTE(review): hard-coded absolute Windows path; only works on a machine with this layout.
private static final String indir = "C:\\Unicode-CVS2\\draft\\reports\\tr36\\data\\";
// Characters whose script is Common or Inherited; removed from a string's
// character set before the confusable lookups.
private static UnicodeSet commonAndInherited = new UnicodeSet(
"[[:script=common:][:script=inherited:]]");
// XID_Continue characters plus '-'; getSingleScript skips Common/Inherited
// characters that are in this set.
private static UnicodeSet XIDContinueSet = new UnicodeSet("[:xidcontinue:]")
.add('-');
// Enables extra System.out tracing of the Latin entries in loadFile.
private static final boolean DEBUG = false;
// Tag given to the constructor; used as the loadFile filter and printed by testItem.
private String caseType;
/**
 * Creates a checker for the given case type and immediately loads the
 * matching data through loadFile(caseType).
 *
 * @param caseType type tag stored in this instance and used as the loadFile filter
 * @throws IOException propagated from loadFile
 */
TestIdentifiers(String caseType) throws IOException {
this.caseType = caseType;
loadFile(caseType);
}
/**
 * Mutable pair of a character set and a script code. Both setters return
 * {@code this} so that calls can be chained.
 */
private static class UnicodeSetToScript {
    private UnicodeSet set;
    private int script;

    /** Returns the stored character set. */
    public UnicodeSet getSet() {
        return set;
    }

    /** Returns the stored script code. */
    public int getScript() {
        return script;
    }

    /** Stores the character set and returns this object for chaining. */
    public UnicodeSetToScript setSet(UnicodeSet set) {
        this.set = set;
        return this;
    }

    /** Stores the script code and returns this object for chaining. */
    public UnicodeSetToScript setScript(int script) {
        this.script = script;
        return this;
    }
}
// Indexed by script code. Entry i is filled by loadFile with one (set, script)
// pair per second script listed for script i, or stays null if none were loaded.
UnicodeSetToScript[][] scriptToUnicodeSetToScript = new UnicodeSetToScript[UScript.CODE_LIMIT][];
// Indexed by script code. Entry i is the complement of all characters loaded
// for script i, or null if loadFile found no data for that script.
UnicodeSet[] fastReject = new UnicodeSet[UScript.CODE_LIMIT];
/**
 * Reads "confusablesWholeScript.txt" from indir and fills
 * scriptToUnicodeSetToScript and fastReject.
 *
 * Each data line is split on ';' into four fields: a hex code point or
 * "start..end" range, the name of script1, the name of script2, and a type
 * tag. Lines whose type tag differs from filterType are ignored. The
 * remaining ranges are accumulated per (script1, script2) pair.
 *
 * The reader is closed on every path, including when parsing fails.
 *
 * @param filterType type tag (fourth field) a line must carry to be loaded
 * @throws IOException propagated from opening or closing the reader
 * @throws RuntimeException wrapping any failure while reading or parsing;
 *         the message contains the last line read
 */
void loadFile(String filterType) throws IOException {
    // new T[n][n] already allocates every row, so no per-row setup is needed.
    UnicodeSet[][] script_script_set = new UnicodeSet[UScript.CODE_LIMIT][UScript.CODE_LIMIT];
    BufferedReader br = BagFormatter.openUTF8Reader(indir,
            "confusablesWholeScript.txt");
    String line = null;
    try {
        while (true) {
            line = Utility.readDataLine(br);
            if (line == null) {
                break;
            }
            if (line.length() == 0) {
                continue;
            }
            String[] pieces = Utility.split(line, ';');
            // part 0 is a single code point or a "start..end" range, in hex
            String range = pieces[0].trim();
            int rangeDivider = range.indexOf("..");
            int start, end;
            if (rangeDivider < 0) {
                start = end = Integer.parseInt(range, 16);
            } else {
                start = Integer.parseInt(range.substring(0, rangeDivider), 16);
                end = Integer.parseInt(range.substring(rangeDivider + 2), 16);
            }
            // part 1 is script1
            int script1 = UScript.getCodeFromName(pieces[1].trim());
            // part 2 is script2
            int script2 = UScript.getCodeFromName(pieces[2].trim());
            // part 3 is the type tag; only lines matching the filter are kept
            String type = pieces[3].trim();
            if (!type.equals(filterType)) {
                continue;
            }
            if (script_script_set[script1][script2] == null) {
                script_script_set[script1][script2] = new UnicodeSet();
            }
            script_script_set[script1][script2].add(start, end);
        }
        // Convert the sparse matrix into per-script arrays plus a reject set.
        for (int i = 0; i < script_script_set.length; ++i) {
            UnicodeSet accept = new UnicodeSet();
            List curr = new ArrayList();
            for (int j = 0; j < script_script_set[i].length; ++j) {
                if (script_script_set[i][j] == null) {
                    continue;
                }
                accept.addAll(script_script_set[i][j]);
                curr.add(new UnicodeSetToScript().setScript(j).setSet(
                        script_script_set[i][j]));
                if (DEBUG && i == UScript.LATIN) {
                    System.out.println(UScript.getName(i) + "; "
                            + UScript.getName(j) + "; "
                            + script_script_set[i][j]);
                }
            }
            if (curr.size() == 0) {
                continue; // no data for script i: leave both table entries null
            }
            scriptToUnicodeSetToScript[i] = (UnicodeSetToScript[]) curr
                    .toArray(new UnicodeSetToScript[curr.size()]);
            // Any character outside the union of all sets rules out a match.
            fastReject[i] = accept.complement();
            if (DEBUG && i == UScript.LATIN) {
                System.out.println(UScript.getName(i) + "; "
                        + fastReject[i]);
            }
        }
    } catch (Exception e) {
        throw (RuntimeException) new RuntimeException("Failure on line "
                + line).initCause(e);
    } finally {
        // Previously skipped whenever the block above threw, leaking the reader.
        br.close();
    }
}
/**
 * Checks whether a single-script string has whole-script confusables and
 * records the confusable scripts.
 *
 * The string's script is taken from getSingleScript. Common and Inherited
 * characters are removed before the lookup.
 *
 * @param givenString string to check
 * @param resultingScripts out-parameter; on return it holds exactly the
 *        script codes found by this call (empty when the result is false
 *        because the string is not single-script)
 * @return true if at least one script was recorded; false if the string is
 *         not single-script or nothing matched
 */
boolean hasWholeScriptConfusable(String givenString, BitSet resultingScripts) {
    int givenScript = getSingleScript(givenString);
    if (givenScript == UScript.INVALID_CODE) {
        // Every other path clears the out-parameter; do it here too so a
        // reused BitSet never reports bits left over from an earlier call.
        resultingScripts.clear();
        return false;
    }
    UnicodeSet givenSet = new UnicodeSet().addAll(givenString).removeAll(
            commonAndInherited);
    return hasWholeScriptConfusable(givenScript, givenSet, resultingScripts);
}
/**
 * Looks up the entries registered for givenScript and records every script
 * whose set contains all of givenSet.
 *
 * resultingScripts is cleared on entry. Returns false when no data exists
 * for givenScript or when givenSet has a character in the reject set.
 *
 * @param givenScript script code used to index the lookup tables
 * @param givenSet characters to test
 * @param resultingScripts out-parameter receiving the matching script codes
 * @return true if at least one script was recorded
 */
private boolean hasWholeScriptConfusable(int givenScript,
        UnicodeSet givenSet, BitSet resultingScripts) {
    resultingScripts.clear();

    final UnicodeSet rejecting = fastReject[givenScript];
    if (rejecting == null || rejecting.containsSome(givenSet)) {
        return false;
    }

    final UnicodeSetToScript[] candidates = scriptToUnicodeSetToScript[givenScript];
    boolean found = false;
    for (int idx = 0; idx < candidates.length; ++idx) {
        final UnicodeSetToScript candidate = candidates[idx];
        if (candidate.getSet().containsAll(givenSet)) {
            resultingScripts.set(candidate.getScript());
            found = true;
        }
    }
    return found;
}
/*
 * Splits the script-specific characters of givenString by script and looks
 * for a target script T, present in the string, such that the characters of
 * every other script present have a whole-script confusable in T.
 *
 * Scripts are tried in ascending code order; the answer is decided by the
 * first target that all other scripts map to. A string with characters from
 * only one script yields false.
 */
boolean hasMixedScriptConfusable(String givenString) {
    final UnicodeSet scriptSpecific = new UnicodeSet().addAll(givenString)
            .removeAll(commonAndInherited);
    final UnicodeSet[] perScript = getScripts(scriptSpecific);
    final BitSet reachable = new BitSet();

    for (int target = 0; target < perScript.length; ++target) {
        if (perScript[target] == null) {
            continue;
        }
        boolean sawOther = false;   // at least one other script is present
        boolean allMap = true;      // every other script so far maps to target
        for (int other = 0; allMap && other < perScript.length; ++other) {
            if (other == target || perScript[other] == null) {
                continue;
            }
            sawOther = true;
            allMap = hasWholeScriptConfusable(other, perScript[other], reachable)
                    && reachable.get(target);
        }
        if (allMap) {
            // passed the gauntlet; false only when no other script existed
            return sawOther;
        }
    }
    return false;
}
/**
 * Returns the single script of a string, or UScript.INVALID_CODE if the
 * string is empty or mixes scripts.
 *
 * Common/Inherited characters that are in XIDContinueSet are neutral: they
 * never conflict with another script, wherever they appear. Common/Inherited
 * characters outside that set are treated as script COMMON and do take part
 * in the comparison. If the string holds only neutral characters, the script
 * of the first one is returned.
 *
 * @param source string to examine
 * @return the script code shared by all non-neutral characters, the script
 *         of the first neutral character if there are no others, or
 *         UScript.INVALID_CODE
 */
public static int getSingleScript(String source) {
    int lastScript = UScript.INVALID_CODE;
    // Script of the first neutral character, used only when nothing else is found.
    int neutralScript = UScript.INVALID_CODE;
    int cp;
    for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
        cp = UTF16.charAt(source, i);
        int script = UScript.getScript(cp);
        if (script == UScript.COMMON || script == UScript.INHERITED) {
            if (XIDContinueSet.contains(cp)) {
                // Keep this apart from lastScript: storing it there made a
                // leading digit ("1a") look like a script conflict while a
                // trailing one ("a1") did not.
                if (neutralScript == UScript.INVALID_CODE) {
                    neutralScript = script;
                }
                continue; // neutral identifier character, skip
            }
            script = UScript.COMMON;
        }
        if (lastScript == UScript.INVALID_CODE) {
            lastScript = script;
        } else if (script != lastScript) {
            return UScript.INVALID_CODE;
        }
    }
    return lastScript != UScript.INVALID_CODE ? lastScript : neutralScript;
}
/**
 * Groups the code points of a set by script.
 *
 * @param sourceSet set to partition
 * @return array of length UScript.CODE_LIMIT indexed by script code; each
 *         entry holds the code points of sourceSet with that script, or is
 *         null if there are none
 */
public static UnicodeSet[] getScripts(UnicodeSet sourceSet) {
    final UnicodeSet[] buckets = new UnicodeSet[UScript.CODE_LIMIT];
    final UnicodeSetIterator walker = new UnicodeSetIterator(sourceSet);
    while (walker.next()) {
        final int codePoint = walker.codepoint;
        final int scriptCode = UScript.getScript(codePoint);
        UnicodeSet bucket = buckets[scriptCode];
        if (bucket == null) {
            // First code point seen for this script: create its bucket lazily.
            bucket = new UnicodeSet();
            buckets[scriptCode] = bucket;
        }
        bucket.add(codePoint);
    }
    return buckets;
}
}