ICU-5222 fixes for UnicodeTools (unconnected with rest of ICU4J)

X-SVN-Rev: 20400
This commit is contained in:
Mark Davis 2006-09-24 23:32:45 +00:00
parent fa66eb7a07
commit 690f5c528c
9 changed files with 765 additions and 199 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $
* $Date: 2006/04/05 22:12:44 $
* $Revision: 1.14 $
* $Date: 2006/09/24 23:32:44 $
* $Revision: 1.15 $
*
*******************************************************************************
*/
@ -16,14 +16,17 @@ package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import org.unicode.cldr.util.Segmenter;
import com.ibm.text.utility.*;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.UnicodeProperty;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
abstract public class GenerateBreakTest implements UCD_Types {
static boolean DEBUG = false;
static boolean DEBUG = true;
static final boolean SHOW_TYPE = false;
UCD ucd;
Normalizer nfd;
@ -122,7 +125,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
}
// quick & dirty routine
String insertEverywhere(String source, String insertion, GenerateBreakTest breaker) {
static String insertEverywhere(String source, String insertion, GenerateBreakTest breaker) {
String result = insertion;
for (int i = 0; i < source.length(); ++i) {
result += source.charAt(i);
@ -291,6 +294,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
private String[] ruleList = new String[100];
private int ruleListCount = 0;
protected boolean collectingRules = false;
protected boolean needsFullBreakSample = true;
public void setRule(String rule) {
if (collectingRules) {
@ -330,6 +334,12 @@ abstract public class GenerateBreakTest implements UCD_Types {
out.println("<h2>" + fileName + " Break Chart</h2>");
out.println("<p><b>Unicode Version:</b> " + ucd.getVersion() + "</p>");
out.println("<p><b>Date:</b> " + Default.getDate() + "</p>");
out.println("<p>This page illustrates the application of the boundary specifications. " +
"The first chart shows where breaks would appear between different sample characters or strings. " +
"The sample characters are chosen mechanically to represent the different properties used by the specification. " +
"Where properties used in the rules have 'overlaps', the samples are given 'composed' names. " +
"For example, SentenceBreak uses GCLF_Sep: Sep is the SentenceBreak property, but it overlaps with the GraphemeClusterBreak property LF." +
"</p>");
generateTable(out);
@ -485,8 +495,8 @@ abstract public class GenerateBreakTest implements UCD_Types {
result.append(ucd.getCodeAndName(cp));
result.append(", gc=" + ucd.getCategoryID_fromIndex(ucd.getCategory(cp),SHORT));
result.append(", sc=" + ucd.getScriptID_fromIndex(ucd.getScript(cp),SHORT));
result.append(", lb=" + ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp))
+ "=" + ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp), LONG));
//result.append(", lb=" + ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp))
// + "=" + ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp), LONG));
}
return result.toString();
}
@ -560,19 +570,41 @@ abstract public class GenerateBreakTest implements UCD_Types {
}
// gather the data for the rules
if (needsFullBreakSample ) {
collectingRules = true;
isBreak(fullBreakSample(), 1);
collectingRules = false;
}
out.println("<h3>Rules</h3>");
out.println("<ul>");
out.println("<p>Due to the way they have been mechanically processed for generation, " +
"the following rules do not match the UAX rules precisely. " +
"In particular:</p>"+
"<ol>" +
"<li>The rules are cast into a more regex-style.</li>"+
"<li>The rules \"sot ÷\", \"÷ eot\", and \"÷ Any\" are added mechanically, and have artificial numbers.</li>"+
"<li>The rules are given decimal numbers, so rules such as 11a are given a number using tenths, such as 11.1.</li>"+
"<li>Where a rule has multiple parts (lines), each one is numbered using hundredths, such as 21.01) × BA, 21.02) × HY,...</li>"+
"<li>Any 'treat as' or 'ignore' rules are handled as discussed in Unicode Standard Annex #29, and thus" +
"reflected in a transformation of the rules not visible here.</li>" +
"</ol>" +
"<p>For the original rules, see the UAX.</p>"
);
out.println("<ul style='list-style-type: none'>");
for (int ii = 0; ii < ruleListCount; ++ii) {
out.println("<li>" + ruleList[ii] + "</li>");
out.println("<li>" + ruleList[ii].replaceAll("[$]","") + "</li>");
}
out.println("</ul>");
if (extraSingleSamples.length > 0) {
out.println("<h3>Sample Strings</h3>");
out.println("<p>" +
"The following samples illustrate the application of the rules. " +
"The blue lines indicate possible break points. " +
"If your browser supports titles, then positioning the mouse over each character will show its name, " +
"white positioning between characters shows the rule number of the rule responsible for the break-status." +
"</p>");
out.println("<ol>");
for (int ii = 0; ii < extraSingleSamples.length; ++ii) {
out.println("<li><font size='5'>");
@ -631,6 +663,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
if (comments && !html) string.append(comment);
out.println(string);
if (DEBUG) System.out.println("*" + string);
}
public void findSamples() {
@ -642,7 +675,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
BitSet bitset = new BitSet();
Map list = new TreeMap();
for (int i = 1; i <= 0x10FFFF; ++i) {
for (int i = 1; i <= 0xFFFF; ++i) {
if (!ucd.isAllocated(i)) continue;
if (0xD800 <= i && i <= 0xDFFF) continue;
if (DEBUG && i == 0x1100) {
@ -657,6 +690,9 @@ abstract public class GenerateBreakTest implements UCD_Types {
}
int combined = (mapType(lb) << 7) + mapType(lb2);
if (combined < 0) {
throw new IllegalArgumentException("should never happen");
}
if (!bitset.get(combined)) {
bitset.set(combined);
list.put(new Integer(combined), UTF16.valueOf(i));
@ -777,10 +813,142 @@ abstract public class GenerateBreakTest implements UCD_Types {
//==============================================
static class XGenerateBreakTest extends GenerateBreakTest {
Segmenter seg;
String sample;
{
needsFullBreakSample = false;
}
/**
 * Creates a break-test generator whose break decisions come from a rule-based
 * {@link Segmenter} instead of hand-written logic.
 *
 * @param ucd                the UCD instance handed to the superclass
 * @param segBuilder         builder that yields both the segmenter and its rule list
 * @param sample             string later returned by {@code fullBreakSample()}
 * @param filename           stored in {@code fileName}
 * @param extraSamples       stored in {@code extraSamples}
 * @param extraSingleSamples stored in {@code extraSingleSamples}
 */
public XGenerateBreakTest(UCD ucd, Segmenter.Builder segBuilder, String sample, String filename, String[] extraSamples, String[] extraSingleSamples) {
    super(ucd);
    this.seg = segBuilder.make();
    this.sample = sample;

    // Feed every rule from the builder through setRule() while collection is
    // switched on, then switch it off again.
    Iterator ruleIterator = segBuilder.getRules().iterator();
    collectingRules = true;
    while (ruleIterator.hasNext()) {
        setRule((String) ruleIterator.next());
    }
    collectingRules = false;

    // Start with a catch-all label over the whole code point range, then add
    // one entry per label the segmenter reports samples for.
    map.add("Other", new UnicodeSet(0, 0x10FFFF));
    UnicodeMap segmenterSamples = seg.getSamples();
    Iterator labelIterator = segmenterSamples.getAvailableValues().iterator();
    while (labelIterator.hasNext()) {
        String label = (String) labelIterator.next();
        map.add(label, segmenterSamples.getSet(label), true, false);
    }

    this.fileName = filename;
    sampleMap = map;
    this.extraSamples = extraSamples;
    this.extraSingleSamples = extraSingleSamples;
}
static class GenerateGraphemeBreakTest extends GenerateBreakTest {
/**
 * Asks the segmenter whether there is a break at {@code offset} in {@code source}
 * and passes the segmenter's break rule (as a string) to {@code setRule}.
 *
 * @param source text being segmented
 * @param offset position in {@code source} to test
 * @return the value returned by {@code seg.breaksAt(source, offset)}
 */
public boolean isBreak(String source, int offset) {
    final boolean breaksHere = seg.breaksAt(source, offset);
    // Queried after breaksAt(); presumably reports the rule that decided the
    // call just made -- TODO confirm against Segmenter.
    setRule(String.valueOf(seg.getBreakRule()));
    return breaksHere;
}
GenerateGraphemeBreakTest(UCD ucd) {
/**
 * Returns the sample string that was supplied to the constructor.
 *
 * @return the stored {@code sample} field
 */
public String fullBreakSample() {
return sample;
}
/**
 * Returns the label that {@code map} assigns to a code point.
 *
 * @param cp code point to look up
 * @return the result of {@code map.getLabel(cp)}
 */
public String getTypeID(int cp) {
return map.getLabel(cp);
}
/**
 * Returns the index that {@code map} assigns to a code point, narrowed to a byte.
 *
 * @param cp code point to look up
 * @return {@code map.getIndex(cp)} cast to {@code byte}
 */
public byte getType(int cp) {
// NOTE(review): the cast silently truncates; assumes the map holds fewer than 128 entries -- confirm.
return (byte) map.getIndex(cp);
}
}
/**
 * Break-test generator that uses the rule-based Segmenter built for
 * "GraphemeClusterBreak".
 */
static class GenerateGraphemeBreakTest extends XGenerateBreakTest {
/**
 * Builds the segmenter from the property source for {@code ucd}'s version and
 * registers "aa" as the sample string, "Grapheme" as the file name, and no
 * extra samples of either kind.
 *
 * @param ucd the UCD instance supplying the Unicode version
 */
public GenerateGraphemeBreakTest(UCD ucd) {
super(ucd, Segmenter.make(ToolUnicodePropertySource.make(ucd.getVersion()),"GraphemeClusterBreak"), "aa", "Grapheme",
new String[]{}, new String[]{});
}
}
/**
 * Break-test generator that uses the rule-based Segmenter built for "LineBreak".
 */
static class GenerateLineBreakTest extends XGenerateBreakTest {

    /**
     * Builds the segmenter from the property source for {@code ucd}'s version and
     * registers "aa" as the sample string, "Line" as the file name, no extra
     * samples, and a fixed list of single sample strings.
     *
     * @param ucd the UCD instance supplying the Unicode version
     */
    public GenerateLineBreakTest(UCD ucd) {
        super(ucd,
                Segmenter.make(ToolUnicodePropertySource.make(ucd.getVersion()), "LineBreak"),
                "aa",
                "Line",
                new String[0],
                singleSamples());
    }

    /** Returns a fresh copy of the fixed single-sample strings used for the line-break chart. */
    private static String[] singleSamples() {
        return new String[] {
            "can't",
            "can\u2019t",
            "ab\u00ADby",
            "-3",
            "e.g.",
            "\u4e00.\u4e00.",
            "a b",
            "a \u200bb",
            "a \u0308b",
            "1\u0308b(a)-(b)",
        };
    }
}
/**
 * Break-test generator that uses the rule-based Segmenter built for "SentenceBreak".
 */
static class GenerateSentenceBreakTest extends XGenerateBreakTest {

    /**
     * Builds the segmenter from the property source for {@code ucd}'s version and
     * registers "aa" as the sample string, "Sentence" as the file name, no extra
     * samples, and the strings from {@link #getExtraSamples()} as single samples.
     *
     * @param ucd the UCD instance supplying the Unicode version
     */
    public GenerateSentenceBreakTest(UCD ucd) {
        super(ucd,
                Segmenter.make(ToolUnicodePropertySource.make(ucd.getVersion()), "SentenceBreak"),
                "aa",
                "Sentence",
                new String[0],
                getExtraSamples());
    }

    /**
     * Returns the fixed sentence samples followed by a second copy of each one
     * that has been passed through {@code insertEverywhere} with U+2060.
     *
     * @return an array twice as long as the base list: originals first, then the
     *         U+2060 variants in the same order
     */
    static String[] getExtraSamples() {
        GenerateBreakTest grapheme = new GenerateGraphemeBreakTest(Default.ucd());
        final String[] baseSamples = {
            "(\"Go.\") (He did.)",
            "(\u201CGo?\u201D) (He did.)",
            "U.S.A\u0300. is",
            "U.S.A\u0300? He",
            "U.S.A\u0300.",
            "3.4",
            "c.d",
            "etc.)\u2019 \u2018(the",
            "etc.)\u2019 \u2018(The",
            "the resp. leaders are",
            "\u5B57.\u5B57",
            "etc.\u5B83",
            "etc.\u3002",
            "\u5B57\u3002\u5B83",
        };
        final int baseCount = baseSamples.length;
        String[] allSamples = new String[2 * baseCount];
        for (int index = 0; index < baseCount; ++index) {
            // First half: the sample as written; second half: its U+2060 variant.
            allSamples[index] = baseSamples[index];
            allSamples[baseCount + index] = insertEverywhere(baseSamples[index], "\u2060", grapheme);
        }
        return allSamples;
    }
}
/**
 * Break-test generator that uses the rule-based Segmenter built for "WordBreak".
 */
static class GenerateWordBreakTest extends XGenerateBreakTest {

    /**
     * Builds the segmenter from the property source for {@code ucd}'s version and
     * registers "aa" as the sample string, "Word" as the file name, a fixed list
     * of extra samples, and the strings from {@link #getExtraSamples()} as single
     * samples.
     *
     * @param ucd the UCD instance supplying the Unicode version
     */
    public GenerateWordBreakTest(UCD ucd) {
        super(ucd,
                Segmenter.make(ToolUnicodePropertySource.make(ucd.getVersion()), "WordBreak"),
                "aa",
                "Word",
                new String[] {
                    /*"\uFF70", "\uFF65", "\u30FD", */ "a\u2060", "a:", "a'", "a'\u2060", "a,", "1:", "1'", "1,", "1.\u2060"
                },
                getExtraSamples());
    }

    /**
     * Returns the fixed word samples followed by a second copy of each one that
     * has been passed through {@code insertEverywhere} with U+2060.
     *
     * @return an array twice as long as the base list: originals first, then the
     *         U+2060 variants in the same order
     */
    static String[] getExtraSamples() {
        GenerateBreakTest grapheme = new GenerateGraphemeBreakTest(Default.ucd());
        final String[] baseSamples = {"can't", "can\u2019t", "ab\u00ADby", "a$-34,567.14%b", "3a" };
        final int baseCount = baseSamples.length;
        String[] allSamples = new String[2 * baseCount];
        for (int index = 0; index < baseCount; ++index) {
            // First half: the sample as written; second half: its U+2060 variant.
            allSamples[index] = baseSamples[index];
            allSamples[baseCount + index] = insertEverywhere(baseSamples[index], "\u2060", grapheme);
        }
        return allSamples;
    }
}
static class OLDGenerateGraphemeBreakTest extends GenerateBreakTest {
OLDGenerateGraphemeBreakTest(UCD ucd) {
super(ucd);
fileName = "Grapheme";
sampleMap = map;
@ -866,13 +1034,13 @@ abstract public class GenerateBreakTest implements UCD_Types {
//==============================================
static class GenerateWordBreakTest extends GenerateBreakTest {
static class XGenerateWordBreakTest extends GenerateBreakTest {
GenerateGraphemeBreakTest grapheme;
MyBreakIterator breaker;
Context context = new Context();
GenerateWordBreakTest(UCD ucd) {
XGenerateWordBreakTest(UCD ucd) {
super(ucd);
grapheme = new GenerateGraphemeBreakTest(ucd);
breaker = new MyBreakIterator(grapheme);
@ -1017,13 +1185,13 @@ abstract public class GenerateBreakTest implements UCD_Types {
// ========================================
static class GenerateLineBreakTest extends GenerateBreakTest {
static class XGenerateLineBreakTest extends GenerateBreakTest {
GenerateGraphemeBreakTest grapheme;
MyBreakIterator breaker;
Context context = new Context();
GenerateLineBreakTest(UCD ucd) {
XGenerateLineBreakTest(UCD ucd) {
super(ucd);
grapheme = new GenerateGraphemeBreakTest(ucd);
breaker = new MyBreakIterator(grapheme);
@ -1505,12 +1673,12 @@ abstract public class GenerateBreakTest implements UCD_Types {
//==============================================
static class GenerateSentenceBreakTest extends GenerateBreakTest {
static class XGenerateSentenceBreakTest extends GenerateBreakTest {
GenerateGraphemeBreakTest grapheme;
MyBreakIterator breaker;
GenerateSentenceBreakTest(UCD ucd) {
XGenerateSentenceBreakTest(UCD ucd) {
super(ucd);
grapheme = new GenerateGraphemeBreakTest(ucd);
breaker = new MyBreakIterator(grapheme);

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $
* $Date: 2006/06/09 21:21:20 $
* $Revision: 1.11 $
* $Date: 2006/09/24 23:32:44 $
* $Revision: 1.12 $
*
*******************************************************************************
*/
@ -19,6 +19,7 @@ import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
@ -37,16 +38,21 @@ import com.ibm.icu.dev.test.util.UnicodeProperty;
import com.ibm.icu.dev.test.util.XEquivalenceClass;
import com.ibm.icu.impl.CollectionUtilities;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.ULocale;
import com.ibm.text.utility.Utility;
public class GenerateConfusables {
public static String version = "2.0";
public static boolean EXCLUDE_CONFUSABLE_COMPAT = true;
public static void main(String[] args) throws IOException {
quickTest();
Set arg2 = new HashSet(Arrays.asList(args));
try {
if (arg2.contains("-b")) generateIDN();
@ -59,6 +65,19 @@ public class GenerateConfusables {
System.out.println("Done");
}
}
/**
 * Ad-hoc smoke check: exercises getSingleScript, the betterTargetIsLess
 * comparator, and MyEquivalenceClass on a few hard-coded strings. None of the
 * computed values are checked or printed; they exist to be inspected in a debugger.
 */
private static void quickTest() {
    // Same two characters (U+0430, U+0061) in both orders.
    int singleScript = getSingleScript("\u0430\u0061");
    singleScript = getSingleScript("\u0061\u0430"); //0323 ; 093C

    String first = "\u0323";
    String second = "\u093C";
    int comparison = betterTargetIsLess.compare(first, second); // ("\u0045", "\u13AC");

    // Put the pair into a fresh equivalence class and pick the "best" member.
    MyEquivalenceClass equivalences = new MyEquivalenceClass();
    equivalences.add(first, second, "none");
    Set members = equivalences.getEquivalences(first);
    String best = (String) CollectionUtilities.getBest(members, betterTargetIsLess, -1);
}
/**
*
*/
@ -82,32 +101,34 @@ public class GenerateConfusables {
_Non_IICore.removeAll(um.getSet("2.1"));
// add Chinese?
UnicodeSet cjk_nic = new UnicodeSet();
String line = null;
try {
BufferedReader br = BagFormatter.openUTF8Reader(indir, "cjk_nic.txt");
while (true) {
line = Utility.readDataLine(br);
if (line == null) break;
if (line.length() == 0) continue;
String[] pieces = Utility.split(line, ';');
// part 0 is range
String range = pieces[0].trim();
int rangeDivider = range.indexOf("..");
int start, end;
if (rangeDivider < 0) {
start = end = Integer.parseInt(range, 16);
} else {
start = Integer.parseInt(range.substring(0, rangeDivider), 16);
end = Integer.parseInt(range.substring(rangeDivider+2), 16);
}
cjk_nic.add(start, end);
}
br.close();
} catch (Exception e) {
throw (RuntimeException) new RuntimeException("Failure on line " + line).initCause(e);
}
_Non_IICore.removeAll(cjk_nic);
if (true) {
UnicodeSet cjk_nic = new UnicodeSet();
String line = null;
try {
BufferedReader br = BagFormatter.openUTF8Reader(indir, "cjk_nic.txt");
while (true) {
line = Utility.readDataLine(br);
if (line == null) break;
if (line.length() == 0) continue;
String[] pieces = Utility.split(line, ';');
// part 0 is range
String range = pieces[0].trim();
int rangeDivider = range.indexOf("..");
int start, end;
if (rangeDivider < 0) {
start = end = Integer.parseInt(range, 16);
} else {
start = Integer.parseInt(range.substring(0, rangeDivider), 16);
end = Integer.parseInt(range.substring(rangeDivider+2), 16);
}
cjk_nic.add(start, end);
}
br.close();
} catch (Exception e) {
throw (RuntimeException) new RuntimeException("Failure on line " + line).initCause(e);
}
_Non_IICore.removeAll(cjk_nic);
}
}
return _Non_IICore;
// for (Iterator it = um.getAvailableValues().iterator(); it.hasNext();) {
@ -118,7 +139,7 @@ public class GenerateConfusables {
}
static PrintWriter log;
static final String ARROW = "\u2192";
static final String ARROW = "\u2192"; // \u2194
static UnicodeProperty.Factory ups = ToolUnicodePropertySource.make(""); // ICUPropertyFactory.make();
static UnicodeSet UNASSIGNED = ups.getSet("gc=Cn")
.addAll(ups.getSet("gc=Co"))
@ -131,12 +152,14 @@ public class GenerateConfusables {
static UnicodeSet _skipNFKD;
static Map gatheredNFKD = new TreeMap();
static UnicodeMap nfcMap = new UnicodeMap();
static UnicodeMap nfcMap;
static UnicodeMap nfkcMap;
static String indir = "C:\\Unicode-CVS2\\draft\\reports\\tr36\\data\\source\\";
static String outdir = "C:\\Unicode-CVS2\\draft\\reports\\tr36\\data\\";
static String indir = "C:\\cvsdata\\unicode\\draft\\reports\\tr36\\data\\source\\";
static String outdir = "C:\\cvsdata\\unicode\\draft\\reports\\tr36\\data\\";
static Comparator codepointComparator = new UTF16.StringComparator(true,false,0);
static Comparator UCAComparator = new CollectionUtilities.MultiComparator(new Comparator[] {Collator.getInstance(ULocale.ROOT), codepointComparator});
static UnicodeSet setsToAbbreviate = new UnicodeSet("[" +
"\\u3400-\\u4DB5" +
@ -208,23 +231,35 @@ public class GenerateConfusables {
private UnicodeMap additions = new UnicodeMap(), remap = new UnicodeMap(), removals = new UnicodeMap(),
reviews, removals2, lowerIsBetter;
private UnicodeSet isCaseFolded;
private IdentifierInfo() throws IOException {
propNFKCSet = ups.getSet("NFKC_QuickCheck=N")
.complement();
isCaseFolded = new UnicodeSet();
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
int cat = Default.ucd().getCategory(cp);
if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
String source = UTF16.valueOf(cp);
String cf = Default.ucd().getCase(source, UCD.FULL, UCD.FOLD);
if (cf.equals(source)) isCaseFolded.add(cp);
}
propNFKCSet = ups.getSet("NFKC_QuickCheck=N").complement();
UnicodeSet propXIDContinueSet = ups.getSet("XID_Continue=TRUE");
//removals.putAll(propNFKCSet.complement(), PROHIBITED + "compat variant");
loadFileData();
xidPlus = new UnicodeSet(propXIDContinueSet).addAll(
additions.getSet(null).complement()).retainAll(propNFKCSet);
xidPlus = new UnicodeSet(propXIDContinueSet).addAll(additions.keySet()).retainAll(propNFKCSet);
getIdentifierSet();
notInXID = new UnicodeSet(IDNOutputSet).removeAll(xidPlus);
removals.putAll(notInXID, PROHIBITED + NOT_IN_XID);
removalSet = removals.getSet(null).complement();
//UnicodeSet notNfkcXid = new UnicodeSet(xidPlus).removeAll(removals.keySet()).removeAll(propNFKCSet);
//removals.putAll(notNfkcXid, PROHIBITED + "compat variant");
removalSet = removals.keySet();
remainingOutputSet = new UnicodeSet(IDNOutputSet)
.removeAll(removalSet);
remainingOutputSet = new UnicodeSet(IDNOutputSet).removeAll(removalSet);
UnicodeSet remainingInputSet1 = new UnicodeSet(IDNInputSet)
.removeAll(removalSet).removeAll(remainingOutputSet);
@ -234,9 +269,9 @@ public class GenerateConfusables {
// the output set
for (UnicodeSetIterator usi = new UnicodeSetIterator(
remainingInputSet1); usi.next();) {
String nss = Default.nfkc().normalize(usi.getString());
String nss = getModifiedNKFC(usi.getString());
String cf = Default.ucd().getCase(nss, UCD.FULL, UCD.FOLD);
String cf2 = Default.nfkc().normalize(cf);
String cf2 = getModifiedNKFC(cf);
if (remainingOutputSet.containsAll(cf2))
remainingInputSet.add(usi.codepoint);
else
@ -247,7 +282,7 @@ public class GenerateConfusables {
for (UnicodeSetIterator usi = new UnicodeSetIterator(
remainingInputSet); usi.next();) {
String ss = usi.getString();
String nss = Default.nfkc().normalize(ss);
String nss = getModifiedNKFC(ss);
String cf = Default.ucd().getCase(ss, UCD.FULL, UCD.FOLD);
if (usi.codepoint == 0x2126 || usi.codepoint == 0x212B) {
System.out.println("check");
@ -395,7 +430,7 @@ public class GenerateConfusables {
throw (RuntimeException) new RuntimeException(
"Failure on line " + line).initCause(e);
}
removals.putAll(getNonIICore(), "~IICore");
removals.putAll(getNonIICore(), PROHIBITED + "~IICore");
br.close();
}
@ -417,13 +452,14 @@ public class GenerateConfusables {
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
bf.setMergeRanges(true);
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "review.txt");
PrintWriter out = openAndWriteHeader("review.txt", "Review List for IDN");
// PrintWriter out = BagFormatter.openUTF8Writer(outdir, "review.txt");
//reviews.putAll(UNASSIGNED, "");
out.print("\uFEFF");
out.println("# Review List for IDN");
out.println("# $Revision: 1.11 $");
out.println("# $Date: 2006/06/09 21:21:20 $");
out.println("");
// out.print("\uFEFF");
// out.println("# Review List for IDN");
// out.println("# $Revision: 1.12 $");
// out.println("# $Date: 2006/09/24 23:32:44 $");
// out.println("");
UnicodeSet fullSet = reviews.getSet("").complement();
@ -474,19 +510,15 @@ public class GenerateConfusables {
UnicodeSet letters = new UnicodeSet("[[:Alphabetic:][:Mark:][:Nd:]]");
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "idnchars.txt");
PrintWriter out = openAndWriteHeader("idnchars.txt", "Recommended Identifier Profiles for IDN");
out.println("# Recommended Identifier Profiles for IDN");
out.println("# $Revision: 1.11 $");
out.println("# $Date: 2006/06/09 21:21:20 $");
out.println("");
out.println("# Output Characters");
out.println("# Allowed as output characters");
out.println("");
bf.setValueSource("output");
bf.showSetNames(out, remainingOutputSet);
showExtras(bf, remainingOutputSet, letters);
/*
out.println("");
out.println("");
@ -502,10 +534,10 @@ public class GenerateConfusables {
bf.setValueSource("input-lenient");
bf.showSetNames(out, inputSet_lenient);
showExtras(bf, inputSet_lenient, letters);
*/
out.println("");
out
.println("# Not allowed at start of identifier");
out.println("# Not allowed at start of identifier");
out.println("");
bf.setValueSource("nonstarting");
bf.showSetNames(out, nonstarting);
@ -517,6 +549,7 @@ public class GenerateConfusables {
out.close();
}
/**
*
*/
@ -543,13 +576,14 @@ public class GenerateConfusables {
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
bf.setMergeRanges(true);
PrintWriter out = BagFormatter.openUTF8Writer(outdir,
"xidmodifications.txt");
PrintWriter out = openAndWriteHeader("xidmodifications.txt", "Security Profile for General Identifiers");
/* PrintWriter out = BagFormatter.openUTF8Writer(outdir, "xidmodifications.txt");
out.println("# Security Profile for General Identifiers");
out.println("# $Revision: 1.11 $");
out.println("# $Date: 2006/06/09 21:21:20 $");
out.println("");
out.println("# $Revision: 1.12 $");
out.println("# $Date: 2006/09/24 23:32:44 $");
*/
out.println("# Characters restricted");
out.println("");
@ -567,11 +601,26 @@ public class GenerateConfusables {
out.println("# Characters added");
out.println("");
bf.setValueSource("addition");
bf.showSetNames(out, additions.getSet(null).complement());
bf.showSetNames(out, additions.keySet());
//showRemapped(out, "Characters remapped on input", remap);
out.close();
out = openAndWriteHeader("xidAllowed.txt", "Security Profile for General Identifiers");
UnicodeSet allowed = new UnicodeSet(xidPlus).removeAll(removals.keySet());
UnicodeSet cfAllowed = new UnicodeSet().addAll(allowed).retainAll(isCaseFolded).retainAll(propNFKCSet);
allowed.removeAll(cfAllowed);
bf.setValueSource("case_folded");
out.println("# XID characters allowed (no uppercase)");
out.println("");
bf.showSetNames(out, cfAllowed);
bf.setValueSource("not_case_folded");
out.println("");
out.println("# XID characters allowed (uppercase)");
out.println("");
bf.showSetNames(out, allowed);
out.close();
UnicodeMap someRemovals = new UnicodeMap();
UnicodeMap.Composer myComposer = new UnicodeMap.Composer() {
@ -604,8 +653,8 @@ public class GenerateConfusables {
//someRemovals = removals;
out = BagFormatter.openUTF8Writer(outdir, "draft-restrictions.txt");
out.println("# Characters restricted in domain names");
out.println("# $Revision: 1.11 $");
out.println("# $Date: 2006/06/09 21:21:20 $");
out.println("# $Revision: 1.12 $");
out.println("# $Date: 2006/09/24 23:32:44 $");
out.println("#");
out.println("# This file contains a draft list of characters for use in");
out.println("# UTR #36: Unicode Security Considerations");
@ -646,7 +695,7 @@ public class GenerateConfusables {
bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() {
}).set(someRemovals).setMain("Removals", "GCB",
UnicodeProperty.ENUMERATED, "1.0"));
bf.showSetNames(out, someRemovals.getSet(null).complement());
bf.showSetNames(out, someRemovals.keySet());
}
out.close();
}
@ -654,6 +703,7 @@ public class GenerateConfusables {
static final String PROHIBITED = "restricted ; ";
static final String NOT_IN_XID = "not in XID+";
public static final boolean suppress_NFKC = true;
/**
*
*/
@ -674,7 +724,7 @@ public class GenerateConfusables {
out.println("");
for (UnicodeSetIterator usi = new UnicodeSetIterator(s); usi.next();) {
String source = usi.getString();
String target = Default.nfkc().normalize(source);
String target = getModifiedNKFC(source);
writeSourceTargetLine(out, source, null, target, value);
}
//bf.showSetNames(out, s);
@ -712,7 +762,7 @@ public class GenerateConfusables {
out.println("# " + title);
out.println("");
int count = 0;
for (UnicodeSetIterator usi = new UnicodeSetIterator(remap.getSet(null).complement()); usi.next();) {
for (UnicodeSetIterator usi = new UnicodeSetIterator(remap.keySet()); usi.next();) {
writeSourceTargetLine(out, usi.getString(), "remap-to", (String)remap.getValue(usi.codepoint), null);
count++;
}
@ -747,6 +797,8 @@ public class GenerateConfusables {
}
private static UnicodeSet getSkipNFKD() {
nfcMap = new UnicodeMap();
nfkcMap = new UnicodeMap();
if (_skipNFKD == null) {
_skipNFKD = new UnicodeSet();
UnicodeSet idSet = getIdentifierSet();
@ -755,6 +807,8 @@ public class GenerateConfusables {
int cat = Default.ucd().getCategory(cp);
if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
int decompType = Default.ucd().getDecompositionType(cp);
String nfc = Default.nfc().normalize(cp);
if (decompType == UCD.CANONICAL) nfcMap.put(cp, nfc);
if (decompType == UCD.COMPAT_CIRCLE
|| decompType == UCD.COMPAT_SUPER
|| decompType == UCD.COMPAT_SUB
@ -765,42 +819,58 @@ public class GenerateConfusables {
_skipNFKD.add(cp);
continue;
}
String source = UTF16.valueOf(cp);
String mapped = Default.nfkd().normalize(cp);
if (mapped.equals(UTF16.valueOf(cp))) continue;
String kmapped = getModifiedNKFC(source);
if (!kmapped.equals(source) && !kmapped.equals(nfc)) {
if (kmapped.startsWith(" ") || kmapped.startsWith("\u0640")) {
System.out.println("?? " + Default.ucd().getCodeAndName(cp));
System.out.println("\t" + Default.ucd().getCodeAndName(kmapped));
kmapped = getModifiedNKFC(source); // for debugging
}
nfkcMap.put(cp,kmapped);
}
if (mapped.equals(source)) continue;
if (idSet.contains(cp) && !idSet.contains(mapped)) _skipNFKD.add(cp);
else if (!whiteSpace.contains(cp) && whiteSpace.containsSome(mapped)) _skipNFKD.add(cp);
if (decompType == UCD.CANONICAL) nfcMap.put(cp, Default.nfd().normalize(cp));
}
}
nfcMap.setMissing("");
nfcMap.setMissing("");
nfcMap.freeze();
nfkcMap.setMissing("");
nfkcMap.freeze();
return _skipNFKD;
}
/**
 * Reports whether the text contains characters from more than one script.
 *
 * getSingleScript signals "more than one script" by returning
 * UScript.INVALID_CODE, so that value (and only that value) means mixed.
 * The earlier {@code !=} comparison inverted the answer and left a second,
 * unreachable return statement behind it.
 *
 * @param source text to examine
 * @return true if getSingleScript(source) is UScript.INVALID_CODE
 */
private static boolean isMixedScript(String source) {
    return getSingleScript(source) == UScript.INVALID_CODE;
}
/*
* Returns UScript.INVALID_CODE if mixed script, otherwise the script
*/
public static int getSingleScript(String source) {
int lastScript = UScript.INVALID_CODE;
int cp;
for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(source, i);
int script = UScript.getScript(cp);
if (script == UScript.COMMON || script == UScript.INHERITED) {
if (XIDContinueSet.contains(cp)) {
if (lastScript == UScript.INVALID_CODE) lastScript = script;
continue; // skip if not identifier
}
script = UScript.COMMON;
}
if (lastScript == UScript.INVALID_CODE) lastScript = script;
else if (script != lastScript) return UScript.INVALID_CODE;
/**
 * Returns the script of the input text. Script values of COMMON and INHERITED are ignored.
 * @param source Input text.
 * @return Script value found in the text.
 * If more than one script value is found, then UScript.INVALID_CODE is returned.
 * If no script value is found (other than COMMON or INHERITED), then UScript.COMMON is returned.
 */
public static int getSingleScript(String source) {
    if (source.length() == 0) return UScript.COMMON;
    // COMMON doubles as the "no real script seen yet" marker.
    int lastScript = UScript.COMMON;
    int cp;
    // Step by code point, not by char, so supplementary characters are read whole.
    for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
        cp = UTF16.charAt(source, i);
        int script = UScript.getScript(cp);
        if (script == UScript.COMMON || script == UScript.INHERITED) {
            continue;
        }
        // No early return here: the whole string must be scanned, otherwise a
        // second script later in the text would never be detected.
        if (lastScript == UScript.COMMON) {
            lastScript = script;
        } else if (script != lastScript) {
            return UScript.INVALID_CODE;
        }
    }
    return lastScript;
}
/**
*
@ -856,8 +926,9 @@ public class GenerateConfusables {
+ " ;\t" + Utility.hex(target)
+ (tag == null ? "" : " ;\t" + tag)
//+ " ;\t" + (preferredID.contains(source) ? "ID" : "")
+ "\t# "
+ "( " + source + " " + ARROW + " " + target + ") "
+ "\t#"
+ (isXid(source) ? "" : "*")
+ " ( " + source + " " + ARROW + " " + target + " ) "
+ Default.ucd().getName(source) + " " + ARROW + " "
+ Default.ucd().getName(target)
);
@ -992,18 +1063,45 @@ public class GenerateConfusables {
for (int i = 0; i < item.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(item, i);
String cps = UTF16.valueOf(cp);
String mapped = getParadigm(cps);
String mapped = getParadigm(cps, false, false);
if (mapped.indexOf(cps) >= 0) result.append(cps);
else {
result.append(mapped);
reasons.append("[" + getReasons(cps, mapped) + "]");
List x = getReasons(cps, mapped);
reasons.append(getBestForm(x));
}
}
return result.toString();
}
public String getParadigm(Object item) {
return (String) CollectionUtilities.getBest(getEquivalences(item), betterTargetIsLess, -1);
/**
 * Unwraps single-element collections to find the most compact printable form.
 *
 * While the collection holds exactly one element and that element is itself a
 * collection, descends into it. Stops at the first collection that either does
 * not have exactly one element (returned as its string form wrapped in an
 * extra pair of square brackets) or whose sole element is not a collection
 * (returned as that collection's plain string form).
 *
 * @param x collection to render
 * @return a String describing the innermost collection reached
 */
private Object getBestForm(Collection x) {
    Collection current = x;
    while (current.size() == 1) {
        Object only = current.iterator().next();
        if (!(only instanceof Collection)) {
            return current.toString();
        }
        current = (Collection) only;
    }
    return "[" + current + "]";
}
/**
 * Picks the best member of the item's equivalence set according to
 * betterTargetIsLess, optionally restricting which members are eligible.
 *
 * @param item           string whose equivalences are looked up
 * @param onlyLowercase  if true, keep only members for which {@code item + member}
 *                       is unchanged by full case folding
 * @param onlySameScript if true, keep only members for which {@code item + member}
 *                       is not mixed-script
 * @return whatever CollectionUtilities.getBest returns for the (possibly filtered) set
 */
public String getParadigm(String item, boolean onlyLowercase, boolean onlySameScript) {
    Set candidates = getEquivalences(item);
    if (onlyLowercase || onlySameScript) {
        Set accepted = new HashSet();
        Iterator members = candidates.iterator();
        while (members.hasNext()) {
            String other = (String) members.next();
            String combined = item + other;
            if (onlyLowercase
                    && !combined.equals(Default.ucd().getCase(combined, UCD.FULL, UCD.FOLD))) {
                continue; // the pair changes under case folding
            }
            if (onlySameScript && isMixedScript(combined)) {
                continue; // the pair mixes scripts
            }
            accepted.add(other);
        }
        candidates = accepted;
    }
    return (String) CollectionUtilities.getBest(candidates, betterTargetIsLess, -1);
}
public Set getOrderedExplicitItems() {
@ -1057,12 +1155,21 @@ public class GenerateConfusables {
type += ":" + lineCount;
String combined = source + target;
if (combined.indexOf("\u0430") >= 0) {
System.out.println(Default.ucd().getCodeAndName(combined));
}
boolean isLowercase = combined.equals(Default.ucd().getCase(combined, UCD.FULL, UCD.FOLD));
boolean isMixed = isMixedScript(combined);
dataMixedAnycase.add(source, target, type);
if (isLowercase) dataMixedLowercase.add(source, target, type);
if (!isMixed) dataSingleAnycase.add(source, target, type);
if (!isMixed && isLowercase) dataSingleLowercase.add(source, target, type);
if (isLowercase) {
dataMixedLowercase.add(source, target, type);
}
if (!isMixed) {
dataSingleAnycase.add(source, target, type);
}
if (!isMixed && isLowercase) {
dataSingleLowercase.add(source, target, type);
}
return this;
}
@ -1124,7 +1231,13 @@ public class GenerateConfusables {
String source = Utility.fromHex(pieces[0].trim(),true);
String target = Utility.fromHex(pieces[1].trim(),true);
//if (pieces.length > 2) type = pieces[2].trim();
add(source, target, type, count, line);
String nfkdSource = Default.nfkd().normalize(source);
String nfkdTarget = Default.nfkd().normalize(target);
if (suppress_NFKC && nfkdSource.equals(nfkdTarget)) {
System.out.println("Suppressing nfkc for: " + Default.ucd().getCodeAndName(source));
} else {
add(source, target, type, count, line);
}
}
}
in.close();
@ -1137,39 +1250,49 @@ public class GenerateConfusables {
}
public void writeSource(String directory, String filename) throws IOException {
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
out.println("# Source File for IDN Confusables");
out.println("# $Revision: 1.11 $");
out.println("# $Date: 2006/06/09 21:21:20 $");
out.println("");
PrintWriter out = openAndWriteHeader(filename, "Source File for IDN Confusables");
// PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
// out.println("# Source File for IDN Confusables");
// out.println("# $Revision: 1.12 $");
// out.println("# $Date: 2006/09/24 23:32:44 $");
// out.println("");
dataMixedAnycase.writeSource(out);
out.close();
}
public void writeSourceOrder(String directory, String filename, boolean appendFile, boolean skipNFKEquivs) throws IOException {
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
out.print('\uFEFF');
out.println("# Recommended confusable mapping for IDN");
out.println("# $Revision: 1.11 $");
out.println("# $Date: 2006/06/09 21:21:20 $");
out.println("");
PrintWriter out = openAndWriteHeader(filename, "Recommended confusable mapping for IDN");
// PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
// out.println("# Recommended confusable mapping for IDN");
// out.println("# $Revision: 1.12 $");
// out.println("# $Date: 2006/09/24 23:32:44 $");
// out.println("");
if (appendFile) {
String[] replacements = {"%date%", Default.getDate()};
Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt",
Utility.UTF8_WINDOWS, out, replacements);
}
writeSourceOrder(out, dataSingleLowercase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs);
writeSourceOrder(out, dataSingleAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs);
writeSourceOrder(out, dataMixedLowercase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs);
writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs);
if (true) {
writeSourceOrder(out, dataMixedAnycase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs, true, true);
writeSourceOrder(out, dataMixedAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs, false, true);
writeSourceOrder(out, dataMixedAnycase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs, true, false);
writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs, false, false);
} else {
writeSourceOrder(out, dataSingleLowercase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs, false, false);
writeSourceOrder(out, dataSingleAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs, false, false);
writeSourceOrder(out, dataMixedLowercase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs, false, false);
writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs, false, false);
}
out.close();
}
/**
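* @param skipNFKEquivs if true, skip every source string that is changed by NFKD normalization
* @param onlyLowercase passed through to getParadigm; if true, only targets whose combination with the source is unchanged by case folding are considered
* @param onlySingleScript passed through to getParadigm; if true, only targets whose combination with the source is not mixed-script are considered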
*
*/
private void writeSourceOrder(PrintWriter out, MyEquivalenceClass data, String tag, String title, boolean skipNFKEquivs) {
private void writeSourceOrder(PrintWriter out, MyEquivalenceClass data, String tag, String title, boolean skipNFKEquivs, boolean onlyLowercase, boolean onlySingleScript) {
// first get all the sets. Then get the best paradigm from each. Then sort.
// Set setOfSets = data.getEquivalenceSets();
// Map orderedResults = new TreeMap(betterTargetIsLess);
@ -1186,16 +1309,30 @@ public class GenerateConfusables {
out.println();
int count = 0;
UnicodeSet preferredID = getIdentifierSet();
ArrayComparator ac = new ArrayComparator(new Comparator[] {UCAComparator, UCAComparator});
Set orderedPairs = new TreeSet(ac);
for (Iterator it = items.iterator(); it.hasNext();) {
String source = (String) it.next();
if (UTF16.hasMoreCodePointsThan(source,1)) continue;
String target = data.getParadigm(source);
if (UTF16.hasMoreCodePointsThan(source,1)) continue;
String target = data.getParadigm(source, onlyLowercase, onlySingleScript);
if (target == null) continue;
if (source.equals(target)) continue;
if (skipNFKEquivs) {
if (!Default.nfkd().normalize(source).equals(source)) continue;
}
orderedPairs.add(new String[] {target, source});
}
String lastTarget = null;
for (Iterator it = orderedPairs.iterator(); it.hasNext();) {
String[] pair = (String[]) it.next();
String source = pair[1];
String target = pair[0];
String reason = fixReason(data.getReasons(source, target));
if (lastTarget != null && !lastTarget.equals(target)) {
out.println();
}
writeSourceTargetLine(out, source, tag, target, reason);
lastTarget = target;
count++;
}
out.println();
@ -1326,7 +1463,7 @@ public class GenerateConfusables {
*/
public void addUnicodeMap(UnicodeMap decompMap, String type, String errorLine) {
int count = 0;
for (UnicodeSetIterator it = new UnicodeSetIterator(decompMap.getSet(null).complement()); it.next(); ) {
for (UnicodeSetIterator it = new UnicodeSetIterator(decompMap.keySet()); it.next(); ) {
add(it.getString(), (String)decompMap.getValue(it.codepoint), type, ++count, errorLine);
}
}
@ -1355,13 +1492,14 @@ public class GenerateConfusables {
*
*/
public void writeSummary(String outdir, String filename, boolean outputOnly, UnicodeSet script) throws IOException {
PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
UnicodeSet representable = new UnicodeSet();
out.print('\uFEFF');
out.println("# Summary: Recommended confusable mapping for IDN");
out.println("# $Revision: 1.11 $");
out.println("# $Date: 2006/06/09 21:21:20 $");
out.println("");
PrintWriter out = openAndWriteHeader(filename, "Summary: Recommended confusable mapping for IDN");
// PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
// out.print('\uFEFF');
// out.println("# Summary: Recommended confusable mapping for IDN");
// out.println("# $Revision: 1.12 $");
// out.println("# $Date: 2006/09/24 23:32:44 $");
// out.println("");
UnicodeSet representable = new UnicodeSet();
MyEquivalenceClass data = dataMixedAnycase;
Set items = data.getOrderedExplicitItems();
// for (Iterator it = items.iterator(); it.hasNext();) {
@ -1481,11 +1619,12 @@ public class GenerateConfusables {
wsAny.addEquivalents(equivalents);
wsLower.addEquivalents(equivalents);
}
PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
out.print('\uFEFF');
out.println("# Summary: Whole-Script Confusables");
out.println("# $Revision: 1.11 $");
out.println("# $Date: 2006/06/09 21:21:20 $");
PrintWriter out = openAndWriteHeader(filename, "Summary: Whole-Script Confusables");
// PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
// out.print('\uFEFF');
// out.println("# Summary: Whole-Script Confusables");
// out.println("# $Revision: 1.12 $");
// out.println("# $Date: 2006/09/24 23:32:44 $");
out.println("# This data is used for determining whether a strings is a");
out.println("# whole-script or mixed-script confusable.");
out.println("# The mappings here ignore common and inherited script characters,");
@ -1716,7 +1855,6 @@ public class GenerateConfusables {
}
private static void generateConfusables(String indir, String outdir) throws IOException {
betterTargetIsLess.compare("\u0020", "\u2004");
File dir = new File(indir);
String[] names = dir.list();
DataSet total = new DataSet();
@ -1731,12 +1869,26 @@ public class GenerateConfusables {
total.addAll(ds);
total.close("t*" + names[i]);
}
// add normalized data
// for (int i = 0; i <= 0x10FFFF; ++i) {
// if (Default.nfkc().isNormalized(i)) continue;
// String result = getModifiedNKFC(UTF16.valueOf(i));
// ds.foo();
// }
getSkipNFKD();
DataSet ds = new DataSet();
ds.addUnicodeMap(nfcMap, "nfc", "nfc");
ds.close("*");
total.addAll(ds);
total.close("*");
ds = new DataSet();
ds.addUnicodeMap(nfkcMap, "nfkc", "nfkc");
ds.close("*");
//ds.write(outdir, "new-decomp.txt", false, false);
total.addAll(ds);
total.close("*");
total.writeSummary(outdir, "confusablesSummary.txt", false, null);
total.writeSummary(outdir, "confusablesSummaryIdentifier.txt", true, null);
//total.writeSummary(outdir, "confusablesSummaryCyrillic.txt", true,
@ -1893,6 +2045,12 @@ public class GenerateConfusables {
MARK_ASCII = new Integer(10);
static _BetterTargetIsLess betterTargetIsLess = new _BetterTargetIsLess();
static UnicodeSet XID = new UnicodeSet("[:xidcontinue:]");
/**
 * Tests a string against the {@code XID} set, which is built from the
 * pattern {@code [:xidcontinue:]}.
 *
 * @param x the string to test
 * @return the result of {@code XID.containsAll(x)}
 */
static boolean isXid(String x) {
    final boolean allInXid = XID.containsAll(x);
    return allInXid;
}
static class _BetterTargetIsLess implements Comparator {
IdentifierInfo info = IdentifierInfo.getIdentifierInfo();
@ -1900,9 +2058,20 @@ public class GenerateConfusables {
public int compare(Object o1, Object o2) {
String a = (String)o1;
String b = (String)o2;
// longer is better (less)
int ca = UTF16.countCodePoint(a);
int cb = UTF16.countCodePoint(b);
if (ca != cb) return ca > cb ? -1 : 1;
if (ca != cb) {
return ca > cb ? -1 : 1;
}
// a string that passes isXid is better (less)
boolean ba = isXid(a);
boolean bb = isXid(b);
if (ba != bb) {
return ba ? -1 : 1;
}
int aok = getValue(a);
int bok = getValue(b);
if (aok != bok) return aok < bok ? -1 : 1;
@ -1947,4 +2116,28 @@ public class GenerateConfusables {
return type.substring(dash+1,period);
}
static Normalizer modNFKC ;
/**
 * Normalizes a string with a lazily created NFKC normalizer that has the
 * spacing substitute mapping installed (see Normalizer.setSpacingSubstitute).
 *
 * The normalizer is cached in the static field modNFKC. It is stored there
 * only after setSpacingSubstitute() has completed, so a failure during
 * configuration cannot leave a half-configured normalizer cached for
 * later calls.
 *
 * @param cf the string to normalize
 * @return the result of normalizing cf with the modified NFKC normalizer
 */
private static String getModifiedNKFC(String cf) {
    Normalizer normalizer = modNFKC;
    if (normalizer == null) {
        normalizer = new Normalizer(Normalizer.NFKC, Default.ucdVersion());
        normalizer.setSpacingSubstitute();
        // publish only once fully configured
        modNFKC = normalizer;
    }
    return normalizer.normalize(cf);
}
/**
 * Opens a UTF-8 output file and writes the standard header shared by the
 * generated confusables files: a byte order mark followed by a block of
 * '#' comment lines.
 *
 * The file is opened in the class-level outdir (not a directory supplied by
 * the caller), and the "Version" line uses the class-level version value.
 *
 * @param filename name of the file to create; also echoed in the header
 * @param title    text for the first header line
 * @return the open writer, positioned after the header; the caller must close it
 * @throws IOException if the file cannot be opened
 */
private static PrintWriter openAndWriteHeader(String filename, String title) throws IOException {
    PrintWriter writer = BagFormatter.openUTF8Writer(outdir, filename);
    writer.print('\uFEFF');
    String[] headerLines = {
        "# " + title,
        "# File: " + filename,
        "# Version: " + version,
        "# Generated: " + Default.getDate(),
        "# Checkin: $Revision: 1.12 $",
        "#",
        "# For documentation and usage, see http://www.unicode.org/reports/tr39/",
        "#"
    };
    for (int i = 0; i < headerLines.length; ++i) {
        writer.println(headerLines[i]);
    }
    return writer;
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java,v $
* $Date: 2004/02/07 01:01:14 $
* $Revision: 1.4 $
* $Date: 2006/09/24 23:32:44 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -14,9 +14,10 @@
package com.ibm.text.UCD;
import java.io.*;
import com.ibm.icu.text.UTF16;
import com.ibm.text.utility.*;
//import com.ibm.text.utility;
import com.ibm.icu.text.UnicodeSet;
import java.util.*;
import com.ibm.text.utility.Utility;
//import java.util.*;
public class GenerateThaiBreaks {
public static void main(String [] args) throws IOException {

View file

@ -1,3 +1,20 @@
Show [[:block=tamil:] & [:age=3.2:] - [:age=3.1:]]
Show [[:block=tamil:] & [:age=4.0:] - [:age=3.2:]]
Show [[:block=tamil:] & [:age=4.1:] - [:age=4.0:]]
Show [[:block=tamil:] & [:age=5.0:] - [:age=4.1:]]
Stop
Show [[:NFKCQuickCheck=No:] & [$gc:Lm]]
Stop
[$Name: $gc:Sk]
[$Name: $gc:Lm]
Show [[$whitespace] - [$gc:zs]]
Show [[$gc:zs] - [$whitespace]]
Let $letter = [$gc:Lu $gc:Ll $gc:Lt $gc:Lo $gc:Lm];
Let $number = [$gc:Nd $gc:Nl $gc:No]
Let $mark = [$gc:mn $gc:me $gc:mc]
@ -62,7 +79,7 @@ Let $guessClose = [$gc:pf $gc:pe $gc:pi]
$guessClose = $__closing_punc
Let $guessTerm = [$sb:aterm $sb:sterm]
$guessTerm = [? ? !?? ? ? ? ? ??? ? ? ? ? ? ? ? .?? ? ? ? ? ? ? ? ?? ? ? ? ? ? ? ?]
$guessTerm = [? ? !?? ? ? ? ? ??? ? ? ? ? ? ? ? .?? <EFBFBD> ? ? ? ? ? ? ? ?? ? ? ? ? ? ? ?]
Let $__issymotherr = [\u00A6\u00A7\u06FD\u06FE\u0F01-\u0F03\u0F13-\u0F17\u0F1A-\u0F1F\u0FBE-\u0FC5\u0FC7-\u0FCC\u2100\u2101\u2104-\u2106\u2108\u2109\u2117\u2118\u211E-\u2121\u2195-\u2199\u219C-\u219F\u21A1\u21A2\u21A4\u21A5\u21A7-\u21AD\u21AF-\u21CD\u21D0\u21D1\u21D5-\u21F3\u2300-\u2307\u230C-\u231F\u2322-\u2328\u232B-\u237B\u237D-\u239A\u2400-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u25B6\u25B8-\u25C0\u25C2-\u25F7\u2600-\u2613\u2619-\u266E\u2670\u2671\u2701-\u2704\u2706-\u2709\u270C-\u2727\u2729-\u274B\u274F-\u2752\u2758-\u275E\u2761-\u2794\u2798-\u27AF\u27B1-\u27BE\u2800-\u28FF\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB\u3012\u3013\u3036\u3037\u303E\u303F\u3190\u3191\u3196-\u319F\u3200-\u321C\u322A-\u3243\u3260-\u327B\u328A-\u32B0\u32C0-\u32CB\u32D0-\u32FE\u3300-\u3376\u337B-\u33DD\u33E0-\u33FE\uA490-\uA4A1\uA4A4-\uA4B3\uA4B5-\uA4C0\uA4C2-\uA4C4\uFFED\uFFEE\uFFFC\uFFFD]
Let $__issymothers = [\u00B6\u0482\u06E9\u09FA\u0B70\u0F34\u0F36\u0F38\u0FCF\u2114\u2123\u2125\u2127\u2129\u212E\u2132\u213A\u21D3\u220E\u2617\u274D\u2756\u3004\u3020\u327F\uA4C6\uFFE4\uFFE8]

View file

@ -1,5 +1,5 @@
Generate: .*
DeltaVersion: 16
Generate: .*BreakTest.*
DeltaVersion: 17
CopyrightYear: 2006
File: auxiliary/GraphemeBreakProperty

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
* $Date: 2005/11/01 00:10:54 $
* $Revision: 1.17 $
* $Date: 2006/09/24 23:32:44 $
* $Revision: 1.18 $
*
*******************************************************************************
*/
@ -14,9 +14,13 @@
package com.ibm.text.UCD;
import java.util.*;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.text.utility.*;
import com.sun.java_cup.internal.internal_error;
/**
@ -302,6 +306,7 @@ public final class Normalizer implements UCD_Types {
private byte form;
private boolean composition;
private boolean compatibility;
private UnicodeMap substituteMapping;
/**
* Decomposes text, either canonical or compatibility,
@ -319,7 +324,12 @@ public final class Normalizer implements UCD_Types {
for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) {
buffer.setLength(0);
ch32 = UTF16.charAt(source, i);
data.getRecursiveDecomposition(ch32, buffer, compat);
String sub = substituteMapping == null ? null : (String) substituteMapping.getValue(ch32);
if (sub != null) {
buffer.append(sub);
} else {
data.getRecursiveDecomposition(ch32, buffer, compat);
}
// add all of the characters in the decomposition.
// (may be just the original character, if there was
@ -561,6 +571,81 @@ Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
return result;
}
/**
 * Returns the substitute mapping currently installed on this normalizer.
 *
 * @return the installed mapping, or null if none has been set
 */
public UnicodeMap getSubstituteMapping() {
return substituteMapping;
}
/**
 * Installs a substitute mapping on this normalizer. During decomposition,
 * a code point that has a value in this mapping contributes that value
 * instead of its recursive decomposition.
 *
 * @param substituteMapping map from code point to replacement String, or
 *        null to use the regular decomposition for every code point
 * @return this normalizer, to allow call chaining
 */
public Normalizer setSubstituteMapping(UnicodeMap substituteMapping) {
this.substituteMapping = substituteMapping;
return this;
}
static UnicodeMap spacingMap;;
/**
 * Installs the shared spacing substitute map on this normalizer, building
 * the map on first use.
 */
public void setSpacingSubstitute() {
    final boolean needsBuild = (spacingMap == null);
    if (needsBuild) {
        makeSpacingMap();
    }
    // same effect as setSubstituteMapping(spacingMap); the class is final
    this.substituteMapping = spacingMap;
}
/**
 * Builds the shared static spacingMap used by setSpacingSubstitute().
 *
 * Pass 1: every code point whose decomposition type is at least CANONICAL
 * and whose full compatibility decomposition is U+0020 or U+0640 followed
 * only by Mn/Me marks is mapped to itself, so the substitute mapping keeps
 * it intact instead of decomposing it.
 *
 * Pass 2: a fixed table of special cases overrides the value for selected
 * code points. Each of them must already have been picked up by pass 1.
 *
 * The map is built in a local variable and stored in the static field only
 * after it has been validated and frozen. A failure part-way through
 * therefore cannot leave a partial, unfrozen map behind that later callers
 * would silently reuse.
 *
 * @throws InternalError if a special-case source character was not
 *         selected by pass 1
 */
private void makeSpacingMap() {
    UnicodeMap map = new UnicodeMap();
    StringBuffer b = new StringBuffer();
    main:
    for (int i = 0; i <= 0x10FFFF; ++i) {
        boolean hasDecomposition = data.ucd.getDecompositionType(i) >= data.ucd.CANONICAL;
        if (!hasDecomposition) continue;
        b.setLength(0);
        data.getRecursiveDecomposition(i, b, true);
        if (b.length() == 1) continue;
        char firstChar = b.charAt(0);
        if (firstChar != 0x20 && firstChar != '\u0640') continue;
        // if rest are just Mn or Me marks, then add to substitute mapping
        int cp;
        for (int j = 1; j < b.length(); j += UTF16.getCharCount(cp)) {
            cp = UTF16.charAt(b,j);
            int cat = data.ucd.getCategory(cp);
            if (cat != data.ucd.Mn && cat != data.ucd.Me) continue main;
        }
        // map the character to itself: it is kept as-is rather than decomposed
        map.put(i, UTF16.valueOf(i));
    }
    // { UnicodeSet pattern of source characters, replacement string }
    String[][] specials = {
        {"[\\u0384\\u1FFD]", "\u00B4"},
        {"[\\uFFE3]", "\u00AF"},
        {"[\\uFE49-\\uFE4C]", "\u203E"},
        {"[\\u1FED]", "\u00A8\u0300"},
        {"[\\u1FEE\\u0385]", "\u00A8\u0301"},
        {"[\\u1FC1]", "\u00A8\u0342"},
        {"[\\u1FBD]", "\u1FBF"},
        {"[\\u1FCD]", "\u1FBF\u0300"},
        {"[\\u1FCE]", "\u1FBF\u0301"},
        {"[\\u1FCF]", "\u1FBF\u0342"},
        {"[\\u1FDD]", "\u1FFE\u0300"},
        {"[\\u1FDE]", "\u1FFE\u0301"},
        {"[\\u1FDF]", "\u1FFE\u0342"},
        {"[\\uFC5E]", "\uFE72\u0651"},
        {"[\\uFC5F]", "\uFE74\u0651"},
        {"[\\uFC60]", "\uFE76\u0651"},
        {"[\\uFC61]", "\uFE78\u0651"},
        {"[\\uFC62]", "\uFE7A\u0651"},
        {"[\\uFC63]", "\uFE7C\u0670"},
        {"[\\uFCF2]", "\uFE77\u0651"},
        {"[\\uFCF3]", "\uFE79\u0651"},
        {"[\\uFCF4]", "\uFE7B\u0651"},
    };
    UnicodeSet mappedChars = map.keySet();
    for (int i = 0; i < specials.length; ++i) {
        UnicodeSet source = new UnicodeSet(specials[i][0]);
        // sanity check: every special must already be in the map from pass 1
        if (!mappedChars.containsAll(source)) {
            throw new InternalError("Remapping character that doesn't need it!" + source);
        }
        map.putAll(source, specials[i][1]);
    }
    map.freeze();
    // publish only the finished, frozen map
    spacingMap = map;
}
/**
* Just accessible for testing.
*/

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $
* $Date: 2006/06/09 21:21:20 $
* $Revision: 1.12 $
* $Date: 2006/09/24 23:32:45 $
* $Revision: 1.13 $
*
*******************************************************************************
*/
@ -24,6 +24,7 @@ import java.io.Writer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
@ -38,6 +39,7 @@ import com.ibm.icu.dev.demo.translit.CaseIterator;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.Tabber;
import com.ibm.icu.dev.test.util.UnicodeMap;
import com.ibm.icu.dev.test.util.UnicodeProperty.UnicodeMapProperty;
import com.ibm.icu.impl.PrettyPrinter;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
@ -57,7 +59,16 @@ import com.ibm.icu.util.ULocale;
public class QuickTest implements UCD_Types {
public static void main(String[] args) throws IOException {
try {
getHangulDecomps();
if (true) return;
showLeadingTrailingNonStarters();
//checkBufferStatus(true);
checkNormalization("NFC", Default.nfc());
//checkNormalization("NFKC", Default.nfkc());
@ -66,7 +77,6 @@ public class QuickTest implements UCD_Types {
checkCaseChanges();
if (true) return;
checkBufferStatus();
checkCase();
@ -102,7 +112,43 @@ public class QuickTest implements UCD_Types {
}
}
static void checkNormalization(String title, Normalizer nfx) {
/**
 * Prints, for each assigned character in U+1100..U+11FF and U+AC00..U+D7FF
 * that is changed by NFKD under UCD version 2.1.8, a two-part form of its
 * decomposition.
 *
 * When a decomposition is longer than two chars, all but its last char are
 * looked up among the collected decompositions and, if found, replaced by
 * the character that decomposes to them. Any result that is still not
 * exactly two chars long is reported on System.out as "Failed decomp".
 * The final table is printed through a BagFormatter.
 */
private static void getHangulDecomps() {
    Normalizer oldNfkd = new Normalizer(Normalizer.NFKD, "2.1.8");
    UnicodeSet candidates = new UnicodeSet("[[\u1100-\u11FF \uAC00-\uD7FF]&[:assigned:]]");

    // forward and reverse tables of everything that actually decomposes
    Map toDecomposed = new HashMap();
    Map toComposed = new HashMap();
    UnicodeSetIterator chars = new UnicodeSetIterator(candidates);
    while (chars.next()) {
        String original = chars.getString();
        String decomposed = oldNfkd.normalize(original);
        if (!decomposed.equals(original)) {
            toDecomposed.put(original, decomposed);
            toComposed.put(decomposed, original);
        }
    }

    // now try recomposing everything but the last char of each decomposition
    UnicodeMap pairs = new UnicodeMap();
    for (Iterator entries = toDecomposed.entrySet().iterator(); entries.hasNext();) {
        Map.Entry entry = (Map.Entry) entries.next();
        String original = (String) entry.getKey();
        String parts = (String) entry.getValue();
        if (parts.length() > 2) {
            int last = parts.length() - 1;
            String recomposedHead = (String) toComposed.get(parts.substring(0, last));
            if (recomposedHead != null) {
                parts = recomposedHead + parts.substring(last);
            }
        }
        if (parts.length() != 2) {
            System.out.println("Failed decomp: " + Default.ucd().getCodeAndName(original));
        }
        pairs.put(original.charAt(0), com.ibm.text.utility.Utility.hex(parts, " "));
    }

    UnicodeMapProperty property = new UnicodeMapProperty().set(pairs);
    BagFormatter formatter = new BagFormatter().setValueSource(property);
    System.out.println(formatter.showSetNames(pairs.keySet()));
}
static void checkNormalization(String title, Normalizer nfx) {
UnicodeSet trailing = new UnicodeSet();
UnicodeSet leading = new UnicodeSet();
UnicodeSet starter = new UnicodeSet();
@ -947,20 +993,22 @@ public class QuickTest implements UCD_Types {
static Counter bufferTypes = new Counter();
static class BufferData {
byte starterIsZero;
int initials;
int medials;
int finals;
int sample;
public boolean equals(Object other) {
BufferData that = (BufferData)other;
return initials == that.initials && medials == that.medials && finals == that.finals;
return starterIsZero == that.starterIsZero && initials == that.initials && medials == that.medials && finals == that.finals;
}
public int hashCode() {
return (initials*37 + medials)*37 + finals;
return ((starterIsZero * 37 + initials)*37 + medials)*37 + finals;
}
public BufferData set(int codepoint) {
String s = Default.nfkd().normalize(codepoint);
int cp;
starterIsZero = (byte)(UCharacter.getCombiningClass(codepoint) == 0 ? 0 : 1);
boolean isInitial = true;
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(s, i);
@ -977,14 +1025,30 @@ public class QuickTest implements UCD_Types {
finals = 0;
}
}
if (medials != 0) medials = 1;
sample = codepoint;
if (starterIsZero == 0 && medials == 0) {
System.out.println("WARNING: BAD CHARACTER");
cp = sample;
int ccc = UCharacter.getCombiningClass(cp);
System.out.println("U+" + Utility.hex(cp) + "\t" + UCharacter.getName(cp) + " (ccc=" + ccc + ")");
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(s, i);
ccc = UCharacter.getCombiningClass(cp);
System.out.println("\tU+" + Utility.hex(cp) + "\t" + UCharacter.getName(cp) + " (ccc=" + ccc + ")");
}
}
return this;
}
/**
 * Returns the tab-separated column titles for the values printed by
 * toString().
 *
 * @return the header line, without a trailing line separator
 */
public static String getHeader() {
    String[] columns = {
        "Starter?", "initials", "Contains Starter?", "finals", "sample hex", "sample name"
    };
    StringBuffer header = new StringBuffer();
    for (int i = 0; i < columns.length; ++i) {
        if (i != 0) {
            header.append("\t");
        }
        header.append(columns[i]);
    }
    return header.toString();
}
public String toString() {
String result = (starterIsZero == 0 ? "Y" : "") + "\t" + initials + "\t" + (medials != 0 ? "Y" : "") + "\t" + finals + "\t";
if (sample == 0) {
return initials + "\t" + medials + "\t" + finals + "\t" + "-" + "\t" + "all others";
return result + "-" + "\t" + "all others";
}
return initials + "\t" + medials + "\t" + finals + "\t" + Utility.hex(sample) + "\t" + UCharacter.getName(sample);
return result + Utility.hex(sample) + "\t" + UCharacter.getName(sample);
}
}
static class BufferDataComparator implements Comparator {
@ -992,14 +1056,15 @@ public class QuickTest implements UCD_Types {
BufferData a0 = (BufferData)arg0;
BufferData a1 = (BufferData)arg1;
int result;
if (0 != (result = a0.initials - a1.initials)) return result;
if (0 != (result = a0.starterIsZero - a1.starterIsZero)) return result;
if (0 != (result = a0.initials - a1.initials)) return result;
if (0 != (result = a0.finals - a1.finals)) return result;
if (0 != (result = a0.medials - a1.medials)) return result;
return 0;
}
}
private static void checkBufferStatus() {
BufferData non = new BufferData().set(0);
private static void showLeadingTrailingNonStarters() {
BufferData non = new BufferData().set(0);
Tabber tabber = new Tabber.HTMLTabber();
for (int i = 0; i <= 0x10ffff; ++i) {
int type = Default.ucd().getCategory(i);
@ -1013,6 +1078,7 @@ public class QuickTest implements UCD_Types {
TreeSet sorted = new TreeSet(new BufferDataComparator());
NumberFormat nf = NumberFormat.getInstance();
sorted.addAll(m.keySet());
System.out.println(tabber.process("total\t" + BufferData.getHeader()));
for (Iterator it = sorted.iterator(); it.hasNext();) {
Object key = it.next();
Object value = bufferTypes.getCount(key);

View file

@ -10,6 +10,8 @@ import java.util.List;
import java.util.Locale;
import com.ibm.icu.dev.test.util.BagFormatter;
import com.ibm.icu.dev.test.util.Tabber;
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
import com.ibm.icu.dev.tool.UOption;
import com.ibm.icu.text.SymbolTable;
import com.ibm.icu.text.UTF16;
@ -21,13 +23,15 @@ public class TestUnicodeInvariants {
private static final int
HELP1 = 0,
FILE = 1,
RANGE = 2
RANGE = 2,
TABLE = 3
;
private static final UOption[] options = {
UOption.HELP_H(),
UOption.create("file", 'f', UOption.REQUIRES_ARG),
UOption.create("range", 'r', UOption.NO_ARG),
UOption.create("norange", 'n', UOption.NO_ARG),
UOption.create("table", 't', UOption.NO_ARG),
};
public static void main(String[] args) throws IOException {
@ -35,7 +39,10 @@ public class TestUnicodeInvariants {
String file = "UnicodeInvariants.txt";
if (options[FILE].doesOccur) file = options[FILE].value;
boolean doRange = options[RANGE].doesOccur;
boolean doRange = !options[RANGE].doesOccur;
System.out.println("File:\t" + file);
System.out.println("Ranges?\t" + doRange);
System.out.println("HTML?\t" + options[TABLE].doesOccur);
testInvariants(file, doRange);
}
@ -92,11 +99,19 @@ public class TestUnicodeInvariants {
PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt");
out.write('\uFEFF'); // BOM
BufferedReader in = BagFormatter.openUTF8Reader("com/ibm/text/UCD/", outputFile);
BagFormatter bf = new BagFormatter();
bf.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
BagFormatter bf2 = new BagFormatter();
bf2.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
bf2.setMergeRanges(doRange);
BagFormatter errorLister = new BagFormatter();
errorLister.setMergeRanges(doRange);
errorLister.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
errorLister.setShowLiteral(TransliteratorUtilities.toXML);
if (options[TABLE].doesOccur) errorLister.setTabber(new Tabber.HTMLTabber());
BagFormatter showLister = new BagFormatter();
showLister.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
showLister.setMergeRanges(doRange);
showLister.setShowLiteral(TransliteratorUtilities.toXML);
if (options[TABLE].doesOccur) showLister.setTabber(new Tabber.HTMLTabber());
ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] {
ToolUnicodePropertySource.make(UCD.lastVersion).getSymbolTable("\u00D7"),
ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")});
@ -112,6 +127,7 @@ public class TestUnicodeInvariants {
int pos = line.indexOf('#');
if (pos >= 0) line = line.substring(0,pos).trim();
if (line.length() == 0) continue;
if (line.equalsIgnoreCase("Stop")) break;
// fix all the variables
String oldLine = line;
@ -133,12 +149,12 @@ public class TestUnicodeInvariants {
String part = line.substring(4).trim();
if (part.startsWith("Each")) {
part = part.substring(4).trim();
bf2.setMergeRanges(false);
showLister.setMergeRanges(false);
}
pp.setIndex(0);
UnicodeSet leftSet = new UnicodeSet(part, pp, st);
bf2.showSetNames(out, leftSet);
bf2.setMergeRanges(doRange);
showLister.showSetNames(out, leftSet);
showLister.setMergeRanges(doRange);
continue;
}
@ -210,7 +226,7 @@ public class TestUnicodeInvariants {
out.println();
out.println(String.valueOf(ok).toUpperCase(Locale.ENGLISH));
out.println("**** START Error Info ****");
bf.showSetDifferences(out, rightSide, rightSet, leftSide, leftSet);
errorLister.showSetDifferences(out, rightSide, rightSet, leftSide, leftSet);
out.println("**** END Error Info ****");
out.println();
testFailureCount++;

View file

@ -331,11 +331,10 @@ AC00..D7A3 # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
</ol>
<h3>5. UCA</h3>
<ol>
<li>
You will use com.ibm.text.UCA.Main as your main class, creating along
<li>You will use com.ibm.text.UCA.Main as your main class, creating along
the same lines as above.</li>
<li>To test whether the UCA files are valid, use the
<span style="font-weight: 400">options (<i>note: you should also build the ICU
<span style="font-weight: 400">options (<i>note: you must also build the ICU
files below, since they test other aspects</i>).</span><pre>writeCollationValidityLog</pre>
<p>It will create a file:</p>
<pre><a href="file:///C:/DATA/GEN/collation/5.0.0/CheckCollationValidity.html">C:\DATA\GEN\collation\5.0.0\CheckCollationValidity.html</a></pre>
@ -354,24 +353,45 @@ AC00..D7A3 # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
</ol></li>
</ol></li>
<li>
<h4><span style="font-weight: 400">To build all the charts, use the options:
</span> </h4>
<h4><span style="font-weight: 400">To build all the charts (including for
the UCA), use the options: </span></h4>
<pre>normalizationChart caseChart scriptChart indexChart</pre>
<p>They will be built into</p>
<pre><a href="file:///C:/DATA/GEN/charts">C:\DATA\GEN\charts</a></pre>
<p><b>Once UCA is released, copy those files to the corresponding locations
on the Unicode site:</b></p><ul>
<li>
<pre><a href="http://www.unicode.org/charts/normalization/">http://www.unicode.org/charts/normalization/</a></pre>
</li>
<li>
<pre><a href="http://www.unicode.org/charts/collation/">http://www.unicode.org/charts/collation/</a> </pre>
</li>
<li>
<pre><a href="http://www.unicode.org/charts/case/">http://www.unicode.org/charts/case/</a> </pre>
</li>
<li>
<pre><a href="http://www.unicode.org/charts/collation/">http://www.unicode.org/charts/collation/</a> </pre>
</li>
</ul>
</li>
<li>
<h4><span style="font-weight: 400">To build all the UCA files used by ICU, use the
option:</span></h4>
<pre>ICU</pre>
<p>They will be built into:</p>
<pre><a href="file:///C:/DATA/GEN/collation/5.0.0">C:\DATA\GEN\collation\5.0.0</a></pre>
</li>
<li>You should then build a set of the ICU files for the previous version,
if you don't have them. The key file is UCA_Rules_NoCE.txt. It contains the
rules expressed in ICU format, which allows for comparison across versions
of UCA.<ol>
<li>Do a Diff, and verify that all the differences are either new
characters, or were authorized to be changed by the UTC.</li>
</ol>
</li>
if you don't have them. Use the options:<pre>version 4.1.0 ICU</pre>
<p>(Substitute whatever the previous version actually was.)</p></li>
<li>Now, you will want to compare versions. The key file is
UCA_Rules_NoCE.txt. It contains the rules expressed in ICU format, which
allows for comparison across versions of UCA without spurious variations of
the numbers getting in the way.<ol>
<li>Do a Diff between the last and current versions of these files, and
verify that all the differences are either new characters, or were
authorized to be changed by the UTC.</li>
</ol></li>
</ol>
</body>