mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-5222 fixes for UnicodeTools (unconnected with rest of ICU4J)
X-SVN-Rev: 20400
This commit is contained in:
parent
fa66eb7a07
commit
690f5c528c
9 changed files with 765 additions and 199 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $
|
||||
* $Date: 2006/04/05 22:12:44 $
|
||||
* $Revision: 1.14 $
|
||||
* $Date: 2006/09/24 23:32:44 $
|
||||
* $Revision: 1.15 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -16,14 +16,17 @@ package com.ibm.text.UCD;
|
|||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
import org.unicode.cldr.util.Segmenter;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.dev.test.util.UnicodeProperty;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
abstract public class GenerateBreakTest implements UCD_Types {
|
||||
|
||||
static boolean DEBUG = false;
|
||||
static boolean DEBUG = true;
|
||||
static final boolean SHOW_TYPE = false;
|
||||
UCD ucd;
|
||||
Normalizer nfd;
|
||||
|
@ -122,7 +125,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
}
|
||||
|
||||
// quick & dirty routine
|
||||
String insertEverywhere(String source, String insertion, GenerateBreakTest breaker) {
|
||||
static String insertEverywhere(String source, String insertion, GenerateBreakTest breaker) {
|
||||
String result = insertion;
|
||||
for (int i = 0; i < source.length(); ++i) {
|
||||
result += source.charAt(i);
|
||||
|
@ -291,6 +294,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
private String[] ruleList = new String[100];
|
||||
private int ruleListCount = 0;
|
||||
protected boolean collectingRules = false;
|
||||
protected boolean needsFullBreakSample = true;
|
||||
|
||||
public void setRule(String rule) {
|
||||
if (collectingRules) {
|
||||
|
@ -330,6 +334,12 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
out.println("<h2>" + fileName + " Break Chart</h2>");
|
||||
out.println("<p><b>Unicode Version:</b> " + ucd.getVersion() + "</p>");
|
||||
out.println("<p><b>Date:</b> " + Default.getDate() + "</p>");
|
||||
out.println("<p>This page illustrates the application of the boundary specifications. " +
|
||||
"The first chart shows where breaks would appear between different sample characters or strings. " +
|
||||
"The sample characters are chosen mechanically to represent the different properties used by the specification. " +
|
||||
"Where properties used in the rules have 'overlaps', the samples are given 'composed' names. " +
|
||||
"For example, SentenceBreak uses GCLF_Sep: Sep is the SentenceBreak property, but it overlaps with the GraphemeClusterBreak property LF." +
|
||||
"</p>");
|
||||
generateTable(out);
|
||||
|
||||
|
||||
|
@ -485,8 +495,8 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
result.append(ucd.getCodeAndName(cp));
|
||||
result.append(", gc=" + ucd.getCategoryID_fromIndex(ucd.getCategory(cp),SHORT));
|
||||
result.append(", sc=" + ucd.getScriptID_fromIndex(ucd.getScript(cp),SHORT));
|
||||
result.append(", lb=" + ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp))
|
||||
+ "=" + ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp), LONG));
|
||||
//result.append(", lb=" + ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp))
|
||||
// + "=" + ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp), LONG));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
@ -560,19 +570,41 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
}
|
||||
|
||||
// gather the data for the rules
|
||||
if (needsFullBreakSample ) {
|
||||
collectingRules = true;
|
||||
isBreak(fullBreakSample(), 1);
|
||||
collectingRules = false;
|
||||
}
|
||||
|
||||
out.println("<h3>Rules</h3>");
|
||||
out.println("<ul>");
|
||||
out.println("<p>Due to the way they have been mechanically processed for generation, " +
|
||||
"the following rules do not match the UAX rules precisely. " +
|
||||
"In particular:</p>"+
|
||||
"<ol>" +
|
||||
"<li>The rules are cast into a more regex-style.</li>"+
|
||||
"<li>The rules \"sot ÷\", \"÷ eot\", and \"÷ Any\" are added mechanically, and have artificial numbers.</li>"+
|
||||
"<li>The rules are given decimal numbers, so rules such as 11a are given a number using tenths, such as 11.1.</li>"+
|
||||
"<li>Where a rule has multiple parts (lines), each one is numbered using hundredths, such as 21.01) × BA, 21.02) × HY,...</li>"+
|
||||
"<li>Any 'treat as' or 'ignore' rules are handled as discussed in Unicode Standard Annex #29, and thus" +
|
||||
"reflected in a transformation of the rules not visible here.</li>" +
|
||||
"</ol>" +
|
||||
"<p>For the original rules, see the UAX.</p>"
|
||||
|
||||
);
|
||||
out.println("<ul style='list-style-type: none'>");
|
||||
for (int ii = 0; ii < ruleListCount; ++ii) {
|
||||
out.println("<li>" + ruleList[ii] + "</li>");
|
||||
out.println("<li>" + ruleList[ii].replaceAll("[$]","") + "</li>");
|
||||
}
|
||||
out.println("</ul>");
|
||||
|
||||
if (extraSingleSamples.length > 0) {
|
||||
out.println("<h3>Sample Strings</h3>");
|
||||
out.println("<p>" +
|
||||
"The following samples illustrate the application of the rules. " +
|
||||
"The blue lines indicate possible break points. " +
|
||||
"If your browser supports titles, then positioning the mouse over each character will show its name, " +
|
||||
"white positioning between characters shows the rule number of the rule responsible for the break-status." +
|
||||
"</p>");
|
||||
out.println("<ol>");
|
||||
for (int ii = 0; ii < extraSingleSamples.length; ++ii) {
|
||||
out.println("<li><font size='5'>");
|
||||
|
@ -631,6 +663,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
|
||||
if (comments && !html) string.append(comment);
|
||||
out.println(string);
|
||||
if (DEBUG) System.out.println("*" + string);
|
||||
}
|
||||
|
||||
public void findSamples() {
|
||||
|
@ -642,7 +675,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
BitSet bitset = new BitSet();
|
||||
Map list = new TreeMap();
|
||||
|
||||
for (int i = 1; i <= 0x10FFFF; ++i) {
|
||||
for (int i = 1; i <= 0xFFFF; ++i) {
|
||||
if (!ucd.isAllocated(i)) continue;
|
||||
if (0xD800 <= i && i <= 0xDFFF) continue;
|
||||
if (DEBUG && i == 0x1100) {
|
||||
|
@ -657,6 +690,9 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
}
|
||||
|
||||
int combined = (mapType(lb) << 7) + mapType(lb2);
|
||||
if (combined < 0) {
|
||||
throw new IllegalArgumentException("should never happen");
|
||||
}
|
||||
if (!bitset.get(combined)) {
|
||||
bitset.set(combined);
|
||||
list.put(new Integer(combined), UTF16.valueOf(i));
|
||||
|
@ -777,10 +813,142 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
|
||||
|
||||
//==============================================
|
||||
|
||||
static class XGenerateBreakTest extends GenerateBreakTest {
|
||||
Segmenter seg;
|
||||
String sample;
|
||||
{
|
||||
needsFullBreakSample = false;
|
||||
}
|
||||
|
||||
public XGenerateBreakTest(UCD ucd, Segmenter.Builder segBuilder, String sample, String filename, String[] extraSamples, String[] extraSingleSamples) {
|
||||
super(ucd);
|
||||
this.seg = segBuilder.make();
|
||||
this.sample = sample;
|
||||
List rules = segBuilder.getRules();
|
||||
collectingRules = true;
|
||||
for (Iterator it = rules.iterator(); it.hasNext();) {
|
||||
String rule = (String)it.next();
|
||||
setRule(rule);
|
||||
}
|
||||
collectingRules = false;
|
||||
map.add("Other", new UnicodeSet(0,0x10FFFF));
|
||||
UnicodeMap segSamples = seg.getSamples();
|
||||
Collection x = segSamples.getAvailableValues();
|
||||
for (Iterator it = x.iterator(); it.hasNext();) {
|
||||
String label = (String)it.next();
|
||||
map.add(label, segSamples.getSet(label), true, false);
|
||||
}
|
||||
this.fileName = filename;
|
||||
sampleMap = map;
|
||||
this.extraSamples = extraSamples;
|
||||
this.extraSingleSamples = extraSingleSamples;
|
||||
}
|
||||
|
||||
static class GenerateGraphemeBreakTest extends GenerateBreakTest {
|
||||
public boolean isBreak(String source, int offset) {
|
||||
boolean result = seg.breaksAt(source, offset);
|
||||
setRule(String.valueOf(seg.getBreakRule()));
|
||||
return result;
|
||||
}
|
||||
|
||||
GenerateGraphemeBreakTest(UCD ucd) {
|
||||
public String fullBreakSample() {
|
||||
return sample;
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public String getTypeID(int cp) {
|
||||
return map.getLabel(cp);
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public byte getType(int cp) {
|
||||
return (byte) map.getIndex(cp);
|
||||
}
|
||||
}
|
||||
|
||||
static class GenerateGraphemeBreakTest extends XGenerateBreakTest {
|
||||
public GenerateGraphemeBreakTest(UCD ucd) {
|
||||
super(ucd, Segmenter.make(ToolUnicodePropertySource.make(ucd.getVersion()),"GraphemeClusterBreak"), "aa", "Grapheme",
|
||||
new String[]{}, new String[]{});
|
||||
}
|
||||
}
|
||||
|
||||
static class GenerateLineBreakTest extends XGenerateBreakTest {
|
||||
public GenerateLineBreakTest(UCD ucd) {
|
||||
super(ucd, Segmenter.make(ToolUnicodePropertySource.make(ucd.getVersion()),"LineBreak"), "aa", "Line",
|
||||
new String[]{}, new String[] {
|
||||
"can't", "can\u2019t", "ab\u00ADby",
|
||||
"-3",
|
||||
"e.g.",
|
||||
"\u4e00.\u4e00.",
|
||||
"a b",
|
||||
"a \u200bb",
|
||||
"a \u0308b",
|
||||
"1\u0308b(a)-(b)",
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static class GenerateSentenceBreakTest extends XGenerateBreakTest {
|
||||
public GenerateSentenceBreakTest(UCD ucd) {
|
||||
super(ucd, Segmenter.make(ToolUnicodePropertySource.make(ucd.getVersion()),"SentenceBreak"), "aa", "Sentence",
|
||||
new String[]{},
|
||||
getExtraSamples());
|
||||
}
|
||||
static String[] getExtraSamples() {
|
||||
GenerateBreakTest grapheme = new GenerateGraphemeBreakTest(Default.ucd());
|
||||
String[] extraSingleSamples = new String[] {
|
||||
"(\"Go.\") (He did.)",
|
||||
"(\u201CGo?\u201D) (He did.)",
|
||||
"U.S.A\u0300. is",
|
||||
"U.S.A\u0300? He",
|
||||
"U.S.A\u0300.",
|
||||
"3.4",
|
||||
"c.d",
|
||||
"etc.)\u2019 \u2018(the",
|
||||
"etc.)\u2019 \u2018(The",
|
||||
"the resp. leaders are",
|
||||
"\u5B57.\u5B57",
|
||||
"etc.\u5B83",
|
||||
"etc.\u3002",
|
||||
"\u5B57\u3002\u5B83",
|
||||
};
|
||||
String[] temp = new String [extraSingleSamples.length * 2];
|
||||
System.arraycopy(extraSingleSamples, 0, temp, 0, extraSingleSamples.length);
|
||||
for (int i = 0; i < extraSingleSamples.length; ++i) {
|
||||
temp[i+extraSingleSamples.length] = insertEverywhere(extraSingleSamples[i], "\u2060", grapheme);
|
||||
}
|
||||
extraSingleSamples = temp;
|
||||
return extraSingleSamples;
|
||||
}
|
||||
}
|
||||
|
||||
static class GenerateWordBreakTest extends XGenerateBreakTest {
|
||||
public GenerateWordBreakTest(UCD ucd) {
|
||||
super(ucd, Segmenter.make(ToolUnicodePropertySource.make(ucd.getVersion()),"WordBreak"), "aa", "Word",
|
||||
new String[] {
|
||||
/*"\uFF70", "\uFF65", "\u30FD", */ "a\u2060", "a:", "a'", "a'\u2060", "a,", "1:", "1'", "1,", "1.\u2060"
|
||||
},
|
||||
|
||||
|
||||
getExtraSamples());
|
||||
}
|
||||
static String[] getExtraSamples() {
|
||||
GenerateBreakTest grapheme = new GenerateGraphemeBreakTest(Default.ucd());
|
||||
String [] temp = {"can't", "can\u2019t", "ab\u00ADby", "a$-34,567.14%b", "3a" };
|
||||
String[] extraSingleSamples = new String [temp.length * 2];
|
||||
System.arraycopy(temp, 0, extraSingleSamples, 0, temp.length);
|
||||
for (int i = 0; i < temp.length; ++i) {
|
||||
extraSingleSamples[i+temp.length] = insertEverywhere(temp[i], "\u2060", grapheme);
|
||||
}
|
||||
|
||||
return extraSingleSamples;
|
||||
}
|
||||
}
|
||||
|
||||
static class OLDGenerateGraphemeBreakTest extends GenerateBreakTest {
|
||||
|
||||
OLDGenerateGraphemeBreakTest(UCD ucd) {
|
||||
super(ucd);
|
||||
fileName = "Grapheme";
|
||||
sampleMap = map;
|
||||
|
@ -866,13 +1034,13 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
|
||||
//==============================================
|
||||
|
||||
static class GenerateWordBreakTest extends GenerateBreakTest {
|
||||
static class XGenerateWordBreakTest extends GenerateBreakTest {
|
||||
|
||||
GenerateGraphemeBreakTest grapheme;
|
||||
MyBreakIterator breaker;
|
||||
Context context = new Context();
|
||||
|
||||
GenerateWordBreakTest(UCD ucd) {
|
||||
XGenerateWordBreakTest(UCD ucd) {
|
||||
super(ucd);
|
||||
grapheme = new GenerateGraphemeBreakTest(ucd);
|
||||
breaker = new MyBreakIterator(grapheme);
|
||||
|
@ -1017,13 +1185,13 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
|
||||
// ========================================
|
||||
|
||||
static class GenerateLineBreakTest extends GenerateBreakTest {
|
||||
static class XGenerateLineBreakTest extends GenerateBreakTest {
|
||||
|
||||
GenerateGraphemeBreakTest grapheme;
|
||||
MyBreakIterator breaker;
|
||||
Context context = new Context();
|
||||
|
||||
GenerateLineBreakTest(UCD ucd) {
|
||||
XGenerateLineBreakTest(UCD ucd) {
|
||||
super(ucd);
|
||||
grapheme = new GenerateGraphemeBreakTest(ucd);
|
||||
breaker = new MyBreakIterator(grapheme);
|
||||
|
@ -1505,12 +1673,12 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
|
||||
//==============================================
|
||||
|
||||
static class GenerateSentenceBreakTest extends GenerateBreakTest {
|
||||
static class XGenerateSentenceBreakTest extends GenerateBreakTest {
|
||||
|
||||
GenerateGraphemeBreakTest grapheme;
|
||||
MyBreakIterator breaker;
|
||||
|
||||
GenerateSentenceBreakTest(UCD ucd) {
|
||||
XGenerateSentenceBreakTest(UCD ucd) {
|
||||
super(ucd);
|
||||
grapheme = new GenerateGraphemeBreakTest(ucd);
|
||||
breaker = new MyBreakIterator(grapheme);
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateConfusables.java,v $
|
||||
* $Date: 2006/06/09 21:21:20 $
|
||||
* $Revision: 1.11 $
|
||||
* $Date: 2006/09/24 23:32:44 $
|
||||
* $Revision: 1.12 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -19,6 +19,7 @@ import java.io.IOException;
|
|||
import java.io.PrintWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
|
@ -37,16 +38,21 @@ import com.ibm.icu.dev.test.util.UnicodeProperty;
|
|||
import com.ibm.icu.dev.test.util.XEquivalenceClass;
|
||||
import com.ibm.icu.impl.CollectionUtilities;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.Collator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import com.ibm.text.utility.Utility;
|
||||
|
||||
|
||||
public class GenerateConfusables {
|
||||
public static String version = "2.0";
|
||||
public static boolean EXCLUDE_CONFUSABLE_COMPAT = true;
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
quickTest();
|
||||
|
||||
Set arg2 = new HashSet(Arrays.asList(args));
|
||||
try {
|
||||
if (arg2.contains("-b")) generateIDN();
|
||||
|
@ -59,6 +65,19 @@ public class GenerateConfusables {
|
|||
System.out.println("Done");
|
||||
}
|
||||
}
|
||||
|
||||
private static void quickTest() {
|
||||
int script = getSingleScript("\u0430\u0061");
|
||||
script = getSingleScript("\u0061\u0430"); //0323 ; 093C
|
||||
String a = "\u0323";
|
||||
String b = "\u093C";
|
||||
int isLess = betterTargetIsLess.compare(a, b); // ("\u0045", "\u13AC");
|
||||
MyEquivalenceClass test = new MyEquivalenceClass();
|
||||
test.add(a, b, "none");
|
||||
Set x = test.getEquivalences(a);
|
||||
String result = (String) CollectionUtilities.getBest(x, betterTargetIsLess, -1);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
|
@ -82,32 +101,34 @@ public class GenerateConfusables {
|
|||
_Non_IICore.removeAll(um.getSet("2.1"));
|
||||
|
||||
// add Chinese?
|
||||
UnicodeSet cjk_nic = new UnicodeSet();
|
||||
String line = null;
|
||||
try {
|
||||
BufferedReader br = BagFormatter.openUTF8Reader(indir, "cjk_nic.txt");
|
||||
while (true) {
|
||||
line = Utility.readDataLine(br);
|
||||
if (line == null) break;
|
||||
if (line.length() == 0) continue;
|
||||
String[] pieces = Utility.split(line, ';');
|
||||
// part 0 is range
|
||||
String range = pieces[0].trim();
|
||||
int rangeDivider = range.indexOf("..");
|
||||
int start, end;
|
||||
if (rangeDivider < 0) {
|
||||
start = end = Integer.parseInt(range, 16);
|
||||
} else {
|
||||
start = Integer.parseInt(range.substring(0, rangeDivider), 16);
|
||||
end = Integer.parseInt(range.substring(rangeDivider+2), 16);
|
||||
}
|
||||
cjk_nic.add(start, end);
|
||||
}
|
||||
br.close();
|
||||
} catch (Exception e) {
|
||||
throw (RuntimeException) new RuntimeException("Failure on line " + line).initCause(e);
|
||||
}
|
||||
_Non_IICore.removeAll(cjk_nic);
|
||||
if (true) {
|
||||
UnicodeSet cjk_nic = new UnicodeSet();
|
||||
String line = null;
|
||||
try {
|
||||
BufferedReader br = BagFormatter.openUTF8Reader(indir, "cjk_nic.txt");
|
||||
while (true) {
|
||||
line = Utility.readDataLine(br);
|
||||
if (line == null) break;
|
||||
if (line.length() == 0) continue;
|
||||
String[] pieces = Utility.split(line, ';');
|
||||
// part 0 is range
|
||||
String range = pieces[0].trim();
|
||||
int rangeDivider = range.indexOf("..");
|
||||
int start, end;
|
||||
if (rangeDivider < 0) {
|
||||
start = end = Integer.parseInt(range, 16);
|
||||
} else {
|
||||
start = Integer.parseInt(range.substring(0, rangeDivider), 16);
|
||||
end = Integer.parseInt(range.substring(rangeDivider+2), 16);
|
||||
}
|
||||
cjk_nic.add(start, end);
|
||||
}
|
||||
br.close();
|
||||
} catch (Exception e) {
|
||||
throw (RuntimeException) new RuntimeException("Failure on line " + line).initCause(e);
|
||||
}
|
||||
_Non_IICore.removeAll(cjk_nic);
|
||||
}
|
||||
}
|
||||
return _Non_IICore;
|
||||
// for (Iterator it = um.getAvailableValues().iterator(); it.hasNext();) {
|
||||
|
@ -118,7 +139,7 @@ public class GenerateConfusables {
|
|||
}
|
||||
|
||||
static PrintWriter log;
|
||||
static final String ARROW = "\u2192";
|
||||
static final String ARROW = "\u2192"; // \u2194
|
||||
static UnicodeProperty.Factory ups = ToolUnicodePropertySource.make(""); // ICUPropertyFactory.make();
|
||||
static UnicodeSet UNASSIGNED = ups.getSet("gc=Cn")
|
||||
.addAll(ups.getSet("gc=Co"))
|
||||
|
@ -131,12 +152,14 @@ public class GenerateConfusables {
|
|||
static UnicodeSet _skipNFKD;
|
||||
|
||||
static Map gatheredNFKD = new TreeMap();
|
||||
static UnicodeMap nfcMap = new UnicodeMap();
|
||||
static UnicodeMap nfcMap;
|
||||
static UnicodeMap nfkcMap;
|
||||
|
||||
static String indir = "C:\\Unicode-CVS2\\draft\\reports\\tr36\\data\\source\\";
|
||||
static String outdir = "C:\\Unicode-CVS2\\draft\\reports\\tr36\\data\\";
|
||||
static String indir = "C:\\cvsdata\\unicode\\draft\\reports\\tr36\\data\\source\\";
|
||||
static String outdir = "C:\\cvsdata\\unicode\\draft\\reports\\tr36\\data\\";
|
||||
|
||||
static Comparator codepointComparator = new UTF16.StringComparator(true,false,0);
|
||||
static Comparator UCAComparator = new CollectionUtilities.MultiComparator(new Comparator[] {Collator.getInstance(ULocale.ROOT), codepointComparator});
|
||||
|
||||
static UnicodeSet setsToAbbreviate = new UnicodeSet("[" +
|
||||
"\\u3400-\\u4DB5" +
|
||||
|
@ -208,23 +231,35 @@ public class GenerateConfusables {
|
|||
|
||||
private UnicodeMap additions = new UnicodeMap(), remap = new UnicodeMap(), removals = new UnicodeMap(),
|
||||
reviews, removals2, lowerIsBetter;
|
||||
|
||||
private UnicodeSet isCaseFolded;
|
||||
|
||||
private IdentifierInfo() throws IOException {
|
||||
propNFKCSet = ups.getSet("NFKC_QuickCheck=N")
|
||||
.complement();
|
||||
isCaseFolded = new UnicodeSet();
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
int cat = Default.ucd().getCategory(cp);
|
||||
if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
|
||||
String source = UTF16.valueOf(cp);
|
||||
String cf = Default.ucd().getCase(source, UCD.FULL, UCD.FOLD);
|
||||
if (cf.equals(source)) isCaseFolded.add(cp);
|
||||
}
|
||||
|
||||
propNFKCSet = ups.getSet("NFKC_QuickCheck=N").complement();
|
||||
UnicodeSet propXIDContinueSet = ups.getSet("XID_Continue=TRUE");
|
||||
|
||||
//removals.putAll(propNFKCSet.complement(), PROHIBITED + "compat variant");
|
||||
loadFileData();
|
||||
xidPlus = new UnicodeSet(propXIDContinueSet).addAll(
|
||||
additions.getSet(null).complement()).retainAll(propNFKCSet);
|
||||
xidPlus = new UnicodeSet(propXIDContinueSet).addAll(additions.keySet()).retainAll(propNFKCSet);
|
||||
|
||||
getIdentifierSet();
|
||||
notInXID = new UnicodeSet(IDNOutputSet).removeAll(xidPlus);
|
||||
removals.putAll(notInXID, PROHIBITED + NOT_IN_XID);
|
||||
removalSet = removals.getSet(null).complement();
|
||||
//UnicodeSet notNfkcXid = new UnicodeSet(xidPlus).removeAll(removals.keySet()).removeAll(propNFKCSet);
|
||||
//removals.putAll(notNfkcXid, PROHIBITED + "compat variant");
|
||||
removalSet = removals.keySet();
|
||||
|
||||
remainingOutputSet = new UnicodeSet(IDNOutputSet)
|
||||
.removeAll(removalSet);
|
||||
remainingOutputSet = new UnicodeSet(IDNOutputSet).removeAll(removalSet);
|
||||
|
||||
UnicodeSet remainingInputSet1 = new UnicodeSet(IDNInputSet)
|
||||
.removeAll(removalSet).removeAll(remainingOutputSet);
|
||||
|
@ -234,9 +269,9 @@ public class GenerateConfusables {
|
|||
// the output set
|
||||
for (UnicodeSetIterator usi = new UnicodeSetIterator(
|
||||
remainingInputSet1); usi.next();) {
|
||||
String nss = Default.nfkc().normalize(usi.getString());
|
||||
String nss = getModifiedNKFC(usi.getString());
|
||||
String cf = Default.ucd().getCase(nss, UCD.FULL, UCD.FOLD);
|
||||
String cf2 = Default.nfkc().normalize(cf);
|
||||
String cf2 = getModifiedNKFC(cf);
|
||||
if (remainingOutputSet.containsAll(cf2))
|
||||
remainingInputSet.add(usi.codepoint);
|
||||
else
|
||||
|
@ -247,7 +282,7 @@ public class GenerateConfusables {
|
|||
for (UnicodeSetIterator usi = new UnicodeSetIterator(
|
||||
remainingInputSet); usi.next();) {
|
||||
String ss = usi.getString();
|
||||
String nss = Default.nfkc().normalize(ss);
|
||||
String nss = getModifiedNKFC(ss);
|
||||
String cf = Default.ucd().getCase(ss, UCD.FULL, UCD.FOLD);
|
||||
if (usi.codepoint == 0x2126 || usi.codepoint == 0x212B) {
|
||||
System.out.println("check");
|
||||
|
@ -395,7 +430,7 @@ public class GenerateConfusables {
|
|||
throw (RuntimeException) new RuntimeException(
|
||||
"Failure on line " + line).initCause(e);
|
||||
}
|
||||
removals.putAll(getNonIICore(), "~IICore");
|
||||
removals.putAll(getNonIICore(), PROHIBITED + "~IICore");
|
||||
br.close();
|
||||
}
|
||||
|
||||
|
@ -417,13 +452,14 @@ public class GenerateConfusables {
|
|||
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
|
||||
bf.setMergeRanges(true);
|
||||
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "review.txt");
|
||||
PrintWriter out = openAndWriteHeader("review.txt", "Review List for IDN");
|
||||
// PrintWriter out = BagFormatter.openUTF8Writer(outdir, "review.txt");
|
||||
//reviews.putAll(UNASSIGNED, "");
|
||||
out.print("\uFEFF");
|
||||
out.println("# Review List for IDN");
|
||||
out.println("# $Revision: 1.11 $");
|
||||
out.println("# $Date: 2006/06/09 21:21:20 $");
|
||||
out.println("");
|
||||
// out.print("\uFEFF");
|
||||
// out.println("# Review List for IDN");
|
||||
// out.println("# $Revision: 1.12 $");
|
||||
// out.println("# $Date: 2006/09/24 23:32:44 $");
|
||||
// out.println("");
|
||||
|
||||
UnicodeSet fullSet = reviews.getSet("").complement();
|
||||
|
||||
|
@ -474,19 +510,15 @@ public class GenerateConfusables {
|
|||
|
||||
UnicodeSet letters = new UnicodeSet("[[:Alphabetic:][:Mark:][:Nd:]]");
|
||||
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(outdir, "idnchars.txt");
|
||||
PrintWriter out = openAndWriteHeader("idnchars.txt", "Recommended Identifier Profiles for IDN");
|
||||
|
||||
out.println("# Recommended Identifier Profiles for IDN");
|
||||
out.println("# $Revision: 1.11 $");
|
||||
out.println("# $Date: 2006/06/09 21:21:20 $");
|
||||
|
||||
out.println("");
|
||||
out.println("# Output Characters");
|
||||
out.println("# Allowed as output characters");
|
||||
out.println("");
|
||||
bf.setValueSource("output");
|
||||
bf.showSetNames(out, remainingOutputSet);
|
||||
showExtras(bf, remainingOutputSet, letters);
|
||||
|
||||
/*
|
||||
out.println("");
|
||||
|
||||
out.println("");
|
||||
|
@ -502,10 +534,10 @@ public class GenerateConfusables {
|
|||
bf.setValueSource("input-lenient");
|
||||
bf.showSetNames(out, inputSet_lenient);
|
||||
showExtras(bf, inputSet_lenient, letters);
|
||||
|
||||
*/
|
||||
|
||||
out.println("");
|
||||
out
|
||||
.println("# Not allowed at start of identifier");
|
||||
out.println("# Not allowed at start of identifier");
|
||||
out.println("");
|
||||
bf.setValueSource("nonstarting");
|
||||
bf.showSetNames(out, nonstarting);
|
||||
|
@ -517,6 +549,7 @@ public class GenerateConfusables {
|
|||
out.close();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
|
@ -543,13 +576,14 @@ public class GenerateConfusables {
|
|||
bf.setShowLiteral(TransliteratorUtilities.toHTMLControl);
|
||||
bf.setMergeRanges(true);
|
||||
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(outdir,
|
||||
"xidmodifications.txt");
|
||||
PrintWriter out = openAndWriteHeader("xidmodifications.txt", "Security Profile for General Identifiers");
|
||||
/* PrintWriter out = BagFormatter.openUTF8Writer(outdir, "xidmodifications.txt");
|
||||
|
||||
out.println("# Security Profile for General Identifiers");
|
||||
out.println("# $Revision: 1.11 $");
|
||||
out.println("# $Date: 2006/06/09 21:21:20 $");
|
||||
out.println("");
|
||||
out.println("# $Revision: 1.12 $");
|
||||
out.println("# $Date: 2006/09/24 23:32:44 $");
|
||||
*/
|
||||
|
||||
|
||||
out.println("# Characters restricted");
|
||||
out.println("");
|
||||
|
@ -567,11 +601,26 @@ public class GenerateConfusables {
|
|||
out.println("# Characters added");
|
||||
out.println("");
|
||||
bf.setValueSource("addition");
|
||||
bf.showSetNames(out, additions.getSet(null).complement());
|
||||
bf.showSetNames(out, additions.keySet());
|
||||
|
||||
//showRemapped(out, "Characters remapped on input", remap);
|
||||
|
||||
out.close();
|
||||
|
||||
out = openAndWriteHeader("xidAllowed.txt", "Security Profile for General Identifiers");
|
||||
UnicodeSet allowed = new UnicodeSet(xidPlus).removeAll(removals.keySet());
|
||||
UnicodeSet cfAllowed = new UnicodeSet().addAll(allowed).retainAll(isCaseFolded).retainAll(propNFKCSet);
|
||||
allowed.removeAll(cfAllowed);
|
||||
bf.setValueSource("case_folded");
|
||||
out.println("# XID characters allowed (no uppercase)");
|
||||
out.println("");
|
||||
bf.showSetNames(out, cfAllowed);
|
||||
bf.setValueSource("not_case_folded");
|
||||
out.println("");
|
||||
out.println("# XID characters allowed (uppercase)");
|
||||
out.println("");
|
||||
bf.showSetNames(out, allowed);
|
||||
out.close();
|
||||
|
||||
UnicodeMap someRemovals = new UnicodeMap();
|
||||
UnicodeMap.Composer myComposer = new UnicodeMap.Composer() {
|
||||
|
@ -604,8 +653,8 @@ public class GenerateConfusables {
|
|||
//someRemovals = removals;
|
||||
out = BagFormatter.openUTF8Writer(outdir, "draft-restrictions.txt");
|
||||
out.println("# Characters restricted in domain names");
|
||||
out.println("# $Revision: 1.11 $");
|
||||
out.println("# $Date: 2006/06/09 21:21:20 $");
|
||||
out.println("# $Revision: 1.12 $");
|
||||
out.println("# $Date: 2006/09/24 23:32:44 $");
|
||||
out.println("#");
|
||||
out.println("# This file contains a draft list of characters for use in");
|
||||
out.println("# UTR #36: Unicode Security Considerations");
|
||||
|
@ -646,7 +695,7 @@ public class GenerateConfusables {
|
|||
bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() {
|
||||
}).set(someRemovals).setMain("Removals", "GCB",
|
||||
UnicodeProperty.ENUMERATED, "1.0"));
|
||||
bf.showSetNames(out, someRemovals.getSet(null).complement());
|
||||
bf.showSetNames(out, someRemovals.keySet());
|
||||
}
|
||||
out.close();
|
||||
}
|
||||
|
@ -654,6 +703,7 @@ public class GenerateConfusables {
|
|||
|
||||
static final String PROHIBITED = "restricted ; ";
|
||||
static final String NOT_IN_XID = "not in XID+";
|
||||
public static final boolean suppress_NFKC = true;
|
||||
/**
|
||||
*
|
||||
*/
|
||||
|
@ -674,7 +724,7 @@ public class GenerateConfusables {
|
|||
out.println("");
|
||||
for (UnicodeSetIterator usi = new UnicodeSetIterator(s); usi.next();) {
|
||||
String source = usi.getString();
|
||||
String target = Default.nfkc().normalize(source);
|
||||
String target = getModifiedNKFC(source);
|
||||
writeSourceTargetLine(out, source, null, target, value);
|
||||
}
|
||||
//bf.showSetNames(out, s);
|
||||
|
@ -712,7 +762,7 @@ public class GenerateConfusables {
|
|||
out.println("# " + title);
|
||||
out.println("");
|
||||
int count = 0;
|
||||
for (UnicodeSetIterator usi = new UnicodeSetIterator(remap.getSet(null).complement()); usi.next();) {
|
||||
for (UnicodeSetIterator usi = new UnicodeSetIterator(remap.keySet()); usi.next();) {
|
||||
writeSourceTargetLine(out, usi.getString(), "remap-to", (String)remap.getValue(usi.codepoint), null);
|
||||
count++;
|
||||
}
|
||||
|
@ -747,6 +797,8 @@ public class GenerateConfusables {
|
|||
}
|
||||
|
||||
private static UnicodeSet getSkipNFKD() {
|
||||
nfcMap = new UnicodeMap();
|
||||
nfkcMap = new UnicodeMap();
|
||||
if (_skipNFKD == null) {
|
||||
_skipNFKD = new UnicodeSet();
|
||||
UnicodeSet idSet = getIdentifierSet();
|
||||
|
@ -755,6 +807,8 @@ public class GenerateConfusables {
|
|||
int cat = Default.ucd().getCategory(cp);
|
||||
if (cat == UCD.Cn || cat == UCD.Co || cat == UCD.Cs) continue;
|
||||
int decompType = Default.ucd().getDecompositionType(cp);
|
||||
String nfc = Default.nfc().normalize(cp);
|
||||
if (decompType == UCD.CANONICAL) nfcMap.put(cp, nfc);
|
||||
if (decompType == UCD.COMPAT_CIRCLE
|
||||
|| decompType == UCD.COMPAT_SUPER
|
||||
|| decompType == UCD.COMPAT_SUB
|
||||
|
@ -765,42 +819,58 @@ public class GenerateConfusables {
|
|||
_skipNFKD.add(cp);
|
||||
continue;
|
||||
}
|
||||
String source = UTF16.valueOf(cp);
|
||||
String mapped = Default.nfkd().normalize(cp);
|
||||
if (mapped.equals(UTF16.valueOf(cp))) continue;
|
||||
String kmapped = getModifiedNKFC(source);
|
||||
if (!kmapped.equals(source) && !kmapped.equals(nfc)) {
|
||||
if (kmapped.startsWith(" ") || kmapped.startsWith("\u0640")) {
|
||||
System.out.println("?? " + Default.ucd().getCodeAndName(cp));
|
||||
System.out.println("\t" + Default.ucd().getCodeAndName(kmapped));
|
||||
kmapped = getModifiedNKFC(source); // for debugging
|
||||
}
|
||||
nfkcMap.put(cp,kmapped);
|
||||
}
|
||||
if (mapped.equals(source)) continue;
|
||||
if (idSet.contains(cp) && !idSet.contains(mapped)) _skipNFKD.add(cp);
|
||||
else if (!whiteSpace.contains(cp) && whiteSpace.containsSome(mapped)) _skipNFKD.add(cp);
|
||||
if (decompType == UCD.CANONICAL) nfcMap.put(cp, Default.nfd().normalize(cp));
|
||||
}
|
||||
}
|
||||
nfcMap.setMissing("");
|
||||
nfcMap.setMissing("");
|
||||
nfcMap.freeze();
|
||||
nfkcMap.setMissing("");
|
||||
nfkcMap.freeze();
|
||||
return _skipNFKD;
|
||||
}
|
||||
|
||||
private static boolean isMixedScript(String source) {
|
||||
return getSingleScript(source) != UScript.INVALID_CODE;
|
||||
return getSingleScript(source) == UScript.INVALID_CODE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns UScript.INVALID_CODE if mixed script, otherwise the script
|
||||
*/
|
||||
public static int getSingleScript(String source) {
|
||||
int lastScript = UScript.INVALID_CODE;
|
||||
int cp;
|
||||
for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(source, i);
|
||||
int script = UScript.getScript(cp);
|
||||
if (script == UScript.COMMON || script == UScript.INHERITED) {
|
||||
if (XIDContinueSet.contains(cp)) {
|
||||
if (lastScript == UScript.INVALID_CODE) lastScript = script;
|
||||
continue; // skip if not identifier
|
||||
}
|
||||
script = UScript.COMMON;
|
||||
}
|
||||
if (lastScript == UScript.INVALID_CODE) lastScript = script;
|
||||
else if (script != lastScript) return UScript.INVALID_CODE;
|
||||
/**
|
||||
* Returns the script of the input text. Script values of COMMON and INHERITED are ignored.
|
||||
* @param source Input text.
|
||||
* @return Script value found in the text.
|
||||
* If more than one script values are found, then UScript.INVALID_CODE is returned.
|
||||
* If no script value is found (other than COMMON or INHERITED), then UScript.COMMON is returned.
|
||||
*/
|
||||
public static int getSingleScript(String source) {
|
||||
if (source.length() == 0) return UScript.COMMON;
|
||||
int lastScript = UScript.COMMON; // temporary value
|
||||
int cp;
|
||||
for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(source, i);
|
||||
int script = UScript.getScript(cp);
|
||||
if (script == UScript.COMMON || script == UScript.INHERITED) {
|
||||
continue;
|
||||
}
|
||||
return lastScript;
|
||||
if (lastScript == UScript.COMMON) {
|
||||
lastScript = script;
|
||||
} else if (script != lastScript) {
|
||||
return UScript.INVALID_CODE;
|
||||
}
|
||||
}
|
||||
return lastScript;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
|
@ -856,8 +926,9 @@ public class GenerateConfusables {
|
|||
+ " ;\t" + Utility.hex(target)
|
||||
+ (tag == null ? "" : " ;\t" + tag)
|
||||
//+ " ;\t" + (preferredID.contains(source) ? "ID" : "")
|
||||
+ "\t# "
|
||||
+ "( " + source + " " + ARROW + " " + target + ") "
|
||||
+ "\t#"
|
||||
+ (isXid(source) ? "" : "*")
|
||||
+ " ( " + source + " " + ARROW + " " + target + " ) "
|
||||
+ Default.ucd().getName(source) + " " + ARROW + " "
|
||||
+ Default.ucd().getName(target)
|
||||
);
|
||||
|
@ -992,18 +1063,45 @@ public class GenerateConfusables {
|
|||
for (int i = 0; i < item.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(item, i);
|
||||
String cps = UTF16.valueOf(cp);
|
||||
String mapped = getParadigm(cps);
|
||||
String mapped = getParadigm(cps, false, false);
|
||||
if (mapped.indexOf(cps) >= 0) result.append(cps);
|
||||
else {
|
||||
result.append(mapped);
|
||||
reasons.append("[" + getReasons(cps, mapped) + "]");
|
||||
List x = getReasons(cps, mapped);
|
||||
reasons.append(getBestForm(x));
|
||||
}
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
public String getParadigm(Object item) {
|
||||
return (String) CollectionUtilities.getBest(getEquivalences(item), betterTargetIsLess, -1);
|
||||
private Object getBestForm(Collection x) {
|
||||
if (x.size() != 1) return "[" + x + "]";
|
||||
Object item = x.iterator().next();
|
||||
if (!(item instanceof Collection)) return x.toString();
|
||||
return getBestForm((Collection)item);
|
||||
}
|
||||
|
||||
public String getParadigm(String item, boolean onlyLowercase, boolean onlySameScript) {
|
||||
Set filteredSet;
|
||||
if (onlyLowercase == false && onlySameScript == false) {
|
||||
filteredSet = getEquivalences(item);
|
||||
} else {
|
||||
filteredSet = new HashSet();
|
||||
for (Iterator it = getEquivalences(item).iterator(); it.hasNext();) {
|
||||
String other = (String) it.next();
|
||||
String combined = item + other;
|
||||
if (onlyLowercase) {
|
||||
boolean isLowercase = combined.equals(Default.ucd().getCase(combined, UCD.FULL, UCD.FOLD));
|
||||
if (!isLowercase) continue;
|
||||
}
|
||||
if (onlySameScript) {
|
||||
boolean isMixed = isMixedScript(combined);
|
||||
if (isMixed) continue;
|
||||
}
|
||||
filteredSet.add(other);
|
||||
}
|
||||
}
|
||||
return (String) CollectionUtilities.getBest(filteredSet, betterTargetIsLess, -1);
|
||||
}
|
||||
|
||||
public Set getOrderedExplicitItems() {
|
||||
|
@ -1057,12 +1155,21 @@ public class GenerateConfusables {
|
|||
type += ":" + lineCount;
|
||||
|
||||
String combined = source + target;
|
||||
if (combined.indexOf("\u0430") >= 0) {
|
||||
System.out.println(Default.ucd().getCodeAndName(combined));
|
||||
}
|
||||
boolean isLowercase = combined.equals(Default.ucd().getCase(combined, UCD.FULL, UCD.FOLD));
|
||||
boolean isMixed = isMixedScript(combined);
|
||||
dataMixedAnycase.add(source, target, type);
|
||||
if (isLowercase) dataMixedLowercase.add(source, target, type);
|
||||
if (!isMixed) dataSingleAnycase.add(source, target, type);
|
||||
if (!isMixed && isLowercase) dataSingleLowercase.add(source, target, type);
|
||||
if (isLowercase) {
|
||||
dataMixedLowercase.add(source, target, type);
|
||||
}
|
||||
if (!isMixed) {
|
||||
dataSingleAnycase.add(source, target, type);
|
||||
}
|
||||
if (!isMixed && isLowercase) {
|
||||
dataSingleLowercase.add(source, target, type);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -1124,7 +1231,13 @@ public class GenerateConfusables {
|
|||
String source = Utility.fromHex(pieces[0].trim(),true);
|
||||
String target = Utility.fromHex(pieces[1].trim(),true);
|
||||
//if (pieces.length > 2) type = pieces[2].trim();
|
||||
add(source, target, type, count, line);
|
||||
String nfkdSource = Default.nfkd().normalize(source);
|
||||
String nfkdTarget = Default.nfkd().normalize(target);
|
||||
if (suppress_NFKC && nfkdSource.equals(nfkdTarget)) {
|
||||
System.out.println("Suppressing nfkc for: " + Default.ucd().getCodeAndName(source));
|
||||
} else {
|
||||
add(source, target, type, count, line);
|
||||
}
|
||||
}
|
||||
}
|
||||
in.close();
|
||||
|
@ -1137,39 +1250,49 @@ public class GenerateConfusables {
|
|||
}
|
||||
|
||||
public void writeSource(String directory, String filename) throws IOException {
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
|
||||
out.println("# Source File for IDN Confusables");
|
||||
out.println("# $Revision: 1.11 $");
|
||||
out.println("# $Date: 2006/06/09 21:21:20 $");
|
||||
out.println("");
|
||||
PrintWriter out = openAndWriteHeader(filename, "Source File for IDN Confusables");
|
||||
// PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
|
||||
// out.println("# Source File for IDN Confusables");
|
||||
// out.println("# $Revision: 1.12 $");
|
||||
// out.println("# $Date: 2006/09/24 23:32:44 $");
|
||||
// out.println("");
|
||||
dataMixedAnycase.writeSource(out);
|
||||
out.close();
|
||||
}
|
||||
|
||||
public void writeSourceOrder(String directory, String filename, boolean appendFile, boolean skipNFKEquivs) throws IOException {
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
|
||||
out.print('\uFEFF');
|
||||
out.println("# Recommended confusable mapping for IDN");
|
||||
out.println("# $Revision: 1.11 $");
|
||||
out.println("# $Date: 2006/06/09 21:21:20 $");
|
||||
out.println("");
|
||||
PrintWriter out = openAndWriteHeader(filename, "Recommended confusable mapping for IDN");
|
||||
// PrintWriter out = BagFormatter.openUTF8Writer(directory, filename);
|
||||
// out.println("# Recommended confusable mapping for IDN");
|
||||
// out.println("# $Revision: 1.12 $");
|
||||
// out.println("# $Date: 2006/09/24 23:32:44 $");
|
||||
// out.println("");
|
||||
|
||||
if (appendFile) {
|
||||
String[] replacements = {"%date%", Default.getDate()};
|
||||
Utility.appendFile("com/ibm/text/UCD/confusablesHeader.txt",
|
||||
Utility.UTF8_WINDOWS, out, replacements);
|
||||
}
|
||||
writeSourceOrder(out, dataSingleLowercase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs);
|
||||
writeSourceOrder(out, dataSingleAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs);
|
||||
writeSourceOrder(out, dataMixedLowercase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs);
|
||||
writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs);
|
||||
if (true) {
|
||||
writeSourceOrder(out, dataMixedAnycase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs, true, true);
|
||||
writeSourceOrder(out, dataMixedAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs, false, true);
|
||||
writeSourceOrder(out, dataMixedAnycase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs, true, false);
|
||||
writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs, false, false);
|
||||
} else {
|
||||
writeSourceOrder(out, dataSingleLowercase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs, false, false);
|
||||
writeSourceOrder(out, dataSingleAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs, false, false);
|
||||
writeSourceOrder(out, dataMixedLowercase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs, false, false);
|
||||
writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs, false, false);
|
||||
}
|
||||
out.close();
|
||||
}
|
||||
/**
|
||||
* @param skipNFKEquivs TODO
|
||||
* @param onlyLowercase TODO
|
||||
* @param onlySingleScript TODO
|
||||
*
|
||||
*/
|
||||
private void writeSourceOrder(PrintWriter out, MyEquivalenceClass data, String tag, String title, boolean skipNFKEquivs) {
|
||||
private void writeSourceOrder(PrintWriter out, MyEquivalenceClass data, String tag, String title, boolean skipNFKEquivs, boolean onlyLowercase, boolean onlySingleScript) {
|
||||
// first get all the sets. Then get the best paradigm from each. Then sort.
|
||||
// Set setOfSets = data.getEquivalenceSets();
|
||||
// Map orderedResults = new TreeMap(betterTargetIsLess);
|
||||
|
@ -1186,16 +1309,30 @@ public class GenerateConfusables {
|
|||
out.println();
|
||||
int count = 0;
|
||||
UnicodeSet preferredID = getIdentifierSet();
|
||||
ArrayComparator ac = new ArrayComparator(new Comparator[] {UCAComparator, UCAComparator});
|
||||
Set orderedPairs = new TreeSet(ac);
|
||||
for (Iterator it = items.iterator(); it.hasNext();) {
|
||||
String source = (String) it.next();
|
||||
if (UTF16.hasMoreCodePointsThan(source,1)) continue;
|
||||
String target = data.getParadigm(source);
|
||||
if (UTF16.hasMoreCodePointsThan(source,1)) continue;
|
||||
String target = data.getParadigm(source, onlyLowercase, onlySingleScript);
|
||||
if (target == null) continue;
|
||||
if (source.equals(target)) continue;
|
||||
if (skipNFKEquivs) {
|
||||
if (!Default.nfkd().normalize(source).equals(source)) continue;
|
||||
}
|
||||
orderedPairs.add(new String[] {target, source});
|
||||
}
|
||||
String lastTarget = null;
|
||||
for (Iterator it = orderedPairs.iterator(); it.hasNext();) {
|
||||
String[] pair = (String[]) it.next();
|
||||
String source = pair[1];
|
||||
String target = pair[0];
|
||||
String reason = fixReason(data.getReasons(source, target));
|
||||
if (lastTarget != null && !lastTarget.equals(target)) {
|
||||
out.println();
|
||||
}
|
||||
writeSourceTargetLine(out, source, tag, target, reason);
|
||||
lastTarget = target;
|
||||
count++;
|
||||
}
|
||||
out.println();
|
||||
|
@ -1326,7 +1463,7 @@ public class GenerateConfusables {
|
|||
*/
|
||||
public void addUnicodeMap(UnicodeMap decompMap, String type, String errorLine) {
|
||||
int count = 0;
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(decompMap.getSet(null).complement()); it.next(); ) {
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(decompMap.keySet()); it.next(); ) {
|
||||
add(it.getString(), (String)decompMap.getValue(it.codepoint), type, ++count, errorLine);
|
||||
}
|
||||
}
|
||||
|
@ -1355,13 +1492,14 @@ public class GenerateConfusables {
|
|||
*
|
||||
*/
|
||||
public void writeSummary(String outdir, String filename, boolean outputOnly, UnicodeSet script) throws IOException {
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
|
||||
UnicodeSet representable = new UnicodeSet();
|
||||
out.print('\uFEFF');
|
||||
out.println("# Summary: Recommended confusable mapping for IDN");
|
||||
out.println("# $Revision: 1.11 $");
|
||||
out.println("# $Date: 2006/06/09 21:21:20 $");
|
||||
out.println("");
|
||||
PrintWriter out = openAndWriteHeader(filename, "Summary: Recommended confusable mapping for IDN");
|
||||
// PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
|
||||
// out.print('\uFEFF');
|
||||
// out.println("# Summary: Recommended confusable mapping for IDN");
|
||||
// out.println("# $Revision: 1.12 $");
|
||||
// out.println("# $Date: 2006/09/24 23:32:44 $");
|
||||
// out.println("");
|
||||
UnicodeSet representable = new UnicodeSet();
|
||||
MyEquivalenceClass data = dataMixedAnycase;
|
||||
Set items = data.getOrderedExplicitItems();
|
||||
// for (Iterator it = items.iterator(); it.hasNext();) {
|
||||
|
@ -1481,11 +1619,12 @@ public class GenerateConfusables {
|
|||
wsAny.addEquivalents(equivalents);
|
||||
wsLower.addEquivalents(equivalents);
|
||||
}
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
|
||||
out.print('\uFEFF');
|
||||
out.println("# Summary: Whole-Script Confusables");
|
||||
out.println("# $Revision: 1.11 $");
|
||||
out.println("# $Date: 2006/06/09 21:21:20 $");
|
||||
PrintWriter out = openAndWriteHeader(filename, "Summary: Whole-Script Confusables");
|
||||
// PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
|
||||
// out.print('\uFEFF');
|
||||
// out.println("# Summary: Whole-Script Confusables");
|
||||
// out.println("# $Revision: 1.12 $");
|
||||
// out.println("# $Date: 2006/09/24 23:32:44 $");
|
||||
out.println("# This data is used for determining whether a strings is a");
|
||||
out.println("# whole-script or mixed-script confusable.");
|
||||
out.println("# The mappings here ignore common and inherited script characters,");
|
||||
|
@ -1716,7 +1855,6 @@ public class GenerateConfusables {
|
|||
}
|
||||
|
||||
private static void generateConfusables(String indir, String outdir) throws IOException {
|
||||
betterTargetIsLess.compare("\u0020", "\u2004");
|
||||
File dir = new File(indir);
|
||||
String[] names = dir.list();
|
||||
DataSet total = new DataSet();
|
||||
|
@ -1731,12 +1869,26 @@ public class GenerateConfusables {
|
|||
total.addAll(ds);
|
||||
total.close("t*" + names[i]);
|
||||
}
|
||||
// add normalized data
|
||||
// for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
// if (Default.nfkc().isNormalized(i)) continue;
|
||||
// String result = getModifiedNKFC(UTF16.valueOf(i));
|
||||
// ds.foo();
|
||||
// }
|
||||
getSkipNFKD();
|
||||
DataSet ds = new DataSet();
|
||||
ds.addUnicodeMap(nfcMap, "nfc", "nfc");
|
||||
ds.close("*");
|
||||
total.addAll(ds);
|
||||
total.close("*");
|
||||
|
||||
ds = new DataSet();
|
||||
ds.addUnicodeMap(nfkcMap, "nfkc", "nfkc");
|
||||
ds.close("*");
|
||||
//ds.write(outdir, "new-decomp.txt", false, false);
|
||||
total.addAll(ds);
|
||||
total.close("*");
|
||||
|
||||
total.writeSummary(outdir, "confusablesSummary.txt", false, null);
|
||||
total.writeSummary(outdir, "confusablesSummaryIdentifier.txt", true, null);
|
||||
//total.writeSummary(outdir, "confusablesSummaryCyrillic.txt", true,
|
||||
|
@ -1893,6 +2045,12 @@ public class GenerateConfusables {
|
|||
MARK_ASCII = new Integer(10);
|
||||
|
||||
static _BetterTargetIsLess betterTargetIsLess = new _BetterTargetIsLess();
|
||||
|
||||
static UnicodeSet XID = new UnicodeSet("[:xidcontinue:]");
|
||||
|
||||
static boolean isXid(String x) {
|
||||
return XID.containsAll(x);
|
||||
}
|
||||
|
||||
static class _BetterTargetIsLess implements Comparator {
|
||||
IdentifierInfo info = IdentifierInfo.getIdentifierInfo();
|
||||
|
@ -1900,9 +2058,20 @@ public class GenerateConfusables {
|
|||
public int compare(Object o1, Object o2) {
|
||||
String a = (String)o1;
|
||||
String b = (String)o2;
|
||||
// longer is better (less)
|
||||
int ca = UTF16.countCodePoint(a);
|
||||
int cb = UTF16.countCodePoint(b);
|
||||
if (ca != cb) return ca > cb ? -1 : 1;
|
||||
if (ca != cb) {
|
||||
return ca > cb ? -1 : 1;
|
||||
}
|
||||
|
||||
// is Identifier is better
|
||||
boolean ba = isXid(a);
|
||||
boolean bb = isXid(b);
|
||||
if (ba != bb) {
|
||||
return ba ? -1 : 1;
|
||||
}
|
||||
|
||||
int aok = getValue(a);
|
||||
int bok = getValue(b);
|
||||
if (aok != bok) return aok < bok ? -1 : 1;
|
||||
|
@ -1947,4 +2116,28 @@ public class GenerateConfusables {
|
|||
return type.substring(dash+1,period);
|
||||
}
|
||||
|
||||
static Normalizer modNFKC ;
|
||||
|
||||
private static String getModifiedNKFC(String cf) {
|
||||
if (modNFKC == null) {
|
||||
modNFKC = new Normalizer(Normalizer.NFKC, Default.ucdVersion());
|
||||
modNFKC.setSpacingSubstitute();
|
||||
}
|
||||
return modNFKC.normalize(cf);
|
||||
}
|
||||
|
||||
private static PrintWriter openAndWriteHeader(String filename, String title) throws IOException {
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(outdir, filename);
|
||||
out.print('\uFEFF');
|
||||
out.println("# " + title);
|
||||
out.println("# File: " + filename);
|
||||
out.println("# Version: " + version);
|
||||
out.println("# Generated: " + Default.getDate());
|
||||
out.println("# Checkin: $Revision: 1.12 $");
|
||||
out.println("#");
|
||||
out.println("# For documentation and usage, see http://www.unicode.org/reports/tr39/");
|
||||
out.println("#");
|
||||
return out;
|
||||
}
|
||||
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java,v $
|
||||
* $Date: 2004/02/07 01:01:14 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2006/09/24 23:32:44 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -14,9 +14,10 @@
|
|||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.text.utility.*;
|
||||
//import com.ibm.text.utility;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import java.util.*;
|
||||
import com.ibm.text.utility.Utility;
|
||||
//import java.util.*;
|
||||
|
||||
public class GenerateThaiBreaks {
|
||||
public static void main(String [] args) throws IOException {
|
||||
|
|
|
@ -1,3 +1,20 @@
|
|||
Show [[:block=tamil:] & [:age=3.2:] - [:age=3.1:]]
|
||||
Show [[:block=tamil:] & [:age=4.0:] - [:age=3.2:]]
|
||||
Show [[:block=tamil:] & [:age=4.1:] - [:age=4.0:]]
|
||||
Show [[:block=tamil:] & [:age=5.0:] - [:age=4.1:]]
|
||||
|
||||
Stop
|
||||
|
||||
Show [[:NFKCQuickCheck=No:] & [$gc:Lm]]
|
||||
|
||||
Stop
|
||||
|
||||
[$Name: $gc:Sk]
|
||||
[$Name: $gc:Lm]
|
||||
|
||||
Show [[$whitespace] - [$gc:zs]]
|
||||
Show [[$gc:zs] - [$whitespace]]
|
||||
|
||||
Let $letter = [$gc:Lu $gc:Ll $gc:Lt $gc:Lo $gc:Lm];
|
||||
Let $number = [$gc:Nd $gc:Nl $gc:No]
|
||||
Let $mark = [$gc:mn $gc:me $gc:mc]
|
||||
|
@ -62,7 +79,7 @@ Let $guessClose = [$gc:pf $gc:pe $gc:pi]
|
|||
$guessClose = $__closing_punc
|
||||
|
||||
Let $guessTerm = [$sb:aterm $sb:sterm]
|
||||
$guessTerm = [? ? !?? ? ? ? ? ??? ? ? ? ? ? ? ? .?? … ? ? ? ? ? ? ? ?? ? ? ? ? ? ? ?]
|
||||
$guessTerm = [? ? !?? ? ? ? ? ??? ? ? ? ? ? ? ? .?? <EFBFBD> ? ? ? ? ? ? ? ?? ? ? ? ? ? ? ?]
|
||||
|
||||
Let $__issymotherr = [\u00A6\u00A7\u06FD\u06FE\u0F01-\u0F03\u0F13-\u0F17\u0F1A-\u0F1F\u0FBE-\u0FC5\u0FC7-\u0FCC\u2100\u2101\u2104-\u2106\u2108\u2109\u2117\u2118\u211E-\u2121\u2195-\u2199\u219C-\u219F\u21A1\u21A2\u21A4\u21A5\u21A7-\u21AD\u21AF-\u21CD\u21D0\u21D1\u21D5-\u21F3\u2300-\u2307\u230C-\u231F\u2322-\u2328\u232B-\u237B\u237D-\u239A\u2400-\u2426\u2440-\u244A\u249C-\u24E9\u2500-\u25B6\u25B8-\u25C0\u25C2-\u25F7\u2600-\u2613\u2619-\u266E\u2670\u2671\u2701-\u2704\u2706-\u2709\u270C-\u2727\u2729-\u274B\u274F-\u2752\u2758-\u275E\u2761-\u2794\u2798-\u27AF\u27B1-\u27BE\u2800-\u28FF\u2E80-\u2E99\u2E9B-\u2EF3\u2F00-\u2FD5\u2FF0-\u2FFB\u3012\u3013\u3036\u3037\u303E\u303F\u3190\u3191\u3196-\u319F\u3200-\u321C\u322A-\u3243\u3260-\u327B\u328A-\u32B0\u32C0-\u32CB\u32D0-\u32FE\u3300-\u3376\u337B-\u33DD\u33E0-\u33FE\uA490-\uA4A1\uA4A4-\uA4B3\uA4B5-\uA4C0\uA4C2-\uA4C4\uFFED\uFFEE\uFFFC\uFFFD]
|
||||
Let $__issymothers = [\u00B6\u0482\u06E9\u09FA\u0B70\u0F34\u0F36\u0F38\u0FCF\u2114\u2123\u2125\u2127\u2129\u212E\u2132\u213A\u21D3\u220E\u2617\u274D\u2756\u3004\u3020\u327F\uA4C6\uFFE4\uFFE8]
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
Generate: .*
|
||||
DeltaVersion: 16
|
||||
Generate: .*BreakTest.*
|
||||
DeltaVersion: 17
|
||||
CopyrightYear: 2006
|
||||
|
||||
File: auxiliary/GraphemeBreakProperty
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
|
||||
* $Date: 2005/11/01 00:10:54 $
|
||||
* $Revision: 1.17 $
|
||||
* $Date: 2006/09/24 23:32:44 $
|
||||
* $Revision: 1.18 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -14,9 +14,13 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.sun.java_cup.internal.internal_error;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -302,6 +306,7 @@ public final class Normalizer implements UCD_Types {
|
|||
private byte form;
|
||||
private boolean composition;
|
||||
private boolean compatibility;
|
||||
private UnicodeMap substituteMapping;
|
||||
|
||||
/**
|
||||
* Decomposes text, either canonical or compatibility,
|
||||
|
@ -319,7 +324,12 @@ public final class Normalizer implements UCD_Types {
|
|||
for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) {
|
||||
buffer.setLength(0);
|
||||
ch32 = UTF16.charAt(source, i);
|
||||
data.getRecursiveDecomposition(ch32, buffer, compat);
|
||||
String sub = substituteMapping == null ? null : (String) substituteMapping.getValue(ch32);
|
||||
if (sub != null) {
|
||||
buffer.append(sub);
|
||||
} else {
|
||||
data.getRecursiveDecomposition(ch32, buffer, compat);
|
||||
}
|
||||
|
||||
// add all of the characters in the decomposition.
|
||||
// (may be just the original character, if there was
|
||||
|
@ -561,6 +571,81 @@ Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
|
|||
return result;
|
||||
}
|
||||
|
||||
public UnicodeMap getSubstituteMapping() {
|
||||
return substituteMapping;
|
||||
}
|
||||
|
||||
public Normalizer setSubstituteMapping(UnicodeMap substituteMapping) {
|
||||
this.substituteMapping = substituteMapping;
|
||||
return this;
|
||||
}
|
||||
|
||||
static UnicodeMap spacingMap;;
|
||||
public void setSpacingSubstitute() {
|
||||
if (spacingMap == null) {
|
||||
makeSpacingMap();
|
||||
}
|
||||
setSubstituteMapping(spacingMap);
|
||||
}
|
||||
|
||||
private void makeSpacingMap() {
|
||||
spacingMap = new UnicodeMap();
|
||||
StringBuffer b = new StringBuffer();
|
||||
main:
|
||||
for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
boolean compat = data.ucd.getDecompositionType(i) >= data.ucd.CANONICAL;
|
||||
if (!compat) continue;
|
||||
b.setLength(0);
|
||||
data.getRecursiveDecomposition(i, b, true);
|
||||
if (b.length() == 1) continue;
|
||||
char firstChar = b.charAt(0);
|
||||
if (firstChar != 0x20 && firstChar != '\u0640') continue;
|
||||
// if rest are just Mn or Me marks, then add to substitute mapping
|
||||
int cp;
|
||||
for (int j = 1; j < b.length(); j += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(b,j);
|
||||
int cat = data.ucd.getCategory(cp);
|
||||
if (cat != data.ucd.Mn && cat != data.ucd.Me) continue main;
|
||||
}
|
||||
spacingMap.put(i, UTF16.valueOf(i));
|
||||
}
|
||||
String[][] specials = {
|
||||
{"[\\u0384\\u1FFD]", "\u00B4"},
|
||||
{"[\\uFFE3]", "\u00AF"},
|
||||
{"[\\uFE49-\\uFE4C]", "\u203E"},
|
||||
{"[\\u1FED]", "\u00A8\u0300"},
|
||||
{"[\\u1FEE\\u0385]", "\u00A8\u0301"},
|
||||
{"[\\u1FC1]", "\u00A8\u0342"},
|
||||
{"[\\u1FBD]", "\u1FBF"},
|
||||
{"[\\u1FCD]", "\u1FBF\u0300"},
|
||||
{"[\\u1FCE]", "\u1FBF\u0301"},
|
||||
{"[\\u1FCF]", "\u1FBF\u0342"},
|
||||
{"[\\u1FDD]", "\u1FFE\u0300"},
|
||||
{"[\\u1FDE]", "\u1FFE\u0301"},
|
||||
{"[\\u1FDF]", "\u1FFE\u0342"},
|
||||
{"[\\uFC5E]", "\uFE72\u0651"},
|
||||
{"[\\uFC5F]", "\uFE74\u0651"},
|
||||
{"[\\uFC60]", "\uFE76\u0651"},
|
||||
{"[\\uFC61]", "\uFE78\u0651"},
|
||||
{"[\\uFC62]", "\uFE7A\u0651"},
|
||||
{"[\\uFC63]", "\uFE7C\u0670"},
|
||||
{"[\\uFCF2]", "\uFE77\u0651"},
|
||||
{"[\\uFCF3]", "\uFE79\u0651"},
|
||||
{"[\\uFCF4]", "\uFE7B\u0651"},
|
||||
};
|
||||
int count = 0;
|
||||
UnicodeSet mappedChars = spacingMap.keySet();
|
||||
for (int i = 0; i < specials.length; ++i) {
|
||||
UnicodeSet source = new UnicodeSet(specials[i][0]);
|
||||
if (!mappedChars.containsAll(source)) {
|
||||
throw new InternalError("Remapping character that doesn't need it!" + source);
|
||||
}
|
||||
spacingMap.putAll(source, specials[i][1]);
|
||||
count += source.size();
|
||||
}
|
||||
spacingMap.freeze();
|
||||
}
|
||||
|
||||
/**
|
||||
* Just accessible for testing.
|
||||
*/
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $
|
||||
* $Date: 2006/06/09 21:21:20 $
|
||||
* $Revision: 1.12 $
|
||||
* $Date: 2006/09/24 23:32:45 $
|
||||
* $Revision: 1.13 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -24,6 +24,7 @@ import java.io.Writer;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Map;
|
||||
|
@ -38,6 +39,7 @@ import com.ibm.icu.dev.demo.translit.CaseIterator;
|
|||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.Tabber;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.dev.test.util.UnicodeProperty.UnicodeMapProperty;
|
||||
import com.ibm.icu.impl.PrettyPrinter;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
|
@ -57,7 +59,16 @@ import com.ibm.icu.util.ULocale;
|
|||
public class QuickTest implements UCD_Types {
|
||||
public static void main(String[] args) throws IOException {
|
||||
try {
|
||||
|
||||
getHangulDecomps();
|
||||
|
||||
if (true) return;
|
||||
|
||||
|
||||
showLeadingTrailingNonStarters();
|
||||
//checkBufferStatus(true);
|
||||
|
||||
|
||||
checkNormalization("NFC", Default.nfc());
|
||||
//checkNormalization("NFKC", Default.nfkc());
|
||||
|
||||
|
@ -66,7 +77,6 @@ public class QuickTest implements UCD_Types {
|
|||
checkCaseChanges();
|
||||
if (true) return;
|
||||
|
||||
checkBufferStatus();
|
||||
|
||||
|
||||
checkCase();
|
||||
|
@ -102,7 +112,43 @@ public class QuickTest implements UCD_Types {
|
|||
}
|
||||
}
|
||||
|
||||
static void checkNormalization(String title, Normalizer nfx) {
|
||||
private static void getHangulDecomps() {
|
||||
//Normalizer nfkd500 = new Normalizer(Normalizer.NFKD, "5.0.0");
|
||||
Normalizer nfkd218 = new Normalizer(Normalizer.NFKD, "2.1.8");
|
||||
UnicodeMap diff = new UnicodeMap();
|
||||
Map compose = new HashMap();
|
||||
Map decompose = new HashMap();
|
||||
// UnicodeSet applicable = // new UnicodeSet("[:HangulSyllable=NA:]");
|
||||
UnicodeSet applicable = new UnicodeSet("[[\u1100-\u11FF \uAC00-\uD7FF]&[:assigned:]]");
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(applicable); it.next(); ) {
|
||||
String source = it.getString();
|
||||
String v218 = nfkd218.normalize(source);
|
||||
//String v500 = nfkd500.normalize(source);
|
||||
if (v218.equals(source)) continue;
|
||||
decompose.put(source, v218);
|
||||
compose.put(v218, source);
|
||||
}
|
||||
// now try recomposing
|
||||
|
||||
for (Iterator it = decompose.keySet().iterator(); it.hasNext();) {
|
||||
String source = (String) it.next();
|
||||
String decomposition = (String) decompose.get(source);
|
||||
if (decomposition.length() > 2) {
|
||||
String trial = decomposition.substring(0, decomposition.length() - 1);
|
||||
String composition = (String) compose.get(trial);
|
||||
if (composition != null) {
|
||||
decomposition = composition + decomposition.substring(decomposition.length() - 1);
|
||||
}
|
||||
}
|
||||
if (decomposition.length() != 2) System.out.println("Failed decomp: " + Default.ucd().getCodeAndName(source));
|
||||
diff.put(source.charAt(0), com.ibm.text.utility.Utility.hex(decomposition, " "));
|
||||
}
|
||||
UnicodeMapProperty p = new UnicodeMapProperty().set(diff);
|
||||
BagFormatter bf = new BagFormatter().setValueSource(p);
|
||||
System.out.println(bf.showSetNames(diff.keySet()));
|
||||
}
|
||||
|
||||
static void checkNormalization(String title, Normalizer nfx) {
|
||||
UnicodeSet trailing = new UnicodeSet();
|
||||
UnicodeSet leading = new UnicodeSet();
|
||||
UnicodeSet starter = new UnicodeSet();
|
||||
|
@ -947,20 +993,22 @@ public class QuickTest implements UCD_Types {
|
|||
static Counter bufferTypes = new Counter();
|
||||
|
||||
static class BufferData {
|
||||
byte starterIsZero;
|
||||
int initials;
|
||||
int medials;
|
||||
int finals;
|
||||
int sample;
|
||||
public boolean equals(Object other) {
|
||||
BufferData that = (BufferData)other;
|
||||
return initials == that.initials && medials == that.medials && finals == that.finals;
|
||||
return starterIsZero == that.starterIsZero && initials == that.initials && medials == that.medials && finals == that.finals;
|
||||
}
|
||||
public int hashCode() {
|
||||
return (initials*37 + medials)*37 + finals;
|
||||
return ((starterIsZero * 37 + initials)*37 + medials)*37 + finals;
|
||||
}
|
||||
public BufferData set(int codepoint) {
|
||||
String s = Default.nfkd().normalize(codepoint);
|
||||
int cp;
|
||||
starterIsZero = (byte)(UCharacter.getCombiningClass(codepoint) == 0 ? 0 : 1);
|
||||
boolean isInitial = true;
|
||||
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(s, i);
|
||||
|
@ -977,14 +1025,30 @@ public class QuickTest implements UCD_Types {
|
|||
finals = 0;
|
||||
}
|
||||
}
|
||||
if (medials != 0) medials = 1;
|
||||
sample = codepoint;
|
||||
if (starterIsZero == 0 && medials == 0) {
|
||||
System.out.println("WARNING: BAD CHARACTER");
|
||||
cp = sample;
|
||||
int ccc = UCharacter.getCombiningClass(cp);
|
||||
System.out.println("U+" + Utility.hex(cp) + "\t" + UCharacter.getName(cp) + " (ccc=" + ccc + ")");
|
||||
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(s, i);
|
||||
ccc = UCharacter.getCombiningClass(cp);
|
||||
System.out.println("\tU+" + Utility.hex(cp) + "\t" + UCharacter.getName(cp) + " (ccc=" + ccc + ")");
|
||||
}
|
||||
}
|
||||
return this;
|
||||
}
|
||||
public static String getHeader() {
|
||||
return "Starter?" + "\t" + "initials" + "\t" + "Contains Starter?" + "\t" + "finals" + "\t" + "sample hex" + "\t" + "sample name";
|
||||
}
|
||||
public String toString() {
|
||||
String result = (starterIsZero == 0 ? "Y" : "") + "\t" + initials + "\t" + (medials != 0 ? "Y" : "") + "\t" + finals + "\t";
|
||||
if (sample == 0) {
|
||||
return initials + "\t" + medials + "\t" + finals + "\t" + "-" + "\t" + "all others";
|
||||
return result + "-" + "\t" + "all others";
|
||||
}
|
||||
return initials + "\t" + medials + "\t" + finals + "\t" + Utility.hex(sample) + "\t" + UCharacter.getName(sample);
|
||||
return result + Utility.hex(sample) + "\t" + UCharacter.getName(sample);
|
||||
}
|
||||
}
|
||||
static class BufferDataComparator implements Comparator {
|
||||
|
@ -992,14 +1056,15 @@ public class QuickTest implements UCD_Types {
|
|||
BufferData a0 = (BufferData)arg0;
|
||||
BufferData a1 = (BufferData)arg1;
|
||||
int result;
|
||||
if (0 != (result = a0.initials - a1.initials)) return result;
|
||||
if (0 != (result = a0.starterIsZero - a1.starterIsZero)) return result;
|
||||
if (0 != (result = a0.initials - a1.initials)) return result;
|
||||
if (0 != (result = a0.finals - a1.finals)) return result;
|
||||
if (0 != (result = a0.medials - a1.medials)) return result;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
private static void checkBufferStatus() {
|
||||
BufferData non = new BufferData().set(0);
|
||||
private static void showLeadingTrailingNonStarters() {
|
||||
BufferData non = new BufferData().set(0);
|
||||
Tabber tabber = new Tabber.HTMLTabber();
|
||||
for (int i = 0; i <= 0x10ffff; ++i) {
|
||||
int type = Default.ucd().getCategory(i);
|
||||
|
@ -1013,6 +1078,7 @@ public class QuickTest implements UCD_Types {
|
|||
TreeSet sorted = new TreeSet(new BufferDataComparator());
|
||||
NumberFormat nf = NumberFormat.getInstance();
|
||||
sorted.addAll(m.keySet());
|
||||
System.out.println(tabber.process("total\t" + BufferData.getHeader()));
|
||||
for (Iterator it = sorted.iterator(); it.hasNext();) {
|
||||
Object key = it.next();
|
||||
Object value = bufferTypes.getCount(key);
|
||||
|
|
|
@ -10,6 +10,8 @@ import java.util.List;
|
|||
import java.util.Locale;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.Tabber;
|
||||
import com.ibm.icu.dev.test.util.TransliteratorUtilities;
|
||||
import com.ibm.icu.dev.tool.UOption;
|
||||
import com.ibm.icu.text.SymbolTable;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
@ -21,13 +23,15 @@ public class TestUnicodeInvariants {
|
|||
private static final int
|
||||
HELP1 = 0,
|
||||
FILE = 1,
|
||||
RANGE = 2
|
||||
RANGE = 2,
|
||||
TABLE = 3
|
||||
;
|
||||
|
||||
private static final UOption[] options = {
|
||||
UOption.HELP_H(),
|
||||
UOption.create("file", 'f', UOption.REQUIRES_ARG),
|
||||
UOption.create("range", 'r', UOption.NO_ARG),
|
||||
UOption.create("norange", 'n', UOption.NO_ARG),
|
||||
UOption.create("table", 't', UOption.NO_ARG),
|
||||
};
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
|
@ -35,7 +39,10 @@ public class TestUnicodeInvariants {
|
|||
|
||||
String file = "UnicodeInvariants.txt";
|
||||
if (options[FILE].doesOccur) file = options[FILE].value;
|
||||
boolean doRange = options[RANGE].doesOccur;
|
||||
boolean doRange = !options[RANGE].doesOccur;
|
||||
System.out.println("File:\t" + file);
|
||||
System.out.println("Ranges?\t" + doRange);
|
||||
System.out.println("HTML?\t" + options[TABLE].doesOccur);
|
||||
|
||||
testInvariants(file, doRange);
|
||||
}
|
||||
|
@ -92,11 +99,19 @@ public class TestUnicodeInvariants {
|
|||
PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt");
|
||||
out.write('\uFEFF'); // BOM
|
||||
BufferedReader in = BagFormatter.openUTF8Reader("com/ibm/text/UCD/", outputFile);
|
||||
BagFormatter bf = new BagFormatter();
|
||||
bf.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
|
||||
BagFormatter bf2 = new BagFormatter();
|
||||
bf2.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
|
||||
bf2.setMergeRanges(doRange);
|
||||
|
||||
BagFormatter errorLister = new BagFormatter();
|
||||
errorLister.setMergeRanges(doRange);
|
||||
errorLister.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
|
||||
errorLister.setShowLiteral(TransliteratorUtilities.toXML);
|
||||
if (options[TABLE].doesOccur) errorLister.setTabber(new Tabber.HTMLTabber());
|
||||
|
||||
BagFormatter showLister = new BagFormatter();
|
||||
showLister.setUnicodePropertyFactory(ToolUnicodePropertySource.make(""));
|
||||
showLister.setMergeRanges(doRange);
|
||||
showLister.setShowLiteral(TransliteratorUtilities.toXML);
|
||||
if (options[TABLE].doesOccur) showLister.setTabber(new Tabber.HTMLTabber());
|
||||
|
||||
ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] {
|
||||
ToolUnicodePropertySource.make(UCD.lastVersion).getSymbolTable("\u00D7"),
|
||||
ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")});
|
||||
|
@ -112,6 +127,7 @@ public class TestUnicodeInvariants {
|
|||
int pos = line.indexOf('#');
|
||||
if (pos >= 0) line = line.substring(0,pos).trim();
|
||||
if (line.length() == 0) continue;
|
||||
if (line.equalsIgnoreCase("Stop")) break;
|
||||
|
||||
// fix all the variables
|
||||
String oldLine = line;
|
||||
|
@ -133,12 +149,12 @@ public class TestUnicodeInvariants {
|
|||
String part = line.substring(4).trim();
|
||||
if (part.startsWith("Each")) {
|
||||
part = part.substring(4).trim();
|
||||
bf2.setMergeRanges(false);
|
||||
showLister.setMergeRanges(false);
|
||||
}
|
||||
pp.setIndex(0);
|
||||
UnicodeSet leftSet = new UnicodeSet(part, pp, st);
|
||||
bf2.showSetNames(out, leftSet);
|
||||
bf2.setMergeRanges(doRange);
|
||||
showLister.showSetNames(out, leftSet);
|
||||
showLister.setMergeRanges(doRange);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -210,7 +226,7 @@ public class TestUnicodeInvariants {
|
|||
out.println();
|
||||
out.println(String.valueOf(ok).toUpperCase(Locale.ENGLISH));
|
||||
out.println("**** START Error Info ****");
|
||||
bf.showSetDifferences(out, rightSide, rightSet, leftSide, leftSet);
|
||||
errorLister.showSetDifferences(out, rightSide, rightSet, leftSide, leftSet);
|
||||
out.println("**** END Error Info ****");
|
||||
out.println();
|
||||
testFailureCount++;
|
||||
|
|
|
@ -331,11 +331,10 @@ AC00..D7A3 # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
|
|||
</ol>
|
||||
<h3>5. UCA</h3>
|
||||
<ol>
|
||||
<li>
|
||||
You will use com.ibm.text.UCA.Main as your main class, creating along
|
||||
<li>You will use com.ibm.text.UCA.Main as your main class, creating along
|
||||
the same lines as above.</li>
|
||||
<li>To test whether the UCA files are valid, use the
|
||||
<span style="font-weight: 400">options (<i>note: you should also build the ICU
|
||||
<span style="font-weight: 400">options (<i>note: you must also build the ICU
|
||||
files below, since they test other aspects</i>).</span><pre>writeCollationValidityLog</pre>
|
||||
<p>It will create a file:</p>
|
||||
<pre><a href="file:///C:/DATA/GEN/collation/5.0.0/CheckCollationValidity.html">C:\DATA\GEN\collation\5.0.0\CheckCollationValidity.html</a></pre>
|
||||
|
@ -354,24 +353,45 @@ AC00..D7A3 # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
|
|||
</ol></li>
|
||||
</ol></li>
|
||||
<li>
|
||||
<h4><span style="font-weight: 400">To build all the charts, use the options:
|
||||
</span> </h4>
|
||||
<h4><span style="font-weight: 400">To build all the charts (including for
|
||||
the UCA), use the options: </span></h4>
|
||||
<pre>normalizationChart caseChart scriptChart indexChart</pre>
|
||||
<p>They will be built into</p>
|
||||
<pre><a href="file:///C:/DATA/GEN/charts">C:\DATA\GEN\charts</a></pre>
|
||||
<p><b>Once UCA is released, copy those files up to the right spots on
|
||||
the Unicode site:</b><ul>
|
||||
<li>
|
||||
<pre><a href="http://www.unicode.org/charts/normalization/">http://www.unicode.org/charts/normalization/</a></pre>
|
||||
</li>
|
||||
<li>
|
||||
<pre><a href="http://www.unicode.org/charts/collation/">http://www.unicode.org/charts/collation/</a> </pre>
|
||||
</li>
|
||||
<li>
|
||||
<pre><a href="http://www.unicode.org/charts/case/">http://www.unicode.org/charts/case/</a> </pre>
|
||||
</li>
|
||||
<li>
|
||||
<pre><a href="http://www.unicode.org/charts/collation/">http://www.unicode.org/charts/collation/</a> </pre>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>
|
||||
<h4><span style="font-weight: 400">To build all the UCA files used by ICU, use the
|
||||
option:</span></h4>
|
||||
<pre>ICU</pre>
|
||||
<p>They will be built into:</p>
|
||||
<pre><a href="file:///C:/DATA/GEN/collation/5.0.0">C:\DATA\GEN\collation\5.0.0</a></pre>
|
||||
</li>
|
||||
<li>You should then build a set of the ICU files for the previous version,
|
||||
if you don't have them. The key file is UCA_Rules_NoCE.txt. It contains the
|
||||
rules expressed in ICU format, which allows for comparison across versions
|
||||
of UCA.<ol>
|
||||
<li>Do a Diff, and verify that all the differences are either new
|
||||
characters, or were authorized to be changed by the UTC.</li>
|
||||
</ol>
|
||||
|
||||
</li>
|
||||
if you don't have them. Use the options:<pre>version 4.1.0 ICU</pre>
|
||||
<p>Or whatever the last version was.</p></li>
|
||||
<li>Now, you will want to compare versions. The key file is
|
||||
UCA_Rules_NoCE.txt. It contains the rules expressed in ICU format, which
|
||||
allows for comparison across versions of UCA without spurious variations of
|
||||
the numbers getting in the way.<ol>
|
||||
<li>Do a Diff between the last and current versions of these files, and
|
||||
verify that all the differences are either new characters, or were
|
||||
authorized to be changed by the UTC.</li>
|
||||
</ol></li>
|
||||
</ol>
|
||||
|
||||
</body>
|
||||
|
|
Loading…
Add table
Reference in a new issue