Mirror of https://github.com/unicode-org/icu.git
updated for collation bugs, added isFCD.
X-SVN-Rev: 8886
Parent: 775e63220e
Commit: 25561ba9b8
13 changed files with 1061 additions and 120 deletions
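The headline addition in this change set is the new isFCD check on Normalizer (see the Normalizer.java and TestNormalization.java hunks below); the other hunks update the collation and Unihan tools and add a UTF-8 option to the Utility file-reading helpers they use. As implemented here, a string is FCD ("fast C or D") when canonically decomposing it without reordering combining marks gives the same sequence as decomposing it with reordering, so it can be processed without a full normalization pass. A minimal usage sketch in Java, assuming the Default setup shown in the TestNormalization.java hunk below:

    // Illustrative only; mirrors the test added in TestNormalization.java below.
    Default.setUCD();
    String inOrder    = "a\u0328\u0304";   // ogonek (ccc 202) before macron (ccc 230)
    String outOfOrder = "a\u0304\u0328";   // combining marks out of canonical order
    System.out.println(Default.nfc.isFCD(inOrder));     // expected: true
    System.out.println(Default.nfc.isFCD(outOfOrder));  // expected: false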
@@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
* $Date: 2002/06/04 23:56:29 $
* $Revision: 1.17 $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.18 $
*
*******************************************************************************
*/
@ -16,6 +16,7 @@ package com.ibm.text.UCA;
|
|||
import java.util.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.CanonicalIterator;
|
||||
|
||||
import java.io.*;
|
||||
//import java.text.*;
|
||||
|
@ -135,7 +136,7 @@ public class WriteCollationData implements UCD_Types {
|
|||
|
||||
static public void writeCaseFolding() throws IOException {
|
||||
System.err.println("Writing Javascript data");
|
||||
BufferedReader in = Utility.openUnicodeFile("CaseFolding", UNICODE_VERSION, true);
|
||||
BufferedReader in = Utility.openUnicodeFile("CaseFolding", UNICODE_VERSION, true, false);
|
||||
// new BufferedReader(new FileReader(DIR31 + "CaseFolding-3.d3.alpha.txt"), 64*1024);
|
||||
// log = new PrintWriter(new FileOutputStream("CaseFolding_data.js"));
|
||||
log = Utility.openPrintWriter("CaseFolding_data.js", false, false);
|
||||
|
@ -1487,6 +1488,11 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
if (UCA.isImplicitLeadPrimary(primary)) {
|
||||
if (relation == PRIMARY_DIFF) {
|
||||
int resetCp = UCA.ImplicitToCodePoint(primary, UCA.getPrimary(ces[1]));
|
||||
|
||||
int[] ces2 = new int[50];
|
||||
int len2 = collator.getCEs(UTF16.valueOf(resetCp), true, ces2);
|
||||
relation = getStrengthDifference(ces, len, ces2, len2);
|
||||
|
||||
reset = quoteOperand(UTF16.valueOf(resetCp));
|
||||
resetComment = ucd.getCodeAndName(resetCp);
|
||||
// lastCE = UCA.makeKey(primary, UCA.NEUTRAL_SECONDARY, UCA.NEUTRAL_TERTIARY);
|
||||
|
@ -1542,10 +1548,10 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
if (xmlReset == 2) {
|
||||
log.print("<reset>" + Utility.quoteXML(reset) + "</reset>");
|
||||
}
|
||||
log.print(" <" + XML_RELATION_NAMES[relation] + ">");
|
||||
if (expansion.length() > 0) {
|
||||
log.print("<x>" + Utility.quoteXML(expansion) + "</x>");
|
||||
}
|
||||
log.print(" <" + XML_RELATION_NAMES[relation] + ">");
|
||||
log.print(Utility.quoteXML(chr));
|
||||
log.print("</" + XML_RELATION_NAMES[relation] + ">");
|
||||
} else {
|
||||
|
@ -1631,7 +1637,7 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
|
||||
// static final String[] RELATION_NAMES = {" <", " <<", " <<<", " ="};
|
||||
static final String[] RELATION_NAMES = {" <\t", " <<\t", " <<<\t", " =\t"};
|
||||
static final String[] XML_RELATION_NAMES = {"o1", "o2", "o3", "o4"};
|
||||
static final String[] XML_RELATION_NAMES = {"g1", "g2", "g3", "eq"};
|
||||
|
||||
static class ArrayWrapper {
|
||||
int[] array;
|
||||
|
@@ -2080,16 +2086,80 @@ F900..FAFF; CJK Compatibility Ideographs

System.out.println("Sorting");
Map ordered = new TreeMap();
UCA.UCAContents ucac = collator.getContents(UCA.FIXED_CE, NFD);
Set contentsForCanonicalIteration = new TreeSet();
UCA.UCAContents ucac = collator.getContents(UCA.FIXED_CE, null); // NFD
int ccounter = 0;
while (true) {
Utility.dot(ccounter++);
String s = ucac.next();
if (s == null) break;
contentsForCanonicalIteration.add(s);
ordered.put(collator.getSortKey(s, UCA.NON_IGNORABLE) + '\u0000' + s, s);
}
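// Note on the '\u0000' separator idiom above (illustrative): keying the TreeMap by
// sortKey + '\u0000' + originalString keeps canonically equivalent strings, which
// produce identical sort keys (for example "\u00E9" vs "e\u0301"), from overwriting
// one another, while iteration still proceeds in sort-key order; the original
// string is recovered from the map value.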
|
||||
|
||||
|
||||
// Add canonically equivalent characters!!
|
||||
System.out.println("Start Adding canonical Equivalents2");
|
||||
int canCount = 0;
|
||||
|
||||
System.out.println("Add missing decomposibles");
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if (!ucd.isAllocated(i)) continue;
|
||||
if (NFD.isNormalized(i)) continue;
|
||||
if (collator.getCEType(i) >= UCA.FIXED_CE) continue;
|
||||
String s = UTF16.valueOf(i);
|
||||
if (contentsForCanonicalIteration.contains(s)) continue;
|
||||
contentsForCanonicalIteration.add(s);
|
||||
ordered.put(collator.getSortKey(s, UCA.NON_IGNORABLE) + '\u0000' + s, s);
|
||||
System.out.println(" + " + ucd.getCodeAndName(s));
|
||||
canCount++;
|
||||
}
|
||||
|
||||
Set additionalSet = new HashSet();
|
||||
System.out.println("Loading canonical iterator");
|
||||
CanonicalIterator canIt = new CanonicalIterator(".");
|
||||
Iterator it2 = contentsForCanonicalIteration.iterator();
|
||||
System.out.println("Adding any FCD equivalents that have different sort keys");
|
||||
while (it2.hasNext()) {
|
||||
String key = (String)it2.next();
|
||||
if (key == null) {
|
||||
System.out.println("Null Key");
|
||||
continue;
|
||||
}
|
||||
canIt.setSource(key);
|
||||
boolean first = true;
|
||||
while (true) {
|
||||
String s = canIt.next();
|
||||
if (s == null) break;
|
||||
if (s.equals(key)) continue;
|
||||
if (contentsForCanonicalIteration.contains(s)) continue;
|
||||
if (additionalSet.contains(s)) continue;
|
||||
|
||||
if (s.equals("\u01EC")) {
|
||||
System.out.println("01ec");
|
||||
}
|
||||
|
||||
// Skip anything that is not FCD.
|
||||
if (!NFD.isFCD(s)) continue;
|
||||
|
||||
// We ONLY add if the sort key would be different
|
||||
// Than what we would get if we didn't decompose!!
|
||||
String sortKey = collator.getSortKey(s, UCA.NON_IGNORABLE);
|
||||
String nonDecompSortKey = collator.getSortKey(s, UCA.NON_IGNORABLE, false);
|
||||
if (sortKey.equals(nonDecompSortKey)) continue;
|
||||
|
||||
if (first) {
|
||||
System.out.println(" " + ucd.getCodeAndName(key));
|
||||
first = false;
|
||||
}
|
||||
System.out.println(" => " + ucd.getCodeAndName(s));
|
||||
System.out.println(" old: " + collator.toString(nonDecompSortKey));
|
||||
System.out.println(" new: " + collator.toString(sortKey));
|
||||
canCount++;
|
||||
additionalSet.add(s);
|
||||
ordered.put(sortKey + '\u0000' + s, s);
|
||||
}
|
||||
}
|
||||
System.out.println("Done Adding canonical Equivalents -- added " + canCount);
|
||||
/*
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/BuildNames.java,v $
|
||||
* $Date: 2002/06/04 01:59:02 $
|
||||
* $Revision: 1.5 $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.6 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -14,6 +14,8 @@
|
|||
package com.ibm.text.UCD;
|
||||
|
||||
import java.io.IOException;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
//import com.ibm.text.unicode.UInfo;
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
@ -35,6 +37,7 @@ public class BuildNames implements UCD_Types {
|
|||
static Map words = new TreeMap(new LengthFirstComparator());
|
||||
static Map doubleWords = new TreeMap(new LengthFirstComparator());
|
||||
static Map tripleWords = new TreeMap(new LengthFirstComparator());
|
||||
static Map quadWords = new TreeMap(new LengthFirstComparator());
|
||||
static Set lines = new TreeSet(new LengthFirstComparator());
|
||||
static int[] letters = new int[128];
|
||||
|
||||
|
@ -44,6 +47,8 @@ public class BuildNames implements UCD_Types {
|
|||
}
|
||||
|
||||
static String lastWord = "";
|
||||
static String preLastWord = "";
|
||||
static String prePreLastWord = "";
|
||||
|
||||
static void addWord(String word, Map words) {
|
||||
Count count = (Count) words.get(word);
|
||||
|
@ -59,15 +64,21 @@ public class BuildNames implements UCD_Types {
|
|||
|
||||
// doubles
|
||||
|
||||
if (position != 0) {
|
||||
if (position > 0) {
|
||||
addWord(lastWord + "/" + word, doubleWords);
|
||||
}
|
||||
lastWord = word;
|
||||
|
||||
if (position > 1) {
|
||||
addWord(lastWord + "/" + word, doubleWords);
|
||||
addWord(preLastWord + "/" + lastWord + "/" + word, tripleWords);
|
||||
}
|
||||
lastLastWord = word;
|
||||
|
||||
if (position > 2) {
|
||||
addWord(prePreLastWord + "/" + preLastWord + "/" + lastWord + "/" + word, quadWords);
|
||||
}
|
||||
|
||||
prePreLastWord = preLastWord;
|
||||
preLastWord = lastWord;
|
||||
lastWord = word;
|
||||
|
||||
for (int i = 0; i < word.length(); ++i) {
|
||||
letters[word.charAt(i)]++;
|
||||
|
@ -129,35 +140,76 @@ public class BuildNames implements UCD_Types {
|
|||
|
||||
static void collectWords() throws IOException {
|
||||
|
||||
String fname = "ShortNames.txt";
|
||||
System.out.println("Writing " + fname);
|
||||
PrintWriter log = Utility.openPrintWriter(fname, false, true);
|
||||
|
||||
System.out.println("Gathering data");
|
||||
//Counter counter = new Counter();
|
||||
String[] parts = new String[100];
|
||||
//int total = 0;
|
||||
int used = 0;
|
||||
int sum = 0;
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if (Default.ucd.hasComputableName(i)) continue;
|
||||
String name = Default.ucd.getName(i);
|
||||
if (name == null) continue;
|
||||
name = transform(name);
|
||||
|
||||
sum += name.length();
|
||||
used++;
|
||||
|
||||
// replace numbers & letters
|
||||
|
||||
int len = Utility.split(name, ' ', parts);
|
||||
for (int j = 0; j < len; ++j) {
|
||||
stash(parts[j], j);
|
||||
int longSum = 0;
|
||||
|
||||
for (int cp = 0; cp < 0x10FFFF; ++cp) {
|
||||
if (!Default.ucd.isAllocated(cp)) continue;
|
||||
if (Default.ucd.hasComputableName(cp)) continue;
|
||||
Utility.dot(cp);
|
||||
String name;
|
||||
|
||||
if (Default.ucd.isRepresented(cp)) {
|
||||
name = Default.ucd.getName(cp, SHORT);
|
||||
log.println(Utility.hex(cp) + " " + name);
|
||||
String backName = Utility.replace(name, UCD_Names.NAME_ABBREVIATIONS, false);
|
||||
if (!name.equals(backName)) {
|
||||
System.out.println("Failed to recreate: " + name + ", " + backName);
|
||||
}
|
||||
}
|
||||
|
||||
// check the string, and its decomposition. This is just to get a good count.
|
||||
|
||||
String str = UTF16.valueOf(cp);
|
||||
if (false && !Default.nfkd.isNormalized(cp)) {
|
||||
str += Default.nfkd.normalize(cp);
|
||||
}
|
||||
|
||||
int cp2;
|
||||
for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp2)) {
|
||||
cp2 = UTF16.charAt(str, i);
|
||||
name = Default.ucd.getName(cp2, SHORT);
|
||||
if (name == null) continue;
|
||||
//name = transform(name);
|
||||
|
||||
lines.add(name);
|
||||
sum += name.length();
|
||||
longSum += Default.ucd.getName(cp2).length();
|
||||
used++;
|
||||
|
||||
// replace numbers & letters
|
||||
|
||||
int len = Utility.split(name, ' ', parts);
|
||||
for (int j = 0; j < len; ++j) {
|
||||
stash(parts[j], j);
|
||||
}
|
||||
|
||||
lines.add(name);
|
||||
}
|
||||
}
|
||||
System.out.println("Overhead: " + (lastLink - used) + ", " + ((lastLink - used) * 100 / used) + "%");
|
||||
System.out.println("Strings: " + sum + ", " + (lastLink*4));
|
||||
log.close();
|
||||
Utility.fixDot();
|
||||
//System.out.println("Overhead: " + (lastLink - used) + ", " + ((lastLink - used) * 100 / used) + "%");
|
||||
//System.out.println("Strings: " + sum + ", " + (lastLink*4));
|
||||
System.out.println("Short Names sum: " + sum + ", average: " + (sum + 0.0)/used);
|
||||
System.out.println("Long Names sum: " + longSum + ", average: " + (longSum + 0.0)/used);
|
||||
System.out.println("Savings: " + (1 - (sum+0.0)/longSum));
|
||||
|
||||
|
||||
printWords(words);
|
||||
printWords(doubleWords);
|
||||
printWords(tripleWords);
|
||||
printWords(quadWords);
|
||||
|
||||
if (true) return;
|
||||
|
||||
System.out.println();
|
||||
System.out.println("Compacting Words");
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/ConvertUCD.java,v $
|
||||
* $Date: 2002/04/24 02:38:53 $
|
||||
* $Revision: 1.7 $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.8 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -331,7 +331,7 @@ public final class ConvertUCD implements UCD_Types {
|
|||
|
||||
static void readBlocks() throws Exception {
|
||||
System.out.println("Reading 'Blocks'");
|
||||
BufferedReader input = Utility.openUnicodeFile(blocksname, version, true);
|
||||
BufferedReader input = Utility.openUnicodeFile(blocksname, version, true, false);
|
||||
String line = "";
|
||||
try {
|
||||
String[] parts = new String[20];
|
||||
|
@ -376,7 +376,7 @@ public final class ConvertUCD implements UCD_Types {
|
|||
}
|
||||
String tempVersion = version;
|
||||
if (version.equals(UCD.latestVersion)) tempVersion = "";
|
||||
BufferedReader input = Utility.openUnicodeFile(labels[0], tempVersion, true);
|
||||
BufferedReader input = Utility.openUnicodeFile(labels[0], tempVersion, true, false);
|
||||
if (input == null) {
|
||||
System.out.println("COULDN'T OPEN: " + labels[0]);
|
||||
return;
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
|
||||
* $Date: 2002/05/29 02:01:00 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -14,57 +14,416 @@
|
|||
package com.ibm.text.UCD;
|
||||
import java.io.*;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public final class GenerateHanTransliterator {
|
||||
public final class GenerateHanTransliterator implements UCD_Types {
|
||||
|
||||
static final boolean TESTING = false;
|
||||
static int type;
|
||||
|
||||
static final int CHINESE = 2, JAPANESE = 1, DEFINITION = 0;
|
||||
|
||||
public static void main(int typeIn) {
|
||||
type = typeIn;
|
||||
Default.setUCD();
|
||||
try {
|
||||
System.out.println("Starting");
|
||||
generate();
|
||||
log = Utility.openPrintWriter("Transliterate_log.txt", false, false);
|
||||
err = Utility.openPrintWriter("Transliterate_err.txt", false, false);
|
||||
log.print('\uFEFF');
|
||||
|
||||
String key; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn
|
||||
String filter; // "kJis0";
|
||||
String filename;
|
||||
|
||||
switch (type) {
|
||||
case DEFINITION:
|
||||
key = "kDefinition"; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn
|
||||
filter = null; // "kJis0";
|
||||
filename = "Transliterator_Han_Latin_Definition.txt";
|
||||
break;
|
||||
case JAPANESE:
|
||||
key = "kJapaneseOn";
|
||||
filter = null; // "kJis0";
|
||||
filename = "Transliterator_ja_Latin.txt";
|
||||
break;
|
||||
case CHINESE:
|
||||
key = "kMandarin";
|
||||
filename = "Transliterator_Han_Latin.txt";
|
||||
filter = null;
|
||||
break;
|
||||
default: throw new IllegalArgumentException("Unexpected option: must be 0..2");
|
||||
}
|
||||
|
||||
readUnihanData(key, filter);
|
||||
|
||||
if (false) {
|
||||
readCDICT();
|
||||
compareUnihanWithCEDICT();
|
||||
}
|
||||
|
||||
readFrequencyData();
|
||||
|
||||
out = Utility.openPrintWriter(filename, false, false);
|
||||
out.println("# Convert CJK characters");
|
||||
out.println("# Note: adds space between them and letters.");
|
||||
out.println("{ ([:Han:]) } [:L:] > | $1 ' ';");
|
||||
out.println("[\\.\\,\\?\\!\uFF0E\uFF0C\uFF1F\uFF01\u3001\u3002[:Pe:][:Pf:]] { } [:L:] > ' ';");
|
||||
out.println("[:L:] { } [[:Han:][:Ps:][:Pi:]]> ' ';");
|
||||
|
||||
if (type == JAPANESE) {
|
||||
out.println("$kata = [[\uFF9E\uFF9F\uFF70\u30FC][:katakana:]];");
|
||||
out.println("$kata { } [[:L:]-$kata]> ' ';");
|
||||
out.println("[[:L:]-$kata] { } $kata > ' ';");
|
||||
out.println("[:hiragana:] { } [[:L:]-[:hiragana:]] > ' ';");
|
||||
out.println("[[:L:]-[:hiragana:]] { } [:hiragana:]> ' ';");
|
||||
}
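// The contextual rules above only manage spacing; the per-character rules emitted
// below do the actual conversion. A Han character followed by another letter (Han
// or Latin) first gets a space appended and is then reprocessed, so adjacent
// ideographs romanize as separate, space-delimited syllables rather than running
// together; a letter followed by an ideograph or opening punctuation likewise
// gains a space. In the JAPANESE case the $kata and hiragana rules do the same at
// kana/Latin boundaries.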
|
||||
|
||||
Set gotAlready = new HashSet();
|
||||
Iterator it = rankList.iterator();
|
||||
Set lenSet = new TreeSet();
|
||||
int rank = 0;
|
||||
while (it.hasNext()) {
|
||||
Comparable keyChar = (Comparable) it.next();
|
||||
Comparable def = (Comparable) unihanMap.get(keyChar);
|
||||
if (def == null) continue; // skipping
|
||||
// sort longer definitions first!
|
||||
lenSet.add(new Pair(
|
||||
new Pair(new Integer(-def.toString().length()), new Integer(rank++)),
|
||||
new Pair(keyChar, def)));
|
||||
gotAlready.add(keyChar);
|
||||
}
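// The negated length above makes the ascending TreeSet iterate longest definitions
// first, with the running rank as a stable tie-breaker, presumably so that longer
// Latin forms appear in the generated rule file before shorter ones that could
// match a prefix of them. Tiny illustration with hypothetical values:
//   ("abc", rank 0) -> key (-3, 0)
//   ("ab",  rank 1) -> key (-2, 1)
//   ("xyz", rank 2) -> key (-3, 2)
// ascending key order: (-3,0), (-3,2), (-2,1)  =>  "abc", "xyz", "ab"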
|
||||
|
||||
// add the ones that are not ranked!
|
||||
it = unihanMap.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Comparable keyChar = (Comparable) it.next();
|
||||
Comparable def = (Comparable) unihanMap.get(keyChar);
|
||||
if (!gotAlready.contains(keyChar)) {
|
||||
lenSet.add(new Pair(
|
||||
new Pair(new Integer(-def.toString().length()), new Integer(rank++)),
|
||||
new Pair(keyChar, def)));
|
||||
}
|
||||
}
|
||||
|
||||
Set gotIt = new HashSet();
|
||||
it = lenSet.iterator();
|
||||
while (it.hasNext()) {
|
||||
Pair p = (Pair) it.next();
|
||||
p = (Pair) p.second;
|
||||
|
||||
Comparable keyChar = p.first;
|
||||
Comparable def = p.second;
|
||||
String rel = gotIt.contains(def) ? " > " : " <> ";
|
||||
out.println(keyChar + rel + def + ";");
|
||||
//if (TESTING) System.out.println("# " + code + " > " + definition);
|
||||
gotIt.add(def);
|
||||
}
|
||||
|
||||
out.println("\u3002 <> '.';");
|
||||
if (type == JAPANESE) {
|
||||
out.println(":: katakana-latin;");
|
||||
out.println(":: hiragana-latin;");
|
||||
}
|
||||
out.println(":: fullwidth-halfwidth;");
|
||||
|
||||
|
||||
|
||||
System.out.println("Total: " + totalCount);
|
||||
System.out.println("Defined Count: " + count);
|
||||
} catch (Exception e) {
|
||||
System.out.println("Exception: " + e);
|
||||
} finally {
|
||||
if (log != null) log.close();
|
||||
if (out != null) out.close();
|
||||
if (err != null) err.close();
|
||||
}
|
||||
}
|
||||
|
||||
static PrintWriter log;
|
||||
static PrintWriter out;
|
||||
static PrintWriter err;
|
||||
|
||||
static int count;
|
||||
static int totalCount;
|
||||
static int oldLine;
|
||||
|
||||
static void generate() throws java.io.IOException {
|
||||
String name = "$Han$English";
|
||||
String key = "kDefinition"; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn
|
||||
String filter = "kJis0";
|
||||
String filename = "Han_English";
|
||||
switch (type) {
|
||||
default: break;
|
||||
case 1: name = "$Han$OnRomaji";
|
||||
key = "kJapaneseOn";
|
||||
filter = "kJis0";
|
||||
filename = "Han_Romaji";
|
||||
break;
|
||||
case 2: name = "$Han$Pinyin";
|
||||
key = "kMandarin";
|
||||
filename = "Han_Pinyin";
|
||||
filter = null;
|
||||
break;
|
||||
|
||||
static void readFrequencyData() throws java.io.IOException {
|
||||
String line = "";
|
||||
try {
|
||||
|
||||
// chinese_frequency.txt
|
||||
// 1 的 1588561 1588561 3.5008%
|
||||
// japanese_frequency.txt
|
||||
// 1 ? 17176
|
||||
|
||||
Set combinedRank = new TreeSet();
|
||||
|
||||
System.out.println("Reading chinese_frequency.txt");
|
||||
BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\chinese_frequency.txt", true);
|
||||
int counter = 0;
|
||||
while (true) {
|
||||
line = Utility.readDataLine(br);
|
||||
if (line == null) break;
|
||||
if (line.length() == 0) continue;
|
||||
Utility.dot(counter++);
|
||||
int tabPos = line.indexOf('\t');
|
||||
int rank = Integer.parseInt(line.substring(0,tabPos));
|
||||
int cp = line.charAt(tabPos+1);
|
||||
//if ((rank % 100) == 0) System.out.println(rank + ", " + Utility.hex(cp));
|
||||
combinedRank.add(new Pair(new Integer(rank), UTF16.valueOf(cp)));
|
||||
}
|
||||
br.close();
|
||||
|
||||
System.out.println("Reading japanese_frequency.txt");
|
||||
|
||||
br = Utility.openReadFile( BASE_DIR + "dict\\japanese_frequency.txt", true);
|
||||
Map japaneseMap = new HashMap();
|
||||
while (true) {
|
||||
line = Utility.readDataLine(br);
|
||||
if (line == null) break;
|
||||
if (line.length() == 0) continue;
|
||||
Utility.dot(counter++);
|
||||
int tabPos = line.indexOf(' ');
|
||||
|
||||
int tabPos2 = line.indexOf(' ', tabPos+1);
|
||||
int freq = Integer.parseInt(line.substring(tabPos2+1));
|
||||
|
||||
for (int i = tabPos+1; i < tabPos2; ++i) {
|
||||
int cp = line.charAt(i);
|
||||
int script = Default.ucd.getScript(cp);
|
||||
if (script != HAN_SCRIPT) {
|
||||
if (script != HIRAGANA_SCRIPT && script != KATAKANA_SCRIPT) {
|
||||
System.out.println("Huh: " + Default.ucd.getCodeAndName(cp));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
// if ((rank % 100) == 0) System.out.println(rank + ", " + Utility.hex(cp));
|
||||
Utility.addCount(japaneseMap, UTF16.valueOf(cp), -freq);
|
||||
}
|
||||
}
|
||||
br.close();
|
||||
|
||||
// get rank order japanese
|
||||
Iterator it = japaneseMap.keySet().iterator();
|
||||
int countJapanese = 0;
|
||||
while (it.hasNext()) {
|
||||
Comparable key = (Comparable) it.next();
|
||||
Comparable val = (Comparable) japaneseMap.get(key);
|
||||
combinedRank.add(new Pair(new Integer(++countJapanese), key));
|
||||
}
|
||||
|
||||
|
||||
int overallRank = 0;
|
||||
it = combinedRank.iterator();
|
||||
|
||||
while(it.hasNext()) {
|
||||
Pair p = (Pair) it.next();
|
||||
log.println(p.first + ", " + p.second);
|
||||
Object rank = rankMap.get(p.second);
|
||||
if (rank == null) {
|
||||
rankMap.put(p.second, new Integer(++overallRank));
|
||||
rankList.add(p.second);
|
||||
}
|
||||
}
|
||||
|
||||
log.println("@character to rank");
|
||||
|
||||
// get full order
|
||||
it = rankList.iterator();
|
||||
while (it.hasNext()) {
|
||||
Comparable key = (Comparable) it.next();
|
||||
Comparable val = (Comparable) rankMap.get(key);
|
||||
log.println(key + ", " + val);
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("Line \"{0}\"", new String[] {line}, e);
|
||||
}
|
||||
}
|
||||
|
||||
static void compareUnihanWithCEDICT() {
|
||||
System.out.println("@Comparing CEDICT to Unihan");
|
||||
log.println("@Comparing CEDICT to Unihan");
|
||||
Iterator it = unihanMap.keySet().iterator();
|
||||
List inCEDICT = new ArrayList();
|
||||
List inUnihan = new ArrayList();
|
||||
List inBoth = new ArrayList();
|
||||
UnicodeSet noPinyin = new UnicodeSet();
|
||||
UnicodeSet kPinyin = new UnicodeSet();
|
||||
UnicodeSet tPinyin = new UnicodeSet();
|
||||
UnicodeSet sPinyin = new UnicodeSet();
|
||||
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if (!Default.ucd.isAllocated(i)) continue;
|
||||
if (Default.ucd.getScript(i) != HAN_SCRIPT) continue;
|
||||
Utility.dot(i);
|
||||
|
||||
String ch = UTF16.valueOf(i);
|
||||
|
||||
String pinyin = (String) unihanMap.get(ch);
|
||||
if (pinyin == null) {
|
||||
String ch2 = Default.nfkd.normalize(ch);
|
||||
pinyin = (String) unihanMap.get(ch2);
|
||||
if (pinyin != null) {
|
||||
unihanMap.put(ch, pinyin);
|
||||
kPinyin.add(i);
|
||||
} else {
|
||||
String trial = (String) simplifiedToTraditional.get(ch2);
|
||||
if (trial != null) {
|
||||
pinyin = (String) unihanMap.get(trial);
|
||||
if (pinyin != null) {
|
||||
unihanMap.put(ch, pinyin);
|
||||
tPinyin.add(i);
|
||||
} else {
|
||||
trial = (String) traditionalToSimplified.get(ch2);
|
||||
if (trial != null) {
|
||||
pinyin = (String) unihanMap.get(trial);
|
||||
if (pinyin != null) {
|
||||
unihanMap.put(ch, pinyin);
|
||||
sPinyin.add(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Map pinyinSet = (Map) cdict.get(ch);
|
||||
if (pinyin == null) {
|
||||
if (pinyinSet != null) inCEDICT.add(ch + " => " + pinyinSet);
|
||||
noPinyin.add(i);
|
||||
} else if (pinyinSet == null) {
|
||||
inUnihan.add(ch + " => " + pinyin);
|
||||
} else {
|
||||
Object temp = pinyinSet.get(pinyin);
|
||||
if (temp == null) {
|
||||
inBoth.add(ch + " => " + pinyin + "; " + pinyinSet);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
out = Utility.openPrintWriter("Transliterate_" + filename + ".txt", false, false);
|
||||
err = Utility.openPrintWriter("Transliterate_" + filename + "_log.txt", false, false);
|
||||
log.println("@In CEDICT but not Unihan: ");
|
||||
printCollection(log, inCEDICT);
|
||||
|
||||
BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion, true);
|
||||
log.println("@In Unihan but not CEDICT: ");
|
||||
printCollection(log, inUnihan);
|
||||
|
||||
log.println("@In Unihan and CEDICT, but different: ");
|
||||
printCollection(log, inBoth);
|
||||
|
||||
log.println("@Missing from Unihan: ");
|
||||
log.println(noPinyin.toPattern(true));
|
||||
|
||||
log.println("@Has mapping if we NFKD it: ");
|
||||
log.println(kPinyin.toPattern(true));
|
||||
|
||||
log.println("@Has mapping if we NFKC & simp-trad it: ");
|
||||
log.println(tPinyin.toPattern(true));
|
||||
|
||||
log.println("@Has mapping if we NFKC & trad-simp it: ");
|
||||
log.println(sPinyin.toPattern(true));
|
||||
|
||||
log.println("@Done comparison");
|
||||
}
|
||||
|
||||
static void printCollection(PrintWriter p, Collection c) {
|
||||
Iterator it = c.iterator();
|
||||
int count = 0;
|
||||
while (it.hasNext()) {
|
||||
p.println((++count) + "\t" + it.next());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static Map rankMap = new TreeMap(); // maps from single char strings to overall rank
|
||||
static List rankList = new ArrayList(10000);
|
||||
|
||||
static void readCDICT() throws IOException {
|
||||
System.out.println("Reading cdict.txt");
|
||||
BufferedReader br = Utility.openReadFile(BASE_DIR + "dict\\cdict.txt", true);
|
||||
int counter = 0;
|
||||
String[] pieces = new String[50];
|
||||
String line = "";
|
||||
|
||||
try {
|
||||
while (true) {
|
||||
line = Utility.readDataLine(br);
|
||||
if (line == null) break;
|
||||
if (line.length() == 0) continue;
|
||||
Utility.dot(counter++);
|
||||
int tabPos = line.indexOf('[');
|
||||
String word = line.substring(0,tabPos).trim();
|
||||
word = Utility.replace(word, "\uFE4D", "");
|
||||
word = Utility.replace(word, ".", "");
|
||||
word = Utility.replace(word, "/", "");
|
||||
word = Utility.replace(word, "(", "");
|
||||
word = Utility.replace(word, ")", "");
|
||||
|
||||
|
||||
int tab2Pos = line.indexOf(']', tabPos+1);
|
||||
String pinyins = line.substring(tabPos+1, tab2Pos);
|
||||
int len = Utility.split(pinyins, ' ', pieces);
|
||||
if (word.length() != len) {
|
||||
log.println("Len mismatch: " + line);
|
||||
continue;
|
||||
}
|
||||
for (int i = 0; i < len; ++i) {
|
||||
String chr = word.substring(i, i+1);
|
||||
String piece = convertPinyin.transliterate(pieces[i]);
|
||||
Map oldMap = (Map) cdict.get(chr);
|
||||
if (oldMap == null) {
|
||||
oldMap = new TreeMap();
|
||||
cdict.put(chr, oldMap);
|
||||
}
|
||||
/*&& !oldMap.equals(piece)) {
|
||||
log.println("Variant for '" + chr + "', new: '" + piece + "', old: '" + oldMap + "'");
|
||||
}
|
||||
*/
|
||||
Utility.addCount(oldMap, piece, 1);
|
||||
}
|
||||
}
|
||||
br.close();
|
||||
|
||||
Iterator it = cdict.keySet().iterator();
|
||||
Set tempSet = new TreeSet();
|
||||
while (it.hasNext()) {
|
||||
Object key = it.next();
|
||||
Map val = (Map) cdict.get(key);
|
||||
log.print(key + ": ");
|
||||
Iterator it2 = val.keySet().iterator();
|
||||
tempSet.clear();
|
||||
while (it2.hasNext()) {
|
||||
Comparable key2 = (Comparable) it2.next();
|
||||
Comparable count = (Comparable) val.get(key2);
|
||||
Pair p = new Pair(count, key2);
|
||||
tempSet.add(p); // reverse the order
|
||||
}
|
||||
it2 = tempSet.iterator();
|
||||
int counter2 = 0;
|
||||
while (it2.hasNext()) {
|
||||
if (counter2++ != 0) log.print("/");
|
||||
log.print(it2.next());
|
||||
}
|
||||
log.println();
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e);
|
||||
}
|
||||
}
|
||||
|
||||
static Map cdict = new TreeMap();
|
||||
static Map simplifiedToTraditional = new HashMap();
|
||||
static Map traditionalToSimplified = new HashMap();
|
||||
|
||||
static void readUnihanData(String key, String filter) throws java.io.IOException {
|
||||
|
||||
BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion, true, true);
|
||||
|
||||
int totalCount = 0;
|
||||
int count = 0;
|
||||
String oldCode = "";
|
||||
String oldLine = "";
|
||||
|
@ -81,9 +440,34 @@ public final class GenerateHanTransliterator {
|
|||
if (line == null) break;
|
||||
if (line.length() < 6) continue;
|
||||
if (line.charAt(0) == '#') continue;
|
||||
int tabPos = line.indexOf(' ');
|
||||
line = line.trim();
|
||||
|
||||
int tabPos = line.indexOf('\t');
|
||||
String code = line.substring(2, tabPos);
|
||||
|
||||
// gather traditional mapping
|
||||
if (line.indexOf("kTraditionalVariant") >= 0) {
|
||||
int tabPos2 = line.indexOf('\t', tabPos+1);
|
||||
int tabPos3 = line.indexOf(' ', tabPos2+1);
|
||||
if (tabPos3 < 0) tabPos3 = line.length();
|
||||
|
||||
String code2 = line.substring(tabPos2+3, tabPos3);
|
||||
simplifiedToTraditional.put(UTF16.valueOf(Integer.parseInt(code, 16)),
|
||||
UTF16.valueOf(Integer.parseInt(code2, 16)));
|
||||
}
|
||||
|
||||
if (line.indexOf("kSimplifiedVariant") >= 0) {
|
||||
int tabPos2 = line.indexOf('\t', tabPos+1);
|
||||
int tabPos3 = line.indexOf(' ', tabPos2+1);
|
||||
if (tabPos3 < 0) tabPos3 = line.length();
|
||||
|
||||
String code2 = line.substring(tabPos2+3, tabPos3);
|
||||
traditionalToSimplified.put(UTF16.valueOf(Integer.parseInt(code, 16)),
|
||||
UTF16.valueOf(Integer.parseInt(code2, 16)));
|
||||
}
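// For reference, Unihan.txt lines are tab-separated "U+XXXX<tab>kField<tab>value",
// for example (illustrative): U+4E91 <tab> kTraditionalVariant <tab> U+96F2.
// The code point is taken from after the leading "U+", and code2 from after the
// "U+" that follows the second tab (a space ends the first value when several are
// listed), which is what the substring arithmetic above does.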
|
||||
|
||||
|
||||
|
||||
/* if (code.compareTo("9FA0") >= 0) {
|
||||
System.out.println("? " + line);
|
||||
}*/
|
||||
|
@ -93,12 +477,15 @@ public final class GenerateHanTransliterator {
|
|||
if (foundKey && foundFilter) {
|
||||
count++;
|
||||
/*if (true) { //*/
|
||||
if (count == 1 || (count % 100) == 0) {
|
||||
if (TESTING && (count == 1 || (count % 100) == 0)) {
|
||||
System.out.println(count + ": " + oldLine);
|
||||
}
|
||||
printDef(out, oldCode, oldLine, oldStart);
|
||||
storeDef(out, oldCode, oldLine, oldStart);
|
||||
}
|
||||
if (TESTING) if (count > 1000) {
|
||||
System.out.println("ABORTING at 1000 for testing");
|
||||
break;
|
||||
}
|
||||
if (TESTING) if (count > 1000) break;
|
||||
oldCode = code;
|
||||
foundKey = false;
|
||||
foundFilter = (filter == null);
|
||||
|
@ -113,16 +500,12 @@ public final class GenerateHanTransliterator {
|
|||
oldStart += key.length();
|
||||
}
|
||||
}
|
||||
if (foundKey && foundFilter) printDef(out, oldCode, oldLine, oldStart);
|
||||
if (foundKey && foundFilter) storeDef(out, oldCode, oldLine, oldStart);
|
||||
|
||||
System.out.println("Total: " + totalCount);
|
||||
System.out.println("Defined Count: " + count);
|
||||
in.close();
|
||||
out.close();
|
||||
err.close();
|
||||
}
|
||||
|
||||
static void printDef(PrintWriter out, String code, String line, int start) {
|
||||
static void storeDef(PrintWriter out, String code, String line, int start) {
|
||||
if (code.length() == 0) return;
|
||||
|
||||
// skip spaces & numbers at start
|
||||
|
@ -139,39 +522,179 @@ public final class GenerateHanTransliterator {
|
|||
if (end2 < 0) end2 = line.length();
|
||||
if (end > end2) end = end2;
|
||||
|
||||
if (type != 0) {
|
||||
if (type != DEFINITION) {
|
||||
end2 = line.indexOf(" ", start);
|
||||
if (end2 < 0) end2 = line.length();
|
||||
if (end > end2) end = end2;
|
||||
}
|
||||
|
||||
String definition = line.substring(start,end);
|
||||
if (type == 2) definition = handlePinyin(definition, line);
|
||||
definition.trim();
|
||||
definition = definition.toLowerCase();
|
||||
String cp = UTF16.valueOf(Integer.parseInt(code, 16));
|
||||
String key = (String) definitionMap.get(definition);
|
||||
if (key == null) {
|
||||
definitionMap.put(definition, cp);
|
||||
if (type == CHINESE) {
|
||||
// since data are messed up, terminate after first digit
|
||||
int end3 = findInString(definition, "12345")+1;
|
||||
if (end3 == 0) {
|
||||
log.println("Bad pinyin data: " + line);
|
||||
end3 = definition.length();
|
||||
}
|
||||
definition = definition.substring(0, end3);
|
||||
|
||||
definition = convertPinyin.transliterate(definition);
|
||||
}
|
||||
out.println(cp + (key == null ? " <> " : " > ") + "'[" + definition + "]';");
|
||||
if (type == DEFINITION) {
|
||||
definition = removeMatched(definition,'(', ')', line);
|
||||
definition = removeMatched(definition,'[', ']', line);
|
||||
definition = definition.trim();
|
||||
definition = Utility.replace(definition, " ", " ");
|
||||
definition = "'[" + quoteNonLetters.transliterate(definition) + "]'";
|
||||
}
|
||||
definition = definition.trim(); // keep the trimmed result
|
||||
definition = Default.ucd.getCase(definition, FULL, LOWER);
|
||||
String cp = UTF16.valueOf(Integer.parseInt(code, 16));
|
||||
unihanMap.put(cp, definition);
|
||||
/*
|
||||
String key = (String) unihanMap.get(definition);
|
||||
if (key == null) {
|
||||
unihanMap.put(definition, cp);
|
||||
}
|
||||
out.println(cp + (key == null ? " <> " : " > ") + Default.ucd.getCase(definition, FULL, TITLE) + ";");
|
||||
if (TESTING) System.out.println("# " + code + " > " + definition);
|
||||
*/
|
||||
}
|
||||
|
||||
static Map definitionMap = new HashMap();
|
||||
// WARNING not supplementary-safe!
|
||||
|
||||
static int findInString(String source, String chars) {
|
||||
for (int i = 0; i < source.length(); ++i) {
|
||||
if (chars.indexOf(source.charAt(i)) >= 0) return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// WARNING not supplementary-safe!
|
||||
|
||||
static String removeMatched(String source, char start, char end, String originalLine) {
|
||||
while (true) {
|
||||
int pos = source.indexOf(start);
|
||||
if (pos < 0) break;
|
||||
int epos = source.indexOf(end, pos+1);
|
||||
if (epos < 0) {
|
||||
epos = source.length()-1;
|
||||
log.println("Mismatches with " + start + ", " + end + ": " + originalLine);
|
||||
}
|
||||
source = source.substring(0,pos) + source.substring(epos+1);
|
||||
}
|
||||
return source;
|
||||
}
|
||||
|
||||
static Map unihanMap = new HashMap();
|
||||
|
||||
static StringBuffer handlePinyinTemp = new StringBuffer();
|
||||
|
||||
static String handlePinyin(String source, String debugLine) {
|
||||
static Transliterator hex = Transliterator.getInstance("[^\\u0020-\\u007F] hex");
|
||||
static Transliterator quoteNonLetters = Transliterator.createFromRules("any-quotenonletters",
|
||||
"([[\\u0021-\\u007E]-[:L:]-[\\']]) > \\u005C $1; \\' > \\'\\';", Transliterator.FORWARD);
|
||||
|
||||
|
||||
|
||||
// ADD Factory since otherwise getInverse blows out
|
||||
static class DummyFactory implements Transliterator.Factory {
|
||||
static DummyFactory singleton = new DummyFactory();
|
||||
static HashMap m = new HashMap();
|
||||
|
||||
// Since Transliterators are immutable, we don't have to clone on set & get
|
||||
static void add(String ID, Transliterator t) {
|
||||
m.put(ID, t);
|
||||
System.out.println("Registering: " + ID + ", " + t.toRules(true));
|
||||
Transliterator.registerFactory(ID, singleton);
|
||||
}
|
||||
public Transliterator getInstance(String ID) {
|
||||
return (Transliterator) m.get(ID);
|
||||
}
|
||||
}
|
||||
|
||||
static Transliterator convertPinyin;
|
||||
|
||||
static {
|
||||
String dt = "1 > ;\n"
|
||||
+ "2 <> \u0301;\n"
|
||||
+ "3 <> \u0306;\n"
|
||||
+ "4 <> \u0300;\n"
|
||||
+ "5 <> \u0304;";
|
||||
|
||||
String dp = "# syllable is ...vowel+ consonant* number\n"
|
||||
+ "# 'a', 'e' are the preferred bases\n"
|
||||
+ "# otherwise 'o'\n"
|
||||
+ "# otherwise last vowel\n"
|
||||
+ "::NFC;\n"
|
||||
+ "$vowel = [aAeEiIoOuUüÜ];\n"
|
||||
+ "$consonant = [[a-z A-Z] - [$vowel]];\n"
|
||||
+ "$digit = [1-5];\n"
|
||||
+ "([aAeE]) ($vowel* $consonant*) ($digit) > $1 &digit-tone($3) $2;\n"
|
||||
+ "([oO]) ([$vowel-[aeAE]]* $consonant*) ($digit) > $1 &digit-tone($3) $2;\n"
|
||||
+ "($vowel) ($consonant*) ($digit) > $1 &digit-tone($3) $2;\n"
|
||||
+ "::NFC;\n";
|
||||
|
||||
Transliterator at = Transliterator.createFromRules("digit-tone", dt, Transliterator.FORWARD);
|
||||
System.out.println(at.transliterate("a1a2a3a4a5"));
|
||||
DummyFactory.add(at.getID(), at);
|
||||
|
||||
convertPinyin = Transliterator.createFromRules("digit-pinyin", dp, Transliterator.FORWARD);
|
||||
System.out.println(convertPinyin.transliterate("an2 aon2 oan2 ion2 oin2 uin2 iun2"));
|
||||
|
||||
}
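// Worked example of the rule sets above (illustrative): digit-tone rewrites a
// trailing tone digit as a combining mark (1 is dropped, 2 -> U+0301, 3 -> U+0306,
// 4 -> U+0300, 5 -> U+0304), and digit-pinyin moves that mark onto the preferred
// base vowel ('a'/'e' first, then 'o', otherwise the last vowel) before NFC
// recomposes it:
//   "an2"  -> "a" + U+0301 + "n" -> "án"
//   "gou3" -> "g" + "o" + U+0306 + "u" -> "gŏu"   (a breve, as the rules are written)
//   "ma1"  -> "ma"                                (the tone-1 digit is removed)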
|
||||
/*
|
||||
|
||||
static String convertTones(String source, String debugLine) {
|
||||
try {
|
||||
result = new StringBuffer();
|
||||
main:
|
||||
for (int i = 0; i < source.length(); ++i) {
|
||||
ch = source.charAt(i);
|
||||
switch (ch) {
|
||||
case ':':
|
||||
if (i > 0) {
|
||||
char last = result.charAt(result.length()-1);
|
||||
if (last == 'u') {
|
||||
result.setCharAt(result.length()-1, 'ü');
|
||||
continue main;
|
||||
} else if (last == 'U') {
|
||||
result.setCharAt(result.length()-1, 'Ü');
|
||||
continue main;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case '1': break; // skip character
|
||||
case '2': case '3': case '4': case '5':
|
||||
applyToPrecedingBase(result, ch-'0');
|
||||
break;
|
||||
default:
|
||||
result.append(ch);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
source = source.trim();
|
||||
char ch = source.charAt(source.length()-1);
|
||||
int num = (int)(ch-'1');
|
||||
if (num < 0 || num > 5) throw new Exception("none");
|
||||
handlePinyinTemp.setLength(0);
|
||||
boolean gotIt = false;
|
||||
boolean messageIfNoGotIt = true;
|
||||
|
||||
for (int i = source.length()-2; i >= 0; --i) {
|
||||
ch = source.charAt(i);
|
||||
if (ch == ':') {
|
||||
ch = 'Ü';
|
||||
--i;
|
||||
}
|
||||
if ('0' <= ch && ch <= '9') break;
|
||||
if (ch != 'Ü' && (ch < 'A' || ch > 'Z')) {
|
||||
Utility.fixDot();
|
||||
System.out.println("Warning: non-ASCII in " + hex.transliterate(source) + " (" + hex.transliterate(debugLine) + ")");
|
||||
break;
|
||||
}
|
||||
if (!gotIt) switch (ch) {
|
||||
case 'A': ch = "AÁ\u0102À\u0100".charAt(num); gotIt = true; break;
|
||||
case 'E': ch = "EÉ\u0114È\u0112".charAt(num); gotIt = true; break;
|
||||
|
@ -191,8 +714,31 @@ public final class GenerateHanTransliterator {
|
|||
}
|
||||
source = handlePinyinTemp.toString().toLowerCase();
|
||||
} catch (Exception e) {
|
||||
err.println("Bad line: " + debugLine);
|
||||
log.println("Bad line: " + debugLine);
|
||||
}
|
||||
return source;
|
||||
}
|
||||
|
||||
/*
|
||||
A and e trump all other vowels and always take the tone mark.
|
||||
There are no Mandarin syllables that contain both a and e.
|
||||
In the combination ou, o takes the mark.
|
||||
In all other cases, the final vowel takes the mark.
|
||||
*/
|
||||
/*
|
||||
static String applyToPrecedingBase(StringBuffer result, int tone) {
|
||||
for (int i = result.length()-1; i >= 0; --i) {
|
||||
char ch = result.charAt(i);
|
||||
switch (ch) {
|
||||
case 'a': case 'e': case 'A': case 'E':
|
||||
result.setCharAt(i, mapTone(ch, tone));
|
||||
return;
|
||||
case 'o': case 'O': bestSoFar = i; break;
|
||||
case 'i': case 'I': case 'u': case 'U': case '
|
||||
if (tone == 1) return String.valueOf(ch);
|
||||
return Default.nfc.normalize(ch + mapTone[tone]);
|
||||
}
|
||||
|
||||
static final char[] MAP_TONE = {"\u0301", "\u0306", "\u0300", "\u0304"};
|
||||
*/
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
|
||||
* $Date: 2002/06/04 01:59:02 $
|
||||
* $Revision: 1.14 $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.15 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -65,13 +65,14 @@ public final class Main implements UCD_Types {
|
|||
else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
|
||||
else if (arg.equalsIgnoreCase("verifyNormalizationStability")) VerifyUCD.verifyNormalizationStability();
|
||||
|
||||
else if (arg.equalsIgnoreCase("hanTransliterator")) GenerateHanTransliterator.main(0);
|
||||
else if (arg.equalsIgnoreCase("definitionTransliterator")) GenerateHanTransliterator.main(0);
|
||||
else if (arg.equalsIgnoreCase("romajiTransliterator")) GenerateHanTransliterator.main(1);
|
||||
else if (arg.equalsIgnoreCase("pinYinTransliterator")) GenerateHanTransliterator.main(2);
|
||||
else if (arg.equalsIgnoreCase("compareBlueberry")) VerifyUCD.compareBlueberry();
|
||||
|
||||
else if (arg.equalsIgnoreCase("checkBIDI")) VerifyUCD.checkBIDI();
|
||||
else if (arg.equalsIgnoreCase("Buildnames")) BuildNames.main(null);
|
||||
else if (arg.equalsIgnoreCase("TestNormalization")) TestNormalization.main(null);
|
||||
|
||||
|
||||
else if (arg.equalsIgnoreCase("testDerivedProperties")) DerivedProperty.test();
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
|
||||
* $Date: 2002/05/31 01:41:03 $
|
||||
* $Revision: 1.9 $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.10 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -100,7 +100,7 @@ public final class Normalizer implements UCD_Types {
|
|||
// then compose if the form requires.
|
||||
|
||||
if (source.length() != 0) {
|
||||
internalDecompose(source, target);
|
||||
internalDecompose(source, target, true, compatibility);
|
||||
if (composition) {
|
||||
internalCompose(target);
|
||||
}
|
||||
|
@@ -108,6 +108,23 @@ public final class Normalizer implements UCD_Types {
return target;
}

/**
 * Tests whether the text is already in FCD form, that is, whether a canonical
 * decomposition without reordering combining marks gives the same result as
 * one with canonical reordering.
 * @param source the text to test
 * @return true if the text is FCD
 */
public boolean isFCD(String source) {
if (source.length() == 0) return true;
StringBuffer noReorder = new StringBuffer();
StringBuffer reorder = new StringBuffer();

internalDecompose(source, noReorder, false, false);
internalDecompose(source, reorder, true, false);

return reorder.toString().equals(noReorder.toString());
}
|
||||
|
||||
/**
|
||||
* Normalizes text according to the chosen form
|
||||
* @param source the original text, unnormalized
|
||||
|
@ -280,13 +297,13 @@ public final class Normalizer implements UCD_Types {
|
|||
* @param source the original text, unnormalized
|
||||
* @param target the resulting normalized text
|
||||
*/
|
||||
private void internalDecompose(String source, StringBuffer target) {
|
||||
private void internalDecompose(String source, StringBuffer target, boolean reorder, boolean compat) {
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
int ch32;
|
||||
for (int i = 0; i < source.length(); i += UTF16.getCharCount(ch32)) {
|
||||
buffer.setLength(0);
|
||||
ch32 = UTF16.charAt(source, i);
|
||||
data.getRecursiveDecomposition(ch32, buffer, compatibility);
|
||||
data.getRecursiveDecomposition(ch32, buffer, compat);
|
||||
|
||||
// add all of the characters in the decomposition.
|
||||
// (may be just the original character, if there was
|
||||
|
@ -297,7 +314,7 @@ public final class Normalizer implements UCD_Types {
|
|||
ch = UTF16.charAt(buffer, j);
|
||||
int chClass = data.getCanonicalClass(ch);
|
||||
int k = target.length(); // insertion point
|
||||
if (chClass != 0) {
|
||||
if (chClass != 0 && reorder) {
|
||||
|
||||
// bubble-sort combining marks as necessary
|
||||
|
||||
|
@ -466,27 +483,27 @@ Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
|
|||
return isFirst.get(cp);
|
||||
}
|
||||
|
||||
boolean normalizationDiffers(int cp, boolean composition, boolean compatibility) {
|
||||
boolean normalizationDiffers(int cp, boolean composition, boolean compat) {
|
||||
byte dt = ucd.getDecompositionType(cp);
|
||||
if (!composition) {
|
||||
if (compatibility) return dt >= CANONICAL;
|
||||
if (compat) return dt >= CANONICAL;
|
||||
else return dt == CANONICAL;
|
||||
} else {
|
||||
// almost the same, except that we add back in the characters
|
||||
// that RECOMPOSE
|
||||
if (compatibility) return dt >= CANONICAL && !compatibilityRecompose.get(cp);
|
||||
if (compat) return dt >= CANONICAL && !compatibilityRecompose.get(cp);
|
||||
else return dt == CANONICAL && !canonicalRecompose.get(cp);
|
||||
}
|
||||
}
|
||||
|
||||
public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compatibility) {
|
||||
public void getRecursiveDecomposition(int cp, StringBuffer buffer, boolean compat) {
|
||||
byte dt = ucd.getDecompositionType(cp);
|
||||
// we know we decompose all CANONICAL, plus > CANONICAL if compatibility is TRUE.
|
||||
if (dt == CANONICAL || dt > CANONICAL && compatibility) {
|
||||
// we know we decompose all CANONICAL, plus > CANONICAL if compat is TRUE.
|
||||
if (dt == CANONICAL || dt > CANONICAL && compat) {
|
||||
String s = ucd.getDecompositionMapping(cp);
|
||||
for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(s, i);
|
||||
getRecursiveDecomposition(cp, buffer, compatibility);
|
||||
getRecursiveDecomposition(cp, buffer, compat);
|
||||
}
|
||||
} else {
|
||||
UTF16.append(buffer, cp);
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestNormalization.java,v $
|
||||
* $Date: 2002/04/23 01:59:14 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@@ -34,6 +34,13 @@ public final class TestNormalization {
public static void main(String[] args) throws java.io.IOException {
System.out.println("Creating Normalizers");
Default.setUCD();

String[] testSet = {"a\u0304\u0328", "a\u0328\u0304"};
for (int i = 0; i < testSet.length; ++i) {
String s = testSet[i];
boolean test = Default.nfc.isFCD(s);
System.out.println(test + ": " + Default.ucd.getCodeAndName(s));
}
|
||||
|
||||
|
||||
String x = UTF32.valueOf32(0x10000);
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
|
||||
* $Date: 2002/05/29 02:01:00 $
|
||||
* $Revision: 1.12 $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.13 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -124,20 +124,35 @@ public final class UCD implements UCD_Types {
|
|||
* Get the character name.
|
||||
*/
|
||||
public String getName(int codePoint) {
|
||||
return getName(codePoint, NORMAL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the character name.
|
||||
*/
|
||||
public String getName(String s) {
|
||||
return getName(s, NORMAL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the character name.
|
||||
*/
|
||||
public String getName(int codePoint, byte style) {
|
||||
if (style == SHORT) return get(codePoint, true).shortName;
|
||||
return get(codePoint, true).name;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the character names for the code points in a string, separated by ", "
|
||||
*/
|
||||
public String getName(String s) {
|
||||
public String getName(String s, byte style) {
|
||||
if (s.length() == 1) return get(s.charAt(0), true).name;
|
||||
StringBuffer result = new StringBuffer();
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
if (i > 0) result.append(", ");
|
||||
result.append(getName(cp));
|
||||
result.append(getName(cp, style));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
@ -977,6 +992,9 @@ to guarantee identifier closure.
|
|||
result = UData.UNASSIGNED;
|
||||
if (fixStrings) result.name = "<unassigned-" + Utility.hex(codePoint, 4) + ">";
|
||||
}
|
||||
if (result.shortName != null && result.shortName.length() == 0) {
|
||||
result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS);
|
||||
}
|
||||
return result;
|
||||
case 0x3400: // CJK Ideograph Extension A
|
||||
case 0x4E00: // CJK Ideograph
|
||||
|
@ -1006,13 +1024,17 @@ to guarantee identifier closure.
|
|||
result = getRaw(rangeStart);
|
||||
if (result == null) {
|
||||
result = UData.UNASSIGNED;
|
||||
if (fixStrings) result.name = "<reserved-" + Utility.hex(codePoint, 4) + ">";
|
||||
if (fixStrings) {
|
||||
result.name = "<reserved-" + Utility.hex(codePoint, 4) + ">";
|
||||
result.shortName = Utility.replace(result.name, UCD_Names.NAME_ABBREVIATIONS);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
result.codePoint = codePoint;
|
||||
if (fixStrings) {
|
||||
result.name = constructedName;
|
||||
result.shortName = Utility.replace(constructedName, UCD_Names.NAME_ABBREVIATIONS);
|
||||
result.decompositionMapping = result.bidiMirror
|
||||
= result.simpleLowercase = result.simpleUppercase = result.simpleTitlecase = result.simpleCaseFolding
|
||||
= result.fullLowercase = result.fullUppercase = result.fullTitlecase = result.fullCaseFolding
|
||||
|
@ -1024,7 +1046,7 @@ to guarantee identifier closure.
|
|||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
// Hangul constants
|
||||
|
||||
public static final int
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
|
||||
* $Date: 2002/03/20 00:21:42 $
|
||||
* $Revision: 1.13 $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.14 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -782,6 +782,139 @@ final class UCD_Names implements UCD_Types {
|
|||
};
|
||||
|
||||
static final String[] NF_NAME = {"NFD", "NFC", "NFKD", "NFKC"};
|
||||
|
||||
static final String[][] NAME_ABBREVIATIONS = {
|
||||
{"CJK UNIFIED IDEOGRAPH-", "CJK-"},
|
||||
{"CJK COMPATIBILITY IDEOGRAPH-", "CJKC-"},
|
||||
{"IDEOGRAPHIC TELEGRAPH SYMBOL FOR", "ITSF."},
|
||||
|
||||
{"BRAILLE PATTERN DOTS-", "BPD-"},
|
||||
{"CANADIAN SYLLABICS WEST-", "CSW."},
|
||||
/*{"LATIN SMALL LETTER", "LSL."},
|
||||
{"LATIN CAPITAL LETTER", "LCL."},
|
||||
{"GREEK SMALL LETTER", "GSL."},
|
||||
{"GREEK CAPITAL LETTER", "GCL."},
|
||||
{"CYRILLIC SMALL LETTER", "GSL."},
|
||||
{"CYRILLIC CAPITAL LETTER", "GCL."},
|
||||
{"BYZANTINE MUSICAL SYMBOL", "BMS."},
|
||||
{"YI SYLLABLE", "YS."},
|
||||
{"ETHIOPIC SYLLABLE", "ES."},
|
||||
{"HANGUL SYLLABLE", "HS."},
|
||||
{"CANADIAN SYLLABICS", "CS."},
|
||||
{"ARABIC LETTER", "ALt."},
|
||||
{"ARABIC LIGATURE", "AL."},
|
||||
*/
|
||||
|
||||
{"MATHEMATICAL SANS-SERIF", "MSS."},
|
||||
{"MATHEMATICAL SERIF", "MS."},
|
||||
{"BOLD ITALIC", "BI."},
|
||||
{"ISOLATED FORM", "IF."},
|
||||
{"FINAL FORM", "FF."},
|
||||
{"INITIAL FORM", "IF."},
|
||||
{"VOWEL SIGN", "VS."},
|
||||
{"KANGXI RADICAL", "KR."},
|
||||
{"MUSICAL SYMBOL", "MS."},
|
||||
{"SMALL LETTER", "SL."},
|
||||
{"CAPITAL LETTER", "CL."},
|
||||
|
||||
{"LIGATURE", "Lg."},
|
||||
{"SYLLABICS", "Ss."},
|
||||
{"MATHEMATICAL", "M."},
|
||||
{"LETTER", "L."},
|
||||
{"SYLLABLE", "S."},
|
||||
{"SYMBOL", "Sy."},
|
||||
{"WITH", "W."},
|
||||
{"CAPITAL", "C."},
|
||||
{"SMALL", "C."},
|
||||
{"COMBINING", "Cm."},
|
||||
{"HANGUL", "H."},
|
||||
};
|
||||
|
||||
/*
|
||||
LETTER: 23598
|
||||
MATHEMATICAL: 11976
|
||||
SYLLABLE: 11872
|
||||
CAPITAL: 8918
|
||||
WITH: 8008
|
||||
COMPATIBILITY: 7800
|
||||
SMALL: 7740
|
||||
IDEOGRAPH: 6165
|
||||
SYLLABICS: 5670
|
||||
ARABIC: 5646
|
||||
CANADIAN: 5040
|
||||
LATIN: 4840
|
||||
SYMBOL: 4626
|
||||
LIGATURE: 4048
|
||||
MUSICAL: 3255
|
||||
FORM: 3044
|
||||
ETHIOPIC: 2760
|
||||
RADICAL: 2695
|
||||
HANGUL: 2670
|
||||
ITALIC: 2526
|
||||
YI: 2468
|
||||
BOLD: 2256
|
||||
BYZANTINE: 2214
|
||||
|
||||
COMPATIBILITY/IDEOGRAPH: 13800
|
||||
YI/SYLLABLE: 12815
|
||||
CANADIAN/SYLLABICS: 11340
|
||||
CAPITAL/LETTER: 10948
|
||||
SMALL/LETTER: 10692
|
||||
CJK/COMPATIBILITY: 10200
|
||||
ARABIC/LIGATURE: 7110
|
||||
IDEOGRAPH/-: 6600
|
||||
MUSICAL/SYMBOL: 6510
|
||||
MATHEMATICAL/SANS: 5848
|
||||
LATIN/SMALL: 5786
|
||||
MATHEMATICAL/BOLD: 5678
|
||||
ETHIOPIC/SYLLABLE: 5389
|
||||
LATIN/CAPITAL: 5330
|
||||
ARABIC/LETTER: 4992
|
||||
BYZANTINE/MUSICAL: 4182
|
||||
BRAILLE/PATTERN: 3825
|
||||
ISOLATED/FORM: 3068
|
||||
PATTERN/DOTS: 3060
|
||||
KANGXI/RADICAL: 2996
|
||||
SYLLABICS/CARRIER: 2975
|
||||
-/SERIF: 2576
|
||||
ITALIC/CAPITAL: 2520
|
||||
BOLD/ITALIC: 2420
|
||||
KATAKANA/LETTER: 2415
|
||||
FINAL/FORM: 2400
|
||||
SERIF/BOLD: 2300
|
||||
SANS/-: 2208
|
||||
ITALIC/SMALL: 2184
|
||||
MONGOLIAN/LETTER: 2080
|
||||
MATHEMATICAL/ITALIC: 2071
|
||||
INITIAL/FORM: 2064
|
||||
CYRILLIC/CAPITAL: 2032
|
||||
|
||||
CJK/COMPATIBILITY/IDEOGRAPH: 16200
|
||||
COMPATIBILITY/IDEOGRAPH/-: 15000
|
||||
LATIN/SMALL/LETTER: 9306
|
||||
LATIN/CAPITAL/LETTER: 8160
|
||||
MATHEMATICAL/SANS/-: 6536
|
||||
BYZANTINE/MUSICAL/SYMBOL: 5904
|
||||
BRAILLE/PATTERN/DOTS: 5100
|
||||
CANADIAN/SYLLABICS/CARRIER: 4550
|
||||
SANS/-/SERIF: 4416
|
||||
PATTERN/DOTS/-: 3570
|
||||
GREEK/SMALL/LETTER: 2934
|
||||
CYRILLIC/CAPITAL/LETTER: 2852
|
||||
-/SERIF/BOLD: 2760
|
||||
MATHEMATICAL/BOLD/ITALIC: 2640
|
||||
CYRILLIC/SMALL/LETTER: 2604
|
||||
GREEK/CAPITAL/LETTER: 2580
|
||||
|
||||
CJK/COMPATIBILITY/IDEOGRAPH/-: 17400
|
||||
MATHEMATICAL/SANS/-/SERIF: 8600
|
||||
BRAILLE/PATTERN/DOTS/-: 5610
|
||||
SANS/-/SERIF/BOLD: 3910
|
||||
CANADIAN/SYLLABICS/WEST/-: 2200
|
||||
IDEOGRAPHIC/TELEGRAPH/SYMBOL/FOR: 2176
|
||||
-/SERIF/BOLD/ITALIC: 2090
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
static {
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $
|
||||
* $Date: 2001/10/25 20:33:46 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -18,6 +18,7 @@ import com.ibm.text.utility.*;
|
|||
|
||||
class UData implements UCD_Types {
|
||||
String name;
|
||||
String shortName = ""; // cache
|
||||
String decompositionMapping;
|
||||
String simpleUppercase;
|
||||
String simpleLowercase;
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
|
||||
* $Date: 2002/05/31 01:41:03 $
|
||||
* $Revision: 1.14 $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.15 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -843,6 +843,7 @@ can help you narrow these down.
|
|||
}
|
||||
|
||||
static final String names[] = {"LOWER", "TITLE", "UPPER", "(UNC)", "MIXED"};
|
||||
static final String names2[] = {"LOWER", "TITLE", "UPPER", "FOLD"};
|
||||
static final String lowerNames[] = {"", "Other_Lower"};
|
||||
static final String upperNames[] = {"", "Other_Upper"};
|
||||
|
||||
|
@ -852,13 +853,50 @@ can help you narrow these down.
|
|||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
if (!Default.ucd.isAssigned(cp) || Default.ucd.isPUA(cp)) continue;
|
||||
|
||||
boolean failed = false;
|
||||
String fullTest = Default.ucd.getCase(Default.ucd.getCase(cp, FULL, UPPER), FULL, LOWER);
|
||||
String simpleTest = Default.ucd.getCase(Default.ucd.getCase(cp, SIMPLE, UPPER), SIMPLE, LOWER);
|
||||
|
||||
String full = Default.ucd.getCase(cp, FULL, FOLD);
|
||||
String simple = Default.ucd.getCase(cp, SIMPLE, FOLD);
|
||||
|
||||
boolean failed = false;
|
||||
|
||||
String realTest = "\u0360" + UTF16.valueOf(cp) + "\u0334";
|
||||
|
||||
int ccc = Default.ucd.getCombiningClass(cp);
|
||||
|
||||
for (byte style = FOLD; style < CASE_LIMIT; ++style) {
|
||||
|
||||
String fold_NFD = Default.nfd.normalize(Default.ucd.getCase(realTest, FULL, style));
|
||||
String NFD_fold = Default.ucd.getCase(Default.nfd.normalize(realTest), FULL, style);
|
||||
if (!fold_NFD.equals(NFD_fold)) {
|
||||
Utility.fixDot();
|
||||
System.out.println("Case check fails at " + Default.ucd.getCodeAndName(cp));
|
||||
System.out.println("\t" + names2[style] + ", then NFD: " + Default.ucd.getCodeAndName(fold_NFD));
|
||||
System.out.println("\tNFD, then " + names2[style] + ": " + Default.ucd.getCodeAndName(NFD_fold));
|
||||
failed = true;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
int ccc = Default.ucd.getCombiningClass(cp);
|
||||
|
||||
int cp2;
|
||||
for (int i = 0; i < full.length(); i += UTF16.getCharCount(cp2)) {
|
||||
cp2 = UTF16.charAt(full, i);
|
||||
int ccc2 = Default.ucd.getCombiningClass(cp2);
|
||||
if (ccc2 != ccc) {
|
||||
System.out.println("Case fold CCC fails at " + Default.ucd.getCodeAndName(cp));
|
||||
System.out.println("\tFull case folding:" + ccc2 + ", " + Default.ucd.getCodeAndName(full));
|
||||
System.out.println("\tccc:" + ccc);
|
||||
System.out.println("\tccc:" + ccc2 + ", " + Default.ucd.getCodeAndName(cp2));
|
||||
failed = true;
|
||||
}
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
if (!full.equals(fullTest)) {
|
||||
Utility.fixDot();
|
||||
System.out.println("Case fold fails at " + Default.ucd.getCodeAndName(cp));
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Pair.java,v $
|
||||
* $Date: 2001/09/19 23:33:52 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -49,4 +49,9 @@ public final class Pair implements java.lang.Comparable, Cloneable {
|
|||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return '(' + (first == null ? "null" : first.toString())
|
||||
+ ',' + (second == null ? "null" : second.toString()) + ')';
|
||||
}
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
|
||||
* $Date: 2002/06/02 05:07:08 $
|
||||
* $Revision: 1.17 $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.18 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -588,6 +588,39 @@ public final class Utility { // COMMON UTILITIES
|
|||
public static void appendFile(String filename, boolean utf8, PrintWriter output) throws IOException {
|
||||
appendFile(filename, utf8, output, null);
|
||||
}
|
||||
|
||||
public static BufferedReader openReadFile(String filename, boolean UTF8) throws FileNotFoundException, UnsupportedEncodingException {
|
||||
FileInputStream fis = new FileInputStream(filename);
|
||||
InputStreamReader isr = UTF8 ? new InputStreamReader(fis, "UTF8") : new InputStreamReader(fis);
|
||||
BufferedReader br = new BufferedReader(isr, 32*1024);
|
||||
return br;
|
||||
}
|
||||
|
||||
public static void addCount(Map m, Object key, int count) {
|
||||
Integer oldCount = (Integer) m.get(key);
|
||||
if (oldCount == null) {
|
||||
m.put(key, new Integer(count));
|
||||
return;
|
||||
}
|
||||
m.put(key, new Integer(oldCount.intValue() + count));
|
||||
}
|
||||
|
||||
public static String readDataLine(BufferedReader br) throws IOException {
|
||||
String originalLine = "";
|
||||
String line = "";
|
||||
|
||||
try {
|
||||
line = originalLine = br.readLine();
|
||||
if (line == null) return null;
|
||||
if (line.length() > 0 && line.charAt(0) == 0xFEFF) line = line.substring(1);
|
||||
int commentPos = line.indexOf('#');
|
||||
if (commentPos >= 0) line = line.substring(0, commentPos);
|
||||
line = line.trim();
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("Line \"{0}\", \"{1}\"", new String[] {originalLine, line}, e);
|
||||
}
|
||||
return line;
|
||||
}
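// Typical use of readDataLine (illustrative): loop until null, skipping the blank
// lines that remain once comments and a possible leading BOM are stripped. This
// mirrors the loops in GenerateHanTransliterator above; the file path is hypothetical.
//
//   BufferedReader br = Utility.openReadFile(someDataFile, true);
//   while (true) {
//       String line = Utility.readDataLine(br);
//       if (line == null) break;          // end of file
//       if (line.length() == 0) continue; // comment-only or blank line
//       // ... parse the line ...
//   }
//   br.close();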
|
||||
|
||||
public static void appendFile(String filename, boolean utf8, PrintWriter output, String[] replacementList) throws IOException {
|
||||
FileInputStream fis = new FileInputStream(filename);
|
||||
|
@ -691,10 +724,10 @@ public final class Utility { // COMMON UTILITIES
|
|||
copyTextFile(filename, utf8, newName, null);
|
||||
}
|
||||
|
||||
public static BufferedReader openUnicodeFile(String filename, String version, boolean show) throws IOException {
|
||||
public static BufferedReader openUnicodeFile(String filename, String version, boolean show, boolean UTF8) throws IOException {
|
||||
String name = getMostRecentUnicodeDataFile(filename, version, true, show);
|
||||
if (name == null) return null;
|
||||
return new BufferedReader(new FileReader(name),32*1024);
|
||||
return openReadFile(name, UTF8); // new BufferedReader(new FileReader(name),32*1024);
|
||||
}
|
||||
|
||||
public static String getMostRecentUnicodeDataFile(String filename, String version,
|
||||
|
@ -758,6 +791,7 @@ public final class Utility { // COMMON UTILITIES
|
|||
* Replaces all occurrences of piece with replacement, and returns new String
|
||||
*/
|
||||
public static String replace(String source, String piece, String replacement) {
|
||||
if (source == null || source.length() < piece.length()) return source;
|
||||
int pos = 0;
|
||||
while (true) {
|
||||
pos = source.indexOf(piece, pos);
|
||||
|
@ -767,6 +801,21 @@ public final class Utility { // COMMON UTILITIES
|
|||
}
|
||||
}
|
||||
|
||||
public static String replace(String source, String[][] replacements) {
|
||||
for (int i = 0; i < replacements.length; ++i) {
|
||||
source = replace(source, replacements[i][0], replacements[i][1]);
|
||||
}
|
||||
return source;
|
||||
}
|
||||
|
||||
public static String replace(String source, String[][] replacements, boolean reverse) {
|
||||
if (!reverse) return replace(source, replacements);
|
||||
for (int i = 0; i < replacements.length; ++i) {
|
||||
source = replace(source, replacements[i][1], replacements[i][0]);
|
||||
}
|
||||
return source;
|
||||
}
|
||||
|
||||
public static String getStack() {
|
||||
Exception e = new Exception();
|
||||
StringWriter sw = new StringWriter();