mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
Changes for generating linebreak test
X-SVN-Rev: 9433
This commit is contained in:
parent
a5e7872567
commit
73cd203e91
18 changed files with 750 additions and 136 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/BuildNames.java,v $
|
||||
* $Date: 2002/06/13 21:14:05 $
|
||||
* $Revision: 1.6 $
|
||||
* $Date: 2002/07/30 09:56:41 $
|
||||
* $Revision: 1.7 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -142,7 +142,7 @@ public class BuildNames implements UCD_Types {
|
|||
|
||||
String fname = "ShortNames.txt";
|
||||
System.out.println("Writing " + fname);
|
||||
PrintWriter log = Utility.openPrintWriter(fname, false, true);
|
||||
PrintWriter log = Utility.openPrintWriter(fname, Utility.LATIN1_WINDOWS);
|
||||
|
||||
System.out.println("Gathering data");
|
||||
//Counter counter = new Counter();
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
package com.ibm.text.UCD;
|
||||
import com.ibm.text.utility.*;
|
||||
import java.util.Date;
|
||||
import java.text.DateFormat;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.TimeZone;
|
||||
|
||||
|
||||
public final class Default implements UCD_Types {
|
||||
|
||||
|
@ -25,5 +29,14 @@ public final class Default implements UCD_Types {
|
|||
nfkc = nf[NFKC] = new Normalizer(Normalizer.NFKC, ucdVersion);
|
||||
System.out.println("Loaded UCD" + ucd.getVersion() + " " + (new Date(ucd.getDate())));
|
||||
}
|
||||
|
||||
/** Shared formatter that renders timestamps as {@code yyyy-MM-dd,HH:mm:ss GMT}. */
static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd','HH:mm:ss' GMT'");

static {
    // The pattern ends with a literal " GMT", so the formatter is pinned to the GMT zone.
    TimeZone gmt = TimeZone.getTimeZone("GMT");
    myDateFormat.setTimeZone(gmt);
}

/**
 * Formats the current time with {@link #myDateFormat}.
 *
 * @return the current date and time, rendered in GMT
 */
public static String getDate() {
    Date now = new Date();
    return myDateFormat.format(now);
}
|
||||
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
|
||||
* $Date: 2002/05/31 01:41:04 $
|
||||
* $Revision: 1.10 $
|
||||
* $Date: 2002/07/30 09:56:41 $
|
||||
* $Revision: 1.11 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -41,7 +41,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
PICK_SHORT = NF_CLOSURE = normalized;
|
||||
|
||||
Default.setUCD();
|
||||
log = Utility.openPrintWriter("CaseFoldingLog" + GenerateData.getFileSuffix(true));
|
||||
log = Utility.openPrintWriter("CaseFoldingLog" + GenerateData.getFileSuffix(true), Utility.LATIN1_UNIX);
|
||||
System.out.println("Writing Log: " + "CaseFoldingLog" + GenerateData.getFileSuffix(true));
|
||||
|
||||
System.out.println("Making Full Data");
|
||||
|
@ -57,7 +57,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
if (normalized) filename += "-Normalized";
|
||||
String directory = "DerivedData/";
|
||||
String newFile = directory + filename + GenerateData.getFileSuffix(true);
|
||||
PrintWriter out = Utility.openPrintWriter(newFile);
|
||||
PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
String mostRecent = GenerateData.generateBat(directory, filename, GenerateData.getFileSuffix(true));
|
||||
|
||||
out.println("# CaseFolding" + GenerateData.getFileSuffix(false));
|
||||
|
@ -444,7 +444,8 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
String suffix2 = "";
|
||||
if (normalize) suffix2 = "-Normalized";
|
||||
|
||||
PrintWriter log = Utility.openPrintWriter("SpecialCasingExceptions" + suffix2 + GenerateData.getFileSuffix(true));
|
||||
PrintWriter log = Utility.openPrintWriter("SpecialCasingExceptions"
|
||||
+ suffix2 + GenerateData.getFileSuffix(true), Utility.LATIN1_UNIX);
|
||||
|
||||
for (int ch = 0; ch <= 0x10FFFF; ++ch) {
|
||||
Utility.dot(ch);
|
||||
|
@ -555,7 +556,7 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
|
||||
System.out.println("Writing");
|
||||
String newFile = "DerivedData/SpecialCasing" + suffix2 + GenerateData.getFileSuffix(true);
|
||||
PrintWriter out = Utility.openPrintWriter(newFile);
|
||||
PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
String mostRecent = GenerateData.generateBat("DerivedData/", "SpecialCasing", suffix2 + GenerateData.getFileSuffix(true));
|
||||
out.println("# SpecialCasing" + GenerateData.getFileSuffix(false));
|
||||
out.println(GenerateData.generateDateLine());
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
|
||||
* $Date: 2002/07/14 22:04:49 $
|
||||
* $Revision: 1.21 $
|
||||
* $Date: 2002/07/30 09:56:41 $
|
||||
* $Revision: 1.22 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -15,8 +15,6 @@ package com.ibm.text.UCD;
|
|||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
import java.text.DateFormat;
|
||||
import java.text.SimpleDateFormat;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
@ -28,6 +26,49 @@ public class GenerateData implements UCD_Types {
|
|||
static final boolean DEBUG = false;
|
||||
|
||||
static final String HORIZONTAL_LINE = "# ================================================";
|
||||
|
||||
static final void genSplit () {
|
||||
Default.setUCD();
|
||||
UnicodeSet split = new UnicodeSet();
|
||||
UnicodeSet reordrant = new UnicodeSet(
|
||||
"[\u093F\u09BF\u09c7\u09c8\u0abf\u0abf\u0b47\u0bc6\u0bc7\u0bc8"
|
||||
+ "\u0d46\u0d47\u0d48\u0dd9\u0dda\u0ddb\u1031\u17be\u17c1\u17c2\u17c3]");
|
||||
UnicodeSet subjoined = new UnicodeSet();
|
||||
for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
if (!Default.ucd.isAssigned(i)) continue;
|
||||
Utility.dot(i);
|
||||
int cat = Default.ucd.getCategory(i);
|
||||
if (cat != Mc && cat != Mn && cat != Me) continue;
|
||||
if (Default.ucd.getName(i).indexOf("SUBJOINED") >= 0) {
|
||||
System.out.print('*');
|
||||
subjoined.add(i);
|
||||
continue;
|
||||
}
|
||||
String decomp = Default.nfd.normalize(i);
|
||||
//int count = countTypes(decomp, Mc);
|
||||
if (UTF16.countCodePoint(decomp) > 1) split.add(i);
|
||||
}
|
||||
Utility.fixDot();
|
||||
System.out.println("Split: " + split.size());
|
||||
Utility.showSetNames("", split, false, Default.ucd);
|
||||
|
||||
System.out.println("Reordrant: " + reordrant.size());
|
||||
Utility.showSetNames("", reordrant, false, Default.ucd);
|
||||
|
||||
System.out.println("Subjoined: " + subjoined.size());
|
||||
Utility.showSetNames("", subjoined, false, Default.ucd);
|
||||
}
|
||||
|
||||
static int countTypes(String s, int filter) {
|
||||
int count = 0;
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i+= UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(s, i);
|
||||
int cat = Default.ucd.getCategory(i);
|
||||
if (cat == filter) count++;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
//static UnifiedBinaryProperty ubp
|
||||
|
||||
|
@ -55,12 +96,6 @@ public class GenerateData implements UCD_Types {
|
|||
}
|
||||
|
||||
|
||||
static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd','HH:mm:ss' GMT'");
|
||||
|
||||
static {
|
||||
myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
|
||||
}
|
||||
|
||||
//Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names
|
||||
|
||||
public static String fixFile(String s) {
|
||||
|
@ -108,7 +143,7 @@ public class GenerateData implements UCD_Types {
|
|||
Default.setUCD();
|
||||
String newFile = directory + fileName + getFileSuffix(true);
|
||||
System.out.println("New File: " + newFile);
|
||||
PrintWriter output = Utility.openPrintWriter(newFile);
|
||||
PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
String mostRecent = generateBat(directory, fileName, getFileSuffix(true));
|
||||
System.out.println("Most recent: " + mostRecent);
|
||||
|
||||
|
@ -156,7 +191,7 @@ public class GenerateData implements UCD_Types {
|
|||
public static void generateCompExclusions() throws IOException {
|
||||
Default.setUCD();
|
||||
String newFile = "DerivedData/CompositionExclusions" + getFileSuffix(true);
|
||||
PrintWriter output = Utility.openPrintWriter(newFile);
|
||||
PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
String mostRecent = generateBat("DerivedData/", "CompositionExclusions", getFileSuffix(true));
|
||||
|
||||
output.println("# CompositionExclusions" + getFileSuffix(false));
|
||||
|
@ -217,7 +252,7 @@ public class GenerateData implements UCD_Types {
|
|||
}
|
||||
|
||||
static String generateDateLine() {
|
||||
return "# Date: " + myDateFormat.format(new Date()) + " [MD]";
|
||||
return "# Date: " + Default.getDate() + " [MD]";
|
||||
}
|
||||
|
||||
static class CompLister extends PropertyLister {
|
||||
|
@ -332,7 +367,7 @@ public class GenerateData implements UCD_Types {
|
|||
|
||||
Utility.fixDot();
|
||||
System.out.println("Set Size: " + map.size());
|
||||
PrintWriter output = Utility.openPrintWriter("Partition" + getFileSuffix(true));
|
||||
PrintWriter output = Utility.openPrintWriter("Partition" + getFileSuffix(true), Utility.LATIN1_UNIX);
|
||||
|
||||
Iterator it = map.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
|
@ -351,7 +386,7 @@ public class GenerateData implements UCD_Types {
|
|||
public static void listDifferences() throws IOException {
|
||||
|
||||
Default.setUCD();
|
||||
PrintWriter output = Utility.openPrintWriter("PropertyDifferences" + getFileSuffix(true));
|
||||
PrintWriter output = Utility.openPrintWriter("PropertyDifferences" + getFileSuffix(true), Utility.LATIN1_UNIX);
|
||||
output.println("# Listing of relationships among properties, suitable for analysis by spreadsheet");
|
||||
output.println("# Generated for " + Default.ucd.getVersion());
|
||||
output.println(generateDateLine());
|
||||
|
@ -610,7 +645,7 @@ public class GenerateData implements UCD_Types {
|
|||
|
||||
String filename = "PropertyAliases";
|
||||
String newFile = "DerivedData/" + filename + getFileSuffix(true);
|
||||
PrintWriter log = Utility.openPrintWriter(newFile);
|
||||
PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
String mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true));
|
||||
|
||||
log.println("# " + filename + getFileSuffix(false));
|
||||
|
@ -626,7 +661,7 @@ public class GenerateData implements UCD_Types {
|
|||
|
||||
filename = "PropertyValueAliases";
|
||||
newFile = "DerivedData/" + filename + getFileSuffix(true);
|
||||
log = Utility.openPrintWriter(newFile);
|
||||
log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true));
|
||||
|
||||
log.println("# " + filename + getFileSuffix(false));
|
||||
|
@ -642,7 +677,7 @@ public class GenerateData implements UCD_Types {
|
|||
|
||||
filename = "PropertyAliasSummary";
|
||||
newFile = "OtherData/" + filename + getFileSuffix(true);
|
||||
log = Utility.openPrintWriter(newFile);
|
||||
log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
mostRecent = generateBat("OtherData/", filename, getFileSuffix(true));
|
||||
log.println();
|
||||
log.println(HORIZONTAL_LINE);
|
||||
|
@ -793,7 +828,7 @@ public class GenerateData implements UCD_Types {
|
|||
}
|
||||
|
||||
public static void generateBatAux(String batName, String oldName, String newName) throws IOException {
|
||||
PrintWriter output = Utility.openPrintWriter(batName + ".bat");
|
||||
PrintWriter output = Utility.openPrintWriter(batName + ".bat", Utility.LATIN1_UNIX);
|
||||
newName = Utility.getOutputName(newName);
|
||||
System.out.println("Writing BAT to compare " + oldName + " and " + newName);
|
||||
|
||||
|
@ -812,7 +847,7 @@ public class GenerateData implements UCD_Types {
|
|||
|
||||
Default.setUCD();
|
||||
String newFile = directory + file + getFileSuffix(true);
|
||||
PrintWriter output = Utility.openPrintWriter(newFile);
|
||||
PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
String mostRecent = generateBat(directory, file, getFileSuffix(true));
|
||||
|
||||
doHeader(file + getFileSuffix(false), output, headerChoice);
|
||||
|
@ -881,7 +916,7 @@ public class GenerateData implements UCD_Types {
|
|||
static public void writeNormalizerTestSuite(String directory, String fileName) throws IOException {
|
||||
Default.setUCD();
|
||||
String newFile = directory + fileName + getFileSuffix(true);
|
||||
PrintWriter log = Utility.openPrintWriter(newFile, true, false);
|
||||
PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX);
|
||||
String mostRecent = generateBat(directory, fileName, getFileSuffix(true));
|
||||
|
||||
String[] example = new String[256];
|
||||
|
@ -1082,7 +1117,7 @@ public class GenerateData implements UCD_Types {
|
|||
|
||||
Default.setUCD();
|
||||
String newFile = directory + filename + getFileSuffix(true);
|
||||
PrintWriter log = Utility.openPrintWriter(newFile);
|
||||
PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
String mostRecent = generateBat(directory, filename, getFileSuffix(true));
|
||||
DiffPropertyLister dpl;
|
||||
UnicodeSet cummulative = new UnicodeSet();
|
||||
|
@ -1164,7 +1199,7 @@ public class GenerateData implements UCD_Types {
|
|||
static final void generateAge(String directory, String filename) throws IOException {
|
||||
Default.setUCD();
|
||||
String newFile = directory + filename + getFileSuffix(true);
|
||||
PrintWriter log = Utility.openPrintWriter(newFile);
|
||||
PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
|
||||
String mostRecent = generateBat(directory, filename, getFileSuffix(true));
|
||||
try {
|
||||
log.println("# " + filename + getFileSuffix(false));
|
||||
|
@ -1259,7 +1294,7 @@ public class GenerateData implements UCD_Types {
|
|||
|
||||
public static void listCombiningAccents() throws IOException {
|
||||
Default.setUCD();
|
||||
PrintWriter log = Utility.openPrintWriter("ListAccents" + getFileSuffix(true));
|
||||
PrintWriter log = Utility.openPrintWriter("ListAccents" + getFileSuffix(true), Utility.LATIN1_UNIX);
|
||||
Set set = new TreeSet();
|
||||
Set set2 = new TreeSet();
|
||||
|
||||
|
@ -1296,7 +1331,7 @@ public class GenerateData implements UCD_Types {
|
|||
|
||||
public static void listGreekVowels() throws IOException {
|
||||
Default.setUCD();
|
||||
PrintWriter log = Utility.openPrintWriter("ListGreekVowels" + getFileSuffix(true));
|
||||
PrintWriter log = Utility.openPrintWriter("ListGreekVowels" + getFileSuffix(true), Utility.LATIN1_UNIX);
|
||||
Set set = new TreeSet();
|
||||
Set set2 = new TreeSet();
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
|
||||
* $Date: 2002/07/21 08:43:39 $
|
||||
* $Revision: 1.7 $
|
||||
* $Date: 2002/07/30 09:56:41 $
|
||||
* $Revision: 1.8 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -42,7 +42,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
|
||||
public static void readUnihan() throws java.io.IOException {
|
||||
|
||||
log = Utility.openPrintWriter("Unihan_log.html", false, false);
|
||||
log = Utility.openPrintWriter("Unihan_log.html", Utility.UTF8_WINDOWS);
|
||||
log.println("<body>");
|
||||
|
||||
BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion, true, true);
|
||||
|
@ -241,6 +241,8 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
|
||||
static final int CHINESE = 2, JAPANESE = 1, DEFINITION = 0;
|
||||
|
||||
static final boolean DO_SIMPLE = true;
|
||||
|
||||
public static void main(int typeIn) {
|
||||
type = typeIn;
|
||||
Default.setUCD();
|
||||
|
@ -269,13 +271,20 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
default: throw new IllegalArgumentException("Unexpected option: must be 0..2");
|
||||
}
|
||||
|
||||
log = Utility.openPrintWriter("Transliterate_log.txt", false, false);
|
||||
err = Utility.openPrintWriter("Transliterate_err.txt", false, false);
|
||||
err = Utility.openPrintWriter("Transliterate_err.txt", Utility.UTF8_WINDOWS);
|
||||
log = Utility.openPrintWriter("Transliterate_log.txt", Utility.UTF8_WINDOWS);
|
||||
log.print('\uFEFF');
|
||||
|
||||
readUnihanData(key);
|
||||
log.println();
|
||||
log.println("@*DICT Data");
|
||||
log.println();
|
||||
readCDICTDefinitions(type);
|
||||
|
||||
log.println();
|
||||
log.println("@Unihan Data");
|
||||
log.println();
|
||||
readUnihanData(key);
|
||||
|
||||
if (false) {
|
||||
readCDICT();
|
||||
compareUnihanWithCEDICT();
|
||||
|
@ -283,7 +292,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
|
||||
readFrequencyData(type);
|
||||
|
||||
out = Utility.openPrintWriter(filename, false, false);
|
||||
out = Utility.openPrintWriter(filename, Utility.UTF8_WINDOWS);
|
||||
out.println("# Start RAW data for converting CJK characters");
|
||||
/*
|
||||
out.println("# Note: adds space between them and letters.");
|
||||
|
@ -366,21 +375,24 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
Set doReverse = new HashSet();
|
||||
Set gotIt = new HashSet();
|
||||
|
||||
it = backSet.iterator();
|
||||
while (it.hasNext()) {
|
||||
Pair p = (Pair) it.next();
|
||||
p = (Pair) p.second;
|
||||
|
||||
String keyChar = (String) p.first;
|
||||
String def = (String) p.second;
|
||||
if (!gotIt.contains(def)) {
|
||||
if (unihanNonSingular) {
|
||||
out.println(quoteNonLetters.transliterate(keyChar) + " < " + quoteNonLetters.transliterate(def) + ";");
|
||||
} else {
|
||||
doReverse.add(keyChar);
|
||||
if (!DO_SIMPLE) {
|
||||
it = backSet.iterator();
|
||||
while (it.hasNext()) {
|
||||
Pair p = (Pair) it.next();
|
||||
p = (Pair) p.second;
|
||||
|
||||
String keyChar = (String) p.first;
|
||||
String def = (String) p.second;
|
||||
if (!gotIt.contains(def)) {
|
||||
if (unihanNonSingular) {
|
||||
out.println(quoteNonLetters.transliterate(keyChar)
|
||||
+ " < " + quoteNonLetters.transliterate(def) + ";");
|
||||
} else {
|
||||
doReverse.add(keyChar);
|
||||
}
|
||||
}
|
||||
gotIt.add(def);
|
||||
}
|
||||
gotIt.add(def);
|
||||
}
|
||||
|
||||
|
||||
|
@ -391,10 +403,10 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
|
||||
String keyChar = (String) p.first;
|
||||
String def = (String) p.second;
|
||||
String rel = doReverse.contains(keyChar) ? " <> " : " > ";
|
||||
String rel = !DO_SIMPLE && doReverse.contains(keyChar) ? "<>" : ">";
|
||||
|
||||
out.println(quoteNonLetters.transliterate(keyChar) + rel
|
||||
+ quoteNonLetters.transliterate(def) + ";");
|
||||
+ quoteNonLetters.transliterate(def) + "|\\ ;");
|
||||
//if (TESTING) System.out.println("# " + code + " > " + definition);
|
||||
}
|
||||
|
||||
|
@ -413,6 +425,24 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
System.out.println("Total: " + totalCount);
|
||||
System.out.println("Defined Count: " + count);
|
||||
|
||||
log.println();
|
||||
log.println("@Duplicates");
|
||||
log.println();
|
||||
it = duplicates.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
String word = (String) it.next();
|
||||
log.print(hex.transliterate(word) + "\t" + word + "\t");
|
||||
Collection dups = (Collection) duplicates.get(word);
|
||||
Iterator it2 = dups.iterator();
|
||||
boolean gotFirst = false;
|
||||
while (it2.hasNext()) {
|
||||
if (!gotFirst) gotFirst = true;
|
||||
else log.print(", ");
|
||||
log.print(it2.next());
|
||||
}
|
||||
log.println();
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
System.out.println("Exception: " + e);
|
||||
} finally {
|
||||
|
@ -506,6 +536,10 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
int overallRank = 0;
|
||||
it = combinedRank.iterator();
|
||||
|
||||
log.println();
|
||||
log.println("@Frequency data: Rank of Character");
|
||||
log.println();
|
||||
|
||||
while(it.hasNext()) {
|
||||
Pair p = (Pair) it.next();
|
||||
log.println(p.first + ", " + p.second);
|
||||
|
@ -516,7 +550,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
}
|
||||
}
|
||||
|
||||
log.println("@character to rank");
|
||||
log.println();
|
||||
log.println("@Frequency data: Character to Rank");
|
||||
log.println();
|
||||
|
||||
// get full order
|
||||
it = rankList.iterator();
|
||||
|
@ -871,8 +907,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
}
|
||||
|
||||
static void addCheck2(String word, String definition, String line) {
|
||||
definition = Default.nfc.normalize(definition) + " ";
|
||||
definition = Default.nfc.normalize(definition);
|
||||
word = Default.nfc.normalize(word);
|
||||
if (DO_SIMPLE && UTF16.countCodePoint(word) > 1) return;
|
||||
|
||||
if (pua.containsSome(word) ) {
|
||||
Utility.fixDot();
|
||||
|
@ -881,7 +918,13 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
Utility.fixDot();
|
||||
System.out.println("Only numbers on: " + line);
|
||||
} else {
|
||||
unihanMap.put(word, definition);
|
||||
Object alreadyThere = unihanMap.get(word);
|
||||
if (alreadyThere == null) {
|
||||
unihanMap.put(word, definition);
|
||||
} else if (!definition.equals(alreadyThere)) {
|
||||
Utility.addToList(duplicates, word, alreadyThere, true);
|
||||
Utility.addToList(duplicates, word, definition, true);
|
||||
}
|
||||
}
|
||||
if (UTF16.countCodePoint(word) > 1) unihanNonSingular = true;
|
||||
}
|
||||
|
@ -1025,19 +1068,28 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
if (end > end2) end = end2;
|
||||
|
||||
// IF CHINESE or JAPANESE, stop at first space!!!
|
||||
rawDefinition = rawDefinition.substring(start,end);
|
||||
|
||||
if (type != DEFINITION) {
|
||||
end2 = rawDefinition.indexOf(" ", start);
|
||||
if (end2 < 0) end2 = rawDefinition.length();
|
||||
if (end > end2) end = end2;
|
||||
if (type == DEFINITION) {
|
||||
storeDef2(out, cp, rawDefinition, line);
|
||||
} else {
|
||||
if (rawDefinition.indexOf(' ') < 0) storeDef2(out, cp, rawDefinition, line);
|
||||
else {
|
||||
String [] pieces = Utility.split(rawDefinition, ' ');
|
||||
for (int i = 0; i < pieces.length; ++i) {
|
||||
storeDef2(out, cp, pieces[i], line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
String definition = rawDefinition.substring(start,end);
|
||||
}
|
||||
|
||||
static void storeDef2(PrintWriter out, int cp, String definition, String line) {
|
||||
if (type == CHINESE) {
|
||||
// since data are messed up, terminate after first digit
|
||||
int end3 = findInString(definition, "12345")+1;
|
||||
if (end3 == 0) {
|
||||
log.println("Bad pinyin data: " + rawDefinition);
|
||||
log.println("Bad pinyin data: " + hex.transliterate(UTF16.valueOf(cp))
|
||||
+ "\t" + UTF16.valueOf(cp) + "\t" + definition);
|
||||
end3 = definition.length();
|
||||
}
|
||||
definition = definition.substring(0, end3);
|
||||
|
@ -1045,9 +1097,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
definition = convertPinyin.transliterate(definition);
|
||||
}
|
||||
if (type == DEFINITION) {
|
||||
definition = removeMatched(definition,'(', ')', rawDefinition);
|
||||
definition = removeMatched(definition,'[', ']', rawDefinition);
|
||||
definition = fixDefinition(definition, rawDefinition);
|
||||
definition = removeMatched(definition,'(', ')', line);
|
||||
definition = removeMatched(definition,'[', ']', line);
|
||||
definition = fixDefinition(definition, line);
|
||||
}
|
||||
definition = definition.trim();
|
||||
definition = Default.ucd.getCase(definition, FULL, LOWER);
|
||||
|
@ -1056,7 +1108,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
Utility.fixDot();
|
||||
System.out.println("Zero value for " + Default.ucd.getCode(cp) + " on: " + hex.transliterate(line));
|
||||
} else {
|
||||
addCheck(UTF16.valueOf(cp), definition, rawDefinition);
|
||||
addCheck(UTF16.valueOf(cp), definition, line);
|
||||
}
|
||||
/*
|
||||
String key = (String) unihanMap.get(definition);
|
||||
|
@ -1103,6 +1155,8 @@ public final class GenerateHanTransliterator implements UCD_Types {
|
|||
}
|
||||
|
||||
static Map unihanMap = new HashMap();
|
||||
static Map duplicates = new TreeMap();
|
||||
|
||||
static boolean unihanNonSingular = false;
|
||||
|
||||
static StringBuffer handlePinyinTemp = new StringBuffer();
|
||||
|
|
479
tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java
Normal file
479
tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java
Normal file
|
@ -0,0 +1,479 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java,v $
|
||||
* $Date: 2002/07/30 09:57:18 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
public class GenerateLineBreakTest implements UCD_Types {
|
||||
|
||||
static String[] samples = new String[LB_LIMIT + 3];
|
||||
|
||||
static byte[] TROrder = {
|
||||
LB_OP, LB_CL, LB_QU, LB_GL, LB_NS, LB_EX, LB_SY, LB_IS, LB_PR, LB_PO,
|
||||
LB_NU, LB_AL, LB_ID, LB_IN, LB_HY, LB_BA, LB_BB, LB_B2, LB_ZW, LB_CM,
|
||||
// missing from Pair Table
|
||||
LB_SP, LB_BK, LB_CR, LB_LF,
|
||||
// resolved types below
|
||||
LB_CB, LB_AI, LB_SA, LB_SG, LB_XX,
|
||||
// 3 JAMO CLASSES
|
||||
29, 30, 31
|
||||
};
|
||||
static final int TABLE_LIMIT = 25;
|
||||
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
Default.setUCD();
|
||||
|
||||
findSamples();
|
||||
|
||||
// test individual cases
|
||||
//printLine(out, samples[LB_ZW], "", samples[LB_CL]);
|
||||
//printLine(out, samples[LB_ZW], " ", samples[LB_CL]);
|
||||
|
||||
PrintWriter out = Utility.openPrintWriter("LineBreakTest.html", Utility.UTF8_WINDOWS);
|
||||
out.println("<html><body><h1>Current (fixed only for consistency):</h1>");
|
||||
generateTable(out, false);
|
||||
out.println("<h1>Recommended:</h1>");
|
||||
generateTable(out, true);
|
||||
out.println("</body></html>");
|
||||
out.close();
|
||||
|
||||
// do main test
|
||||
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
out = Utility.openPrintWriter(k == 0 ? "LineBreakTest_SHORT.txt" : "LineBreakTest.txt", Utility.UTF8_WINDOWS);
|
||||
int counter = 0;
|
||||
|
||||
out.println("# Default Linebreak conformance test");
|
||||
out.println("# " + Default.getDate() + ", MED");
|
||||
out.println("#");
|
||||
|
||||
for (int ii = 0; ii < samples.length; ++ii) {
|
||||
int i = TROrder[ii];
|
||||
String before = samples[i];
|
||||
|
||||
for (int jj = 0; jj < samples.length; ++jj) {
|
||||
Utility.dot(counter++);
|
||||
int j = TROrder[jj];
|
||||
String after = samples[j];
|
||||
// do line straight
|
||||
printLine(out, before, "", after, k != 0);
|
||||
printLine(out, before, " ", after, k != 0);
|
||||
printLine(out, before, "\u0301\u0308", after, k != 0);
|
||||
}
|
||||
}
|
||||
out.println("# Lines: " + counter);
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
|
||||
public static void generateTable(PrintWriter out, boolean recommended) {
|
||||
out.print("<table border='1' cellspacing='0'><tr><th></th>");
|
||||
for (int i = 0; i < TABLE_LIMIT; ++i) {
|
||||
String h = getLBID(samples[TROrder[i]]);
|
||||
out.print("<th>" + h + "</th>");
|
||||
}
|
||||
out.print("</tr>");
|
||||
String[] rule = new String[1];
|
||||
String[] rule2 = new String[1];
|
||||
for (int i = 0; i < TABLE_LIMIT; ++i) {
|
||||
String before = samples[TROrder[i]];
|
||||
String line = "<tr><th>" + getLBID(before) + "</th>";
|
||||
for (int j = 0; j < TABLE_LIMIT; ++j) {
|
||||
String after = samples[TROrder[j]];
|
||||
String t = getTableEntry(before, after, recommended, rule);
|
||||
String background = "";
|
||||
if (recommended) {
|
||||
String t2 = getTableEntry(before, after, false, rule2);
|
||||
if (!t.equals(t2)) background = " bgcolor='#FFFF00'";
|
||||
}
|
||||
line += "<th title='" + rule[0] + "'" + background + ">" + t + "</th>";
|
||||
}
|
||||
out.println(line + "</tr>");
|
||||
}
|
||||
out.println("</table>");
|
||||
}
|
||||
|
||||
public static String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
|
||||
String t = "_";
|
||||
boolean spaceBreak = isBreak(before + " " + after, before.length() + 1, recommended);
|
||||
String spaceRule = rule;
|
||||
|
||||
boolean spaceBreak2 = isBreak(before + " " + after, before.length(), recommended);
|
||||
String spaceRule2 = rule;
|
||||
|
||||
boolean normalBreak = isBreak(before + after, before.length(), recommended);
|
||||
String normalRule = rule;
|
||||
|
||||
if (!normalBreak) {
|
||||
if (!spaceBreak && !spaceBreak2) {
|
||||
t = "^";
|
||||
rule = spaceRule.equals(normalRule) ? normalRule : spaceRule + "/" + normalRule;
|
||||
if (!spaceRule2.equals(normalRule) && !spaceRule2.equals(spaceRule)) {
|
||||
rule += "/" + spaceRule2;
|
||||
}
|
||||
} else {
|
||||
t = "%";
|
||||
rule = normalRule;
|
||||
}
|
||||
}
|
||||
ruleOut[0] = rule;
|
||||
return t;
|
||||
}
|
||||
|
||||
|
||||
public static void printLine(PrintWriter out, String before, String filler, String after, boolean comments) {
|
||||
String s = before + filler + after;
|
||||
int offset = before.length() + filler.length();
|
||||
|
||||
boolean lb = isBreak(s, offset, false);
|
||||
|
||||
String tlb = (lb ? "b" : "n");
|
||||
String comment = "";
|
||||
if (comments) comment =
|
||||
" # " + getLBID(before + filler)
|
||||
+ " " + tlb
|
||||
+ " " + getLBID(after)
|
||||
+ " # " + Default.ucd.getName(before + filler)
|
||||
+ " " + tlb
|
||||
+ " " + Default.ucd.getName(after);
|
||||
|
||||
out.println(Utility.hex(before + filler)
|
||||
+ "; " + tlb
|
||||
+ "; " + Utility.hex(after)
|
||||
+ comment);
|
||||
}
|
||||
|
||||
public static void findSamples() {
|
||||
for (int i = 1; i <= 0x10FFFF; ++i) {
|
||||
if (!Default.ucd.isAllocated(i)) continue;
|
||||
if (Default.ucd.isLeadingJamo(i)
|
||||
|| Default.ucd.isVowelJamo(i)
|
||||
|| Default.ucd.isTrailingJamo(i)) continue;
|
||||
byte lb = Default.ucd.getLineBreak(i);
|
||||
if (samples[lb] == null) {
|
||||
samples[lb] = UTF16.valueOf(i);
|
||||
}
|
||||
}
|
||||
// fill the last with special cases
|
||||
samples[LB_LIMIT] = "\u1100";
|
||||
samples[LB_LIMIT+1] = "\u1162";
|
||||
samples[LB_LIMIT+2] = "\u11A8";
|
||||
}
|
||||
|
||||
|
||||
public static String getLBID(String s) {
|
||||
if (s.length() == 1) return Default.ucd.getLineBreakID(s.charAt(0));
|
||||
StringBuffer result = new StringBuffer();
|
||||
int cp;
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
if (i > 0) result.append(" ");
|
||||
result.append(Default.ucd.getLineBreakID(cp));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
static String rule;
|
||||
|
||||
public static int findLastNon(String source, int offset, byte notLBType) {
|
||||
int cp;
|
||||
for (int i = offset-2; i >= 0; i -= UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(source, i);
|
||||
byte f = getResolvedLB(cp);
|
||||
if (f != notLBType) return cp;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
public static byte getResolvedLB (int cp) {
|
||||
// LB 1 Assign a line break category to each character of the input.
|
||||
// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
|
||||
byte result = Default.ucd.getLineBreak(cp);
|
||||
switch (result) {
|
||||
case LB_AI: result = LB_AI; break;
|
||||
// case LB_CB: result = LB_ID; break;
|
||||
case LB_SA: result = LB_AL; break;
|
||||
// case LB_SG: result = LB_XX; break; Surrogates; will never occur
|
||||
case LB_XX: result = LB_AL; break;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// find out whether there is a break at offset
|
||||
// WARNING: as a side effect, sets "rule"
|
||||
|
||||
public static boolean isBreak(String source, int offset, boolean recommended) {
|
||||
|
||||
// LB 1 Assign a line break category to each character of the input.
|
||||
// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
|
||||
// this is taken care of in the getResolvedLB function
|
||||
|
||||
// LB 2a Never break at the start of text
|
||||
|
||||
rule="2a";
|
||||
if (offset <= 0) return false;
|
||||
|
||||
// LB 2b Always break at the end of text
|
||||
|
||||
rule="2b";
|
||||
if (offset >= source.length()) return true;
|
||||
|
||||
|
||||
// UTF-16: never break in the middle of a code point
|
||||
if (UTF16.isLeadSurrogate(source.charAt(offset-1))
|
||||
&& UTF16.isTrailSurrogate(source.charAt(offset))) return false;
|
||||
|
||||
|
||||
// now get the character before and after, and their types
|
||||
|
||||
|
||||
int cpBefore = UTF16.charAt(source, offset-1);
|
||||
int cpAfter = UTF16.charAt(source, offset);
|
||||
|
||||
byte before = getResolvedLB(cpBefore);
|
||||
byte after = getResolvedLB(cpAfter);
|
||||
|
||||
|
||||
rule="3a";
|
||||
// Always break after hard line breaks (but never between CR and LF).
|
||||
// CR ^ LF
|
||||
if (before == LB_CR && after == LB_LF) return false;
|
||||
if (before == LB_BK || before == LB_LF || before == LB_CR) return true;
|
||||
|
||||
//LB 3b Don’t break before hard line breaks.
|
||||
rule="3b";
|
||||
if (after == LB_BK || after == LB_LF | after == LB_CR) return false;
|
||||
|
||||
// LB 4 Don’t break before spaces or zero-width space.
|
||||
// × SP
|
||||
// × ZW
|
||||
|
||||
rule="4";
|
||||
if (after == LB_SP || after == LB_ZW) return false;
|
||||
|
||||
// LB 5 Break after zero-width space.
|
||||
// ZW ÷
|
||||
rule="5";
|
||||
if (before == LB_ZW) return true;
|
||||
|
||||
// LB 6 Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos.
|
||||
rule="6";
|
||||
if (after == LB_CM) return false;
|
||||
if (Default.ucd.isLeadingJamo(cpBefore)) {
|
||||
if (Default.ucd.isLeadingJamo(cpAfter) || Default.ucd.isVowelJamo(cpAfter)) return false;
|
||||
} else if (Default.ucd.isVowelJamo(cpBefore)) {
|
||||
if (Default.ucd.isVowelJamo(cpAfter) || Default.ucd.isTrailingJamo(cpAfter)) return false;
|
||||
} else if (Default.ucd.isTrailingJamo(cpBefore)) {
|
||||
if (Default.ucd.isTrailingJamo(cpAfter)) return false;
|
||||
}
|
||||
|
||||
boolean setBase = false;
|
||||
if (before == LB_CM) {
|
||||
setBase = true;
|
||||
int cp = findLastNon(source, offset, LB_CM);
|
||||
if (cp == 0) {
|
||||
before = LB_ID;
|
||||
} else {
|
||||
before = getResolvedLB(cp);
|
||||
}
|
||||
}
|
||||
|
||||
// LB 7 In all of the following rules, if a space is the base character for a combining mark,
|
||||
// the space is changed to type ID. In other words, break before SP CM* in the same cases as
|
||||
// one would break before an ID.
|
||||
rule="7";
|
||||
if (setBase && before == LB_SP) before = LB_ID;
|
||||
|
||||
// LB 8 Don’t break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces.
|
||||
// × CL, × EX, × IS, × SY
|
||||
rule="8";
|
||||
if (after == LB_CL || after == LB_EX || after == LB_SY | after == LB_IS) return false;
|
||||
|
||||
|
||||
// find the last non-space character; we will need it
|
||||
byte lastNonSpace = before;
|
||||
if (lastNonSpace == LB_SP) {
|
||||
int cp = findLastNon(source, offset, LB_CM);
|
||||
if (cp != 0) {
|
||||
lastNonSpace = getResolvedLB(cp);
|
||||
}
|
||||
}
|
||||
|
||||
// LB 9 Don’t break after ‘[’, even after spaces.
|
||||
// OP SP* ×
|
||||
rule="9";
|
||||
if (lastNonSpace == LB_OP) return false;
|
||||
|
||||
// LB 10 Don’t break within ‘”[’, , even with intervening spaces.
|
||||
// QU SP* × OP
|
||||
rule="10";
|
||||
if (lastNonSpace == LB_QU && after == LB_OP) return false;
|
||||
|
||||
// LB 11 Don’t break within ‘]h’, even with intervening spaces.
|
||||
// CL SP* × NS
|
||||
rule="11";
|
||||
if (lastNonSpace == LB_CL && after == LB_NS) return false;
|
||||
|
||||
// LB 11a Don’t break within ‘——’, even with intervening spaces.
|
||||
// B2 × B2
|
||||
rule="11a";
|
||||
if (lastNonSpace == LB_B2 && after == LB_B2) return false;
|
||||
|
||||
|
||||
if (recommended) {
|
||||
// LB 13 Don’t break before or after NBSP or WORD JOINER
|
||||
// × GL
|
||||
// GL ×
|
||||
|
||||
rule="11b";
|
||||
if (after == LB_GL || before == LB_GL) return false;
|
||||
}
|
||||
|
||||
// [Note: by this time, all of the "X" in the table are accounted for. We can safely break after spaces.]
|
||||
|
||||
rule="12";
|
||||
// LB 12 Break after spaces
|
||||
// SP ÷
|
||||
|
||||
if (before == LB_SP) return true;
|
||||
|
||||
if (!recommended) {
|
||||
// LB 13 Don’t break before or after NBSP or WORD JOINER
|
||||
// × GL
|
||||
// GL ×
|
||||
|
||||
rule="13";
|
||||
if (after == LB_GL || before == LB_GL) return false;
|
||||
}
|
||||
|
||||
rule="14";
|
||||
// LB 14 Don’t break before or after ‘”’
|
||||
// × QU
|
||||
// QU ×
|
||||
if (before == LB_QU || after == LB_QU) return false;
|
||||
|
||||
// LB 15 Don’t break before hyphen-minus, other hyphens, fixed-width spaces,
|
||||
// small kana and other non- starters, or after acute accents:
|
||||
// × BA
|
||||
// × HY
|
||||
// × NS
|
||||
// BB ×
|
||||
|
||||
if (recommended) {
|
||||
// LB 14a Break before and after CB
|
||||
// CB ÷
|
||||
// ÷ CB
|
||||
if (before == LB_CB || after == LB_CB) return true;
|
||||
|
||||
}
|
||||
|
||||
rule="15";
|
||||
if (after == LB_NS) return false;
|
||||
if (after == LB_HY) return false;
|
||||
if (after == LB_BA) return false;
|
||||
if (before == LB_BB) return false;
|
||||
|
||||
if (!recommended) {
|
||||
// LB 15b Break after hyphen-minus, and before acute accents:
|
||||
// HY ÷
|
||||
// ÷ BB
|
||||
|
||||
rule="15b";
|
||||
if (before == LB_HY) return true;
|
||||
if (after == LB_BB) return true;
|
||||
}
|
||||
|
||||
// LB 16 Don’t break between two ellipses, or between letters or numbers and ellipsis:
|
||||
// AL × IN
|
||||
// ID × IN
|
||||
// IN × IN
|
||||
// NU × IN
|
||||
// Examples: ’9...’, ‘a...’, ‘H...’
|
||||
rule="16";
|
||||
if ((before == LB_NU || before == LB_AL || before == LB_ID) && after == LB_IN) return false;
|
||||
if (before == LB_IN && after == LB_IN) return false;
|
||||
|
||||
// Don't break alphanumerics.
|
||||
// LB 17 Don’t break within ‘a9’, ‘3a’, or ‘H%’
|
||||
// ID × PO
|
||||
// AL × NU
|
||||
// NU × AL
|
||||
// Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ? PO ?
|
||||
// Examples: $(12.35) 2,1234 (12)¢ 12.54¢
|
||||
// This is approximated with the following rules. (Some cases already handled above,
|
||||
// like ‘9,’, ‘[9’.)
|
||||
rule="17";
|
||||
if (before == LB_ID && after == LB_PO) return false;
|
||||
if (before == LB_AL && after == LB_NU) return false;
|
||||
if (before == LB_NU && after == LB_AL) return false;
|
||||
|
||||
// LB 18 Don’t break between the following pairs of classes.
|
||||
// CL × PO
|
||||
// HY × NU
|
||||
// IS × NU
|
||||
// NU × NU
|
||||
// NU × PO
|
||||
// PR × AL
|
||||
// PR × HY
|
||||
// PR × ID
|
||||
// PR × NU
|
||||
// PR × OP
|
||||
// SY × NU
|
||||
// Example pairs: ‘$9’, ‘$[’, ‘$-‘, ‘-9’, ‘/9’, ‘99’, ‘,9’, ‘9%’ ‘]%’
|
||||
|
||||
rule="18";
|
||||
if (before == LB_CL && after == LB_PO) return false;
|
||||
if (before == LB_HY && after == LB_NU) return false;
|
||||
if (before == LB_IS && after == LB_NU) return false;
|
||||
if (before == LB_NU && after == LB_NU) return false;
|
||||
if (before == LB_NU && after == LB_PO) return false;
|
||||
|
||||
if (before == LB_PR && after == LB_AL) return false;
|
||||
if (before == LB_PR && after == LB_HY) return false;
|
||||
if (before == LB_PR && after == LB_ID) return false;
|
||||
if (before == LB_PR && after == LB_NU) return false;
|
||||
if (before == LB_PR && after == LB_OP) return false;
|
||||
|
||||
if (before == LB_SY && after == LB_NU) return false;
|
||||
|
||||
if (recommended) {
|
||||
// LB 15b Break after hyphen-minus, and before acute accents:
|
||||
// HY ÷
|
||||
// ÷ BB
|
||||
|
||||
rule="18b";
|
||||
if (before == LB_HY) return true;
|
||||
if (after == LB_BB) return true;
|
||||
}
|
||||
|
||||
// LB 19 Don’t break between alphabetics (“at”)
|
||||
// AL × AL
|
||||
|
||||
rule="19";
|
||||
if (before == LB_AL && after == LB_AL) return false;
|
||||
|
||||
// LB 20 Break everywhere else
|
||||
// ALL ÷
|
||||
// ÷ ALL
|
||||
|
||||
rule="20";
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java,v $
|
||||
* $Date: 2002/04/23 22:50:15 $
|
||||
* $Revision: 1.1 $
|
||||
* $Date: 2002/07/30 09:56:41 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -81,7 +81,7 @@ public class GenerateThaiBreaks {
|
|||
System.out.println("initials size: " + initials.size());
|
||||
System.out.println("finals size: " + finals.size());
|
||||
|
||||
out = Utility.openPrintWriter("ThaiData.txt", false, false);
|
||||
out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS);
|
||||
out.write('\uFEFF');
|
||||
out.println("Only Initials");
|
||||
Utility.print(out, initials, ", ", new MyBreaker());
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
|
||||
* $Date: 2002/07/14 22:04:49 $
|
||||
* $Revision: 1.18 $
|
||||
* $Date: 2002/07/30 09:56:41 $
|
||||
* $Revision: 1.19 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -59,7 +59,7 @@ public final class Main implements UCD_Types {
|
|||
} else if (arg.equalsIgnoreCase("build")) ConvertUCD.main(new String[]{Default.ucdVersion});
|
||||
else if (arg.equalsIgnoreCase("version")) Default.setUCD(args[++i]);
|
||||
else if (arg.equalsIgnoreCase("statistics")) VerifyUCD.statistics();
|
||||
else if (arg.equalsIgnoreCase("testskippable")) NFSkippable.main(null);
|
||||
else if (arg.equalsIgnoreCase("NFSkippable")) NFSkippable.main(null);
|
||||
else if (arg.equalsIgnoreCase("diffIgnorable")) VerifyUCD.diffIgnorable();
|
||||
else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML();
|
||||
else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
|
||||
|
@ -77,7 +77,12 @@ public final class Main implements UCD_Types {
|
|||
else if (arg.equalsIgnoreCase("Buildnames")) BuildNames.main(null);
|
||||
else if (arg.equalsIgnoreCase("TestNormalization")) TestNormalization.main(null);
|
||||
|
||||
|
||||
else if (arg.equalsIgnoreCase("linebreaktest")) GenerateLineBreakTest.main(null);
|
||||
|
||||
else if (arg.equalsIgnoreCase("genSplit")) GenerateData.genSplit();
|
||||
else if (arg.equalsIgnoreCase("iana")) IANANames.testSensitivity();
|
||||
|
||||
else if (arg.equalsIgnoreCase("testDerivedProperties")) DerivedProperty.test();
|
||||
else if (arg.equalsIgnoreCase("checkCase")) VerifyUCD.checkCase();
|
||||
else if (arg.equalsIgnoreCase("checkCase3")) VerifyUCD.checkCase3();
|
||||
|
|
|
@ -12,14 +12,15 @@ public final class NFSkippable extends UnicodeProperty {
|
|||
|
||||
private Normalizer nf;
|
||||
private Normalizer nfd;
|
||||
private UCD ucd;
|
||||
private boolean composes;
|
||||
private int[] realTrailers = new int[100];
|
||||
private int realTrailerCount = 0;
|
||||
|
||||
public NFSkippable(byte normalizerMode, String unicodeVersion) {
|
||||
public NFSkippable(byte normalizerMode, UCD inputUCD) {
|
||||
isStandard = false;
|
||||
ucd = UCD.make(unicodeVersion);
|
||||
nf = new Normalizer(normalizerMode, unicodeVersion);
|
||||
this.ucd = inputUCD;
|
||||
nf = new Normalizer(normalizerMode, ucd.getVersion());
|
||||
name = nf.getName() + "_Skippable";
|
||||
shortName = nf.getName() + "_Skip";
|
||||
header = "# Derived Property: " + name
|
||||
|
@ -28,7 +29,7 @@ public final class NFSkippable extends UnicodeProperty {
|
|||
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
|
||||
+ "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
|
||||
|
||||
nfd = new Normalizer(Normalizer.NFD, unicodeVersion);
|
||||
nfd = new Normalizer(Normalizer.NFD, ucd.getVersion());
|
||||
composes = normalizerMode == Normalizer.NFC || normalizerMode == Normalizer.NFKC;
|
||||
|
||||
// preprocess to find possible trailers
|
||||
|
@ -36,7 +37,7 @@ public final class NFSkippable extends UnicodeProperty {
|
|||
if (composes) for (int cp2 = 0; cp2 <= 0x10FFFF; ++cp2) {
|
||||
if (nf.isTrailing(cp2)) {
|
||||
//System.out.println("Trailing: " + ucd.getCodeAndName(cp2));
|
||||
if (ucd.isTrailingJamo(cp2)) {
|
||||
if (ucd.isNonLeadJamo(cp2)) {
|
||||
//System.out.println("Jamo: " + ucd.getCodeAndName(cp2));
|
||||
continue;
|
||||
}
|
||||
|
@ -190,18 +191,21 @@ public final class NFSkippable extends UnicodeProperty {
|
|||
static int limit = 0x10FFFF; // full version = 10ffff, for testing may use smaller
|
||||
|
||||
public static void main (String[] args) throws java.io.IOException {
|
||||
Default.setUCD();
|
||||
|
||||
String version = ""; // Unicode version, "" = latest released
|
||||
|
||||
PrintWriter out = Utility.openPrintWriter("NFSafeSets.txt");
|
||||
PrintWriter out = Utility.openPrintWriter("NFSafeSets.txt", Utility.UTF8_WINDOWS);
|
||||
out.println("NFSafeSets");
|
||||
out.println("Version: " + Default.ucd.getVersion());
|
||||
out.println("Date: " + Default.getDate());
|
||||
out.println();
|
||||
|
||||
for (int mode = NFD_UnsafeStart; mode <= NFKC_UnsafeStart; ++mode) {
|
||||
UnicodeProperty up = DerivedProperty.make(mode, UCD.make(version));
|
||||
UnicodeProperty up = DerivedProperty.make(mode, Default.ucd);
|
||||
generateSet(out, "UNSAFE[" + Normalizer.getName((byte)(mode-NFD_UnsafeStart)) + "]", up);
|
||||
}
|
||||
|
||||
for (byte mode = NFD; mode <= NFKC; ++mode) {
|
||||
NFSkippable skipper = new NFSkippable(mode,version);
|
||||
NFSkippable skipper = new NFSkippable(mode, Default.ucd);
|
||||
generateSet(out, "SKIPPABLE[" + Normalizer.getName(mode) + "]", skipper);
|
||||
}
|
||||
|
||||
|
@ -219,9 +223,9 @@ public final class NFSkippable extends UnicodeProperty {
|
|||
|
||||
String rSet = result.toPattern(true);
|
||||
rSet = replace(rSet, "\\U", "\\\\U");
|
||||
rSet = replace(rSet, "\\u", "\\\\u");
|
||||
out.println(label + " = new UnicodeSet(");
|
||||
writeStringInPieces(out, rSet, ", false);");
|
||||
out.println();
|
||||
|
||||
rSet = result.toPattern(false);
|
||||
out.println("/*Unicode: ");
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
|
||||
* $Date: 2002/06/24 15:25:10 $
|
||||
* $Revision: 1.12 $
|
||||
* $Date: 2002/07/30 09:56:41 $
|
||||
* $Revision: 1.13 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -271,7 +271,7 @@ public final class Normalizer implements UCD_Types {
|
|||
}
|
||||
for (int i = UCD.LBase; i < UCD.TLimit; ++i) {
|
||||
if (leading != null && UCD.isLeadingJamo(i)) leading.set(i); // set all initial Jamo (that form syllables)
|
||||
if (trailing != null && UCD.isTrailingJamo(i)) trailing.set(i); // set all final Jamo (that form syllables)
|
||||
if (trailing != null && UCD.isNonLeadJamo(i)) trailing.set(i); // set all final Jamo (that form syllables)
|
||||
}
|
||||
if (leading != null) {
|
||||
for (int i = UCD.SBase; i < UCD.SLimit; ++i) {
|
||||
|
@ -407,7 +407,7 @@ public final class Normalizer implements UCD_Types {
|
|||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if (!ucd.isAssigned(i)) continue;
|
||||
if (ucd.isPUA(i)) continue;
|
||||
if (ucd.isTrailingJamo(i)) isSecond.set(i);
|
||||
if (ucd.isNonLeadJamo(i)) isSecond.set(i);
|
||||
if (ucd.isLeadingJamoComposition(i)) isFirst.set(i);
|
||||
byte dt = ucd.getDecompositionType(i);
|
||||
if (dt != CANONICAL) continue;
|
||||
|
|
|
@ -246,7 +246,7 @@ public class NormalizerSample implements UCD_Types {
|
|||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if (!ucd.isAssigned(i)) continue;
|
||||
if (ucd.isPUA(i)) continue;
|
||||
if (ucd.isTrailingJamo(i)) isSecond.set(i);
|
||||
if (ucd.isNonLeadJamo(i)) isSecond.set(i);
|
||||
byte dt = ucd.getDecompositionType(i);
|
||||
if (dt != CANONICAL) continue;
|
||||
if (!ucd.getBinaryProperty(i, CompositionExclusion)) {
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
|
||||
* $Date: 2001/12/13 23:35:57 $
|
||||
* $Revision: 1.8 $
|
||||
* $Date: 2002/07/30 09:56:40 $
|
||||
* $Revision: 1.9 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -126,12 +126,6 @@ public class TestData implements UCD_Types {
|
|||
}
|
||||
|
||||
|
||||
static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd' 'HH:mm:ss.S' GMT'");
|
||||
|
||||
static {
|
||||
myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
|
||||
}
|
||||
|
||||
//Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names
|
||||
|
||||
public static String fixFile(String s) {
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
|
||||
* $Date: 2002/06/22 01:21:09 $
|
||||
* $Revision: 1.15 $
|
||||
* $Date: 2002/07/30 09:56:40 $
|
||||
* $Revision: 1.16 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -1170,14 +1170,22 @@ to guarantee identifier closure.
|
|||
&& ((char1 - SBase) % TCount) == 0);
|
||||
}
|
||||
|
||||
static boolean isVowelJamo(int cp) {
|
||||
return (VBase <= cp && cp < VLimit);
|
||||
}
|
||||
|
||||
static boolean isTrailingJamo(int cp) {
|
||||
return (VBase <= cp && cp < VLimit) || (TBase <= cp && cp < TLimit);
|
||||
return (TBase <= cp && cp < TLimit);
|
||||
}
|
||||
|
||||
static boolean isLeadingJamo(int cp) {
|
||||
return (LBase <= cp && cp < LLimit);
|
||||
}
|
||||
|
||||
static boolean isNonLeadJamo(int cp) {
|
||||
return (VBase <= cp && cp < VLimit) || (TBase <= cp && cp < TLimit);
|
||||
}
|
||||
|
||||
private void fillFromFile(String version) {
|
||||
try {
|
||||
fillFromFile2(version);
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
|
||||
* $Date: 2002/06/15 02:47:13 $
|
||||
* $Revision: 1.13 $
|
||||
* $Date: 2002/07/30 09:56:40 $
|
||||
* $Revision: 1.14 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -229,10 +229,12 @@ public interface UCD_Types {
|
|||
|
||||
// line break
|
||||
public static final byte
|
||||
LBXX = 0, LBOP = 1, LBCL = 2, LBQU = 3, LBGL = 4, LBNS = 5, LBEX = 6, LBSY = 7,
|
||||
LBIS = 8, LBPR = 9, LBPO = 10, LBNU = 11, LBAL = 12, LBID = 13, LBIN = 14, LBHY = 15,
|
||||
LBCM = 16, LBBB = 17, LBBA = 18, LBSP = 19, LBBK = 20, LBCR = 21, LBLF = 22, LBCB = 23,
|
||||
LBSA = 24, LBAI = 25, LBB2 = 26, LBSG = 27, LBZW = 28, LIMIT_LINE_BREAK = 29;
|
||||
LB_XX = 0, LB_OP = 1, LB_CL = 2, LB_QU = 3, LB_GL = 4, LB_NS = 5, LB_EX = 6, LB_SY = 7,
|
||||
LB_IS = 8, LB_PR = 9, LB_PO = 10, LB_NU = 11, LB_AL = 12, LB_ID = 13, LB_IN = 14, LB_HY = 15,
|
||||
LB_CM = 16, LB_BB = 17, LB_BA = 18, LB_SP = 19, LB_BK = 20, LB_CR = 21, LB_LF = 22, LB_CB = 23,
|
||||
LB_SA = 24, LB_AI = 25, LB_B2 = 26, LB_SG = 27, LB_ZW = 28,
|
||||
LIMIT_LINE_BREAK = 29,
|
||||
LB_LIMIT = LIMIT_LINE_BREAK;
|
||||
|
||||
// east asian width
|
||||
public static final byte
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $
|
||||
* $Date: 2002/06/15 02:47:12 $
|
||||
* $Revision: 1.5 $
|
||||
* $Date: 2002/07/30 09:56:40 $
|
||||
* $Revision: 1.6 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -42,7 +42,7 @@ class UData implements UCD_Types {
|
|||
byte numericType = NUMERIC_NONE;
|
||||
|
||||
byte eastAsianWidth = EAN;
|
||||
byte lineBreak = LBXX;
|
||||
byte lineBreak = LB_XX;
|
||||
byte joiningType = JT_U;
|
||||
byte joiningGroup = NO_SHAPING;
|
||||
byte script = COMMON_SCRIPT;
|
||||
|
@ -196,7 +196,7 @@ class UData implements UCD_Types {
|
|||
if (full || !Double.isNaN(numericValue)) result.append(" nv='").append(numericValue).append('\'');
|
||||
|
||||
if (full || eastAsianWidth != EAN) result.append(" ea='").append(UCD_Names.EA[eastAsianWidth]).append('\'');
|
||||
if (full || lineBreak != LBAL) result.append(" lb='").append(UCD_Names.LB[lineBreak]).append('\'');
|
||||
if (full || lineBreak != LB_AL) result.append(" lb='").append(UCD_Names.LB[lineBreak]).append('\'');
|
||||
if (full || joiningType != JT_U) result.append(" jt='").append(UCD_Names.JOINING_TYPE[joiningType]).append('\'');
|
||||
if (full || joiningGroup != NO_SHAPING) result.append(" jg='").append(UCD_Names.JOINING_GROUP[joiningGroup]).append('\'');
|
||||
if (full || age != 0) result.append(" ag='").append(UCD_Names.AGE[age]).append('\'');
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
|
||||
* $Date: 2002/06/22 01:21:09 $
|
||||
* $Revision: 1.17 $
|
||||
* $Date: 2002/07/30 09:56:40 $
|
||||
* $Revision: 1.18 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -551,7 +551,7 @@ can help you narrow these down.
|
|||
static void generateXML() throws IOException {
|
||||
Default.setUCD();
|
||||
String filename = "UCD.xml";
|
||||
PrintWriter log = Utility.openPrintWriter(filename);
|
||||
PrintWriter log = Utility.openPrintWriter(filename, Utility.LATIN1_UNIX);
|
||||
|
||||
//log.println('\uFEFF');
|
||||
log.println("<ucd>");
|
||||
|
@ -580,14 +580,14 @@ can help you narrow these down.
|
|||
|
||||
String ttest = Default.ucd.getCase(test, FULL, TITLE);
|
||||
|
||||
PrintWriter titleTest = Utility.openPrintWriter("TestTitle.txt");
|
||||
PrintWriter titleTest = Utility.openPrintWriter("TestTitle.txt", Utility.LATIN1_UNIX);
|
||||
titleTest.println(test);
|
||||
titleTest.println(ttest);
|
||||
titleTest.close();
|
||||
|
||||
System.out.println(Default.ucd.getCase("ABC,DE'F G\u0308H", FULL, TITLE));
|
||||
String fileName = "CaseDifferences.txt";
|
||||
PrintWriter log = Utility.openPrintWriter(fileName);
|
||||
PrintWriter log = Utility.openPrintWriter(fileName, Utility.LATIN1_UNIX);
|
||||
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
|
@ -648,7 +648,7 @@ can help you narrow these down.
|
|||
|
||||
|
||||
String fileName = "CaseNormalizationDifferences.txt";
|
||||
PrintWriter log = Utility.openPrintWriter(fileName);
|
||||
PrintWriter log = Utility.openPrintWriter(fileName, Utility.LATIN1_UNIX);
|
||||
|
||||
log.println("Differences between case(normalize(cp)) and normalize(case(cp))");
|
||||
log.println("u, l, t - upper, lower, title");
|
||||
|
@ -1069,7 +1069,7 @@ can help you narrow these down.
|
|||
System.out.println("Writing IDNCheck.txt");
|
||||
|
||||
|
||||
PrintWriter log = Utility.openPrintWriter("IDNCheck.txt");
|
||||
PrintWriter log = Utility.openPrintWriter("IDNCheck.txt", Utility.LATIN1_UNIX);
|
||||
log.println("IDN Check");
|
||||
log.println("Total Errors: " + errorCount);
|
||||
|
||||
|
@ -1124,7 +1124,7 @@ can help you narrow these down.
|
|||
public static void genIDN() throws IOException {
|
||||
PrintWriter out = new PrintWriter(System.out);
|
||||
Default.setUCD();
|
||||
PrintWriter log = Utility.openPrintWriter("IDN-tables.txt");
|
||||
PrintWriter log = Utility.openPrintWriter("IDN-tables.txt", Utility.LATIN1_UNIX);
|
||||
|
||||
/*UnicodeSet y = UnifiedBinaryProperty.make(CATEGORY + FORMAT).getSet();
|
||||
UnicodeSet x = new UnicodeSet(0xE0001,0xE007F).retainAll(y);
|
||||
|
@ -1906,7 +1906,7 @@ E0020-E007F; [TAGGING CHARACTERS]
|
|||
}
|
||||
}
|
||||
|
||||
PrintWriter log = Utility.openPrintWriter("CheckScriptsLog.txt");
|
||||
PrintWriter log = Utility.openPrintWriter("CheckScriptsLog.txt", Utility.LATIN1_UNIX);
|
||||
|
||||
Iterator it = m.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/WriteJavaScriptInfo.java,v $
|
||||
* $Date: 2001/10/25 20:33:46 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2002/07/30 09:56:40 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -21,7 +21,7 @@ import com.ibm.text.utility.*;
|
|||
public class WriteJavaScriptInfo implements UCD_Types {
|
||||
|
||||
static public void assigned() throws IOException {
|
||||
PrintWriter log = Utility.openPrintWriter("assigned.js");
|
||||
PrintWriter log = Utility.openPrintWriter("assigned.js", Utility.LATIN1_UNIX);
|
||||
UCD ucd = UCD.make();
|
||||
boolean wasIn = false;
|
||||
int lastWritten = -100;
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
|
||||
* $Date: 2002/07/21 08:43:39 $
|
||||
* $Revision: 1.22 $
|
||||
* $Date: 2002/07/30 09:56:41 $
|
||||
* $Revision: 1.23 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -531,14 +531,22 @@ public final class Utility { // COMMON UTILITIES
|
|||
"1.1.0",
|
||||
};
|
||||
|
||||
public static PrintWriter openPrintWriter(String filename) throws IOException {
|
||||
return openPrintWriter(filename, true, true);
|
||||
/*public static PrintWriter openPrintWriter(String filename) throws IOException {
|
||||
return openPrintWriter(filename, LATIN1_UNIX);
|
||||
}
|
||||
*/
|
||||
|
||||
static final byte WINDOWS_MASK = 1, UTF8_MASK = 2;
|
||||
public static final byte
|
||||
LATIN1_UNIX = 0,
|
||||
LATIN1_WINDOWS = WINDOWS_MASK,
|
||||
UTF8_UNIX = UTF8_MASK,
|
||||
UTF8_WINDOWS = UTF8_MASK | WINDOWS_MASK;
|
||||
|
||||
// Normally use false, false.
|
||||
// But for UCD files use true, true
|
||||
// Or if they are UTF8, use true, false
|
||||
public static PrintWriter openPrintWriter(String filename, boolean removeCR, boolean latin1) throws IOException {
|
||||
public static PrintWriter openPrintWriter(String filename, byte options) throws IOException {
|
||||
File file = new File(getOutputName(filename));
|
||||
System.out.println("Creating File: " + file);
|
||||
File parent = new File(file.getParent());
|
||||
|
@ -548,7 +556,7 @@ public final class Utility { // COMMON UTILITIES
|
|||
new UTF8StreamWriter(
|
||||
new FileOutputStream(file),
|
||||
32*1024,
|
||||
removeCR, latin1));
|
||||
(options & WINDOWS_MASK) == 0, (options & UTF8_MASK) == 0));
|
||||
}
|
||||
|
||||
public static String getOutputName(String filename) {
|
||||
|
@ -606,7 +614,7 @@ public final class Utility { // COMMON UTILITIES
|
|||
}
|
||||
|
||||
public static void addToSet(Map m, Object key, Object value) {
|
||||
Set set = (Set) m.get(key);
|
||||
Collection set = (Collection) m.get(key);
|
||||
if (set == null) {
|
||||
set = new TreeSet();
|
||||
m.put(key, set);
|
||||
|
@ -614,6 +622,15 @@ public final class Utility { // COMMON UTILITIES
|
|||
set.add(value);
|
||||
}
|
||||
|
||||
public static void addToList(Map m, Object key, Object value, boolean unique) {
|
||||
Collection set = (Collection) m.get(key);
|
||||
if (set == null) {
|
||||
set = new ArrayList();
|
||||
m.put(key, set);
|
||||
}
|
||||
if (!unique || !set.contains(value)) set.add(value);
|
||||
}
|
||||
|
||||
public static String readDataLine(BufferedReader br) throws IOException {
|
||||
String originalLine = "";
|
||||
String line = "";
|
||||
|
@ -724,7 +741,7 @@ public final class Utility { // COMMON UTILITIES
|
|||
}
|
||||
|
||||
public static void copyTextFile(String filename, boolean utf8, String newName, String[] replacementList) throws IOException {
|
||||
PrintWriter out = Utility.openPrintWriter(newName, false, false);
|
||||
PrintWriter out = Utility.openPrintWriter(newName, UTF8_WINDOWS);
|
||||
appendFile(filename, utf8, out, replacementList);
|
||||
out.close();
|
||||
}
|
||||
|
@ -834,10 +851,12 @@ public final class Utility { // COMMON UTILITIES
|
|||
return "Showing Stack with fake " + sw.getBuffer().toString();
|
||||
}
|
||||
|
||||
static PrintWriter showSetNamesPw;
|
||||
|
||||
public static void showSetNames(String prefix, UnicodeSet set, boolean separateLines, UCD ucd) {
|
||||
PrintWriter temp = new PrintWriter(System.out);
|
||||
showSetNames(temp, prefix, set, separateLines, false, ucd);
|
||||
temp.close();
|
||||
if (showSetNamesPw == null) showSetNamesPw = new PrintWriter(System.out);
|
||||
showSetNames(showSetNamesPw, prefix, set, separateLines, false, ucd);
|
||||
showSetNamesPw.flush();
|
||||
}
|
||||
|
||||
public static void showSetNames(PrintWriter pw, String prefix, UnicodeSet set, boolean separateLines, boolean IDN, UCD ucd) {
|
||||
|
|
Loading…
Add table
Reference in a new issue