Changes for generating linebreak test

X-SVN-Rev: 9433
This commit is contained in:
Mark Davis 2002-07-30 09:57:18 +00:00
parent a5e7872567
commit 73cd203e91
18 changed files with 750 additions and 136 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/BuildNames.java,v $
* $Date: 2002/06/13 21:14:05 $
* $Revision: 1.6 $
* $Date: 2002/07/30 09:56:41 $
* $Revision: 1.7 $
*
*******************************************************************************
*/
@ -142,7 +142,7 @@ public class BuildNames implements UCD_Types {
String fname = "ShortNames.txt";
System.out.println("Writing " + fname);
PrintWriter log = Utility.openPrintWriter(fname, false, true);
PrintWriter log = Utility.openPrintWriter(fname, Utility.LATIN1_WINDOWS);
System.out.println("Gathering data");
//Counter counter = new Counter();

View file

@ -1,6 +1,10 @@
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
import java.util.Date;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.TimeZone;
public final class Default implements UCD_Types {
@ -25,5 +29,14 @@ public final class Default implements UCD_Types {
nfkc = nf[NFKC] = new Normalizer(Normalizer.NFKC, ucdVersion);
System.out.println("Loaded UCD" + ucd.getVersion() + " " + (new Date(ucd.getDate())));
}
/**
 * Shared formatter for generation time stamps: pattern
 * yyyy-MM-dd,HH:mm:ss followed by the literal text " GMT", evaluated in
 * the GMT time zone.
 */
static DateFormat myDateFormat;
static {
    SimpleDateFormat gmtStamp = new SimpleDateFormat("yyyy-MM-dd','HH:mm:ss' GMT'");
    gmtStamp.setTimeZone(TimeZone.getTimeZone("GMT"));
    myDateFormat = gmtStamp;
}

/**
 * Formats the current time with {@link #myDateFormat}.
 *
 * @return the current instant, e.g. "2002-07-30,09:57:18 GMT"
 */
public static String getDate() {
    Date now = new Date();
    return myDateFormat.format(now);
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
* $Date: 2002/05/31 01:41:04 $
* $Revision: 1.10 $
* $Date: 2002/07/30 09:56:41 $
* $Revision: 1.11 $
*
*******************************************************************************
*/
@ -41,7 +41,7 @@ public class GenerateCaseFolding implements UCD_Types {
PICK_SHORT = NF_CLOSURE = normalized;
Default.setUCD();
log = Utility.openPrintWriter("CaseFoldingLog" + GenerateData.getFileSuffix(true));
log = Utility.openPrintWriter("CaseFoldingLog" + GenerateData.getFileSuffix(true), Utility.LATIN1_UNIX);
System.out.println("Writing Log: " + "CaseFoldingLog" + GenerateData.getFileSuffix(true));
System.out.println("Making Full Data");
@ -57,7 +57,7 @@ public class GenerateCaseFolding implements UCD_Types {
if (normalized) filename += "-Normalized";
String directory = "DerivedData/";
String newFile = directory + filename + GenerateData.getFileSuffix(true);
PrintWriter out = Utility.openPrintWriter(newFile);
PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
String mostRecent = GenerateData.generateBat(directory, filename, GenerateData.getFileSuffix(true));
out.println("# CaseFolding" + GenerateData.getFileSuffix(false));
@ -444,7 +444,8 @@ public class GenerateCaseFolding implements UCD_Types {
String suffix2 = "";
if (normalize) suffix2 = "-Normalized";
PrintWriter log = Utility.openPrintWriter("SpecialCasingExceptions" + suffix2 + GenerateData.getFileSuffix(true));
PrintWriter log = Utility.openPrintWriter("SpecialCasingExceptions"
+ suffix2 + GenerateData.getFileSuffix(true), Utility.LATIN1_UNIX);
for (int ch = 0; ch <= 0x10FFFF; ++ch) {
Utility.dot(ch);
@ -555,7 +556,7 @@ public class GenerateCaseFolding implements UCD_Types {
System.out.println("Writing");
String newFile = "DerivedData/SpecialCasing" + suffix2 + GenerateData.getFileSuffix(true);
PrintWriter out = Utility.openPrintWriter(newFile);
PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
String mostRecent = GenerateData.generateBat("DerivedData/", "SpecialCasing", suffix2 + GenerateData.getFileSuffix(true));
out.println("# SpecialCasing" + GenerateData.getFileSuffix(false));
out.println(GenerateData.generateDateLine());

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
* $Date: 2002/07/14 22:04:49 $
* $Revision: 1.21 $
* $Date: 2002/07/30 09:56:41 $
* $Revision: 1.22 $
*
*******************************************************************************
*/
@ -15,8 +15,6 @@ package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
@ -28,6 +26,49 @@ public class GenerateData implements UCD_Types {
static final boolean DEBUG = false;
static final String HORIZONTAL_LINE = "# ================================================";
/**
 * Diagnostic listing of three groups of combining marks, printed to
 * System.out through Utility.showSetNames:
 * <ul>
 * <li>"Split": assigned marks (Mc, Mn, Me) whose NFD form is longer than
 *     one code point;</li>
 * <li>"Reordrant": a fixed, hand-written list of vowel signs;</li>
 * <li>"Subjoined": assigned marks whose name contains "SUBJOINED".</li>
 * </ul>
 * Marks counted as subjoined are not tested for the split set.
 */
static final void genSplit () {
    Default.setUCD();
    UnicodeSet split = new UnicodeSet();
    // The original literal listed U+0ABF twice; the entries are in code point
    // order and the Gurmukhi counterpart U+0A3F (VOWEL SIGN I) was the one
    // missing at that position, so the first duplicate is corrected to it.
    UnicodeSet reordrant = new UnicodeSet(
        "[\u093F\u09BF\u09c7\u09c8\u0a3f\u0abf\u0b47\u0bc6\u0bc7\u0bc8"
        + "\u0d46\u0d47\u0d48\u0dd9\u0dda\u0ddb\u1031\u17be\u17c1\u17c2\u17c3]");
    UnicodeSet subjoined = new UnicodeSet();

    for (int i = 0; i <= 0x10FFFF; ++i) {
        if (!Default.ucd.isAssigned(i)) continue;
        Utility.dot(i);
        int cat = Default.ucd.getCategory(i);
        // only marks are of interest
        if (cat != Mc && cat != Mn && cat != Me) continue;
        if (Default.ucd.getName(i).indexOf("SUBJOINED") >= 0) {
            System.out.print('*');
            subjoined.add(i);
            continue;
        }
        String decomp = Default.nfd.normalize(i);
        //int count = countTypes(decomp, Mc);
        if (UTF16.countCodePoint(decomp) > 1) split.add(i);
    }
    Utility.fixDot();

    System.out.println("Split: " + split.size());
    Utility.showSetNames("", split, false, Default.ucd);
    System.out.println("Reordrant: " + reordrant.size());
    Utility.showSetNames("", reordrant, false, Default.ucd);
    System.out.println("Subjoined: " + subjoined.size());
    Utility.showSetNames("", subjoined, false, Default.ucd);
}
/**
 * Counts the code points of a string that have a given general category.
 *
 * @param s      string to scan; it is walked by code point, so a surrogate
 *               pair is counted once
 * @param filter category value to match, compared against the result of
 *               Default.ucd.getCategory
 * @return number of code points in s whose category equals filter
 */
static int countTypes(String s, int filter) {
    int count = 0;
    int cp;
    for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
        cp = UTF16.charAt(s, i);
        // Classify the code point itself; the earlier code passed the string
        // index i here and so looked up the category of U+0000, U+0001, ...
        int cat = Default.ucd.getCategory(cp);
        if (cat == filter) count++;
    }
    return count;
}
//static UnifiedBinaryProperty ubp
@ -55,12 +96,6 @@ public class GenerateData implements UCD_Types {
}
static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd','HH:mm:ss' GMT'");
static {
myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
}
//Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names
public static String fixFile(String s) {
@ -108,7 +143,7 @@ public class GenerateData implements UCD_Types {
Default.setUCD();
String newFile = directory + fileName + getFileSuffix(true);
System.out.println("New File: " + newFile);
PrintWriter output = Utility.openPrintWriter(newFile);
PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
String mostRecent = generateBat(directory, fileName, getFileSuffix(true));
System.out.println("Most recent: " + mostRecent);
@ -156,7 +191,7 @@ public class GenerateData implements UCD_Types {
public static void generateCompExclusions() throws IOException {
Default.setUCD();
String newFile = "DerivedData/CompositionExclusions" + getFileSuffix(true);
PrintWriter output = Utility.openPrintWriter(newFile);
PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
String mostRecent = generateBat("DerivedData/", "CompositionExclusions", getFileSuffix(true));
output.println("# CompositionExclusions" + getFileSuffix(false));
@ -217,7 +252,7 @@ public class GenerateData implements UCD_Types {
}
static String generateDateLine() {
return "# Date: " + myDateFormat.format(new Date()) + " [MD]";
return "# Date: " + Default.getDate() + " [MD]";
}
static class CompLister extends PropertyLister {
@ -332,7 +367,7 @@ public class GenerateData implements UCD_Types {
Utility.fixDot();
System.out.println("Set Size: " + map.size());
PrintWriter output = Utility.openPrintWriter("Partition" + getFileSuffix(true));
PrintWriter output = Utility.openPrintWriter("Partition" + getFileSuffix(true), Utility.LATIN1_UNIX);
Iterator it = map.keySet().iterator();
while (it.hasNext()) {
@ -351,7 +386,7 @@ public class GenerateData implements UCD_Types {
public static void listDifferences() throws IOException {
Default.setUCD();
PrintWriter output = Utility.openPrintWriter("PropertyDifferences" + getFileSuffix(true));
PrintWriter output = Utility.openPrintWriter("PropertyDifferences" + getFileSuffix(true), Utility.LATIN1_UNIX);
output.println("# Listing of relationships among properties, suitable for analysis by spreadsheet");
output.println("# Generated for " + Default.ucd.getVersion());
output.println(generateDateLine());
@ -610,7 +645,7 @@ public class GenerateData implements UCD_Types {
String filename = "PropertyAliases";
String newFile = "DerivedData/" + filename + getFileSuffix(true);
PrintWriter log = Utility.openPrintWriter(newFile);
PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
String mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true));
log.println("# " + filename + getFileSuffix(false));
@ -626,7 +661,7 @@ public class GenerateData implements UCD_Types {
filename = "PropertyValueAliases";
newFile = "DerivedData/" + filename + getFileSuffix(true);
log = Utility.openPrintWriter(newFile);
log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
mostRecent = generateBat("DerivedData/", filename, getFileSuffix(true));
log.println("# " + filename + getFileSuffix(false));
@ -642,7 +677,7 @@ public class GenerateData implements UCD_Types {
filename = "PropertyAliasSummary";
newFile = "OtherData/" + filename + getFileSuffix(true);
log = Utility.openPrintWriter(newFile);
log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
mostRecent = generateBat("OtherData/", filename, getFileSuffix(true));
log.println();
log.println(HORIZONTAL_LINE);
@ -793,7 +828,7 @@ public class GenerateData implements UCD_Types {
}
public static void generateBatAux(String batName, String oldName, String newName) throws IOException {
PrintWriter output = Utility.openPrintWriter(batName + ".bat");
PrintWriter output = Utility.openPrintWriter(batName + ".bat", Utility.LATIN1_UNIX);
newName = Utility.getOutputName(newName);
System.out.println("Writing BAT to compare " + oldName + " and " + newName);
@ -812,7 +847,7 @@ public class GenerateData implements UCD_Types {
Default.setUCD();
String newFile = directory + file + getFileSuffix(true);
PrintWriter output = Utility.openPrintWriter(newFile);
PrintWriter output = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
String mostRecent = generateBat(directory, file, getFileSuffix(true));
doHeader(file + getFileSuffix(false), output, headerChoice);
@ -881,7 +916,7 @@ public class GenerateData implements UCD_Types {
static public void writeNormalizerTestSuite(String directory, String fileName) throws IOException {
Default.setUCD();
String newFile = directory + fileName + getFileSuffix(true);
PrintWriter log = Utility.openPrintWriter(newFile, true, false);
PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX);
String mostRecent = generateBat(directory, fileName, getFileSuffix(true));
String[] example = new String[256];
@ -1082,7 +1117,7 @@ public class GenerateData implements UCD_Types {
Default.setUCD();
String newFile = directory + filename + getFileSuffix(true);
PrintWriter log = Utility.openPrintWriter(newFile);
PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
String mostRecent = generateBat(directory, filename, getFileSuffix(true));
DiffPropertyLister dpl;
UnicodeSet cummulative = new UnicodeSet();
@ -1164,7 +1199,7 @@ public class GenerateData implements UCD_Types {
static final void generateAge(String directory, String filename) throws IOException {
Default.setUCD();
String newFile = directory + filename + getFileSuffix(true);
PrintWriter log = Utility.openPrintWriter(newFile);
PrintWriter log = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX);
String mostRecent = generateBat(directory, filename, getFileSuffix(true));
try {
log.println("# " + filename + getFileSuffix(false));
@ -1259,7 +1294,7 @@ public class GenerateData implements UCD_Types {
public static void listCombiningAccents() throws IOException {
Default.setUCD();
PrintWriter log = Utility.openPrintWriter("ListAccents" + getFileSuffix(true));
PrintWriter log = Utility.openPrintWriter("ListAccents" + getFileSuffix(true), Utility.LATIN1_UNIX);
Set set = new TreeSet();
Set set2 = new TreeSet();
@ -1296,7 +1331,7 @@ public class GenerateData implements UCD_Types {
public static void listGreekVowels() throws IOException {
Default.setUCD();
PrintWriter log = Utility.openPrintWriter("ListGreekVowels" + getFileSuffix(true));
PrintWriter log = Utility.openPrintWriter("ListGreekVowels" + getFileSuffix(true), Utility.LATIN1_UNIX);
Set set = new TreeSet();
Set set2 = new TreeSet();

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateHanTransliterator.java,v $
* $Date: 2002/07/21 08:43:39 $
* $Revision: 1.7 $
* $Date: 2002/07/30 09:56:41 $
* $Revision: 1.8 $
*
*******************************************************************************
*/
@ -42,7 +42,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
public static void readUnihan() throws java.io.IOException {
log = Utility.openPrintWriter("Unihan_log.html", false, false);
log = Utility.openPrintWriter("Unihan_log.html", Utility.UTF8_WINDOWS);
log.println("<body>");
BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion, true, true);
@ -241,6 +241,8 @@ public final class GenerateHanTransliterator implements UCD_Types {
static final int CHINESE = 2, JAPANESE = 1, DEFINITION = 0;
static final boolean DO_SIMPLE = true;
public static void main(int typeIn) {
type = typeIn;
Default.setUCD();
@ -269,13 +271,20 @@ public final class GenerateHanTransliterator implements UCD_Types {
default: throw new IllegalArgumentException("Unexpected option: must be 0..2");
}
log = Utility.openPrintWriter("Transliterate_log.txt", false, false);
err = Utility.openPrintWriter("Transliterate_err.txt", false, false);
err = Utility.openPrintWriter("Transliterate_err.txt", Utility.UTF8_WINDOWS);
log = Utility.openPrintWriter("Transliterate_log.txt", Utility.UTF8_WINDOWS);
log.print('\uFEFF');
readUnihanData(key);
log.println();
log.println("@*DICT Data");
log.println();
readCDICTDefinitions(type);
log.println();
log.println("@Unihan Data");
log.println();
readUnihanData(key);
if (false) {
readCDICT();
compareUnihanWithCEDICT();
@ -283,7 +292,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
readFrequencyData(type);
out = Utility.openPrintWriter(filename, false, false);
out = Utility.openPrintWriter(filename, Utility.UTF8_WINDOWS);
out.println("# Start RAW data for converting CJK characters");
/*
out.println("# Note: adds space between them and letters.");
@ -366,21 +375,24 @@ public final class GenerateHanTransliterator implements UCD_Types {
Set doReverse = new HashSet();
Set gotIt = new HashSet();
it = backSet.iterator();
while (it.hasNext()) {
Pair p = (Pair) it.next();
p = (Pair) p.second;
String keyChar = (String) p.first;
String def = (String) p.second;
if (!gotIt.contains(def)) {
if (unihanNonSingular) {
out.println(quoteNonLetters.transliterate(keyChar) + " < " + quoteNonLetters.transliterate(def) + ";");
} else {
doReverse.add(keyChar);
if (!DO_SIMPLE) {
it = backSet.iterator();
while (it.hasNext()) {
Pair p = (Pair) it.next();
p = (Pair) p.second;
String keyChar = (String) p.first;
String def = (String) p.second;
if (!gotIt.contains(def)) {
if (unihanNonSingular) {
out.println(quoteNonLetters.transliterate(keyChar)
+ " < " + quoteNonLetters.transliterate(def) + ";");
} else {
doReverse.add(keyChar);
}
}
gotIt.add(def);
}
gotIt.add(def);
}
@ -391,10 +403,10 @@ public final class GenerateHanTransliterator implements UCD_Types {
String keyChar = (String) p.first;
String def = (String) p.second;
String rel = doReverse.contains(keyChar) ? " <> " : " > ";
String rel = !DO_SIMPLE && doReverse.contains(keyChar) ? "<>" : ">";
out.println(quoteNonLetters.transliterate(keyChar) + rel
+ quoteNonLetters.transliterate(def) + ";");
+ quoteNonLetters.transliterate(def) + "|\\ ;");
//if (TESTING) System.out.println("# " + code + " > " + definition);
}
@ -413,6 +425,24 @@ public final class GenerateHanTransliterator implements UCD_Types {
System.out.println("Total: " + totalCount);
System.out.println("Defined Count: " + count);
log.println();
log.println("@Duplicates");
log.println();
it = duplicates.keySet().iterator();
while (it.hasNext()) {
String word = (String) it.next();
log.print(hex.transliterate(word) + "\t" + word + "\t");
Collection dups = (Collection) duplicates.get(word);
Iterator it2 = dups.iterator();
boolean gotFirst = false;
while (it2.hasNext()) {
if (!gotFirst) gotFirst = true;
else log.print(", ");
log.print(it2.next());
}
log.println();
}
} catch (Exception e) {
System.out.println("Exception: " + e);
} finally {
@ -506,6 +536,10 @@ public final class GenerateHanTransliterator implements UCD_Types {
int overallRank = 0;
it = combinedRank.iterator();
log.println();
log.println("@Frequency data: Rank of Character");
log.println();
while(it.hasNext()) {
Pair p = (Pair) it.next();
log.println(p.first + ", " + p.second);
@ -516,7 +550,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
}
}
log.println("@character to rank");
log.println();
log.println("@Frequency data: Character to Rank");
log.println();
// get full order
it = rankList.iterator();
@ -871,8 +907,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
}
static void addCheck2(String word, String definition, String line) {
definition = Default.nfc.normalize(definition) + " ";
definition = Default.nfc.normalize(definition);
word = Default.nfc.normalize(word);
if (DO_SIMPLE && UTF16.countCodePoint(word) > 1) return;
if (pua.containsSome(word) ) {
Utility.fixDot();
@ -881,7 +918,13 @@ public final class GenerateHanTransliterator implements UCD_Types {
Utility.fixDot();
System.out.println("Only numbers on: " + line);
} else {
unihanMap.put(word, definition);
Object alreadyThere = unihanMap.get(word);
if (alreadyThere == null) {
unihanMap.put(word, definition);
} else if (!definition.equals(alreadyThere)) {
Utility.addToList(duplicates, word, alreadyThere, true);
Utility.addToList(duplicates, word, definition, true);
}
}
if (UTF16.countCodePoint(word) > 1) unihanNonSingular = true;
}
@ -1025,19 +1068,28 @@ public final class GenerateHanTransliterator implements UCD_Types {
if (end > end2) end = end2;
// IF CHINESE or JAPANESE, stop at first space!!!
rawDefinition = rawDefinition.substring(start,end);
if (type != DEFINITION) {
end2 = rawDefinition.indexOf(" ", start);
if (end2 < 0) end2 = rawDefinition.length();
if (end > end2) end = end2;
if (type == DEFINITION) {
storeDef2(out, cp, rawDefinition, line);
} else {
if (rawDefinition.indexOf(' ') < 0) storeDef2(out, cp, rawDefinition, line);
else {
String [] pieces = Utility.split(rawDefinition, ' ');
for (int i = 0; i < pieces.length; ++i) {
storeDef2(out, cp, pieces[i], line);
}
}
}
String definition = rawDefinition.substring(start,end);
}
static void storeDef2(PrintWriter out, int cp, String definition, String line) {
if (type == CHINESE) {
// since data are messed up, terminate after first digit
int end3 = findInString(definition, "12345")+1;
if (end3 == 0) {
log.println("Bad pinyin data: " + rawDefinition);
log.println("Bad pinyin data: " + hex.transliterate(UTF16.valueOf(cp))
+ "\t" + UTF16.valueOf(cp) + "\t" + definition);
end3 = definition.length();
}
definition = definition.substring(0, end3);
@ -1045,9 +1097,9 @@ public final class GenerateHanTransliterator implements UCD_Types {
definition = convertPinyin.transliterate(definition);
}
if (type == DEFINITION) {
definition = removeMatched(definition,'(', ')', rawDefinition);
definition = removeMatched(definition,'[', ']', rawDefinition);
definition = fixDefinition(definition, rawDefinition);
definition = removeMatched(definition,'(', ')', line);
definition = removeMatched(definition,'[', ']', line);
definition = fixDefinition(definition, line);
}
definition = definition.trim();
definition = Default.ucd.getCase(definition, FULL, LOWER);
@ -1056,7 +1108,7 @@ public final class GenerateHanTransliterator implements UCD_Types {
Utility.fixDot();
System.out.println("Zero value for " + Default.ucd.getCode(cp) + " on: " + hex.transliterate(line));
} else {
addCheck(UTF16.valueOf(cp), definition, rawDefinition);
addCheck(UTF16.valueOf(cp), definition, line);
}
/*
String key = (String) unihanMap.get(definition);
@ -1103,6 +1155,8 @@ public final class GenerateHanTransliterator implements UCD_Types {
}
static Map unihanMap = new HashMap();
static Map duplicates = new TreeMap();
static boolean unihanNonSingular = false;
static StringBuffer handlePinyinTemp = new StringBuffer();

View file

@ -0,0 +1,479 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java,v $
* $Date: 2002/07/30 09:57:18 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
public class GenerateLineBreakTest implements UCD_Types {
// One sample string per line break class, indexed by the class value and
// filled in by findSamples(). The three extra slots at LB_LIMIT..LB_LIMIT+2
// hold the conjoining jamo samples (leading, vowel, trailing).
static String[] samples = new String[LB_LIMIT + 3];
// Order in which the classes are emitted, both in the HTML table and in the
// generated test files. Values are indexes into samples.
static byte[] TROrder = {
LB_OP, LB_CL, LB_QU, LB_GL, LB_NS, LB_EX, LB_SY, LB_IS, LB_PR, LB_PO,
LB_NU, LB_AL, LB_ID, LB_IN, LB_HY, LB_BA, LB_BB, LB_B2, LB_ZW, LB_CM,
// missing from Pair Table
LB_SP, LB_BK, LB_CR, LB_LF,
// resolved types below
LB_CB, LB_AI, LB_SA, LB_SG, LB_XX,
// 3 JAMO CLASSES
// NOTE(review): presumably LB_LIMIT, LB_LIMIT+1 and LB_LIMIT+2, i.e. this
// assumes LB_LIMIT == 29 -- confirm against UCD_Types.
29, 30, 31
};
// Number of leading TROrder entries shown as rows/columns by generateTable().
static final int TABLE_LIMIT = 25;
/**
 * Generates the line break test material:
 * <ul>
 * <li>LineBreakTest.html - two pair tables, one for the current rules and
 *     one for the recommended rules;</li>
 * <li>LineBreakTest_SHORT.txt - one test line per case, without comments;</li>
 * <li>LineBreakTest.txt - the same test lines with explanatory comments.</li>
 * </ul>
 * Every ordered pair of samples is written three times: adjacent, separated
 * by a space, and separated by two combining marks (U+0301 U+0308).
 *
 * @param args ignored
 * @throws IOException if an output file cannot be opened
 */
public static void main(String[] args) throws IOException {
    Default.setUCD();
    findSamples();

    PrintWriter out = Utility.openPrintWriter("LineBreakTest.html", Utility.UTF8_WINDOWS);
    try {
        out.println("<html><body><h1>Current (fixed only for consistency):</h1>");
        generateTable(out, false);
        out.println("<h1>Recommended:</h1>");
        generateTable(out, true);
        out.println("</body></html>");
    } finally {
        // release the file even if table generation fails
        out.close();
    }

    // do main test: k == 0 writes the short file, k == 1 the commented one
    for (int k = 0; k < 2; ++k) {
        boolean comments = k != 0;
        out = Utility.openPrintWriter(k == 0 ? "LineBreakTest_SHORT.txt" : "LineBreakTest.txt", Utility.UTF8_WINDOWS);
        try {
            int counter = 0;

            out.println("# Default Linebreak conformance test");
            out.println("# " + Default.getDate() + ", MED");
            out.println("#");

            for (int ii = 0; ii < samples.length; ++ii) {
                int i = TROrder[ii];
                String before = samples[i];

                for (int jj = 0; jj < samples.length; ++jj) {
                    Utility.dot(counter++);
                    int j = TROrder[jj];
                    String after = samples[j];
                    // do line straight
                    printLine(out, before, "", after, comments);
                    printLine(out, before, " ", after, comments);
                    printLine(out, before, "\u0301\u0308", after, comments);
                }
            }
            // NOTE(review): counter is incremented once per sample pair, and
            // each pair produces three test lines -- confirm which number is
            // meant by "Lines".
            out.println("# Lines: " + counter);
        } finally {
            out.close();
        }
    }
}
/**
 * Writes an HTML pair table for the first TABLE_LIMIT classes of TROrder.
 * Rows are the class before the break position, columns the class after it.
 * Each cell shows the symbol from getTableEntry and carries the deciding
 * rule as its title attribute.
 *
 * @param out         writer receiving the HTML
 * @param recommended true to tabulate the recommended rules; cells that
 *                    differ from the current rules are then given a yellow
 *                    background
 */
public static void generateTable(PrintWriter out, boolean recommended) {
    // heading row: an empty corner cell followed by one cell per class
    out.print("<table border='1' cellspacing='0'><tr><th></th>");
    int col = 0;
    while (col < TABLE_LIMIT) {
        out.print("<th>" + getLBID(samples[TROrder[col]]) + "</th>");
        ++col;
    }
    out.print("</tr>");

    String[] ruleHolder = new String[1];
    String[] currentRuleHolder = new String[1];

    for (int row = 0; row < TABLE_LIMIT; ++row) {
        String first = samples[TROrder[row]];
        StringBuffer html = new StringBuffer("<tr><th>");
        html.append(getLBID(first)).append("</th>");

        for (col = 0; col < TABLE_LIMIT; ++col) {
            String second = samples[TROrder[col]];
            String entry = getTableEntry(first, second, recommended, ruleHolder);
            String highlight = "";
            if (recommended) {
                // compare with the current rules and flag any difference
                String currentEntry = getTableEntry(first, second, false, currentRuleHolder);
                if (!entry.equals(currentEntry)) {
                    highlight = " bgcolor='#FFFF00'";
                }
            }
            html.append("<th title='").append(ruleHolder[0]).append("'")
                .append(highlight).append(">").append(entry).append("</th>");
        }
        html.append("</tr>");
        out.println(html.toString());
    }
    out.println("</table>");
}
/**
 * Computes the pair table symbol for one (before, after) combination.
 * <ul>
 * <li>"_" - a break is allowed directly between the two samples;</li>
 * <li>"%" - no direct break, but a break occurs before or after an
 *     intervening space;</li>
 * <li>"^" - no break, not even around an intervening space.</li>
 * </ul>
 * As a side effect the static field "rule" is left holding the value that
 * is also returned through ruleOut.
 *
 * @param before      sample preceding the break position
 * @param after       sample following the break position
 * @param recommended which rule variant isBreak should apply
 * @param ruleOut     one-element array; element 0 receives the rule label(s)
 *                    responsible for the entry, separated by "/"
 * @return the table symbol
 */
public static String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
    String spaced = before + " " + after;
    int split = before.length();

    // isBreak reports its deciding rule through the static "rule", so it is
    // captured after each call; the call order is significant.
    boolean breakAfterSpace = isBreak(spaced, split + 1, recommended);
    String afterSpaceRule = rule;
    boolean breakBeforeSpace = isBreak(spaced, split, recommended);
    String beforeSpaceRule = rule;
    boolean breakDirect = isBreak(before + after, split, recommended);
    String directRule = rule;

    String symbol;
    if (breakDirect) {
        // "rule" still holds directRule from the last call
        symbol = "_";
    } else if (breakAfterSpace || breakBeforeSpace) {
        symbol = "%";
        rule = directRule;
    } else {
        symbol = "^";
        if (afterSpaceRule.equals(directRule)) {
            rule = directRule;
        } else {
            rule = afterSpaceRule + "/" + directRule;
        }
        boolean alreadyListed = beforeSpaceRule.equals(directRule)
            || beforeSpaceRule.equals(afterSpaceRule);
        if (!alreadyListed) {
            rule += "/" + beforeSpaceRule;
        }
    }
    ruleOut[0] = rule;
    return symbol;
}
/**
 * Writes one test line of the form
 * "hex(before+filler); b|n; hex(after)", where "b" means a break is
 * allowed between filler and after, and "n" means it is not. The decision
 * always uses the current (non-recommended) rules.
 *
 * @param out      writer receiving the line
 * @param before   text before the filler
 * @param filler   text inserted between the two samples (may be empty)
 * @param after    text after the tested position
 * @param comments true to append a comment giving the line break IDs and
 *                 the character names of both sides
 */
public static void printLine(PrintWriter out, String before, String filler, String after, boolean comments) {
    String left = before + filler;
    boolean allowed = isBreak(left + after, before.length() + filler.length(), false);

    String status;
    if (allowed) {
        status = "b";
    } else {
        status = "n";
    }

    String trailer = "";
    if (comments) {
        StringBuffer note = new StringBuffer();
        note.append(" # ").append(getLBID(left));
        note.append(" ").append(status);
        note.append(" ").append(getLBID(after));
        note.append(" # ").append(Default.ucd.getName(left));
        note.append(" ").append(status);
        note.append(" ").append(Default.ucd.getName(after));
        trailer = note.toString();
    }

    String leftHex = Utility.hex(left);
    String rightHex = Utility.hex(after);
    out.println(leftHex + "; " + status + "; " + rightHex + trailer);
}
/**
 * Fills the samples array: for each line break class, the lowest allocated
 * code point (from U+0001 up) having that class, conjoining jamo excluded.
 * The three slots from LB_LIMIT on receive fixed jamo samples:
 * U+1100, U+1162 and U+11A8.
 */
public static void findSamples() {
    for (int cp = 1; cp <= 0x10FFFF; ++cp) {
        // same short-circuit order as separate tests: allocation first,
        // then the three jamo checks
        boolean usable = Default.ucd.isAllocated(cp)
            && !Default.ucd.isLeadingJamo(cp)
            && !Default.ucd.isVowelJamo(cp)
            && !Default.ucd.isTrailingJamo(cp);
        if (!usable) {
            continue;
        }
        byte lbClass = Default.ucd.getLineBreak(cp);
        // keep only the first code point seen for each class
        if (samples[lbClass] != null) {
            continue;
        }
        samples[lbClass] = UTF16.valueOf(cp);
    }
    // fill the last with special cases
    int jamoSlot = LB_LIMIT;
    samples[jamoSlot] = "\u1100";
    samples[jamoSlot + 1] = "\u1162";
    samples[jamoSlot + 2] = "\u11A8";
}
/**
 * Returns the line break IDs of the code points of a string, separated by
 * single spaces.
 *
 * @param s text to describe; a one-unit string is looked up directly
 * @return the ID of the only character, or the space-separated IDs of all
 *         code points (the empty string for empty input)
 */
public static String getLBID(String s) {
    if (s.length() == 1) {
        return Default.ucd.getLineBreakID(s.charAt(0));
    }
    StringBuffer ids = new StringBuffer();
    int pos = 0;
    while (pos < s.length()) {
        int codePoint = UTF32.char32At(s, pos);
        if (pos > 0) {
            ids.append(" ");
        }
        ids.append(Default.ucd.getLineBreakID(codePoint));
        // advance by the number of UTF-16 units of this code point
        pos += UTF32.count16(codePoint);
    }
    return ids.toString();
}
// Label of the rule that decided the most recent isBreak() call (e.g. "3a",
// "18b"). Written by isBreak() and getTableEntry(); read by getTableEntry().
static String rule;
/**
 * Scans backwards for the nearest code point whose resolved line break
 * class differs from notLBType. The scan starts at index offset-2, so the
 * code unit at offset-1 is not examined.
 *
 * @param source    text to scan
 * @param offset    break position being tested
 * @param notLBType class to skip over
 * @return the code point found, or 0 if the start of the text is reached
 *         first (a real U+0000 is indistinguishable from "not found")
 */
public static int findLastNon(String source, int offset, byte notLBType) {
    int pos = offset - 2;
    while (pos >= 0) {
        int candidate = UTF16.charAt(source, pos);
        if (getResolvedLB(candidate) != notLBType) {
            return candidate;
        }
        pos -= UTF16.getCharCount(candidate);
    }
    return 0;
}
/**
 * LB 1: returns the line break class of a code point after resolving the
 * context-dependent classes. SA and XX are resolved to AL; every other
 * class, including AI, CB and SG, is returned as assigned.
 *
 * @param cp code point to classify
 * @return the resolved line break class
 */
public static byte getResolvedLB (int cp) {
    byte assigned = Default.ucd.getLineBreak(cp);
    // complex-context (SA) and unknown (XX) characters are treated as alphabetic
    if (assigned == LB_SA || assigned == LB_XX) {
        return LB_AL;
    }
    // AI is kept as AI; CB is not mapped to ID; SG (surrogates) is not
    // expected to occur
    return assigned;
}
// find out whether there is a break at offset
// WARNING: as a side effect, sets "rule"
public static boolean isBreak(String source, int offset, boolean recommended) {
// LB 1 Assign a line break category to each character of the input.
// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
// this is taken care of in the getResolvedLB function
// LB 2a Never break at the start of text
rule="2a";
if (offset <= 0) return false;
// LB 2b Always break at the end of text
rule="2b";
if (offset >= source.length()) return true;
// UTF-16: never break in the middle of a code point
if (UTF16.isLeadSurrogate(source.charAt(offset-1))
&& UTF16.isTrailSurrogate(source.charAt(offset))) return false;
// now get the character before and after, and their types
int cpBefore = UTF16.charAt(source, offset-1);
int cpAfter = UTF16.charAt(source, offset);
byte before = getResolvedLB(cpBefore);
byte after = getResolvedLB(cpAfter);
rule="3a";
// Always break after hard line breaks (but never between CR and LF).
// CR ^ LF
if (before == LB_CR && after == LB_LF) return false;
if (before == LB_BK || before == LB_LF || before == LB_CR) return true;
//LB 3b Dont break before hard line breaks.
rule="3b";
if (after == LB_BK || after == LB_LF | after == LB_CR) return false;
// LB 4 Dont break before spaces or zero-width space.
// × SP
// × ZW
rule="4";
if (after == LB_SP || after == LB_ZW) return false;
// LB 5 Break after zero-width space.
// ZW ÷
rule="5";
if (before == LB_ZW) return true;
// LB 6 Dont break graphemes (before combining marks, around virama or on sequences of conjoining Jamos.
rule="6";
if (after == LB_CM) return false;
if (Default.ucd.isLeadingJamo(cpBefore)) {
if (Default.ucd.isLeadingJamo(cpAfter) || Default.ucd.isVowelJamo(cpAfter)) return false;
} else if (Default.ucd.isVowelJamo(cpBefore)) {
if (Default.ucd.isVowelJamo(cpAfter) || Default.ucd.isTrailingJamo(cpAfter)) return false;
} else if (Default.ucd.isTrailingJamo(cpBefore)) {
if (Default.ucd.isTrailingJamo(cpAfter)) return false;
}
boolean setBase = false;
if (before == LB_CM) {
setBase = true;
int cp = findLastNon(source, offset, LB_CM);
if (cp == 0) {
before = LB_ID;
} else {
before = getResolvedLB(cp);
}
}
// LB 7 In all of the following rules, if a space is the base character for a combining mark,
// the space is changed to type ID. In other words, break before SP CM* in the same cases as
// one would break before an ID.
rule="7";
if (setBase && before == LB_SP) before = LB_ID;
// LB 8 Dont break before ] or ! or ; or /, even after spaces.
// × CL, × EX, × IS, × SY
rule="8";
if (after == LB_CL || after == LB_EX || after == LB_SY | after == LB_IS) return false;
// find the last non-space character; we will need it
byte lastNonSpace = before;
if (lastNonSpace == LB_SP) {
int cp = findLastNon(source, offset, LB_CM);
if (cp != 0) {
lastNonSpace = getResolvedLB(cp);
}
}
// LB 9 Dont break after [, even after spaces.
// OP SP* ×
rule="9";
if (lastNonSpace == LB_OP) return false;
// LB 10 Dont break within [, , even with intervening spaces.
// QU SP* × OP
rule="10";
if (lastNonSpace == LB_QU && after == LB_OP) return false;
// LB 11 Dont break within ]h, even with intervening spaces.
// CL SP* × NS
rule="11";
if (lastNonSpace == LB_CL && after == LB_NS) return false;
// LB 11a Dont break within , even with intervening spaces.
// B2 × B2
rule="11a";
if (lastNonSpace == LB_B2 && after == LB_B2) return false;
if (recommended) {
// LB 13 Dont break before or after NBSP or WORD JOINER
// × GL
// GL ×
rule="11b";
if (after == LB_GL || before == LB_GL) return false;
}
// [Note: by this time, all of the "X" in the table are accounted for. We can safely break after spaces.]
rule="12";
// LB 12 Break after spaces
// SP ÷
if (before == LB_SP) return true;
if (!recommended) {
// LB 13 Dont break before or after NBSP or WORD JOINER
// × GL
// GL ×
rule="13";
if (after == LB_GL || before == LB_GL) return false;
}
rule="14";
// LB 14 Dont break before or after
// × QU
// QU ×
if (before == LB_QU || after == LB_QU) return false;
// LB 15 Dont break before hyphen-minus, other hyphens, fixed-width spaces,
// small kana and other non- starters, or after acute accents:
// × BA
// × HY
// × NS
// BB ×
if (recommended) {
// LB 14a Break before and after CB
// CB ÷
// ÷ CB
if (before == LB_CB || after == LB_CB) return true;
}
rule="15";
if (after == LB_NS) return false;
if (after == LB_HY) return false;
if (after == LB_BA) return false;
if (before == LB_BB) return false;
if (!recommended) {
// LB 15b Break after hyphen-minus, and before acute accents:
// HY ÷
// ÷ BB
rule="15b";
if (before == LB_HY) return true;
if (after == LB_BB) return true;
}
// LB 16 Dont break between two ellipses, or between letters or numbers and ellipsis:
// AL × IN
// ID × IN
// IN × IN
// NU × IN
// Examples: 9..., a..., H...
rule="16";
if ((before == LB_NU || before == LB_AL || before == LB_ID) && after == LB_IN) return false;
if (before == LB_IN && after == LB_IN) return false;
// Don't break alphanumerics.
// LB 17 Dont break within a9, 3a, or H%
// ID × PO
// AL × NU
// NU × AL
// Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ? PO ?
// Examples: $(12.35) 2,1234 (12)¢ 12.54¢
// This is approximated with the following rules. (Some cases already handled above,
// like 9,, [9.)
rule="17";
if (before == LB_ID && after == LB_PO) return false;
if (before == LB_AL && after == LB_NU) return false;
if (before == LB_NU && after == LB_AL) return false;
// LB 18 Dont break between the following pairs of classes.
// CL × PO
// HY × NU
// IS × NU
// NU × NU
// NU × PO
// PR × AL
// PR × HY
// PR × ID
// PR × NU
// PR × OP
// SY × NU
// Example pairs: $9, $[, $-, -9, /9, 99, ,9, 9% ]%
rule="18";
if (before == LB_CL && after == LB_PO) return false;
if (before == LB_HY && after == LB_NU) return false;
if (before == LB_IS && after == LB_NU) return false;
if (before == LB_NU && after == LB_NU) return false;
if (before == LB_NU && after == LB_PO) return false;
if (before == LB_PR && after == LB_AL) return false;
if (before == LB_PR && after == LB_HY) return false;
if (before == LB_PR && after == LB_ID) return false;
if (before == LB_PR && after == LB_NU) return false;
if (before == LB_PR && after == LB_OP) return false;
if (before == LB_SY && after == LB_NU) return false;
if (recommended) {
// LB 18b (LB 15b deferred until after LB 18 when 'recommended' is set) Break after hyphen-minus, and before acute accents:
// HY ÷
// ÷ BB
rule="18b";
if (before == LB_HY) return true;
if (after == LB_BB) return true;
}
// LB 19 Dont break between alphabetics (at)
// AL × AL
rule="19";
if (before == LB_AL && after == LB_AL) return false;
// LB 20 Break everywhere else
// ALL ÷
// ÷ ALL
rule="20";
return true;
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java,v $
* $Date: 2002/04/23 22:50:15 $
* $Revision: 1.1 $
* $Date: 2002/07/30 09:56:41 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
@ -81,7 +81,7 @@ public class GenerateThaiBreaks {
System.out.println("initials size: " + initials.size());
System.out.println("finals size: " + finals.size());
out = Utility.openPrintWriter("ThaiData.txt", false, false);
out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS);
out.write('\uFEFF');
out.println("Only Initials");
Utility.print(out, initials, ", ", new MyBreaker());

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
* $Date: 2002/07/14 22:04:49 $
* $Revision: 1.18 $
* $Date: 2002/07/30 09:56:41 $
* $Revision: 1.19 $
*
*******************************************************************************
*/
@ -59,7 +59,7 @@ public final class Main implements UCD_Types {
} else if (arg.equalsIgnoreCase("build")) ConvertUCD.main(new String[]{Default.ucdVersion});
else if (arg.equalsIgnoreCase("version")) Default.setUCD(args[++i]);
else if (arg.equalsIgnoreCase("statistics")) VerifyUCD.statistics();
else if (arg.equalsIgnoreCase("testskippable")) NFSkippable.main(null);
else if (arg.equalsIgnoreCase("NFSkippable")) NFSkippable.main(null);
else if (arg.equalsIgnoreCase("diffIgnorable")) VerifyUCD.diffIgnorable();
else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML();
else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
@ -77,7 +77,12 @@ public final class Main implements UCD_Types {
else if (arg.equalsIgnoreCase("Buildnames")) BuildNames.main(null);
else if (arg.equalsIgnoreCase("TestNormalization")) TestNormalization.main(null);
else if (arg.equalsIgnoreCase("linebreaktest")) GenerateLineBreakTest.main(null);
else if (arg.equalsIgnoreCase("genSplit")) GenerateData.genSplit();
else if (arg.equalsIgnoreCase("iana")) IANANames.testSensitivity();
else if (arg.equalsIgnoreCase("testDerivedProperties")) DerivedProperty.test();
else if (arg.equalsIgnoreCase("checkCase")) VerifyUCD.checkCase();
else if (arg.equalsIgnoreCase("checkCase3")) VerifyUCD.checkCase3();

View file

@ -12,14 +12,15 @@ public final class NFSkippable extends UnicodeProperty {
private Normalizer nf;
private Normalizer nfd;
private UCD ucd;
private boolean composes;
private int[] realTrailers = new int[100];
private int realTrailerCount = 0;
public NFSkippable(byte normalizerMode, String unicodeVersion) {
public NFSkippable(byte normalizerMode, UCD inputUCD) {
isStandard = false;
ucd = UCD.make(unicodeVersion);
nf = new Normalizer(normalizerMode, unicodeVersion);
this.ucd = inputUCD;
nf = new Normalizer(normalizerMode, ucd.getVersion());
name = nf.getName() + "_Skippable";
shortName = nf.getName() + "_Skip";
header = "# Derived Property: " + name
@ -28,7 +29,7 @@ public final class NFSkippable extends UnicodeProperty {
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
+ "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
nfd = new Normalizer(Normalizer.NFD, unicodeVersion);
nfd = new Normalizer(Normalizer.NFD, ucd.getVersion());
composes = normalizerMode == Normalizer.NFC || normalizerMode == Normalizer.NFKC;
// preprocess to find possible trailers
@ -36,7 +37,7 @@ public final class NFSkippable extends UnicodeProperty {
if (composes) for (int cp2 = 0; cp2 <= 0x10FFFF; ++cp2) {
if (nf.isTrailing(cp2)) {
//System.out.println("Trailing: " + ucd.getCodeAndName(cp2));
if (ucd.isTrailingJamo(cp2)) {
if (ucd.isNonLeadJamo(cp2)) {
//System.out.println("Jamo: " + ucd.getCodeAndName(cp2));
continue;
}
@ -190,18 +191,21 @@ public final class NFSkippable extends UnicodeProperty {
static int limit = 0x10FFFF; // full version = 10ffff, for testing may use smaller
public static void main (String[] args) throws java.io.IOException {
Default.setUCD();
String version = ""; // Unicode version, "" = latest released
PrintWriter out = Utility.openPrintWriter("NFSafeSets.txt");
PrintWriter out = Utility.openPrintWriter("NFSafeSets.txt", Utility.UTF8_WINDOWS);
out.println("NFSafeSets");
out.println("Version: " + Default.ucd.getVersion());
out.println("Date: " + Default.getDate());
out.println();
for (int mode = NFD_UnsafeStart; mode <= NFKC_UnsafeStart; ++mode) {
UnicodeProperty up = DerivedProperty.make(mode, UCD.make(version));
UnicodeProperty up = DerivedProperty.make(mode, Default.ucd);
generateSet(out, "UNSAFE[" + Normalizer.getName((byte)(mode-NFD_UnsafeStart)) + "]", up);
}
for (byte mode = NFD; mode <= NFKC; ++mode) {
NFSkippable skipper = new NFSkippable(mode,version);
NFSkippable skipper = new NFSkippable(mode, Default.ucd);
generateSet(out, "SKIPPABLE[" + Normalizer.getName(mode) + "]", skipper);
}
@ -219,9 +223,9 @@ public final class NFSkippable extends UnicodeProperty {
String rSet = result.toPattern(true);
rSet = replace(rSet, "\\U", "\\\\U");
rSet = replace(rSet, "\\u", "\\\\u");
out.println(label + " = new UnicodeSet(");
writeStringInPieces(out, rSet, ", false);");
out.println();
rSet = result.toPattern(false);
out.println("/*Unicode: ");

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
* $Date: 2002/06/24 15:25:10 $
* $Revision: 1.12 $
* $Date: 2002/07/30 09:56:41 $
* $Revision: 1.13 $
*
*******************************************************************************
*/
@ -271,7 +271,7 @@ public final class Normalizer implements UCD_Types {
}
for (int i = UCD.LBase; i < UCD.TLimit; ++i) {
if (leading != null && UCD.isLeadingJamo(i)) leading.set(i); // set all initial Jamo (that form syllables)
if (trailing != null && UCD.isTrailingJamo(i)) trailing.set(i); // set all final Jamo (that form syllables)
if (trailing != null && UCD.isNonLeadJamo(i)) trailing.set(i); // set all final Jamo (that form syllables)
}
if (leading != null) {
for (int i = UCD.SBase; i < UCD.SLimit; ++i) {
@ -407,7 +407,7 @@ public final class Normalizer implements UCD_Types {
for (int i = 0; i < 0x10FFFF; ++i) {
if (!ucd.isAssigned(i)) continue;
if (ucd.isPUA(i)) continue;
if (ucd.isTrailingJamo(i)) isSecond.set(i);
if (ucd.isNonLeadJamo(i)) isSecond.set(i);
if (ucd.isLeadingJamoComposition(i)) isFirst.set(i);
byte dt = ucd.getDecompositionType(i);
if (dt != CANONICAL) continue;

View file

@ -246,7 +246,7 @@ public class NormalizerSample implements UCD_Types {
for (int i = 0; i < 0x10FFFF; ++i) {
if (!ucd.isAssigned(i)) continue;
if (ucd.isPUA(i)) continue;
if (ucd.isTrailingJamo(i)) isSecond.set(i);
if (ucd.isNonLeadJamo(i)) isSecond.set(i);
byte dt = ucd.getDecompositionType(i);
if (dt != CANONICAL) continue;
if (!ucd.getBinaryProperty(i, CompositionExclusion)) {

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
* $Date: 2001/12/13 23:35:57 $
* $Revision: 1.8 $
* $Date: 2002/07/30 09:56:40 $
* $Revision: 1.9 $
*
*******************************************************************************
*/
@ -126,12 +126,6 @@ public class TestData implements UCD_Types {
}
static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd' 'HH:mm:ss.S' GMT'");
static {
myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
}
//Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names
public static String fixFile(String s) {

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
* $Date: 2002/06/22 01:21:09 $
* $Revision: 1.15 $
* $Date: 2002/07/30 09:56:40 $
* $Revision: 1.16 $
*
*******************************************************************************
*/
@ -1170,14 +1170,22 @@ to guarantee identifier closure.
&& ((char1 - SBase) % TCount) == 0);
}
/**
 * Tests whether a code point lies in the half-open range [VBase, VLimit).
 *
 * @param cp code point to test
 * @return true when VBase <= cp < VLimit
 */
static boolean isVowelJamo(int cp) {
    return cp >= VBase && cp < VLimit;
}
// Range test on the Jamo constants VBase/VLimit/TBase/TLimit.
// NOTE(review): two return statements appear here because this view interleaves removed
// and added lines without +/- markers. Presumably the first (V range or T range) is the
// old body and the second (T range only) is its replacement -- confirm against the real file.
static boolean isTrailingJamo(int cp) {
return (VBase <= cp && cp < VLimit) || (TBase <= cp && cp < TLimit);
return (TBase <= cp && cp < TLimit);
}
/**
 * Tests whether a code point lies in the half-open range [LBase, LLimit).
 *
 * @param cp code point to test
 * @return true when LBase <= cp < LLimit
 */
static boolean isLeadingJamo(int cp) {
    return cp >= LBase && cp < LLimit;
}
/**
 * Tests whether a code point lies in either [VBase, VLimit) or [TBase, TLimit).
 *
 * @param cp code point to test
 * @return true when cp falls in the V range or in the T range
 */
static boolean isNonLeadJamo(int cp) {
    if (cp >= VBase && cp < VLimit) {
        return true;
    }
    return cp >= TBase && cp < TLimit;
}
private void fillFromFile(String version) {
try {
fillFromFile2(version);

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
* $Date: 2002/06/15 02:47:13 $
* $Revision: 1.13 $
* $Date: 2002/07/30 09:56:40 $
* $Revision: 1.14 $
*
*******************************************************************************
*/
@ -229,10 +229,12 @@ public interface UCD_Types {
// line break
public static final byte
LBXX = 0, LBOP = 1, LBCL = 2, LBQU = 3, LBGL = 4, LBNS = 5, LBEX = 6, LBSY = 7,
LBIS = 8, LBPR = 9, LBPO = 10, LBNU = 11, LBAL = 12, LBID = 13, LBIN = 14, LBHY = 15,
LBCM = 16, LBBB = 17, LBBA = 18, LBSP = 19, LBBK = 20, LBCR = 21, LBLF = 22, LBCB = 23,
LBSA = 24, LBAI = 25, LBB2 = 26, LBSG = 27, LBZW = 28, LIMIT_LINE_BREAK = 29;
LB_XX = 0, LB_OP = 1, LB_CL = 2, LB_QU = 3, LB_GL = 4, LB_NS = 5, LB_EX = 6, LB_SY = 7,
LB_IS = 8, LB_PR = 9, LB_PO = 10, LB_NU = 11, LB_AL = 12, LB_ID = 13, LB_IN = 14, LB_HY = 15,
LB_CM = 16, LB_BB = 17, LB_BA = 18, LB_SP = 19, LB_BK = 20, LB_CR = 21, LB_LF = 22, LB_CB = 23,
LB_SA = 24, LB_AI = 25, LB_B2 = 26, LB_SG = 27, LB_ZW = 28,
LIMIT_LINE_BREAK = 29,
LB_LIMIT = LIMIT_LINE_BREAK;
// east asian width
public static final byte

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $
* $Date: 2002/06/15 02:47:12 $
* $Revision: 1.5 $
* $Date: 2002/07/30 09:56:40 $
* $Revision: 1.6 $
*
*******************************************************************************
*/
@ -42,7 +42,7 @@ class UData implements UCD_Types {
byte numericType = NUMERIC_NONE;
byte eastAsianWidth = EAN;
byte lineBreak = LBXX;
byte lineBreak = LB_XX;
byte joiningType = JT_U;
byte joiningGroup = NO_SHAPING;
byte script = COMMON_SCRIPT;
@ -196,7 +196,7 @@ class UData implements UCD_Types {
if (full || !Double.isNaN(numericValue)) result.append(" nv='").append(numericValue).append('\'');
if (full || eastAsianWidth != EAN) result.append(" ea='").append(UCD_Names.EA[eastAsianWidth]).append('\'');
if (full || lineBreak != LBAL) result.append(" lb='").append(UCD_Names.LB[lineBreak]).append('\'');
if (full || lineBreak != LB_AL) result.append(" lb='").append(UCD_Names.LB[lineBreak]).append('\'');
if (full || joiningType != JT_U) result.append(" jt='").append(UCD_Names.JOINING_TYPE[joiningType]).append('\'');
if (full || joiningGroup != NO_SHAPING) result.append(" jg='").append(UCD_Names.JOINING_GROUP[joiningGroup]).append('\'');
if (full || age != 0) result.append(" ag='").append(UCD_Names.AGE[age]).append('\'');

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
* $Date: 2002/06/22 01:21:09 $
* $Revision: 1.17 $
* $Date: 2002/07/30 09:56:40 $
* $Revision: 1.18 $
*
*******************************************************************************
*/
@ -551,7 +551,7 @@ can help you narrow these down.
static void generateXML() throws IOException {
Default.setUCD();
String filename = "UCD.xml";
PrintWriter log = Utility.openPrintWriter(filename);
PrintWriter log = Utility.openPrintWriter(filename, Utility.LATIN1_UNIX);
//log.println('\uFEFF');
log.println("<ucd>");
@ -580,14 +580,14 @@ can help you narrow these down.
String ttest = Default.ucd.getCase(test, FULL, TITLE);
PrintWriter titleTest = Utility.openPrintWriter("TestTitle.txt");
PrintWriter titleTest = Utility.openPrintWriter("TestTitle.txt", Utility.LATIN1_UNIX);
titleTest.println(test);
titleTest.println(ttest);
titleTest.close();
System.out.println(Default.ucd.getCase("ABC,DE'F G\u0308H", FULL, TITLE));
String fileName = "CaseDifferences.txt";
PrintWriter log = Utility.openPrintWriter(fileName);
PrintWriter log = Utility.openPrintWriter(fileName, Utility.LATIN1_UNIX);
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
@ -648,7 +648,7 @@ can help you narrow these down.
String fileName = "CaseNormalizationDifferences.txt";
PrintWriter log = Utility.openPrintWriter(fileName);
PrintWriter log = Utility.openPrintWriter(fileName, Utility.LATIN1_UNIX);
log.println("Differences between case(normalize(cp)) and normalize(case(cp))");
log.println("u, l, t - upper, lower, title");
@ -1069,7 +1069,7 @@ can help you narrow these down.
System.out.println("Writing IDNCheck.txt");
PrintWriter log = Utility.openPrintWriter("IDNCheck.txt");
PrintWriter log = Utility.openPrintWriter("IDNCheck.txt", Utility.LATIN1_UNIX);
log.println("IDN Check");
log.println("Total Errors: " + errorCount);
@ -1124,7 +1124,7 @@ can help you narrow these down.
public static void genIDN() throws IOException {
PrintWriter out = new PrintWriter(System.out);
Default.setUCD();
PrintWriter log = Utility.openPrintWriter("IDN-tables.txt");
PrintWriter log = Utility.openPrintWriter("IDN-tables.txt", Utility.LATIN1_UNIX);
/*UnicodeSet y = UnifiedBinaryProperty.make(CATEGORY + FORMAT).getSet();
UnicodeSet x = new UnicodeSet(0xE0001,0xE007F).retainAll(y);
@ -1906,7 +1906,7 @@ E0020-E007F; [TAGGING CHARACTERS]
}
}
PrintWriter log = Utility.openPrintWriter("CheckScriptsLog.txt");
PrintWriter log = Utility.openPrintWriter("CheckScriptsLog.txt", Utility.LATIN1_UNIX);
Iterator it = m.keySet().iterator();
while (it.hasNext()) {

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/WriteJavaScriptInfo.java,v $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.3 $
* $Date: 2002/07/30 09:56:40 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -21,7 +21,7 @@ import com.ibm.text.utility.*;
public class WriteJavaScriptInfo implements UCD_Types {
static public void assigned() throws IOException {
PrintWriter log = Utility.openPrintWriter("assigned.js");
PrintWriter log = Utility.openPrintWriter("assigned.js", Utility.LATIN1_UNIX);
UCD ucd = UCD.make();
boolean wasIn = false;
int lastWritten = -100;

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
* $Date: 2002/07/21 08:43:39 $
* $Revision: 1.22 $
* $Date: 2002/07/30 09:56:41 $
* $Revision: 1.23 $
*
*******************************************************************************
*/
@ -531,14 +531,22 @@ public final class Utility { // COMMON UTILITIES
"1.1.0",
};
public static PrintWriter openPrintWriter(String filename) throws IOException {
return openPrintWriter(filename, true, true);
/*public static PrintWriter openPrintWriter(String filename) throws IOException {
return openPrintWriter(filename, LATIN1_UNIX);
}
*/
// Bit flags combined into the 'options' byte taken by openPrintWriter(String, byte).
// That method passes (options & WINDOWS_MASK) == 0 and (options & UTF8_MASK) == 0 to
// UTF8StreamWriter in the positions formerly filled by removeCR and latin1.
static final byte WINDOWS_MASK = 1, UTF8_MASK = 2;
// The four possible combinations of the two flags above.
public static final byte
LATIN1_UNIX = 0,
LATIN1_WINDOWS = WINDOWS_MASK,
UTF8_UNIX = UTF8_MASK,
UTF8_WINDOWS = UTF8_MASK | WINDOWS_MASK;
// Normally use UTF8_WINDOWS (formerly removeCR=false, latin1=false).
// But for UCD files use LATIN1_UNIX (formerly true, true).
// Or if they are UTF8, use UTF8_UNIX (formerly true, false).
public static PrintWriter openPrintWriter(String filename, boolean removeCR, boolean latin1) throws IOException {
public static PrintWriter openPrintWriter(String filename, byte options) throws IOException {
File file = new File(getOutputName(filename));
System.out.println("Creating File: " + file);
File parent = new File(file.getParent());
@ -548,7 +556,7 @@ public final class Utility { // COMMON UTILITIES
new UTF8StreamWriter(
new FileOutputStream(file),
32*1024,
removeCR, latin1));
(options & WINDOWS_MASK) == 0, (options & UTF8_MASK) == 0));
}
public static String getOutputName(String filename) {
@ -606,7 +614,7 @@ public final class Utility { // COMMON UTILITIES
}
public static void addToSet(Map m, Object key, Object value) {
Set set = (Set) m.get(key);
Collection set = (Collection) m.get(key);
if (set == null) {
set = new TreeSet();
m.put(key, set);
@ -614,6 +622,15 @@ public final class Utility { // COMMON UTILITIES
set.add(value);
}
/**
 * Appends a value to the collection stored under a key, creating an ArrayList
 * for that key the first time it is seen.
 *
 * @param m      map from keys to collections of values
 * @param key    key whose collection receives the value
 * @param value  value to append
 * @param unique when true, the value is skipped if the collection already contains it
 */
public static void addToList(Map m, Object key, Object value, boolean unique) {
    Collection values = (Collection) m.get(key);
    if (values == null) {
        // First value for this key: register an empty list before adding.
        values = new ArrayList();
        m.put(key, values);
    }
    if (unique && values.contains(value)) {
        return;
    }
    values.add(value);
}
public static String readDataLine(BufferedReader br) throws IOException {
String originalLine = "";
String line = "";
@ -724,7 +741,7 @@ public final class Utility { // COMMON UTILITIES
}
// Opens an output writer for newName, passes it to appendFile together with filename,
// utf8 and replacementList, then closes the writer.
// NOTE(review): two declarations of 'out' appear because this view interleaves removed and
// added lines; presumably the (false, false) call is the old form and the UTF8_WINDOWS call
// is its replacement -- confirm against the real file.
public static void copyTextFile(String filename, boolean utf8, String newName, String[] replacementList) throws IOException {
PrintWriter out = Utility.openPrintWriter(newName, false, false);
PrintWriter out = Utility.openPrintWriter(newName, UTF8_WINDOWS);
appendFile(filename, utf8, out, replacementList);
out.close();
}
@ -834,10 +851,12 @@ public final class Utility { // COMMON UTILITIES
return "Showing Stack with fake " + sw.getBuffer().toString();
}
static PrintWriter showSetNamesPw;
// Convenience overload: writes to System.out by delegating to the PrintWriter overload
// with IDN = false.
// NOTE(review): this view interleaves removed and added lines. Presumably the
// temp/close() lines are the old body and the showSetNamesPw/flush() lines are the
// replacement -- confirm against the real file.
public static void showSetNames(String prefix, UnicodeSet set, boolean separateLines, UCD ucd) {
PrintWriter temp = new PrintWriter(System.out);
showSetNames(temp, prefix, set, separateLines, false, ucd);
temp.close();
// Lazily create one shared writer over System.out and reuse it across calls.
if (showSetNamesPw == null) showSetNamesPw = new PrintWriter(System.out);
showSetNames(showSetNamesPw, prefix, set, separateLines, false, ucd);
// Flush instead of close: closing a PrintWriter also closes the stream it wraps.
showSetNamesPw.flush();
}
public static void showSetNames(PrintWriter pw, String prefix, UnicodeSet set, boolean separateLines, boolean IDN, UCD ucd) {