mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
cleaned up break tests, added sentence test
X-SVN-Rev: 9663
This commit is contained in:
parent
5159355344
commit
04a7e86e8f
4 changed files with 535 additions and 239 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/CheckCollator.java,v $
|
||||
* $Date: 2002/08/08 15:35:01 $
|
||||
* $Revision: 1.1 $
|
||||
* $Date: 2002/08/09 23:56:24 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -41,7 +41,7 @@ abstract public class CheckCollator {
|
|||
// later, drive off of args
|
||||
|
||||
// choices are: Asian, Chinese, Japanese, Japanese_h, Japanese_k, Korean, Latin, Russian, Thai
|
||||
test(Locale.KOREAN, "Korean");
|
||||
//test(Locale.KOREAN, "Korean");
|
||||
test(Locale.ENGLISH, "Latin");
|
||||
test(Locale.FRENCH, "Latin");
|
||||
test(Locale.JAPANESE, "Japanese");
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $
|
||||
* $Date: 2002/08/08 15:38:15 $
|
||||
* $Revision: 1.1 $
|
||||
* $Date: 2002/08/09 23:56:24 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -59,58 +59,53 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
return UTF16.charAt(source, start);
|
||||
}
|
||||
|
||||
// quick & dirty routine
|
||||
String insertEverywhere(String source, String insertion, GenerateBreakTest breaker) {
|
||||
String result = insertion;
|
||||
for (int i = 0; i < source.length(); ++i) {
|
||||
result += source.charAt(i);
|
||||
if (breaker.isBreak(source, i, true)) {
|
||||
result += insertion;
|
||||
}
|
||||
}
|
||||
return result + insertion;
|
||||
}
|
||||
|
||||
|
||||
static UnicodeSet midLetterSet = new UnicodeSet("[\u0027\u002E\u003A\u00AD\u05F3\u05F4\u2019\uFE52\uFE55\uFF07\uFF0E\uFF1A]");
|
||||
/*
|
||||
U+0027 APOSTROPHE
|
||||
U+002E FULL STOP
|
||||
U+003A COLON # used in Swedish
|
||||
U+00AD SOFT HYPHEN
|
||||
U+05F3 HEBREW PUNCTUATION GERESH
|
||||
U+05F4 HEBREW PUNCTUATION GERSHAYIM
|
||||
U+2019 RIGHT SINGLE QUOTATION MARK
|
||||
U+FE52 SMALL FULL STOP
|
||||
U+FE55 SMALL COLON
|
||||
U+FF07 FULLWIDTH APOSTROPHE
|
||||
U+FF0E FULLWIDTH FULL STOP
|
||||
U+FF1A FULLWIDTH COLON
|
||||
*/
|
||||
|
||||
static UnicodeSet ambigSentPunct = new UnicodeSet("[\u002E\u0589\u06D4]");
|
||||
/*
|
||||
U+002E FULL STOP
|
||||
U+0589 ARMENIAN FULL STOP
|
||||
U+06D4 ARABIC FULL STOP
|
||||
*/
|
||||
|
||||
static UnicodeSet sentPunct = new UnicodeSet("[\u0021\u003F\u0387\u061F\u0964\u203C\u203D\u2048\u2049"
|
||||
+ "\u3002\ufe52\ufe57\uff01\uff0e\uff1f\uff61]");
|
||||
/*
|
||||
U+0021 EXCLAMATION MARK
|
||||
U+003F QUESTION MARK
|
||||
U+0387 GREEK ANO TELEIA
|
||||
U+061F ARABIC QUESTION MARK
|
||||
U+0964 DEVANAGARI DANDA
|
||||
U+203C DOUBLE EXCLAMATION MARK
|
||||
U+203D INTERROBANG
|
||||
U+2048 QUESTION EXCLAMATION MARK
|
||||
U+2049 EXCLAMATION QUESTION MARK
|
||||
U+3002 IDEOGRAPHIC FULL STOP
|
||||
U+FE52 SMALL FULL STOP
|
||||
U+FE57 SMALL EXCLAMATION MARK
|
||||
U+FF01 FULLWIDTH EXCLAMATION MARK
|
||||
U+FF0E FULLWIDTH FULL STOP
|
||||
U+FF1F FULLWIDTH QUESTION MARK
|
||||
U+FF61 HALFWIDTH IDEOGRAPHIC FULL STOP
|
||||
*/
|
||||
|
||||
static {
|
||||
Default.setUCD();
|
||||
}
|
||||
|
||||
static UnicodeSet extraAlpha = new UnicodeSet("[\\u02B9-\\u02BA\\u02C2-\\u02CF\\u02D2-\\u02DF\\u02E5\\u02ED\\u05F3]");
|
||||
static UnicodeSet alphabeticSet = UnifiedBinaryProperty.make(DERIVED | PropAlphabetic).getSet()
|
||||
.addAll(extraAlpha);
|
||||
|
||||
static UnicodeSet ideographicSet = UnifiedBinaryProperty.make(BINARY_PROPERTIES | Ideographic).getSet();
|
||||
|
||||
static {
|
||||
System.out.println("alphabetic: " + alphabeticSet.toPattern(true));
|
||||
}
|
||||
|
||||
|
||||
// ====================== Main ===========================
|
||||
|
||||
static final boolean SHOW_TYPE = false;
|
||||
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
System.out.println("Remember to add length marks (half & full) and other punctuation for sentence, with FF61");
|
||||
Default.setUCD();
|
||||
|
||||
checkDecomps();
|
||||
|
||||
|
||||
if (DEBUG) {
|
||||
checkDecomps();
|
||||
|
||||
Utility.showSetNames("", new UnicodeSet("[\u034F\u00AD\u1806[:DI:]-[:Cs:]-[:Cn:]]"), true, Default.ucd);
|
||||
|
||||
System.out.println("*** Extend - Cf");
|
||||
|
@ -122,7 +117,20 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
gwb.printLine(systemPrintWriter, "n\u0308't", true, true, false);
|
||||
systemPrintWriter.flush();
|
||||
}
|
||||
|
||||
if (false) {
|
||||
GenerateSentenceBreakTest foo = new GenerateSentenceBreakTest();
|
||||
foo.isBreak("(\"Go.\") (He did)", 5, true);
|
||||
|
||||
showSet("sepSet", GenerateSentenceBreakTest.sepSet);
|
||||
showSet("atermSet", GenerateSentenceBreakTest.atermSet);
|
||||
showSet("termSet", GenerateSentenceBreakTest.termSet);
|
||||
}
|
||||
|
||||
new GenerateSentenceBreakTest().run();
|
||||
|
||||
//if (true) return; // cut short for now
|
||||
|
||||
new GenerateLineBreakTest().run();
|
||||
new GenerateGraphemeBreakTest().run();
|
||||
new GenerateWordBreakTest().run();
|
||||
|
@ -178,6 +186,13 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
return result.toString();
|
||||
}
|
||||
|
||||
static void showSet(String title, UnicodeSet set) {
|
||||
System.out.println(title + ": " + set.toPattern(true));
|
||||
Utility.showSetNames("", set, false, Default.ucd);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// determines if string is of form Base NSM*
|
||||
static boolean isBaseNSMStar(String source) {
|
||||
int cp;
|
||||
|
@ -262,62 +277,74 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest.html", Utility.UTF8_WINDOWS);
|
||||
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>"
|
||||
+ fileName + "</title></head>");
|
||||
out.println("<body bgcolor='#FFFFFF'><h3>Current (fixed only for consistency):</h3>");
|
||||
out.println("<body bgcolor='#FFFFFF'><h3>Current:</h3>");
|
||||
|
||||
|
||||
|
||||
generateTable(out, false);
|
||||
out.println("<h3>Recommended:</h3>");
|
||||
generateTable(out, true);
|
||||
out.println("</body></html>");
|
||||
if (recommendedDiffers()) {
|
||||
generateTable(out, false);
|
||||
out.println("<h3>Recommended:</h3>");
|
||||
generateTable(out, true);
|
||||
out.println("</body></html>");
|
||||
} else {
|
||||
generateTable(out, true);
|
||||
}
|
||||
out.close();
|
||||
|
||||
if (recommendedDiffers()) {
|
||||
generateTest(false, false);
|
||||
}
|
||||
generateTest(false, true);
|
||||
|
||||
}
|
||||
|
||||
public void generateTest(boolean shortVersion, boolean recommended) throws IOException {
|
||||
String[] testCase = new String[50];
|
||||
// do main test
|
||||
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
out = Utility.openPrintWriter(fileName + (k == 0 ? "Test_SHORT.txt" : "Test.txt"), Utility.LATIN1_WINDOWS);
|
||||
int counter = 0;
|
||||
PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest"
|
||||
+ (recommended & recommendedDiffers() ? "_NEW" : "")
|
||||
+ (shortVersion ? "_SHORT" : "")
|
||||
+ ".txt", Utility.LATIN1_WINDOWS);
|
||||
int counter = 0;
|
||||
|
||||
out.println("# Default " + fileName + " Break Test");
|
||||
out.println("# Generated: " + Default.getDate() + ", MED");
|
||||
out.println("#");
|
||||
out.println("# Format:");
|
||||
out.println("# <string> (# <comment>)? ");
|
||||
out.println("# <string> contains hex Unicode code points, with ");
|
||||
out.println("#\t" + BREAK + " wherever there is a break opportunity, and ");
|
||||
out.println("#\t" + NOBREAK + " wherever there is not.");
|
||||
out.println("# <comment> the format can change, but currently it shows:");
|
||||
out.println("#\t- the sample character name");
|
||||
out.println("#\t- (x) the line_break property* for the sample character");
|
||||
out.println("#\t- [x] the rule that determines whether there is a break or not");
|
||||
out.println("#");
|
||||
sampleDescription(out);
|
||||
out.println("# These samples may be extended or changed in the future.");
|
||||
out.println("#");
|
||||
out.println("# Default " + fileName + " Break Test");
|
||||
out.println("# Generated: " + Default.getDate() + ", MED");
|
||||
out.println("#");
|
||||
out.println("# Format:");
|
||||
out.println("# <string> (# <comment>)? ");
|
||||
out.println("# <string> contains hex Unicode code points, with ");
|
||||
out.println("#\t" + BREAK + " wherever there is a break opportunity, and ");
|
||||
out.println("#\t" + NOBREAK + " wherever there is not.");
|
||||
out.println("# <comment> the format can change, but currently it shows:");
|
||||
out.println("#\t- the sample character name");
|
||||
out.println("#\t- (x) the line_break property* for the sample character");
|
||||
out.println("#\t- [x] the rule that determines whether there is a break or not");
|
||||
out.println("#");
|
||||
sampleDescription(out);
|
||||
out.println("# These samples may be extended or changed in the future.");
|
||||
out.println("#");
|
||||
|
||||
for (int ii = 0; ii < sampleLimit; ++ii) {
|
||||
String before = samples[ii];
|
||||
for (int ii = 0; ii < sampleLimit; ++ii) {
|
||||
String before = samples[ii];
|
||||
|
||||
for (int jj = 0; jj < sampleLimit; ++jj) {
|
||||
Utility.dot(counter);
|
||||
String after = samples[jj];
|
||||
for (int jj = 0; jj < sampleLimit; ++jj) {
|
||||
Utility.dot(counter);
|
||||
String after = samples[jj];
|
||||
|
||||
// do line straight
|
||||
int len = genTestItems(before, after, testCase);
|
||||
for (int q = 0; q < len; ++q) {
|
||||
printLine(out, testCase[q], k != 0 && q == 0, false, false);
|
||||
++counter;
|
||||
}
|
||||
// do line straight
|
||||
int len = genTestItems(before, after, testCase);
|
||||
for (int q = 0; q < len; ++q) {
|
||||
printLine(out, testCase[q], !shortVersion && q == 0, recommended, false);
|
||||
++counter;
|
||||
}
|
||||
}
|
||||
|
||||
for (int ii = 0; ii < extraSingleSamples.length; ++ii) {
|
||||
printLine(out, extraSingleSamples[ii], true, false, false);
|
||||
}
|
||||
out.println("# Lines: " + counter);
|
||||
out.close();
|
||||
}
|
||||
|
||||
for (int ii = 0; ii < extraSingleSamples.length; ++ii) {
|
||||
printLine(out, extraSingleSamples[ii], true, recommended, false);
|
||||
}
|
||||
out.println("# Lines: " + counter);
|
||||
out.close();
|
||||
}
|
||||
|
||||
public void sampleDescription(PrintWriter out) {}
|
||||
|
@ -328,6 +355,9 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
|
||||
abstract public String getTypeID(int s, boolean recommended);
|
||||
|
||||
public boolean recommendedDiffers() {
|
||||
return false;
|
||||
}
|
||||
|
||||
final public byte getType (int cp) {
|
||||
return getType(cp, false);
|
||||
|
@ -406,7 +436,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
}
|
||||
|
||||
public void generateTable(PrintWriter out, boolean recommended) {
|
||||
String width = "width='" + (100 / (tableLimit + 1)) + "%'";
|
||||
String width = "width='" + (100 / (tableLimit + 2)) + "%'";
|
||||
out.print("<table border='1' cellspacing='0' width='100%'>");
|
||||
String types = "";
|
||||
String codes = "";
|
||||
|
@ -424,7 +454,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
|
||||
String[] rule = new String[1];
|
||||
String[] rule2 = new String[1];
|
||||
for (int type = 0; type < tableLimit; ++type) {
|
||||
for (int type = 0; type < sampleLimit; ++type) {
|
||||
String before = samples[type];
|
||||
if (before == null) continue;
|
||||
|
||||
|
@ -455,7 +485,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
out.println("<ol>");
|
||||
for (int ii = 0; ii < extraSingleSamples.length; ++ii) {
|
||||
out.println("<li><font size='5'>");
|
||||
printLine(out, extraSingleSamples[ii], true, false, true);
|
||||
printLine(out, extraSingleSamples[ii], true, recommended, true);
|
||||
out.println("</font></li>");
|
||||
}
|
||||
out.println("</ol>");
|
||||
|
@ -576,8 +606,10 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
}
|
||||
*/
|
||||
|
||||
System.arraycopy(extraSamples, 0, samples, sampleLimit, extraSamples.length);
|
||||
sampleLimit += extraSamples.length;
|
||||
if (extraSamples.length > 0) {
|
||||
System.arraycopy(extraSamples, 0, samples, sampleLimit, extraSamples.length);
|
||||
sampleLimit += extraSamples.length;
|
||||
}
|
||||
}
|
||||
|
||||
public int findLastNon(String source, int offset, byte notLBType, boolean recommended) {
|
||||
|
@ -619,10 +651,14 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
*/
|
||||
{
|
||||
fileName = "Line";
|
||||
extraSingleSamples = new String[] {"can't", "can\u2019t", "ab\u00ADby" };
|
||||
extraSingleSamples = new String[] {"can't", "can\u2019t", "ab\u00ADby", "-3" };
|
||||
}
|
||||
|
||||
|
||||
public boolean recommendedDiffers() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public void sampleDescription(PrintWriter out) {
|
||||
out.println("# Samples:");
|
||||
out.println("# The test currently takes all pairs of linebreak types*,");
|
||||
|
@ -1009,6 +1045,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
|
||||
// stuff that subclasses need to override
|
||||
public byte getType(int cp, boolean recommended) {
|
||||
recommended = true; // don't care about old stuff
|
||||
// single characters
|
||||
if (cp == 0xA) return LF;
|
||||
if (cp == 0xD) return CR;
|
||||
|
@ -1028,7 +1065,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
if (recommended) {
|
||||
if (cat == Cf) return Control;
|
||||
if (cat == Me || cat == Mn) return Extend;
|
||||
if (otherExtendSet.contains(cp)) return Extend;
|
||||
// FOR FUTURE! if (otherExtendSet.contains(cp)) return Extend;
|
||||
return Base;
|
||||
}
|
||||
if (cat == Cf) return Extend;
|
||||
|
@ -1044,40 +1081,8 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
return Other;
|
||||
}
|
||||
|
||||
static public class Context {
|
||||
public int cpBefore2, cpBefore, cpAfter, cpAfter2;
|
||||
}
|
||||
|
||||
public void getGraphemeBases(String source, int offset, boolean recommended, Context context) {
|
||||
context.cpBefore2 = context.cpBefore = context.cpAfter = context.cpAfter2 = -1;
|
||||
if (false) {
|
||||
context.cpBefore = UTF16.charAt(source, offset-1);
|
||||
context.cpAfter = UTF16.charAt(source, offset);
|
||||
|
||||
int b2Offset = offset - UTF16.getCharCount(context.cpBefore) - 1;
|
||||
context.cpBefore2 = b2Offset < 0 ? -1 : UTF16.charAt(source, b2Offset);
|
||||
|
||||
int a2Offset = offset + UTF16.getCharCount(context.cpAfter);
|
||||
context.cpAfter2 = a2Offset >= source.length() ? -1 : UTF16.charAt(source, a2Offset);
|
||||
} else {
|
||||
if (DEBUG) {
|
||||
System.out.println("stop here");
|
||||
}
|
||||
int a1 = next(source, offset, recommended);
|
||||
context.cpAfter = findFirstBase(source, offset, a1);
|
||||
|
||||
int b1 = previous(source, offset, recommended);
|
||||
context.cpBefore = findFirstBase(source, b1, offset);
|
||||
|
||||
int a2 = next(source, a1, recommended);
|
||||
if (a2 != DONE) context.cpAfter2 = findFirstBase(source, a1, a2);
|
||||
|
||||
int b2 = previous(source, b1, recommended);
|
||||
if (b2 != DONE) context.cpBefore2 = findFirstBase(source, b2, b1);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isBreak(String source, int offset, boolean recommended) {
|
||||
recommended = true; // don't care about old stuff
|
||||
rule="1";
|
||||
if (offset < 0 || offset > source.length()) return false;
|
||||
if (offset == 0) return true;
|
||||
|
@ -1155,16 +1160,16 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
|
||||
static class GenerateWordBreakTest extends GenerateBreakTest {
|
||||
|
||||
static final byte Hiragana = 0, Katakana = 1, Letter = 2, MidLetter = 3, Hyphen = 4,
|
||||
static final byte Format = 0, Katakana = 1, ALetter = 2, MidLetter = 3, Hyphen = 4,
|
||||
Numeric = 5, Infix_Numeric = 6, Prefix_Numeric = 7, Postfix_Numeric = 8,
|
||||
Prefix = 9, Postfix = 10, Other = 11,
|
||||
Prefix = 9, Postfix = 10, MidNumLet = 11, Hiragana = 12, Other = 13,
|
||||
LIMIT = Other + 1;
|
||||
|
||||
static final String[] Names = {"Hiragana", "Katakana", "Letter", "MidLetter", "Hyphen",
|
||||
"Numeric", "INum", "PrNum", "PoNum", "PreLet", "PostLet", "Other" };
|
||||
static final String[] Names = {"Format", "Katakana", "ALetter", "MidLetter", "Hyphen",
|
||||
"Numeric", "INum", "PrNum", "PoNum", "PreLet", "PostLet", "MidNumLet", "Hiragana", "Other" };
|
||||
|
||||
GenerateGraphemeBreakTest grapheme = new GenerateGraphemeBreakTest();
|
||||
GenerateGraphemeBreakTest.Context context = new GenerateGraphemeBreakTest.Context();
|
||||
static GenerateGraphemeBreakTest grapheme = new GenerateGraphemeBreakTest();
|
||||
static Context context = new Context();
|
||||
|
||||
static String LENGTH = "[\u30FC\uFF70]";
|
||||
static String HALFWIDTH_KATAKANA = "[\uFF65-\uFF9F]";
|
||||
|
@ -1173,7 +1178,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
|
|||
|
||||
static UnicodeSet extraKatakana = new UnicodeSet("[" + LENGTH + HALFWIDTH_KATAKANA + KATAKANA_ITERATION + "]");
|
||||
|
||||
static UnicodeProperty LineBreakIdeographic = UnifiedBinaryProperty.make(LINE_BREAK | LB_ID);
|
||||
//static UnicodeProperty LineBreakIdeographic = UnifiedBinaryProperty.make(LINE_BREAK | LB_ID);
|
||||
static UnicodeProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase);
|
||||
static UnicodeProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink);
|
||||
|
||||
|
@ -1216,25 +1221,15 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
|||
thaiLaoNormal = new UnicodeSet("[[:thai:][:lao:]]").removeAll(prefixSet).removeAll(postfixSet);
|
||||
|
||||
// we want ideographics, hiragana, thai (except prefix/suffix)
|
||||
UnicodeSet compatIdeographics = new UnicodeSet("[\uf900-\ufa6a\\U0002F800-\\U0002FA1D]");
|
||||
//UnicodeSet compatIdeographics = new UnicodeSet("[\uf900-\ufa6a\\U0002F800-\\U0002FA1D]");
|
||||
|
||||
UnicodeSet hiragana = UnifiedBinaryProperty.make(SCRIPT | HIRAGANA_SCRIPT).getSet();
|
||||
UnicodeSet smallHiragana = new UnicodeSet(hiragana).retainAll(linebreakNS);
|
||||
|
||||
exceptionLetters = UnifiedBinaryProperty.make(BINARY_PROPERTIES | Ideographic).getSet()
|
||||
.addAll(new UnicodeSet("[[:thai:][:lao:]]"))
|
||||
.addAll(compatIdeographics)
|
||||
.addAll(hiragana)
|
||||
.addAll(thaiLaoNormal);
|
||||
|
||||
normalLetters = new UnicodeSet(letterSet).removeAll(exceptionLetters);
|
||||
|
||||
UnicodeSet missingKatakana = new UnicodeSet(extraKatakana).removeAll(new UnicodeSet("[:katakana:]"));
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("compatIdeographics: " + compatIdeographics.toPattern(true));
|
||||
Utility.showSetNames("", compatIdeographics, false, Default.ucd);
|
||||
|
||||
System.out.println("missingKatakana: " + missingKatakana.toPattern(true));
|
||||
Utility.showSetNames("", missingKatakana, false, Default.ucd);
|
||||
|
||||
|
@ -1263,7 +1258,7 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
|||
|
||||
fileName = "Word";
|
||||
extraSamples = new String[] {
|
||||
"\uFF70", "\uFF65", "\u30FD"
|
||||
"\uFF70", "\uFF65", "\u30FD", "a\u2060", "a:", "a'", "a'\u2060", "a,", "1:", "1'", "1,", "1.\u2060"
|
||||
};
|
||||
if (DEBUG) {
|
||||
System.out.println("length not covered: "
|
||||
|
@ -1278,7 +1273,12 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
|||
System.out.println("L2: " + getTypeID('\uFF70'));
|
||||
}
|
||||
|
||||
extraSingleSamples = new String[] {"can't", "can\u2019t", "ab\u00ADby", "a$-3.14%b", "3a" };
|
||||
String [] temp = {"can't", "can\u2019t", "ab\u00ADby", "a$-34,567.14%b", "3a" };
|
||||
extraSingleSamples = new String [temp.length * 2];
|
||||
System.arraycopy(temp, 0, extraSingleSamples, 0, temp.length);
|
||||
for (int i = 0; i < temp.length; ++i) {
|
||||
extraSingleSamples[i+temp.length] = insertEverywhere(temp[i], "\u2060", grapheme);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -1291,39 +1291,29 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
|||
// stuff that subclasses need to override
|
||||
public byte getType(int cp, boolean recommended) {
|
||||
byte cat = Default.ucd.getCategory(cp);
|
||||
|
||||
if (cat == Cf) return Format;
|
||||
|
||||
byte script = Default.ucd.getScript(cp);
|
||||
|
||||
if (recommended) {
|
||||
//if (prefixSet.contains(cp)) return Prefix;
|
||||
//if (postfixSet.contains(cp)) return Postfix;
|
||||
//if (exceptionLetters.contains(cp)) return XLetter;
|
||||
}
|
||||
|
||||
boolean isCatLetter = ((1<<cat) & LETTER_MASK) != 0;
|
||||
if (!recommended) {
|
||||
if (script == HIRAGANA_SCRIPT) return Hiragana;
|
||||
} else {
|
||||
if (script == HIRAGANA_SCRIPT) return Other;
|
||||
}
|
||||
if (extraKatakana.contains(cp)) return Katakana;
|
||||
|
||||
if (script == KATAKANA_SCRIPT) return Katakana;
|
||||
if (extraKatakana.contains(cp)) return Katakana;
|
||||
|
||||
if (script == HIRAGANA_SCRIPT || script == THAI_SCRIPT || script == LAO_SCRIPT) return Other;
|
||||
if (ideographicSet.contains(cp)) return Other;
|
||||
|
||||
if (alphabeticSet.contains(cp)) return ALetter;
|
||||
|
||||
byte lb = Default.ucd.getLineBreak(cp);
|
||||
if (!recommended) {
|
||||
if ((isCatLetter || cat == Sk) && lb != LB_ID) return Letter;
|
||||
} else {
|
||||
if (normalLetters.contains(cp)) return Letter;
|
||||
}
|
||||
|
||||
if (lb == LB_HY) return Hyphen;
|
||||
if (lb == LB_NU) return Numeric;
|
||||
|
||||
if (midLetterSet.contains(cp)) {
|
||||
if (lb == LB_IS) return MidNumLet;
|
||||
return MidLetter;
|
||||
}
|
||||
if (lb == LB_IS) return Infix_Numeric;
|
||||
if (lb == LB_PR) return Prefix_Numeric;
|
||||
if (lb == LB_PO) return Postfix_Numeric;
|
||||
|
||||
if (midLetterSet.contains(cp)) return MidLetter;
|
||||
|
||||
|
||||
return Other;
|
||||
}
|
||||
|
||||
|
@ -1335,112 +1325,385 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
|
|||
return 3;
|
||||
}
|
||||
|
||||
public boolean isBreak(String source, int offset, boolean recommended) {
|
||||
static public class Context {
|
||||
public int cpBefore2, cpBefore, cpAfter, cpAfter2;
|
||||
public byte tBefore2, tBefore, tAfter, tAfter2;
|
||||
}
|
||||
|
||||
public void getGraphemeBases(String source, int offset, boolean recommended, Context context) {
|
||||
context.cpBefore2 = context.cpBefore = context.cpAfter = context.cpAfter2 = -1;
|
||||
context.tBefore2 = context.tBefore = context.tAfter = context.tAfter2 = -1;
|
||||
|
||||
MyBreakIterator graphemeIterator = new MyBreakIterator();
|
||||
|
||||
graphemeIterator.set(source, offset);
|
||||
while (true) {
|
||||
int cp = graphemeIterator.previousBase();
|
||||
if (cp == -1) break;
|
||||
byte t = getResolvedType(cp, recommended);
|
||||
if (t == Format) continue;
|
||||
|
||||
if (context.cpBefore == -1) {
|
||||
context.cpBefore = cp;
|
||||
context.tBefore = t;
|
||||
} else {
|
||||
context.cpBefore2 = cp;
|
||||
context.tBefore2 = t;
|
||||
break;
|
||||
}
|
||||
}
|
||||
graphemeIterator.set(source, offset);
|
||||
while (true) {
|
||||
int cp = graphemeIterator.nextBase();
|
||||
if (cp == -1) break;
|
||||
byte t = getResolvedType(cp, recommended);
|
||||
if (t == Format) continue;
|
||||
|
||||
if (context.cpAfter == -1) {
|
||||
context.cpAfter = cp;
|
||||
context.tAfter = t;
|
||||
} else {
|
||||
context.cpAfter2 = cp;
|
||||
context.tAfter2 = t;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public boolean isBreak(String source, int offset, boolean recommended) {
|
||||
recommended = true; // don't care about old stuff
|
||||
|
||||
rule = "1";
|
||||
if (offset < 0 || offset > source.length()) return false;
|
||||
rule="16";
|
||||
|
||||
if (offset == 0) return true;
|
||||
|
||||
rule="15";
|
||||
rule = "2";
|
||||
if (offset == source.length()) return true;
|
||||
|
||||
// Treat a grapheme cluster as if it were a single character:
|
||||
// the first base character, if there is one; otherwise the first character.
|
||||
// GC => FB
|
||||
|
||||
rule="1";
|
||||
rule="3";
|
||||
if (!grapheme.isBreak( source, offset, recommended)) return false;
|
||||
|
||||
// now get the base character before and after, and their types
|
||||
|
||||
grapheme.getGraphemeBases(source, offset, recommended, context);
|
||||
getGraphemeBases(source, offset, recommended, context);
|
||||
|
||||
byte before = getResolvedType(context.cpBefore, recommended);
|
||||
byte after = getResolvedType(context.cpAfter, recommended);
|
||||
byte before2 = context.cpBefore2 < 0 ? (byte)-1 : getResolvedType(context.cpBefore2, recommended);
|
||||
byte after2 = context.cpAfter2 < 0 ? (byte)-1 : getResolvedType(context.cpAfter2, recommended);
|
||||
byte before = context.tBefore;
|
||||
byte after = context.tAfter;
|
||||
byte before2 = context.tBefore2;
|
||||
byte after2 = context.tAfter2;
|
||||
|
||||
//Don't break between most letters
|
||||
// Letter × Letter
|
||||
// ALetter × ALetter
|
||||
|
||||
rule = "2";
|
||||
if (before == Letter && after == Letter) return false;
|
||||
rule = "5";
|
||||
if (before == ALetter && after == ALetter) return false;
|
||||
|
||||
// Don't break letters across certain punctuation
|
||||
// Letter × MidLetter Letter (3)
|
||||
// Letter MidLetter × Letter (4)
|
||||
// ALetter × (MidLetter | MidNumLet) ALetter (6)
|
||||
// ALetter (MidLetter | MidNumLet) × ALetter (7)
|
||||
|
||||
/*if (recommended) {
|
||||
rule = "2a";
|
||||
if (before == Prefix && after == Letter) return false;
|
||||
rule = "6";
|
||||
if (before == ALetter && (after == MidLetter || after == MidNumLet) && after2 == ALetter) return false;
|
||||
|
||||
rule = "2b";
|
||||
if (before == Letter && after == Postfix) return false;
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
rule = "3";
|
||||
if (before == Letter && after == MidLetter && after2 == Letter) return false;
|
||||
|
||||
rule = "4";
|
||||
if (before2 == Letter && before == MidLetter && after == Letter) return false;
|
||||
rule = "7";
|
||||
if (before2 == ALetter && (before == MidLetter || before == MidNumLet) && after == ALetter) return false;
|
||||
|
||||
// Don't break within sequences of digits, or digits adjacent to letters.
|
||||
|
||||
// Numeric × Numeric (5)
|
||||
rule = "5";
|
||||
rule = "8";
|
||||
if (before == Numeric && after == Numeric) return false;
|
||||
|
||||
// Letter × Numeric (6)
|
||||
rule = "6";
|
||||
if (before == Letter && after == Numeric) return false;
|
||||
// ALetter × Numeric (9)
|
||||
rule = "9";
|
||||
if (before == ALetter && after == Numeric) return false;
|
||||
|
||||
// Numeric × Letter (7)
|
||||
rule = "7";
|
||||
if (before == Numeric && after == Letter) return false;
|
||||
// Numeric × ALetter (10)
|
||||
rule = "10";
|
||||
if (before == Numeric && after == ALetter) return false;
|
||||
|
||||
|
||||
// Don't break within sequences like: '-3.2'
|
||||
|
||||
// Hyphen × Numeric (8)
|
||||
rule = "8";
|
||||
if (before == Hyphen && after == Numeric) return false;
|
||||
|
||||
// Numeric Infix_Numeric × Numeric (9)
|
||||
rule = "9";
|
||||
if (before2 == Numeric && before == Infix_Numeric && after == Numeric) return false;
|
||||
|
||||
// Numeric × Infix_Numeric Numeric (10)
|
||||
rule = "10";
|
||||
if (before == Numeric && after == Infix_Numeric && after2 == Numeric) return false;
|
||||
|
||||
// Prefix_Numeric × Numeric (11)
|
||||
// Numeric (MidNum | MidNumLet) × Numeric (11)
|
||||
rule = "11";
|
||||
if (before == Prefix_Numeric && after == Numeric) return false;
|
||||
if (before2 == Numeric && (before == Infix_Numeric || before == MidNumLet) && after == Numeric) return false;
|
||||
|
||||
// Numeric × Postfix_Numeric (12)
|
||||
// Numeric × (MidNum | MidNumLet) Numeric (12)
|
||||
rule = "12";
|
||||
if (before == Numeric && after == Postfix_Numeric) return false;
|
||||
if (before == Numeric && (after == Infix_Numeric || after == MidNumLet) && after2 == Numeric) return false;
|
||||
|
||||
// Don't break between Hiragana or Katakana
|
||||
// Don't break between Hiragana
|
||||
|
||||
if (!recommended) {
|
||||
// Hiragana × Hiragana (13)
|
||||
rule = "13";
|
||||
if (before == Hiragana && after == Hiragana) return false;
|
||||
}
|
||||
|
||||
// Katakana × Katakana (14)
|
||||
rule = "14";
|
||||
if (before == Katakana && after == Katakana) return false;
|
||||
// Hiragana × Hiragana (13)
|
||||
rule = "13";
|
||||
if (before == Hiragana && after == Hiragana) return false;
|
||||
|
||||
// Otherwise break always.
|
||||
rule = "15";
|
||||
rule = "14";
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//==============================================
|
||||
|
||||
static class GenerateSentenceBreakTest extends GenerateBreakTest {
|
||||
|
||||
static final byte Format = 0, Sep = 1, Sp = 2, OLetter = 3, Lower = 4, Upper = 5,
|
||||
Close = 6, ATerm = 7, Term = 8, Other = 9,
|
||||
LIMIT = Other + 1;
|
||||
|
||||
static final String[] Names = {"Format", "Sep", "Sp", "OLetter", "Lower", "Upper",
|
||||
"Close", "ATerm", "Term", "Other" };
|
||||
|
||||
static GenerateGraphemeBreakTest grapheme = new GenerateGraphemeBreakTest();
|
||||
|
||||
static UnicodeSet sepSet = new UnicodeSet("[\\u000a\\u000d\\u0085\\u2029\\u2028]");
|
||||
static UnicodeSet atermSet = new UnicodeSet("[\\u002E]");
|
||||
static UnicodeSet termSet = new UnicodeSet("[\\u0021\\u003F\\u0589\\u061f\\u06d4\\u0700-\\u0702\\u0934"
|
||||
+ "\\u1362\\u1367\\u1368\\u1803\\u1809\\u203c\\u203d\\u2048\\u2049\\u3002\\ufe52\\ufe57\\uff01\\uff0e\\uff1f\\uff61]");
|
||||
|
||||
static UnicodeProperty lowercaseProp = UnifiedBinaryProperty.make(DERIVED | PropLowercase);
|
||||
static UnicodeProperty uppercaseProp = UnifiedBinaryProperty.make(DERIVED | PropUppercase);
|
||||
|
||||
{
|
||||
|
||||
fileName = "Sentence";
|
||||
extraSamples = new String[] {
|
||||
|
||||
};
|
||||
String[] temp = new String[] {
|
||||
"(\"Go.\") (He did.)",
|
||||
"(\"Go?\") (He did.)",
|
||||
"U.S.A\u0300. is",
|
||||
"U.S.A\u0300? He",
|
||||
"U.S.A\u0300.",
|
||||
"\u4e00.\u4300",
|
||||
"\u4e00?\u4300",
|
||||
};
|
||||
extraSingleSamples = new String [temp.length * 2];
|
||||
System.arraycopy(temp, 0, extraSingleSamples, 0, temp.length);
|
||||
for (int i = 0; i < temp.length; ++i) {
|
||||
extraSingleSamples[i+temp.length] = insertEverywhere(temp[i], "\u2060", grapheme);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public String getTypeID(int cp, boolean recommended) {
|
||||
byte type = getType(cp, recommended);
|
||||
return Names[type];
|
||||
}
|
||||
|
||||
// stuff that subclasses need to override
|
||||
public byte getType(int cp, boolean recommended) {
|
||||
byte cat = Default.ucd.getCategory(cp);
|
||||
|
||||
if (cat == Cf) return Format;
|
||||
if (sepSet.contains(cp)) return Sep;
|
||||
if (Default.ucd.getBinaryProperty(cp, White_space)) return Sp;
|
||||
if (alphabeticSet.contains(cp)) return OLetter;
|
||||
if (lowercaseProp.hasValue(cp)) return Lower;
|
||||
if (uppercaseProp.hasValue(cp) || cat == Lt) return Upper;
|
||||
if (atermSet.contains(cp)) return ATerm;
|
||||
if (termSet.contains(cp)) return Term;
|
||||
if (cat == Po || cat == Pe
|
||||
|| Default.ucd.getLineBreak(cp) == LB_QU) return Close;
|
||||
return Other;
|
||||
}
|
||||
|
||||
public int genTestItems(String before, String after, String[] results) {
|
||||
results[0] = before + after;
|
||||
/*
|
||||
results[1] = 'a' + before + "\u0301\u0308" + after + "\u0301\u0308" + 'a';
|
||||
results[2] = 'a' + before + "\u0301\u0308" + samples[MidLetter] + after + "\u0301\u0308" + 'a';
|
||||
results[3] = 'a' + before + "\u0301\u0308" + samples[Infix_Numeric] + after + "\u0301\u0308" + 'a';
|
||||
*/
|
||||
return 1;
|
||||
}
|
||||
|
||||
public boolean isBreak(String source, int offset, boolean recommended) {
|
||||
|
||||
rule = "1";
|
||||
if (offset < 0 || offset > source.length()) return false;
|
||||
|
||||
if (offset == 0) return true;
|
||||
|
||||
rule = "2";
|
||||
if (offset == source.length()) return true;
|
||||
|
||||
// Sep ÷ (3)
|
||||
rule = "3";
|
||||
byte before = getResolvedType(source.charAt(offset-1), recommended);
|
||||
if (before == Sep) return true;
|
||||
|
||||
// Treat a grapheme cluster as if it were a single character:
|
||||
// the first base character, if there is one; otherwise the first character.
|
||||
// GC => FB
|
||||
// Ignore interior Format characters. That is, ignore Format characters in all subsequent rules.
|
||||
// X Format*
|
||||
// ?
|
||||
// X
|
||||
// (5)
|
||||
|
||||
rule="3";
|
||||
if (!grapheme.isBreak( source, offset, recommended)) return false;
|
||||
|
||||
// Do not break after ambiguous terminators like period, if the first following letter is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
|
||||
// ATerm Close* Sp*×(¬( OLetter | Upper ))* Lower(6)
|
||||
// ATerm ×Upper (7)
|
||||
|
||||
// Break after sentence terminators, but include closing punctuation, trailing spaces, and (optionally) a paragraph separator.
|
||||
// ( Term | ATerm ) Close*×( Close | Sp | Sep )(8)
|
||||
// ( Term | ATerm ) Close* Sp×( Sp | Sep )(9)
|
||||
// ( Term | ATerm ) Close* Sp*÷(10)
|
||||
|
||||
|
||||
// These cases are all handled together.
|
||||
// First we loop backwards, checking for the different types.
|
||||
|
||||
MyBreakIterator graphemeIterator = new MyBreakIterator();
|
||||
graphemeIterator.set(source, offset);
|
||||
|
||||
int state = 0;
|
||||
byte lookAfter = -1;
|
||||
int cp;
|
||||
byte t;
|
||||
boolean gotSpace = false;
|
||||
boolean gotClose = false;
|
||||
|
||||
behindLoop:
|
||||
while (true) {
|
||||
cp = graphemeIterator.previousBase();
|
||||
if (cp == -1) break;
|
||||
t = getResolvedType(cp, recommended);
|
||||
if (SHOW_TYPE) System.out.println(Default.ucd.getCodeAndName(cp) + ", " + getTypeID(cp, recommended));
|
||||
|
||||
if (t == Format) continue; // ignore all formats!
|
||||
|
||||
switch (state) {
|
||||
case 0:
|
||||
if (t == Sp) {
|
||||
// loop as long as we have Space
|
||||
gotSpace = true;
|
||||
continue behindLoop;
|
||||
} else if (t == Close) {
|
||||
gotClose = true;
|
||||
state = 1; // go to close loop
|
||||
continue behindLoop;
|
||||
}
|
||||
break;
|
||||
case 1:
|
||||
if (t == Close) {
|
||||
// loop as long as we have Close
|
||||
continue behindLoop;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (t == ATerm) {
|
||||
lookAfter = ATerm;
|
||||
} else if (t == Term) {
|
||||
lookAfter = Term;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// if we didn't find ATerm or Term, bail
|
||||
|
||||
if (lookAfter == -1) {
|
||||
// Otherwise, do not break
|
||||
// Any × Any (11)
|
||||
rule = "11";
|
||||
return false;
|
||||
}
|
||||
|
||||
// Do not break after ambiguous terminators like period, if the first following letter is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
|
||||
// ATerm Close* Sp*×(¬( OLetter | Upper ))* Lower(6)
|
||||
// ATerm ×Upper (7)
|
||||
|
||||
// Break after sentence terminators, but include closing punctuation, trailing spaces, and (optionally) a paragraph separator.
|
||||
// ( Term | ATerm ) Close*×( Close | Sp | Sep )(8)
|
||||
// ( Term | ATerm ) Close* Sp×( Sp | Sep )(9)
|
||||
// ( Term | ATerm ) Close* Sp*÷(10)
|
||||
|
||||
// We DID find one. Loop to see if the right side is ok.
|
||||
|
||||
graphemeIterator.set(source, offset);
|
||||
boolean isFirst = true;
|
||||
while (true) {
|
||||
cp = graphemeIterator.nextBase();
|
||||
if (cp == -1) break;
|
||||
t = getResolvedType(cp, recommended);
|
||||
if (SHOW_TYPE) System.out.println(Default.ucd.getCodeAndName(cp) + ", " + getTypeID(cp, recommended));
|
||||
|
||||
if (t == Format) continue; // skip format characters!
|
||||
|
||||
if (isFirst) {
|
||||
isFirst = false;
|
||||
if (lookAfter == ATerm && t == Upper) {
|
||||
rule = "7";
|
||||
return false;
|
||||
}
|
||||
if (gotSpace) {
|
||||
if (t == Sp || t == Sep) {
|
||||
rule = "9";
|
||||
return false;
|
||||
}
|
||||
} else if (t == Close || t == Sp || t == Sep) {
|
||||
rule = "8";
|
||||
return false;
|
||||
}
|
||||
if (lookAfter == Term) break;
|
||||
}
|
||||
|
||||
// at this point, we have an ATerm. All other conditions are ok, but we need to verify 6
|
||||
if (t != OLetter && t != Upper && t != Lower) continue;
|
||||
if (t == Lower) {
|
||||
rule = "6";
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
rule = "10";
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
static class MyBreakIterator {
|
||||
int offset = 0;
|
||||
String string = "";
|
||||
GenerateBreakTest breaker = new GenerateGraphemeBreakTest();
|
||||
boolean recommended = true;
|
||||
|
||||
public MyBreakIterator set(String source, int offset) {
|
||||
string = source;
|
||||
this.offset = offset;
|
||||
return this;
|
||||
}
|
||||
|
||||
public int nextBase() {
|
||||
if (offset >= string.length()) return -1;
|
||||
int result = UTF16.charAt(string, offset);
|
||||
for (++offset; offset < string.length(); ++offset) {
|
||||
if (breaker.isBreak(string, offset, recommended)) break;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public int previousBase() {
|
||||
if (offset <= 0) return -1;
|
||||
for (--offset; offset >= 0; --offset) {
|
||||
if (breaker.isBreak(string, offset, recommended)) break;
|
||||
}
|
||||
return UTF16.charAt(string, offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
|
||||
* $Date: 2002/08/08 15:35:01 $
|
||||
* $Revision: 1.21 $
|
||||
* $Date: 2002/08/09 23:56:24 $
|
||||
* $Revision: 1.22 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -78,6 +78,8 @@ public final class Main implements UCD_Types {
|
|||
else if (arg.equalsIgnoreCase("TestNormalization")) TestNormalization.main(null);
|
||||
|
||||
|
||||
else if (arg.equalsIgnoreCase("checkDecompFolding")) VerifyUCD.checkDecompFolding();
|
||||
|
||||
else if (arg.equalsIgnoreCase("breaktest")) GenerateBreakTest.main(null);
|
||||
else if (arg.equalsIgnoreCase("checkcollator")) CheckCollator.main(null);
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
|
||||
* $Date: 2002/07/30 09:56:40 $
|
||||
* $Revision: 1.18 $
|
||||
* $Date: 2002/08/09 23:56:24 $
|
||||
* $Revision: 1.19 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -31,6 +31,37 @@ import java.text.NumberFormat;
|
|||
public class VerifyUCD implements UCD_Types {
|
||||
static final boolean DEBUG = false;
|
||||
|
||||
static void checkDecompFolding() {
|
||||
Default.setUCD();
|
||||
UnicodeSet sum = new UnicodeSet();
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
Utility.dot(cp);
|
||||
if (!Default.ucd.isAllocated(cp)) continue;
|
||||
byte cat = Default.ucd.getCategory(cp);
|
||||
if (cat == UNASSIGNED || cat == PRIVATE_USE) continue;
|
||||
String decomp = Default.nfd.normalize(cp);
|
||||
String foldDecomp = Default.ucd.getCase(decomp, FULL, FOLD);
|
||||
int d0 = Default.ucd.getCombiningClass(decomp.charAt(0));
|
||||
int dL = Default.ucd.getCombiningClass(decomp.charAt(decomp.length()-1));
|
||||
int f0 = Default.ucd.getCombiningClass(foldDecomp.charAt(0));
|
||||
int fL = Default.ucd.getCombiningClass(foldDecomp.charAt(decomp.length()-1));
|
||||
if (d0 != f0 || dL != fL) {
|
||||
Utility.fixDot();
|
||||
System.out.println();
|
||||
System.out.println("Exception: " + Default.ucd.getCodeAndName(cp));
|
||||
System.out.println("Decomp: " + Default.ucd.getCodeAndName(decomp));
|
||||
System.out.println("FoldedDecomp: " + Default.ucd.getCodeAndName(foldDecomp));
|
||||
System.out.println("d0: " + d0 + ", "
|
||||
+ "dL: " + dL + ", "
|
||||
+ "f0: " + f0 + ", "
|
||||
+ "fL: " + fL
|
||||
);
|
||||
sum.add(cp);
|
||||
}
|
||||
}
|
||||
System.out.println("Set: " + sum.toPattern(true));
|
||||
}
|
||||
|
||||
static void oneTime() {
|
||||
Default.setUCD();
|
||||
int[] testSet = {0x10000, 'a', 0xE0000, '\u0221'}; // 10000
|
||||
|
|
Loading…
Add table
Reference in a new issue