cleaned up break tests, added sentence test

X-SVN-Rev: 9663
This commit is contained in:
Mark Davis 2002-08-09 23:56:24 +00:00
parent 5159355344
commit 04a7e86e8f
4 changed files with 535 additions and 239 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/CheckCollator.java,v $
* $Date: 2002/08/08 15:35:01 $
* $Revision: 1.1 $
* $Date: 2002/08/09 23:56:24 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
@ -41,7 +41,7 @@ abstract public class CheckCollator {
// later, drive off of args
// choices are: Asian, Chinese, Japanese, Japanese_h, Japanese_k, Korean, Latin, Russian, Thai
test(Locale.KOREAN, "Korean");
//test(Locale.KOREAN, "Korean");
test(Locale.ENGLISH, "Latin");
test(Locale.FRENCH, "Latin");
test(Locale.JAPANESE, "Japanese");

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateBreakTest.java,v $
* $Date: 2002/08/08 15:38:15 $
* $Revision: 1.1 $
* $Date: 2002/08/09 23:56:24 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
@ -59,58 +59,53 @@ abstract public class GenerateBreakTest implements UCD_Types {
return UTF16.charAt(source, start);
}
// quick & dirty routine
/**
 * Builds a copy of {@code source} decorated with {@code insertion}.
 * The result always starts and ends with {@code insertion}; in addition, after each
 * char of {@code source} is copied, {@code insertion} is appended again whenever
 * {@code breaker} reports a break at that char's own offset.
 *
 * @param source    text to decorate
 * @param insertion text to splice in
 * @param breaker   break tester consulted with {@code recommended == true}
 * @return the decorated string
 */
String insertEverywhere(String source, String insertion, GenerateBreakTest breaker) {
    StringBuffer buffer = new StringBuffer();
    buffer.append(insertion);
    final int length = source.length();
    for (int index = 0; index < length; ++index) {
        buffer.append(source.charAt(index));
        if (!breaker.isBreak(source, index, true)) {
            continue;
        }
        buffer.append(insertion);
    }
    buffer.append(insertion);
    return buffer.toString();
}
static UnicodeSet midLetterSet = new UnicodeSet("[\u0027\u002E\u003A\u00AD\u05F3\u05F4\u2019\uFE52\uFE55\uFF07\uFF0E\uFF1A]");
/*
U+0027 APOSTROPHE
U+002E FULL STOP
U+003A COLON # used in Swedish
U+00AD SOFT HYPHEN
U+05F3 HEBREW PUNCTUATION GERESH
U+05F4 HEBREW PUNCTUATION GERSHAYIM
U+2019 RIGHT SINGLE QUOTATION MARK
U+FE52 SMALL FULL STOP
U+FE55 SMALL COLON
U+FF07 FULLWIDTH APOSTROPHE
U+FF0E FULLWIDTH FULL STOP
U+FF1A FULLWIDTH COLON
*/
static UnicodeSet ambigSentPunct = new UnicodeSet("[\u002E\u0589\u06D4]");
/*
U+002E FULL STOP
U+0589 ARMENIAN FULL STOP
U+06D4 ARABIC FULL STOP
*/
static UnicodeSet sentPunct = new UnicodeSet("[\u0021\u003F\u0387\u061F\u0964\u203C\u203D\u2048\u2049"
+ "\u3002\ufe52\ufe57\uff01\uff0e\uff1f\uff61]");
/*
U+0021 EXCLAMATION MARK
U+003F QUESTION MARK
U+0387 GREEK ANO TELEIA
U+061F ARABIC QUESTION MARK
U+0964 DEVANAGARI DANDA
U+203C DOUBLE EXCLAMATION MARK
U+203D INTERROBANG
U+2048 QUESTION EXCLAMATION MARK
U+2049 EXCLAMATION QUESTION MARK
U+3002 IDEOGRAPHIC FULL STOP
U+FE52 SMALL FULL STOP
U+FE57 SMALL EXCLAMATION MARK
U+FF01 FULLWIDTH EXCLAMATION MARK
U+FF0E FULLWIDTH FULL STOP
U+FF1F FULLWIDTH QUESTION MARK
U+FF61 HALFWIDTH IDEOGRAPHIC FULL STOP
*/
static {
Default.setUCD();
}
static UnicodeSet extraAlpha = new UnicodeSet("[\\u02B9-\\u02BA\\u02C2-\\u02CF\\u02D2-\\u02DF\\u02E5\\u02ED\\u05F3]");
static UnicodeSet alphabeticSet = UnifiedBinaryProperty.make(DERIVED | PropAlphabetic).getSet()
.addAll(extraAlpha);
static UnicodeSet ideographicSet = UnifiedBinaryProperty.make(BINARY_PROPERTIES | Ideographic).getSet();
static {
System.out.println("alphabetic: " + alphabeticSet.toPattern(true));
}
// ====================== Main ===========================
static final boolean SHOW_TYPE = false;
public static void main(String[] args) throws IOException {
System.out.println("Remember to add length marks (half & full) and other punctuation for sentence, with FF61");
Default.setUCD();
checkDecomps();
if (DEBUG) {
checkDecomps();
Utility.showSetNames("", new UnicodeSet("[\u034F\u00AD\u1806[:DI:]-[:Cs:]-[:Cn:]]"), true, Default.ucd);
System.out.println("*** Extend - Cf");
@ -122,7 +117,20 @@ abstract public class GenerateBreakTest implements UCD_Types {
gwb.printLine(systemPrintWriter, "n\u0308't", true, true, false);
systemPrintWriter.flush();
}
if (false) {
GenerateSentenceBreakTest foo = new GenerateSentenceBreakTest();
foo.isBreak("(\"Go.\") (He did)", 5, true);
showSet("sepSet", GenerateSentenceBreakTest.sepSet);
showSet("atermSet", GenerateSentenceBreakTest.atermSet);
showSet("termSet", GenerateSentenceBreakTest.termSet);
}
new GenerateSentenceBreakTest().run();
//if (true) return; // cut short for now
new GenerateLineBreakTest().run();
new GenerateGraphemeBreakTest().run();
new GenerateWordBreakTest().run();
@ -178,6 +186,13 @@ abstract public class GenerateBreakTest implements UCD_Types {
return result.toString();
}
/**
 * Debug helper: prints {@code title} followed by the set's pattern
 * ({@code toPattern(true)}), then hands the set to {@code Utility.showSetNames}
 * to list its contents.
 *
 * @param title label printed before the pattern
 * @param set   set to display
 */
static void showSet(String title, UnicodeSet set) {
    final String pattern = set.toPattern(true);
    final String heading = title + ": " + pattern;
    System.out.println(heading);
    Utility.showSetNames("", set, false, Default.ucd);
}
// determines if string is of form Base NSM*
static boolean isBaseNSMStar(String source) {
int cp;
@ -262,62 +277,74 @@ abstract public class GenerateBreakTest implements UCD_Types {
PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest.html", Utility.UTF8_WINDOWS);
out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>"
+ fileName + "</title></head>");
out.println("<body bgcolor='#FFFFFF'><h3>Current (fixed only for consistency):</h3>");
out.println("<body bgcolor='#FFFFFF'><h3>Current:</h3>");
generateTable(out, false);
out.println("<h3>Recommended:</h3>");
generateTable(out, true);
out.println("</body></html>");
if (recommendedDiffers()) {
generateTable(out, false);
out.println("<h3>Recommended:</h3>");
generateTable(out, true);
out.println("</body></html>");
} else {
generateTable(out, true);
}
out.close();
if (recommendedDiffers()) {
generateTest(false, false);
}
generateTest(false, true);
}
public void generateTest(boolean shortVersion, boolean recommended) throws IOException {
String[] testCase = new String[50];
// do main test
for (int k = 0; k < 2; ++k) {
out = Utility.openPrintWriter(fileName + (k == 0 ? "Test_SHORT.txt" : "Test.txt"), Utility.LATIN1_WINDOWS);
int counter = 0;
PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest"
+ (recommended & recommendedDiffers() ? "_NEW" : "")
+ (shortVersion ? "_SHORT" : "")
+ ".txt", Utility.LATIN1_WINDOWS);
int counter = 0;
out.println("# Default " + fileName + " Break Test");
out.println("# Generated: " + Default.getDate() + ", MED");
out.println("#");
out.println("# Format:");
out.println("# <string> (# <comment>)? ");
out.println("# <string> contains hex Unicode code points, with ");
out.println("#\t" + BREAK + " wherever there is a break opportunity, and ");
out.println("#\t" + NOBREAK + " wherever there is not.");
out.println("# <comment> the format can change, but currently it shows:");
out.println("#\t- the sample character name");
out.println("#\t- (x) the line_break property* for the sample character");
out.println("#\t- [x] the rule that determines whether there is a break or not");
out.println("#");
sampleDescription(out);
out.println("# These samples may be extended or changed in the future.");
out.println("#");
out.println("# Default " + fileName + " Break Test");
out.println("# Generated: " + Default.getDate() + ", MED");
out.println("#");
out.println("# Format:");
out.println("# <string> (# <comment>)? ");
out.println("# <string> contains hex Unicode code points, with ");
out.println("#\t" + BREAK + " wherever there is a break opportunity, and ");
out.println("#\t" + NOBREAK + " wherever there is not.");
out.println("# <comment> the format can change, but currently it shows:");
out.println("#\t- the sample character name");
out.println("#\t- (x) the line_break property* for the sample character");
out.println("#\t- [x] the rule that determines whether there is a break or not");
out.println("#");
sampleDescription(out);
out.println("# These samples may be extended or changed in the future.");
out.println("#");
for (int ii = 0; ii < sampleLimit; ++ii) {
String before = samples[ii];
for (int ii = 0; ii < sampleLimit; ++ii) {
String before = samples[ii];
for (int jj = 0; jj < sampleLimit; ++jj) {
Utility.dot(counter);
String after = samples[jj];
for (int jj = 0; jj < sampleLimit; ++jj) {
Utility.dot(counter);
String after = samples[jj];
// do line straight
int len = genTestItems(before, after, testCase);
for (int q = 0; q < len; ++q) {
printLine(out, testCase[q], k != 0 && q == 0, false, false);
++counter;
}
// do line straight
int len = genTestItems(before, after, testCase);
for (int q = 0; q < len; ++q) {
printLine(out, testCase[q], !shortVersion && q == 0, recommended, false);
++counter;
}
}
for (int ii = 0; ii < extraSingleSamples.length; ++ii) {
printLine(out, extraSingleSamples[ii], true, false, false);
}
out.println("# Lines: " + counter);
out.close();
}
for (int ii = 0; ii < extraSingleSamples.length; ++ii) {
printLine(out, extraSingleSamples[ii], true, recommended, false);
}
out.println("# Lines: " + counter);
out.close();
}
public void sampleDescription(PrintWriter out) {}
@ -328,6 +355,9 @@ abstract public class GenerateBreakTest implements UCD_Types {
abstract public String getTypeID(int s, boolean recommended);
/**
 * Tells whether this break test has a "recommended" variant that is distinct
 * from the "current" one. This default implementation always returns false;
 * a subclass can override it to return true.
 * NOTE(review): the visible callers appear to use the answer to decide whether
 * both the current and the recommended table/test files are written -- confirm
 * against the full file.
 *
 * @return false in this base implementation
 */
public boolean recommendedDiffers() {
return false;
}
final public byte getType (int cp) {
return getType(cp, false);
@ -406,7 +436,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
}
public void generateTable(PrintWriter out, boolean recommended) {
String width = "width='" + (100 / (tableLimit + 1)) + "%'";
String width = "width='" + (100 / (tableLimit + 2)) + "%'";
out.print("<table border='1' cellspacing='0' width='100%'>");
String types = "";
String codes = "";
@ -424,7 +454,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
String[] rule = new String[1];
String[] rule2 = new String[1];
for (int type = 0; type < tableLimit; ++type) {
for (int type = 0; type < sampleLimit; ++type) {
String before = samples[type];
if (before == null) continue;
@ -455,7 +485,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
out.println("<ol>");
for (int ii = 0; ii < extraSingleSamples.length; ++ii) {
out.println("<li><font size='5'>");
printLine(out, extraSingleSamples[ii], true, false, true);
printLine(out, extraSingleSamples[ii], true, recommended, true);
out.println("</font></li>");
}
out.println("</ol>");
@ -576,8 +606,10 @@ abstract public class GenerateBreakTest implements UCD_Types {
}
*/
System.arraycopy(extraSamples, 0, samples, sampleLimit, extraSamples.length);
sampleLimit += extraSamples.length;
if (extraSamples.length > 0) {
System.arraycopy(extraSamples, 0, samples, sampleLimit, extraSamples.length);
sampleLimit += extraSamples.length;
}
}
public int findLastNon(String source, int offset, byte notLBType, boolean recommended) {
@ -619,10 +651,14 @@ abstract public class GenerateBreakTest implements UCD_Types {
*/
{
fileName = "Line";
extraSingleSamples = new String[] {"can't", "can\u2019t", "ab\u00ADby" };
extraSingleSamples = new String[] {"can't", "can\u2019t", "ab\u00ADby", "-3" };
}
/**
 * Override that always returns true, i.e. this break test declares that its
 * recommended behavior is distinct from the current one.
 *
 * @return true
 */
public boolean recommendedDiffers() {
return true;
}
public void sampleDescription(PrintWriter out) {
out.println("# Samples:");
out.println("# The test currently takes all pairs of linebreak types*,");
@ -1009,6 +1045,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
// stuff that subclasses need to override
public byte getType(int cp, boolean recommended) {
recommended = true; // don't care about old stuff
// single characters
if (cp == 0xA) return LF;
if (cp == 0xD) return CR;
@ -1028,7 +1065,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
if (recommended) {
if (cat == Cf) return Control;
if (cat == Me || cat == Mn) return Extend;
if (otherExtendSet.contains(cp)) return Extend;
// FOR FUTURE! if (otherExtendSet.contains(cp)) return Extend;
return Base;
}
if (cat == Cf) return Extend;
@ -1044,40 +1081,8 @@ abstract public class GenerateBreakTest implements UCD_Types {
return Other;
}
static public class Context {
public int cpBefore2, cpBefore, cpAfter, cpAfter2;
}
public void getGraphemeBases(String source, int offset, boolean recommended, Context context) {
context.cpBefore2 = context.cpBefore = context.cpAfter = context.cpAfter2 = -1;
if (false) {
context.cpBefore = UTF16.charAt(source, offset-1);
context.cpAfter = UTF16.charAt(source, offset);
int b2Offset = offset - UTF16.getCharCount(context.cpBefore) - 1;
context.cpBefore2 = b2Offset < 0 ? -1 : UTF16.charAt(source, b2Offset);
int a2Offset = offset + UTF16.getCharCount(context.cpAfter);
context.cpAfter2 = a2Offset >= source.length() ? -1 : UTF16.charAt(source, a2Offset);
} else {
if (DEBUG) {
System.out.println("stop here");
}
int a1 = next(source, offset, recommended);
context.cpAfter = findFirstBase(source, offset, a1);
int b1 = previous(source, offset, recommended);
context.cpBefore = findFirstBase(source, b1, offset);
int a2 = next(source, a1, recommended);
if (a2 != DONE) context.cpAfter2 = findFirstBase(source, a1, a2);
int b2 = previous(source, b1, recommended);
if (b2 != DONE) context.cpBefore2 = findFirstBase(source, b2, b1);
}
}
public boolean isBreak(String source, int offset, boolean recommended) {
recommended = true; // don't care about old stuff
rule="1";
if (offset < 0 || offset > source.length()) return false;
if (offset == 0) return true;
@ -1155,16 +1160,16 @@ abstract public class GenerateBreakTest implements UCD_Types {
static class GenerateWordBreakTest extends GenerateBreakTest {
static final byte Hiragana = 0, Katakana = 1, Letter = 2, MidLetter = 3, Hyphen = 4,
static final byte Format = 0, Katakana = 1, ALetter = 2, MidLetter = 3, Hyphen = 4,
Numeric = 5, Infix_Numeric = 6, Prefix_Numeric = 7, Postfix_Numeric = 8,
Prefix = 9, Postfix = 10, Other = 11,
Prefix = 9, Postfix = 10, MidNumLet = 11, Hiragana = 12, Other = 13,
LIMIT = Other + 1;
static final String[] Names = {"Hiragana", "Katakana", "Letter", "MidLetter", "Hyphen",
"Numeric", "INum", "PrNum", "PoNum", "PreLet", "PostLet", "Other" };
static final String[] Names = {"Format", "Katakana", "ALetter", "MidLetter", "Hyphen",
"Numeric", "INum", "PrNum", "PoNum", "PreLet", "PostLet", "MidNumLet", "Hiragana", "Other" };
GenerateGraphemeBreakTest grapheme = new GenerateGraphemeBreakTest();
GenerateGraphemeBreakTest.Context context = new GenerateGraphemeBreakTest.Context();
static GenerateGraphemeBreakTest grapheme = new GenerateGraphemeBreakTest();
static Context context = new Context();
static String LENGTH = "[\u30FC\uFF70]";
static String HALFWIDTH_KATAKANA = "[\uFF65-\uFF9F]";
@ -1173,7 +1178,7 @@ abstract public class GenerateBreakTest implements UCD_Types {
static UnicodeSet extraKatakana = new UnicodeSet("[" + LENGTH + HALFWIDTH_KATAKANA + KATAKANA_ITERATION + "]");
static UnicodeProperty LineBreakIdeographic = UnifiedBinaryProperty.make(LINE_BREAK | LB_ID);
//static UnicodeProperty LineBreakIdeographic = UnifiedBinaryProperty.make(LINE_BREAK | LB_ID);
static UnicodeProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase);
static UnicodeProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink);
@ -1216,25 +1221,15 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
thaiLaoNormal = new UnicodeSet("[[:thai:][:lao:]]").removeAll(prefixSet).removeAll(postfixSet);
// we want ideographics, hiragana, thai (except prefix/suffix)
UnicodeSet compatIdeographics = new UnicodeSet("[\uf900-\ufa6a\\U0002F800-\\U0002FA1D]");
//UnicodeSet compatIdeographics = new UnicodeSet("[\uf900-\ufa6a\\U0002F800-\\U0002FA1D]");
UnicodeSet hiragana = UnifiedBinaryProperty.make(SCRIPT | HIRAGANA_SCRIPT).getSet();
UnicodeSet smallHiragana = new UnicodeSet(hiragana).retainAll(linebreakNS);
exceptionLetters = UnifiedBinaryProperty.make(BINARY_PROPERTIES | Ideographic).getSet()
.addAll(new UnicodeSet("[[:thai:][:lao:]]"))
.addAll(compatIdeographics)
.addAll(hiragana)
.addAll(thaiLaoNormal);
normalLetters = new UnicodeSet(letterSet).removeAll(exceptionLetters);
UnicodeSet missingKatakana = new UnicodeSet(extraKatakana).removeAll(new UnicodeSet("[:katakana:]"));
if (DEBUG) {
System.out.println("compatIdeographics: " + compatIdeographics.toPattern(true));
Utility.showSetNames("", compatIdeographics, false, Default.ucd);
System.out.println("missingKatakana: " + missingKatakana.toPattern(true));
Utility.showSetNames("", missingKatakana, false, Default.ucd);
@ -1263,7 +1258,7 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
fileName = "Word";
extraSamples = new String[] {
"\uFF70", "\uFF65", "\u30FD"
"\uFF70", "\uFF65", "\u30FD", "a\u2060", "a:", "a'", "a'\u2060", "a,", "1:", "1'", "1,", "1.\u2060"
};
if (DEBUG) {
System.out.println("length not covered: "
@ -1278,7 +1273,12 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
System.out.println("L2: " + getTypeID('\uFF70'));
}
extraSingleSamples = new String[] {"can't", "can\u2019t", "ab\u00ADby", "a$-3.14%b", "3a" };
String [] temp = {"can't", "can\u2019t", "ab\u00ADby", "a$-34,567.14%b", "3a" };
extraSingleSamples = new String [temp.length * 2];
System.arraycopy(temp, 0, extraSingleSamples, 0, temp.length);
for (int i = 0; i < temp.length; ++i) {
extraSingleSamples[i+temp.length] = insertEverywhere(temp[i], "\u2060", grapheme);
}
}
@ -1291,39 +1291,29 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
// stuff that subclasses need to override
public byte getType(int cp, boolean recommended) {
byte cat = Default.ucd.getCategory(cp);
if (cat == Cf) return Format;
byte script = Default.ucd.getScript(cp);
if (recommended) {
//if (prefixSet.contains(cp)) return Prefix;
//if (postfixSet.contains(cp)) return Postfix;
//if (exceptionLetters.contains(cp)) return XLetter;
}
boolean isCatLetter = ((1<<cat) & LETTER_MASK) != 0;
if (!recommended) {
if (script == HIRAGANA_SCRIPT) return Hiragana;
} else {
if (script == HIRAGANA_SCRIPT) return Other;
}
if (extraKatakana.contains(cp)) return Katakana;
if (script == KATAKANA_SCRIPT) return Katakana;
if (extraKatakana.contains(cp)) return Katakana;
if (script == HIRAGANA_SCRIPT || script == THAI_SCRIPT || script == LAO_SCRIPT) return Other;
if (ideographicSet.contains(cp)) return Other;
if (alphabeticSet.contains(cp)) return ALetter;
byte lb = Default.ucd.getLineBreak(cp);
if (!recommended) {
if ((isCatLetter || cat == Sk) && lb != LB_ID) return Letter;
} else {
if (normalLetters.contains(cp)) return Letter;
}
if (lb == LB_HY) return Hyphen;
if (lb == LB_NU) return Numeric;
if (midLetterSet.contains(cp)) {
if (lb == LB_IS) return MidNumLet;
return MidLetter;
}
if (lb == LB_IS) return Infix_Numeric;
if (lb == LB_PR) return Prefix_Numeric;
if (lb == LB_PO) return Postfix_Numeric;
if (midLetterSet.contains(cp)) return MidLetter;
return Other;
}
@ -1335,112 +1325,385 @@ U+02E5..U+02ED # MODIFIER LETTER EXTRA-HIGH TONE BAR..MODIFIER LETTER UNASPIRAT
return 3;
}
public boolean isBreak(String source, int offset, boolean recommended) {
/**
 * Mutable holder for the context around a candidate break offset: up to two
 * code points before and two after it (cp*), together with their types (t*).
 * getGraphemeBases resets every field to -1 before filling in what it finds,
 * so -1 means "nothing found in that position".
 */
static public class Context {
// code points: second-before, immediately-before, immediately-after, second-after
public int cpBefore2, cpBefore, cpAfter, cpAfter2;
// types of the corresponding code points above
public byte tBefore2, tBefore, tAfter, tAfter2;
}
/**
 * Fills {@code context} with the two nearest non-Format cluster-leading code points on
 * each side of {@code offset}, along with their resolved types.
 * Every field is first reset to -1; a field stays -1 when the string runs out
 * before a value is found for it.
 *
 * @param source      text being examined
 * @param offset      candidate break position
 * @param recommended passed through to {@code getResolvedType}
 * @param context     output holder, overwritten by this call
 */
public void getGraphemeBases(String source, int offset, boolean recommended, Context context) {
    context.cpBefore2 = context.cpBefore = context.cpAfter = context.cpAfter2 = -1;
    context.tBefore2 = context.tBefore = context.tAfter = context.tAfter2 = -1;

    MyBreakIterator walker = new MyBreakIterator();

    // Scan backwards from the offset, skipping Format characters.
    walker.set(source, offset);
    for (int cp = walker.previousBase(); cp != -1; cp = walker.previousBase()) {
        byte type = getResolvedType(cp, recommended);
        if (type == Format) continue;
        if (context.cpBefore != -1) {
            // nearest one is already recorded, so this is the second and last
            context.cpBefore2 = cp;
            context.tBefore2 = type;
            break;
        }
        context.cpBefore = cp;
        context.tBefore = type;
    }

    // Scan forwards from the offset, again skipping Format characters.
    walker.set(source, offset);
    for (int cp = walker.nextBase(); cp != -1; cp = walker.nextBase()) {
        byte type = getResolvedType(cp, recommended);
        if (type == Format) continue;
        if (context.cpAfter != -1) {
            context.cpAfter2 = cp;
            context.tAfter2 = type;
            break;
        }
        context.cpAfter = cp;
        context.tAfter = type;
    }
}
public boolean isBreak(String source, int offset, boolean recommended) {
recommended = true; // don't care about old stuff
rule = "1";
if (offset < 0 || offset > source.length()) return false;
rule="16";
if (offset == 0) return true;
rule="15";
rule = "2";
if (offset == source.length()) return true;
// Treat a grapheme cluster as if it were a single character:
// the first base character, if there is one; otherwise the first character.
// GC => FB
rule="1";
rule="3";
if (!grapheme.isBreak( source, offset, recommended)) return false;
// now get the base character before and after, and their types
grapheme.getGraphemeBases(source, offset, recommended, context);
getGraphemeBases(source, offset, recommended, context);
byte before = getResolvedType(context.cpBefore, recommended);
byte after = getResolvedType(context.cpAfter, recommended);
byte before2 = context.cpBefore2 < 0 ? (byte)-1 : getResolvedType(context.cpBefore2, recommended);
byte after2 = context.cpAfter2 < 0 ? (byte)-1 : getResolvedType(context.cpAfter2, recommended);
byte before = context.tBefore;
byte after = context.tAfter;
byte before2 = context.tBefore2;
byte after2 = context.tAfter2;
//Don't break between most letters
// Letter × Letter
// ALetter × ALetter
rule = "2";
if (before == Letter && after == Letter) return false;
rule = "5";
if (before == ALetter && after == ALetter) return false;
// Dont break letters across certain punctuation
// Letter × MidLetter Letter (3)
// Letter MidLetter × Letter (4)
// ALetter×(MidLetter | MidNumLet) ALetter(6)
// ALetter (MidLetter | MidNumLet)×ALetter(7)
/*if (recommended) {
rule = "2a";
if (before == Prefix && after == Letter) return false;
rule = "6";
if (before == ALetter && (after == MidLetter || after == MidNumLet) && after2 == ALetter) return false;
rule = "2b";
if (before == Letter && after == Postfix) return false;
}
*/
rule = "3";
if (before == Letter && after == MidLetter && after2 == Letter) return false;
rule = "4";
if (before2 == Letter && before == MidLetter && after == Letter) return false;
rule = "7";
if (before2 == ALetter && (before == MidLetter || before == MidNumLet) && after == ALetter) return false;
// Dont break within sequences of digits, or digits adjacent to letters.
// Numeric × Numeric (5)
rule = "5";
rule = "8";
if (before == Numeric && after == Numeric) return false;
// Letter × Numeric (6)
rule = "6";
if (before == Letter && after == Numeric) return false;
// ALetter × Numeric (9)
rule = "9";
if (before == ALetter && after == Numeric) return false;
// Numeric × Letter (7)
rule = "7";
if (before == Numeric && after == Letter) return false;
// Numeric × ALetter (10)
rule = "10";
if (before == Numeric && after == ALetter) return false;
// Dont break within sequences like: '-3.2'
// Hyphen × Numeric (8)
rule = "8";
if (before == Hyphen && after == Numeric) return false;
// Numeric Infix_Numeric × Numeric (9)
rule = "9";
if (before2 == Numeric && before == Infix_Numeric && after == Numeric) return false;
// Numeric × Infix_Numeric Numeric (10)
rule = "10";
if (before == Numeric && after == Infix_Numeric && after2 == Numeric) return false;
// Prefix_Numeric × Numeric (11)
// Numeric (MidNum | MidNumLet)×Numeric(11)
rule = "11";
if (before == Prefix_Numeric && after == Numeric) return false;
if (before2 == Numeric && (before == Infix_Numeric || before == MidNumLet) && after == Numeric) return false;
// Numeric × Postfix_Numeric (12)
// Numeric×(MidNum | MidNumLet) Numeric(12)
rule = "12";
if (before == Numeric && after == Postfix_Numeric) return false;
if (before == Numeric && (after == Infix_Numeric || after == MidNumLet) && after2 == Numeric) return false;
// Don't break between Hiragana or Katakana
// Don't break between Hiragana
if (!recommended) {
// Hiragana × Hiragana (13)
rule = "13";
if (before == Hiragana && after == Hiragana) return false;
}
// Katakana × Katakana (14)
rule = "14";
if (before == Katakana && after == Katakana) return false;
// Hiragana × Hiragana (13)
rule = "13";
if (before == Hiragana && after == Hiragana) return false;
// Otherwise break always.
rule = "15";
rule = "14";
return true;
}
}
//==============================================
/**
 * Break tester / test-data generator for sentence boundaries.
 * Characters are classified into the types listed in {@link #Names}; {@link #isBreak}
 * then applies the numbered sentence-break rules and records the deciding rule
 * in the inherited {@code rule} field.
 */
static class GenerateSentenceBreakTest extends GenerateBreakTest {

    // Character types; the values index into Names.
    static final byte Format = 0, Sep = 1, Sp = 2, OLetter = 3, Lower = 4, Upper = 5,
        Close = 6, ATerm = 7, Term = 8, Other = 9,
        LIMIT = Other + 1;

    static final String[] Names = {"Format", "Sep", "Sp", "OLetter", "Lower", "Upper",
        "Close", "ATerm", "Term", "Other" };

    // Used to make sure a candidate offset is on a grapheme cluster boundary.
    static GenerateGraphemeBreakTest grapheme = new GenerateGraphemeBreakTest();

    // Line/paragraph separators.
    static UnicodeSet sepSet = new UnicodeSet("[\\u000a\\u000d\\u0085\\u2029\\u2028]");
    // Ambiguous terminator: FULL STOP only.
    static UnicodeSet atermSet = new UnicodeSet("[\\u002E]");
    // Unambiguous sentence terminators.
    static UnicodeSet termSet = new UnicodeSet("[\\u0021\\u003F\\u0589\\u061f\\u06d4\\u0700-\\u0702\\u0934"
        + "\\u1362\\u1367\\u1368\\u1803\\u1809\\u203c\\u203d\\u2048\\u2049\\u3002\\ufe52\\ufe57\\uff01\\uff0e\\uff1f\\uff61]");

    static UnicodeProperty lowercaseProp = UnifiedBinaryProperty.make(DERIVED | PropLowercase);
    static UnicodeProperty uppercaseProp = UnifiedBinaryProperty.make(DERIVED | PropUppercase);

    // Instance initializer: names the output and sets up the extra sample strings.
    {
        fileName = "Sentence";
        extraSamples = new String[] {
        };
        String[] temp = new String[] {
            "(\"Go.\") (He did.)",
            "(\"Go?\") (He did.)",
            "U.S.A\u0300. is",
            "U.S.A\u0300? He",
            "U.S.A\u0300.",
            "\u4e00.\u4300",
            "\u4e00?\u4300",
        };
        // First half: the samples as written. Second half: the same samples with
        // U+2060 spliced in by insertEverywhere.
        extraSingleSamples = new String [temp.length * 2];
        System.arraycopy(temp, 0, extraSingleSamples, 0, temp.length);
        for (int i = 0; i < temp.length; ++i) {
            extraSingleSamples[i+temp.length] = insertEverywhere(temp[i], "\u2060", grapheme);
        }
    }

    /**
     * Returns the display name of the type of {@code cp}.
     */
    public String getTypeID(int cp, boolean recommended) {
        byte type = getType(cp, recommended);
        return Names[type];
    }

    /**
     * Classifies a code point into one of the sentence-break types.
     * The checks are ordered; the first match wins.
     *
     * @param cp          code point to classify
     * @param recommended not consulted by this implementation
     * @return one of the type constants of this class
     */
    public byte getType(int cp, boolean recommended) {
        byte cat = Default.ucd.getCategory(cp);

        if (cat == Cf) return Format;
        if (sepSet.contains(cp)) return Sep;
        if (Default.ucd.getBinaryProperty(cp, White_space)) return Sp;
        // Case must be tested BEFORE the general alphabetic test: cased letters are
        // also alphabetic, so testing alphabeticSet first would classify them as
        // OLetter and Lower/Upper (needed by rules 6 and 7) would never be returned.
        if (lowercaseProp.hasValue(cp)) return Lower;
        if (uppercaseProp.hasValue(cp) || cat == Lt) return Upper;
        if (alphabeticSet.contains(cp)) return OLetter;
        if (atermSet.contains(cp)) return ATerm;
        if (termSet.contains(cp)) return Term;
        if (cat == Po || cat == Pe
            || Default.ucd.getLineBreak(cp) == LB_QU) return Close;
        return Other;
    }

    /**
     * Produces the test strings for one (before, after) sample pair.
     * Only the plain concatenation is generated.
     *
     * @return number of entries written into {@code results} (always 1)
     */
    public int genTestItems(String before, String after, String[] results) {
        results[0] = before + after;
        return 1;
    }

    /**
     * Decides whether there is a sentence break at {@code offset} in {@code source},
     * leaving the number of the deciding rule in {@code rule}.
     *
     * Rules, as numbered by the rule labels set below:
     *   1/2   break at start and end of text (out-of-range offsets: no break)
     *   3     break after Sep; never break inside a grapheme cluster
     *   6     ATerm Close* Sp* no-break (not(OLetter | Upper))* Lower
     *   7     ATerm no-break Upper
     *   8     (Term | ATerm) Close* no-break (Close | Sp | Sep)
     *   9     (Term | ATerm) Close* Sp no-break (Sp | Sep)
     *   10    (Term | ATerm) Close* Sp* break
     *   11    otherwise no break
     * Format characters are skipped while scanning in either direction.
     *
     * @param source      text to test
     * @param offset      char offset of the candidate break
     * @param recommended passed through to the type lookups and grapheme tester
     * @return true if a break is allowed at {@code offset}
     */
    public boolean isBreak(String source, int offset, boolean recommended) {
        rule = "1";
        if (offset < 0 || offset > source.length()) return false;
        if (offset == 0) return true;

        rule = "2";
        if (offset == source.length()) return true;

        // Always break after a separator.
        rule = "3";
        byte before = getResolvedType(source.charAt(offset-1), recommended);
        if (before == Sep) return true;

        // Treat a grapheme cluster as a single unit: no break inside one.
        // NOTE(review): this check reuses the label "3" from the Sep rule above;
        // confirm the intended rule number.
        rule="3";
        if (!grapheme.isBreak( source, offset, recommended)) return false;

        // Rules 6-10 are handled together. First scan backwards over Sp* and then
        // Close*, looking for the Term/ATerm that would license a break here.
        MyBreakIterator graphemeIterator = new MyBreakIterator();
        graphemeIterator.set(source, offset);

        int state = 0;          // 0: still in trailing Sp*, 1: in Close*
        byte lookAfter = -1;    // ATerm or Term once found, else -1
        int cp;
        byte t;
        boolean gotSpace = false;
        boolean gotClose = false;

        behindLoop:
        while (true) {
            cp = graphemeIterator.previousBase();
            if (cp == -1) break;
            t = getResolvedType(cp, recommended);
            if (SHOW_TYPE) System.out.println(Default.ucd.getCodeAndName(cp) + ", " + getTypeID(cp, recommended));

            if (t == Format) continue;  // ignore all formats!

            switch (state) {
                case 0:
                    if (t == Sp) {
                        // loop as long as we have Space
                        gotSpace = true;
                        continue behindLoop;
                    } else if (t == Close) {
                        gotClose = true;
                        state = 1;  // go to close loop
                        continue behindLoop;
                    }
                    break;
                case 1:
                    if (t == Close) {
                        // loop as long as we have Close
                        continue behindLoop;
                    }
                    break;
            }
            // First character that is neither skipped Sp nor Close: it must be the terminator.
            if (t == ATerm) {
                lookAfter = ATerm;
            } else if (t == Term) {
                lookAfter = Term;
            }
            break;
        }

        // No ATerm or Term behind us: rule 11, no break.
        if (lookAfter == -1) {
            rule = "11";
            return false;
        }

        // A terminator was found. Scan forwards to see whether the right side
        // forbids the break.
        graphemeIterator.set(source, offset);
        boolean isFirst = true;
        while (true) {
            cp = graphemeIterator.nextBase();
            if (cp == -1) break;
            t = getResolvedType(cp, recommended);
            if (SHOW_TYPE) System.out.println(Default.ucd.getCodeAndName(cp) + ", " + getTypeID(cp, recommended));

            if (t == Format) continue;  // skip format characters!

            if (isFirst) {
                isFirst = false;
                // Rule 7 requires Upper to follow the ATerm directly. Without the
                // gotSpace/gotClose guard, "Go. He" would wrongly be left unbroken.
                if (lookAfter == ATerm && t == Upper && !gotSpace && !gotClose) {
                    rule = "7";
                    return false;
                }
                if (gotSpace) {
                    if (t == Sp || t == Sep) {
                        rule = "9";
                        return false;
                    }
                } else if (t == Close || t == Sp || t == Sep) {
                    rule = "8";
                    return false;
                }
                if (lookAfter == Term) break;
            }

            // Here the terminator is an ATerm; only rule 6 can still forbid the break.
            // Skip ahead to the first letter and check whether it is lowercase.
            if (t != OLetter && t != Upper && t != Lower) continue;
            if (t == Lower) {
                rule = "6";
                return false;
            }
            break;
        }
        rule = "10";
        return true;
    }
}
/**
 * Minimal cursor that steps through a string one grapheme cluster at a time,
 * using a GenerateGraphemeBreakTest to locate cluster boundaries. Each step
 * returns the code point at the start of the cluster, or -1 when the cursor
 * cannot move further in that direction.
 */
static class MyBreakIterator {
    int offset = 0;
    String string = "";
    GenerateBreakTest breaker = new GenerateGraphemeBreakTest();
    boolean recommended = true;

    /**
     * Points the iterator at {@code offset} within {@code source}.
     *
     * @return this iterator
     */
    public MyBreakIterator set(String source, int offset) {
        this.string = source;
        this.offset = offset;
        return this;
    }

    /**
     * Returns the code point at the current offset and advances the offset to the
     * next position the breaker reports as a break (or to the end of the string).
     *
     * @return the code point, or -1 if already at or past the end
     */
    public int nextBase() {
        final int limit = string.length();
        if (offset >= limit) {
            return -1;
        }
        final int first = UTF16.charAt(string, offset);
        ++offset;
        while (offset < limit && !breaker.isBreak(string, offset, recommended)) {
            ++offset;
        }
        return first;
    }

    /**
     * Moves the offset back to the nearest earlier position the breaker reports as
     * a break and returns the code point found there.
     *
     * @return the code point, or -1 if already at or before the start
     */
    public int previousBase() {
        if (offset <= 0) {
            return -1;
        }
        --offset;
        while (offset >= 0 && !breaker.isBreak(string, offset, recommended)) {
            --offset;
        }
        return UTF16.charAt(string, offset);
    }
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
* $Date: 2002/08/08 15:35:01 $
* $Revision: 1.21 $
* $Date: 2002/08/09 23:56:24 $
* $Revision: 1.22 $
*
*******************************************************************************
*/
@ -78,6 +78,8 @@ public final class Main implements UCD_Types {
else if (arg.equalsIgnoreCase("TestNormalization")) TestNormalization.main(null);
else if (arg.equalsIgnoreCase("checkDecompFolding")) VerifyUCD.checkDecompFolding();
else if (arg.equalsIgnoreCase("breaktest")) GenerateBreakTest.main(null);
else if (arg.equalsIgnoreCase("checkcollator")) CheckCollator.main(null);

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
* $Date: 2002/07/30 09:56:40 $
* $Revision: 1.18 $
* $Date: 2002/08/09 23:56:24 $
* $Revision: 1.19 $
*
*******************************************************************************
*/
@ -31,6 +31,37 @@ import java.text.NumberFormat;
public class VerifyUCD implements UCD_Types {
static final boolean DEBUG = false;
/**
 * Scans every allocated, non-private-use code point and reports those for which
 * full case folding of the NFD form changes the combining class of either the
 * first or the last char of the string. Each exception is printed with details,
 * and the accumulated set is printed as a pattern at the end.
 */
static void checkDecompFolding() {
    Default.setUCD();
    UnicodeSet sum = new UnicodeSet();
    for (int cp = 0; cp <= 0x10FFFF; ++cp) {
        Utility.dot(cp);
        if (!Default.ucd.isAllocated(cp)) continue;
        byte cat = Default.ucd.getCategory(cp);
        if (cat == UNASSIGNED || cat == PRIVATE_USE) continue;

        String decomp = Default.nfd.normalize(cp);
        String foldDecomp = Default.ucd.getCase(decomp, FULL, FOLD);

        int d0 = Default.ucd.getCombiningClass(decomp.charAt(0));
        int dL = Default.ucd.getCombiningClass(decomp.charAt(decomp.length()-1));
        int f0 = Default.ucd.getCombiningClass(foldDecomp.charAt(0));
        // Index by foldDecomp's own length: folding can change the string length,
        // so decomp.length()-1 may not be the last char of foldDecomp (or may be
        // out of range).
        int fL = Default.ucd.getCombiningClass(foldDecomp.charAt(foldDecomp.length()-1));

        if (d0 != f0 || dL != fL) {
            Utility.fixDot();
            System.out.println();
            System.out.println("Exception: " + Default.ucd.getCodeAndName(cp));
            System.out.println("Decomp: " + Default.ucd.getCodeAndName(decomp));
            System.out.println("FoldedDecomp: " + Default.ucd.getCodeAndName(foldDecomp));
            System.out.println("d0: " + d0 + ", "
                + "dL: " + dL + ", "
                + "f0: " + f0 + ", "
                + "fL: " + fL
            );
            sum.add(cp);
        }
    }
    System.out.println("Set: " + sum.toPattern(true));
}
static void oneTime() {
Default.setUCD();
int[] testSet = {0x10000, 'a', 0xE0000, '\u0221'}; // 10000