diff --git a/icu4c/source/test/intltest/intltest.dsp b/icu4c/source/test/intltest/intltest.dsp index 43e0e687718..83c4bc2695d 100644 --- a/icu4c/source/test/intltest/intltest.dsp +++ b/icu4c/source/test/intltest/intltest.dsp @@ -306,10 +306,6 @@ SOURCE=.\ittrans.cpp # End Source File # Begin Source File -SOURCE=.\ittxtbd.cpp -# End Source File -# Begin Source File - SOURCE=.\itutil.cpp # End Source File # Begin Source File @@ -683,10 +679,6 @@ SOURCE=.\ittrans.h # End Source File # Begin Source File -SOURCE=.\ittxtbd.h -# End Source File -# Begin Source File - SOURCE=.\itutil.h # End Source File # Begin Source File diff --git a/icu4c/source/test/intltest/itmajor.cpp b/icu4c/source/test/intltest/itmajor.cpp index c53c86cfe94..f2ef7fe7fa4 100644 --- a/icu4c/source/test/intltest/itmajor.cpp +++ b/icu4c/source/test/intltest/itmajor.cpp @@ -1,5 +1,5 @@ /******************************************************************** - * COPYRIGHT: + * COPYRIGHT: * Copyright (c) 1998-2001, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ @@ -19,7 +19,6 @@ #include "itutil.h" #include "tscoll.h" -#include "ittxtbd.h" #include "itformat.h" #include "itconv.h" #include "ittrans.h" @@ -42,15 +41,15 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &name, char* par ) { switch (index) { - case 0: name = "utility"; - if (exec) { + case 0: name = "utility"; + if (exec) { logln("TestSuite Utilities---"); logln(); IntlTestUtilities test; callTest( test, par ); } break; - case 1: name = "normalize"; + case 1: name = "normalize"; if (exec) { logln("TestSuite Normalize---"); logln(); IntlTestNormalize test; @@ -58,7 +57,7 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam } break; - case 2: name = "collate"; + case 2: name = "collate"; if (exec) { logln("TestSuite Collator---"); logln(); IntlTestCollator test; @@ -66,15 +65,11 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam } break; - case 3: name = "textbounds"; - if (exec) { - logln("TestSuite TextBoundary---"); logln(); - IntlTestTextBoundary test; - callTest( test, par ); - } + case 3: name = "unused"; + // Used to be text bounds. break; - case 4: name = "format"; + case 4: name = "format"; if (exec) { logln("TestSuite Format---"); logln(); IntlTestFormat test; @@ -82,7 +77,7 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam } break; - case 5: name = "translit"; + case 5: name = "translit"; if (exec) { logln("TestSuite Transliterator---"); logln(); IntlTestTransliterator test; @@ -90,7 +85,7 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam } break; - case 6: name = "rbbi"; + case 6: name = "rbbi"; if (exec) { logln("TestSuite RuleBasedBreakIterator---"); logln(); IntlTestRBBI test; @@ -114,7 +109,7 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam /* Only the C API is exists */ #ifdef ICU_UNICODECONVERTER_USE_DEPRECATES - case 9: name = "convert"; + case 9: name = "convert"; if (exec) { logln("TestSuite Convert---"); logln(); IntlTestConvert test; diff --git a/icu4c/source/test/intltest/ittxtbd.cpp b/icu4c/source/test/intltest/ittxtbd.cpp deleted file mode 100644 index 2df7cf56982..00000000000 --- a/icu4c/source/test/intltest/ittxtbd.cpp +++ /dev/null @@ -1,1667 +0,0 @@ -/******************************************************************** - * COPYRIGHT: - * Copyright (c) 1997-2001, International Business Machines Corporation and - * others. All Rights Reserved. - ********************************************************************/ - -#include "intltest.h" -#include "unicode/brkiter.h" -#include "unicode/uchar.h" -#include -//#include "txbdapi.h" // BreakIteratorAPIC - -//-------------------------------------------------------------------------------------- -/** - * "Vector" class for holding test tables - * (this class is actually a linked list, but we use the name and API of the - * java.util.Vector class to keep as much of our test code as possible the same.) - */ -class Enumeration { // text enumeration -public: - virtual UBool hasMoreElements() = 0; - virtual UnicodeString nextElement() = 0; -}; - -class Vector { // text vector -public: - - class TextLink { - public: - TextLink() : fLink(0), fText() {} - TextLink(TextLink* link, UnicodeString text) : fLink(link), fText(text) {} - - TextLink* fLink; - UnicodeString fText; - }; - -public: - TextLink fBase; - TextLink* fEnd; - int32_t fSize; - -public: - class VectorEnumeration : public Enumeration { - public: - VectorEnumeration(Vector* vector) : fVector(vector), fPos(&vector->fBase) {} - - UBool hasMoreElements() { return fPos->fLink != &fVector->fBase; } - UnicodeString nextElement() { fPos = fPos->fLink; return fPos->fText; } - - Vector* fVector; - TextLink* fPos; - }; - - Vector() : fBase(), fEnd(&fBase), fSize(0) { fBase.fLink = &fBase; } - - ~Vector() { - while (fBase.fLink != &fBase) { - TextLink* link = fBase.fLink; - fBase.fLink = link->fLink; - delete link; - } - } - - void addElement(UnicodeString text) { fEnd->fLink = new TextLink(&fBase, text); fEnd = fEnd->fLink; ++fSize; } - void insertElementAt(UnicodeString text, int pos) { - if(pos >= fSize || pos < 0) - ; - else if(pos == 0){ - TextLink* insert = new TextLink(&fBase, text); - insert->fLink=fBase.fLink; - ++fSize; - fBase.fLink=insert; - } - else{ - TextLink* link = fBase.fLink; - while(--pos > 0) - link=link->fLink; - TextLink* insert = new TextLink(&fBase, text); - insert->fLink =link->fLink; - link->fLink=insert; - ++fSize; - - } - - } - UnicodeString elementAt(int32_t pos) { - if (pos >= fSize) - return UnicodeString(); - - TextLink* link = fBase.fLink; - while (pos-- > 0) link = link->fLink; - return link->fText; - } - UnicodeString lastElement() { return fEnd == &fBase ? UnicodeString() : fEnd->fText; } - int32_t size() { return fSize; } - - Enumeration* elements() { return new VectorEnumeration(this); } - -}; - -//-------------------------------------------------------------------------------------- -/** - * IntlTestTextBoundary is medium top level test class for everything in the directory "findword". - */ - -#include "unicode/utypes.h" -#include "ittxtbd.h" - -#include -#include "unicode/schriter.h" - -const UChar IntlTestTextBoundary::cannedTestArray[] = { - 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031, - 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b, - 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2, - 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3, - 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303, - 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000, - 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f, - 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000 -}; - -UnicodeString* IntlTestTextBoundary::cannedTestChars = 0; - -//--------------------------------------------- -// setup methods -//--------------------------------------------- - -IntlTestTextBoundary::IntlTestTextBoundary() -{ - UnicodeString temp(cannedTestArray); - cannedTestChars = new UnicodeString(); - *cannedTestChars += (UChar)0x0000; - *cannedTestChars += temp; - addTestWordData(); - addTestSentenceData(); - addTestLineData(); - addTestCharacterData(); -} - -IntlTestTextBoundary::~IntlTestTextBoundary() -{ - delete wordSelectionData; - delete sentenceSelectionData; - delete lineSelectionData; - delete characterSelectionData; - delete cannedTestChars; -} - -/** - * @bug 4097779 4098467 4117554 - */ -void IntlTestTextBoundary::addTestWordData() -{ - wordSelectionData = new Vector(); - - wordSelectionData->addElement("12,34"); - - wordSelectionData->addElement(" "); - wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A2))); //cent sign - wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A3))); //pound sign - wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A4))); //currency sign - wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A5))); //yen sign - wordSelectionData->addElement(CharsToUnicodeString("alpha\\u00adbeta\\u00adgamma")); - wordSelectionData->addElement("."); - wordSelectionData->addElement(" "); - wordSelectionData->addElement("Badges"); - wordSelectionData->addElement("?"); - wordSelectionData->addElement(" "); - wordSelectionData->addElement("BADGES"); - wordSelectionData->addElement("!"); - wordSelectionData->addElement("?"); - wordSelectionData->addElement("!"); - wordSelectionData->addElement(" "); - wordSelectionData->addElement("We"); - wordSelectionData->addElement(" "); - wordSelectionData->addElement("don't"); - wordSelectionData->addElement(" "); - wordSelectionData->addElement("need"); - wordSelectionData->addElement(" "); - wordSelectionData->addElement("no"); - wordSelectionData->addElement(" "); - wordSelectionData->addElement("STINKING"); - wordSelectionData->addElement(" "); - wordSelectionData->addElement("BADGES"); - wordSelectionData->addElement("!"); - wordSelectionData->addElement("!"); - wordSelectionData->addElement("!"); - - wordSelectionData->addElement("012.566,5"); - wordSelectionData->addElement(" "); - wordSelectionData->addElement("123.3434,900"); - wordSelectionData->addElement(" "); - wordSelectionData->addElement("1000,233,456.000"); - wordSelectionData->addElement(" "); - wordSelectionData->addElement("1,23.322"); - wordSelectionData->addElement(" "); - wordSelectionData->addElement("123.1222"); - - wordSelectionData->addElement(" "); - wordSelectionData->addElement("$"); - wordSelectionData->addElement("123,000.20"); - - wordSelectionData->addElement(" "); - wordSelectionData->addElement("179.01"); - wordSelectionData->addElement("%"); - - wordSelectionData->addElement("Hello"); - wordSelectionData->addElement(","); - wordSelectionData->addElement(" "); - wordSelectionData->addElement("how"); - wordSelectionData->addElement(" "); - wordSelectionData->addElement("are"); - wordSelectionData->addElement(" "); - wordSelectionData->addElement("you"); - wordSelectionData->addElement(" "); - wordSelectionData->addElement("X"); - wordSelectionData->addElement(" "); - - wordSelectionData->addElement("Now"); - wordSelectionData->addElement("\r"); - wordSelectionData->addElement("is"); - wordSelectionData->addElement("\n"); - wordSelectionData->addElement("the"); - wordSelectionData->addElement("\r\n"); - wordSelectionData->addElement("time"); - wordSelectionData->addElement("\n"); - wordSelectionData->addElement("\r"); - wordSelectionData->addElement("for"); - wordSelectionData->addElement("\r"); - wordSelectionData->addElement("\r"); - wordSelectionData->addElement("all"); - wordSelectionData->addElement(" "); - - // to test for bug #4097779 - wordSelectionData->addElement(CharsToUnicodeString("aa\\u0300a")); - wordSelectionData->addElement(" "); - - // to test for bug #4098467 - // What follows is a string of Korean characters (I found it in the Yellow Pages - // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed - // it correctly), first as precomposed syllables, and then as conjoining jamo. - // Both sequences should be semantically identical and break the same way. - // precomposed syllables... - wordSelectionData->addElement(CharsToUnicodeString("\\uc0c1\\ud56d")); - wordSelectionData->addElement(" "); - wordSelectionData->addElement(CharsToUnicodeString("\\ud55c\\uc778")); - wordSelectionData->addElement(" "); - wordSelectionData->addElement(CharsToUnicodeString("\\uc5f0\\ud569")); - wordSelectionData->addElement(" "); - wordSelectionData->addElement(CharsToUnicodeString("\\uc7a5\\ub85c\\uad50\\ud68c")); - wordSelectionData->addElement(" "); - // conjoining jamo... - wordSelectionData->addElement(CharsToUnicodeString("\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc")); - wordSelectionData->addElement(" "); - wordSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab")); - wordSelectionData->addElement(" "); - wordSelectionData->addElement(CharsToUnicodeString("\\u110b\\u1167\\u11ab\\u1112\\u1161\\u11b8")); - wordSelectionData->addElement(" "); - wordSelectionData->addElement(CharsToUnicodeString("\\u110c\\u1161\\u11bc\\u1105\\u1169\\u1100\\u116d\\u1112\\u116c")); - wordSelectionData->addElement(" "); - - // this is a test for bug #4117554: the ideographic iteration mark (U+3005) should - // count as a Kanji character for the purposes of word breaking - wordSelectionData->addElement("abc"); - // Unicode TR29: Ideographs do NOT group together into words. - //wordSelectionData->addElement(CharsToUnicodeString("\\u4e01\\u4e02\\u3005\\u4e03\\u4e03")); - wordSelectionData->addElement(CharsToUnicodeString("\\u4e01")); - wordSelectionData->addElement(CharsToUnicodeString("\\u4e02")); - wordSelectionData->addElement(CharsToUnicodeString("\\u3005")); - wordSelectionData->addElement(CharsToUnicodeString("\\u4e03")); - wordSelectionData->addElement(CharsToUnicodeString("\\u4e03")); - wordSelectionData->addElement("abc"); - - - -} - -const UChar kParagraphSeparator = 0x2029; -const UChar kLineSeparator = 0x2028; - -/** - * @bug 4111338 4117554 4113835 - */ -void IntlTestTextBoundary::addTestSentenceData() -{ - sentenceSelectionData = new Vector(); - sentenceSelectionData->addElement("This is a simple sample sentence. "); - sentenceSelectionData->addElement("(This is it.) "); - sentenceSelectionData->addElement("This is a simple sample sentence. "); - sentenceSelectionData->addElement("\"This isn\'t it.\" "); - sentenceSelectionData->addElement("Hi! "); - sentenceSelectionData->addElement("This is a simple sample sentence. "); - sentenceSelectionData->addElement("It does not have to make any sense as you can see. "); - sentenceSelectionData->addElement("Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. "); - sentenceSelectionData->addElement("Che la dritta via aveo smarrita. "); - sentenceSelectionData->addElement("He said, that I said, that you said!! "); - - sentenceSelectionData->addElement("Don't rock the boat." + UCharToUnicodeString(kParagraphSeparator)); - - sentenceSelectionData->addElement("Because I am the daddy, that is why. "); - sentenceSelectionData->addElement("Not on my time (el timo.)! "); - - sentenceSelectionData->addElement("So what!!" + UCharToUnicodeString(kParagraphSeparator)); - - sentenceSelectionData->addElement("\"But now,\" he said, \"I know!\" "); - sentenceSelectionData->addElement("Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). "); - sentenceSelectionData->addElement("One species, B. anthracis, is highly virulent.\n"); - sentenceSelectionData->addElement("Wolf said about Sounder:\"Beautifully thought-out and directed.\" "); - sentenceSelectionData->addElement("Have you ever said, \"This is where\tI shall live\"? "); - sentenceSelectionData->addElement("He answered, \"You may not!\" "); - sentenceSelectionData->addElement("Another popular saying is: \"How do you do?\". "); - sentenceSelectionData->addElement("Yet another popular saying is: \'I\'m fine thanks.\' "); - sentenceSelectionData->addElement("What is the proper use of the abbreviation pp.? "); - sentenceSelectionData->addElement("Yes, I am definatelly 12\" tall!!"); - - // test for bug #4113835: \n and \r count as spaces, not as paragraph breaks - sentenceSelectionData->addElement(CharsToUnicodeString("Now\ris\nthe\r\ntime\n\rfor\r\rall\\u037e")); - - // test for bug #4111338: Don't break sentences at the boundary between CJK - // and other letters - sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165:\"JAVA\\u821c") - + CharsToUnicodeString("\\u8165\\u7fc8\\u51ce\\u306d,\\u2494\\u56d8\\u4ec0\\u60b1\\u8560\\u51ba") - + CharsToUnicodeString("\\u611d\\u57b6\\u2510\\u5d46\".\\u2029")); - sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8") - + CharsToUnicodeString("\\u97e4JAVA\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8\\u4ec0") - + CharsToUnicodeString("\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u3002")); - sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8\\u97e4") - + CharsToUnicodeString("\\u6470\\u8790JAVA\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8") - + CharsToUnicodeString("\\u4ec0\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u2048")); - sentenceSelectionData->addElement(CharsToUnicodeString("He said, \"I can go there.\"\\u2029")); - - // test for bug #4117554: Treat fullwidth variants of .!? the same as their - // normal counterparts -#if 0 // Not according to TR29. TODO: what is the right thing for these chars? - sentenceSelectionData->addElement(CharsToUnicodeString("I know I'm right\\uff0e ")); - sentenceSelectionData->addElement(CharsToUnicodeString("Right\\uff1f ")); - sentenceSelectionData->addElement(CharsToUnicodeString("Right\\uff01 ")); -#endif - - // test for bug #4117554: Don't break sentences at boundary between CJK and digits - sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8") - + CharsToUnicodeString("\\u97e48888\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8\\u4ec0") - + CharsToUnicodeString("\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751.\\u2029")); - - // test for bug #4117554: Break sentence between a sentence terminator and - // opening punctuation - sentenceSelectionData->addElement("Say no?"); - sentenceSelectionData->addElement("(yes)." + CharsToUnicodeString("\\u2029")); - - // test for bug #4158381: Don't break sentence after period if it isn't - // followed by a space - sentenceSelectionData->addElement("Test Flags.Flag class. "); - sentenceSelectionData->addElement("Another test." + CharsToUnicodeString("\\u2029")); - - // test for bug #4158381: No breaks when there are no terminators around - sentenceSelectionData->addElement("

Provides a set of "lightweight" (all-javaTM language) components that, to the maximum degree possible, work the same on all platforms. "); - sentenceSelectionData->addElement("Another test." + CharsToUnicodeString("\\u2029")); - - // test for bug #4143071: Make sure sentences that end with digits - // work right - sentenceSelectionData->addElement("Today is the 27th of May, 1998. "); - sentenceSelectionData->addElement("Tomorrow with be 28 May 1998. "); - sentenceSelectionData->addElement("The day after will be the 30th." - + CharsToUnicodeString("\\u2029")); - - // test for bug #4152416: Make sure sentences ending with a capital - // letter are treated correctly - // Unicode TR29 reverses above bug: Don't break a sentence if the last word begins with an upper case letter. - sentenceSelectionData->addElement("The type of all primitive boolean values accessed in the target VM. " - "Calls to xxx will return an implementor of this interface. " + CharsToUnicodeString("\\u2029")); - - // test for bug #4152117: Make sure sentence breaking is handling - // punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS - // HERE TO MAKE SURE IT DOESN'T CROP UP] - sentenceSelectionData->addElement("Constructs a randomly generated BigInteger, uniformly distributed over the range 0 to (2numBits - 1), inclusive. "); - sentenceSelectionData->addElement("The uniformity of the distribution assumes that a fair source of random bits is provided in rnd. "); - sentenceSelectionData->addElement("Note that this constructor always constructs a non-negative BigInteger." + CharsToUnicodeString("\\u2029")); - -} - -/** - * @bug 4068133 4086052 4035266 4097920 4098467 4117554 - */ -void IntlTestTextBoundary::addTestLineData() -{ - lineSelectionData = new Vector(); - lineSelectionData->addElement("Multi-"); - lineSelectionData->addElement("Level "); - lineSelectionData->addElement("example "); - lineSelectionData->addElement("of "); - lineSelectionData->addElement("a "); - lineSelectionData->addElement("semi-"); - lineSelectionData->addElement("idiotic "); - lineSelectionData->addElement("non-"); - lineSelectionData->addElement("sensical "); - lineSelectionData->addElement("(non-"); - lineSelectionData->addElement("important) "); - lineSelectionData->addElement("sentence. "); - - lineSelectionData->addElement("Hi "); - lineSelectionData->addElement("Hello "); - lineSelectionData->addElement("How\n"); - lineSelectionData->addElement("are\r"); - lineSelectionData->addElement("you" + UCharToUnicodeString(kLineSeparator)); - lineSelectionData->addElement("fine.\t"); - lineSelectionData->addElement("good. "); - - lineSelectionData->addElement("Now\r"); - lineSelectionData->addElement("is\n"); - lineSelectionData->addElement("the\r\n"); - lineSelectionData->addElement("time\n"); - lineSelectionData->addElement("\r"); - lineSelectionData->addElement("for\r"); - lineSelectionData->addElement("\r"); - lineSelectionData->addElement("all"); - - // to test for bug #4068133 - lineSelectionData->addElement(CharsToUnicodeString("\\u96f6")); - lineSelectionData->addElement(CharsToUnicodeString("\\u4e00\\u3002")); - lineSelectionData->addElement(CharsToUnicodeString("\\u4e8c\\u3001")); - lineSelectionData->addElement(CharsToUnicodeString("\\u4e09\\u3002\\u3001")); - lineSelectionData->addElement(CharsToUnicodeString("\\u56db\\u3001\\u3002\\u3001")); - lineSelectionData->addElement(CharsToUnicodeString("\\u4e94,")); - lineSelectionData->addElement(CharsToUnicodeString("\\u516d.")); - lineSelectionData->addElement(CharsToUnicodeString("\\u4e03.\\u3001,\\u3002")); - lineSelectionData->addElement(CharsToUnicodeString("\\u516b")); - - // to test for bug #4086052 - lineSelectionData->addElement(CharsToUnicodeString("foo\\u00a0bar ")); -// lineSelectionData->addElement("foo\\ufeffbar"); - - // to test for bug #4097920 - lineSelectionData->addElement("dog,"); - lineSelectionData->addElement("cat,"); - lineSelectionData->addElement("mouse "); - lineSelectionData->addElement("(one)"); - lineSelectionData->addElement("(two)\n"); - - // to test for bug #4035266 - lineSelectionData->addElement("The "); - lineSelectionData->addElement("balance "); - lineSelectionData->addElement("is "); - lineSelectionData->addElement("$-23,456.78, "); - lineSelectionData->addElement("not "); - // lineSelectionData->addElement("-$32,456.78!\n"); // Doesn't break this way according to TR29 - lineSelectionData->addElement("-"); - lineSelectionData->addElement("$32,456.78!\n"); - - // to test for bug #4098467 - // What follows is a string of Korean characters (I found it in the Yellow Pages - // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed - // it correctly), first as precomposed syllables, and then as conjoining jamo. - // Both sequences should be semantically identical and break the same way. - // precomposed syllables... - - // By TR14, precomposed Hangul syllables should not be grouped together. - // Also, identical test is in rbbitst.cpp. -#if 0 - lineSelectionData->addElement(CharsToUnicodeString("\\uc0c1\\ud56d ")); - lineSelectionData->addElement(CharsToUnicodeString("\\ud55c\\uc778 ")); - lineSelectionData->addElement(CharsToUnicodeString("\\uc5f0\\ud569 ")); - lineSelectionData->addElement(CharsToUnicodeString("\\uc7a5\\ub85c\\uad50\\ud68c ")); - - // conjoining jamo... - lineSelectionData->addElement(CharsToUnicodeString("\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc ")); - lineSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab ")); - lineSelectionData->addElement(CharsToUnicodeString("\\u110b\\u1167\\u11ab\\u1112\\u1161\\u11b8 ")); - lineSelectionData->addElement(CharsToUnicodeString("\\u110c\\u1161\\u11bc\\u1105\\u1169\\u1100\\u116d\\u1112\\u116c")); -#endif - - // to test for bug #4117554: Fullwidth .!? should be treated as postJwrd - lineSelectionData->addElement(CharsToUnicodeString("\\u4e01\\uff0e")); - lineSelectionData->addElement(CharsToUnicodeString("\\u4e02\\uff01")); - lineSelectionData->addElement(CharsToUnicodeString("\\u4e03\\uff1f")); - -} - -/* -const UnicodeString graveS = "S" + (UChar)0x0300; -const UnicodeString acuteBelowI = "i" + UCharToUnicodeString(0x0317); -const UnicodeString acuteE = "e" + UCharToUnicodeString(0x0301); -const UnicodeString circumflexA = "a" + UCharToUnicodeString(0x0302); -const UnicodeString tildeE = "e" + UCharToUnicodeString(0x0303); -*/ - -/** - * @bug 4098467 - */ -void IntlTestTextBoundary::addTestCharacterData() -{ - characterSelectionData = new Vector(); - characterSelectionData->addElement("S" + UCharToUnicodeString(0x0300)); //graveS - characterSelectionData->addElement("i" + UCharToUnicodeString(0x0301)); // acuteBelowI - characterSelectionData->addElement("m"); - characterSelectionData->addElement("p"); - characterSelectionData->addElement("l"); - characterSelectionData->addElement("e" + UCharToUnicodeString(0x0301)); // acuteE - characterSelectionData->addElement(" "); - characterSelectionData->addElement("s"); - characterSelectionData->addElement("a" + UCharToUnicodeString(0x0302)); // circumflexA - characterSelectionData->addElement("m"); - characterSelectionData->addElement("p"); - characterSelectionData->addElement("l"); - characterSelectionData->addElement("e" + UCharToUnicodeString(0x0303)); // tildeE - characterSelectionData->addElement("."); - characterSelectionData->addElement("w"); - characterSelectionData->addElement("a" + UCharToUnicodeString(0x0302)); // circumflexA - characterSelectionData->addElement("w"); - characterSelectionData->addElement("a"); - characterSelectionData->addElement("f"); - characterSelectionData->addElement("q"); - characterSelectionData->addElement("\n"); - characterSelectionData->addElement("\r"); - characterSelectionData->addElement("\r\n"); - characterSelectionData->addElement("\n"); - - // to test for bug #4098467 - // What follows is a string of Korean characters (I found it in the Yellow Pages - // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed - // it correctly), first as precomposed syllables, and then as conjoining jamo. - // Both sequences should be semantically identical and break the same way. - // precomposed syllables... - characterSelectionData->addElement(CharsToUnicodeString("\\uc0c1")); - characterSelectionData->addElement(CharsToUnicodeString("\\ud56d")); - characterSelectionData->addElement(" "); - characterSelectionData->addElement(CharsToUnicodeString("\\ud55c")); - characterSelectionData->addElement(CharsToUnicodeString("\\uc778")); - characterSelectionData->addElement(" "); - characterSelectionData->addElement(CharsToUnicodeString("\\uc5f0")); - characterSelectionData->addElement(CharsToUnicodeString("\\ud569")); - characterSelectionData->addElement(" "); - characterSelectionData->addElement(CharsToUnicodeString("\\uc7a5")); - characterSelectionData->addElement(CharsToUnicodeString("\\ub85c")); - characterSelectionData->addElement(CharsToUnicodeString("\\uad50")); - characterSelectionData->addElement(CharsToUnicodeString("\\ud68c")); - characterSelectionData->addElement(" "); - // conjoining jamo... - characterSelectionData->addElement(CharsToUnicodeString("\\u1109\\u1161\\u11bc")); - characterSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11bc")); - characterSelectionData->addElement(" "); - characterSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11ab")); - characterSelectionData->addElement(CharsToUnicodeString("\\u110b\\u1175\\u11ab")); - characterSelectionData->addElement(" "); - characterSelectionData->addElement(CharsToUnicodeString("\\u110b\\u1167\\u11ab")); - characterSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11b8")); - characterSelectionData->addElement(" "); - characterSelectionData->addElement(CharsToUnicodeString("\\u110c\\u1161\\u11bc")); - characterSelectionData->addElement(CharsToUnicodeString("\\u1105\\u1169")); - characterSelectionData->addElement(CharsToUnicodeString("\\u1100\\u116d")); - characterSelectionData->addElement(CharsToUnicodeString("\\u1112\\u116c")); - -} - -UnicodeString IntlTestTextBoundary::createTestData(Enumeration* e) -{ - UnicodeString result = ""; - - while (e->hasMoreElements()) { - result += e->nextElement(); - } - return result; -} - -//--------------------------------------------- -// SentenceBreak tests -//--------------------------------------------- - -void IntlTestTextBoundary::TestSentenceIteration() -{ - UErrorCode status = U_ZERO_ERROR; - BreakIterator* e = BreakIterator::createSentenceInstance(Locale::getDefault(), status); - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for default locale in TestSentenceIteration.\n"); - return; - } - generalIteratorTest(*e, sentenceSelectionData); - delete e; -} - -void IntlTestTextBoundary::TestSentenceInvariants() -{ - UErrorCode status = U_ZERO_ERROR; - BreakIterator *e = BreakIterator::createSentenceInstance(Locale::getDefault(), status); - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for default locale in TestSentenceInvariant.\n"); - return; - } - UnicodeString s = *cannedTestChars + CharsToUnicodeString(".,\\u3001\\u3002\\u3041\\u3042\\u3043\\ufeff"); - doOtherInvariantTest(*e, s); - delete e; -} -//--------------------------------------------- -// WordBreak tests -//--------------------------------------------- -void IntlTestTextBoundary::TestWordIteration() -{ - UErrorCode status = U_ZERO_ERROR; - BreakIterator* e = BreakIterator::createWordInstance(Locale::getDefault(), status); - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for default locale in TestWordIteration.\n"); - return; - } - generalIteratorTest(*e, wordSelectionData); - delete e; -} -void IntlTestTextBoundary::TestWordInvariants() -{ - UErrorCode status = U_ZERO_ERROR; - BreakIterator *e = BreakIterator::createWordInstance(Locale::getDefault(), status); - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for default locale in TestWordInvariants.\n"); - return; - } - UnicodeString s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02"); - doBreakInvariantTest(*e, s); - s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02"); - doOtherInvariantTest(*e, s); - delete e; -} -//--------------------------------------------- -// CharacterBreak tests -//--------------------------------------------- -void IntlTestTextBoundary::TestCharacterIteration() -{ - UErrorCode status = U_ZERO_ERROR; - BreakIterator* e = BreakIterator::createCharacterInstance(Locale::getDefault(), status); - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for default locale in TestCharacterIteration.\n"); - return; - } - // generalIteratorTest(*e, testCharacterText, characterSelectionData); - generalIteratorTest(*e, characterSelectionData); - delete e; -} -void IntlTestTextBoundary::TestCharacterInvariants() -{ - UErrorCode status = U_ZERO_ERROR; - BreakIterator *e = BreakIterator::createCharacterInstance(Locale::getDefault(), status); - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for default locale in TestCharacterInvariants.\n"); - return; - } - UnicodeString s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa"); - doBreakInvariantTest(*e, s); - s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa"); - doOtherInvariantTest(*e, s); - delete e; -} -//--------------------------------------------- -// LineBreak tests -//--------------------------------------------- -void IntlTestTextBoundary::TestLineIteration() -{ - UErrorCode status = U_ZERO_ERROR; - BreakIterator* e = BreakIterator::createLineInstance(Locale::getDefault(), status); - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for default locale in TestLineIteration.\n"); - return; - } - generalIteratorTest(*e, lineSelectionData); - delete e; -} -void IntlTestTextBoundary::TestLineInvariants() -{ - UErrorCode status = U_ZERO_ERROR; - BreakIterator *e = BreakIterator::createLineInstance(Locale::getUS(), status); - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for default locale in TestLineInvariants.\n"); - return; - } - UnicodeString s = CharsToUnicodeString(".,;:\\u3001\\u3002\\u3041\\u3042\\u3043\\u3044\\u3045\\u30a3\\u4e00\\u4e01\\u4e02"); - UnicodeString testChars = *cannedTestChars + s; - doBreakInvariantTest(*e, testChars); - doOtherInvariantTest(*e, testChars); - - int32_t errCount = 0, testCharsLen, noBreakLen, dashesLen; - int32_t i, j, k; - - // in addition to the other invariants, a line-break iterator should make sure that: - // it doesn't break around the non-breaking characters, - // EXCEPT breaking after a space takes precedence over not breaking before - // an non-breaking char. So says TR 14. - UnicodeString noBreak = CharsToUnicodeString("\\u00a0\\u2007\\u2011\\ufeff"); - UnicodeString work("aaa"); - testCharsLen = testChars.length(); - noBreakLen = noBreak.length(); - for (i = 0; i < testCharsLen; i++) { - UChar c = testChars[i]; - if (c == '\r' || c == '\n' || c == 0x2029 || c == 0x2028 || c == 0x0003 || - u_charType(c) == U_CONTROL_CHAR) { - continue; - } - work[0] = c; - for (j = 0; j < noBreakLen; j++) { - work[1] = noBreak[j]; - for (k = 0; k < testCharsLen; k++) { - work[2] = testChars[k]; - e->setText(work); - for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) { - UChar c1 = work[l - 1]; - UChar c2 = work[l]; - if (c1 == 0x20 && l == 1) { - continue; - } - if (l == 1 || l == 2) { - errln("Got break between U+" + UCharToUnicodeString(c1) + - " and U+" + UCharToUnicodeString(c2)); - errCount++; - if (errCount >= 75) - return; - } - } - } - } - } - - // it does break after hyphens (Rule 15B from TR 14 - // (unless they're followed by a digit, a non-spacing mark, - // a currency symbol, a non-breaking space, or a line or paragraph separator - // or something of class BA, HY, NS, QU, GL, CL, EX, IS or SY from TR14 when the hyphen is /u002d - - // This test is sufficiently screwed up that I'm largely disabling it. TODO: fix it. 06/12/2002 AGH - // - UnicodeString dashes = CharsToUnicodeString("-\\u00ad\\u2010\\u2012\\u2013\\u2014"); - dashesLen = dashes.length(); - for (i = 0; i < testCharsLen; i++) { - work[0] = testChars[i]; - for (j = 0; j < dashesLen; j++) { - UChar c1 = work[1] = dashes[j]; - for (k = 0; k < testCharsLen; k++) { - UChar c2 = work[2] = testChars[k]; - int8_t type = u_charType(c2); - if (type == U_DECIMAL_DIGIT_NUMBER || - type == U_OTHER_NUMBER || - type == U_NON_SPACING_MARK || - type == U_ENCLOSING_MARK || - type == U_CURRENCY_SYMBOL || - type == U_SPACE_SEPARATOR || - type == U_DASH_PUNCTUATION || - type == U_CONTROL_CHAR || - type == U_FORMAT_CHAR || - c2 == '\n' || c2 == '\r' || c2 == 0x2028 || c2 == 0x2029 || - c2 == 0x0003 || c2 == 0x00a0 || c2 == 0x2007 || c2 == 0x2011 || - c2 == 0xfeff) - { - continue; - } - // If c1 == hyphen-minus, and ... - if (c1 == 0x002d && ( - c2 == 0x0021 || // ! - c2 == 0x002c || // , - c2 == 0x002d || // - - c2 == 0x002e || // . (TR 14 class IS) - c2 == 0x0029 || // ) - c2 == 0x003a || // : - c2 == 0x003b || // ; (TR 14 class IS) - c2 == 0x005d || // ] - c2 == 0x007c || // | (TR 14 class BA, rule 15) - c2 == 0x007d || // } - c2 == 0x0903 || // Devanagari sign visarga, combining, what's it doing in this test? - c2 == 0x093E || // Devanagari , combining, what's it doing in this test? - c2 == 0x093F || // Devanagari , combining, what's it doing in this test? - c2 == 0x0940 || // Devanagari , combining, what's it doing in this test? - c2 == 0x0949 || // Devanagari , combining, what's it doing in this test? - c2 == 0x0f3b || // Tibetan closing bracket - c2 == 0x3001 || // CJK closing bracket - c2 == 0x3002 // CJK closing bracket - )) { - continue; - } - - e->setText(work); - UBool saw2 = FALSE; - for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) { - if (l == 2) { - saw2 = TRUE; - break; - } - } - if (!saw2) { - // TODO: This test is completely out of sync with the spec. Fix it. - // errln("Didn't get break between U+" + UCharToUnicodeString(work[1]) + - // " and U+" + UCharToUnicodeString(work[2])); - // errCount++; - // if (errCount >= 75) - // return; - } - } - } - } - delete e; -} - -void IntlTestTextBoundary::TestThaiLineBreak() { - Vector* thaiLineSelection = new Vector(); - UErrorCode status = U_ZERO_ERROR; - - // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that - // represents elided letters at the end of a long word. It should be bound to - // the end of the word and not treated as an independent punctuation mark. - - - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e08\\u0e30")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e14\\u0e21")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e08\\u0e49\\u0e32")); -// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2b\\u0e19\\u0e49\\u0e32")); -// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e17\\u0e35\\u0e48")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48")); - // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2d\\u0e2d\\u0e01")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e21\\u0e32")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e23\\u0e48\\u0e07")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e15\\u0e47\\u0e21")); - - // the one time where the paiyannoi occurs somewhere other than at the end - // of a word is in the Thai abbrevation for "etc.", which both begins and - // ends with a paiyannoi - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2f\\u0e25\\u0e2f")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e17\\u0e35\\u0e48")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e19\\u0e31\\u0e49\\u0e19")); - - BreakIterator* e = BreakIterator::createLineInstance( - Locale("th"), status); - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n"); - return; - } - - generalIteratorTest(*e, thaiLineSelection); - delete e; - delete thaiLineSelection; -} - -void IntlTestTextBoundary::TestMixedThaiLineBreak() -{ - UErrorCode status = U_ZERO_ERROR; - Vector* thaiLineSelection= new Vector(); - - // Arabic numerals should always be separated from surrounding Thai text -/* - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e04\\u0e48\\u0e32")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e07\\u0e34\\u0e19")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e32\\u0e17")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e41\\u0e15\\u0e30")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a")); - thaiLineSelection->addElement("39"); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e32\\u0e17 ")); - - // words in non-Thai scripts should always be separated from surrounding Thai text - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e17\\u0e14")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2a\\u0e2d\\u0e1a")); - thaiLineSelection->addElement("Java"); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e19")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e04\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21 ")); - - // Thai numerals should always be separated from the text surrounding them - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e04\\u0e48\\u0e32")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e07\\u0e34\\u0e19")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e32\\u0e17")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e41\\u0e15\\u0e30")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e53\\u0e59")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e32\\u0e17 ")); - - // Thai text should interact correctly with punctuation and symbols - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21")); -// thaiLineSelection->addElement(CharsToUnicodeString("(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28")); -// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e44\\u0e17\\u0e22)")); -thaiLineSelection->addElement(CharsToUnicodeString("(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u0e44\\u0e17\\u0e22)")); -// I believe the commented-out reading above to be the correct one, but this is what passes with our current dictionary - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e08\\u0e33\\u0e01\\u0e31\\u0e14")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e1b\\u0e34\\u0e14")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e15\\u0e31\\u0e27\"")); -*/ - - // The Unicode Linebreak TR says do not break before or after quotes. - // So this test is changed ot not break around the quote. - // TODO: should Thai break around the around the quotes, like the original behavior here? -// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\"")); -// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e38\\u0e48\\u0e19")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\"" - "\\u0e23\\u0e38\\u0e48\\u0e19")); - - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e43\\u0e2b\\u0e21\\u0e48")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e14\\u0e37\\u0e2d\\u0e19\\u0e21\\u0e34.")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e22.")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e19\\u0e35\\u0e49")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e32\\u0e04\\u0e32")); - thaiLineSelection->addElement("$200"); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e17\\u0e48\\u0e32")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e19\\u0e31\\u0e49\\u0e19 ")); - thaiLineSelection->addElement(CharsToUnicodeString("(\"\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\").")); - - BreakIterator* e = BreakIterator::createLineInstance(Locale("th"), status); - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n"); - return; - } - - - generalIteratorTest(*e, thaiLineSelection); - delete e; - delete thaiLineSelection; -} - - -void IntlTestTextBoundary::TestMaiyamok() -{ - Vector* thaiLineSelection= new Vector(); - UErrorCode status = U_ZERO_ERROR; - // the Thai maiyamok character is a shorthand symbol that means "repeat the previous - // word". Instead of appearing as a word unto itself, however, it's kept together - // with the word before it - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e44\\u0e1b\\u0e46")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e21\\u0e32\\u0e46")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e01\\u0e23\\u0e38\\u0e07\\u0e40\\u0e17\\u0e1e")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e41\\u0e25\\u0e30")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e03\\u0e35\\u0e22\\u0e07")); - thaiLineSelection->addElement(CharsToUnicodeString("\\u0e43\\u0e2b\\u0e21\\u0e48")); - - BreakIterator* e = BreakIterator::createLineInstance( - Locale("th"), status); - - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n"); - return; - } - generalIteratorTest(*e, thaiLineSelection); - delete e; - delete thaiLineSelection; -} - -void IntlTestTextBoundary::TestThaiWordBreak() { - Vector* thaiWordSelection = new Vector(); - UErrorCode status = U_ZERO_ERROR; - - thaiWordSelection->addElement(CharsToUnicodeString("\\u0E1A\\u0E17")); //2 - thaiWordSelection->addElement(CharsToUnicodeString("\\u0E17\\u0E35\\u0E48")); //5 - thaiWordSelection->addElement(CharsToUnicodeString("\\u0E51")); //6 - thaiWordSelection->addElement(CharsToUnicodeString("\\u0E1E\\u0E32\\u0E22\\u0E38")); //10 - thaiWordSelection->addElement(CharsToUnicodeString("\\u0E44\\u0E0B\\u0E42\\u0E04\\u0E25\\u0E19")); //16 - thaiWordSelection->addElement(CharsToUnicodeString("\\u000D\\u000A")); //18 - - // This is the correct result - //thaiWordSelection->addElement(CharsToUnicodeString("\\u0E42\\u0E14\\u0E42\\u0E23\\u0E18\\u0E35")); //24 - //thaiWordSelection->addElement(CharsToUnicodeString("\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22")); //29 - - // and this is what the dictionary does... - thaiWordSelection->addElement(CharsToUnicodeString("\\u0E42\\u0E14")); // 20 - thaiWordSelection->addElement(CharsToUnicodeString("\\u0E42\\u0E23\\u0E18\\u0E35\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22")); //29 - - thaiWordSelection->addElement(CharsToUnicodeString("\\u0E2D\\u0E22\\u0E39\\u0E48")); //33 - - // This is the correct result - //thaiWordSelection->addElement(CharsToUnicodeString("\\u0E17\\u0E48\\u0E32\\u0E21")); //37 - //thaiWordSelection->addElement(CharsToUnicodeString("\\u0E01\\u0E25\\u0E32\\u0E07")); //41 - - // and this is what the dictionary does - thaiWordSelection->addElement(CharsToUnicodeString("\\u0E17\\u0E48\\u0E32\\u0E21\\u0E01\\u0E25\\u0E32\\u0E07")); //41 - - thaiWordSelection->addElement(CharsToUnicodeString("\\u0E17\\u0E38\\u0E48\\u0E07")); //45 - thaiWordSelection->addElement(CharsToUnicodeString("\\u0E43\\u0E2B\\u0E0D\\u0E48")); //49 - thaiWordSelection->addElement(CharsToUnicodeString("\\u0E43\\u0E19")); //51 - - // This is the correct result - //thaiWordSelection->addElement(CharsToUnicodeString("\\u0E41\\u0E04\\u0E19\\u0E0B\\u0E31\\u0E2A")); //57 - //thaiWordSelection->addElement(CharsToUnicodeString("\\u0E01\\u0E31\\u0E1A")); //60 - - // and this is what the dictionary does - thaiWordSelection->addElement(CharsToUnicodeString("\\u0E41\\u0E04\\u0E19")); // 54 - thaiWordSelection->addElement(CharsToUnicodeString("\\u0E0B\\u0E31\\u0E2A\\u0E01\\u0E31\\u0E1A")); //60 - - thaiWordSelection->addElement(CharsToUnicodeString("\\u0E25\\u0E38\\u0E07")); //63 - - // This is the correct result - //thaiWordSelection->addElement(CharsToUnicodeString("\\u0E40\\u0E2E\\u0E19\\u0E23\\u0E35")); //68 - //thaiWordSelection->addElement(CharsToUnicodeString("\\u0E0A\\u0E32\\u0E27")); //71 - //thaiWordSelection->addElement(CharsToUnicodeString("\\u0E44\\u0E23\\u0E48")); //74 - //thaiWordSelection->addElement(CharsToUnicodeString("\\u0E41\\u0E25\\u0E30")); //77 - - // and this is what the dictionary does - thaiWordSelection->addElement(CharsToUnicodeString("\\u0E40\\u0E2E")); // 65 - thaiWordSelection->addElement(CharsToUnicodeString("\\u0E19\\u0E23\\u0E35\\u0E0A\\u0E32\\u0E27\\u0E44\\u0E23\\u0E48\\u0E41\\u0E25\\u0E30")); //77 - - BreakIterator* e = BreakIterator::createWordInstance( - Locale("th"), status); - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for Thai locale in TestThaiWordBreak.\n"); - return; - } - - generalIteratorTest(*e, thaiWordSelection); - delete e; - delete thaiWordSelection; -} - -/** - * Test Japanese Line Break - * @bug 4095322 - */ -void IntlTestTextBoundary::TestJapaneseLineBreak() -{ - // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count - // as opening and closing punctuation for line breaking. - // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars - // from these tests. 6-13-2002 - // - UErrorCode status = U_ZERO_ERROR; - UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c"); - UnicodeString precedingChars = CharsToUnicodeString( - //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f"); - "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e"); - UnicodeString followingChars = CharsToUnicodeString( - // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc" - ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7" - // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034" - ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034" - "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302"); - BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status); - - int32_t i; - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n"); - return; - } - - for (i = 0; i < precedingChars.length(); i++) { - testString[1] = precedingChars[i]; - iter->setText(testString); - int32_t j = iter->first(); - if (j != 0) - errln("ja line break failure: failed to start at 0"); - j = iter->next(); - if (j != 1) - errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i]) - + "' (" + ((int)(precedingChars[i])) + ")"); - j = iter->next(); - if (j != 3) - errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i]) - + "' (" + ((int)(precedingChars[i])) + ")"); - } - - for (i = 0; i < followingChars.length(); i++) { - testString[1] = followingChars[i]; - iter->setText(testString); - int j = iter->first(); - if (j != 0) - errln("ja line break failure: failed to start at 0"); - j = iter->next(); - if (j != 2) - errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i]) - + "' (" + ((int)(followingChars[i])) + ")"); - j = iter->next(); - if (j != 3) - errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i]) - + "' (" + ((int)(followingChars[i])) + ")"); - } - delete iter; -} - -//--------------------------------------------- -// other tests -//---------------------------------------------/ - -void IntlTestTextBoundary::TestEmptyString() -{ - UnicodeString text = ""; - Vector x; - UErrorCode status = U_ZERO_ERROR; - x.addElement(text); - BreakIterator* bi = BreakIterator::createLineInstance(Locale::getDefault(), status); - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n"); - return; - } - generalIteratorTest(*bi, &x); - - delete bi; -} - -void IntlTestTextBoundary::TestGetAvailableLocales() -{ - int32_t locCount = 0; - const Locale* locList = BreakIterator::getAvailableLocales(locCount); - - if (locCount == 0) - errln("getAvailableLocales() returned an empty list!"); - // Just make sure that it's returning good memory. - for (int32_t i = 0; i < locCount; ++i) { - logln(locList[i].getName()); - } -} - -//Testing the BreakIterator::getDisplayName() function -void IntlTestTextBoundary::TestGetDisplayName() -{ - UnicodeString result; - - BreakIterator::getDisplayName(Locale::getUS(), result); - if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") - errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" - + result); - - BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); - if (result != "French (France)") - errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" - + result); -} -/** - * Test End Behaviour - * @bug 4068137 - */ -void IntlTestTextBoundary::TestEndBehaviour() -{ - UErrorCode status = U_ZERO_ERROR; - UnicodeString testString("boo."); - BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n"); - return; - } - wb->setText(testString); - - if (wb->first() != 0) - errln("Didn't get break at beginning of string."); - if (wb->next() != 3) - errln("Didn't get break before period in \"boo.\""); - if (wb->current() != 4 && wb->next() != 4) - errln("Didn't get break at end of string."); - delete wb; -} -/* - * @bug 4153072 - */ -void IntlTestTextBoundary::TestBug4153072() { - UErrorCode status = U_ZERO_ERROR; - BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for default locale in TestBug4153072\n"); - return; - } - UnicodeString str("...Hello, World!..."); - int32_t begin = 3; - int32_t end = str.length() - 3; - UBool dummy; - - StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); - iter->adoptText(textIterator); - for (int index = -1; index < begin + 1; ++index) { - dummy = iter->isBoundary(index); - if (index < begin && dummy == TRUE) { - errln((UnicodeString)"Didn't handle preceeding correctly with offset = " + index + - " and begin index = " + begin); - } - } - delete iter; -} - -/* - * Test Preceding() - */ -void IntlTestTextBoundary::TestPreceding() -{ - UErrorCode status = U_ZERO_ERROR; - UnicodeString words3("aaa bbb ccc"); - BreakIterator* e = BreakIterator::createWordInstance(Locale::getDefault(), status); - if (U_FAILURE(status)) - { - errln("Failed to create the BreakIterator for default locale in TestPreceeding.\n"); - return; - } - - e->setText( words3 ); - e->first(); - int32_t p1 = e->next(); - int32_t p2 = e->next(); - int32_t p3 = e->next(); - int32_t p4 = e->next(); - - int32_t f = e->following(p2+1); - int32_t p = e->preceding(p2+1); - if (f!=p3) - errln("IntlTestTextBoundary::TestPreceding: f!=p3"); - if (p!=p2) - errln("IntlTestTextBoundary::TestPreceding: p!=p2"); - - if (p1+1!=p2) - errln("IntlTestTextBoundary::TestPreceding: p1+1!=p2"); - - if (p3+1!=p4) - errln("IntlTestTextBoundary::TestPreceding: p3+1!=p4"); - - if (!e->isBoundary(p2) || e->isBoundary(p2+1) || !e->isBoundary(p3)) - { - errln("IntlTestTextBoundary::TestPreceding: isBoundary err"); - } - delete e; -} -//--------------------------------------------- -// runIndexedTest -//--------------------------------------------- - -void IntlTestTextBoundary::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) -{ - if (exec) logln("TestSuite TextBoundary: "); - switch (index) { - case 0: name = "TestSentenceIteration"; if(exec) TestSentenceIteration(); break; - case 1: name = "TestWordIteration"; if(exec) TestWordIteration(); break; - case 2: name = "TestLineIteration"; if(exec) TestLineIteration(); break; - case 3: name = "TestCharacterIteration"; if(exec) TestCharacterIteration(); break; - case 4: name = "TestSentenceInvariants"; if(exec) TestSentenceInvariants();break; - case 5: name = "TestWordInvariants"; if(exec) TestWordInvariants();break; - case 6: name = "TestLineInvariants"; if(exec) TestLineInvariants();break; - case 7: name = "TestCharacterInvariants"; if(exec) TestCharacterInvariants();break; - - case 8: name = "TestEmptyString"; if (exec) TestEmptyString(); break; - case 9: name = "TestGetAvailableLocales"; if (exec) TestGetAvailableLocales(); break; - case 10: name = "TestGetDisplayName"; if (exec) TestGetDisplayName(); break; - case 11: name = "TestPreceding"; if (exec) TestPreceding(); break; - case 12: name = "TestBug4153072"; if (exec) TestBug4153072(); break; - case 13: name = "TestEndBehaviour"; if (exec) TestEndBehaviour(); break; - - case 14: name = "TestJapaneseLineBreak"; if (exec) TestJapaneseLineBreak(); break; - case 15: name = "TestThaiLineBreak"; if(exec) TestThaiLineBreak(); break; - case 16: name = "TestMixedThaiLineBreak"; if(exec) TestMixedThaiLineBreak(); break; - case 17: name = "TestMaiyamok"; if(exec) TestMaiyamok(); break; - case 18: name = "TestThaiWordBreak"; if(exec) TestThaiWordBreak(); break; - - - default: name = ""; break; //needed to end loop - } -} - -//--------------------------------------------- -// Test implementation routines -//--------------------------------------------- - -// general test Implementation subroutines -void IntlTestTextBoundary::generalIteratorTest(BreakIterator& bi, Vector* expectedResult) -{ - Enumeration *elems = expectedResult->elements(); - UnicodeString text = createTestData(elems); - delete elems; - - logln("comparing forward and backward..."); - bi.setText(text); - - Vector *nextResults = testFirstAndNext(bi, text); - if (nextResults == NULL) { - errln("Couldn't get nextResults!"); - return; - } - - Vector *previousResults = testLastAndPrevious(bi, text); - - if (previousResults == NULL) { - errln("Couldn't get previousResults!"); - return; - } - - int errs = getErrors(); - UnicodeString str1="forward iteration"; - UnicodeString str2="backward iteration"; - compareFragmentLists(str1, str2, nextResults, - previousResults); - if (getErrors() == errs) { - logln("comparing expected and actual..."); - str1="expected result"; - str2="actual result"; - compareFragmentLists(str1, str2, expectedResult, - nextResults); - } - - int32_t *boundaries = new int32_t[expectedResult->size() + 3]; - boundaries[0] = BreakIterator::DONE; - boundaries[1] = 0; - for (int i = 0; i < expectedResult->size(); i++) - boundaries[i + 2] = boundaries[i + 1] + ((UnicodeString)expectedResult->elementAt(i)). - length(); - - int len = expectedResult->size() + 3 -1; - boundaries[len] = BreakIterator::DONE; - - testFollowing(bi, text, boundaries); - testPreceding(bi, text, boundaries); - testIsBoundary(bi, text, boundaries); - - doMultipleSelectionTest(bi, text); - - delete nextResults; - delete previousResults; - delete []boundaries; -} - -Vector* IntlTestTextBoundary::testFirstAndNext(BreakIterator& bi, UnicodeString& text) -{ - int32_t p = bi.first(); - int32_t lastP = p; - Vector *result = new Vector(); - UnicodeString selection; - - if (p != 0) - errln((UnicodeString)"first() returned " + p + (UnicodeString)" instead of 0"); - while (p != BreakIterator::DONE) { - p = bi.next(); - if (p != BreakIterator::DONE) { - if (p <= lastP) { - errln((UnicodeString)"next() failed to move forward: next() on position " - + lastP + (UnicodeString)" yielded " + p); - errln("Are the *.brk files corrupt?"); - return NULL; - } - - text.extractBetween(lastP, p, selection); - result->addElement(selection); - } - else { - if (lastP != text.length()) - errln((UnicodeString)"next() returned DONE prematurely: offset was " - + lastP + (UnicodeString)" instead of " + text.length()); - } - lastP = p; - } - return result; -} - -Vector* IntlTestTextBoundary::testLastAndPrevious(BreakIterator& bi, UnicodeString& text) -{ - int32_t p = bi.last(); - int32_t lastP = p; - Vector *result = new Vector(); - UnicodeString selection; - - if (p != text.length()) - errln((UnicodeString)"last() returned " + p + (UnicodeString)" instead of " + text.length()); - while (p != BreakIterator::DONE) { - p = bi.previous(); - if (p != BreakIterator::DONE) { - if (p >= lastP) - errln((UnicodeString)"previous() failed to move backward: previous() on position " - + lastP + (UnicodeString)" yielded " + p); - text.extractBetween(p, lastP, selection); - result->insertElementAt(selection, 0); - } - else { - if (lastP != 0) - errln((UnicodeString)"previous() returned DONE prematurely: offset was " - + lastP + (UnicodeString)" instead of 0"); - } - lastP = p; - } - return result; -} - -void IntlTestTextBoundary::compareFragmentLists(UnicodeString& f1Name, UnicodeString& f2Name, Vector* f1, Vector* f2) -{ - int32_t p1 = 0; - int32_t p2 = 0; - UnicodeString s1; - UnicodeString s2; - int32_t t1 = 0; - int32_t t2 = 0; - UnicodeString target; - - while (p1 < f1->size() && p2 < f2->size()) { - s1 = (UnicodeString)f1->elementAt(p1); - s2 = (UnicodeString)f2->elementAt(p2); - t1 += s1.length(); - t2 += s2.length(); - - if (s1.compare(s2) == 0) { - logln(prettify((UnicodeString)" >" + s1 + (UnicodeString)"<", target)); - ++p1; - ++p2; - } - else { - int32_t tempT1 = t1; - int32_t tempT2 = t2; - int32_t tempP1 = p1; - int32_t tempP2 = p2; - - while (tempT1 != tempT2 && tempP1 < f1->size() && tempP2 < f2->size()) { - while (tempT1 < tempT2 && tempP1 < f1->size()) { - tempT1 += ((UnicodeString)f1->elementAt(tempP1)).length(); - ++tempP1; - } - while (tempT2 < tempT1 && tempP2 < f2->size()) { - tempT2 += ((UnicodeString)f2->elementAt(tempP2)).length(); - ++tempP2; - } - } - logln((UnicodeString)"*** " + f1Name + (UnicodeString)" has:"); - while (p1 <= tempP1 && p1 < f1->size()) { - s1 = (UnicodeString)f1->elementAt(p1); - t1 += s1.length(); - logln(prettify((UnicodeString)" *** >" + s1 + (UnicodeString)"<", target)); - ++p1; - } - logln("***** " + f2Name + " has:"); - while (p2 <= tempP2 && p2 < f2->size()) { - s2 = (UnicodeString)f2->elementAt(p2); - t2 += s2.length(); - logln(prettify(" ***** >" + s2 + "<", target)); - ++p2; - } - errln((UnicodeString)"Discrepancy between " + f1Name + (UnicodeString)" and " + f2Name); - } - } -} - -void IntlTestTextBoundary::testFollowing(BreakIterator& bi, UnicodeString& text, int32_t *boundaries) -{ - logln("testFollowing():"); - int p = 2; - int32_t textLen = text.length(); - for (int i = 0; i <= textLen; i++) { - if (i == boundaries[p]) - ++p; - - int32_t b = bi.following(i); - logln((UnicodeString)"bi.following(" + i + ") -> " + b); - if (b != boundaries[p]) - errln((UnicodeString)"Wrong result from following() for " + i + (UnicodeString)": expected " + boundaries[p] - + (UnicodeString)", got " + b); - } -} - -void IntlTestTextBoundary::testPreceding(BreakIterator& bi, UnicodeString& text, int32_t *boundaries) { - logln("testPreceding():"); - int p = 0; - int32_t textLen = text.length(); - for (int i = 0; i <= textLen; i++) { - int32_t b = bi.preceding(i); - logln((UnicodeString)"bi.preceding(" + i + ") -> " + b); - if (b != boundaries[p]) - errln((UnicodeString)"Wrong result from preceding() for " + i + (UnicodeString)": expected " + boundaries[p] - + (UnicodeString)", got " + b); - - if (i == boundaries[p + 1]) - ++p; - } -} - -void IntlTestTextBoundary::testIsBoundary(BreakIterator& bi, UnicodeString& text, int32_t *boundaries) { - logln("testIsBoundary():"); - int p = 1; - UBool isB; - int32_t textLen = text.length(); - for (int i = 0; i < textLen; i++) { - isB = bi.isBoundary(i); - logln((UnicodeString)"bi.isBoundary(" + i + ") -> " + isB); - - if (i == boundaries[p]) { - if (!isB) - errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected true, got false"); - p++; - } - else { - if (isB) - errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected false, got true"); - } - } -} - -void IntlTestTextBoundary::doMultipleSelectionTest(BreakIterator& iterator, - UnicodeString& testText) -{ - iterator.setText(testText); - - BreakIterator* testIterator = iterator.clone(); - int32_t offset = iterator.first(); - int32_t testOffset; - int32_t count = 0; - - logln("doMultipleSelectionTest text of length: %d", testText.length()); - - if (*testIterator != iterator) - errln("clone() or operator!= failed: two clones compared unequal"); - - do { - testOffset = testIterator->first(); - testOffset = testIterator->next(count); - if (offset != testOffset) - errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); - - if (offset != BreakIterator::DONE) { - count++; - offset = iterator.next(); - - if (offset != BreakIterator::DONE && *testIterator == iterator) - errln("operator== failed: Two unequal iterators compared equal."); - } - } while (offset != BreakIterator::DONE); - - // now do it backwards... - offset = iterator.last(); - count = 0; - - do { - testOffset = testIterator->last(); - testOffset = testIterator->next(count); - if (offset != testOffset) - errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); - - if (offset != BreakIterator::DONE) { - count--; - offset = iterator.previous(); - } - } while (offset != BreakIterator::DONE); - delete testIterator; -} - -void IntlTestTextBoundary::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars) -{ - UnicodeString work("aaa"); - int32_t errCount = 0, testCharsLen = testChars.length(), breaksLen; - - // a break should always occur after CR (unless followed by LF), LF, PS, and LS - UnicodeString breaks = CharsToUnicodeString("\r\n\\u2029\\u2028"); - int32_t i, j; - - breaksLen = breaks.length(); - for (i = 0; i < breaksLen; i++) { - UChar c1 = work[1] = breaks[i]; - for (j = 0; j < testCharsLen; j++) { - UChar c0 = work[0] = testChars[j]; - for (int k = 0; k < testCharsLen; k++) { - UChar c2 = work[2] = testChars[k]; - - // if a cr is followed by lf, ps, ls or etx, don't do the check (that's - // not supposed to work) - if (c1 == '\r' && (c2 == '\n' || c2 == 0x2029 - || c2 == 0x2028 || c2 == 0x0003)) - continue; - - if (u_charType(c1) == U_CONTROL_CHAR && - (u_charType(c2) == U_NON_SPACING_MARK || - u_charType(c2) == U_ENCLOSING_MARK || - u_charType(c2) == U_COMBINING_SPACING_MARK) - ) { - // Combining marks don't combine with controls. - // TODO: enhance test to verify that the break actually occurs, - // not just ignore the case. - continue; - } - - - tb.setText(work); - UBool seen2 = FALSE; - for (int l = tb.first(); l != BreakIterator::DONE; l = tb.next()) { - if (l == 2) { - seen2 = TRUE; - break; - } - } - if (!seen2) { - errln("No break between U+" + UCharToUnicodeString(c1) - + " and U+" + UCharToUnicodeString(c2)); - errCount++; - if (errCount >= 75) - return; - } - } - } - } -} - -void IntlTestTextBoundary::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars) -{ - UnicodeString work("a\r\na"); - int32_t errCount = 0, testCharsLen = testChars.length(); - int32_t i, j; - int8_t type; - - // a break should never occur between CR and LF - for (i = 0; i < testCharsLen; i++) { - work[0] = testChars[i]; - for (j = 0; j < testCharsLen; j++) { - work[3] = testChars[j]; - tb.setText(work); - for (int32_t k = tb.first(); k != BreakIterator::DONE; k = tb.next()) - if (k == 2) { - errln("Break between CR and LF in string U+" + UCharToUnicodeString(work[0]) + - ", U+d U+a U+" + UCharToUnicodeString(work[3])); - errCount++; - if (errCount >= 75) - return; - } - } - } - - // a break should never occur before a non-spacing mark, unless the preceding - // character is CR, LF, PS, or LS - // Or the general category == Control. - work.remove(); - work += "aaaa"; - for (i = 0; i < testCharsLen; i++) { - UChar c1 = testChars[i]; - if (c1 == '\n' || c1 == '\r' || c1 == 0x2029 || c1 == 0x2028 || c1 == 0x0003 || - u_charType(c1) == U_CONTROL_CHAR || u_charType(c1) == U_FORMAT_CHAR) { - continue; - } - work[1] = c1; - for (j = 0; j < testCharsLen; j++) { - UChar c2 = testChars[j]; - type = u_charType(c2); - if ((type != U_NON_SPACING_MARK) && - (type != U_ENCLOSING_MARK)) { - continue; - } - work[2] = c2; - tb.setText(work); - for (int k = tb.first(); k != BreakIterator::DONE; k = tb.next()) - if (k == 2) { - errln("Break between U+" + UCharToUnicodeString(work[1]) - + " and U+" + UCharToUnicodeString(work[2])); - errCount++; - if (errCount >= 75) - return; - } - } - } -} - -void IntlTestTextBoundary::sample(BreakIterator& tb, - UnicodeString& text, - UnicodeString& title) -{ - UnicodeString substring; - UBool verboseWas = verbose; - verbose = TRUE; - logln("-------------------------"+title+" length = "+text.length()); - tb.setText(text); - int32_t start = tb.first(); - int32_t end; - for (end = tb.next(); end != BreakIterator::DONE; end = tb.next()) { - text.extractBetween(start, end, substring); - logln(UnicodeString("[")+start+","+end+"] \""+substring+"\""); - start = end; - } - verbose = verboseWas; -} - - - diff --git a/icu4c/source/test/intltest/ittxtbd.h b/icu4c/source/test/intltest/ittxtbd.h deleted file mode 100644 index 3e2691f9193..00000000000 --- a/icu4c/source/test/intltest/ittxtbd.h +++ /dev/null @@ -1,182 +0,0 @@ -/******************************************************************** - * COPYRIGHT: - * Copyright (c) 1997-2001, International Business Machines Corporation and - * others. All Rights Reserved. - ********************************************************************/ - - -#ifndef _INTLTESTTEXTBOUNDARY -#define _INTLTESTTEXTBOUNDARY - - -#include "intltest.h" -#include "unicode/brkiter.h" - -class Vector; -class Enumeration; - -/** - * Test the BreakIterator class and indirectly all related classes - */ -class IntlTestTextBoundary: public IntlTest { -public: - IntlTestTextBoundary(); - virtual ~IntlTestTextBoundary(); - - void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ); - /** - * Test sentence break using generalIteratorTest() - **/ - void TestSentenceIteration(void); - /** - * Test word break using generalIteratorTest() - **/ - void TestWordIteration(void); - /** - * Test line break using generalIteratorTest() - **/ - void TestLineIteration(void); - /** - * Test character break using generalIteratorTest() - **/ - void TestCharacterIteration(void); - /** - * Test sentence break using () - **/ - void TestSentenceInvariants(void); - /** - * Test sentence break Invariants using generalIteratorTest() - **/ - void TestWordInvariants(void); - /** - * Test sentence break Invariants using generalIteratorTest() - **/ - void TestLineInvariants(void); - /** - * Test sentence break Invariants using generalIteratorTest() - **/ - void TestCharacterInvariants(void); - /** - * Test Japanese line break Invariants using generalIteratorTest() - **/ - void TestJapaneseLineBreak(void); - /** - * Test Thai line break using generalIteratorTest() - **/ - void TestThaiLineBreak(void); - /** - * Test Mixed Thai (thai with other languages like english)line break using generalIteratorTest() - **/ - void TestMixedThaiLineBreak(void); - /** - * Test Thai Line break with Maiyamok using generalIteratorTest() - * The Thai maiyamok character is a shorthand symbol that means "repeat the previous - * word". Instead of appearing as a word unto itself, however, it's kept together - * with the word before it - **/ - void TestMaiyamok(void); - /** - * Test Thai word break using generalIteratorTest() - **/ - void TestThaiWordBreak(void); - /** - * test behaviour of BreakIterator on an empty string - **/ - void TestEmptyString(void); - /** - * Test BreakIterator::getAvailableLocales - **/ - void TestGetAvailableLocales(void); - /** - * Test BreakIterator::getDisplayName - **/ - void TestGetDisplayName(void); - /** - * test methods preceding, following and isBoundary - **/ - void TestPreceding(void); - - void TestBug4153072(void); - /** - * Test End Behaviour - * @bug 4068137 - **/ - void TestEndBehaviour(void); - -/***********************/ -private: - /** - * internal methods to prepare test data - **/ - void addTestWordData(void); - void addTestSentenceData(void); - void addTestLineData(void); - void addTestCharacterData(void); - UnicodeString createTestData(Enumeration* e); - - /** - * Perform tests of BreakIterator forward and backward functionality - * on different kinds of iterators (word, sentence, line and character). - * It tests the methods first(), next(), current(), preceding(), following() - * previous() and isBoundary(). - * It makes use of internal functions to achieve this. - **/ - void generalIteratorTest(BreakIterator& bi, Vector* expectedResult); - /** - * Internal method to perform iteration and test the first() and next() functions - **/ - Vector* testFirstAndNext(BreakIterator& bi, UnicodeString& text); - /** - * Internal method to perform iteration and test the last() and previous() functions - **/ - Vector* testLastAndPrevious(BreakIterator& bi, UnicodeString& text); - /** - * Internal method to perform iteration and test the following() function - **/ - void testFollowing(BreakIterator& bi, UnicodeString& text, int32_t *boundaries); - /** - * Internal method to perform iteration and test the preceding() function - **/ - void testPreceding(BreakIterator& bi, UnicodeString& text, int32_t *boundaries); - /** - * Internal method to perform iteration and test the isBoundary() function - **/ - void testIsBoundary(BreakIterator& bi, UnicodeString& text, int32_t *boundaries); - /** - * Internal method which does the comparision of expected and got results. - **/ - void compareFragmentLists(UnicodeString& f1Name, UnicodeString& f2Name, Vector* f1, Vector* f2); - /** - * Internal method to perform tests of BreakIterator multiple selection functionality - * on different kinds of iterators (word, sentence, line and character) - **/ - void doMultipleSelectionTest(BreakIterator& iterator, UnicodeString& testText); - /** - * Internal method to perform tests of BreakIterator break Invariants - * on different kinds of iterators (word, sentence, line and character) - **/ - void doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars); - /** - * Internal method to perform tests of BreakIterator other invariants - * on different kinds of iterators (word, sentence, line and character) - **/ - void doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars); - /** - * Perform tests with short sample code - **/ - void sample(BreakIterator& tb, UnicodeString& text, UnicodeString& title); - /** - * The vectors holding test data for testing - * different kinds of iterators( word, sentence, line and character) - **/ - Vector* lineSelectionData; - Vector* sentenceSelectionData; - Vector* wordSelectionData; - Vector* characterSelectionData; - - static const UChar cannedTestArray[]; - static UnicodeString *cannedTestChars; -}; - - -#endif diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index f8d1059e150..c2d020ce2ee 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -166,11 +166,24 @@ void BITestData::clearResults() { } -//-------------------------------------------------------------------------------------- +//----------------------------------------------------------------------------------- // -// RBBITest +// Cannned Test Characters // -//-------------------------------------------------------------------------------------- +//----------------------------------------------------------------------------------- + +static const UChar cannedTestArray[] = { + 0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031, + 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b, + 0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2, + 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3, + 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303, + 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000, + 0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f, + 0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000 +}; + +static UnicodeString* cannedTestChars = 0; #define halfNA "\\u0928\\u094d\\u200d" #define halfSA "\\u0938\\u094d\\u200d" @@ -178,7 +191,23 @@ void BITestData::clearResults() { #define halfKA "\\u0915\\u094d\\u200d" #define deadTA "\\u0924\\u094d" +//-------------------------------------------------------------------------------------- +// +// RBBITest constructor and destructor +// +//-------------------------------------------------------------------------------------- +RBBITest::RBBITest() { + UnicodeString temp(cannedTestArray); + cannedTestChars = new UnicodeString(); + *cannedTestChars += (UChar)0x0000; + *cannedTestChars += temp; +} + + +RBBITest::~RBBITest() { + delete cannedTestChars; +} //-------------------------------------------------------------------- //tests default rules based character iteration @@ -209,6 +238,32 @@ void RBBITest::TestDefaultRuleBasedCharacterIteration() ADD_DATACHUNK(chardata, "e\\u0301", 0, status); //acuteE ADD_DATACHUNK(chardata, "&", 0, status); ADD_DATACHUNK(chardata, "e\\u0303", 0, status); //tildaE + + ADD_DATACHUNK(chardata, "S\\u0300", 0, status); //graveS + ADD_DATACHUNK(chardata, "i\\u0301", 0, status); // acuteBelowI + ADD_DATACHUNK(chardata, "m", 0, status); + ADD_DATACHUNK(chardata, "p", 0, status); + ADD_DATACHUNK(chardata, "l", 0, status); + ADD_DATACHUNK(chardata, "e\\u0301", 0, status); // acuteE + ADD_DATACHUNK(chardata, " ", 0, status); + ADD_DATACHUNK(chardata, "s", 0, status); + ADD_DATACHUNK(chardata, "a\\u0302", 0, status); // circumflexA + ADD_DATACHUNK(chardata, "m", 0, status); + ADD_DATACHUNK(chardata, "p", 0, status); + ADD_DATACHUNK(chardata, "l", 0, status); + ADD_DATACHUNK(chardata, "e\\u0303", 0, status); // tildeE + ADD_DATACHUNK(chardata, ".", 0, status); + ADD_DATACHUNK(chardata, "w", 0, status); + ADD_DATACHUNK(chardata, "a\\u0302", 0, status); // circumflexA + ADD_DATACHUNK(chardata, "w", 0, status); + ADD_DATACHUNK(chardata, "a", 0, status); + ADD_DATACHUNK(chardata, "f", 0, status); + ADD_DATACHUNK(chardata, "q", 0, status); + ADD_DATACHUNK(chardata, "\n", 0, status); + ADD_DATACHUNK(chardata, "\r", 0, status); + ADD_DATACHUNK(chardata, "\r\n", 0, status); + ADD_DATACHUNK(chardata, "\n", 0, status); + //devanagiri characters for Hindi support ADD_DATACHUNK(chardata, "\\u0906", 0, status); //devanagiri AA //ADD_DATACHUNK(chardata, "\\u093e\\u0901", 0); //devanagiri vowelsign AA+ chandrabindhu @@ -233,6 +288,10 @@ void RBBITest::TestDefaultRuleBasedCharacterIteration() ADD_DATACHUNK(chardata, "i\\u0301", 0, status); //acuteBelowI ADD_DATACHUNK(chardata, "!", 0, status); + + + + // What follows is a string of Korean characters (I found it in the Yellow Pages // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed // it correctly), first as precomposed syllables, and then as conjoining jamo. @@ -354,6 +413,8 @@ void RBBITest::TestDefaultRuleBasedWordIteration() ADD_DATACHUNK(worddata, "$", 0, status); ADD_DATACHUNK(worddata, "30.10", T_NUMBER, status); ADD_DATACHUNK(worddata, " ", 0, status); + ADD_DATACHUNK(worddata, "12,34", T_NUMBER, status); + ADD_DATACHUNK(worddata, " ", 0, status); ADD_DATACHUNK(worddata, "\\u00A2", 0, status); //cent sign ADD_DATACHUNK(worddata, "\\u00A3", 0, status); //pound sign ADD_DATACHUNK(worddata, "\\u00A4", 0, status); //currency sign @@ -365,14 +426,33 @@ void RBBITest::TestDefaultRuleBasedWordIteration() ADD_DATACHUNK(worddata, " ", 0, status); ADD_DATACHUNK(worddata, "BADGES", T_LETTER, status); ADD_DATACHUNK(worddata, "!", 0, status); + ADD_DATACHUNK(worddata, "?", 0, status); + ADD_DATACHUNK(worddata, "!", 0, status); + ADD_DATACHUNK(worddata, " ", 0, status); + ADD_DATACHUNK(worddata, "We", T_LETTER, status); + ADD_DATACHUNK(worddata, " ", 0, status); + ADD_DATACHUNK(worddata, "don't", T_LETTER, status); + ADD_DATACHUNK(worddata, " ", 0, status); + ADD_DATACHUNK(worddata, "need", T_LETTER, status); + ADD_DATACHUNK(worddata, " ", 0, status); + ADD_DATACHUNK(worddata, "no", T_LETTER, status); + ADD_DATACHUNK(worddata, " ", 0, status); + ADD_DATACHUNK(worddata, "STINKING", T_LETTER, status); + ADD_DATACHUNK(worddata, " ", 0, status); + ADD_DATACHUNK(worddata, "BADGES", T_LETTER, status); + ADD_DATACHUNK(worddata, "!", 0, status); + ADD_DATACHUNK(worddata, "!", 0, status); + ADD_DATACHUNK(worddata, "1000,233,456.000", T_NUMBER, status); ADD_DATACHUNK(worddata, " ", 0, status); + ADD_DATACHUNK(worddata, "1,23.322", T_NUMBER, status); ADD_DATACHUNK(worddata, "%", 0, status); ADD_DATACHUNK(worddata, "123.1222", T_NUMBER, status); ADD_DATACHUNK(worddata, "$", 0, status); ADD_DATACHUNK(worddata, "123,000.20", T_NUMBER, status); ADD_DATACHUNK(worddata, " ", 0, status); + ADD_DATACHUNK(worddata, "179.01", T_NUMBER, status); ADD_DATACHUNK(worddata, "%", 0, status); ADD_DATACHUNK(worddata, "X", T_LETTER, status); @@ -428,12 +508,54 @@ void RBBITest::TestDefaultRuleBasedWordIteration() ADD_DATACHUNK(worddata, "\\u3094\\u0301", T_H_OR_K, status); // Hiragana ADD_DATACHUNK(worddata, "\\u309d", T_H_OR_K, status); // Hiragana ADD_DATACHUNK(worddata, "\\u30a1\\u30fd\\uff66\\uff9d", T_H_OR_K, status); // Katakana - // ADD_DATACHUNK(worddata, "def", T_LETTER, status); // TODO why does this fail??? - ADD_DATACHUNK(worddata, ".", 0, status); + ADD_DATACHUNK(worddata, "def", T_LETTER, status); + ADD_DATACHUNK(worddata, "#", 0, status); // Words with interior formatting characters ADD_DATACHUNK(worddata, "def\\u0301\\u070Fabc", T_LETTER, status); + ADD_DATACHUNK(worddata, " ", 0, status); + // to test for bug #4097779 + ADD_DATACHUNK(worddata, "aa\\u0300a", T_LETTER, status); + ADD_DATACHUNK(worddata, " ", 0, status); + + // to test for bug #4098467 + // What follows is a string of Korean characters (I found it in the Yellow Pages + // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed + // it correctly), first as precomposed syllables, and then as conjoining jamo. + // Both sequences should be semantically identical and break the same way. + // precomposed syllables... + ADD_DATACHUNK(worddata, "\\uc0c1\\ud56d", T_LETTER, status); + ADD_DATACHUNK(worddata, " ", 0, status); + ADD_DATACHUNK(worddata, "\\ud55c\\uc778", T_LETTER, status); + ADD_DATACHUNK(worddata, " ", 0, status); + ADD_DATACHUNK(worddata, "\\uc5f0\\ud569", T_LETTER, status); + ADD_DATACHUNK(worddata, " ", 0, status); + ADD_DATACHUNK(worddata, "\\uc7a5\\ub85c\\uad50\\ud68c", T_LETTER, status); + ADD_DATACHUNK(worddata, " ", 0, status); + // conjoining jamo... + ADD_DATACHUNK(worddata, "\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc", T_LETTER, status); + ADD_DATACHUNK(worddata, " ", 0, status); + ADD_DATACHUNK(worddata, "\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab", T_LETTER, status); + ADD_DATACHUNK(worddata, " ", 0, status); + ADD_DATACHUNK(worddata, "\\u110b\\u1167\\u11ab\\u1112\\u1161\\u11b8", T_LETTER, status); + ADD_DATACHUNK(worddata, " ", 0, status); + ADD_DATACHUNK(worddata, "\\u110c\\u1161\\u11bc\\u1105\\u1169\\u1100\\u116d\\u1112\\u116c", T_LETTER, status); + ADD_DATACHUNK(worddata, " ", 0, status); + + // this is a test for bug #4117554: the ideographic iteration mark (U+3005) should + // count as a Kanji character for the purposes of word breaking + ADD_DATACHUNK(worddata, "abc", T_LETTER, status); + // Unicode TR29: Ideographs do NOT group together into words. + //wordSelectionData->addElement(CharsToUnicodeString("\\u4e01\\u4e02\\u3005\\u4e03\\u4e03")); + ADD_DATACHUNK(worddata, "\\u4e01", T_IDEO, status); + ADD_DATACHUNK(worddata, "\\u4e02", T_IDEO, status); + ADD_DATACHUNK(worddata, "\\u3005", T_LETTER, status); // TODO: 3005 is ideographic iteration mark + // Treating as letter is according to TR. + // Check whether this is really intended. + ADD_DATACHUNK(worddata, "\\u4e03", T_IDEO, status); + ADD_DATACHUNK(worddata, "\\u4e03", T_IDEO, status); + ADD_DATACHUNK(worddata, "abc", T_LETTER, status); if (U_FAILURE(status)){ errln("FAIL : in BITestData construction"); @@ -531,6 +653,40 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration() // opening punctuation ADD_DATACHUNK(sentdata, "How do you do?", 0, status); ADD_DATACHUNK(sentdata, "(fine).", 0, status); + + // test for bug #4158381: Don't break sentence after period if it isn't + // followed by a space + ADD_DATACHUNK(sentdata, "Test Flags.Flag class. ", 0, status); + ADD_DATACHUNK(sentdata, "Another test.\\u2029", 0, status); + + // test for bug #4158381: No breaks when there are no terminators around + ADD_DATACHUNK(sentdata, "

Provides a set of "lightweight" (all-java" + "TM language) components that, to the maximum degree possible," + "work the same on all platforms. ", 0, status); + ADD_DATACHUNK(sentdata, "Another test.\\u2029", 0, status); + + // test for bug #4143071: Make sure sentences that end with digits + // work right + ADD_DATACHUNK(sentdata, "Today is the 27th of May, 1998. ", 0, status); + ADD_DATACHUNK(sentdata, "Tomorrow with be 28 May 1998. ", 0, status); + ADD_DATACHUNK(sentdata, "The day after will be the 30th.\\u2029", 0, status); + + // test for bug #4152416: Make sure sentences ending with a capital + // letter are treated correctly + // Unicode TR29 reverses above bug: Don't break a sentence if the last word begins with an upper case letter. + ADD_DATACHUNK(sentdata, "The type of all primitive boolean values accessed in the " + "target VM. Calls to xxx will return an implementor of this interface. \\u2029", 0, status); + + // test for bug #4152117: Make sure sentence breaking is handling + // punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS + // HERE TO MAKE SURE IT DOESN'T CROP UP] + ADD_DATACHUNK(sentdata, "Constructs a randomly generated BigInteger, uniformly distributed " + "over the range 0 to (2numBits - 1), inclusive. ", 0, status); + ADD_DATACHUNK(sentdata, "The uniformity of the distribution assumes that a fair source of random bits " + "is provided in rnd. ", 0, status); + ADD_DATACHUNK(sentdata, "Note that this constructor always constructs a non-negative biginteger. \n", 0, status); + ADD_DATACHUNK(sentdata, "Ahh abc. \n", 0, status); + //sentence breaks for hindi which used Devanagari script //make sure there is sentence break after ?,danda(hindi phrase separator),fullstop followed by space and no break after \n \r ADD_DATACHUNK(sentdata, "\\u0928\\u092e" halfSA @@ -1198,6 +1354,222 @@ void RBBITest::TestAbbrRuleBasedWordIteration() delete rb; } */ + + +void RBBITest::TestThaiLineBreak() { + UErrorCode status = U_ZERO_ERROR; + BITestData thaiLineSelection(status); + + // \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that + // represents elided letters at the end of a long word. It should be bound to + // the end of the word and not treated as an independent punctuation mark. + + + ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data + ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status); +// ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status); +// ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status); + // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us + ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status); + + // the one time where the paiyannoi occurs somewhere other than at the end + // of a word is in the Thai abbrevation for "etc.", which both begins and + // ends with a paiyannoi + ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status); + + RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance( + Locale("th"), status); + if (U_FAILURE(status)) + { + errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n"); + return; + } + + generalIteratorTest(*e, thaiLineSelection); +} + + + +void RBBITest::TestMixedThaiLineBreak() +{ + UErrorCode status = U_ZERO_ERROR; + BITestData thaiLineSelection(status); + + ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data + + // Arabic numerals should always be separated from surrounding Thai text +/* + ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status); + thaiLineSelection->addElement("39"); + ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status); + + // words in non-Thai scripts should always be separated from surrounding Thai text + ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e14", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e2d\\u0e1a", 0, status); + thaiLineSelection->addElement("Java"); + ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e19", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e04\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21 ", 0, status); + + // Thai numerals should always be separated from the text surrounding them + ADD_DATACHUNK(thaiLineSelection, "\\u0e04\\u0e48\\u0e32", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e07\\u0e34\\u0e19", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e15\\u0e30", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e53\\u0e59", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e1a\\u0e32\\u0e17 ", 0, status); + + // Thai text should interact correctly with punctuation and symbols + ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21", 0, status); +// ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28", 0, status); +// ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e17\\u0e22)", 0, status); +ADD_DATACHUNK(thaiLineSelection, "(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u0e44\\u0e17\\u0e22)", 0, status); +// I believe the commented-out reading above to be the correct one, but this is what passes with our current dictionary + ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e33\\u0e01\\u0e31\\u0e14", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e1b\\u0e34\\u0e14", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e15\\u0e31\\u0e27\"", 0, status); +*/ + + // The Unicode Linebreak TR says do not break before or after quotes. + // So this test is changed ot not break around the quote. + // TODO: should Thai break around the around the quotes, like the original behavior here? +// ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\"", 0, status); +// ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\"" + "\\u0e23\\u0e38\\u0e48\\u0e19", 0, status); + + ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e14\\u0e37\\u0e2d\\u0e19\\u0e21\\u0e34.", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e22.", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e35\\u0e49", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e32\\u0e04\\u0e32", 0, status); + ADD_DATACHUNK(thaiLineSelection, "$200", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e48\\u0e32", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19 ", 0, status); + ADD_DATACHUNK(thaiLineSelection, "(\"\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\").", 0, status); + + RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status); + if (U_FAILURE(status)) + { + errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n"); + return; + } + + + generalIteratorTest(*e, thaiLineSelection); +} + + +void RBBITest::TestMaiyamok() +{ + UErrorCode status = U_ZERO_ERROR; + BITestData thaiLineSelection(status); + ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); // Break at start of data + // the Thai maiyamok character is a shorthand symbol that means "repeat the previous + // word". Instead of appearing as a word unto itself, however, it's kept together + // with the word before it + ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07\\u0e40\\u0e17\\u0e1e", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35\\u0e22\\u0e07", 0, status); + ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status); + + RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance( + Locale("th"), status); + + if (U_FAILURE(status)) + { + errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n"); + return; + } + generalIteratorTest(*e, thaiLineSelection); + delete e; +} + +void RBBITest::TestThaiWordBreak() { + UErrorCode status = U_ZERO_ERROR; + BITestData thaiWordSelection(status); + + ADD_DATACHUNK(thaiWordSelection, NULL, 0, status); // Break at start of data + ADD_DATACHUNK(thaiWordSelection, "\\u0E1A\\u0E17", 0, status); //2 + ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E35\\u0E48", 0, status); //5 + ADD_DATACHUNK(thaiWordSelection, "\\u0E51", 0, status); //6 + ADD_DATACHUNK(thaiWordSelection, "\\u0E1E\\u0E32\\u0E22\\u0E38", 0, status); //10 + ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E0B\\u0E42\\u0E04\\u0E25\\u0E19", 0, status); //16 + ADD_DATACHUNK(thaiWordSelection, "\\u000D\\u000A", 0, status); //18 + + // This is the correct result + //ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14\\u0E42\\u0E23\\u0E18\\u0E35", 0, status); //24 + //ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status); //29 + + // and this is what the dictionary does... + ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14", 0, status); // 20 + ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E23\\u0E18\\u0E35\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status); //29 + + ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E22\\u0E39\\u0E48", 0, status); //33 + + // This is the correct result + //ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21", 0, status); //37 + //ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E25\\u0E32\\u0E07", 0, status); //41 + + // and this is what the dictionary does + ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21\\u0E01\\u0E25\\u0E32\\u0E07", 0, status); //41 + + ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E38\\u0E48\\u0E07", 0, status); //45 + ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E2B\\u0E0D\\u0E48", 0, status); //49 + ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E19", 0, status); //51 + + // This is the correct result + //ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19\\u0E0B\\u0E31\\u0E2A", 0, status); //57 + //ADD_DATACHUNK(thaiWordSelection, "\\u0E01\\u0E31\\u0E1A", 0, status); //60 + + // and this is what the dictionary does + ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19", 0, status); // 54 + ADD_DATACHUNK(thaiWordSelection, "\\u0E0B\\u0E31\\u0E2A\\u0E01\\u0E31\\u0E1A", 0, status); //60 + + ADD_DATACHUNK(thaiWordSelection, "\\u0E25\\u0E38\\u0E07", 0, status); //63 + + // This is the correct result + //ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E\\u0E19\\u0E23\\u0E35", 0, status); //68 + //ADD_DATACHUNK(thaiWordSelection, "\\u0E0A\\u0E32\\u0E27", 0, status); //71 + //ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E23\\u0E48", 0, status); //74 + //ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E25\\u0E30", 0, status); //77 + + // and this is what the dictionary does + ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E", 0, status); // 65 + ADD_DATACHUNK(thaiWordSelection, "\\u0E19\\u0E23\\u0E35\\u0E0A\\u0E32\\u0E27\\u0E44\\u0E23\\u0E48\\u0E41\\u0E25\\u0E30", 0, status); //77 + + RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance( + Locale("th"), status); + if (U_FAILURE(status)) + { + errln("Failed to create the BreakIterator for Thai locale in TestThaiWordBreak.\n"); + return; + } + + generalIteratorTest(*e, thaiWordSelection); + delete e; +} + + //--------------------------------------------- // runIndexedTest //--------------------------------------------- @@ -1223,11 +1595,42 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha if(exec) TestTitleBreak(); break; case 7: name = "TestStatusReturn"; if(exec) TestStatusReturn(); break; - case 8: name = "TestLineBreakData"; - if(exec) TestLineBreakData(); break; -// case 6: name = "TestDanda()"; -// if(exec) TestDanda(); break; + case 8: name = "TestLineBreakData"; + if(exec) TestLineBreakData(); break; + case 9: name = "TestSentenceInvariants"; + if(exec) TestSentenceInvariants(); break; + case 10: name = "TestCharacterInvariants"; + if(exec) TestCharacterInvariants(); break; + case 11: name = "TestWordInvariants"; + if(exec) TestWordInvariants(); break; + + case 12: name = "TestEmptyString"; + if(exec) TestEmptyString(); break; + + case 13: name = "TestGetAvailableLocales"; + if(exec) TestGetAvailableLocales(); break; + + case 14: name = "TestGetDisplayName"; + if(exec) TestGetDisplayName(); break; + + case 15: name = "TestEndBehaviour"; + if(exec) TestEndBehaviour(); break; + case 16: name = "TestBug4153072"; + if(exec) TestBug4153072(); break; + case 17: name = "TestJapaneseLineBreak()"; + if(exec) TestJapaneseLineBreak(); break; + + + case 18: name = "TestThaiLineBreak()"; + if(exec) TestThaiLineBreak(); break; + case 19: name = "TestMixedThaiLineBreak()"; + if(exec) TestMixedThaiLineBreak(); break; + case 20: name = "TestMaiyamok()"; + if(exec) TestMaiyamok(); break; + case 21: name = "TestThaiWordBreak()"; + if(exec) TestThaiWordBreak(); break; + // case 7: name = "TestHindiCharacterWrapping()"; // if(exec) TestHindiCharacterWrapping(); break; // case 8: name = "TestCustomRuleBasedWordIteration"; @@ -1486,6 +1889,488 @@ void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestD } + +//-------------------------------------------------------------------------------------------- +// +// Break Iterator Invariants Tests +// +//-------------------------------------------------------------------------------------------- + +void RBBITest::TestCharacterInvariants() +{ + UErrorCode status = U_ZERO_ERROR; + BreakIterator *e = BreakIterator::createCharacterInstance(Locale::getDefault(), status); + if (U_FAILURE(status)) + { + errln("Failed to create the BreakIterator for default locale in TestCharacterInvariants.\n"); + return; + } + UnicodeString s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa"); + doBreakInvariantTest(*e, s); + s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa"); + doOtherInvariantTest(*e, s); + delete e; +} + + +void RBBITest::TestWordInvariants() +{ + UErrorCode status = U_ZERO_ERROR; + BreakIterator *e = BreakIterator::createWordInstance(Locale::getDefault(), status); + if (U_FAILURE(status)) + { + errln("Failed to create the BreakIterator for default locale in TestWordInvariants.\n"); + return; + } + UnicodeString s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02"); + doBreakInvariantTest(*e, s); + s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02"); + doOtherInvariantTest(*e, s); + delete e; +} + + +void RBBITest::TestSentenceInvariants() +{ + UErrorCode status = U_ZERO_ERROR; + BreakIterator *e = BreakIterator::createSentenceInstance(Locale::getDefault(), status); + if (U_FAILURE(status)) + { + errln("Failed to create the BreakIterator for default locale in TestSentenceInvariant.\n"); + return; + } + UnicodeString s = *cannedTestChars + CharsToUnicodeString(".,\\u3001\\u3002\\u3041\\u3042\\u3043\\ufeff"); + doOtherInvariantTest(*e, s); + delete e; +} + + +void RBBITest::TestLineInvariants() +{ + UErrorCode status = U_ZERO_ERROR; + BreakIterator *e = BreakIterator::createLineInstance(Locale::getUS(), status); + if (U_FAILURE(status)) + { + errln("Failed to create the BreakIterator for default locale in TestLineInvariants.\n"); + return; + } + UnicodeString s = CharsToUnicodeString(".,;:\\u3001\\u3002\\u3041\\u3042\\u3043\\u3044\\u3045\\u30a3\\u4e00\\u4e01\\u4e02"); + UnicodeString testChars = *cannedTestChars + s; + doBreakInvariantTest(*e, testChars); + doOtherInvariantTest(*e, testChars); + + int32_t errCount = 0, testCharsLen, noBreakLen, dashesLen; + int32_t i, j, k; + + // in addition to the other invariants, a line-break iterator should make sure that: + // it doesn't break around the non-breaking characters, + // EXCEPT breaking after a space takes precedence over not breaking before + // an non-breaking char. So says TR 14. + UnicodeString noBreak = CharsToUnicodeString("\\u00a0\\u2007\\u2011\\ufeff"); + UnicodeString work("aaa"); + testCharsLen = testChars.length(); + noBreakLen = noBreak.length(); + for (i = 0; i < testCharsLen; i++) { + UChar c = testChars[i]; + if (c == '\r' || c == '\n' || c == 0x2029 || c == 0x2028 || c == 0x0003 || + u_charType(c) == U_CONTROL_CHAR) { + continue; + } + work[0] = c; + for (j = 0; j < noBreakLen; j++) { + work[1] = noBreak[j]; + for (k = 0; k < testCharsLen; k++) { + work[2] = testChars[k]; + e->setText(work); + for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) { + UChar c1 = work[l - 1]; + UChar c2 = work[l]; + if (c1 == 0x20 && l == 1) { + continue; + } + if (l == 1 || l == 2) { + errln("Got break between U+" + UCharToUnicodeString(c1) + + " and U+" + UCharToUnicodeString(c2)); + errCount++; + if (errCount >= 75) + return; + } + } + } + } + } + + // it does break after hyphens (Rule 15B from TR 14 + // (unless they're followed by a digit, a non-spacing mark, + // a currency symbol, a non-breaking space, or a line or paragraph separator + // or something of class BA, HY, NS, QU, GL, CL, EX, IS or SY from TR14 when the hyphen is /u002d + + // This test is sufficiently screwed up that I'm largely disabling it. TODO: fix it. 06/12/2002 AGH + // + UnicodeString dashes = CharsToUnicodeString("-\\u00ad\\u2010\\u2012\\u2013\\u2014"); + dashesLen = dashes.length(); + for (i = 0; i < testCharsLen; i++) { + work[0] = testChars[i]; + for (j = 0; j < dashesLen; j++) { + UChar c1 = work[1] = dashes[j]; + for (k = 0; k < testCharsLen; k++) { + UChar c2 = work[2] = testChars[k]; + int8_t type = u_charType(c2); + if (type == U_DECIMAL_DIGIT_NUMBER || + type == U_OTHER_NUMBER || + type == U_NON_SPACING_MARK || + type == U_ENCLOSING_MARK || + type == U_CURRENCY_SYMBOL || + type == U_SPACE_SEPARATOR || + type == U_DASH_PUNCTUATION || + type == U_CONTROL_CHAR || + type == U_FORMAT_CHAR || + c2 == '\n' || c2 == '\r' || c2 == 0x2028 || c2 == 0x2029 || + c2 == 0x0003 || c2 == 0x00a0 || c2 == 0x2007 || c2 == 0x2011 || + c2 == 0xfeff) + { + continue; + } + // If c1 == hyphen-minus, and ... + if (c1 == 0x002d && ( + c2 == 0x0021 || // ! + c2 == 0x002c || // , + c2 == 0x002d || // - + c2 == 0x002e || // . (TR 14 class IS) + c2 == 0x0029 || // ) + c2 == 0x003a || // : + c2 == 0x003b || // ; (TR 14 class IS) + c2 == 0x005d || // ] + c2 == 0x007c || // | (TR 14 class BA, rule 15) + c2 == 0x007d || // } + c2 == 0x0903 || // Devanagari sign visarga, combining, what's it doing in this test? + c2 == 0x093E || // Devanagari , combining, what's it doing in this test? + c2 == 0x093F || // Devanagari , combining, what's it doing in this test? + c2 == 0x0940 || // Devanagari , combining, what's it doing in this test? + c2 == 0x0949 || // Devanagari , combining, what's it doing in this test? + c2 == 0x0f3b || // Tibetan closing bracket + c2 == 0x3001 || // CJK closing bracket + c2 == 0x3002 // CJK closing bracket + )) { + continue; + } + + e->setText(work); + UBool saw2 = FALSE; + for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) { + if (l == 2) { + saw2 = TRUE; + break; + } + } + if (!saw2) { + // TODO: This test is completely out of sync with the spec. Fix it. + // errln("Didn't get break between U+" + UCharToUnicodeString(work[1]) + + // " and U+" + UCharToUnicodeString(work[2])); + // errCount++; + // if (errCount >= 75) + // return; + } + } + } + } + delete e; +} + + + +void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars) +{ + UnicodeString work("aaa"); + int32_t errCount = 0, testCharsLen = testChars.length(), breaksLen; + + // a break should always occur after CR (unless followed by LF), LF, PS, and LS + UnicodeString breaks = CharsToUnicodeString("\r\n\\u2029\\u2028"); + int32_t i, j; + + breaksLen = breaks.length(); + for (i = 0; i < breaksLen; i++) { + UChar c1 = work[1] = breaks[i]; + for (j = 0; j < testCharsLen; j++) { + UChar c0 = work[0] = testChars[j]; + for (int k = 0; k < testCharsLen; k++) { + UChar c2 = work[2] = testChars[k]; + + // if a cr is followed by lf, ps, ls or etx, don't do the check (that's + // not supposed to work) + if (c1 == '\r' && (c2 == '\n' || c2 == 0x2029 + || c2 == 0x2028 || c2 == 0x0003)) + continue; + + if (u_charType(c1) == U_CONTROL_CHAR && + (u_charType(c2) == U_NON_SPACING_MARK || + u_charType(c2) == U_ENCLOSING_MARK || + u_charType(c2) == U_COMBINING_SPACING_MARK) + ) { + // Combining marks don't combine with controls. + // TODO: enhance test to verify that the break actually occurs, + // not just ignore the case. + continue; + } + + + tb.setText(work); + UBool seen2 = FALSE; + for (int l = tb.first(); l != BreakIterator::DONE; l = tb.next()) { + if (l == 2) { + seen2 = TRUE; + break; + } + } + if (!seen2) { + errln("No break between U+" + UCharToUnicodeString(c1) + + " and U+" + UCharToUnicodeString(c2)); + errCount++; + if (errCount >= 75) + return; + } + } + } + } +} + + + +void RBBITest::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars) +{ + UnicodeString work("a\r\na"); + int32_t errCount = 0, testCharsLen = testChars.length(); + int32_t i, j; + int8_t type; + + // a break should never occur between CR and LF + for (i = 0; i < testCharsLen; i++) { + work[0] = testChars[i]; + for (j = 0; j < testCharsLen; j++) { + work[3] = testChars[j]; + tb.setText(work); + for (int32_t k = tb.first(); k != BreakIterator::DONE; k = tb.next()) + if (k == 2) { + errln("Break between CR and LF in string U+" + UCharToUnicodeString(work[0]) + + ", U+d U+a U+" + UCharToUnicodeString(work[3])); + errCount++; + if (errCount >= 75) + return; + } + } + } + + // a break should never occur before a non-spacing mark, unless the preceding + // character is CR, LF, PS, or LS + // Or the general category == Control. + work.remove(); + work += "aaaa"; + for (i = 0; i < testCharsLen; i++) { + UChar c1 = testChars[i]; + if (c1 == '\n' || c1 == '\r' || c1 == 0x2029 || c1 == 0x2028 || c1 == 0x0003 || + u_charType(c1) == U_CONTROL_CHAR || u_charType(c1) == U_FORMAT_CHAR) { + continue; + } + work[1] = c1; + for (j = 0; j < testCharsLen; j++) { + UChar c2 = testChars[j]; + type = u_charType(c2); + if ((type != U_NON_SPACING_MARK) && + (type != U_ENCLOSING_MARK)) { + continue; + } + work[2] = c2; + tb.setText(work); + for (int k = tb.first(); k != BreakIterator::DONE; k = tb.next()) + if (k == 2) { + errln("Break between U+" + UCharToUnicodeString(work[1]) + + " and U+" + UCharToUnicodeString(work[2])); + errCount++; + if (errCount >= 75) + return; + } + } + } +} + + + + +//--------------------------------------------- +// +// other tests +// +//--------------------------------------------- +void RBBITest::TestEmptyString() +{ + UnicodeString text = ""; + UErrorCode status = U_ZERO_ERROR; + + BITestData x(status); + ADD_DATACHUNK(x, "", 0, status); // Break at start of data + RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); + if (U_FAILURE(status)) + { + errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n"); + return; + } + generalIteratorTest(*bi, x); + delete bi; +} + +void RBBITest::TestGetAvailableLocales() +{ + int32_t locCount = 0; + const Locale* locList = BreakIterator::getAvailableLocales(locCount); + + if (locCount == 0) + errln("getAvailableLocales() returned an empty list!"); + // Just make sure that it's returning good memory. + for (int32_t i = 0; i < locCount; ++i) { + logln(locList[i].getName()); + } +} + +//Testing the BreakIterator::getDisplayName() function +void RBBITest::TestGetDisplayName() +{ + UnicodeString result; + + BreakIterator::getDisplayName(Locale::getUS(), result); + if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") + errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" + + result); + + BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); + if (result != "French (France)") + errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" + + result); +} +/** + * Test End Behaviour + * @bug 4068137 + */ +void RBBITest::TestEndBehaviour() +{ + UErrorCode status = U_ZERO_ERROR; + UnicodeString testString("boo."); + BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); + if (U_FAILURE(status)) + { + errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n"); + return; + } + wb->setText(testString); + + if (wb->first() != 0) + errln("Didn't get break at beginning of string."); + if (wb->next() != 3) + errln("Didn't get break before period in \"boo.\""); + if (wb->current() != 4 && wb->next() != 4) + errln("Didn't get break at end of string."); + delete wb; +} +/* + * @bug 4153072 + */ +void RBBITest::TestBug4153072() { + UErrorCode status = U_ZERO_ERROR; + BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); + if (U_FAILURE(status)) + { + errln("Failed to create the BreakIterator for default locale in TestBug4153072\n"); + return; + } + UnicodeString str("...Hello, World!..."); + int32_t begin = 3; + int32_t end = str.length() - 3; + UBool dummy; + + StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); + iter->adoptText(textIterator); + for (int index = -1; index < begin + 1; ++index) { + dummy = iter->isBoundary(index); + if (index < begin && dummy == TRUE) { + errln((UnicodeString)"Didn't handle preceeding correctly with offset = " + index + + " and begin index = " + begin); + } + } + delete iter; +} + + +/** + * Test Japanese Line Break + * @bug 4095322 + */ +void RBBITest::TestJapaneseLineBreak() +{ + // Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count + // as opening and closing punctuation for line breaking. + // Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars + // from these tests. 6-13-2002 + // + UErrorCode status = U_ZERO_ERROR; + UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c"); + UnicodeString precedingChars = CharsToUnicodeString( + //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f"); + "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e"); + UnicodeString followingChars = CharsToUnicodeString( + // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc" + ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7" + // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034" + ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034" + "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302"); + BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status); + + int32_t i; + if (U_FAILURE(status)) + { + errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n"); + return; + } + + for (i = 0; i < precedingChars.length(); i++) { + testString[1] = precedingChars[i]; + iter->setText(testString); + int32_t j = iter->first(); + if (j != 0) + errln("ja line break failure: failed to start at 0"); + j = iter->next(); + if (j != 1) + errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i]) + + "' (" + ((int)(precedingChars[i])) + ")"); + j = iter->next(); + if (j != 3) + errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i]) + + "' (" + ((int)(precedingChars[i])) + ")"); + } + + for (i = 0; i < followingChars.length(); i++) { + testString[1] = followingChars[i]; + iter->setText(testString); + int j = iter->first(); + if (j != 0) + errln("ja line break failure: failed to start at 0"); + j = iter->next(); + if (j != 2) + errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i]) + + "' (" + ((int)(followingChars[i])) + ")"); + j = iter->next(); + if (j != 3) + errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i]) + + "' (" + ((int)(followingChars[i])) + ")"); + } + delete iter; +} + + +//-------------------------------------------------------------------------------------------- +// +// Exhaustive Tests, using Unicode Data Files. +// +//-------------------------------------------------------------------------------------------- + // // Token level scanner for the Unicode Line Break Test Data file. // Return the next token, as follows: diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h index df39dcf7bf0..89230c74e91 100644 --- a/icu4c/source/test/intltest/rbbitst.h +++ b/icu4c/source/test/intltest/rbbitst.h @@ -27,6 +27,9 @@ class BITestData; class RBBITest: public IntlTest { public: + RBBITest(); + ~RBBITest(); + void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ); /** * Tests default rules based character iteration @@ -67,6 +70,22 @@ public: **/ void TestLineBreakData(); + void TestSentenceInvariants(); + void TestCharacterInvariants(); + void TestWordInvariants(); + void TestLineInvariants(); + void TestEmptyString(); + void TestGetAvailableLocales(); + void TestGetDisplayName(); + void TestEndBehaviour(); + void TestBug4153072(); + void TestJapaneseLineBreak(); + void TestThaiLineBreak(); + void TestMixedThaiLineBreak(); + void TestMaiyamok(); + void TestThaiWordBreak(); + + /** * Test Hindi Danda i.e make sure we have a break point before and after danda **/ @@ -136,6 +155,9 @@ private: **/ void doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td); + void doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars); + void doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars); + };