ICU-13194 RBBI safe tables, added another test.

X-SVN-Rev: 41157
This commit is contained in:
Andy Heninger 2018-03-27 05:03:10 +00:00
parent b1b0be93ea
commit e5ab76b130
2 changed files with 90 additions and 5 deletions

View file

@ -17,6 +17,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <utility>
#include <vector>
#include "unicode/brkiter.h"
@ -111,6 +112,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(TestBug12677);
TESTCASE_AUTO(TestTableRedundancies);
TESTCASE_AUTO(TestBug13447);
TESTCASE_AUTO(TestReverse);
TESTCASE_AUTO_END;
}
@ -1817,7 +1819,7 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
// a break if there are three or more contiguous RIs. If there are
// only two, a break following will occur via other rules, and will include
// any trailing extend characters, which is needed behavior.
if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
&& fRegionalIndicatorSet->contains(c2)) {
break;
}
@ -3121,11 +3123,11 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
// LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
// PR x (ID | EB | EM)
// (ID | EB | EM) x PO
if (fPR->contains(prevChar) &&
if (fPR->contains(prevChar) &&
(fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
continue;
}
if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
fPO->contains(thisChar)) {
continue;
}
@ -4422,7 +4424,7 @@ void RBBITest::TestBug12519() {
return;
}
assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
@ -4462,7 +4464,7 @@ void RBBITest::TestBug12677() {
void RBBITest::TestTableRedundancies() {
UErrorCode status = U_ZERO_ERROR;
LocalPointer<RuleBasedBreakIterator> bi (
(RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
assertSuccess(WHERE, status);
@ -4538,6 +4540,85 @@ void RBBITest::TestBug13447() {
assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
}
// TestReverse exercises both the synthesized safe reverse rules and the logic
// for filling the break iterator cache when starting from random positions
// in the text.
//
// It's a monkey test, working on random data, with the expected data obtained
// from forward iteration (no safe rules involved), comparing with results
// when indexing into the interior of the string (safe rules needed).
void RBBITest::TestReverse() {
UErrorCode status = U_ZERO_ERROR;
TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
assertSuccess(WHERE, status);
TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
BreakIterator::createWordInstance(Locale::getEnglish(), status)));
assertSuccess(WHERE, status);
TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
BreakIterator::createLineInstance(Locale::getEnglish(), status)));
assertSuccess(WHERE, status);
TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
assertSuccess(WHERE, status);
}
void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
if (!bi) {
errln(WHERE);
return;
}
// From the mapping trie in the break iterator's internal data, create a
// vector of UnicodeStrings, one for each character category, containing
// all of the code points that map to that category. Unicode planes 0 and 1 only,
// to avoid an execess of unassigned code points.
RBBIDataWrapper *data = bi->fData;
int32_t categoryCount = data->fHeader->fCatCount;
UTrie2 *trie = data->fTrie;
std::vector<UnicodeString> strings(categoryCount, UnicodeString());
for (int cp=0; cp<0x1fff0; ++cp) {
int cat = utrie2_get32(trie, cp);
cat &= ~0x4000; // And off the dictionary bit from the category.
assertTrue(WHERE, cat < categoryCount && cat >= 0);
if (cat < 0 || cat >= categoryCount) return;
strings[cat].append(cp);
}
icu_rand randomGen;
const int testStringLength = 10000;
UnicodeString testString;
for (int i=0; i<testStringLength; ++i) {
int charClass = randomGen() % categoryCount;
if (strings[charClass].length() > 0) {
int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
testString.append(cp);
}
}
typedef std::pair<UBool, int32_t> Result;
std::vector<Result> expectedResults;
bi->setText(testString);
for (int i=0; i<testString.length(); ++i) {
bool isboundary = bi->isBoundary(i);
int ruleStatus = bi->getRuleStatus();
expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
}
for (int i=testString.length()-1; i>=0; --i) {
bi->setText(testString); // clears the internal break cache
Result expected = expectedResults[i];
assertEquals(WHERE, expected.first, bi->isBoundary(i));
assertEquals(WHERE, expected.second, bi->getRuleStatus());
}
}
//
// TestDebug - A place-holder test for debugging purposes.
// For putting in fragments of other tests that can be invoked

View file

@ -17,6 +17,8 @@
#if !UCONFIG_NO_BREAK_ITERATION
#include <memory>
#include "intltest.h"
#include "unicode/brkiter.h"
#include "unicode/rbbi.h"
@ -77,6 +79,8 @@ public:
void TestBug12677();
void TestTableRedundancies();
void TestBug13447();
void TestReverse();
void TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi);
void TestDebug();
void TestProperties();