ICU-8550 Dictionary Break Iterator, fixes to work with UTF-8 text.

X-SVN-Rev: 35724
This commit is contained in:
Andy Heninger 2014-05-17 00:44:39 +00:00
parent 68c893b2f1
commit f71b9053d2
8 changed files with 726 additions and 487 deletions

File diff suppressed because it is too large Load diff

View file

@ -1,6 +1,6 @@
/**
*******************************************************************************
* Copyright (C) 2006,2012-2013, International Business Machines Corporation *
* Copyright (C) 2006-2014, International Business Machines Corporation *
* and others. All Rights Reserved. *
*******************************************************************************
*/
@ -17,6 +17,7 @@
U_NAMESPACE_BEGIN
class DictionaryMatcher;
class Normalizer2;
/*******************************************************************
* DictionaryBreakEngine
@ -326,7 +327,8 @@ class CjkBreakEngine : public DictionaryBreakEngine {
UnicodeSet fKatakanaWordSet;
UnicodeSet fHiraganaWordSet;
DictionaryMatcher *fDictionary;
DictionaryMatcher *fDictionary;
const Normalizer2 *nfkcNorm2;
public:

View file

@ -40,22 +40,31 @@ int32_t UCharsDictionaryMatcher::getType() const {
return DictionaryData::TRIE_TYPE_UCHARS;
}
int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
int32_t *lengths, int32_t *cpLengths, int32_t *values,
int32_t *prefix) const {
UCharsTrie uct(characters);
UChar32 c = utext_next32(text);
if (c < 0) {
return 0;
}
UStringTrieResult result = uct.first(c);
int32_t numChars = 1;
count = 0;
for (;;) {
int32_t startingTextIndex = utext_getNativeIndex(text);
int32_t wordCount = 0;
int32_t codePointsMatched = 0;
for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
codePointsMatched += 1;
if (USTRINGTRIE_HAS_VALUE(result)) {
if (count < limit) {
if (wordCount < limit) {
if (values != NULL) {
values[count] = uct.getValue();
values[wordCount] = uct.getValue();
}
lengths[count++] = numChars;
if (lengths != NULL) {
lengths[wordCount] = lengthMatched;
}
if (cpLengths != NULL) {
cpLengths[wordCount] = codePointsMatched;
}
++wordCount;
}
if (result == USTRINGTRIE_FINAL_VALUE) {
break;
@ -64,20 +73,15 @@ int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t
else if (result == USTRINGTRIE_NO_MATCH) {
break;
}
// TODO: why do we have a text limit if the UText knows its length?
if (numChars >= maxLength) {
if (lengthMatched >= maxLength) {
break;
}
c = utext_next32(text);
if (c < 0) {
break;
}
++numChars;
result = uct.next(c);
}
return numChars;
if (prefix != NULL) {
*prefix = codePointsMatched;
}
return wordCount;
}
BytesDictionaryMatcher::~BytesDictionaryMatcher() {
@ -104,22 +108,30 @@ int32_t BytesDictionaryMatcher::getType() const {
return DictionaryData::TRIE_TYPE_BYTES;
}
int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
int32_t *lengths, int32_t *cpLengths, int32_t *values,
int32_t *prefix) const {
BytesTrie bt(characters);
UChar32 c = utext_next32(text);
if (c < 0) {
return 0;
}
UStringTrieResult result = bt.first(transform(c));
int32_t numChars = 1;
count = 0;
for (;;) {
int32_t startingTextIndex = utext_getNativeIndex(text);
int32_t wordCount = 0;
int32_t codePointsMatched = 0;
for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
codePointsMatched += 1;
if (USTRINGTRIE_HAS_VALUE(result)) {
if (count < limit) {
if (wordCount < limit) {
if (values != NULL) {
values[count] = bt.getValue();
values[wordCount] = bt.getValue();
}
lengths[count++] = numChars;
if (lengths != NULL) {
lengths[wordCount] = lengthMatched;
}
if (cpLengths != NULL) {
cpLengths[wordCount] = codePointsMatched;
}
++wordCount;
}
if (result == USTRINGTRIE_FINAL_VALUE) {
break;
@ -128,20 +140,15 @@ int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t
else if (result == USTRINGTRIE_NO_MATCH) {
break;
}
// TODO: why do we have a text limit if the UText knows its length?
if (numChars >= maxLength) {
if (lengthMatched >= maxLength) {
break;
}
c = utext_next32(text);
if (c < 0) {
break;
}
++numChars;
result = bt.next(transform(c));
}
return numChars;
if (prefix != NULL) {
*prefix = codePointsMatched;
}
return wordCount;
}

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2013, International Business Machines
* Copyright (C) 2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* dictionarydata.h
@ -66,10 +66,32 @@ public:
*/
class U_COMMON_API DictionaryMatcher : public UMemory {
public:
DictionaryMatcher() {};
virtual ~DictionaryMatcher();
// this should emulate CompactTrieDictionary::matches()
virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count,
int32_t limit, int32_t *values = NULL) const = 0;
/* Find the dictionary words that match the text beginning at the current
* position of the UText. The implementations in this file read the text with
* utext_next32(), so the UText's iteration position is advanced as a side effect.
*
* @param text The text in which to look for matching words. Matching begins
* at the current position of the UText.
* @param maxLength The max length of match to consider. Units are the native indexing
* units of the UText.
* @param limit Capacity of output arrays, which is also the maximum number of
* matching words to be found.
* @param lengths output array, filled with the lengths of the matches, in order,
* from shortest to longest. Lengths are in native indexing units
* of the UText. May be NULL.
* @param cpLengths output array, filled with the lengths of the matches, in order,
* from shortest to longest. Lengths are the number of Unicode code points.
* May be NULL.
* @param values Output array, filled with the values associated with the words found.
* May be NULL.
* @param prefix Output parameter, the code point length of the prefix match, even if that
* prefix didn't lead to a complete word. Will always be >= the cpLength
* of the longest complete word matched. May be NULL.
* @return Number of matching words found.
*/
virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
int32_t *lengths, int32_t *cpLengths, int32_t *values,
int32_t *prefix) const = 0;
/** @return DictionaryData::TRIE_TYPE_XYZ */
virtual int32_t getType() const = 0;
};
@ -81,8 +103,9 @@ public:
// The UDataMemory * will be closed on this object's destruction.
UCharsDictionaryMatcher(const UChar *c, UDataMemory *f) : characters(c), file(f) { }
virtual ~UCharsDictionaryMatcher();
virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count,
int32_t limit, int32_t *values = NULL) const;
virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
int32_t *lengths, int32_t *cpLengths, int32_t *values,
int32_t *prefix) const;
virtual int32_t getType() const;
private:
const UChar *characters;
@ -98,8 +121,9 @@ public:
BytesDictionaryMatcher(const char *c, int32_t t, UDataMemory *f)
: characters(c), transformConstant(t), file(f) { }
virtual ~BytesDictionaryMatcher();
virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count,
int32_t limit, int32_t *values = NULL) const;
virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
int32_t *lengths, int32_t *cpLengths, int32_t *values,
int32_t *prefix) const;
virtual int32_t getType() const;
private:
UChar32 transform(UChar32 c) const;

View file

@ -701,6 +701,22 @@ int32_t RuleBasedBreakIterator::previous(void) {
* @return The position of the first break after the current position.
*/
int32_t RuleBasedBreakIterator::following(int32_t offset) {
// if the offset passed in is already past the end of the text,
// just return DONE; if it's before the beginning, return the
// text's starting offset
if (fText == NULL || offset >= utext_nativeLength(fText)) {
last();
return next();
}
else if (offset < 0) {
return first();
}
// Move requested offset to a code point start. It might be on a trail surrogate,
// or on a trail byte if the input is UTF-8.
utext_setNativeIndex(fText, offset);
offset = utext_getNativeIndex(fText);
// if we have cached break positions and offset is in the range
// covered by them, use them
// TODO: could use binary search
@ -722,20 +738,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
}
}
// if the offset passed in is already past the end of the text,
// just return DONE; if it's before the beginning, return the
// text's starting offset
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = TRUE;
if (fText == NULL || offset >= utext_nativeLength(fText)) {
last();
return next();
}
else if (offset < 0) {
return first();
}
// otherwise, set our internal iteration position (temporarily)
// Set our internal iteration position (temporarily)
// to the position passed in. If this is the _beginning_ position,
// then we can just use next() to get our return value
@ -747,6 +750,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
// move forward one codepoint to prepare for moving back to a
// safe point.
// this handles offset being between a supplementary character
// TODO: is this still needed, with move to code point boundary handled above?
(void)UTEXT_NEXT32(fText);
// handlePrevious will move most of the time to < 1 boundary away
handlePrevious(fData->fSafeRevTable);
@ -809,6 +813,21 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
* @return The position of the last boundary before the starting position.
*/
int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
// If the offset passed in is past the end of the text, return the last
// boundary; if it's before the beginning, return the text's starting
// offset.
if (fText == NULL || offset > utext_nativeLength(fText)) {
return last();
}
else if (offset < 0) {
return first();
}
// Move requested offset to a code point start. It might be on a trail surrogate,
// or on a trail byte if the input is UTF-8.
utext_setNativeIndex(fText, offset);
offset = utext_getNativeIndex(fText);
// if we have cached break positions and offset is in the range
// covered by them, use them
if (fCachedBreakPositions != NULL) {
@ -834,17 +853,6 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
}
}
// if the offset passed in is already past the end of the text,
// just return DONE; if it's before the beginning, return the
// text's starting offset
if (fText == NULL || offset > utext_nativeLength(fText)) {
// return BreakIterator::DONE;
return last();
}
else if (offset < 0) {
return first();
}
// if we start by updating the current iteration position to the
// position specified by the caller, we can just use previous()
// to carry out this operation
@ -1578,30 +1586,6 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
return (reverse ? startPos : endPos);
}
// Bug 5532. The dictionary code will crash if the input text is UTF-8
// because native indexes are different from UTF-16 indexes.
// Temporary hack: skip dictionary lookup for UTF-8 encoded text.
// It wont give the right breaks, but it's better than a crash.
//
// Check the type of the UText by checking its pFuncs field, which
// is UText's function dispatch table. It will be the same for all
// UTF-8 UTexts and different for any other UText type.
//
// We have no other type of UText available with non-UTF-16 native indexing.
// This whole check will go away once the dictionary code is fixed.
static const void *utext_utf8Funcs;
if (utext_utf8Funcs == NULL) {
// Cache the UTF-8 UText function pointer value.
UErrorCode status = U_ZERO_ERROR;
UText tempUText = UTEXT_INITIALIZER;
utext_openUTF8(&tempUText, NULL, 0, &status);
utext_utf8Funcs = tempUText.pFuncs;
utext_close(&tempUText);
}
if (fText->pFuncs == utext_utf8Funcs) {
return (reverse ? startPos : endPos);
}
// Starting from the starting point, scan towards the proposed result,
// looking for the first dictionary character (which may be the one
// we're on, if we're starting in the middle of a range).

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1999-2013, International Business Machines Corporation and
* Copyright (c) 1999-2014, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/************************************************************************
@ -31,9 +31,9 @@
#include "intltest.h"
#include "rbbitst.h"
#include <string.h>
#include "charstr.h"
#include "uvector.h"
#include "uvectr32.h"
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "unicode/numfmt.h"
@ -354,27 +354,19 @@ void RBBITest::TestStatusReturn() {
}
static void printStringBreaks(UnicodeString ustr, int expected[],
int expectedcount)
{
static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
UErrorCode status = U_ZERO_ERROR;
char name[100];
printf("code alpha extend alphanum type word sent line name\n");
int j;
for (j = 0; j < ustr.length(); j ++) {
if (expectedcount > 0) {
int k;
for (k = 0; k < expectedcount; k ++) {
if (j == expected[k]) {
printf("------------------------------------------------ %d\n",
j);
}
}
}
UChar32 c = ustr.char32At(j);
if (c > 0xffff) {
j ++;
int nextExpectedIndex = 0;
utext_setNativeIndex(tstr, 0);
for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
printf("------------------------------------------------ %d\n", j);
++nextExpectedIndex;
}
UChar32 c = utext_next32(tstr);
u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
u_isUAlphabetic(c),
@ -400,6 +392,19 @@ static void printStringBreaks(UnicodeString ustr, int expected[],
}
static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
UErrorCode status = U_ZERO_ERROR;
UText *tstr = NULL;
tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
if (U_FAILURE(status)) {
printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
return;
}
printStringBreaks(tstr, expected, expectedCount);
utext_close(tstr);
}
void RBBITest::TestBug3818() {
UErrorCode status = U_ZERO_ERROR;
@ -830,23 +835,173 @@ void RBBITest::TestBug5775() {
//------------------------------------------------------------------------------
// Holds the state for one extended break-iterator test: the UTF-16 test data with its
// expected breaks and source positions, plus the UText (UTF-16 or UTF-8) actually handed
// to the break iterator and the offset map that ties the two together.
struct TestParams {
BreakIterator *bi;
UnicodeString dataToBreak;
UVector32 *expectedBreaks;
UVector32 *srcLine;
BreakIterator *bi; // Break iterator is set while parsing test source.
// Changed out whenever test data changes break type.
UnicodeString dataToBreak; // Data that is built up while parsing the test.
UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
UVector32 *srcCol; // Column positions in source file, indexed same as dataToBreak.
UText *textToBreak; // UText, could be UTF8 or UTF16.
UVector32 *textMap; // Indexed by UText native offset; holds the corresponding UTF-16
// dataToBreak offset, or -1 when the native offset is not at the
// start of a code point. Filled in by setUTF16() / setUTF8().
CharString utf8String; // UTF-8 form of text to break. setUTF8() opens textToBreak over this buffer.
// Allocates the vectors; textToBreak stays NULL until setUTF16() or setUTF8() is called.
TestParams(UErrorCode &status) : dataToBreak() {
bi = NULL;
expectedBreaks = new UVector32(status);
srcLine = new UVector32(status);
srcCol = new UVector32(status);
textToBreak = NULL;
textMap = new UVector32(status);
}
// Releases the break iterator, the vectors and the UText.
~TestParams() {
delete bi;
delete expectedBreaks;
delete srcLine;
delete srcCol;
utext_close(textToBreak);
delete textMap;
}
// The accessors below take a break position expressed in textToBreak native units.
int32_t getSrcLine(int32_t bp);
int32_t getExpectedBreak(int32_t bp);
int32_t getSrcCol(int32_t bp);
// Select the encoding of textToBreak and rebuild textMap accordingly.
void setUTF16(UErrorCode &status);
void setUTF8(UErrorCode &status);
};
void RBBITest::executeTest(TestParams *t) {
// Convert a UnicodeString to UTF-8 and append the bytes to a CharString.
// Anything that cannot be converted is replaced with U+FFFD; the test data is known to
// contain a few unpaired surrogates in its UTF-16 form, which take this path.
// Does nothing if status already indicates a failure on entry.
static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }

    // Pass 1 (preflight): no output buffer, just learn how many UTF-8 bytes are required.
    int32_t bytesNeeded;
    u_strToUTF8WithSub(NULL, 0, &bytesNeeded,
                       src.getBuffer(), src.length(),
                       0xfffd, NULL,
                       &status);
    // A buffer overflow is the expected outcome of preflighting; anything else is a real error.
    if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) {
        return;
    }
    status = U_ZERO_ERROR;

    // Pass 2: convert straight into the CharString's append buffer, then commit the bytes.
    int32_t appendCapacity;
    char *out = dest.getAppendBuffer(bytesNeeded, bytesNeeded, appendCapacity, status);
    u_strToUTF8WithSub(out, bytesNeeded, NULL,
                       src.getBuffer(), src.length(),
                       0xfffd, NULL, &status);
    dest.append(out, bytesNeeded, status);
}
// Make textToBreak a UTF-16 UText over dataToBreak and rebuild textMap to match.
//
// For UTF-16 the native offsets of the UText are the same as the dataToBreak offsets, so
// each code unit that starts a code point maps to its own index. A code unit that is not
// the start of a code point (the trail half of a surrogate pair) maps to -1. One extra
// entry is appended for the end-of-text position.
//
// @param status  ICU error code. Nothing is done if it already indicates a failure on
//                entry, or if opening the UText fails; this mirrors setUTF8().
void TestParams::setUTF16(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
    if (U_FAILURE(status)) {
        // Don't rebuild the map (and trip the size assertion below) for text that
        // could not be opened.
        return;
    }

    textMap->removeAllElements();
    const int32_t length = dataToBreak.length();
    for (int32_t i = 0; i < length; i++) {
        const int32_t mapped = (i == dataToBreak.getChar32Start(i)) ? i : -1;
        textMap->addElement(mapped, status);
    }
    textMap->addElement(length, status);
    U_ASSERT(length + 1 == textMap->size());
}
// Make textToBreak a UTF-8 UText holding the same text as dataToBreak, and rebuild textMap
// so that it is indexed by UTF-8 native offset.
//
// The entry for the first byte of each code point holds that code point's UTF-16 offset in
// dataToBreak; entries for the remaining bytes of a multi-byte sequence hold -1. A final
// entry is added for the end-of-text position.
//
// @param status  ICU error code. Nothing is done if it indicates a failure on entry; the
//                map is left untouched if the UTF-8 UText cannot be opened.
void TestParams::setUTF8(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }

    // Regenerate the UTF-8 copy of the test data and open the UText over it.
    utf8String.clear();
    CharStringAppend(utf8String, dataToBreak, status);
    textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
    if (U_FAILURE(status)) {
        return;
    }

    textMap->removeAllElements();
    int32_t utf16Offset = 0;
    textMap->addElement(utf16Offset, status);       // entry for native offset 0
    for (UChar32 cp = utext_current32(textToBreak); cp >= 0; cp = utext_current32(textToBreak)) {
        utf16Offset += U16_LENGTH(cp);
        utext_next32(textToBreak);
        // Pad the trailing bytes of the code point just consumed.
        while (textMap->size() < utext_getNativeIndex(textToBreak)) {
            textMap->addElement(-1, status);
        }
        // Entry for the start of the next code point, or for end of text.
        textMap->addElement(utf16Offset, status);
    }
    U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
}
// Return the test source file line number associated with a break position.
//
// @param bp  Position in textToBreak, in native units. Positions past the end of textMap
//            are clamped to its last entry; a position that is not at the start of a code
//            point is moved back to the nearest preceding one.
// @return    The srcLine entry for the corresponding UTF-16 index.
int32_t TestParams::getSrcLine(int bp) {
    const int32_t lastEntry = textMap->size() - 1;
    int32_t pos = (bp > lastEntry) ? lastEntry : bp;

    int32_t utf16Index = 0;
    while (pos >= 0) {
        // Walk backwards until we land on an entry that marks a code point start.
        utf16Index = textMap->elementAti(pos);
        if (utf16Index >= 0) {
            break;
        }
        --pos;
    }
    return srcLine->elementAti(utf16Index);
}
// Return the expected-break value for a position in textToBreak.
//
// @param bp  Position in textToBreak, in native units.
// @return    The expectedBreaks entry for the corresponding UTF-16 index, or 0 (no break
//            expected) when bp is outside textMap or is not at the start of a code point.
int32_t TestParams::getExpectedBreak(int bp) {
    // Reject negative positions explicitly. An out-of-range lookup in textMap would
    // otherwise read as 0 (UVector32 appears to return zero for out-of-bounds indexes),
    // which is indistinguishable from a valid UTF-16 index of 0 and would report the
    // expected break at the start of the text.
    if (bp < 0 || bp >= textMap->size()) {
        return 0;
    }
    const int32_t utf16Index = textMap->elementAti(bp);
    if (utf16Index < 0) {
        // Not on a code point boundary; no break can be expected here.
        return 0;
    }
    return expectedBreaks->elementAti(utf16Index);
}
// Return the test source file column number associated with a break position.
//
// @param bp  Position in textToBreak, in native units. Positions past the end of textMap
//            are clamped to its last entry; a position that is not at the start of a code
//            point is moved back to the nearest preceding one.
// @return    The srcCol entry for the corresponding UTF-16 index.
int32_t TestParams::getSrcCol(int bp) {
    const int32_t lastEntry = textMap->size() - 1;
    int32_t pos = (bp > lastEntry) ? lastEntry : bp;

    int32_t utf16Index = 0;
    while (pos >= 0) {
        // Walk backwards until we land on an entry that marks a code point start.
        utf16Index = textMap->elementAti(pos);
        if (utf16Index >= 0) {
            break;
        }
        --pos;
    }
    return srcCol->elementAti(utf16Index);
}
void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
int32_t bp;
int32_t prevBP;
int32_t i;
TEST_ASSERT_SUCCESS(status);
if (U_FAILURE(status)) {
return;
}
if (t->bi == NULL) {
return;
}
t->bi->setText(t->dataToBreak);
t->bi->setText(t->textToBreak, status);
//
// Run the iterator forward
//
@ -855,93 +1010,92 @@ void RBBITest::executeTest(TestParams *t) {
if (prevBP == bp) {
// Fail for lack of forward progress.
errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
bp, t->getSrcLine(bp), t->getSrcCol(bp));
break;
}
// Check that there were we didn't miss an expected break between the last one
// Check that we didn't miss an expected break between the last one
// and this one.
for (i=prevBP+1; i<bp; i++) {
if (t->expectedBreaks->elementAti(i) != 0) {
if (t->getExpectedBreak(i) != 0) {
int expected[] = {0, i};
printStringBreaks(t->dataToBreak, expected, 2);
errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
i, t->getSrcLine(i), t->getSrcCol(i));
}
}
// Check that the break we did find was expected
if (t->expectedBreaks->elementAti(bp) == 0) {
if (t->getExpectedBreak(bp) == 0) {
int expected[] = {0, bp};
printStringBreaks(t->dataToBreak, expected, 2);
printStringBreaks(t->textToBreak, expected, 2);
errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
bp, t->getSrcLine(bp), t->getSrcCol(bp));
} else {
// The break was expected.
// Check that the {nnn} tag value is correct.
int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
int32_t expectedTagVal = t->getExpectedBreak(bp);
if (expectedTagVal == -1) {
expectedTagVal = 0;
}
int32_t line = t->srcLine->elementAti(bp);
int32_t line = t->getSrcLine(bp);
int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
if (rs != expectedTagVal) {
errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
" Actual, Expected status = %4d, %4d",
bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
bp, line, t->getSrcCol(bp), rs, expectedTagVal);
}
}
prevBP = bp;
}
// Verify that there were no missed expected breaks after the last one found
for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
if (t->expectedBreaks->elementAti(i) != 0) {
for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
if (t->getExpectedBreak(i) != 0) {
errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
i, t->getSrcLine(i), t->getSrcCol(i));
}
}
//
// Run the iterator backwards, verify that the same breaks are found.
//
prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen.
prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen.
for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
if (prevBP == bp) {
// Fail for lack of progress.
errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
bp, t->getSrcLine(bp), t->getSrcCol(bp));
break;
}
// Check that there were we didn't miss an expected break between the last one
// Check that we didn't miss an expected break between the last one
// and this one. (UVector returns zeros for index out of bounds.)
for (i=prevBP-1; i>bp; i--) {
if (t->expectedBreaks->elementAti(i) != 0) {
errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
if (t->getExpectedBreak(i) != 0) {
errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
i, t->getSrcLine(i), t->getSrcCol(i));
}
}
// Check that the break we did find was expected
if (t->expectedBreaks->elementAti(bp) == 0) {
if (t->getExpectedBreak(bp) == 0) {
errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
bp, t->getSrcLine(bp), t->getSrcCol(bp));
} else {
// The break was expected.
// Check that the {nnn} tag value is correct.
int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
int32_t expectedTagVal = t->getExpectedBreak(bp);
if (expectedTagVal == -1) {
expectedTagVal = 0;
}
int line = t->srcLine->elementAti(bp);
int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
int line = t->getSrcLine(bp);
int32_t rs = t->bi->getRuleStatus();
if (rs != expectedTagVal) {
errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
" Actual, Expected status = %4d, %4d",
bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
bp, line, t->getSrcCol(bp), rs, expectedTagVal);
}
}
@ -950,30 +1104,30 @@ void RBBITest::executeTest(TestParams *t) {
// Verify that there were no missed breaks prior to the last one found
for (i=prevBP-1; i>=0; i--) {
if (t->expectedBreaks->elementAti(i) != 0) {
if (t->getExpectedBreak(i) != 0) {
errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
i, t->getSrcLine(i), t->getSrcCol(i));
}
}
// Check isBoundary()
for (i=0; i<t->expectedBreaks->size(); i++) {
UBool boundaryExpected = (t->expectedBreaks->elementAti(i) != 0);
for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
UBool boundaryFound = t->bi->isBoundary(i);
if (boundaryExpected != boundaryFound) {
errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
" Expected, Actual= %s, %s",
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i),
i, t->getSrcLine(i), t->getSrcCol(i),
boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
}
}
// Check following()
for (i=0; i<t->expectedBreaks->size(); i++) {
for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
int32_t actualBreak = t->bi->following(i);
int32_t expectedBreak = BreakIterator::DONE;
for (int32_t j=i+1; j < t->expectedBreaks->size(); j++) {
if (t->expectedBreaks->elementAti(j) != 0) {
for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
if (t->getExpectedBreak(j) != 0) {
expectedBreak = j;
break;
}
@ -981,17 +1135,24 @@ void RBBITest::executeTest(TestParams *t) {
if (expectedBreak != actualBreak) {
errln("following(%d) incorrect. File line,col= %4d,%4d\n"
" Expected, Actual= %d, %d",
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
}
}
// Check preceding()
for (i=t->expectedBreaks->size(); i>=0; i--) {
for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
int32_t actualBreak = t->bi->preceding(i);
int32_t expectedBreak = BreakIterator::DONE;
for (int32_t j=i-1; j >= 0; j--) {
if (t->expectedBreaks->elementAti(j) != 0) {
// For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
// preceding(trailing byte) will return the index of some preceding code point,
// not the lead byte of the current code point, even though that has a smaller index.
// Therefore, start looking at the expected break data not at i-1, but at
// the start of code point index - 1.
utext_setNativeIndex(t->textToBreak, i);
int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
for (; j >= 0; j--) {
if (t->getExpectedBreak(j) != 0) {
expectedBreak = j;
break;
}
@ -999,7 +1160,7 @@ void RBBITest::executeTest(TestParams *t) {
if (expectedBreak != actualBreak) {
errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
" Expected, Actual= %d, %d",
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
}
}
}
@ -1011,11 +1172,7 @@ void RBBITest::TestExtended() {
Locale locale("");
UnicodeString rules;
TestParams tp;
tp.bi = NULL;
tp.expectedBreaks = new UVector32(status);
tp.srcLine = new UVector32(status);
tp.srcCol = new UVector32(status);
TestParams tp(status);
RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
if (U_FAILURE(status)) {
@ -1190,7 +1347,16 @@ void RBBITest::TestExtended() {
charIdx += 6;
// RUN THE TEST!
executeTest(&tp);
status = U_ZERO_ERROR;
tp.setUTF16(status);
executeTest(&tp, status);
TEST_ASSERT_SUCCESS(status);
// Run again, this time with UTF-8 text wrapped in a UText.
status = U_ZERO_ERROR;
tp.setUTF8(status);
TEST_ASSERT_SUCCESS(status);
executeTest(&tp, status);
break;
}
@ -1356,10 +1522,6 @@ void RBBITest::TestExtended() {
}
end_test:
delete tp.bi;
delete tp.expectedBreaks;
delete tp.srcLine;
delete tp.srcCol;
delete [] testFile;
#endif
}

View file

@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 1999-2013, International Business Machines
* Copyright (c) 1999-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*************************************************************************
* Date Name Description
@ -57,7 +57,7 @@ public:
void TestExtended();
UChar *ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status);
void executeTest(TestParams *);
void executeTest(TestParams *, UErrorCode &status);
void TestWordBreaks();
void TestWordBoundary();

View file

@ -33,11 +33,10 @@
# Temp debugging tests
<word>
<data>•Isn't<200></data>
<char>
<data>•\U00010020•\U00010000\N{COMBINING MACRON}•</data>
<sent>
<data>•\u00c0.•</data>
#<data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:"JAVA\u821c\u8165\u7fc8\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46".\u2029•</data>
########################################################################################
#
#