mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-8550 Dictionary Break Iterator, fixes to work with UTF-8 text.
X-SVN-Rev: 35724
This commit is contained in:
parent
68c893b2f1
commit
f71b9053d2
8 changed files with 726 additions and 487 deletions
File diff suppressed because it is too large
Load diff
|
@ -1,6 +1,6 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006,2012-2013, International Business Machines Corporation *
|
||||
* Copyright (C) 2006-2014, International Business Machines Corporation *
|
||||
* and others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -17,6 +17,7 @@
|
|||
U_NAMESPACE_BEGIN
|
||||
|
||||
class DictionaryMatcher;
|
||||
class Normalizer2;
|
||||
|
||||
/*******************************************************************
|
||||
* DictionaryBreakEngine
|
||||
|
@ -326,7 +327,8 @@ class CjkBreakEngine : public DictionaryBreakEngine {
|
|||
UnicodeSet fKatakanaWordSet;
|
||||
UnicodeSet fHiraganaWordSet;
|
||||
|
||||
DictionaryMatcher *fDictionary;
|
||||
DictionaryMatcher *fDictionary;
|
||||
const Normalizer2 *nfkcNorm2;
|
||||
|
||||
public:
|
||||
|
||||
|
|
|
@ -40,22 +40,31 @@ int32_t UCharsDictionaryMatcher::getType() const {
|
|||
return DictionaryData::TRIE_TYPE_UCHARS;
|
||||
}
|
||||
|
||||
int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
|
||||
int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
|
||||
int32_t *lengths, int32_t *cpLengths, int32_t *values,
|
||||
int32_t *prefix) const {
|
||||
|
||||
UCharsTrie uct(characters);
|
||||
UChar32 c = utext_next32(text);
|
||||
if (c < 0) {
|
||||
return 0;
|
||||
}
|
||||
UStringTrieResult result = uct.first(c);
|
||||
int32_t numChars = 1;
|
||||
count = 0;
|
||||
for (;;) {
|
||||
int32_t startingTextIndex = utext_getNativeIndex(text);
|
||||
int32_t wordCount = 0;
|
||||
int32_t codePointsMatched = 0;
|
||||
|
||||
for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
|
||||
UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
|
||||
int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
|
||||
codePointsMatched += 1;
|
||||
if (USTRINGTRIE_HAS_VALUE(result)) {
|
||||
if (count < limit) {
|
||||
if (wordCount < limit) {
|
||||
if (values != NULL) {
|
||||
values[count] = uct.getValue();
|
||||
values[wordCount] = uct.getValue();
|
||||
}
|
||||
lengths[count++] = numChars;
|
||||
if (lengths != NULL) {
|
||||
lengths[wordCount] = lengthMatched;
|
||||
}
|
||||
if (cpLengths != NULL) {
|
||||
cpLengths[wordCount] = codePointsMatched;
|
||||
}
|
||||
++wordCount;
|
||||
}
|
||||
if (result == USTRINGTRIE_FINAL_VALUE) {
|
||||
break;
|
||||
|
@ -64,20 +73,15 @@ int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t
|
|||
else if (result == USTRINGTRIE_NO_MATCH) {
|
||||
break;
|
||||
}
|
||||
|
||||
// TODO: why do we have a text limit if the UText knows its length?
|
||||
if (numChars >= maxLength) {
|
||||
if (lengthMatched >= maxLength) {
|
||||
break;
|
||||
}
|
||||
|
||||
c = utext_next32(text);
|
||||
if (c < 0) {
|
||||
break;
|
||||
}
|
||||
++numChars;
|
||||
result = uct.next(c);
|
||||
}
|
||||
return numChars;
|
||||
|
||||
if (prefix != NULL) {
|
||||
*prefix = codePointsMatched;
|
||||
}
|
||||
return wordCount;
|
||||
}
|
||||
|
||||
BytesDictionaryMatcher::~BytesDictionaryMatcher() {
|
||||
|
@ -104,22 +108,30 @@ int32_t BytesDictionaryMatcher::getType() const {
|
|||
return DictionaryData::TRIE_TYPE_BYTES;
|
||||
}
|
||||
|
||||
int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
|
||||
int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
|
||||
int32_t *lengths, int32_t *cpLengths, int32_t *values,
|
||||
int32_t *prefix) const {
|
||||
BytesTrie bt(characters);
|
||||
UChar32 c = utext_next32(text);
|
||||
if (c < 0) {
|
||||
return 0;
|
||||
}
|
||||
UStringTrieResult result = bt.first(transform(c));
|
||||
int32_t numChars = 1;
|
||||
count = 0;
|
||||
for (;;) {
|
||||
int32_t startingTextIndex = utext_getNativeIndex(text);
|
||||
int32_t wordCount = 0;
|
||||
int32_t codePointsMatched = 0;
|
||||
|
||||
for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
|
||||
UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
|
||||
int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
|
||||
codePointsMatched += 1;
|
||||
if (USTRINGTRIE_HAS_VALUE(result)) {
|
||||
if (count < limit) {
|
||||
if (wordCount < limit) {
|
||||
if (values != NULL) {
|
||||
values[count] = bt.getValue();
|
||||
values[wordCount] = bt.getValue();
|
||||
}
|
||||
lengths[count++] = numChars;
|
||||
if (lengths != NULL) {
|
||||
lengths[wordCount] = lengthMatched;
|
||||
}
|
||||
if (cpLengths != NULL) {
|
||||
cpLengths[wordCount] = codePointsMatched;
|
||||
}
|
||||
++wordCount;
|
||||
}
|
||||
if (result == USTRINGTRIE_FINAL_VALUE) {
|
||||
break;
|
||||
|
@ -128,20 +140,15 @@ int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t
|
|||
else if (result == USTRINGTRIE_NO_MATCH) {
|
||||
break;
|
||||
}
|
||||
|
||||
// TODO: why do we have a text limit if the UText knows its length?
|
||||
if (numChars >= maxLength) {
|
||||
if (lengthMatched >= maxLength) {
|
||||
break;
|
||||
}
|
||||
|
||||
c = utext_next32(text);
|
||||
if (c < 0) {
|
||||
break;
|
||||
}
|
||||
++numChars;
|
||||
result = bt.next(transform(c));
|
||||
}
|
||||
return numChars;
|
||||
|
||||
if (prefix != NULL) {
|
||||
*prefix = codePointsMatched;
|
||||
}
|
||||
return wordCount;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2013, International Business Machines
|
||||
* Copyright (C) 2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* dictionarydata.h
|
||||
|
@ -66,10 +66,32 @@ public:
|
|||
*/
|
||||
class U_COMMON_API DictionaryMatcher : public UMemory {
|
||||
public:
|
||||
DictionaryMatcher() {};
|
||||
virtual ~DictionaryMatcher();
|
||||
// this should emulate CompactTrieDictionary::matches()
|
||||
virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count,
|
||||
int32_t limit, int32_t *values = NULL) const = 0;
|
||||
/* @param text The text in which to look for matching words. Matching begins
|
||||
* at the current position of the UText.
|
||||
* @param maxLength The max length of match to consider. Units are the native indexing
|
||||
* units of the UText.
|
||||
* @param limit Capacity of output arrays, which is also the maximum number of
|
||||
* matching words to be found.
|
||||
* @param lengths output array, filled with the lengths of the matches, in order,
|
||||
* from shortest to longest. Lengths are in native indexing units
|
||||
* of the UText. May be NULL.
|
||||
* @param cpLengths output array, filled with the lengths of the matches, in order,
|
||||
* from shortest to longest. Lengths are the number of Unicode code points.
|
||||
* May be NULL.
|
||||
* @param values Output array, filled with the values associated with the words found.
|
||||
* May be NULL.
|
||||
* @param prefix Output parameter, the code point length of the prefix match, even if that
|
||||
* prefix didn't lead to a complete word. Will always be >= the cpLength
|
||||
* of the longest complete word matched. May be NULL.
|
||||
* @return Number of matching words found.
|
||||
*/
|
||||
virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
|
||||
int32_t *lengths, int32_t *cpLengths, int32_t *values,
|
||||
int32_t *prefix) const = 0;
|
||||
|
||||
/** @return DictionaryData::TRIE_TYPE_XYZ */
|
||||
virtual int32_t getType() const = 0;
|
||||
};
|
||||
|
@ -81,8 +103,9 @@ public:
|
|||
// The UDataMemory * will be closed on this object's destruction.
|
||||
UCharsDictionaryMatcher(const UChar *c, UDataMemory *f) : characters(c), file(f) { }
|
||||
virtual ~UCharsDictionaryMatcher();
|
||||
virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count,
|
||||
int32_t limit, int32_t *values = NULL) const;
|
||||
virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
|
||||
int32_t *lengths, int32_t *cpLengths, int32_t *values,
|
||||
int32_t *prefix) const;
|
||||
virtual int32_t getType() const;
|
||||
private:
|
||||
const UChar *characters;
|
||||
|
@ -98,8 +121,9 @@ public:
|
|||
BytesDictionaryMatcher(const char *c, int32_t t, UDataMemory *f)
|
||||
: characters(c), transformConstant(t), file(f) { }
|
||||
virtual ~BytesDictionaryMatcher();
|
||||
virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count,
|
||||
int32_t limit, int32_t *values = NULL) const;
|
||||
virtual int32_t matches(UText *text, int32_t maxLength, int32_t limit,
|
||||
int32_t *lengths, int32_t *cpLengths, int32_t *values,
|
||||
int32_t *prefix) const;
|
||||
virtual int32_t getType() const;
|
||||
private:
|
||||
UChar32 transform(UChar32 c) const;
|
||||
|
|
|
@ -701,6 +701,22 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
|||
* @return The position of the first break after the current position.
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::following(int32_t offset) {
|
||||
// if the offset passed in is already past the end of the text,
|
||||
// just return DONE; if it's before the beginning, return the
|
||||
// text's starting offset
|
||||
if (fText == NULL || offset >= utext_nativeLength(fText)) {
|
||||
last();
|
||||
return next();
|
||||
}
|
||||
else if (offset < 0) {
|
||||
return first();
|
||||
}
|
||||
|
||||
// Move requested offset to a code point start. It might be on a trail surrogate,
|
||||
// or on a trail byte if the input is UTF-8.
|
||||
utext_setNativeIndex(fText, offset);
|
||||
offset = utext_getNativeIndex(fText);
|
||||
|
||||
// if we have cached break positions and offset is in the range
|
||||
// covered by them, use them
|
||||
// TODO: could use binary search
|
||||
|
@ -722,20 +738,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
|
|||
}
|
||||
}
|
||||
|
||||
// if the offset passed in is already past the end of the text,
|
||||
// just return DONE; if it's before the beginning, return the
|
||||
// text's starting offset
|
||||
fLastRuleStatusIndex = 0;
|
||||
fLastStatusIndexValid = TRUE;
|
||||
if (fText == NULL || offset >= utext_nativeLength(fText)) {
|
||||
last();
|
||||
return next();
|
||||
}
|
||||
else if (offset < 0) {
|
||||
return first();
|
||||
}
|
||||
|
||||
// otherwise, set our internal iteration position (temporarily)
|
||||
// Set our internal iteration position (temporarily)
|
||||
// to the position passed in. If this is the _beginning_ position,
|
||||
// then we can just use next() to get our return value
|
||||
|
||||
|
@ -747,6 +750,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
|
|||
// move forward one codepoint to prepare for moving back to a
|
||||
// safe point.
|
||||
// this handles offset being between the two code units of a supplementary character
|
||||
// TODO: is this still needed, with move to code point boundary handled above?
|
||||
(void)UTEXT_NEXT32(fText);
|
||||
// handlePrevious will move most of the time to < 1 boundary away
|
||||
handlePrevious(fData->fSafeRevTable);
|
||||
|
@ -809,6 +813,21 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
|
|||
* @return The position of the last boundary before the starting position.
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
|
||||
// if the offset passed in is already past the end of the text,
|
||||
// return the last boundary, last(); if it's before the beginning, return the
|
||||
// text's starting offset
|
||||
if (fText == NULL || offset > utext_nativeLength(fText)) {
|
||||
return last();
|
||||
}
|
||||
else if (offset < 0) {
|
||||
return first();
|
||||
}
|
||||
|
||||
// Move requested offset to a code point start. It might be on a trail surrogate,
|
||||
// or on a trail byte if the input is UTF-8.
|
||||
utext_setNativeIndex(fText, offset);
|
||||
offset = utext_getNativeIndex(fText);
|
||||
|
||||
// if we have cached break positions and offset is in the range
|
||||
// covered by them, use them
|
||||
if (fCachedBreakPositions != NULL) {
|
||||
|
@ -834,17 +853,6 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
|
|||
}
|
||||
}
|
||||
|
||||
// if the offset passed in is already past the end of the text,
|
||||
// just return DONE; if it's before the beginning, return the
|
||||
// text's starting offset
|
||||
if (fText == NULL || offset > utext_nativeLength(fText)) {
|
||||
// return BreakIterator::DONE;
|
||||
return last();
|
||||
}
|
||||
else if (offset < 0) {
|
||||
return first();
|
||||
}
|
||||
|
||||
// if we start by updating the current iteration position to the
|
||||
// position specified by the caller, we can just use previous()
|
||||
// to carry out this operation
|
||||
|
@ -1578,30 +1586,6 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
|
|||
return (reverse ? startPos : endPos);
|
||||
}
|
||||
|
||||
// Bug 5532. The dictionary code will crash if the input text is UTF-8
|
||||
// because native indexes are different from UTF-16 indexes.
|
||||
// Temporary hack: skip dictionary lookup for UTF-8 encoded text.
|
||||
// It won't give the right breaks, but it's better than a crash.
|
||||
//
|
||||
// Check the type of the UText by checking its pFuncs field, which
|
||||
// is UText's function dispatch table. It will be the same for all
|
||||
// UTF-8 UTexts and different for any other UText type.
|
||||
//
|
||||
// We have no other type of UText available with non-UTF-16 native indexing.
|
||||
// This whole check will go away once the dictionary code is fixed.
|
||||
static const void *utext_utf8Funcs;
|
||||
if (utext_utf8Funcs == NULL) {
|
||||
// Cache the UTF-8 UText function pointer value.
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UText tempUText = UTEXT_INITIALIZER;
|
||||
utext_openUTF8(&tempUText, NULL, 0, &status);
|
||||
utext_utf8Funcs = tempUText.pFuncs;
|
||||
utext_close(&tempUText);
|
||||
}
|
||||
if (fText->pFuncs == utext_utf8Funcs) {
|
||||
return (reverse ? startPos : endPos);
|
||||
}
|
||||
|
||||
// Starting from the starting point, scan towards the proposed result,
|
||||
// looking for the first dictionary character (which may be the one
|
||||
// we're on, if we're starting in the middle of a range).
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1999-2013, International Business Machines Corporation and
|
||||
* Copyright (c) 1999-2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/************************************************************************
|
||||
|
@ -31,9 +31,9 @@
|
|||
#include "intltest.h"
|
||||
#include "rbbitst.h"
|
||||
#include <string.h>
|
||||
#include "charstr.h"
|
||||
#include "uvector.h"
|
||||
#include "uvectr32.h"
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "unicode/numfmt.h"
|
||||
|
@ -354,27 +354,19 @@ void RBBITest::TestStatusReturn() {
|
|||
}
|
||||
|
||||
|
||||
static void printStringBreaks(UnicodeString ustr, int expected[],
|
||||
int expectedcount)
|
||||
{
|
||||
static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
char name[100];
|
||||
printf("code alpha extend alphanum type word sent line name\n");
|
||||
int j;
|
||||
for (j = 0; j < ustr.length(); j ++) {
|
||||
if (expectedcount > 0) {
|
||||
int k;
|
||||
for (k = 0; k < expectedcount; k ++) {
|
||||
if (j == expected[k]) {
|
||||
printf("------------------------------------------------ %d\n",
|
||||
j);
|
||||
}
|
||||
}
|
||||
}
|
||||
UChar32 c = ustr.char32At(j);
|
||||
if (c > 0xffff) {
|
||||
j ++;
|
||||
int nextExpectedIndex = 0;
|
||||
utext_setNativeIndex(tstr, 0);
|
||||
for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
|
||||
if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
|
||||
printf("------------------------------------------------ %d\n", j);
|
||||
++nextExpectedIndex;
|
||||
}
|
||||
|
||||
UChar32 c = utext_next32(tstr);
|
||||
u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
|
||||
printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
|
||||
u_isUAlphabetic(c),
|
||||
|
@ -400,6 +392,19 @@ static void printStringBreaks(UnicodeString ustr, int expected[],
|
|||
}
|
||||
|
||||
|
||||
static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UText *tstr = NULL;
|
||||
tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
|
||||
return;
|
||||
}
|
||||
printStringBreaks(tstr, expected, expectedCount);
|
||||
utext_close(tstr);
|
||||
}
|
||||
|
||||
|
||||
void RBBITest::TestBug3818() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
|
@ -830,23 +835,173 @@ void RBBITest::TestBug5775() {
|
|||
//------------------------------------------------------------------------------
|
||||
|
||||
struct TestParams {
|
||||
BreakIterator *bi;
|
||||
UnicodeString dataToBreak;
|
||||
UVector32 *expectedBreaks;
|
||||
UVector32 *srcLine;
|
||||
BreakIterator *bi; // Break iterator is set while parsing test source.
|
||||
// Changed out whenever test data changes break type.
|
||||
|
||||
UnicodeString dataToBreak; // Data that is built up while parsing the test.
|
||||
UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
|
||||
UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
|
||||
UVector32 *srcCol;
|
||||
|
||||
UText *textToBreak; // UText, could be UTF8 or UTF16.
|
||||
UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
|
||||
CharString utf8String; // UTF-8 form of text to break.
|
||||
|
||||
TestParams(UErrorCode &status) : dataToBreak() {
|
||||
bi = NULL;
|
||||
expectedBreaks = new UVector32(status);
|
||||
srcLine = new UVector32(status);
|
||||
srcCol = new UVector32(status);
|
||||
textToBreak = NULL;
|
||||
textMap = new UVector32(status);
|
||||
}
|
||||
|
||||
~TestParams() {
|
||||
delete bi;
|
||||
delete expectedBreaks;
|
||||
delete srcLine;
|
||||
delete srcCol;
|
||||
utext_close(textToBreak);
|
||||
delete textMap;
|
||||
}
|
||||
|
||||
int32_t getSrcLine(int32_t bp);
|
||||
int32_t getExpectedBreak(int32_t bp);
|
||||
int32_t getSrcCol(int32_t bp);
|
||||
|
||||
void setUTF16(UErrorCode &status);
|
||||
void setUTF8(UErrorCode &status);
|
||||
};
|
||||
|
||||
void RBBITest::executeTest(TestParams *t) {
|
||||
// Append a UnicodeString to a CharString with UTF-8 encoding.
|
||||
// Substitute any invalid chars.
|
||||
// Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
|
||||
static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
int32_t utf8Length;
|
||||
u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
|
||||
src.getBuffer(), src.length(), // UTF-16 data
|
||||
0xfffd, NULL, // Substitution char, number of subs.
|
||||
&status);
|
||||
if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
|
||||
return;
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
int32_t capacity;
|
||||
char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
|
||||
u_strToUTF8WithSub(buffer, utf8Length, NULL,
|
||||
src.getBuffer(), src.length(),
|
||||
0xfffd, NULL, &status);
|
||||
dest.append(buffer, utf8Length, status);
|
||||
}
|
||||
|
||||
|
||||
void TestParams::setUTF16(UErrorCode &status) {
|
||||
textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
|
||||
textMap->removeAllElements();
|
||||
for (int32_t i=0; i<dataToBreak.length(); i++) {
|
||||
if (i == dataToBreak.getChar32Start(i)) {
|
||||
textMap->addElement(i, status);
|
||||
} else {
|
||||
textMap->addElement(-1, status);
|
||||
}
|
||||
}
|
||||
textMap->addElement(dataToBreak.length(), status);
|
||||
U_ASSERT(dataToBreak.length() + 1 == textMap->size());
|
||||
}
|
||||
|
||||
|
||||
void TestParams::setUTF8(UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
utf8String.clear();
|
||||
CharStringAppend(utf8String, dataToBreak, status);
|
||||
textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
textMap->removeAllElements();
|
||||
int32_t utf16Index = 0;
|
||||
for (;;) {
|
||||
textMap->addElement(utf16Index, status);
|
||||
UChar32 c32 = utext_current32(textToBreak);
|
||||
if (c32 < 0) {
|
||||
break;
|
||||
}
|
||||
utf16Index += U16_LENGTH(c32);
|
||||
utext_next32(textToBreak);
|
||||
while (textMap->size() < utext_getNativeIndex(textToBreak)) {
|
||||
textMap->addElement(-1, status);
|
||||
}
|
||||
}
|
||||
U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
|
||||
}
|
||||
|
||||
|
||||
int32_t TestParams::getSrcLine(int bp) {
|
||||
if (bp >= textMap->size()) {
|
||||
bp = textMap->size() - 1;
|
||||
}
|
||||
int32_t i = 0;
|
||||
for(; bp >= 0 ; --bp) {
|
||||
// Move to a character boundary if we are not on one already.
|
||||
i = textMap->elementAti(bp);
|
||||
if (i >= 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return srcLine->elementAti(i);
|
||||
}
|
||||
|
||||
|
||||
int32_t TestParams::getExpectedBreak(int bp) {
|
||||
if (bp >= textMap->size()) {
|
||||
return 0;
|
||||
}
|
||||
int32_t i = textMap->elementAti(bp);
|
||||
int32_t retVal = 0;
|
||||
if (i >= 0) {
|
||||
retVal = expectedBreaks->elementAti(i);
|
||||
}
|
||||
return retVal;
|
||||
}
|
||||
|
||||
|
||||
int32_t TestParams::getSrcCol(int bp) {
|
||||
if (bp >= textMap->size()) {
|
||||
bp = textMap->size() - 1;
|
||||
}
|
||||
int32_t i = 0;
|
||||
for(; bp >= 0; --bp) {
|
||||
// Move bp to a character boundary if we are not on one already.
|
||||
i = textMap->elementAti(bp);
|
||||
if (i >= 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return srcCol->elementAti(i);
|
||||
}
|
||||
|
||||
|
||||
void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
|
||||
int32_t bp;
|
||||
int32_t prevBP;
|
||||
int32_t i;
|
||||
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (t->bi == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
t->bi->setText(t->dataToBreak);
|
||||
t->bi->setText(t->textToBreak, status);
|
||||
//
|
||||
// Run the iterator forward
|
||||
//
|
||||
|
@ -855,93 +1010,92 @@ void RBBITest::executeTest(TestParams *t) {
|
|||
if (prevBP == bp) {
|
||||
// Fail for lack of forward progress.
|
||||
errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
|
||||
bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
|
||||
bp, t->getSrcLine(bp), t->getSrcCol(bp));
|
||||
break;
|
||||
}
|
||||
|
||||
// Check that there were we didn't miss an expected break between the last one
|
||||
// Check that we didn't miss an expected break between the last one
|
||||
// and this one.
|
||||
for (i=prevBP+1; i<bp; i++) {
|
||||
if (t->expectedBreaks->elementAti(i) != 0) {
|
||||
if (t->getExpectedBreak(i) != 0) {
|
||||
int expected[] = {0, i};
|
||||
printStringBreaks(t->dataToBreak, expected, 2);
|
||||
errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
|
||||
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
|
||||
i, t->getSrcLine(i), t->getSrcCol(i));
|
||||
}
|
||||
}
|
||||
|
||||
// Check that the break we did find was expected
|
||||
if (t->expectedBreaks->elementAti(bp) == 0) {
|
||||
if (t->getExpectedBreak(bp) == 0) {
|
||||
int expected[] = {0, bp};
|
||||
printStringBreaks(t->dataToBreak, expected, 2);
|
||||
printStringBreaks(t->textToBreak, expected, 2);
|
||||
errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
|
||||
bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
|
||||
bp, t->getSrcLine(bp), t->getSrcCol(bp));
|
||||
} else {
|
||||
// The break was expected.
|
||||
// Check that the {nnn} tag value is correct.
|
||||
int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
|
||||
int32_t expectedTagVal = t->getExpectedBreak(bp);
|
||||
if (expectedTagVal == -1) {
|
||||
expectedTagVal = 0;
|
||||
}
|
||||
int32_t line = t->srcLine->elementAti(bp);
|
||||
int32_t line = t->getSrcLine(bp);
|
||||
int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
|
||||
if (rs != expectedTagVal) {
|
||||
errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
|
||||
" Actual, Expected status = %4d, %4d",
|
||||
bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
|
||||
bp, line, t->getSrcCol(bp), rs, expectedTagVal);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
prevBP = bp;
|
||||
}
|
||||
|
||||
// Verify that there were no missed expected breaks after the last one found
|
||||
for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
|
||||
if (t->expectedBreaks->elementAti(i) != 0) {
|
||||
for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
|
||||
if (t->getExpectedBreak(i) != 0) {
|
||||
errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
|
||||
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
|
||||
i, t->getSrcLine(i), t->getSrcCol(i));
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Run the iterator backwards, verify that the same breaks are found.
|
||||
//
|
||||
prevBP = t->dataToBreak.length()+2; // start with a phony value for the last break pos seen.
|
||||
prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen.
|
||||
for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
|
||||
if (prevBP == bp) {
|
||||
// Fail for lack of progress.
|
||||
errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
|
||||
bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
|
||||
bp, t->getSrcLine(bp), t->getSrcCol(bp));
|
||||
break;
|
||||
}
|
||||
|
||||
// Check that there were we didn't miss an expected break between the last one
|
||||
// Check that we didn't miss an expected break between the last one
|
||||
// and this one. (getExpectedBreak() returns zero for positions past the end of the text.)
|
||||
for (i=prevBP-1; i>bp; i--) {
|
||||
if (t->expectedBreaks->elementAti(i) != 0) {
|
||||
errln("Reverse Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
|
||||
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
|
||||
if (t->getExpectedBreak(i) != 0) {
|
||||
errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
|
||||
i, t->getSrcLine(i), t->getSrcCol(i));
|
||||
}
|
||||
}
|
||||
|
||||
// Check that the break we did find was expected
|
||||
if (t->expectedBreaks->elementAti(bp) == 0) {
|
||||
if (t->getExpectedBreak(bp) == 0) {
|
||||
errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
|
||||
bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
|
||||
bp, t->getSrcLine(bp), t->getSrcCol(bp));
|
||||
} else {
|
||||
// The break was expected.
|
||||
// Check that the {nnn} tag value is correct.
|
||||
int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
|
||||
int32_t expectedTagVal = t->getExpectedBreak(bp);
|
||||
if (expectedTagVal == -1) {
|
||||
expectedTagVal = 0;
|
||||
}
|
||||
int line = t->srcLine->elementAti(bp);
|
||||
int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
|
||||
int line = t->getSrcLine(bp);
|
||||
int32_t rs = t->bi->getRuleStatus();
|
||||
if (rs != expectedTagVal) {
|
||||
errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
|
||||
" Actual, Expected status = %4d, %4d",
|
||||
bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
|
||||
bp, line, t->getSrcCol(bp), rs, expectedTagVal);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -950,30 +1104,30 @@ void RBBITest::executeTest(TestParams *t) {
|
|||
|
||||
// Verify that there were no missed breaks prior to the last one found
|
||||
for (i=prevBP-1; i>=0; i--) {
|
||||
if (t->expectedBreaks->elementAti(i) != 0) {
|
||||
if (t->getExpectedBreak(i) != 0) {
|
||||
errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
|
||||
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
|
||||
i, t->getSrcLine(i), t->getSrcCol(i));
|
||||
}
|
||||
}
|
||||
|
||||
// Check isBoundary()
|
||||
for (i=0; i<t->expectedBreaks->size(); i++) {
|
||||
UBool boundaryExpected = (t->expectedBreaks->elementAti(i) != 0);
|
||||
for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
|
||||
UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
|
||||
UBool boundaryFound = t->bi->isBoundary(i);
|
||||
if (boundaryExpected != boundaryFound) {
|
||||
errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
|
||||
" Expected, Actual= %s, %s",
|
||||
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i),
|
||||
i, t->getSrcLine(i), t->getSrcCol(i),
|
||||
boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
|
||||
}
|
||||
}
|
||||
|
||||
// Check following()
|
||||
for (i=0; i<t->expectedBreaks->size(); i++) {
|
||||
for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
|
||||
int32_t actualBreak = t->bi->following(i);
|
||||
int32_t expectedBreak = BreakIterator::DONE;
|
||||
for (int32_t j=i+1; j < t->expectedBreaks->size(); j++) {
|
||||
if (t->expectedBreaks->elementAti(j) != 0) {
|
||||
for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
|
||||
if (t->getExpectedBreak(j) != 0) {
|
||||
expectedBreak = j;
|
||||
break;
|
||||
}
|
||||
|
@ -981,17 +1135,24 @@ void RBBITest::executeTest(TestParams *t) {
|
|||
if (expectedBreak != actualBreak) {
|
||||
errln("following(%d) incorrect. File line,col= %4d,%4d\n"
|
||||
" Expected, Actual= %d, %d",
|
||||
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
|
||||
i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
|
||||
}
|
||||
}
|
||||
|
||||
// Check preceding()
|
||||
for (i=t->expectedBreaks->size(); i>=0; i--) {
|
||||
for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
|
||||
int32_t actualBreak = t->bi->preceding(i);
|
||||
int32_t expectedBreak = BreakIterator::DONE;
|
||||
|
||||
for (int32_t j=i-1; j >= 0; j--) {
|
||||
if (t->expectedBreaks->elementAti(j) != 0) {
|
||||
// For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
|
||||
// preceding(trailing byte) will return the index of some preceding code point,
|
||||
// not the lead byte of the current code point, even though that has a smaller index.
|
||||
// Therefore, start looking at the expected break data not at i-1, but at
|
||||
// the start of code point index - 1.
|
||||
utext_setNativeIndex(t->textToBreak, i);
|
||||
int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
|
||||
for (; j >= 0; j--) {
|
||||
if (t->getExpectedBreak(j) != 0) {
|
||||
expectedBreak = j;
|
||||
break;
|
||||
}
|
||||
|
@ -999,7 +1160,7 @@ void RBBITest::executeTest(TestParams *t) {
|
|||
if (expectedBreak != actualBreak) {
|
||||
errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
|
||||
" Expected, Actual= %d, %d",
|
||||
i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
|
||||
i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1011,11 +1172,7 @@ void RBBITest::TestExtended() {
|
|||
Locale locale("");
|
||||
|
||||
UnicodeString rules;
|
||||
TestParams tp;
|
||||
tp.bi = NULL;
|
||||
tp.expectedBreaks = new UVector32(status);
|
||||
tp.srcLine = new UVector32(status);
|
||||
tp.srcCol = new UVector32(status);
|
||||
TestParams tp(status);
|
||||
|
||||
RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
|
||||
if (U_FAILURE(status)) {
|
||||
|
@ -1190,7 +1347,16 @@ void RBBITest::TestExtended() {
|
|||
charIdx += 6;
|
||||
|
||||
// RUN THE TEST!
|
||||
executeTest(&tp);
|
||||
status = U_ZERO_ERROR;
|
||||
tp.setUTF16(status);
|
||||
executeTest(&tp, status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
// Run again, this time with UTF-8 text wrapped in a UText.
|
||||
status = U_ZERO_ERROR;
|
||||
tp.setUTF8(status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
executeTest(&tp, status);
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1356,10 +1522,6 @@ void RBBITest::TestExtended() {
|
|||
}
|
||||
|
||||
end_test:
|
||||
delete tp.bi;
|
||||
delete tp.expectedBreaks;
|
||||
delete tp.srcLine;
|
||||
delete tp.srcCol;
|
||||
delete [] testFile;
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*************************************************************************
|
||||
* Copyright (c) 1999-2013, International Business Machines
|
||||
* Copyright (c) 1999-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*************************************************************************
|
||||
* Date Name Description
|
||||
|
@ -57,7 +57,7 @@ public:
|
|||
|
||||
void TestExtended();
|
||||
UChar *ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status);
|
||||
void executeTest(TestParams *);
|
||||
void executeTest(TestParams *, UErrorCode &status);
|
||||
|
||||
void TestWordBreaks();
|
||||
void TestWordBoundary();
|
||||
|
|
7
icu4c/source/test/testdata/rbbitst.txt
vendored
7
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -33,11 +33,10 @@
|
|||
|
||||
|
||||
# Temp debugging tests
|
||||
<word>
|
||||
<data>•Isn't<200></data>
|
||||
<char>
|
||||
<data>•\U00010020•\U00010000\N{COMBINING MACRON}•</data>
|
||||
<sent>
|
||||
<data>•\u00c0.•</data>
|
||||
|
||||
#<data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:"JAVA\u821c\u8165\u7fc8\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46".\u2029•</data>
|
||||
########################################################################################
|
||||
#
|
||||
#
|
||||
|
|
Loading…
Add table
Reference in a new issue