ICU-12914 Add ubrk_openBinaryRules, ubrk_getBinaryRules, and simple test

X-SVN-Rev: 39582
This commit is contained in:
Peter Edberg 2017-01-19 23:10:23 +00:00
parent f28895cccc
commit 17683ea87f
3 changed files with 163 additions and 24 deletions

View file

@ -20,6 +20,7 @@
#include "unicode/rbbi.h"
#include "rbbirb.h"
#include "uassert.h"
#include "cmemory.h"
U_NAMESPACE_USE
@ -119,7 +120,24 @@ ubrk_openRules( const UChar *rules,
}
/*
 * Open a break iterator from precompiled binary rules and, when text is
 * supplied, attach that text to it.
 *
 * binaryRules/rulesLength  the compiled rules and their length in bytes; they
 *                          are handed unchanged to the RuleBasedBreakIterator
 *                          constructor.
 * text/textLength          optional text to iterate over; when text is NULL no
 *                          text is set here.
 * status                   in/out error code. If it already indicates a failure
 *                          on entry, nothing is done.
 *
 * Returns the new iterator, or NULL whenever *status indicates a failure
 * (on entry, while constructing the iterator, or while setting the text).
 */
U_CAPI UBreakIterator* U_EXPORT2
ubrk_openBinaryRules(const uint8_t *binaryRules, uint32_t rulesLength,
                     const UChar *  text, int32_t textLength,
                     UErrorCode *   status)
{
    if (U_FAILURE(*status)) {
        return NULL;
    }
    LocalPointer<RuleBasedBreakIterator> lpRBBI(new RuleBasedBreakIterator(binaryRules, rulesLength, *status), *status);
    if (U_FAILURE(*status)) {
        return NULL;
    }
    if (text != NULL) {
        // Ownership stays with lpRBBI while the text is attached, so the
        // iterator is deleted (not leaked) if setting the text fails.
        ubrk_setText(reinterpret_cast<UBreakIterator *>(lpRBBI.getAlias()), text, textLength, status);
        if (U_FAILURE(*status)) {
            return NULL;
        }
    }
    // Success: release ownership to the caller.
    return reinterpret_cast<UBreakIterator *>(lpRBBI.orphan());
}
U_CAPI UBreakIterator * U_EXPORT2
@ -288,7 +306,8 @@ ubrk_getLocaleByType(const UBreakIterator *bi,
}
void ubrk_refreshUText(UBreakIterator *bi,
U_CAPI void U_EXPORT2
ubrk_refreshUText(UBreakIterator *bi,
UText *text,
UErrorCode *status)
{
@ -296,6 +315,34 @@ void ubrk_refreshUText(UBreakIterator *bi,
bii->refreshInputText(text, *status);
}
/*
 * Copy the compiled binary rules of a rule-based break iterator into a
 * caller-supplied buffer, or report their size only when binaryRules is NULL.
 *
 * Returns the byte length of the rules, or 0 when *status indicates a failure
 * on entry or an argument is rejected. When a buffer is supplied but is too
 * small, *status is set to U_BUFFER_OVERFLOW_ERROR, nothing is copied, and
 * the required length is still returned.
 */
U_CAPI uint32_t U_EXPORT2
ubrk_getBinaryRules(UBreakIterator *bi,
                    uint8_t *       binaryRules, uint32_t rulesCapacity,
                    UErrorCode *    status)
{
    if (U_FAILURE(*status)) {
        return 0;
    }

    // No destination buffer means the caller only wants the length.
    const bool preflighting = (binaryRules == NULL);
    if (preflighting && rulesCapacity != 0) {
        // A capacity without a buffer is contradictory.
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // Only rule-based iterators expose binary rules.
    RuleBasedBreakIterator *ruleBased =
        dynamic_cast<RuleBasedBreakIterator *>(reinterpret_cast<BreakIterator *>(bi));
    if (ruleBased == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    uint32_t byteCount;
    const uint8_t *compiledRules = ruleBased->getBinaryRules(byteCount);
    if (preflighting) {
        return byteCount;
    }

    if (byteCount <= rulesCapacity) {
        uprv_memcpy(binaryRules, compiledRules, byteCount);
    } else {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }
    return byteCount;
}
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View file

@ -267,6 +267,34 @@ ubrk_openRules(const UChar *rules,
UParseError *parseErr,
UErrorCode *status);
#ifndef U_HIDE_DRAFT_API
/**
* Open a new UBreakIterator for locating text boundaries using precompiled binary rules.
* Opening a UBreakIterator this way is substantially faster than using ubrk_openRules.
* Binary rules may be obtained using ubrk_getBinaryRules. The compiled rules are not
* compatible across different major versions of ICU, nor across platforms of different
* endianness or different base character set family (ASCII vs EBCDIC).
* @param binaryRules A set of compiled binary rules specifying the text breaking
* conventions. Ownership of the storage containing the compiled
* rules remains with the caller of this function. The compiled
* rules must not be modified or deleted during the life of the
* break iterator.
* @param rulesLength The length of binaryRules in bytes.
* @param text The text to be iterated over. May be null, in which case
* ubrk_setText() is used to specify the text to be iterated.
* @param textLength The number of characters in text, or -1 if null-terminated.
* @param status Pointer to UErrorCode to receive any errors. If it already
* indicates a failure on entry, the function returns NULL
* without doing anything.
* @return UBreakIterator for the specified rules, or NULL if the
* iterator could not be created.
* @see ubrk_getBinaryRules
* @draft ICU 59
*/
U_DRAFT UBreakIterator* U_EXPORT2
ubrk_openBinaryRules(const uint8_t *binaryRules, uint32_t rulesLength,
const UChar * text, int32_t textLength,
UErrorCode * status);
#endif /* U_HIDE_DRAFT_API */
/**
* Thread safe cloning operation
* @param bi iterator to be cloned
@ -566,6 +594,35 @@ ubrk_refreshUText(UBreakIterator *bi,
UText *text,
UErrorCode *status);
#ifndef U_HIDE_DRAFT_API
/**
* Get a compiled binary version of the rules specifying the behavior of a UBreakIterator.
* The binary rules may be used with ubrk_openBinaryRules to open a new UBreakIterator
* more quickly than using ubrk_openRules. The compiled rules are not compatible across
* different major versions of ICU, nor across platforms of different endianness or
* different base character set family (ASCII vs EBCDIC). Supports preflighting (with
* binaryRules=NULL and rulesCapacity=0) to get the rules length without copying them to
* the binaryRules buffer.
* @param bi The break iterator to use. It must be a rule-based break
* iterator; otherwise *status is set to U_ILLEGAL_ARGUMENT_ERROR.
* @param binaryRules Buffer to receive the compiled binary rules; set to NULL for
* preflighting.
* @param rulesCapacity Capacity (in bytes) of the binaryRules buffer; set to 0 for
* preflighting. A non-zero capacity with binaryRules=NULL sets
* *status to U_ILLEGAL_ARGUMENT_ERROR.
* @param status Pointer to UErrorCode to receive any errors.
* @return The actual byte length of the binary rules, or 0 if an
* argument is rejected or *status indicates a failure on entry.
* If not preflighting and the length is larger than rulesCapacity,
* *status will be set to U_BUFFER_OVERFLOW_ERROR and nothing is
* copied to binaryRules.
* @see ubrk_openBinaryRules
* @draft ICU 59
*/
U_DRAFT uint32_t U_EXPORT2
ubrk_getBinaryRules(UBreakIterator *bi,
uint8_t * binaryRules, uint32_t rulesCapacity,
UErrorCode * status);
#endif /* U_HIDE_DRAFT_API */
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif

View file

@ -10,7 +10,7 @@
* File CBIAPTS.C
*
* Modification History:
* Name Description
* Name Description
* Madhu Katragadda Creation
*********************************************************************************/
/*C API TEST FOR BREAKITERATOR */
@ -128,7 +128,7 @@ static UChar* toUChar(const char *src, void **freeHook) {
if (dest == NULL) {
return NULL;
}
dest->link = (StringStruct*)(*freeHook);
*freeHook = dest;
return dest->str;
@ -164,7 +164,7 @@ static void TestBreakIteratorCAPI()
/*test ubrk_open()*/
log_verbose("\nTesting BreakIterator open functions\n");
/* Open the word break iterator for the en_US locale */
word = ubrk_open(UBRK_WORD, "en_US", text, u_strlen(text), &status);
if(status == U_FILE_ACCESS_ERROR) {
@ -176,7 +176,7 @@ static void TestBreakIteratorCAPI()
else{
log_verbose("PASS: Successfully opened word breakiterator\n");
}
sentence = ubrk_open(UBRK_SENTENCE, "en_US", text, u_strlen(text), &status);
if(U_FAILURE(status)){
log_err_status(status, "FAIL: Error in ubrk_open() for sentence breakiterator: %s\n", myErrorName(status));
@ -185,7 +185,7 @@ static void TestBreakIteratorCAPI()
else{
log_verbose("PASS: Successfully opened sentence breakiterator\n");
}
line = ubrk_open(UBRK_LINE, "en_US", text, u_strlen(text), &status);
if(U_FAILURE(status)){
log_err("FAIL: Error in ubrk_open() for line breakiterator: %s\n", myErrorName(status));
@ -194,7 +194,7 @@ static void TestBreakIteratorCAPI()
else{
log_verbose("PASS: Successfully opened line breakiterator\n");
}
character = ubrk_open(UBRK_CHARACTER, "en_US", text, u_strlen(text), &status);
if(U_FAILURE(status)){
log_err("FAIL: Error in ubrk_open() for character breakiterator: %s\n", myErrorName(status));
@ -232,10 +232,10 @@ static void TestBreakIteratorCAPI()
}
for(i=0;i<count;i++)
{
log_verbose("%s\n", ubrk_getAvailable(i));
log_verbose("%s\n", ubrk_getAvailable(i));
if (ubrk_getAvailable(i) == 0)
log_err("No locale for which breakiterator is applicable\n");
else
else
log_verbose("A locale %s for which breakiterator is applicable\n",ubrk_getAvailable(i));
}
@ -258,10 +258,10 @@ static void TestBreakIteratorCAPI()
if(end!=49)
log_err("error ubrk_last(word) did not return 49\n");
log_verbose("last (word = %d\n", (int32_t)end);
pos=ubrk_previous(word);
log_verbose("%d %d\n", end, pos);
pos=ubrk_previous(word);
log_verbose("%d \n", pos);
@ -277,7 +277,7 @@ static void TestBreakIteratorCAPI()
}
log_verbose("\nTesting the functions for character\n");
ubrk_first(character);
pos = ubrk_following(character, 5);
@ -292,7 +292,7 @@ static void TestBreakIteratorCAPI()
if(pos!=21)
log_err("error ubrk_preceding(character,22) did not return 21\n");
log_verbose("preceding(character,22) = %d\n", (int32_t)pos);
log_verbose("\nTesting the functions for line\n");
pos=ubrk_first(line);
@ -304,7 +304,7 @@ static void TestBreakIteratorCAPI()
log_err("error ubrk_following(line) did not return 22\n");
log_verbose("following (line) = %d\n", (int32_t)pos);
log_verbose("\nTesting the functions for sentence\n");
ubrk_first(sentence);
pos = ubrk_current(sentence);
@ -321,8 +321,8 @@ static void TestBreakIteratorCAPI()
if (ubrk_first(sentence)!=ubrk_current(sentence)) {
log_err("error in ubrk_first() or ubrk_current()\n");
}
/*---- */
/*Testing ubrk_open and ubrk_close()*/
log_verbose("\nTesting open and close for us locale\n");
@ -368,7 +368,7 @@ static void TestBreakIteratorCAPI()
static void TestBreakIteratorSafeClone(void)
{
UChar text[51]; /* Keep this odd to test for 64-bit memory alignment */
/* NOTE: This doesn't reliably force mis-alignment of following items. */
/* NOTE: This doesn't reliably force mis-alignment of following items. */
uint8_t buffer [CLONETEST_ITERATOR_COUNT] [U_BRK_SAFECLONE_BUFFERSIZE];
int32_t bufferSize = U_BRK_SAFECLONE_BUFFERSIZE;
@ -526,7 +526,7 @@ static UBreakIterator * testOpenRules(char *rules) {
bi = ubrk_openRules(ruleSourceU, -1, /* The rules */
NULL, -1, /* The text to be iterated over. */
&parseErr, &status);
if (U_FAILURE(status)) {
log_data_err("FAIL: ubrk_openRules: ICU Error \"%s\" (Are you missing data?)\n", u_errorName(status));
bi = 0;
@ -586,6 +586,41 @@ static void TestBreakIteratorRules() {
}
}
/* #12914 add basic sanity test for ubrk_getBinaryRules, ubrk_openBinaryRules */
/* Underlying functionality checked in C++ rbbiapts.cpp TestRoundtripRules */
status = U_ZERO_ERROR;
uint32_t rulesLength = ubrk_getBinaryRules(bi, NULL, 0, &status); /* preflight */
if (U_FAILURE(status)) {
log_err("FAIL: ubrk_getBinaryRules preflight err: %s", u_errorName(status));
} else {
uint8_t* binaryRules = (uint8_t*)uprv_malloc(rulesLength);
if (binaryRules == NULL) {
log_err("FAIL: unable to malloc rules buffer, size %u", rulesLength);
} else {
rulesLength = ubrk_getBinaryRules(bi, binaryRules, rulesLength, &status);
if (U_FAILURE(status)) {
log_err("FAIL: ubrk_getBinaryRules err: %s", u_errorName(status));
} else {
UBreakIterator* bi2 = ubrk_openBinaryRules(binaryRules, rulesLength, uData, -1, &status);
if (U_FAILURE(status)) {
log_err("FAIL: ubrk_openBinaryRules err: %s", u_errorName(status));
} else {
int32_t pos2 = ubrk_first(bi2);
pos = ubrk_first(bi);
for (i=0; i<sizeof(breaks); i++) {
if (pos2 != pos) {
log_err("FAIL: interator from ubrk_openBinaryRules does not match original, get pos = %d instead of %d", pos2, pos);
}
pos2 = ubrk_next(bi2);
pos = ubrk_next(bi);
}
ubrk_close(bi2);
}
}
uprv_free(binaryRules);
}
}
freeToUCharStrings(&freeHook);
ubrk_close(bi);
}
@ -809,7 +844,7 @@ static void TestBreakIteratorTailoring(void) {
}
if (!foundError && offsindx < testPtr->numOffsets) {
log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got UBRK_DONE\n",
testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx]);
testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx]);
}
foundError = FALSE;
@ -826,7 +861,7 @@ static void TestBreakIteratorTailoring(void) {
}
if (!foundError && offsindx < testPtr->numOffsets) {
log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got UBRK_DONE\n",
testPtr->locale, testPtr->type, testPtr->offsRev[offsindx]);
testPtr->locale, testPtr->type, testPtr->offsRev[offsindx]);
}
ubrk_close(ubrkiter);
@ -851,7 +886,7 @@ static void TestBreakIteratorRefresh(void) {
UBreakIterator *bi;
UText ut1 = UTEXT_INITIALIZER;
UText ut2 = UTEXT_INITIALIZER;
bi = ubrk_open(UBRK_LINE, "en_US", NULL, 0, &status);
TEST_ASSERT_SUCCESS(status);
if (U_FAILURE(status)) {
@ -875,7 +910,7 @@ static void TestBreakIteratorRefresh(void) {
TEST_ASSERT_SUCCESS(status);
ubrk_refreshUText(bi, &ut2, &status);
TEST_ASSERT_SUCCESS(status);
/* Find the following matches, now working in the moved string. */
TEST_ASSERT(5 == ubrk_next(bi));
TEST_ASSERT(7 == ubrk_next(bi));
@ -994,7 +1029,7 @@ static const TestBISuppressionsItem testBISuppressionsItems[] = {
static void TestBreakIteratorSuppressions(void) {
const TestBISuppressionsItem * itemPtr;
for (itemPtr = testBISuppressionsItems; itemPtr->locale != NULL; itemPtr++) {
UChar textU[kTextULenMax];
int32_t textULen = u_unescape(itemPtr->text, textU, kTextULenMax);