mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-12914 Add ubrk_openBinaryRules, ubrk_getBinaryRules, and simple test
X-SVN-Rev: 39582
This commit is contained in:
parent
f28895cccc
commit
17683ea87f
3 changed files with 163 additions and 24 deletions
|
@ -20,6 +20,7 @@
|
|||
#include "unicode/rbbi.h"
|
||||
#include "rbbirb.h"
|
||||
#include "uassert.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
|
@ -119,7 +120,24 @@ ubrk_openRules( const UChar *rules,
|
|||
}
|
||||
|
||||
|
||||
|
||||
/*
 * Open a break iterator from precompiled binary rules, optionally attaching text.
 *
 *   binaryRules  Compiled rule data, passed unchanged to the
 *                RuleBasedBreakIterator constructor.
 *   rulesLength  Length of binaryRules, in bytes.
 *   text         Text to iterate over, or NULL to leave the text unset.
 *   textLength   Length argument forwarded to ubrk_setText() together with text.
 *   status       In/out error code; the function does nothing if it already
 *                indicates a failure on entry.
 *
 * Returns the new iterator, or NULL whenever *status indicates a failure.
 */
U_CAPI UBreakIterator* U_EXPORT2
ubrk_openBinaryRules(const uint8_t *binaryRules, uint32_t rulesLength,
                     const UChar *  text, int32_t textLength,
                     UErrorCode *   status)
{
    if (U_FAILURE(*status)) {
        return NULL;
    }
    LocalPointer<RuleBasedBreakIterator> lpRBBI(
            new RuleBasedBreakIterator(binaryRules, rulesLength, *status), *status);
    if (U_FAILURE(*status)) {
        // lpRBBI still owns the (possibly half-built) iterator and deletes it here.
        return NULL;
    }
    if (text != NULL) {
        // Keep ownership in lpRBBI while the text is attached. If ubrk_setText()
        // fails, the iterator is destroyed instead of being handed back alongside
        // a failed status, where a caller that only checks status would leak it.
        ubrk_setText(reinterpret_cast<UBreakIterator *>(lpRBBI.getAlias()),
                     text, textLength, status);
        if (U_FAILURE(*status)) {
            return NULL;
        }
    }
    // Success: release ownership to the caller.
    return reinterpret_cast<UBreakIterator *>(lpRBBI.orphan());
}
|
||||
|
||||
|
||||
U_CAPI UBreakIterator * U_EXPORT2
|
||||
|
@ -288,7 +306,8 @@ ubrk_getLocaleByType(const UBreakIterator *bi,
|
|||
}
|
||||
|
||||
|
||||
void ubrk_refreshUText(UBreakIterator *bi,
|
||||
U_CAPI void U_EXPORT2
|
||||
ubrk_refreshUText(UBreakIterator *bi,
|
||||
UText *text,
|
||||
UErrorCode *status)
|
||||
{
|
||||
|
@ -296,6 +315,34 @@ void ubrk_refreshUText(UBreakIterator *bi,
|
|||
bii->refreshInputText(text, *status);
|
||||
}
|
||||
|
||||
/*
 * Copy the compiled binary rules of a rule-based break iterator into a
 * caller-supplied buffer, or report their length when preflighting.
 *
 *   bi             Break iterator to query; it must be a RuleBasedBreakIterator,
 *                  otherwise *status is set to U_ILLEGAL_ARGUMENT_ERROR.
 *   binaryRules    Destination buffer, or NULL to preflight.
 *   rulesCapacity  Capacity of binaryRules in bytes; must be 0 when
 *                  binaryRules is NULL.
 *   status         In/out error code; the function does nothing if it already
 *                  indicates a failure on entry.
 *
 * Returns the byte length of the binary rules, or 0 on an argument error.
 * When a buffer is supplied but is too small, *status is set to
 * U_BUFFER_OVERFLOW_ERROR and the required length is still returned.
 */
U_CAPI uint32_t U_EXPORT2
ubrk_getBinaryRules(UBreakIterator *bi,
                    uint8_t *       binaryRules, uint32_t rulesCapacity,
                    UErrorCode *    status)
{
    if (U_FAILURE(*status)) {
        return 0;
    }
    if (binaryRules == NULL && rulesCapacity > 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    RuleBasedBreakIterator *rbbi =
            dynamic_cast<RuleBasedBreakIterator *>(reinterpret_cast<BreakIterator *>(bi));
    if (rbbi == NULL) {
        // NULL handle, or a break iterator that is not rule-based.
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    uint32_t rulesLength = 0;
    const uint8_t *returnedRules = rbbi->getBinaryRules(rulesLength);
    if (binaryRules != NULL) { // if not preflighting
        if (rulesLength > rulesCapacity) {
            *status = U_BUFFER_OVERFLOW_ERROR;
        } else if (rulesLength > 0) {
            // Skip the copy when there is nothing to copy: memcpy with an
            // invalid source pointer is undefined even for a zero length.
            uprv_memcpy(binaryRules, returnedRules, rulesLength);
        }
    }
    return rulesLength;
}
|
||||
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
|
|
@ -267,6 +267,34 @@ ubrk_openRules(const UChar *rules,
|
|||
UParseError *parseErr,
|
||||
UErrorCode *status);
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* Open a new UBreakIterator for locating text boundaries using precompiled binary rules.
|
||||
* Opening a UBreakIterator this way is substantially faster than using ubrk_openRules.
|
||||
* Binary rules may be obtained using ubrk_getBinaryRules. The compiled rules are not
|
||||
* compatible across different major versions of ICU, nor across platforms of different
|
||||
* endianness or different base character set family (ASCII vs EBCDIC).
|
||||
* @param binaryRules A set of compiled binary rules specifying the text breaking
|
||||
* conventions. Ownership of the storage containing the compiled
|
||||
* rules remains with the caller of this function. The compiled
|
||||
* rules must not be modified or deleted during the life of the
|
||||
* break iterator.
|
||||
* @param rulesLength The length of binaryRules in bytes.
|
||||
* @param text The text to be iterated over. May be null, in which case
|
||||
* ubrk_setText() is used to specify the text to be iterated.
|
||||
* @param textLength The number of characters in text, or -1 if null-terminated.
|
||||
* @param status Pointer to UErrorCode to receive any errors.
|
||||
* @return UBreakIterator for the specified rules.
|
||||
* @see ubrk_getBinaryRules
|
||||
* @draft ICU 59
|
||||
*/
|
||||
U_DRAFT UBreakIterator* U_EXPORT2
|
||||
ubrk_openBinaryRules(const uint8_t *binaryRules, uint32_t rulesLength,
|
||||
const UChar * text, int32_t textLength,
|
||||
UErrorCode * status);
|
||||
|
||||
#endif /* U_HIDE_DRAFT_API */
|
||||
|
||||
/**
|
||||
* Thread safe cloning operation
|
||||
* @param bi iterator to be cloned
|
||||
|
@ -566,6 +594,35 @@ ubrk_refreshUText(UBreakIterator *bi,
|
|||
UText *text,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* Get a compiled binary version of the rules specifying the behavior of a UBreakIterator.
|
||||
* The binary rules may be used with ubrk_openBinaryRules to open a new UBreakIterator
|
||||
* more quickly than using ubrk_openRules. The compiled rules are not compatible across
|
||||
* different major versions of ICU, nor across platforms of different endianness or
|
||||
* different base character set family (ASCII vs EBCDIC). Supports preflighting (with
|
||||
* binaryRules=NULL and rulesCapacity=0) to get the rules length without copying them to
|
||||
* the binaryRules buffer.
|
||||
* @param bi The break iterator to use.
|
||||
* @param binaryRules Buffer to receive the compiled binary rules; set to NULL for
|
||||
* preflighting.
|
||||
* @param rulesCapacity Capacity (in bytes) of the binaryRules buffer; set to 0 for
|
||||
* preflighting.
|
||||
* @param status Pointer to UErrorCode to receive any errors.
|
||||
* @return The actual byte length of the binary rules. If not preflighting
|
||||
* and this is larger than rulesCapacity, *status will be set to
|
||||
* an error.
|
||||
* @see ubrk_openBinaryRules
|
||||
* @draft ICU 59
|
||||
*/
|
||||
U_DRAFT uint32_t U_EXPORT2
|
||||
ubrk_getBinaryRules(UBreakIterator *bi,
|
||||
uint8_t * binaryRules, uint32_t rulesCapacity,
|
||||
UErrorCode * status);
|
||||
|
||||
#endif /* U_HIDE_DRAFT_API */
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
||||
#endif
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
* File CBIAPTS.C
|
||||
*
|
||||
* Modification History:
|
||||
* Name Description
|
||||
* Name Description
|
||||
* Madhu Katragadda Creation
|
||||
*********************************************************************************/
|
||||
/*C API TEST FOR BREAKITERATOR */
|
||||
|
@ -128,7 +128,7 @@ static UChar* toUChar(const char *src, void **freeHook) {
|
|||
if (dest == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
dest->link = (StringStruct*)(*freeHook);
|
||||
*freeHook = dest;
|
||||
return dest->str;
|
||||
|
@ -164,7 +164,7 @@ static void TestBreakIteratorCAPI()
|
|||
|
||||
/*test ubrk_open()*/
|
||||
log_verbose("\nTesting BreakIterator open functions\n");
|
||||
|
||||
|
||||
/* Open a word break iterator for the en_US locale */
|
||||
word = ubrk_open(UBRK_WORD, "en_US", text, u_strlen(text), &status);
|
||||
if(status == U_FILE_ACCESS_ERROR) {
|
||||
|
@ -176,7 +176,7 @@ static void TestBreakIteratorCAPI()
|
|||
else{
|
||||
log_verbose("PASS: Successfully opened word breakiterator\n");
|
||||
}
|
||||
|
||||
|
||||
sentence = ubrk_open(UBRK_SENTENCE, "en_US", text, u_strlen(text), &status);
|
||||
if(U_FAILURE(status)){
|
||||
log_err_status(status, "FAIL: Error in ubrk_open() for sentence breakiterator: %s\n", myErrorName(status));
|
||||
|
@ -185,7 +185,7 @@ static void TestBreakIteratorCAPI()
|
|||
else{
|
||||
log_verbose("PASS: Successfully opened sentence breakiterator\n");
|
||||
}
|
||||
|
||||
|
||||
line = ubrk_open(UBRK_LINE, "en_US", text, u_strlen(text), &status);
|
||||
if(U_FAILURE(status)){
|
||||
log_err("FAIL: Error in ubrk_open() for line breakiterator: %s\n", myErrorName(status));
|
||||
|
@ -194,7 +194,7 @@ static void TestBreakIteratorCAPI()
|
|||
else{
|
||||
log_verbose("PASS: Successfully opened line breakiterator\n");
|
||||
}
|
||||
|
||||
|
||||
character = ubrk_open(UBRK_CHARACTER, "en_US", text, u_strlen(text), &status);
|
||||
if(U_FAILURE(status)){
|
||||
log_err("FAIL: Error in ubrk_open() for character breakiterator: %s\n", myErrorName(status));
|
||||
|
@ -232,10 +232,10 @@ static void TestBreakIteratorCAPI()
|
|||
}
|
||||
for(i=0;i<count;i++)
|
||||
{
|
||||
log_verbose("%s\n", ubrk_getAvailable(i));
|
||||
log_verbose("%s\n", ubrk_getAvailable(i));
|
||||
if (ubrk_getAvailable(i) == 0)
|
||||
log_err("No locale for which breakiterator is applicable\n");
|
||||
else
|
||||
else
|
||||
log_verbose("A locale %s for which breakiterator is applicable\n",ubrk_getAvailable(i));
|
||||
}
|
||||
|
||||
|
@ -258,10 +258,10 @@ static void TestBreakIteratorCAPI()
|
|||
if(end!=49)
|
||||
log_err("error ubrk_last(word) did not return 49\n");
|
||||
log_verbose("last (word = %d\n", (int32_t)end);
|
||||
|
||||
|
||||
pos=ubrk_previous(word);
|
||||
log_verbose("%d %d\n", end, pos);
|
||||
|
||||
|
||||
pos=ubrk_previous(word);
|
||||
log_verbose("%d \n", pos);
|
||||
|
||||
|
@ -277,7 +277,7 @@ static void TestBreakIteratorCAPI()
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
log_verbose("\nTesting the functions for character\n");
|
||||
ubrk_first(character);
|
||||
pos = ubrk_following(character, 5);
|
||||
|
@ -292,7 +292,7 @@ static void TestBreakIteratorCAPI()
|
|||
if(pos!=21)
|
||||
log_err("error ubrk_preceding(character,22) did not return 21\n");
|
||||
log_verbose("preceding(character,22) = %d\n", (int32_t)pos);
|
||||
|
||||
|
||||
|
||||
log_verbose("\nTesting the functions for line\n");
|
||||
pos=ubrk_first(line);
|
||||
|
@ -304,7 +304,7 @@ static void TestBreakIteratorCAPI()
|
|||
log_err("error ubrk_following(line) did not return 22\n");
|
||||
log_verbose("following (line) = %d\n", (int32_t)pos);
|
||||
|
||||
|
||||
|
||||
log_verbose("\nTesting the functions for sentence\n");
|
||||
ubrk_first(sentence);
|
||||
pos = ubrk_current(sentence);
|
||||
|
@ -321,8 +321,8 @@ static void TestBreakIteratorCAPI()
|
|||
if (ubrk_first(sentence)!=ubrk_current(sentence)) {
|
||||
log_err("error in ubrk_first() or ubrk_current()\n");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*---- */
|
||||
/*Testing ubrk_open and ubrk_close()*/
|
||||
log_verbose("\nTesting open and close for us locale\n");
|
||||
|
@ -368,7 +368,7 @@ static void TestBreakIteratorCAPI()
|
|||
static void TestBreakIteratorSafeClone(void)
|
||||
{
|
||||
UChar text[51]; /* Keep this odd to test for 64-bit memory alignment */
|
||||
/* NOTE: This doesn't reliably force mis-alignment of following items. */
|
||||
/* NOTE: This doesn't reliably force mis-alignment of following items. */
|
||||
uint8_t buffer [CLONETEST_ITERATOR_COUNT] [U_BRK_SAFECLONE_BUFFERSIZE];
|
||||
int32_t bufferSize = U_BRK_SAFECLONE_BUFFERSIZE;
|
||||
|
||||
|
@ -526,7 +526,7 @@ static UBreakIterator * testOpenRules(char *rules) {
|
|||
bi = ubrk_openRules(ruleSourceU, -1, /* The rules */
|
||||
NULL, -1, /* The text to be iterated over. */
|
||||
&parseErr, &status);
|
||||
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
log_data_err("FAIL: ubrk_openRules: ICU Error \"%s\" (Are you missing data?)\n", u_errorName(status));
|
||||
bi = 0;
|
||||
|
@ -586,6 +586,41 @@ static void TestBreakIteratorRules() {
|
|||
}
|
||||
}
|
||||
|
||||
/* #12914 add basic sanity test for ubrk_getBinaryRules, ubrk_openBinaryRules */
|
||||
/* Underlying functionality checked in C++ rbbiapts.cpp TestRoundtripRules */
|
||||
status = U_ZERO_ERROR;
|
||||
uint32_t rulesLength = ubrk_getBinaryRules(bi, NULL, 0, &status); /* preflight */
|
||||
if (U_FAILURE(status)) {
|
||||
log_err("FAIL: ubrk_getBinaryRules preflight err: %s", u_errorName(status));
|
||||
} else {
|
||||
uint8_t* binaryRules = (uint8_t*)uprv_malloc(rulesLength);
|
||||
if (binaryRules == NULL) {
|
||||
log_err("FAIL: unable to malloc rules buffer, size %u", rulesLength);
|
||||
} else {
|
||||
rulesLength = ubrk_getBinaryRules(bi, binaryRules, rulesLength, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
log_err("FAIL: ubrk_getBinaryRules err: %s", u_errorName(status));
|
||||
} else {
|
||||
UBreakIterator* bi2 = ubrk_openBinaryRules(binaryRules, rulesLength, uData, -1, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
log_err("FAIL: ubrk_openBinaryRules err: %s", u_errorName(status));
|
||||
} else {
|
||||
int32_t pos2 = ubrk_first(bi2);
|
||||
pos = ubrk_first(bi);
|
||||
for (i=0; i<sizeof(breaks); i++) {
|
||||
if (pos2 != pos) {
|
||||
log_err("FAIL: interator from ubrk_openBinaryRules does not match original, get pos = %d instead of %d", pos2, pos);
|
||||
}
|
||||
pos2 = ubrk_next(bi2);
|
||||
pos = ubrk_next(bi);
|
||||
}
|
||||
ubrk_close(bi2);
|
||||
}
|
||||
}
|
||||
uprv_free(binaryRules);
|
||||
}
|
||||
}
|
||||
|
||||
freeToUCharStrings(&freeHook);
|
||||
ubrk_close(bi);
|
||||
}
|
||||
|
@ -809,7 +844,7 @@ static void TestBreakIteratorTailoring(void) {
|
|||
}
|
||||
if (!foundError && offsindx < testPtr->numOffsets) {
|
||||
log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got UBRK_DONE\n",
|
||||
testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx]);
|
||||
testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx]);
|
||||
}
|
||||
|
||||
foundError = FALSE;
|
||||
|
@ -826,7 +861,7 @@ static void TestBreakIteratorTailoring(void) {
|
|||
}
|
||||
if (!foundError && offsindx < testPtr->numOffsets) {
|
||||
log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got UBRK_DONE\n",
|
||||
testPtr->locale, testPtr->type, testPtr->offsRev[offsindx]);
|
||||
testPtr->locale, testPtr->type, testPtr->offsRev[offsindx]);
|
||||
}
|
||||
|
||||
ubrk_close(ubrkiter);
|
||||
|
@ -851,7 +886,7 @@ static void TestBreakIteratorRefresh(void) {
|
|||
UBreakIterator *bi;
|
||||
UText ut1 = UTEXT_INITIALIZER;
|
||||
UText ut2 = UTEXT_INITIALIZER;
|
||||
|
||||
|
||||
bi = ubrk_open(UBRK_LINE, "en_US", NULL, 0, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
if (U_FAILURE(status)) {
|
||||
|
@ -875,7 +910,7 @@ static void TestBreakIteratorRefresh(void) {
|
|||
TEST_ASSERT_SUCCESS(status);
|
||||
ubrk_refreshUText(bi, &ut2, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
|
||||
/* Find the following matches, now working in the moved string. */
|
||||
TEST_ASSERT(5 == ubrk_next(bi));
|
||||
TEST_ASSERT(7 == ubrk_next(bi));
|
||||
|
@ -994,7 +1029,7 @@ static const TestBISuppressionsItem testBISuppressionsItems[] = {
|
|||
|
||||
static void TestBreakIteratorSuppressions(void) {
|
||||
const TestBISuppressionsItem * itemPtr;
|
||||
|
||||
|
||||
for (itemPtr = testBISuppressionsItems; itemPtr->locale != NULL; itemPtr++) {
|
||||
UChar textU[kTextULenMax];
|
||||
int32_t textULen = u_unescape(itemPtr->text, textU, kTextULenMax);
|
||||
|
|
Loading…
Add table
Reference in a new issue