diff --git a/icu4c/source/common/ubrk.cpp b/icu4c/source/common/ubrk.cpp index b02c966b107..925c1e90a91 100644 --- a/icu4c/source/common/ubrk.cpp +++ b/icu4c/source/common/ubrk.cpp @@ -20,6 +20,7 @@ #include "unicode/rbbi.h" #include "rbbirb.h" #include "uassert.h" +#include "cmemory.h" U_NAMESPACE_USE @@ -119,7 +120,24 @@ ubrk_openRules( const UChar *rules, } - +U_CAPI UBreakIterator* U_EXPORT2 +ubrk_openBinaryRules(const uint8_t *binaryRules, uint32_t rulesLength, + const UChar * text, int32_t textLength, + UErrorCode * status) +{ + if (U_FAILURE(*status)) { + return NULL; + } + LocalPointer lpRBBI(new RuleBasedBreakIterator(binaryRules, rulesLength, *status), *status); + if (U_FAILURE(*status)) { + return NULL; + } + UBreakIterator *uBI = reinterpret_cast(lpRBBI.orphan()); + if (text != NULL) { + ubrk_setText(uBI, text, textLength, status); + } + return uBI; +} U_CAPI UBreakIterator * U_EXPORT2 @@ -288,7 +306,8 @@ ubrk_getLocaleByType(const UBreakIterator *bi, } -void ubrk_refreshUText(UBreakIterator *bi, +U_CAPI void U_EXPORT2 +ubrk_refreshUText(UBreakIterator *bi, UText *text, UErrorCode *status) { @@ -296,6 +315,34 @@ void ubrk_refreshUText(UBreakIterator *bi, bii->refreshInputText(text, *status); } +U_CAPI uint32_t U_EXPORT2 +ubrk_getBinaryRules(UBreakIterator *bi, + uint8_t * binaryRules, uint32_t rulesCapacity, + UErrorCode * status) +{ + if (U_FAILURE(*status)) { + return 0; + } + if (binaryRules == NULL && rulesCapacity > 0) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + RuleBasedBreakIterator* rbbi; + if ((rbbi = dynamic_cast(reinterpret_cast(bi))) == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + uint32_t rulesLength; + const uint8_t * returnedRules = rbbi->getBinaryRules(rulesLength); + if (binaryRules != NULL) { // if not preflighting + if (rulesLength > rulesCapacity) { + *status = U_BUFFER_OVERFLOW_ERROR; + } else { + uprv_memcpy(binaryRules, returnedRules, rulesLength); + } + } + return rulesLength; +} #endif /* #if 
!UCONFIG_NO_BREAK_ITERATION */ diff --git a/icu4c/source/common/unicode/ubrk.h b/icu4c/source/common/unicode/ubrk.h index f43943ed1ab..1c8f62a17c8 100644 --- a/icu4c/source/common/unicode/ubrk.h +++ b/icu4c/source/common/unicode/ubrk.h @@ -267,6 +267,34 @@ ubrk_openRules(const UChar *rules, UParseError *parseErr, UErrorCode *status); +#ifndef U_HIDE_DRAFT_API +/** + * Open a new UBreakIterator for locating text boundaries using precompiled binary rules. + * Opening a UBreakIterator this way is substantially faster than using ubrk_openRules. + * Binary rules may be obtained using ubrk_getBinaryRules. The compiled rules are not + * compatible across different major versions of ICU, nor across platforms of different + * endianness or different base character set family (ASCII vs EBCDIC). + * @param binaryRules A set of compiled binary rules specifying the text breaking + * conventions. Ownership of the storage containing the compiled + * rules remains with the caller of this function. The compiled + * rules must not be modified or deleted during the life of the + * break iterator. + * @param rulesLength The length of binaryRules in bytes. + * @param text The text to be iterated over. May be null, in which case + * ubrk_setText() is used to specify the text to be iterated. + * @param textLength The number of characters in text, or -1 if null-terminated. + * @param status Pointer to UErrorCode to receive any errors. + * @return UBreakIterator for the specified rules. 
+ * @see ubrk_getBinaryRules + * @draft ICU 59 + */ +U_DRAFT UBreakIterator* U_EXPORT2 +ubrk_openBinaryRules(const uint8_t *binaryRules, uint32_t rulesLength, + const UChar * text, int32_t textLength, + UErrorCode * status); + +#endif /* U_HIDE_DRAFT_API */ + /** * Thread safe cloning operation * @param bi iterator to be cloned @@ -566,6 +594,35 @@ ubrk_refreshUText(UBreakIterator *bi, UText *text, UErrorCode *status); + +#ifndef U_HIDE_DRAFT_API +/** + * Get a compiled binary version of the rules specifying the behavior of a UBreakIterator. + * The binary rules may be used with ubrk_openBinaryRules to open a new UBreakIterator + * more quickly than using ubrk_openRules. The compiled rules are not compatible across + * different major versions of ICU, nor across platforms of different endianness or + * different base character set family (ASCII vs EBCDIC). Supports preflighting (with + * binaryRules=NULL and rulesCapacity=0) to get the rules length without copying them to + * the binaryRules buffer. + * @param bi The break iterator to use. + * @param binaryRules Buffer to receive the compiled binary rules; set to NULL for + * preflighting. + * @param rulesCapacity Capacity (in bytes) of the binaryRules buffer; set to 0 for + * preflighting. + * @param status Pointer to UErrorCode to receive any errors. + * @return The actual byte length of the binary rules. If not preflighting + * and this is larger than rulesCapacity, *status will be set to + * U_BUFFER_OVERFLOW_ERROR. 
+ * @see ubrk_openBinaryRules + * @draft ICU 59 + */ +U_DRAFT uint32_t U_EXPORT2 +ubrk_getBinaryRules(UBreakIterator *bi, + uint8_t * binaryRules, uint32_t rulesCapacity, + UErrorCode * status); + +#endif /* U_HIDE_DRAFT_API */ + #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ #endif diff --git a/icu4c/source/test/cintltst/cbiapts.c b/icu4c/source/test/cintltst/cbiapts.c index 398db3c7fe3..7ad1924ca89 100644 --- a/icu4c/source/test/cintltst/cbiapts.c +++ b/icu4c/source/test/cintltst/cbiapts.c @@ -10,7 +10,7 @@ * File CBIAPTS.C * * Modification History: -* Name Description +* Name Description * Madhu Katragadda Creation *********************************************************************************/ /*C API TEST FOR BREAKITERATOR */ @@ -128,7 +128,7 @@ static UChar* toUChar(const char *src, void **freeHook) { if (dest == NULL) { return NULL; } - + dest->link = (StringStruct*)(*freeHook); *freeHook = dest; return dest->str; @@ -164,7 +164,7 @@ static void TestBreakIteratorCAPI() /*test ubrk_open()*/ log_verbose("\nTesting BreakIterator open functions\n"); - + /* Use french for fun */ word = ubrk_open(UBRK_WORD, "en_US", text, u_strlen(text), &status); if(status == U_FILE_ACCESS_ERROR) { @@ -176,7 +176,7 @@ static void TestBreakIteratorCAPI() else{ log_verbose("PASS: Successfully opened word breakiterator\n"); } - + sentence = ubrk_open(UBRK_SENTENCE, "en_US", text, u_strlen(text), &status); if(U_FAILURE(status)){ log_err_status(status, "FAIL: Error in ubrk_open() for sentence breakiterator: %s\n", myErrorName(status)); @@ -185,7 +185,7 @@ static void TestBreakIteratorCAPI() else{ log_verbose("PASS: Successfully opened sentence breakiterator\n"); } - + line = ubrk_open(UBRK_LINE, "en_US", text, u_strlen(text), &status); if(U_FAILURE(status)){ log_err("FAIL: Error in ubrk_open() for line breakiterator: %s\n", myErrorName(status)); @@ -194,7 +194,7 @@ static void TestBreakIteratorCAPI() else{ log_verbose("PASS: Successfully opened line breakiterator\n"); } - + character 
= ubrk_open(UBRK_CHARACTER, "en_US", text, u_strlen(text), &status); if(U_FAILURE(status)){ log_err("FAIL: Error in ubrk_open() for character breakiterator: %s\n", myErrorName(status)); @@ -232,10 +232,10 @@ static void TestBreakIteratorCAPI() } for(i=0;inumOffsets) { log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got UBRK_DONE\n", - testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx]); + testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx]); } foundError = FALSE; @@ -826,7 +861,7 @@ static void TestBreakIteratorTailoring(void) { } if (!foundError && offsindx < testPtr->numOffsets) { log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got UBRK_DONE\n", - testPtr->locale, testPtr->type, testPtr->offsRev[offsindx]); + testPtr->locale, testPtr->type, testPtr->offsRev[offsindx]); } ubrk_close(ubrkiter); @@ -851,7 +886,7 @@ static void TestBreakIteratorRefresh(void) { UBreakIterator *bi; UText ut1 = UTEXT_INITIALIZER; UText ut2 = UTEXT_INITIALIZER; - + bi = ubrk_open(UBRK_LINE, "en_US", NULL, 0, &status); TEST_ASSERT_SUCCESS(status); if (U_FAILURE(status)) { @@ -875,7 +910,7 @@ static void TestBreakIteratorRefresh(void) { TEST_ASSERT_SUCCESS(status); ubrk_refreshUText(bi, &ut2, &status); TEST_ASSERT_SUCCESS(status); - + /* Find the following matches, now working in the moved string. */ TEST_ASSERT(5 == ubrk_next(bi)); TEST_ASSERT(7 == ubrk_next(bi)); @@ -994,7 +1029,7 @@ static const TestBISuppressionsItem testBISuppressionsItems[] = { static void TestBreakIteratorSuppressions(void) { const TestBISuppressionsItem * itemPtr; - + for (itemPtr = testBISuppressionsItems; itemPtr->locale != NULL; itemPtr++) { UChar textU[kTextULenMax]; int32_t textULen = u_unescape(itemPtr->text, textU, kTextULenMax);