ICU-22342 Implement ExternalBreakEngineAPI

ICU-22342 Fix comments
This commit is contained in:
Frank Tang 2023-08-18 17:23:09 +00:00 committed by Frank Yung-Fong Tang
parent 2207e2c3df
commit 02d5e71903
13 changed files with 484 additions and 64 deletions

View file

@ -21,6 +21,7 @@
#include "unicode/uscript.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "unicode/rbbi.h"
#include "brkeng.h"
#include "cmemory.h"
@ -70,19 +71,21 @@ UnhandledEngine::~UnhandledEngine() {
}
UBool
UnhandledEngine::handles(UChar32 c) const {
UnhandledEngine::handles(UChar32 c, const char* locale) const {
(void)locale; // Unused
return fHandled && fHandled->contains(c);
}
int32_t
UnhandledEngine::findBreaks( UText *text,
int32_t /* startPos */,
int32_t startPos,
int32_t endPos,
UVector32 &/*foundBreaks*/,
UBool /* isPhraseBreaking */,
UErrorCode &status) const {
if (U_FAILURE(status)) return 0;
UChar32 c = utext_current32(text);
utext_setNativeIndex(text, startPos);
UChar32 c = utext_current32(text);
while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
utext_next32(text); // TODO: recast loop to work with post-increment operations.
c = utext_current32(text);
@ -120,41 +123,39 @@ ICULanguageBreakFactory::~ICULanguageBreakFactory() {
}
}
U_NAMESPACE_END
U_CDECL_BEGIN
static void U_CALLCONV _deleteEngine(void *obj) {
delete (const icu::LanguageBreakEngine *) obj;
void ICULanguageBreakFactory::ensureEngines(UErrorCode& status) {
static UMutex gBreakEngineMutex;
Mutex m(&gBreakEngineMutex);
if (fEngines == nullptr) {
LocalPointer<UStack> engines(new UStack(uprv_deleteUObject, nullptr, status), status);
if (U_SUCCESS(status)) {
fEngines = engines.orphan();
}
}
}
U_CDECL_END
U_NAMESPACE_BEGIN
const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c) {
ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) {
const LanguageBreakEngine *lbe = nullptr;
UErrorCode status = U_ZERO_ERROR;
ensureEngines(status);
if (U_FAILURE(status) ) {
// Note: no way to return error code to caller.
return nullptr;
}
static UMutex gBreakEngineMutex;
Mutex m(&gBreakEngineMutex);
if (fEngines == nullptr) {
LocalPointer<UStack> engines(new UStack(_deleteEngine, nullptr, status), status);
if (U_FAILURE(status) ) {
// Note: no way to return error code to caller.
return nullptr;
}
fEngines = engines.orphan();
} else {
int32_t i = fEngines->size();
while (--i >= 0) {
lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
if (lbe != nullptr && lbe->handles(c)) {
return lbe;
}
int32_t i = fEngines->size();
while (--i >= 0) {
lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
if (lbe != nullptr && lbe->handles(c, locale)) {
return lbe;
}
}
// We didn't find an engine. Create one.
lbe = loadEngineFor(c);
lbe = loadEngineFor(c, locale);
if (lbe != nullptr) {
fEngines->push((void *)lbe, status);
}
@ -162,7 +163,7 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c) {
}
const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
UErrorCode status = U_ZERO_ERROR;
UScriptCode code = uscript_getScript(c, &status);
if (U_SUCCESS(status)) {
@ -299,6 +300,70 @@ ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
return nullptr;
}
/**
 * Adopt an external break engine, wrap it in a BreakEngineWrapper and push
 * the wrapper onto this factory's engine stack.
 *
 * @param external The engine to adopt. It is owned by this call from the first
 *                 statement on, and is released if any later step fails.
 * @param status   In/out error code. Nothing is added when it reports a failure.
 */
void ICULanguageBreakFactory::addExternalEngine(
        ExternalBreakEngine* external, UErrorCode& status) {
    // Take ownership first so that every early return below releases the engine.
    LocalPointer<ExternalBreakEngine> engine(external, status);
    ensureEngines(status);
    if (U_FAILURE(status)) {
        // fEngines is only assigned when ensureEngines() succeeds; pushing onto
        // it after a failure would dereference a null pointer.
        return;
    }
    LocalPointer<BreakEngineWrapper> wrapper(
        new BreakEngineWrapper(engine.orphan(), status), status);
    if (U_FAILURE(status)) {
        // The wrapper (and the engine it adopted) is released by LocalPointer.
        return;
    }
    // NOTE(review): this function-local static is a different object from the
    // same-named mutexes declared inside ensureEngines() and getEngineFor(), so
    // it does not exclude those functions. Confirm whether a single shared
    // mutex was intended.
    static UMutex gBreakEngineMutex;
    Mutex m(&gBreakEngineMutex);
    fEngines->push(wrapper.getAlias(), status);
    // Give up ownership only after the push has been attempted.
    wrapper.orphan();
}
// Stores the external engine in the owning `delegate` member.
BreakEngineWrapper::BreakEngineWrapper(ExternalBreakEngine* engine, UErrorCode &status)
    : delegate(engine, status)
{
}
// Nothing to do explicitly: the `delegate` member releases the engine.
BreakEngineWrapper::~BreakEngineWrapper() = default;
// Asks the external engine whether it is meant for character `c` in `locale`.
UBool BreakEngineWrapper::handles(UChar32 c, const char* locale) const {
    const bool accepted = delegate->isFor(c, locale);
    return accepted;
}
/**
 * Find the run of text starting at startPos that the external engine handles,
 * ask the engine for the breaks inside that run and append them to foundBreaks.
 *
 * @param text        The text to scan. On success its index is left at the end
 *                    of the handled run.
 * @param startPos    Native index at which the run starts.
 * @param endPos      Native index at which scanning stops at the latest.
 * @param foundBreaks Receives the break positions, appended after any
 *                    entries it already holds.
 * @param status      In/out error code.
 * @return The number of breaks appended, or 0 on failure.
 */
int32_t BreakEngineWrapper::findBreaks(
        UText *text,
        int32_t startPos,
        int32_t endPos,
        UVector32 &foundBreaks,
        UBool /* isPhraseBreaking */,
        UErrorCode &status) const {
    if (U_FAILURE(status)) return 0;

    // Find the span of characters handled by the external engine. The span
    // begins at startPos and extends forward until endPos or until the first
    // character the engine does not handle.
    utext_setNativeIndex(text, startPos);
    int32_t rangeStart = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    UChar32 c = utext_current32(text);
    while ((current = (int32_t)utext_getNativeIndex(text)) < endPos && delegate->handles(c)) {
        utext_next32(text); // TODO: recast loop for postincrement
        c = utext_current32(text);
    }
    int32_t rangeEnd = current;

    int32_t beforeSize = foundBreaks.size();
    int32_t additionalCapacity = rangeEnd - rangeStart + 1;
    // Make room for (rangeEnd - rangeStart + 1) more items.
    foundBreaks.ensureCapacity(beforeSize + additionalCapacity, status);
    if (U_FAILURE(status)) return 0;
    // Expose exactly the reserved slots to the engine. The previous code added
    // beforeSize twice here and over-sized the vector.
    foundBreaks.setSize(beforeSize + additionalCapacity);
    int32_t result = delegate->fillBreak(text, rangeStart, rangeEnd,
                                         foundBreaks.getBuffer() + beforeSize,
                                         additionalCapacity, status);
    if (U_FAILURE(status)) {
        // Drop the scratch slots so the caller does not see unfilled entries.
        foundBreaks.setSize(beforeSize);
        return 0;
    }
    // Trim the vector down to the breaks that were actually written.
    foundBreaks.setSize(beforeSize + result);
    utext_setNativeIndex(text, current);
    return result;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View file

@ -10,6 +10,7 @@
#ifndef BRKENG_H
#define BRKENG_H
#include "unicode/umisc.h"
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/utext.h"
@ -21,6 +22,7 @@ class UnicodeSet;
class UStack;
class UVector32;
class DictionaryMatcher;
class ExternalBreakEngine;
/*******************************************************************
* LanguageBreakEngine
@ -35,7 +37,7 @@ class DictionaryMatcher;
* <p>LanguageBreakEngines should normally be implemented so as to
* be shared between threads without locking.</p>
*/
class LanguageBreakEngine : public UMemory {
class LanguageBreakEngine : public UObject {
public:
/**
@ -54,10 +56,11 @@ class LanguageBreakEngine : public UMemory {
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param locale The locale.
* @return true if this engine handles the particular character and break
* type.
*/
virtual UBool handles(UChar32 c) const = 0;
virtual UBool handles(UChar32 c, const char* locale) const = 0;
/**
* <p>Find any breaks within a run in the supplied text.</p>
@ -80,6 +83,35 @@ class LanguageBreakEngine : public UMemory {
};
/*******************************************************************
* BreakEngineWrapper
*/
/**
* <p>BreakEngineWrapper implement LanguageBreakEngine by
* a thin wrapper that delegate the task to ExternalBreakEngine
* </p>
*/
// Adapter that exposes an ExternalBreakEngine through the LanguageBreakEngine
// interface by forwarding to the engine held in `delegate`.
class BreakEngineWrapper : public LanguageBreakEngine {
public:
// Wraps `engine`; the pointer is stored in the owning `delegate` member.
BreakEngineWrapper(ExternalBreakEngine* engine, UErrorCode &status);
virtual ~BreakEngineWrapper();
// Reports whether the wrapped engine is meant for character `c` in `locale`.
virtual UBool handles(UChar32 c, const char* locale) const override;
// Finds the breaks in the run of handled characters that starts at startPos
// and appends them to foundBreaks; returns the number of breaks added.
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UVector32 &foundBreaks,
UBool isPhraseBreaking,
UErrorCode &status) const override;
private:
// The wrapped external engine; owned by this wrapper.
LocalPointer<ExternalBreakEngine> delegate;
};
/*******************************************************************
* LanguageBreakFactory
*/
@ -125,9 +157,10 @@ class LanguageBreakFactory : public UMemory {
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
* @param locale The locale.
* @return A LanguageBreakEngine with the desired characteristics, or 0.
*/
virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0;
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) = 0;
};
@ -174,10 +207,11 @@ class UnhandledEngine : public LanguageBreakEngine {
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param locale The locale.
* @return true if this engine handles the particular character and break
* type.
*/
virtual UBool handles(UChar32 c) const override;
virtual UBool handles(UChar32 c, const char* locale) const override;
/**
* <p>Find any breaks within a run in the supplied text.</p>
@ -247,9 +281,18 @@ class ICULanguageBreakFactory : public LanguageBreakFactory {
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
* @param locale The locale.
* @return A LanguageBreakEngine with the desired characteristics, or 0.
*/
virtual const LanguageBreakEngine *getEngineFor(UChar32 c) override;
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) override;
/**
* Adopt the engine and add it to this factory.
* @param engine The ExternalBreakEngine to be added and adopted. The caller
* passes the ownership and should not release the memory after this.
* @param status the error code.
*/
virtual void addExternalEngine(ExternalBreakEngine* engine, UErrorCode& status);
protected:
/**
@ -258,9 +301,10 @@ protected:
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
* @param locale The locale.
* @return A LanguageBreakEngine with the desired characteristics, or 0.
*/
virtual const LanguageBreakEngine *loadEngineFor(UChar32 c);
virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, const char* locale);
/**
* <p>Create a DictionaryMatcher for the specified script and break type.</p>
@ -269,6 +313,9 @@ protected:
* @return A DictionaryMatcher with the desired characteristics, or nullptr.
*/
virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
private:
void ensureEngines(UErrorCode& status);
};
U_NAMESPACE_END

View file

@ -27,6 +27,7 @@
#include "unicode/rbbi.h"
#include "unicode/brkiter.h"
#include "unicode/udata.h"
#include "unicode/uloc.h"
#include "unicode/ures.h"
#include "unicode/ustring.h"
#include "unicode/filteredbrk.h"
@ -121,8 +122,11 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st
// If there is a result, set the valid locale and actual locale, and the kind
if (U_SUCCESS(status) && result != nullptr) {
U_LOCALE_BASED(locBased, *(BreakIterator*)result);
locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status),
actualLocale.data());
uprv_strncpy(result->requestLocale, loc.getName(), ULOC_FULLNAME_CAPACITY);
result->requestLocale[ULOC_FULLNAME_CAPACITY-1] = 0; // always terminate
}
ures_close(b);
@ -202,18 +206,20 @@ BreakIterator::getAvailableLocales(int32_t& count)
BreakIterator::BreakIterator()
{
*validLocale = *actualLocale = 0;
*validLocale = *actualLocale = *requestLocale = 0;
}
// Copy constructor: duplicates the three locale ID buffers of `other`.
BreakIterator::BreakIterator(const BreakIterator &other)
    : UObject(other)
{
    uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
    uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
}
// Copy assignment: copies the three locale ID buffers unless self-assigning.
BreakIterator &BreakIterator::operator =(const BreakIterator &other) {
    if (this == &other) {
        return *this;
    }
    uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
    uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
    uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
    return *this;
}
@ -493,12 +499,18 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
// Returns the locale of the requested kind. The requested locale is answered
// from the stored requestLocale buffer; every other kind goes to LocaleBased.
Locale
BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
    if (type != ULOC_REQUESTED_LOCALE) {
        U_LOCALE_BASED(locBased, *this);
        return locBased.getLocale(type, status);
    }
    return Locale(requestLocale);
}
// Returns the locale ID of the requested kind. The requested locale is answered
// from the stored requestLocale buffer; every other kind goes to LocaleBased.
const char *
BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
    if (type != ULOC_REQUESTED_LOCALE) {
        U_LOCALE_BASED(locBased, *this);
        return locBased.getLocaleID(type, status);
    }
    return requestLocale;
}

View file

@ -42,7 +42,7 @@ DictionaryBreakEngine::~DictionaryBreakEngine() {
}
UBool
DictionaryBreakEngine::handles(UChar32 c) const {
DictionaryBreakEngine::handles(UChar32 c, const char*) const {
return fSet.contains(c);
}
@ -54,13 +54,13 @@ DictionaryBreakEngine::findBreaks( UText *text,
UBool isPhraseBreaking,
UErrorCode& status) const {
if (U_FAILURE(status)) return 0;
(void)startPos; // TODO: remove this param?
int32_t result = 0;
// Find the span of characters included in the set.
// The span to break begins at the current position in the text, and
// extends towards the start or end of the text, depending on 'reverse'.
utext_setNativeIndex(text, startPos);
int32_t start = (int32_t)utext_getNativeIndex(text);
int32_t current;
int32_t rangeStart;

View file

@ -62,10 +62,11 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param locale The locale.
* @return true if this engine handles the particular character and break
* type.
*/
virtual UBool handles(UChar32 c) const override;
virtual UBool handles(UChar32 c, const char* locale) const override;
/**
* <p>Find any breaks within a run in the supplied text.</p>

View file

@ -1125,6 +1125,7 @@ static icu::UStack *gLanguageBreakFactories = nullptr;
static const icu::UnicodeString *gEmptyString = nullptr;
static icu::UInitOnce gLanguageBreakFactoriesInitOnce {};
static icu::UInitOnce gRBBIInitOnce {};
static icu::ICULanguageBreakFactory *gICULanguageBreakFactory = nullptr;
/**
* Release all static memory held by breakiterator.
@ -1153,37 +1154,41 @@ static void U_CALLCONV rbbiInit() {
ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
}
static void U_CALLCONV initLanguageFactories() {
UErrorCode status = U_ZERO_ERROR;
static void U_CALLCONV initLanguageFactories(UErrorCode& status) {
U_ASSERT(gLanguageBreakFactories == nullptr);
gLanguageBreakFactories = new UStack(_deleteFactory, nullptr, status);
if (gLanguageBreakFactories != nullptr && U_SUCCESS(status)) {
ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
gLanguageBreakFactories->push(builtIn, status);
LocalPointer<ICULanguageBreakFactory> factory(new ICULanguageBreakFactory(status), status);
if (U_SUCCESS(status)) {
gICULanguageBreakFactory = factory.orphan();
gLanguageBreakFactories->push(gICULanguageBreakFactory, status);
#ifdef U_LOCAL_SERVICE_HOOK
LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
if (extra != nullptr) {
gLanguageBreakFactories->push(extra, status);
}
LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
if (extra != nullptr) {
gLanguageBreakFactories->push(extra, status);
}
#endif
}
}
ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
}
// Runs initLanguageFactories() through the gLanguageBreakFactoriesInitOnce
// guard, reporting the outcome in `status`.
void ensureLanguageFactories(UErrorCode& status)
{
    umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories, status);
}
static const LanguageBreakEngine*
getLanguageBreakEngineFromFactory(UChar32 c)
getLanguageBreakEngineFromFactory(UChar32 c, const char* locale)
{
umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
if (gLanguageBreakFactories == nullptr) {
return nullptr;
}
UErrorCode status = U_ZERO_ERROR;
ensureLanguageFactories(status);
if (U_FAILURE(status)) return nullptr;
int32_t i = gLanguageBreakFactories->size();
const LanguageBreakEngine *lbe = nullptr;
while (--i >= 0) {
LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
lbe = factory->getEngineFor(c);
lbe = factory->getEngineFor(c, locale);
if (lbe != nullptr) {
break;
}
@ -1199,7 +1204,7 @@ getLanguageBreakEngineFromFactory(UChar32 c)
//
//-------------------------------------------------------------------------------
const LanguageBreakEngine *
RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c, const char* locale) {
const LanguageBreakEngine *lbe = nullptr;
UErrorCode status = U_ZERO_ERROR;
@ -1215,14 +1220,14 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
int32_t i = fLanguageBreakEngines->size();
while (--i >= 0) {
lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
if (lbe->handles(c)) {
if (lbe->handles(c, locale)) {
return lbe;
}
}
// No existing dictionary took the character. See if a factory wants to
// give us a new LanguageBreakEngine for this character.
lbe = getLanguageBreakEngineFromFactory(c);
lbe = getLanguageBreakEngineFromFactory(c, locale);
// If we got one, use it and push it on our stack.
if (lbe != nullptr) {
@ -1259,6 +1264,18 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
return fUnhandledBreakEngine;
}
#ifndef U_HIDE_DRAFT_API
// Adopts `toAdopt` and hands it to the built-in ICU language break factory.
// The engine is held by a LocalPointer until it is passed on, so it is
// released if an earlier step reports a failure.
void U_EXPORT2 RuleBasedBreakIterator::registerExternalBreakEngine(
        ExternalBreakEngine* toAdopt, UErrorCode& status) {
    LocalPointer<ExternalBreakEngine> engine(toAdopt, status);
    if (U_SUCCESS(status)) {
        ensureLanguageFactories(status);
    }
    if (U_SUCCESS(status)) {
        gICULanguageBreakFactory->addExternalEngine(engine.orphan(), status);
    }
}
#endif /* U_HIDE_DRAFT_API */
// Forwards to the break cache's own dumpCache().
void RuleBasedBreakIterator::dumpCache()
{
    fBreakCache->dumpCache();
}

View file

@ -158,12 +158,13 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
// We now have a dictionary character. Get the appropriate language object
// to deal with it.
const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c);
const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));
// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
if (lbe != nullptr) {
foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
}
// Reload the loop variables for the next go-round

View file

@ -649,6 +649,7 @@ private:
/** @internal (private) */
char actualLocale[ULOC_FULLNAME_CAPACITY];
char validLocale[ULOC_FULLNAME_CAPACITY];
char requestLocale[ULOC_FULLNAME_CAPACITY];
};
#ifndef U_HIDE_DEPRECATED_API

View file

@ -43,6 +43,71 @@ class RBBIDataWrapper;
class UnhandledEngine;
class UStack;
#ifndef U_HIDE_DRAFT_API
#if !UCONFIG_NO_SERVICE
/**
* The ExternalBreakEngine class define an abstract interface for the host environment
* to provide a low level facility to break text for unicode text in script that the text boundary
* cannot be handled by upper level rule based logic, for example, for Chinese and Japanese
* word breaking, Thai, Khmer, Burmese, Lao and other Southeast Asian scripts.
* The host environment implement one or more subclass of ExternalBreakEngine and
* register them in the initialization time by calling
* RuleBasedBreakIterator::registerExternalBreakEngine(). ICU adopt and own the engine and will
* delete the registered external engine in proper time during the clean up
* event.
* @internal ICU 74 technology preview
*/
class ExternalBreakEngine : public UObject {
public:
/**
* Destructor.
* @internal ICU 74 technology preview
*/
virtual ~ExternalBreakEngine() {}
/**
* <p>Indicate whether this engine handles a particular character when
* the RuleBasedBreakIterator is used for a particular locale. This method is used
* by the RuleBasedBreakIterator to find a break engine.</p>
* @param c A character which begins a run that the engine might handle.
* @param locale The locale.
* @return true if this engine handles the particular character for that locale.
* @internal ICU 74 technology preview
*/
virtual bool isFor(UChar32 c, const char* locale) const = 0;
/**
* <p>Indicate whether this engine handles a particular character. This method is
* used by the RuleBasedBreakIterator after it has already found a break engine, to see
* which characters after the first one can be handled by this break engine.</p>
* @param c A character that the engine might handle.
* @return true if this engine handles the particular character.
* @internal ICU 74 technology preview
*/
virtual bool handles(UChar32 c) const = 0;
/**
* <p>Divide up a range of text handled by this break engine.</p>
*
* @param text A UText representing the text
* @param start The start of the range of known characters
* @param end The end of the range of known characters
* @param foundBreaks Output C array of int32_t break positions, or
* nullptr
* @param foundBreaksCapacity The capacity of foundBreaks
* @param status Information on any errors encountered.
* @return The number of breaks found
* @internal ICU 74 technology preview
*/
virtual int32_t fillBreak(UText* text, int32_t start, int32_t end,
int32_t* foundBreaks, int32_t foundBreaksCapacity,
UErrorCode& status) const = 0;
};
#endif /* UCONFIG_NO_SERVICE */
#endif /* U_HIDE_DRAFT_API */
/**
*
* A subclass of BreakIterator whose behavior is specified using a list of rules.
@ -716,9 +781,10 @@ private:
* This function returns the appropriate LanguageBreakEngine for a
* given character c.
* @param c A character in the dictionary set
* @param locale The locale.
* @internal (private)
*/
const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c, const char* locale);
public:
#ifndef U_HIDE_INTERNAL_API
@ -734,8 +800,26 @@ private:
*/
void dumpTables();
#endif /* U_HIDE_INTERNAL_API */
#ifndef U_HIDE_DRAFT_API
#if !UCONFIG_NO_SERVICE
/**
* Register a new external break engine. The external break engine will be adopted.
* Because ICU may choose to cache break engine internally, this must
* be called at application startup, prior to any calls to
* object methods of RuleBasedBreakIterator to avoid undefined behavior.
* @param toAdopt the ExternalBreakEngine instance to be adopted
* @param status the in/out status code, no special meanings are assigned
* @internal ICU 74 technology preview
*/
static void U_EXPORT2 registerExternalBreakEngine(
ExternalBreakEngine* toAdopt, UErrorCode& status);
#endif /* UCONFIG_NO_SERVICE */
#endif /* U_HIDE_DRAFT_API */
};
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View file

@ -73,7 +73,7 @@ UScriptCode getScriptFromModelName(const std::string& modelName) {
// the model. Since by default the LSTM models are not included, all the tested
// models need to be included under source/test/testdata.
void LSTMBETest::runTestFromFile(const char* filename) {
void LSTMBETest::runTestFromFile(const char* filename, const char* locale) {
UErrorCode status = U_ZERO_ERROR;
LocalPointer<const LanguageBreakEngine> engine;
// Open and read the test data file.
@ -123,7 +123,7 @@ void LSTMBETest::runTestFromFile(const char* filename) {
caseNum++;
bool canHandleAllChars = true;
for (int32_t i = 0; i < value.length(); i++) {
if (!engine->handles(value.charAt(i))) {
if (!engine->handles(value.charAt(i), locale)) {
errln(UnicodeString("Test Case#") + caseNum + " contains char '" +
UnicodeString(value.charAt(i)) +
"' cannot be handled by the engine in offset " + i + "\n" + line);
@ -200,15 +200,15 @@ void LSTMBETest::runTestFromFile(const char* filename) {
}
void LSTMBETest::TestThaiGraphclust() {
runTestFromFile("Thai_graphclust_model4_heavy_Test.txt");
runTestFromFile("Thai_graphclust_model4_heavy_Test.txt", "th");
}
void LSTMBETest::TestThaiCodepoints() {
runTestFromFile("Thai_codepoints_exclusive_model5_heavy_Test.txt");
runTestFromFile("Thai_codepoints_exclusive_model5_heavy_Test.txt", "th");
}
void LSTMBETest::TestBurmeseGraphclust() {
runTestFromFile("Burmese_graphclust_model5_heavy_Test.txt");
runTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", "my");
}
const LanguageBreakEngine* LSTMBETest::createEngineFromTestData(

View file

@ -40,7 +40,7 @@ public:
private:
const LanguageBreakEngine* createEngineFromTestData(const char* model, UScriptCode script, UErrorCode& status);
void runTestFromFile(const char* filename);
void runTestFromFile(const char* filename, const char* locale);
void runTestWithLargeMemory(const char* model, UScriptCode script);
// Test parameters, from the test framework and test invocation.

View file

@ -142,6 +142,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(TestLSTMThai);
TESTCASE_AUTO(TestLSTMBurmese);
TESTCASE_AUTO(TestRandomAccess);
TESTCASE_AUTO(TestExternalBreakEngineWithFakeTaiLe);
TESTCASE_AUTO(TestExternalBreakEngineWithFakeYue);
#if U_ENABLE_TRACING
TESTCASE_AUTO(TestTraceCreateCharacter);
@ -5667,4 +5669,192 @@ void RBBITest::TestRandomAccess() {
}
}
// A Fake Tai Le break engine which handle Unicode Tai Le (Tale) block
// https://unicode.org/charts/PDF/U1950.pdf
// U+1950 - U+197F and always break after Tone letters (U+1970-U+1974)
class FakeTaiLeBreakEngine : public ExternalBreakEngine {
  public:
    // `block` is the range U+1950-U+197F; `tones` is the range U+1970-U+1974.
    FakeTaiLeBreakEngine() : block(0x1950, 0x197f), tones(0x1970, 0x1974) {
    }
    virtual ~FakeTaiLeBreakEngine() {
    }

    // Locale-independent: accepts a character for any locale as long as
    // handles() accepts it.
    virtual bool isFor(UChar32 c, const char* /* locale */) const override {
        return handles(c);
    }

    // True for every code point inside `block`.
    virtual bool handles(UChar32 c) const override {
        return block.contains(c);
    }

    // Walks [start, end) and records the index of every character found in
    // `tones` as a fake break point. The text index is put back to where it
    // was on entry before returning.
    virtual int32_t fillBreak(UText* text, int32_t start, int32_t end,
                              int32_t* foundBreaks, int32_t foundBreaksCapacity,
                              UErrorCode& status) const override {
        if (U_FAILURE(status)) return 0;
        // Remember where the caller left the text so it can be restored.
        const int64_t savedIndex = utext_getNativeIndex(text);
        if (savedIndex != start) {
            utext_setNativeIndex(text, start);
        }
        int32_t count = 0;
        int32_t pos = (int32_t)utext_getNativeIndex(text);
        for (; pos < end; pos = (int32_t)utext_getNativeIndex(text)) {
            UChar32 c = utext_current32(text);
            if (tones.contains(c)) {
                if (count >= foundBreaksCapacity) {
                    // Out of room: report the overflow with the text restored.
                    status = U_BUFFER_OVERFLOW_ERROR;
                    utext_setNativeIndex(text, savedIndex);
                    return count;
                }
                foundBreaks[count++] = pos;
            }
            UTEXT_NEXT32(text);
        }
        if (savedIndex != pos) {
            utext_setNativeIndex(text, savedIndex);
        }
        return count;
    }

  private:
    UnicodeSet block;
    UnicodeSet tones;
};
// A Fake Yue Break Engine which handle CJK Unified Ideographs
// block (U+4E00-U+9FFF) when locale start with 'yue' and break
// after every character.
class FakeYueBreakEngine : public ExternalBreakEngine {
  public:
    // `block` is the range U+4E00-U+9FFF.
    FakeYueBreakEngine() : block(0x4e00, 0x9FFF) {
    }
    virtual ~FakeYueBreakEngine() {
    }

    // Accepts characters in `block`, but only for locale IDs starting with
    // "yue", such as "yue", "yue-CN", "yue-Hant-CN", etc.
    virtual bool isFor(UChar32 c, const char* locale) const override {
        if (!handles(c)) {
            return false;
        }
        return uprv_strncmp("yue", locale, 3) == 0;
    }

    // True for every code point inside `block`.
    virtual bool handles(UChar32 c) const override {
        return block.contains(c);
    }

    // A fake word segmentation: reports a break at every second index counted
    // from `start`, up to and including `end`. The text itself is not read.
    virtual int32_t fillBreak(UText* text, int32_t start, int32_t end,
                              int32_t* foundBreaks, int32_t foundBreaksCapacity,
                              UErrorCode& status) const override {
        (void)text;
        if (U_FAILURE(status)) return 0;
        int32_t count = 0;
        for (int32_t pos = start + 1; pos <= end; ++pos) {
            if ((pos - start) % 2 != 0) {
                continue;
            }
            if (count >= foundBreaksCapacity) {
                status = U_BUFFER_OVERFLOW_ERROR;
                return count;
            }
            foundBreaks[count++] = pos;
        }
        return count;
    }

  private:
    UnicodeSet block;
};
void RBBITest::TestExternalBreakEngineWithFakeYue() {
UErrorCode status = U_ZERO_ERROR;
UnicodeString text(u"a bc def一兩年前佢真係唔鍾意畀我影相i jk lmn");
std::vector<int32_t> actual1;
{
LocalPointer<BreakIterator> bi1(
BreakIterator::createWordInstance(Locale::getRoot(), status),
status);
bi1->setText(text);
assertTrue(WHERE "BreakIterator::createWordInstance( root )",
U_SUCCESS(status));
do {
actual1.push_back(bi1->current());
} while(bi1->next() != BreakIterator::DONE);
}
std::vector<int32_t> expected1({{ 0, 1, 2, 4, 5, 8, 10, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 30}});
assertTrue("root break Yue as Chinese", expected1 == actual1);
status = U_ZERO_ERROR;
RuleBasedBreakIterator::registerExternalBreakEngine(
new FakeYueBreakEngine(), status);
assertTrue(WHERE "registerExternalBreakEngine w FakeYueBreakEngine",
U_SUCCESS(status));
std::vector<int32_t> actual2;
{
status = U_ZERO_ERROR;
LocalPointer<BreakIterator> bi2(
BreakIterator::createWordInstance(Locale("yue"), status), status);
assertTrue(WHERE "BreakIterator::createWordInstance( yue )",
U_SUCCESS(status));
bi2->setText(text);
do {
actual2.push_back(bi2->current());
} while(bi2->next() != BreakIterator::DONE);
}
std::vector<int32_t> expected2({{ 0, 1, 2, 4, 5, 8, 10, 12, 14, 16, 18, 20,
22, 23, 24, 26, 27, 30}});
assertTrue(WHERE "break Yue by Fake external breaker",
expected2 == actual2);
}
void RBBITest::TestExternalBreakEngineWithFakeTaiLe() {
UErrorCode status = U_ZERO_ERROR;
UnicodeString text(
u"a bc defᥛᥫᥒᥰᥖᥭᥰᥞᥝᥰᥙᥥᥢᥛᥫᥒᥰᥑᥩᥢᥲᥔᥣᥝᥴᥓᥬᥖᥩᥢᥲᥛᥣᥝᥱᥙᥝᥱᥙᥤᥱᥓᥣᥒᥛᥣᥰᥓᥧ"
u"ᥰᥘᥩᥰᥗᥪᥒᥴᥛᥣᥰᥘᥬᥰᥝᥣᥱᥘᥒᥱᥔᥣᥛᥴᥘᥫᥢi jk lmn");
std::vector<int32_t> actual1;
{
LocalPointer<BreakIterator> bi1(
BreakIterator::createLineInstance(Locale::getRoot(), status),
status);
bi1->setText(text);
assertTrue(WHERE "BreakIterator::createLineInstance( root )",
U_SUCCESS(status));
do {
actual1.push_back(bi1->current());
} while(bi1->next() != BreakIterator::DONE);
}
std::vector<int32_t> expected1({{
0, 2, 5, 86, 89, 92 }});
assertTrue(WHERE "root break Tai Le", expected1 == actual1);
RuleBasedBreakIterator::registerExternalBreakEngine(
new FakeTaiLeBreakEngine(), status);
assertTrue(WHERE "registerExternalBreakEngine w FakeTaiLeBreakEngine",
U_SUCCESS(status));
std::vector<int32_t> actual2;
{
status = U_ZERO_ERROR;
LocalPointer<BreakIterator> bi2(
BreakIterator::createLineInstance(Locale("tdd"), status), status);
assertTrue(WHERE "BreakIterator::createLineInstance( tdd )",
U_SUCCESS(status));
bi2->setText(text);
do {
actual2.push_back(bi2->current());
} while(bi2->next() != BreakIterator::DONE);
}
std::vector<int32_t> expected2({{
0, 2, 5, 11, 14, 17, 24, 28, 32, 38, 42, 45, 48, 54, 57, 60, 64, 67,
70, 73, 76, 80, 86, 89, 92}});
assertTrue("break Tai Le by Fake external breaker",
expected2 == actual2);
}
#endif // #if !UCONFIG_NO_BREAK_ITERATION

View file

@ -96,6 +96,8 @@ public:
void TestLSTMThai();
void TestLSTMBurmese();
void TestRandomAccess();
void TestExternalBreakEngineWithFakeTaiLe();
void TestExternalBreakEngineWithFakeYue();
#if U_ENABLE_TRACING
void TestTraceCreateCharacter();