mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 05:25:34 +00:00
ICU-22342 Implement ExternalBreakEngineAPI
ICU-22342 Fix comments
This commit is contained in:
parent
2207e2c3df
commit
02d5e71903
13 changed files with 484 additions and 64 deletions
|
@ -21,6 +21,7 @@
|
|||
#include "unicode/uscript.h"
|
||||
#include "unicode/ucharstrie.h"
|
||||
#include "unicode/bytestrie.h"
|
||||
#include "unicode/rbbi.h"
|
||||
|
||||
#include "brkeng.h"
|
||||
#include "cmemory.h"
|
||||
|
@ -70,19 +71,21 @@ UnhandledEngine::~UnhandledEngine() {
|
|||
}
|
||||
|
||||
UBool
|
||||
UnhandledEngine::handles(UChar32 c) const {
|
||||
UnhandledEngine::handles(UChar32 c, const char* locale) const {
|
||||
(void)locale; // Unused
|
||||
return fHandled && fHandled->contains(c);
|
||||
}
|
||||
|
||||
int32_t
|
||||
UnhandledEngine::findBreaks( UText *text,
|
||||
int32_t /* startPos */,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UVector32 &/*foundBreaks*/,
|
||||
UBool /* isPhraseBreaking */,
|
||||
UErrorCode &status) const {
|
||||
if (U_FAILURE(status)) return 0;
|
||||
UChar32 c = utext_current32(text);
|
||||
utext_setNativeIndex(text, startPos);
|
||||
UChar32 c = utext_current32(text);
|
||||
while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
|
||||
utext_next32(text); // TODO: recast loop to work with post-increment operations.
|
||||
c = utext_current32(text);
|
||||
|
@ -120,41 +123,39 @@ ICULanguageBreakFactory::~ICULanguageBreakFactory() {
|
|||
}
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
U_CDECL_BEGIN
|
||||
static void U_CALLCONV _deleteEngine(void *obj) {
|
||||
delete (const icu::LanguageBreakEngine *) obj;
|
||||
// Lazily creates the fEngines stack the first time it is needed.
// fEngines is only assigned when construction succeeded; if status is a
// failure afterwards, fEngines is left untouched (nullptr on first use).
void ICULanguageBreakFactory::ensureEngines(UErrorCode& status) {
    static UMutex gBreakEngineMutex;
    Mutex lock(&gBreakEngineMutex);
    if (fEngines != nullptr) {
        return;  // already created by an earlier call
    }
    LocalPointer<UStack> newStack(new UStack(uprv_deleteUObject, nullptr, status), status);
    if (U_FAILURE(status)) {
        return;  // newStack releases whatever was allocated
    }
    fEngines = newStack.orphan();
}
|
||||
U_CDECL_END
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
const LanguageBreakEngine *
|
||||
ICULanguageBreakFactory::getEngineFor(UChar32 c) {
|
||||
ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) {
|
||||
const LanguageBreakEngine *lbe = nullptr;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
ensureEngines(status);
|
||||
if (U_FAILURE(status) ) {
|
||||
// Note: no way to return error code to caller.
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
static UMutex gBreakEngineMutex;
|
||||
Mutex m(&gBreakEngineMutex);
|
||||
|
||||
if (fEngines == nullptr) {
|
||||
LocalPointer<UStack> engines(new UStack(_deleteEngine, nullptr, status), status);
|
||||
if (U_FAILURE(status) ) {
|
||||
// Note: no way to return error code to caller.
|
||||
return nullptr;
|
||||
}
|
||||
fEngines = engines.orphan();
|
||||
} else {
|
||||
int32_t i = fEngines->size();
|
||||
while (--i >= 0) {
|
||||
lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
|
||||
if (lbe != nullptr && lbe->handles(c)) {
|
||||
return lbe;
|
||||
}
|
||||
int32_t i = fEngines->size();
|
||||
while (--i >= 0) {
|
||||
lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
|
||||
if (lbe != nullptr && lbe->handles(c, locale)) {
|
||||
return lbe;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// We didn't find an engine. Create one.
|
||||
lbe = loadEngineFor(c);
|
||||
lbe = loadEngineFor(c, locale);
|
||||
if (lbe != nullptr) {
|
||||
fEngines->push((void *)lbe, status);
|
||||
}
|
||||
|
@ -162,7 +163,7 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c) {
|
|||
}
|
||||
|
||||
const LanguageBreakEngine *
|
||||
ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
|
||||
ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UScriptCode code = uscript_getScript(c, &status);
|
||||
if (U_SUCCESS(status)) {
|
||||
|
@ -299,6 +300,70 @@ ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
|
||||
// Adopts `external`, wraps it in a BreakEngineWrapper and pushes the wrapper
// onto fEngines so that getEngineFor() can hand it out.
//
// @param external  The engine to adopt. It is held by a LocalPointer from the
//                  first statement on, so it is released on every early return.
// @param status    In/out error code. Nothing is registered when it is, or
//                  becomes, a failure.
void ICULanguageBreakFactory::addExternalEngine(
        ExternalBreakEngine* external, UErrorCode& status) {
    LocalPointer<ExternalBreakEngine> engine(external, status);
    ensureEngines(status);
    if (U_FAILURE(status)) {
        // ensureEngines() only assigns fEngines on success, so fEngines may
        // still be nullptr here and must not be dereferenced.
        return;
    }
    // Hand the raw pointer to the wrapper first and give up our ownership only
    // once the wrapper object really exists; otherwise a failed allocation
    // could leave the engine owned by nobody.
    LocalPointer<BreakEngineWrapper> wrapper(
        new BreakEngineWrapper(engine.getAlias(), status), status);
    if (wrapper.isValid()) {
        engine.orphan();  // the wrapper's delegate owns the engine now
    }
    if (U_FAILURE(status)) {
        return;
    }
    // NOTE(review): this function-local mutex is a different object from the
    // identically named statics declared inside ensureEngines() and
    // getEngineFor(), so it only serializes concurrent addExternalEngine()
    // calls against each other.
    static UMutex gBreakEngineMutex;
    Mutex m(&gBreakEngineMutex);
    // NOTE(review): fEngines was created with a deleter, so push() is assumed
    // to take ownership of the element even on failure - confirm against
    // UStack::push before changing the orphan() below.
    fEngines->push(wrapper.getAlias(), status);
    wrapper.orphan();
}
|
||||
|
||||
// Stores `engine` in the `delegate` LocalPointer member; `status` is forwarded
// to the LocalPointer constructor. The constructor body itself does nothing.
BreakEngineWrapper::BreakEngineWrapper(
|
||||
ExternalBreakEngine* engine, UErrorCode &status) : delegate(engine, status) {
|
||||
}
|
||||
|
||||
// Intentionally empty: there is nothing to release explicitly here.
BreakEngineWrapper::~BreakEngineWrapper() {
|
||||
}
|
||||
|
||||
// Reports whether the wrapped external engine accepts character `c` for
// `locale`; simply forwards to delegate->isFor(c, locale).
UBool BreakEngineWrapper::handles(UChar32 c, const char* locale) const {
|
||||
return delegate->isFor(c, locale);
|
||||
}
|
||||
|
||||
// Finds the run of text, starting at startPos, that the external engine
// handles, asks the engine for the breaks inside that run and appends them to
// foundBreaks.
//
// @param text         The text to scan; on success its index is left at the
//                     end of the handled run.
// @param startPos     Native index at which the run starts.
// @param endPos       Native index at which scanning stops at the latest.
// @param foundBreaks  Receives the break positions reported by the engine,
//                     appended after its existing content.
// @param status       In/out error code.
// @return The number of breaks appended, or 0 on failure.
int32_t BreakEngineWrapper::findBreaks(
        UText *text,
        int32_t startPos,
        int32_t endPos,
        UVector32 &foundBreaks,
        UBool /* isPhraseBreaking */,
        UErrorCode &status) const {
    if (U_FAILURE(status)) return 0;
    int32_t result = 0;

    // Find the span of characters handled by the delegate. The span begins at
    // startPos and extends forward until endPos or until the first character
    // the delegate does not handle.
    utext_setNativeIndex(text, startPos);
    int32_t start = (int32_t)utext_getNativeIndex(text);
    int32_t current;
    int32_t rangeStart;
    int32_t rangeEnd;
    UChar32 c = utext_current32(text);
    while((current = (int32_t)utext_getNativeIndex(text)) < endPos && delegate->handles(c)) {
        utext_next32(text); // TODO: recast loop for postincrement
        c = utext_current32(text);
    }
    rangeStart = start;
    rangeEnd = current;

    int32_t beforeSize = foundBreaks.size();
    int32_t additionalCapacity = rangeEnd - rangeStart + 1;
    // Enlarge the vector so that it can hold (rangeEnd-rangeStart+1) more items.
    foundBreaks.ensureCapacity(beforeSize+additionalCapacity, status);
    if (U_FAILURE(status)) return 0;
    // Expose exactly the reserved slots to the delegate. (The size used to be
    // computed as beforeSize + beforeSize + additionalCapacity, which counted
    // the existing content twice.)
    foundBreaks.setSize(beforeSize+additionalCapacity);
    result = delegate->fillBreak(text, rangeStart, rangeEnd, foundBreaks.getBuffer()+beforeSize,
                                 additionalCapacity, status);
    if (U_FAILURE(status)) {
        // Drop the placeholder slots so the caller does not see bogus breaks.
        foundBreaks.setSize(beforeSize);
        return 0;
    }
    // Keep only the slots the delegate actually filled.
    foundBreaks.setSize(beforeSize + result);
    utext_setNativeIndex(text, current);
    return result;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
#ifndef BRKENG_H
|
||||
#define BRKENG_H
|
||||
|
||||
#include "unicode/umisc.h"
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/utext.h"
|
||||
|
@ -21,6 +22,7 @@ class UnicodeSet;
|
|||
class UStack;
|
||||
class UVector32;
|
||||
class DictionaryMatcher;
|
||||
class ExternalBreakEngine;
|
||||
|
||||
/*******************************************************************
|
||||
* LanguageBreakEngine
|
||||
|
@ -35,7 +37,7 @@ class DictionaryMatcher;
|
|||
* <p>LanguageBreakEngines should normally be implemented so as to
|
||||
* be shared between threads without locking.</p>
|
||||
*/
|
||||
class LanguageBreakEngine : public UMemory {
|
||||
class LanguageBreakEngine : public UObject {
|
||||
public:
|
||||
|
||||
/**
|
||||
|
@ -54,10 +56,11 @@ class LanguageBreakEngine : public UMemory {
|
|||
* a particular kind of break.</p>
|
||||
*
|
||||
* @param c A character which begins a run that the engine might handle
|
||||
* @param locale The locale.
|
||||
* @return true if this engine handles the particular character and break
|
||||
* type.
|
||||
*/
|
||||
virtual UBool handles(UChar32 c) const = 0;
|
||||
virtual UBool handles(UChar32 c, const char* locale) const = 0;
|
||||
|
||||
/**
|
||||
* <p>Find any breaks within a run in the supplied text.</p>
|
||||
|
@ -80,6 +83,35 @@ class LanguageBreakEngine : public UMemory {
|
|||
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* BreakEngineWrapper
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>BreakEngineWrapper implements LanguageBreakEngine as
|
||||
* a thin wrapper that delegates the task to an ExternalBreakEngine.
|
||||
* </p>
|
||||
*/
|
||||
class BreakEngineWrapper : public LanguageBreakEngine {
|
||||
public:
|
||||
|
||||
BreakEngineWrapper(ExternalBreakEngine* engine, UErrorCode &status);
|
||||
|
||||
virtual ~BreakEngineWrapper();
|
||||
|
||||
virtual UBool handles(UChar32 c, const char* locale) const override;
|
||||
|
||||
virtual int32_t findBreaks( UText *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode &status) const override;
|
||||
|
||||
private:
|
||||
LocalPointer<ExternalBreakEngine> delegate;
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* LanguageBreakFactory
|
||||
*/
|
||||
|
@ -125,9 +157,10 @@ class LanguageBreakFactory : public UMemory {
|
|||
*
|
||||
* @param c A character that begins a run for which a LanguageBreakEngine is
|
||||
* sought.
|
||||
* @param locale The locale.
|
||||
* @return A LanguageBreakEngine with the desired characteristics, or 0.
|
||||
*/
|
||||
virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0;
|
||||
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) = 0;
|
||||
|
||||
};
|
||||
|
||||
|
@ -174,10 +207,11 @@ class UnhandledEngine : public LanguageBreakEngine {
|
|||
* a particular kind of break.</p>
|
||||
*
|
||||
* @param c A character which begins a run that the engine might handle
|
||||
* @param locale The locale.
|
||||
* @return true if this engine handles the particular character and break
|
||||
* type.
|
||||
*/
|
||||
virtual UBool handles(UChar32 c) const override;
|
||||
virtual UBool handles(UChar32 c, const char* locale) const override;
|
||||
|
||||
/**
|
||||
* <p>Find any breaks within a run in the supplied text.</p>
|
||||
|
@ -247,9 +281,18 @@ class ICULanguageBreakFactory : public LanguageBreakFactory {
|
|||
*
|
||||
* @param c A character that begins a run for which a LanguageBreakEngine is
|
||||
* sought.
|
||||
* @param locale The locale.
|
||||
* @return A LanguageBreakEngine with the desired characteristics, or 0.
|
||||
*/
|
||||
virtual const LanguageBreakEngine *getEngineFor(UChar32 c) override;
|
||||
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) override;
|
||||
|
||||
/**
|
||||
* Add and adopt the engine.
|
||||
* @param engine The ExternalBreakEngine to be added and adopted. The caller
|
||||
* passes the ownership and should not release the memory after this.
|
||||
* @param status the error code.
|
||||
*/
|
||||
virtual void addExternalEngine(ExternalBreakEngine* engine, UErrorCode& status);
|
||||
|
||||
protected:
|
||||
/**
|
||||
|
@ -258,9 +301,10 @@ protected:
|
|||
*
|
||||
* @param c A character that begins a run for which a LanguageBreakEngine is
|
||||
* sought.
|
||||
* @param locale The locale.
|
||||
* @return A LanguageBreakEngine with the desired characteristics, or 0.
|
||||
*/
|
||||
virtual const LanguageBreakEngine *loadEngineFor(UChar32 c);
|
||||
virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, const char* locale);
|
||||
|
||||
/**
|
||||
* <p>Create a DictionaryMatcher for the specified script and break type.</p>
|
||||
|
@ -269,6 +313,9 @@ protected:
|
|||
* @return A DictionaryMatcher with the desired characteristics, or nullptr.
|
||||
*/
|
||||
virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
|
||||
|
||||
private:
|
||||
void ensureEngines(UErrorCode& status);
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
#include "unicode/rbbi.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/uloc.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/filteredbrk.h"
|
||||
|
@ -121,8 +122,11 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st
|
|||
// If there is a result, set the valid locale and actual locale, and the kind
|
||||
if (U_SUCCESS(status) && result != nullptr) {
|
||||
U_LOCALE_BASED(locBased, *(BreakIterator*)result);
|
||||
|
||||
locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status),
|
||||
actualLocale.data());
|
||||
uprv_strncpy(result->requestLocale, loc.getName(), ULOC_FULLNAME_CAPACITY);
|
||||
result->requestLocale[ULOC_FULLNAME_CAPACITY-1] = 0; // always terminate
|
||||
}
|
||||
|
||||
ures_close(b);
|
||||
|
@ -202,18 +206,20 @@ BreakIterator::getAvailableLocales(int32_t& count)
|
|||
|
||||
BreakIterator::BreakIterator()
|
||||
{
|
||||
*validLocale = *actualLocale = 0;
|
||||
*validLocale = *actualLocale = *requestLocale = 0;
|
||||
}
|
||||
|
||||
BreakIterator::BreakIterator(const BreakIterator &other) : UObject(other) {
|
||||
uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
|
||||
uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
|
||||
uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
|
||||
}
|
||||
|
||||
BreakIterator &BreakIterator::operator =(const BreakIterator &other) {
|
||||
if (this != &other) {
|
||||
uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
|
||||
uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
|
||||
uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
@ -493,12 +499,18 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
|
|||
|
||||
// Returns the locale of the requested kind. The requested locale is kept in
// requestLocale on the iterator itself; every other type is answered through
// the U_LOCALE_BASED helper.
Locale
BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
    if (type != ULOC_REQUESTED_LOCALE) {
        U_LOCALE_BASED(locBased, *this);
        return locBased.getLocale(type, status);
    }
    return Locale(requestLocale);
}
|
||||
|
||||
const char *
|
||||
BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
|
||||
if (type == ULOC_REQUESTED_LOCALE) {
|
||||
return requestLocale;
|
||||
}
|
||||
U_LOCALE_BASED(locBased, *this);
|
||||
return locBased.getLocaleID(type, status);
|
||||
}
|
||||
|
|
|
@ -42,7 +42,7 @@ DictionaryBreakEngine::~DictionaryBreakEngine() {
|
|||
}
|
||||
|
||||
UBool
|
||||
DictionaryBreakEngine::handles(UChar32 c) const {
|
||||
DictionaryBreakEngine::handles(UChar32 c, const char*) const {
|
||||
return fSet.contains(c);
|
||||
}
|
||||
|
||||
|
@ -54,13 +54,13 @@ DictionaryBreakEngine::findBreaks( UText *text,
|
|||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const {
|
||||
if (U_FAILURE(status)) return 0;
|
||||
(void)startPos; // TODO: remove this param?
|
||||
int32_t result = 0;
|
||||
|
||||
// Find the span of characters included in the set.
|
||||
// The span to break begins at the current position in the text, and
|
||||
// extends towards the start or end of the text, depending on 'reverse'.
|
||||
|
||||
utext_setNativeIndex(text, startPos);
|
||||
int32_t start = (int32_t)utext_getNativeIndex(text);
|
||||
int32_t current;
|
||||
int32_t rangeStart;
|
||||
|
|
|
@ -62,10 +62,11 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
|
|||
* a particular kind of break.</p>
|
||||
*
|
||||
* @param c A character which begins a run that the engine might handle
|
||||
* @param locale The locale.
|
||||
* @return true if this engine handles the particular character and break
|
||||
* type.
|
||||
*/
|
||||
virtual UBool handles(UChar32 c) const override;
|
||||
virtual UBool handles(UChar32 c, const char* locale) const override;
|
||||
|
||||
/**
|
||||
* <p>Find any breaks within a run in the supplied text.</p>
|
||||
|
|
|
@ -1125,6 +1125,7 @@ static icu::UStack *gLanguageBreakFactories = nullptr;
|
|||
static const icu::UnicodeString *gEmptyString = nullptr;
|
||||
static icu::UInitOnce gLanguageBreakFactoriesInitOnce {};
|
||||
static icu::UInitOnce gRBBIInitOnce {};
|
||||
static icu::ICULanguageBreakFactory *gICULanguageBreakFactory = nullptr;
|
||||
|
||||
/**
|
||||
* Release all static memory held by breakiterator.
|
||||
|
@ -1153,37 +1154,41 @@ static void U_CALLCONV rbbiInit() {
|
|||
ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
|
||||
}
|
||||
|
||||
static void U_CALLCONV initLanguageFactories() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
static void U_CALLCONV initLanguageFactories(UErrorCode& status) {
|
||||
U_ASSERT(gLanguageBreakFactories == nullptr);
|
||||
gLanguageBreakFactories = new UStack(_deleteFactory, nullptr, status);
|
||||
if (gLanguageBreakFactories != nullptr && U_SUCCESS(status)) {
|
||||
ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
|
||||
gLanguageBreakFactories->push(builtIn, status);
|
||||
LocalPointer<ICULanguageBreakFactory> factory(new ICULanguageBreakFactory(status), status);
|
||||
if (U_SUCCESS(status)) {
|
||||
gICULanguageBreakFactory = factory.orphan();
|
||||
gLanguageBreakFactories->push(gICULanguageBreakFactory, status);
|
||||
#ifdef U_LOCAL_SERVICE_HOOK
|
||||
LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
|
||||
if (extra != nullptr) {
|
||||
gLanguageBreakFactories->push(extra, status);
|
||||
}
|
||||
LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
|
||||
if (extra != nullptr) {
|
||||
gLanguageBreakFactories->push(extra, status);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
|
||||
}
|
||||
|
||||
// Runs initLanguageFactories() through umtx_initOnce, keyed on
// gLanguageBreakFactoriesInitOnce, passing `status` through to it.
void ensureLanguageFactories(UErrorCode& status) {
|
||||
umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories, status);
|
||||
}
|
||||
|
||||
static const LanguageBreakEngine*
|
||||
getLanguageBreakEngineFromFactory(UChar32 c)
|
||||
getLanguageBreakEngineFromFactory(UChar32 c, const char* locale)
|
||||
{
|
||||
umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
|
||||
if (gLanguageBreakFactories == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
ensureLanguageFactories(status);
|
||||
if (U_FAILURE(status)) return nullptr;
|
||||
|
||||
int32_t i = gLanguageBreakFactories->size();
|
||||
const LanguageBreakEngine *lbe = nullptr;
|
||||
while (--i >= 0) {
|
||||
LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
|
||||
lbe = factory->getEngineFor(c);
|
||||
lbe = factory->getEngineFor(c, locale);
|
||||
if (lbe != nullptr) {
|
||||
break;
|
||||
}
|
||||
|
@ -1199,7 +1204,7 @@ getLanguageBreakEngineFromFactory(UChar32 c)
|
|||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
const LanguageBreakEngine *
|
||||
RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
|
||||
RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c, const char* locale) {
|
||||
const LanguageBreakEngine *lbe = nullptr;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
|
@ -1215,14 +1220,14 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
|
|||
int32_t i = fLanguageBreakEngines->size();
|
||||
while (--i >= 0) {
|
||||
lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
|
||||
if (lbe->handles(c)) {
|
||||
if (lbe->handles(c, locale)) {
|
||||
return lbe;
|
||||
}
|
||||
}
|
||||
|
||||
// No existing dictionary took the character. See if a factory wants to
|
||||
// give us a new LanguageBreakEngine for this character.
|
||||
lbe = getLanguageBreakEngineFromFactory(c);
|
||||
lbe = getLanguageBreakEngineFromFactory(c, locale);
|
||||
|
||||
// If we got one, use it and push it on our stack.
|
||||
if (lbe != nullptr) {
|
||||
|
@ -1259,6 +1264,18 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
|
|||
return fUnhandledBreakEngine;
|
||||
}
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
// Registers an external break engine with the built-in ICU language break
// factory.
//
// @param toAdopt  The engine to adopt; it is released here if registration
//                 cannot proceed.
// @param status   In/out error code.
void U_EXPORT2 RuleBasedBreakIterator::registerExternalBreakEngine(
                  ExternalBreakEngine* toAdopt, UErrorCode& status) {
    LocalPointer<ExternalBreakEngine> engine(toAdopt, status);
    if (U_FAILURE(status)) return;
    ensureLanguageFactories(status);
    if (U_FAILURE(status)) return;
    // initLanguageFactories() assigns gICULanguageBreakFactory only when the
    // factory stack itself was allocated, and does not set an error when that
    // allocation returns nullptr - so the pointer can still be null here even
    // though status reports success.
    if (gICULanguageBreakFactory == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    gICULanguageBreakFactory->addExternalEngine(engine.orphan(), status);
}
|
||||
#endif /* U_HIDE_DRAFT_API */
|
||||
|
||||
|
||||
void RuleBasedBreakIterator::dumpCache() {
|
||||
fBreakCache->dumpCache();
|
||||
}
|
||||
|
|
|
@ -158,12 +158,13 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
|
|||
|
||||
// We now have a dictionary character. Get the appropriate language object
|
||||
// to deal with it.
|
||||
const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c);
|
||||
const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
|
||||
c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));
|
||||
|
||||
// Ask the language object if there are any breaks. It will add them to the cache and
|
||||
// leave the text pointer on the other side of its range, ready to search for the next one.
|
||||
if (lbe != nullptr) {
|
||||
foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
|
||||
foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
|
||||
}
|
||||
|
||||
// Reload the loop variables for the next go-round
|
||||
|
|
|
@ -649,6 +649,7 @@ private:
|
|||
/** @internal (private) */
|
||||
char actualLocale[ULOC_FULLNAME_CAPACITY];
|
||||
char validLocale[ULOC_FULLNAME_CAPACITY];
|
||||
char requestLocale[ULOC_FULLNAME_CAPACITY];
|
||||
};
|
||||
|
||||
#ifndef U_HIDE_DEPRECATED_API
|
||||
|
|
|
@ -43,6 +43,71 @@ class RBBIDataWrapper;
|
|||
class UnhandledEngine;
|
||||
class UStack;
|
||||
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
#if !UCONFIG_NO_SERVICE
|
||||
/**
|
||||
* The ExternalBreakEngine class define an abstract interface for the host environment
|
||||
* to provide a low level facility to break text for unicode text in script that the text boundary
|
||||
* cannot be handled by upper level rule based logic, for example, for Chinese and Japanese
|
||||
* word breaking, Thai, Khmer, Burmese, Lao and other Southeast Asian scripts.
|
||||
* The host environment implement one or more subclass of ExternalBreakEngine and
|
||||
* register them in the initialization time by calling
|
||||
* RuleBasedBreakIterator::registerExternalBreakEngine(). ICU adopt and own the engine and will
|
||||
* delete the registered external engine in proper time during the clean up
|
||||
* event.
|
||||
* @internal ICU 74 technology preview
|
||||
*/
|
||||
class ExternalBreakEngine : public UObject {
|
||||
public:
|
||||
/**
|
||||
* destructor
|
||||
* @internal ICU 74 technology preview
|
||||
*/
|
||||
virtual ~ExternalBreakEngine() {}
|
||||
|
||||
/**
|
||||
* <p>Indicate whether this engine handles a particular character when
|
||||
* the RuleBasedBreakIterator is used for a particular locale. This method is used
|
||||
* by the RuleBasedBreakIterator to find a break engine.</p>
|
||||
* @param c A character which begins a run that the engine might handle.
|
||||
* @param locale The locale.
|
||||
* @return true if this engine handles the particular character for that locale.
|
||||
* @internal ICU 74 technology preview
|
||||
*/
|
||||
virtual bool isFor(UChar32 c, const char* locale) const = 0;
|
||||
|
||||
/**
|
||||
* <p>Indicate whether this engine handles a particular character. This method is
|
||||
* used by the RuleBasedBreakIterator after it already find a break engine to see which
|
||||
* characters after the first one can be handled by this break engine.</p>
|
||||
* @param c A character that the engine might handle.
|
||||
* @return true if this engine handles the particular character.
|
||||
* @internal ICU 74 technology preview
|
||||
*/
|
||||
virtual bool handles(UChar32 c) const = 0;
|
||||
|
||||
/**
|
||||
* <p>Divide up a range of text handled by this break engine.</p>
|
||||
*
|
||||
* @param text A UText representing the text
|
||||
* @param start The start of the range of known characters
|
||||
* @param end The end of the range of known characters
|
||||
* @param foundBreaks Output of C array of int32_t break positions, or
|
||||
* nullptr
|
||||
* @param foundBreaksCapacity The capacity of foundBreaks
|
||||
* @param status Information on any errors encountered.
|
||||
* @return The number of breaks found
|
||||
* @internal ICU 74 technology preview
|
||||
*/
|
||||
virtual int32_t fillBreak(UText* text, int32_t start, int32_t end,
|
||||
int32_t* foundBreaks, int32_t foundBreaksCapacity,
|
||||
UErrorCode& status) const = 0;
|
||||
};
|
||||
#endif /* UCONFIG_NO_SERVICE */
|
||||
#endif /* U_HIDE_DRAFT_API */
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* A subclass of BreakIterator whose behavior is specified using a list of rules.
|
||||
|
@ -716,9 +781,10 @@ private:
|
|||
* This function returns the appropriate LanguageBreakEngine for a
|
||||
* given character c.
|
||||
* @param c A character in the dictionary set
|
||||
* @param locale The locale.
|
||||
* @internal (private)
|
||||
*/
|
||||
const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
|
||||
const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c, const char* locale);
|
||||
|
||||
public:
|
||||
#ifndef U_HIDE_INTERNAL_API
|
||||
|
@ -734,8 +800,26 @@ private:
|
|||
*/
|
||||
void dumpTables();
|
||||
#endif /* U_HIDE_INTERNAL_API */
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
#if !UCONFIG_NO_SERVICE
|
||||
/**
|
||||
* Register a new external break engine. The external break engine will be adopted.
|
||||
* Because ICU may choose to cache break engine internally, this must
|
||||
* be called at application startup, prior to any calls to
|
||||
* object methods of RuleBasedBreakIterator to avoid undefined behavior.
|
||||
* @param toAdopt the ExternalBreakEngine instance to be adopted
|
||||
* @param status the in/out status code, no special meanings are assigned
|
||||
* @internal ICU 74 technology preview
|
||||
*/
|
||||
static void U_EXPORT2 registerExternalBreakEngine(
|
||||
ExternalBreakEngine* toAdopt, UErrorCode& status);
|
||||
#endif /* UCONFIG_NO_SERVICE */
|
||||
#endif /* U_HIDE_DRAFT_API */
|
||||
|
||||
};
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
|
|
@ -73,7 +73,7 @@ UScriptCode getScriptFromModelName(const std::string& modelName) {
|
|||
// the model. Since by default the LSTM models are not included, all the tested
|
||||
// models need to be included under source/test/testdata.
|
||||
|
||||
void LSTMBETest::runTestFromFile(const char* filename) {
|
||||
void LSTMBETest::runTestFromFile(const char* filename, const char* locale) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
LocalPointer<const LanguageBreakEngine> engine;
|
||||
// Open and read the test data file.
|
||||
|
@ -123,7 +123,7 @@ void LSTMBETest::runTestFromFile(const char* filename) {
|
|||
caseNum++;
|
||||
bool canHandleAllChars = true;
|
||||
for (int32_t i = 0; i < value.length(); i++) {
|
||||
if (!engine->handles(value.charAt(i))) {
|
||||
if (!engine->handles(value.charAt(i), locale)) {
|
||||
errln(UnicodeString("Test Case#") + caseNum + " contains char '" +
|
||||
UnicodeString(value.charAt(i)) +
|
||||
"' cannot be handled by the engine in offset " + i + "\n" + line);
|
||||
|
@ -200,15 +200,15 @@ void LSTMBETest::runTestFromFile(const char* filename) {
|
|||
}
|
||||
|
||||
void LSTMBETest::TestThaiGraphclust() {
|
||||
runTestFromFile("Thai_graphclust_model4_heavy_Test.txt");
|
||||
runTestFromFile("Thai_graphclust_model4_heavy_Test.txt", "th");
|
||||
}
|
||||
|
||||
void LSTMBETest::TestThaiCodepoints() {
|
||||
runTestFromFile("Thai_codepoints_exclusive_model5_heavy_Test.txt");
|
||||
runTestFromFile("Thai_codepoints_exclusive_model5_heavy_Test.txt", "th");
|
||||
}
|
||||
|
||||
void LSTMBETest::TestBurmeseGraphclust() {
|
||||
runTestFromFile("Burmese_graphclust_model5_heavy_Test.txt");
|
||||
runTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", "my");
|
||||
}
|
||||
|
||||
const LanguageBreakEngine* LSTMBETest::createEngineFromTestData(
|
||||
|
|
|
@ -40,7 +40,7 @@ public:
|
|||
|
||||
private:
|
||||
const LanguageBreakEngine* createEngineFromTestData(const char* model, UScriptCode script, UErrorCode& status);
|
||||
void runTestFromFile(const char* filename);
|
||||
void runTestFromFile(const char* filename, const char* locale);
|
||||
void runTestWithLargeMemory(const char* model, UScriptCode script);
|
||||
|
||||
// Test parameters, from the test framework and test invocation.
|
||||
|
|
|
@ -142,6 +142,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
TESTCASE_AUTO(TestLSTMThai);
|
||||
TESTCASE_AUTO(TestLSTMBurmese);
|
||||
TESTCASE_AUTO(TestRandomAccess);
|
||||
TESTCASE_AUTO(TestExternalBreakEngineWithFakeTaiLe);
|
||||
TESTCASE_AUTO(TestExternalBreakEngineWithFakeYue);
|
||||
|
||||
#if U_ENABLE_TRACING
|
||||
TESTCASE_AUTO(TestTraceCreateCharacter);
|
||||
|
@ -5667,4 +5669,192 @@ void RBBITest::TestRandomAccess() {
|
|||
}
|
||||
}
|
||||
|
||||
// A Fake Tai Le break engine which handle Unicode Tai Le (Tale) block
|
||||
// https://unicode.org/charts/PDF/U1950.pdf
|
||||
// U+1950 - U+197F and always break after Tone letters (U+1970-U+1974)
|
||||
class FakeTaiLeBreakEngine : public ExternalBreakEngine {
|
||||
public:
|
||||
FakeTaiLeBreakEngine() : block(0x1950, 0x197f), tones(0x1970, 0x1974) {
|
||||
}
|
||||
virtual ~FakeTaiLeBreakEngine() {
|
||||
}
|
||||
virtual bool isFor(UChar32 c, const char* /* locale */) const override {
|
||||
// We implement this for every locale, i.e. we do not return false for any language
|
||||
// here.
|
||||
return handles(c);
|
||||
}
|
||||
virtual bool handles(UChar32 c) const override {
|
||||
return block.contains(c);
|
||||
}
|
||||
virtual int32_t fillBreak(UText* text, int32_t start, int32_t end,
|
||||
int32_t* foundBreaks, int32_t foundBreaksCapacity,
|
||||
UErrorCode& status) const override {
|
||||
if (U_FAILURE(status)) return 0;
|
||||
int32_t i = 0;
|
||||
// Save the state of the utext
|
||||
int64_t savedIndex = utext_getNativeIndex(text);
|
||||
if (savedIndex != start) {
|
||||
utext_setNativeIndex(text, start);
|
||||
}
|
||||
int32_t current;
|
||||
while((current = (int32_t)utext_getNativeIndex(text)) < end) {
|
||||
UChar32 c = utext_current32(text);
|
||||
// Break after tone marks as a fake break point.
|
||||
if (tones.contains(c)) {
|
||||
if (i >= foundBreaksCapacity) {
|
||||
status = U_BUFFER_OVERFLOW_ERROR;
|
||||
utext_setNativeIndex(text, savedIndex);
|
||||
return i;
|
||||
}
|
||||
foundBreaks[i++] = current;
|
||||
}
|
||||
UTEXT_NEXT32(text);
|
||||
}
|
||||
// Restore the utext
|
||||
if (savedIndex != current) {
|
||||
utext_setNativeIndex(text, savedIndex);
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
private:
|
||||
UnicodeSet block;
|
||||
UnicodeSet tones;
|
||||
};
|
||||
|
||||
// A Fake Yue Break Engine which handle CJK Unified Ideographs
|
||||
// block (U+4E00-U+9FFF) when locale start with 'yue' and break
|
||||
// after every character.
|
||||
class FakeYueBreakEngine : public ExternalBreakEngine {
|
||||
public:
|
||||
FakeYueBreakEngine() : block(0x4e00, 0x9FFF) {
|
||||
}
|
||||
virtual ~FakeYueBreakEngine() {
|
||||
}
|
||||
virtual bool isFor(UChar32 c, const char* locale) const override {
|
||||
// We implmement this for any locale starts with "yue" such as
|
||||
// "yue", "yue-CN", "yue-Hant-CN", etc.
|
||||
return handles(c) && uprv_strncmp("yue", locale, 3) == 0;
|
||||
}
|
||||
virtual bool handles(UChar32 c) const override {
|
||||
return block.contains(c);
|
||||
}
|
||||
virtual int32_t fillBreak(UText* text, int32_t start, int32_t end,
|
||||
int32_t* foundBreaks, int32_t foundBreaksCapacity,
|
||||
UErrorCode& status) const override {
|
||||
(void)text;
|
||||
if (U_FAILURE(status)) return 0;
|
||||
int32_t i = 0;
|
||||
int32_t current = start;
|
||||
while (current++ < end) {
|
||||
// A fake word segmentation by breaking every two Unicode.
|
||||
if ((current - start) % 2 == 0) {
|
||||
if (i >= foundBreaksCapacity) {
|
||||
status = U_BUFFER_OVERFLOW_ERROR;
|
||||
return i;
|
||||
}
|
||||
foundBreaks[i++] = current;
|
||||
}
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
private:
|
||||
UnicodeSet block;
|
||||
};
|
||||
|
||||
void RBBITest::TestExternalBreakEngineWithFakeYue() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeString text(u"a bc def一兩年前佢真係唔鍾意畀我影相i jk lmn");
|
||||
|
||||
std::vector<int32_t> actual1;
|
||||
{
|
||||
LocalPointer<BreakIterator> bi1(
|
||||
BreakIterator::createWordInstance(Locale::getRoot(), status),
|
||||
status);
|
||||
bi1->setText(text);
|
||||
assertTrue(WHERE "BreakIterator::createWordInstance( root )",
|
||||
U_SUCCESS(status));
|
||||
|
||||
do {
|
||||
actual1.push_back(bi1->current());
|
||||
} while(bi1->next() != BreakIterator::DONE);
|
||||
}
|
||||
|
||||
std::vector<int32_t> expected1({{ 0, 1, 2, 4, 5, 8, 10, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 30}});
|
||||
assertTrue("root break Yue as Chinese", expected1 == actual1);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
RuleBasedBreakIterator::registerExternalBreakEngine(
|
||||
new FakeYueBreakEngine(), status);
|
||||
assertTrue(WHERE "registerExternalBreakEngine w FakeYueBreakEngine",
|
||||
U_SUCCESS(status));
|
||||
|
||||
std::vector<int32_t> actual2;
|
||||
{
|
||||
status = U_ZERO_ERROR;
|
||||
LocalPointer<BreakIterator> bi2(
|
||||
BreakIterator::createWordInstance(Locale("yue"), status), status);
|
||||
assertTrue(WHERE "BreakIterator::createWordInstance( yue )",
|
||||
U_SUCCESS(status));
|
||||
bi2->setText(text);
|
||||
do {
|
||||
actual2.push_back(bi2->current());
|
||||
} while(bi2->next() != BreakIterator::DONE);
|
||||
}
|
||||
std::vector<int32_t> expected2({{ 0, 1, 2, 4, 5, 8, 10, 12, 14, 16, 18, 20,
|
||||
22, 23, 24, 26, 27, 30}});
|
||||
assertTrue(WHERE "break Yue by Fake external breaker",
|
||||
expected2 == actual2);
|
||||
}
|
||||
|
||||
void RBBITest::TestExternalBreakEngineWithFakeTaiLe() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeString text(
|
||||
u"a bc defᥛᥫᥒᥰᥖᥭᥰᥞᥝᥰᥙᥥᥢᥛᥫᥒᥰᥑᥩᥢᥲᥔᥣᥝᥴᥓᥬᥖᥩᥢᥲᥛᥣᥝᥱᥙᥝᥱᥙᥤᥱᥓᥣᥒᥛᥣᥰᥓᥧ"
|
||||
u"ᥰᥘᥩᥰᥗᥪᥒᥴᥛᥣᥰᥘᥬᥰᥝᥣᥱᥘᥒᥱᥔᥣᥛᥴᥘᥫᥢi jk lmn");
|
||||
|
||||
std::vector<int32_t> actual1;
|
||||
{
|
||||
LocalPointer<BreakIterator> bi1(
|
||||
BreakIterator::createLineInstance(Locale::getRoot(), status),
|
||||
status);
|
||||
bi1->setText(text);
|
||||
assertTrue(WHERE "BreakIterator::createLineInstance( root )",
|
||||
U_SUCCESS(status));
|
||||
|
||||
do {
|
||||
actual1.push_back(bi1->current());
|
||||
} while(bi1->next() != BreakIterator::DONE);
|
||||
}
|
||||
|
||||
std::vector<int32_t> expected1({{
|
||||
0, 2, 5, 86, 89, 92 }});
|
||||
assertTrue(WHERE "root break Tai Le", expected1 == actual1);
|
||||
|
||||
RuleBasedBreakIterator::registerExternalBreakEngine(
|
||||
new FakeTaiLeBreakEngine(), status);
|
||||
assertTrue(WHERE "registerExternalBreakEngine w FakeTaiLeBreakEngine",
|
||||
U_SUCCESS(status));
|
||||
|
||||
std::vector<int32_t> actual2;
|
||||
{
|
||||
status = U_ZERO_ERROR;
|
||||
LocalPointer<BreakIterator> bi2(
|
||||
BreakIterator::createLineInstance(Locale("tdd"), status), status);
|
||||
assertTrue(WHERE "BreakIterator::createLineInstance( tdd )",
|
||||
U_SUCCESS(status));
|
||||
bi2->setText(text);
|
||||
do {
|
||||
actual2.push_back(bi2->current());
|
||||
} while(bi2->next() != BreakIterator::DONE);
|
||||
}
|
||||
std::vector<int32_t> expected2({{
|
||||
0, 2, 5, 11, 14, 17, 24, 28, 32, 38, 42, 45, 48, 54, 57, 60, 64, 67,
|
||||
70, 73, 76, 80, 86, 89, 92}});
|
||||
assertTrue("break Tai Le by Fake external breaker",
|
||||
expected2 == actual2);
|
||||
}
|
||||
|
||||
#endif // #if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
|
|
@ -96,6 +96,8 @@ public:
|
|||
void TestLSTMThai();
|
||||
void TestLSTMBurmese();
|
||||
void TestRandomAccess();
|
||||
// Line-break test that registers a fake external break engine for Tai Le text.
void TestExternalBreakEngineWithFakeTaiLe();
|
||||
// Word-break test that registers a fake external break engine for the "yue" locale.
void TestExternalBreakEngineWithFakeYue();
|
||||
|
||||
#if U_ENABLE_TRACING
|
||||
void TestTraceCreateCharacter();
|
||||
|
|
Loading…
Add table
Reference in a new issue