From 02d5e7190305deae8adf71da4e10710f1bc391e7 Mon Sep 17 00:00:00 2001
From: Frank Tang
Date: Fri, 18 Aug 2023 17:23:09 +0000
Subject: [PATCH] ICU-22342 Implement ExternalBreakEngineAPI
ICU-22342 Fix comments
---
icu4c/source/common/brkeng.cpp | 121 +++++++++++----
icu4c/source/common/brkeng.h | 59 ++++++-
icu4c/source/common/brkiter.cpp | 14 +-
icu4c/source/common/dictbe.cpp | 4 +-
icu4c/source/common/dictbe.h | 3 +-
icu4c/source/common/rbbi.cpp | 51 ++++--
icu4c/source/common/rbbi_cache.cpp | 5 +-
icu4c/source/common/unicode/brkiter.h | 1 +
icu4c/source/common/unicode/rbbi.h | 86 +++++++++-
icu4c/source/test/intltest/lstmbetst.cpp | 10 +-
icu4c/source/test/intltest/lstmbetst.h | 2 +-
icu4c/source/test/intltest/rbbitst.cpp | 190 +++++++++++++++++++++++
icu4c/source/test/intltest/rbbitst.h | 2 +
13 files changed, 484 insertions(+), 64 deletions(-)
diff --git a/icu4c/source/common/brkeng.cpp b/icu4c/source/common/brkeng.cpp
index ce3d09cf23b..c8442310b8b 100644
--- a/icu4c/source/common/brkeng.cpp
+++ b/icu4c/source/common/brkeng.cpp
@@ -21,6 +21,7 @@
#include "unicode/uscript.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
+#include "unicode/rbbi.h"
#include "brkeng.h"
#include "cmemory.h"
@@ -70,19 +71,21 @@ UnhandledEngine::~UnhandledEngine() {
}
UBool
-UnhandledEngine::handles(UChar32 c) const {
+UnhandledEngine::handles(UChar32 c, const char* locale) const {
+ (void)locale; // Unused
return fHandled && fHandled->contains(c);
}
int32_t
UnhandledEngine::findBreaks( UText *text,
- int32_t /* startPos */,
+ int32_t startPos,
int32_t endPos,
UVector32 &/*foundBreaks*/,
UBool /* isPhraseBreaking */,
UErrorCode &status) const {
if (U_FAILURE(status)) return 0;
- UChar32 c = utext_current32(text);
+ utext_setNativeIndex(text, startPos);
+ UChar32 c = utext_current32(text);
while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
utext_next32(text); // TODO: recast loop to work with post-increment operations.
c = utext_current32(text);
@@ -120,41 +123,39 @@ ICULanguageBreakFactory::~ICULanguageBreakFactory() {
}
}
-U_NAMESPACE_END
-U_CDECL_BEGIN
-static void U_CALLCONV _deleteEngine(void *obj) {
- delete (const icu::LanguageBreakEngine *) obj;
+// Creates the fEngines stack if it does not exist yet.
+// The stack is constructed with uprv_deleteUObject as its element deleter.
+// fEngines is only assigned when status reports success; otherwise it is
+// left unchanged (possibly still nullptr), so callers must check status.
+void ICULanguageBreakFactory::ensureEngines(UErrorCode& status) {
+ // NOTE(review): this mutex is a function-local static, so it is a different
+ // object from the identically named mutexes declared inside getEngineFor()
+ // and addExternalEngine() - confirm whether they are meant to exclude each other.
+ static UMutex gBreakEngineMutex;
+ Mutex m(&gBreakEngineMutex);
+ if (fEngines == nullptr) {
+ LocalPointer engines(new UStack(uprv_deleteUObject, nullptr, status), status);
+ if (U_SUCCESS(status)) {
+ // Release ownership from the LocalPointer only on success.
+ fEngines = engines.orphan();
+ }
+ }
}
-U_CDECL_END
-U_NAMESPACE_BEGIN
const LanguageBreakEngine *
-ICULanguageBreakFactory::getEngineFor(UChar32 c) {
+ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) {
const LanguageBreakEngine *lbe = nullptr;
UErrorCode status = U_ZERO_ERROR;
+ ensureEngines(status);
+ if (U_FAILURE(status) ) {
+ // Note: no way to return error code to caller.
+ return nullptr;
+ }
static UMutex gBreakEngineMutex;
Mutex m(&gBreakEngineMutex);
-
- if (fEngines == nullptr) {
- LocalPointer engines(new UStack(_deleteEngine, nullptr, status), status);
- if (U_FAILURE(status) ) {
- // Note: no way to return error code to caller.
- return nullptr;
- }
- fEngines = engines.orphan();
- } else {
- int32_t i = fEngines->size();
- while (--i >= 0) {
- lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
- if (lbe != nullptr && lbe->handles(c)) {
- return lbe;
- }
+ int32_t i = fEngines->size();
+ while (--i >= 0) {
+ lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
+ if (lbe != nullptr && lbe->handles(c, locale)) {
+ return lbe;
}
}
-
+
// We didn't find an engine. Create one.
- lbe = loadEngineFor(c);
+ lbe = loadEngineFor(c, locale);
if (lbe != nullptr) {
fEngines->push((void *)lbe, status);
}
@@ -162,7 +163,7 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c) {
}
const LanguageBreakEngine *
-ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
+ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
UErrorCode status = U_ZERO_ERROR;
UScriptCode code = uscript_getScript(c, &status);
if (U_SUCCESS(status)) {
@@ -299,6 +300,70 @@ ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
return nullptr;
}
+
+// Adopts `external`, wraps it in a BreakEngineWrapper and pushes the wrapper
+// onto fEngines so that getEngineFor() can find it.
+//
+// @param external The engine to adopt; the caller gives up ownership.
+// @param status   In/out error code. On failure nothing is added to fEngines.
+void ICULanguageBreakFactory::addExternalEngine(
+        ExternalBreakEngine* external, UErrorCode& status) {
+    // Take ownership first so `external` is released on every early return.
+    LocalPointer<ExternalBreakEngine> engine(external, status);
+    ensureEngines(status);
+    if (U_FAILURE(status)) {
+        // ensureEngines() leaves fEngines untouched on failure, so it may
+        // still be nullptr here; pushing would dereference it.
+        return;
+    }
+    LocalPointer<BreakEngineWrapper> wrapper(
+        new BreakEngineWrapper(engine.orphan(), status), status);
+    if (U_FAILURE(status)) {
+        // Allocation or construction failed; `wrapper` cleans up whatever exists.
+        return;
+    }
+    // NOTE(review): function-local mutex, distinct from the ones declared in
+    // ensureEngines() and getEngineFor() - confirm the intended locking scope.
+    static UMutex gBreakEngineMutex;
+    Mutex m(&gBreakEngineMutex);
+    fEngines->push(wrapper.getAlias(), status);
+    // fEngines was created with a deleter and now owns the wrapper.
+    // NOTE(review): presumably push() also disposes of the element when it
+    // fails, as the unconditional orphan() here relies on - confirm.
+    wrapper.orphan();
+}
+
+// Stores `engine` in the `delegate` LocalPointer member, passing `status`
+// through to the LocalPointer constructor. The body does no other work.
+BreakEngineWrapper::BreakEngineWrapper(
+ ExternalBreakEngine* engine, UErrorCode &status) : delegate(engine, status) {
+}
+
+// Empty destructor body; the `delegate` member is destroyed implicitly.
+BreakEngineWrapper::~BreakEngineWrapper() {
+}
+
+// Forwards to the external engine's isFor(c, locale). Note that this is the
+// locale-aware query, not ExternalBreakEngine::handles(c), which findBreaks()
+// uses to extend a run once the engine has been chosen.
+UBool BreakEngineWrapper::handles(UChar32 c, const char* locale) const {
+ return delegate->isFor(c, locale);
+}
+
+// Finds the run of text starting at startPos that the external engine
+// handles, asks the engine for the breaks inside that run, and appends them
+// to foundBreaks.
+//
+// @param text        The text to scan; on success its index is left at the
+//                    end of the run that was handled.
+// @param startPos    Native index at which the run begins.
+// @param endPos      Native index the run must not extend past.
+// @param foundBreaks Receives the break positions, appended after any
+//                    elements it already holds.
+// @param status      In/out error code.
+// @return The number of breaks appended, or 0 on failure.
+int32_t BreakEngineWrapper::findBreaks(
+    UText *text,
+    int32_t startPos,
+    int32_t endPos,
+    UVector32 &foundBreaks,
+    UBool /* isPhraseBreaking */,
+    UErrorCode &status) const {
+    if (U_FAILURE(status)) return 0;
+    int32_t result = 0;
+
+    // Find the span of characters handled by the external engine.
+    // The span begins at startPos and extends forward until endPos or the
+    // first character the engine does not handle, whichever comes first.
+
+    utext_setNativeIndex(text, startPos);
+    int32_t start = (int32_t)utext_getNativeIndex(text);
+    int32_t current;
+    int32_t rangeStart;
+    int32_t rangeEnd;
+    UChar32 c = utext_current32(text);
+    while((current = (int32_t)utext_getNativeIndex(text)) < endPos && delegate->handles(c)) {
+        utext_next32(text); // TODO: recast loop for postincrement
+        c = utext_current32(text);
+    }
+    rangeStart = start;
+    rangeEnd = current;
+    int32_t beforeSize = foundBreaks.size();
+    int32_t additionalCapacity = rangeEnd - rangeStart + 1;
+    // Enlarge so that (rangeEnd-rangeStart+1) more items fit.
+    foundBreaks.ensureCapacity(beforeSize+additionalCapacity, status);
+    if (U_FAILURE(status)) return 0;
+    // Expose exactly the reserved tail so the engine can write into the raw
+    // buffer. (Previously beforeSize was added twice here, growing the vector
+    // beyond the capacity that had just been reserved.)
+    foundBreaks.setSize(beforeSize + additionalCapacity);
+    result = delegate->fillBreak(text, rangeStart, rangeEnd, foundBreaks.getBuffer()+beforeSize,
+                                 additionalCapacity, status);
+    if (U_FAILURE(status)) {
+        // Drop the scratch slots so no unfilled entries are left behind.
+        foundBreaks.setSize(beforeSize);
+        return 0;
+    }
+    // Shrink to the number of breaks the engine actually produced.
+    foundBreaks.setSize(beforeSize + result);
+    utext_setNativeIndex(text, current);
+    return result;
+}
+
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
diff --git a/icu4c/source/common/brkeng.h b/icu4c/source/common/brkeng.h
index 240dc8f4d34..42a3d697cfe 100644
--- a/icu4c/source/common/brkeng.h
+++ b/icu4c/source/common/brkeng.h
@@ -10,6 +10,7 @@
#ifndef BRKENG_H
#define BRKENG_H
+#include "unicode/umisc.h"
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/utext.h"
@@ -21,6 +22,7 @@ class UnicodeSet;
class UStack;
class UVector32;
class DictionaryMatcher;
+class ExternalBreakEngine;
/*******************************************************************
* LanguageBreakEngine
@@ -35,7 +37,7 @@ class DictionaryMatcher;
* LanguageBreakEngines should normally be implemented so as to
* be shared between threads without locking.
*/
-class LanguageBreakEngine : public UMemory {
+class LanguageBreakEngine : public UObject {
public:
/**
@@ -54,10 +56,11 @@ class LanguageBreakEngine : public UMemory {
* a particular kind of break.
*
* @param c A character which begins a run that the engine might handle
+ * @param locale The locale.
* @return true if this engine handles the particular character and break
* type.
*/
- virtual UBool handles(UChar32 c) const = 0;
+ virtual UBool handles(UChar32 c, const char* locale) const = 0;
/**
* Find any breaks within a run in the supplied text.
@@ -80,6 +83,35 @@ class LanguageBreakEngine : public UMemory {
};
+/*******************************************************************
+ * BreakEngineWrapper
+ */
+
+/**
+ * BreakEngineWrapper implement LanguageBreakEngine by
+ * a thin wrapper that delegate the task to ExternalBreakEngine
+ *
+ */
+class BreakEngineWrapper : public LanguageBreakEngine {
+ public:
+
+    /**
+     * Wrap an external engine.
+     *
+     * @param engine The ExternalBreakEngine to delegate to; it is stored in
+     *               a LocalPointer member of this wrapper.
+     * @param status Information on any errors encountered.
+     */
+    BreakEngineWrapper(ExternalBreakEngine* engine, UErrorCode &status);
+
+    virtual ~BreakEngineWrapper();
+
+    /**
+     * Ask the external engine whether it is for the given character and
+     * locale (delegates to ExternalBreakEngine::isFor).
+     *
+     * @param c A character which begins a run that the engine might handle
+     * @param locale The locale.
+     * @return true if the external engine accepts the character for the locale.
+     */
+    virtual UBool handles(UChar32 c, const char* locale) const override;
+
+    /**
+     * Find the run starting at startPos that the external engine handles and
+     * append the breaks it reports to foundBreaks.
+     *
+     * @param text A UText representing the text.
+     * @param startPos The start of the run within the supplied text.
+     * @param endPos The end of the run within the supplied text.
+     * @param foundBreaks A vector of int32_t to receive the breaks.
+     * @param isPhraseBreaking Not used by this implementation.
+     * @param status Information on any errors encountered.
+     * @return The number of breaks found.
+     */
+    virtual int32_t findBreaks( UText *text,
+                                int32_t startPos,
+                                int32_t endPos,
+                                UVector32 &foundBreaks,
+                                UBool isPhraseBreaking,
+                                UErrorCode &status) const override;
+
+ private:
+    /** The wrapped external engine. */
+    LocalPointer<ExternalBreakEngine> delegate;
+};
+
/*******************************************************************
* LanguageBreakFactory
*/
@@ -125,9 +157,10 @@ class LanguageBreakFactory : public UMemory {
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
+ * @param locale The locale.
* @return A LanguageBreakEngine with the desired characteristics, or 0.
*/
- virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0;
+ virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) = 0;
};
@@ -174,10 +207,11 @@ class UnhandledEngine : public LanguageBreakEngine {
* a particular kind of break.
*
* @param c A character which begins a run that the engine might handle
+ * @param locale The locale.
* @return true if this engine handles the particular character and break
* type.
*/
- virtual UBool handles(UChar32 c) const override;
+ virtual UBool handles(UChar32 c, const char* locale) const override;
/**
* Find any breaks within a run in the supplied text.
@@ -247,9 +281,18 @@ class ICULanguageBreakFactory : public LanguageBreakFactory {
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
+ * @param locale The locale.
* @return A LanguageBreakEngine with the desired characteristics, or 0.
*/
- virtual const LanguageBreakEngine *getEngineFor(UChar32 c) override;
+ virtual const LanguageBreakEngine *getEngineFor(UChar32 c, const char* locale) override;
+
+ /**
+ * Add and adopt the engine. No value is returned.
+ * @param engine The ExternalBreakEngine to be added and adopted. The caller
+ * passes ownership and should not release the memory after this call.
+ * @param status the error code.
+ */
+ virtual void addExternalEngine(ExternalBreakEngine* engine, UErrorCode& status);
protected:
/**
@@ -258,9 +301,10 @@ protected:
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
+ * @param locale The locale.
* @return A LanguageBreakEngine with the desired characteristics, or 0.
*/
- virtual const LanguageBreakEngine *loadEngineFor(UChar32 c);
+ virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, const char* locale);
/**
* Create a DictionaryMatcher for the specified script and break type.
@@ -269,6 +313,9 @@ protected:
* @return A DictionaryMatcher with the desired characteristics, or nullptr.
*/
virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
+
+ private:
+ void ensureEngines(UErrorCode& status);
};
U_NAMESPACE_END
diff --git a/icu4c/source/common/brkiter.cpp b/icu4c/source/common/brkiter.cpp
index 41e4e0dff57..b452cf2c050 100644
--- a/icu4c/source/common/brkiter.cpp
+++ b/icu4c/source/common/brkiter.cpp
@@ -27,6 +27,7 @@
#include "unicode/rbbi.h"
#include "unicode/brkiter.h"
#include "unicode/udata.h"
+#include "unicode/uloc.h"
#include "unicode/ures.h"
#include "unicode/ustring.h"
#include "unicode/filteredbrk.h"
@@ -121,8 +122,11 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st
// If there is a result, set the valid locale and actual locale, and the kind
if (U_SUCCESS(status) && result != nullptr) {
U_LOCALE_BASED(locBased, *(BreakIterator*)result);
+
locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status),
actualLocale.data());
+ uprv_strncpy(result->requestLocale, loc.getName(), ULOC_FULLNAME_CAPACITY);
+ result->requestLocale[ULOC_FULLNAME_CAPACITY-1] = 0; // always terminate
}
ures_close(b);
@@ -202,18 +206,20 @@ BreakIterator::getAvailableLocales(int32_t& count)
BreakIterator::BreakIterator()
{
- *validLocale = *actualLocale = 0;
+ *validLocale = *actualLocale = *requestLocale = 0;
}
BreakIterator::BreakIterator(const BreakIterator &other) : UObject(other) {
uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
+ uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
}
BreakIterator &BreakIterator::operator =(const BreakIterator &other) {
if (this != &other) {
uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
+ uprv_strncpy(requestLocale, other.requestLocale, sizeof(requestLocale));
}
return *this;
}
@@ -493,12 +499,18 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
Locale
BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
+ if (type == ULOC_REQUESTED_LOCALE) {
+ return Locale(requestLocale);
+ }
U_LOCALE_BASED(locBased, *this);
return locBased.getLocale(type, status);
}
const char *
BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
+ if (type == ULOC_REQUESTED_LOCALE) {
+ return requestLocale;
+ }
U_LOCALE_BASED(locBased, *this);
return locBased.getLocaleID(type, status);
}
diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp
index 0e420c67c5d..3d672c03bfb 100644
--- a/icu4c/source/common/dictbe.cpp
+++ b/icu4c/source/common/dictbe.cpp
@@ -42,7 +42,7 @@ DictionaryBreakEngine::~DictionaryBreakEngine() {
}
UBool
-DictionaryBreakEngine::handles(UChar32 c) const {
+DictionaryBreakEngine::handles(UChar32 c, const char*) const {
return fSet.contains(c);
}
@@ -54,13 +54,13 @@ DictionaryBreakEngine::findBreaks( UText *text,
UBool isPhraseBreaking,
UErrorCode& status) const {
if (U_FAILURE(status)) return 0;
- (void)startPos; // TODO: remove this param?
int32_t result = 0;
// Find the span of characters included in the set.
// The span to break begins at the current position in the text, and
// extends towards the start or end of the text, depending on 'reverse'.
+ utext_setNativeIndex(text, startPos);
int32_t start = (int32_t)utext_getNativeIndex(text);
int32_t current;
int32_t rangeStart;
diff --git a/icu4c/source/common/dictbe.h b/icu4c/source/common/dictbe.h
index a2c761bdc3a..e512071fa45 100644
--- a/icu4c/source/common/dictbe.h
+++ b/icu4c/source/common/dictbe.h
@@ -62,10 +62,11 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
* a particular kind of break.
*
* @param c A character which begins a run that the engine might handle
+ * @param locale The locale.
* @return true if this engine handles the particular character and break
* type.
*/
- virtual UBool handles(UChar32 c) const override;
+ virtual UBool handles(UChar32 c, const char* locale) const override;
/**
* Find any breaks within a run in the supplied text.
diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp
index 73716ab4066..599279fb72b 100644
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@@ -1125,6 +1125,7 @@ static icu::UStack *gLanguageBreakFactories = nullptr;
static const icu::UnicodeString *gEmptyString = nullptr;
static icu::UInitOnce gLanguageBreakFactoriesInitOnce {};
static icu::UInitOnce gRBBIInitOnce {};
+static icu::ICULanguageBreakFactory *gICULanguageBreakFactory = nullptr;
/**
* Release all static memory held by breakiterator.
@@ -1153,37 +1154,41 @@ static void U_CALLCONV rbbiInit() {
ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
}
-static void U_CALLCONV initLanguageFactories() {
- UErrorCode status = U_ZERO_ERROR;
+static void U_CALLCONV initLanguageFactories(UErrorCode& status) {
U_ASSERT(gLanguageBreakFactories == nullptr);
gLanguageBreakFactories = new UStack(_deleteFactory, nullptr, status);
if (gLanguageBreakFactories != nullptr && U_SUCCESS(status)) {
- ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
- gLanguageBreakFactories->push(builtIn, status);
+ LocalPointer factory(new ICULanguageBreakFactory(status), status);
+ if (U_SUCCESS(status)) {
+ gICULanguageBreakFactory = factory.orphan();
+ gLanguageBreakFactories->push(gICULanguageBreakFactory, status);
#ifdef U_LOCAL_SERVICE_HOOK
- LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
- if (extra != nullptr) {
- gLanguageBreakFactories->push(extra, status);
- }
+ LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
+ if (extra != nullptr) {
+ gLanguageBreakFactories->push(extra, status);
+ }
#endif
+ }
}
ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
}
+// Runs initLanguageFactories() through umtx_initOnce() on
+// gLanguageBreakFactoriesInitOnce, passing `status` through so that the
+// caller can see whether the initialization reported an error.
+void ensureLanguageFactories(UErrorCode& status) {
+ umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories, status);
+}
static const LanguageBreakEngine*
-getLanguageBreakEngineFromFactory(UChar32 c)
+getLanguageBreakEngineFromFactory(UChar32 c, const char* locale)
{
- umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
- if (gLanguageBreakFactories == nullptr) {
- return nullptr;
- }
+ UErrorCode status = U_ZERO_ERROR;
+ ensureLanguageFactories(status);
+ if (U_FAILURE(status)) return nullptr;
int32_t i = gLanguageBreakFactories->size();
const LanguageBreakEngine *lbe = nullptr;
while (--i >= 0) {
LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
- lbe = factory->getEngineFor(c);
+ lbe = factory->getEngineFor(c, locale);
if (lbe != nullptr) {
break;
}
@@ -1199,7 +1204,7 @@ getLanguageBreakEngineFromFactory(UChar32 c)
//
//-------------------------------------------------------------------------------
const LanguageBreakEngine *
-RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
+RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c, const char* locale) {
const LanguageBreakEngine *lbe = nullptr;
UErrorCode status = U_ZERO_ERROR;
@@ -1215,14 +1220,14 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
int32_t i = fLanguageBreakEngines->size();
while (--i >= 0) {
lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
- if (lbe->handles(c)) {
+ if (lbe->handles(c, locale)) {
return lbe;
}
}
// No existing dictionary took the character. See if a factory wants to
// give us a new LanguageBreakEngine for this character.
- lbe = getLanguageBreakEngineFromFactory(c);
+ lbe = getLanguageBreakEngineFromFactory(c, locale);
// If we got one, use it and push it on our stack.
if (lbe != nullptr) {
@@ -1259,6 +1264,18 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
return fUnhandledBreakEngine;
}
+#ifndef U_HIDE_DRAFT_API
+// Adopts `toAdopt` and hands it to the built-in ICULanguageBreakFactory.
+// Returns early, leaving `status` set, if adoption or the factory
+// initialization reports a failure.
+// NOTE(review): the declaration in unicode/rbbi.h is additionally guarded by
+// `#if !UCONFIG_NO_SERVICE`; this definition is only guarded by
+// U_HIDE_DRAFT_API - confirm the guards are meant to differ.
+void U_EXPORT2 RuleBasedBreakIterator::registerExternalBreakEngine(
+ ExternalBreakEngine* toAdopt, UErrorCode& status) {
+ // Hold the engine in a LocalPointer until ownership is passed on below.
+ LocalPointer engine(toAdopt, status);
+ if (U_FAILURE(status)) return;
+ // Make sure gICULanguageBreakFactory has been created before using it.
+ ensureLanguageFactories(status);
+ if (U_FAILURE(status)) return;
+ gICULanguageBreakFactory->addExternalEngine(engine.orphan(), status);
+}
+#endif /* U_HIDE_DRAFT_API */
+
+
void RuleBasedBreakIterator::dumpCache() {
fBreakCache->dumpCache();
}
diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp
index 02ca555a890..f7a283f69e4 100644
--- a/icu4c/source/common/rbbi_cache.cpp
+++ b/icu4c/source/common/rbbi_cache.cpp
@@ -158,12 +158,13 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
// We now have a dictionary character. Get the appropriate language object
// to deal with it.
- const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(c);
+ const LanguageBreakEngine *lbe = fBI->getLanguageBreakEngine(
+ c, fBI->getLocaleID(ULOC_REQUESTED_LOCALE, status));
// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
if (lbe != nullptr) {
- foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
+ foundBreakCount += lbe->findBreaks(text, current, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
}
// Reload the loop variables for the next go-round
diff --git a/icu4c/source/common/unicode/brkiter.h b/icu4c/source/common/unicode/brkiter.h
index 108652799e6..1b10e6ef116 100644
--- a/icu4c/source/common/unicode/brkiter.h
+++ b/icu4c/source/common/unicode/brkiter.h
@@ -649,6 +649,7 @@ private:
/** @internal (private) */
char actualLocale[ULOC_FULLNAME_CAPACITY];
char validLocale[ULOC_FULLNAME_CAPACITY];
+ char requestLocale[ULOC_FULLNAME_CAPACITY];
};
#ifndef U_HIDE_DEPRECATED_API
diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h
index 418b52e41f4..c137ac5c7a8 100644
--- a/icu4c/source/common/unicode/rbbi.h
+++ b/icu4c/source/common/unicode/rbbi.h
@@ -43,6 +43,71 @@ class RBBIDataWrapper;
class UnhandledEngine;
class UStack;
+
+#ifndef U_HIDE_DRAFT_API
+#if !UCONFIG_NO_SERVICE
+/**
+ * The ExternalBreakEngine class defines an abstract interface through which the host
+ * environment can provide a low-level facility to break text in scripts whose text boundaries
+ * cannot be handled by the upper-level rule-based logic, for example Chinese and Japanese
+ * word breaking, or Thai, Khmer, Burmese, Lao and other Southeast Asian scripts.
+ * The host environment implements one or more subclasses of ExternalBreakEngine and
+ * registers them at initialization time by calling
+ * RuleBasedBreakIterator::registerExternalBreakEngine(). ICU adopts and owns the engine and will
+ * delete the registered external engine at the proper time during the clean up
+ * event.
+ * @internal ICU 74 technology preview
+ */
+class ExternalBreakEngine : public UObject {
+ public:
+ /**
+ * Destructor.
+ * @internal ICU 74 technology preview
+ */
+ virtual ~ExternalBreakEngine() {}
+
+ /**
+ * Indicate whether this engine handles a particular character when
+ * the RuleBasedBreakIterator is used for a particular locale. This method is used
+ * by the RuleBasedBreakIterator to find a break engine.
+ * @param c A character which begins a run that the engine might handle.
+ * @param locale The locale.
+ * @return true if this engine handles the particular character for that locale.
+ * @internal ICU 74 technology preview
+ */
+ virtual bool isFor(UChar32 c, const char* locale) const = 0;
+
+ /**
+ * Indicate whether this engine handles a particular character. This method is
+ * used by the RuleBasedBreakIterator after it has already found a break engine, to see
+ * which characters after the first one can be handled by this break engine.
+ * @param c A character that the engine might handle.
+ * @return true if this engine handles the particular character.
+ * @internal ICU 74 technology preview
+ */
+ virtual bool handles(UChar32 c) const = 0;
+
+ /**
+ * Divide up a range of text handled by this break engine.
+ *
+ * @param text A UText representing the text
+ * @param start The start of the range of known characters
+ * @param end The end of the range of known characters
+ * @param foundBreaks Output C array of int32_t break positions, or
+ * nullptr
+ * @param foundBreaksCapacity The capacity of foundBreaks
+ * @param status Information on any errors encountered.
+ * @return The number of breaks found
+ * @internal ICU 74 technology preview
+ */
+ virtual int32_t fillBreak(UText* text, int32_t start, int32_t end,
+ int32_t* foundBreaks, int32_t foundBreaksCapacity,
+ UErrorCode& status) const = 0;
+};
+#endif /* UCONFIG_NO_SERVICE */
+#endif /* U_HIDE_DRAFT_API */
+
+
/**
*
* A subclass of BreakIterator whose behavior is specified using a list of rules.
@@ -716,9 +781,10 @@ private:
* This function returns the appropriate LanguageBreakEngine for a
* given character c.
* @param c A character in the dictionary set
+ * @param locale The locale.
* @internal (private)
*/
- const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
+ const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c, const char* locale);
public:
#ifndef U_HIDE_INTERNAL_API
@@ -734,8 +800,26 @@ private:
*/
void dumpTables();
#endif /* U_HIDE_INTERNAL_API */
+
+#ifndef U_HIDE_DRAFT_API
+#if !UCONFIG_NO_SERVICE
+ /**
+ * Register a new external break engine. The external break engine will be adopted.
+ * Because ICU may choose to cache break engine internally, this must
+ * be called at application startup, prior to any calls to
+ * object methods of RuleBasedBreakIterator to avoid undefined behavior.
+ * @param toAdopt the ExternalBreakEngine instance to be adopted
+ * @param status the in/out status code, no special meanings are assigned
+ * @internal ICU 74 technology preview
+ */
+ static void U_EXPORT2 registerExternalBreakEngine(
+ ExternalBreakEngine* toAdopt, UErrorCode& status);
+#endif /* UCONFIG_NO_SERVICE */
+#endif /* U_HIDE_DRAFT_API */
+
};
+
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
diff --git a/icu4c/source/test/intltest/lstmbetst.cpp b/icu4c/source/test/intltest/lstmbetst.cpp
index 0e3fe8c3f9e..d1f28223b3c 100644
--- a/icu4c/source/test/intltest/lstmbetst.cpp
+++ b/icu4c/source/test/intltest/lstmbetst.cpp
@@ -73,7 +73,7 @@ UScriptCode getScriptFromModelName(const std::string& modelName) {
// the model. Since by default the LSTM models are not included, all the tested
// models need to be included under source/test/testdata.
-void LSTMBETest::runTestFromFile(const char* filename) {
+void LSTMBETest::runTestFromFile(const char* filename, const char* locale) {
UErrorCode status = U_ZERO_ERROR;
LocalPointer engine;
// Open and read the test data file.
@@ -123,7 +123,7 @@ void LSTMBETest::runTestFromFile(const char* filename) {
caseNum++;
bool canHandleAllChars = true;
for (int32_t i = 0; i < value.length(); i++) {
- if (!engine->handles(value.charAt(i))) {
+ if (!engine->handles(value.charAt(i), locale)) {
errln(UnicodeString("Test Case#") + caseNum + " contains char '" +
UnicodeString(value.charAt(i)) +
"' cannot be handled by the engine in offset " + i + "\n" + line);
@@ -200,15 +200,15 @@ void LSTMBETest::runTestFromFile(const char* filename) {
}
void LSTMBETest::TestThaiGraphclust() {
- runTestFromFile("Thai_graphclust_model4_heavy_Test.txt");
+ runTestFromFile("Thai_graphclust_model4_heavy_Test.txt", "th");
}
void LSTMBETest::TestThaiCodepoints() {
- runTestFromFile("Thai_codepoints_exclusive_model5_heavy_Test.txt");
+ runTestFromFile("Thai_codepoints_exclusive_model5_heavy_Test.txt", "th");
}
void LSTMBETest::TestBurmeseGraphclust() {
- runTestFromFile("Burmese_graphclust_model5_heavy_Test.txt");
+ runTestFromFile("Burmese_graphclust_model5_heavy_Test.txt", "my");
}
const LanguageBreakEngine* LSTMBETest::createEngineFromTestData(
diff --git a/icu4c/source/test/intltest/lstmbetst.h b/icu4c/source/test/intltest/lstmbetst.h
index a5da3c9e83b..0bc00578e66 100644
--- a/icu4c/source/test/intltest/lstmbetst.h
+++ b/icu4c/source/test/intltest/lstmbetst.h
@@ -40,7 +40,7 @@ public:
private:
const LanguageBreakEngine* createEngineFromTestData(const char* model, UScriptCode script, UErrorCode& status);
- void runTestFromFile(const char* filename);
+ void runTestFromFile(const char* filename, const char* locale);
void runTestWithLargeMemory(const char* model, UScriptCode script);
// Test parameters, from the test framework and test invocation.
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
index 05572cbd087..c7687bad503 100644
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -142,6 +142,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(TestLSTMThai);
TESTCASE_AUTO(TestLSTMBurmese);
TESTCASE_AUTO(TestRandomAccess);
+ TESTCASE_AUTO(TestExternalBreakEngineWithFakeTaiLe);
+ TESTCASE_AUTO(TestExternalBreakEngineWithFakeYue);
#if U_ENABLE_TRACING
TESTCASE_AUTO(TestTraceCreateCharacter);
@@ -5667,4 +5669,192 @@ void RBBITest::TestRandomAccess() {
}
}
+// A Fake Tai Le break engine which handle Unicode Tai Le (Tale) block
+// https://unicode.org/charts/PDF/U1950.pdf
+// U+1950 - U+197F and always break after Tone letters (U+1970-U+1974)
+class FakeTaiLeBreakEngine : public ExternalBreakEngine {
+ public:
+    FakeTaiLeBreakEngine() : block(0x1950, 0x197f), tones(0x1970, 0x1974) {
+    }
+    virtual ~FakeTaiLeBreakEngine() {
+    }
+    // The locale is ignored: the engine is for every locale as long as the
+    // character itself is one that handles() accepts.
+    virtual bool isFor(UChar32 c, const char* /* locale */) const override {
+        return handles(c);
+    }
+    // True for code points contained in the `block` set.
+    virtual bool handles(UChar32 c) const override {
+        return block.contains(c);
+    }
+    // Walks [start, end) and records the native index of every code point
+    // that is in the `tones` set. The UText index is put back to where the
+    // caller left it before returning. Sets U_BUFFER_OVERFLOW_ERROR and
+    // returns the count so far when foundBreaks is full.
+    virtual int32_t fillBreak(UText* text, int32_t start, int32_t end,
+                              int32_t* foundBreaks, int32_t foundBreaksCapacity,
+                              UErrorCode& status) const override {
+        if (U_FAILURE(status)) {
+            return 0;
+        }
+        // Remember the caller's position so it can be restored afterwards.
+        const int64_t originalIndex = utext_getNativeIndex(text);
+        if (originalIndex != start) {
+            utext_setNativeIndex(text, start);
+        }
+        int32_t count = 0;
+        int32_t pos = (int32_t)utext_getNativeIndex(text);
+        while (pos < end) {
+            if (tones.contains(utext_current32(text))) {
+                if (count >= foundBreaksCapacity) {
+                    status = U_BUFFER_OVERFLOW_ERROR;
+                    utext_setNativeIndex(text, originalIndex);
+                    return count;
+                }
+                foundBreaks[count++] = pos;
+            }
+            UTEXT_NEXT32(text);
+            pos = (int32_t)utext_getNativeIndex(text);
+        }
+        // Restore the UText unless it already sits at the saved position.
+        if (originalIndex != pos) {
+            utext_setNativeIndex(text, originalIndex);
+        }
+        return count;
+    }
+
+ private:
+    UnicodeSet block;
+    UnicodeSet tones;
+};
+
+// A fake Yue break engine which handles the CJK Unified Ideographs
+// block (U+4E00-U+9FFF) when the locale starts with 'yue', and reports a
+// break after every second index of the range it is given.
+class FakeYueBreakEngine : public ExternalBreakEngine {
+ public:
+    FakeYueBreakEngine() : block(0x4e00, 0x9FFF) {
+    }
+    virtual ~FakeYueBreakEngine() {
+    }
+    // Accepts a character only when handles() does and the locale string
+    // begins with "yue" (for example "yue", "yue-CN", "yue-Hant-CN").
+    virtual bool isFor(UChar32 c, const char* locale) const override {
+        if (!handles(c)) {
+            return false;
+        }
+        return uprv_strncmp("yue", locale, 3) == 0;
+    }
+    // True for code points contained in the `block` set.
+    virtual bool handles(UChar32 c) const override {
+        return block.contains(c);
+    }
+    // Fake segmentation that never looks at the text: every index in
+    // (start, end] that lies an even distance from `start` is recorded.
+    // Sets U_BUFFER_OVERFLOW_ERROR and returns the count so far when
+    // foundBreaks is full.
+    virtual int32_t fillBreak(UText* text, int32_t start, int32_t end,
+                              int32_t* foundBreaks, int32_t foundBreaksCapacity,
+                              UErrorCode& status) const override {
+        (void)text;
+        if (U_FAILURE(status)) {
+            return 0;
+        }
+        int32_t count = 0;
+        int32_t pos = start;
+        while (pos < end) {
+            ++pos;
+            if ((pos - start) % 2 != 0) {
+                continue;  // odd distance from start: no break here
+            }
+            if (count >= foundBreaksCapacity) {
+                status = U_BUFFER_OVERFLOW_ERROR;
+                return count;
+            }
+            foundBreaks[count++] = pos;
+        }
+        return count;
+    }
+
+ private:
+    UnicodeSet block;
+};
+
+// Checks that a registered FakeYueBreakEngine changes word breaking of CJK
+// text for the "yue" locale: first records the root-locale boundaries, then
+// registers the fake engine and compares the boundaries produced for "yue".
+void RBBITest::TestExternalBreakEngineWithFakeYue() {
+    UErrorCode status = U_ZERO_ERROR;
+    UnicodeString text(u"a bc def一兩年前佢真係唔鍾意畀我影相i jk lmn");
+
+    std::vector<int32_t> actual1;
+    {
+        LocalPointer<BreakIterator> bi1(
+            BreakIterator::createWordInstance(Locale::getRoot(), status),
+            status);
+        assertTrue(WHERE "BreakIterator::createWordInstance( root )",
+                   U_SUCCESS(status));
+        // bi1 is not usable when creation failed; stop before dereferencing it.
+        if (U_FAILURE(status)) return;
+        bi1->setText(text);
+
+        do {
+            actual1.push_back(bi1->current());
+        } while(bi1->next() != BreakIterator::DONE);
+    }
+
+    std::vector<int32_t> expected1({{ 0, 1, 2, 4, 5, 8, 10, 12, 13, 14, 15,
+        16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 30}});
+    assertTrue("root break Yue as Chinese", expected1 == actual1);
+
+    status = U_ZERO_ERROR;
+    RuleBasedBreakIterator::registerExternalBreakEngine(
+        new FakeYueBreakEngine(), status);
+    assertTrue(WHERE "registerExternalBreakEngine w FakeYueBreakEngine",
+               U_SUCCESS(status));
+
+    std::vector<int32_t> actual2;
+    {
+        status = U_ZERO_ERROR;
+        LocalPointer<BreakIterator> bi2(
+            BreakIterator::createWordInstance(Locale("yue"), status), status);
+        assertTrue(WHERE "BreakIterator::createWordInstance( yue )",
+                   U_SUCCESS(status));
+        // Same guard as above: do not touch bi2 after a failed creation.
+        if (U_FAILURE(status)) return;
+        bi2->setText(text);
+        do {
+            actual2.push_back(bi2->current());
+        } while(bi2->next() != BreakIterator::DONE);
+    }
+    std::vector<int32_t> expected2({{ 0, 1, 2, 4, 5, 8, 10, 12, 14, 16, 18, 20,
+        22, 23, 24, 26, 27, 30}});
+    assertTrue(WHERE "break Yue by Fake external breaker",
+               expected2 == actual2);
+}
+
+// Checks that a registered FakeTaiLeBreakEngine changes line breaking of
+// Tai Le text: first records the root-locale boundaries, then registers the
+// fake engine and compares the boundaries produced for the "tdd" locale.
+void RBBITest::TestExternalBreakEngineWithFakeTaiLe() {
+    UErrorCode status = U_ZERO_ERROR;
+    UnicodeString text(
+        u"a bc defᥛᥫᥒᥰᥖᥭᥰᥞᥝᥰᥙᥥᥢᥛᥫᥒᥰᥑᥩᥢᥲᥔᥣᥝᥴᥓᥬᥖᥩᥢᥲᥛᥣᥝᥱᥙᥝᥱᥙᥤᥱᥓᥣᥒᥛᥣᥰᥓᥧ"
+        u"ᥰᥘᥩᥰᥗᥪᥒᥴᥛᥣᥰᥘᥬᥰᥝᥣᥱᥘᥒᥱᥔᥣᥛᥴᥘᥫᥢi jk lmn");
+
+    std::vector<int32_t> actual1;
+    {
+        LocalPointer<BreakIterator> bi1(
+            BreakIterator::createLineInstance(Locale::getRoot(), status),
+            status);
+        assertTrue(WHERE "BreakIterator::createLineInstance( root )",
+                   U_SUCCESS(status));
+        // bi1 is not usable when creation failed; stop before dereferencing it.
+        if (U_FAILURE(status)) return;
+        bi1->setText(text);
+
+        do {
+            actual1.push_back(bi1->current());
+        } while(bi1->next() != BreakIterator::DONE);
+    }
+
+    std::vector<int32_t> expected1({{
+        0, 2, 5, 86, 89, 92 }});
+    assertTrue(WHERE "root break Tai Le", expected1 == actual1);
+
+    RuleBasedBreakIterator::registerExternalBreakEngine(
+        new FakeTaiLeBreakEngine(), status);
+    assertTrue(WHERE "registerExternalBreakEngine w FakeTaiLeBreakEngine",
+               U_SUCCESS(status));
+
+    std::vector<int32_t> actual2;
+    {
+        status = U_ZERO_ERROR;
+        LocalPointer<BreakIterator> bi2(
+            BreakIterator::createLineInstance(Locale("tdd"), status), status);
+        assertTrue(WHERE "BreakIterator::createLineInstance( tdd )",
+                   U_SUCCESS(status));
+        // Same guard as above: do not touch bi2 after a failed creation.
+        if (U_FAILURE(status)) return;
+        bi2->setText(text);
+        do {
+            actual2.push_back(bi2->current());
+        } while(bi2->next() != BreakIterator::DONE);
+    }
+    std::vector<int32_t> expected2({{
+        0, 2, 5, 11, 14, 17, 24, 28, 32, 38, 42, 45, 48, 54, 57, 60, 64, 67,
+        70, 73, 76, 80, 86, 89, 92}});
+    assertTrue("break Tai Le by Fake external breaker",
+               expected2 == actual2);
+}
+
#endif // #if !UCONFIG_NO_BREAK_ITERATION
diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h
index c8785f72343..537a537863a 100644
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@@ -96,6 +96,8 @@ public:
void TestLSTMThai();
void TestLSTMBurmese();
void TestRandomAccess();
+ void TestExternalBreakEngineWithFakeTaiLe();
+ void TestExternalBreakEngineWithFakeYue();
#if U_ENABLE_TRACING
void TestTraceCreateCharacter();