mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-10688 branch, work in progress.
X-SVN-Rev: 40686
This commit is contained in:
parent
c67d9d0a4a
commit
ca7b62180e
9 changed files with 47 additions and 121 deletions
|
@ -59,58 +59,47 @@ LanguageBreakFactory::~LanguageBreakFactory() {
|
|||
******************************************************************
|
||||
*/
|
||||
|
||||
UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
|
||||
for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) {
|
||||
fHandled[i] = 0;
|
||||
}
|
||||
UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
|
||||
(void)status;
|
||||
}
|
||||
|
||||
UnhandledEngine::~UnhandledEngine() {
|
||||
for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) {
|
||||
if (fHandled[i] != 0) {
|
||||
delete fHandled[i];
|
||||
}
|
||||
}
|
||||
delete fHandled;
|
||||
fHandled = nullptr;
|
||||
}
|
||||
|
||||
UBool
|
||||
UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
|
||||
return (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)
|
||||
&& fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
|
||||
UnhandledEngine::handles(UChar32 c) const {
|
||||
return fHandled && fHandled->contains(c);
|
||||
}
|
||||
|
||||
int32_t
|
||||
UnhandledEngine::findBreaks( UText *text,
|
||||
int32_t /* startPos */,
|
||||
int32_t endPos,
|
||||
int32_t breakType,
|
||||
UVector32 &/*foundBreaks*/ ) const {
|
||||
if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
|
||||
UChar32 c = utext_current32(text);
|
||||
while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
|
||||
utext_next32(text); // TODO: recast loop to work with post-increment operations.
|
||||
c = utext_current32(text);
|
||||
}
|
||||
UChar32 c = utext_current32(text);
|
||||
while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
|
||||
utext_next32(text); // TODO: recast loop to work with post-increment operations.
|
||||
c = utext_current32(text);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void
|
||||
UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
|
||||
if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
|
||||
if (fHandled[breakType] == 0) {
|
||||
fHandled[breakType] = new UnicodeSet();
|
||||
if (fHandled[breakType] == 0) {
|
||||
UnhandledEngine::handleCharacter(UChar32 c) {
|
||||
if (fHandled == nullptr) {
|
||||
fHandled = new UnicodeSet();
|
||||
if (fHandled == nullptr) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (!fHandled[breakType]->contains(c)) {
|
||||
if (!fHandled->contains(c)) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
// Apply the entire script of the character.
|
||||
int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
|
||||
fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
|
||||
fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -138,7 +127,7 @@ U_NAMESPACE_BEGIN
|
|||
static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;
|
||||
|
||||
const LanguageBreakEngine *
|
||||
ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
|
||||
ICULanguageBreakFactory::getEngineFor(UChar32 c) {
|
||||
const LanguageBreakEngine *lbe = NULL;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
|
@ -156,14 +145,14 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
|
|||
int32_t i = fEngines->size();
|
||||
while (--i >= 0) {
|
||||
lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
|
||||
if (lbe != NULL && lbe->handles(c, breakType)) {
|
||||
if (lbe != NULL && lbe->handles(c)) {
|
||||
return lbe;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We didn't find an engine. Create one.
|
||||
lbe = loadEngineFor(c, breakType);
|
||||
lbe = loadEngineFor(c);
|
||||
if (lbe != NULL) {
|
||||
fEngines->push((void *)lbe, status);
|
||||
}
|
||||
|
@ -171,11 +160,11 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
|
|||
}
|
||||
|
||||
const LanguageBreakEngine *
|
||||
ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
|
||||
ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UScriptCode code = uscript_getScript(c, &status);
|
||||
if (U_SUCCESS(status)) {
|
||||
DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
|
||||
DictionaryMatcher *m = loadDictionaryMatcherFor(code);
|
||||
if (m != NULL) {
|
||||
const LanguageBreakEngine *engine = NULL;
|
||||
switch(code) {
|
||||
|
@ -236,7 +225,7 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
|
|||
}
|
||||
|
||||
DictionaryMatcher *
|
||||
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
|
||||
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
// open root from brkitr tree.
|
||||
UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
|
||||
|
|
|
@ -54,11 +54,10 @@ class LanguageBreakEngine : public UMemory {
|
|||
* a particular kind of break.</p>
|
||||
*
|
||||
* @param c A character which begins a run that the engine might handle
|
||||
* @param breakType The type of text break which the caller wants to determine
|
||||
* @return TRUE if this engine handles the particular
|
||||
* character.
|
||||
*/
|
||||
virtual UBool handles(UChar32 c, int32_t breakType) const = 0;
|
||||
virtual UBool handles(UChar32 c) const = 0;
|
||||
|
||||
/**
|
||||
* <p>Find any breaks within a run in the supplied text.</p>
|
||||
|
@ -68,14 +67,12 @@ class LanguageBreakEngine : public UMemory {
|
|||
* is capable of handling.
|
||||
* @param startPos The start of the run within the supplied text.
|
||||
* @param endPos The end of the run within the supplied text.
|
||||
* @param breakType The type of break desired, or -1.
|
||||
* @param foundBreaks A Vector of int32_t to receive the breaks.
|
||||
* @return The number of breaks found.
|
||||
*/
|
||||
virtual int32_t findBreaks( UText *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
int32_t breakType,
|
||||
UVector32 &foundBreaks ) const = 0;
|
||||
|
||||
};
|
||||
|
@ -125,11 +122,9 @@ class LanguageBreakFactory : public UMemory {
|
|||
*
|
||||
* @param c A character that begins a run for which a LanguageBreakEngine is
|
||||
* sought.
|
||||
* @param breakType The kind of text break for which a LanguageBreakEngine is
|
||||
* sought.
|
||||
* @return A LanguageBreakEngine with the desired characteristics, or 0.
|
||||
*/
|
||||
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0;
|
||||
virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0;
|
||||
|
||||
};
|
||||
|
||||
|
@ -152,11 +147,11 @@ class UnhandledEngine : public LanguageBreakEngine {
|
|||
private:
|
||||
|
||||
/**
|
||||
* The sets of characters handled, for each break type
|
||||
* The sets of characters handled.
|
||||
* @internal
|
||||
*/
|
||||
|
||||
UnicodeSet *fHandled[4];
|
||||
UnicodeSet *fHandled;
|
||||
|
||||
public:
|
||||
|
||||
|
@ -176,11 +171,10 @@ class UnhandledEngine : public LanguageBreakEngine {
|
|||
* a particular kind of break.</p>
|
||||
*
|
||||
* @param c A character which begins a run that the engine might handle
|
||||
* @param breakType The type of text break which the caller wants to determine
|
||||
* @return TRUE if this engine handles the particular
|
||||
* character.
|
||||
*/
|
||||
virtual UBool handles(UChar32 c, int32_t breakType) const;
|
||||
virtual UBool handles(UChar32 c) const;
|
||||
|
||||
/**
|
||||
* <p>Find any breaks within a run in the supplied text.</p>
|
||||
|
@ -190,23 +184,20 @@ class UnhandledEngine : public LanguageBreakEngine {
|
|||
* is capable of handling.
|
||||
* @param startPos The start of the run within the supplied text.
|
||||
* @param endPos The end of the run within the supplied text.
|
||||
* @param breakType The type of break desired, or -1.
|
||||
* @param foundBreaks A vector of int32_t to receive the breaks found, if any
|
||||
* @return The number of breaks found.
|
||||
*/
|
||||
virtual int32_t findBreaks( UText *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
int32_t breakType,
|
||||
UVector32 &foundBreaks ) const;
|
||||
|
||||
/**
|
||||
* <p>Tell the engine to handle a particular character.</p>
|
||||
*
|
||||
* @param c A character which the engine should handle
|
||||
* @param breakType The type of text break for which the engine should handle c
|
||||
*/
|
||||
virtual void handleCharacter(UChar32 c, int32_t breakType);
|
||||
virtual void handleCharacter(UChar32 c);
|
||||
|
||||
};
|
||||
|
||||
|
@ -250,11 +241,9 @@ class ICULanguageBreakFactory : public LanguageBreakFactory {
|
|||
*
|
||||
* @param c A character that begins a run for which a LanguageBreakEngine is
|
||||
* sought.
|
||||
* @param breakType The kind of text break for which a LanguageBreakEngine is
|
||||
* sought.
|
||||
* @return A LanguageBreakEngine with the desired characteristics, or 0.
|
||||
*/
|
||||
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType);
|
||||
virtual const LanguageBreakEngine *getEngineFor(UChar32 c);
|
||||
|
||||
protected:
|
||||
/**
|
||||
|
@ -263,21 +252,17 @@ protected:
|
|||
*
|
||||
* @param c A character that begins a run for which a LanguageBreakEngine is
|
||||
* sought.
|
||||
* @param breakType The kind of text break for which a LanguageBreakEngine is
|
||||
* sought.
|
||||
* @return A LanguageBreakEngine with the desired characteristics, or 0.
|
||||
*/
|
||||
virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType);
|
||||
virtual const LanguageBreakEngine *loadEngineFor(UChar32 c);
|
||||
|
||||
/**
|
||||
* <p>Create a DictionaryMatcher for the specified script.</p>
|
||||
* @param script An ISO 15924 script code that identifies the dictionary to be
|
||||
* created.
|
||||
* @param breakType The kind of text break for which a dictionary is
|
||||
* sought.
|
||||
* @return A DictionaryMatcher with the desired characteristics, or NULL.
|
||||
*/
|
||||
virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType);
|
||||
virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -52,7 +52,7 @@ U_NAMESPACE_BEGIN
|
|||
// -------------------------------------
|
||||
|
||||
BreakIterator*
|
||||
BreakIterator::buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode &status)
|
||||
BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status)
|
||||
{
|
||||
char fnbuff[256];
|
||||
char ext[4]={'\0'};
|
||||
|
@ -121,7 +121,6 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, int32_t kind,
|
|||
U_LOCALE_BASED(locBased, *(BreakIterator*)result);
|
||||
locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status),
|
||||
actualLocale.data());
|
||||
result->setBreakType(kind);
|
||||
}
|
||||
|
||||
ures_close(b);
|
||||
|
@ -413,10 +412,10 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
|
|||
BreakIterator *result = NULL;
|
||||
switch (kind) {
|
||||
case UBRK_CHARACTER:
|
||||
result = BreakIterator::buildInstance(loc, "grapheme", kind, status);
|
||||
result = BreakIterator::buildInstance(loc, "grapheme", status);
|
||||
break;
|
||||
case UBRK_WORD:
|
||||
result = BreakIterator::buildInstance(loc, "word", kind, status);
|
||||
result = BreakIterator::buildInstance(loc, "word", status);
|
||||
break;
|
||||
case UBRK_LINE:
|
||||
uprv_strcpy(lbType, "line");
|
||||
|
@ -429,10 +428,10 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
|
|||
uprv_strcat(lbType, lbKeyValue);
|
||||
}
|
||||
}
|
||||
result = BreakIterator::buildInstance(loc, lbType, kind, status);
|
||||
result = BreakIterator::buildInstance(loc, lbType, status);
|
||||
break;
|
||||
case UBRK_SENTENCE:
|
||||
result = BreakIterator::buildInstance(loc, "sentence", kind, status);
|
||||
result = BreakIterator::buildInstance(loc, "sentence", status);
|
||||
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
|
||||
{
|
||||
char ssKeyValue[kKeyValueLenMax] = {0};
|
||||
|
@ -449,7 +448,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
|
|||
#endif
|
||||
break;
|
||||
case UBRK_TITLE:
|
||||
result = BreakIterator::buildInstance(loc, "title", kind, status);
|
||||
result = BreakIterator::buildInstance(loc, "title", status);
|
||||
break;
|
||||
default:
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
|
|
|
@ -37,9 +37,8 @@ DictionaryBreakEngine::~DictionaryBreakEngine() {
|
|||
}
|
||||
|
||||
UBool
|
||||
DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
|
||||
return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
|
||||
&& fSet.contains(c));
|
||||
DictionaryBreakEngine::handles(UChar32 c) const {
|
||||
return fSet.contains(c);
|
||||
}
|
||||
|
||||
int32_t
|
||||
|
|
|
@ -42,27 +42,12 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
|
|||
|
||||
UnicodeSet fSet;
|
||||
|
||||
/**
|
||||
* The set of break types handled by this engine
|
||||
* @internal
|
||||
*/
|
||||
|
||||
uint32_t fTypes;
|
||||
|
||||
/**
|
||||
* <p>Default constructor.</p>
|
||||
*
|
||||
*/
|
||||
DictionaryBreakEngine();
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Constructor setting the break types handled.</p>
|
||||
*
|
||||
* @param breakTypes A bitmap of types handled by the engine.
|
||||
* <p>Constructor </p>
|
||||
*/
|
||||
DictionaryBreakEngine( uint32_t breakTypes );
|
||||
DictionaryBreakEngine();
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
|
@ -78,7 +63,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
|
|||
* @return TRUE if this engine handles the particular
|
||||
* character.
|
||||
*/
|
||||
virtual UBool handles( UChar32 c, int32_t breakType ) const;
|
||||
virtual UBool handles(UChar32 c) const;
|
||||
|
||||
/**
|
||||
* <p>Find any breaks within a run in the supplied text.</p>
|
||||
|
@ -88,14 +73,12 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
|
|||
* that starts from the first character in the range.
|
||||
* @param startPos The start of the run within the supplied text.
|
||||
* @param endPos The end of the run within the supplied text.
|
||||
* @param breakType The type of break desired, or -1.
|
||||
* @param foundBreaks vector of int32_t to receive the break positions
|
||||
* @return The number of breaks found.
|
||||
*/
|
||||
virtual int32_t findBreaks( UText *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
int32_t breakType,
|
||||
UVector32 &foundBreaks ) const;
|
||||
|
||||
protected:
|
||||
|
|
|
@ -217,7 +217,6 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
|
|||
}
|
||||
BreakIterator::operator=(that);
|
||||
|
||||
fBreakType = that.fBreakType;
|
||||
if (fLanguageBreakEngines != NULL) {
|
||||
delete fLanguageBreakEngines;
|
||||
fLanguageBreakEngines = NULL; // Just rebuild for now
|
||||
|
@ -278,11 +277,6 @@ void RuleBasedBreakIterator::init(UErrorCode &status) {
|
|||
fRuleStatusIndex = 0;
|
||||
fDone = false;
|
||||
fDictionaryCharCount = 0;
|
||||
fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable
|
||||
// dictionary behavior for Break Iterators that are
|
||||
// built from rules. Even better would be the ability to
|
||||
// declare the type in the rules.
|
||||
|
||||
fLanguageBreakEngines = NULL;
|
||||
fUnhandledBreakEngine = NULL;
|
||||
fBreakCache = NULL;
|
||||
|
@ -1290,14 +1284,14 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
|
|||
int32_t i = fLanguageBreakEngines->size();
|
||||
while (--i >= 0) {
|
||||
lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
|
||||
if (lbe->handles(c, fBreakType)) {
|
||||
if (lbe->handles(c)) {
|
||||
return lbe;
|
||||
}
|
||||
}
|
||||
|
||||
// No existing dictionary took the character. See if a factory wants to
|
||||
// give us a new LanguageBreakEngine for this character.
|
||||
lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
|
||||
lbe = getLanguageBreakEngineFromFactory(c);
|
||||
|
||||
// If we got one, use it and push it on our stack.
|
||||
if (lbe != NULL) {
|
||||
|
@ -1327,21 +1321,11 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
|
|||
|
||||
// Tell the reject engine about the character; at its discretion, it may
|
||||
// add more than just the one character.
|
||||
fUnhandledBreakEngine->handleCharacter(c, fBreakType);
|
||||
fUnhandledBreakEngine->handleCharacter(c);
|
||||
|
||||
return fUnhandledBreakEngine;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*int32_t RuleBasedBreakIterator::getBreakType() const {
|
||||
return fBreakType;
|
||||
}*/
|
||||
|
||||
void RuleBasedBreakIterator::setBreakType(int32_t type) {
|
||||
fBreakType = type;
|
||||
}
|
||||
|
||||
void RuleBasedBreakIterator::dumpCache() {
|
||||
fBreakCache->dumpCache();
|
||||
}
|
||||
|
|
|
@ -168,7 +168,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
|
|||
// Ask the language object if there are any breaks. It will add them to the cache and
|
||||
// leave the text pointer on the other side of its range, ready to search for the next one.
|
||||
if (lbe != NULL) {
|
||||
foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, *fBreaks);
|
||||
foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, *fBreaks);
|
||||
}
|
||||
|
||||
// Reload the loop variables for the next go-round
|
||||
|
|
|
@ -616,7 +616,7 @@ public:
|
|||
virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0;
|
||||
|
||||
private:
|
||||
static BreakIterator* buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode& status);
|
||||
static BreakIterator* buildInstance(const Locale& loc, const char *type, UErrorCode& status);
|
||||
static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
|
||||
static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);
|
||||
|
||||
|
|
|
@ -149,13 +149,6 @@ private:
|
|||
*/
|
||||
UnhandledEngine *fUnhandledBreakEngine;
|
||||
|
||||
/**
|
||||
*
|
||||
* The type of the break iterator, or -1 if it has not been set.
|
||||
* @internal
|
||||
*/
|
||||
int32_t fBreakType;
|
||||
|
||||
//=======================================================================
|
||||
// constructors
|
||||
//=======================================================================
|
||||
|
@ -645,12 +638,6 @@ private:
|
|||
*/
|
||||
void reset(void);
|
||||
|
||||
/**
|
||||
* Set the type of the break iterator.
|
||||
* @internal
|
||||
*/
|
||||
void setBreakType(int32_t type);
|
||||
|
||||
/**
|
||||
* Common initialization function, used by constructors and bufferClone.
|
||||
* @internal
|
||||
|
|
Loading…
Add table
Reference in a new issue