ICU-5117 Thai break should work in all locales

X-SVN-Rev: 19408
This commit is contained in:
Deborah Goldsmith 2006-03-23 00:54:12 +00:00
parent 7dce112ff1
commit 490cb834fa
53 changed files with 30817 additions and 1524 deletions

1
.gitattributes vendored
View file

@ -48,7 +48,6 @@ README text !eol
*.spp -text
*.tri2 -text
icu4c/source/data/brkitr/thaidict.brk -text
icu4c/source/data/unidata/UCARules.txt -text
icu4c/source/samples/ucnv/data02.bin -text
icu4c/source/test/testdata/icu26_testtypes.res -text

View file

@ -75,7 +75,7 @@ normlzr.o unorm.o unormcmp.o unorm_it.o chariter.o schriter.o uchriter.o uiter.o
uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwrt.o ubidiln.o ushape.o unames.o \
uscript.o usc_impl.o uvector.o ustack.o uvectr32.o ucmp8.o \
uarrsort.o utrie.o uset.o uset_props.o uniset.o uniset_props.o ruleiter.o caniter.o unifilt.o unifunct.o usetiter.o \
brkiter.o brkdict.o ubrk.o dbbi.o dbbi_tbl.o \
brkiter.o ubrk.o brkeng.o dictbe.o triedict.o \
rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \
serv.o servnotf.o servls.o servlk.o servlkf.o servrbf.o servslkf.o locutil.o \
uenum.o ustrenum.o uidna.o usprep.o punycode.o \

View file

@ -0,0 +1,229 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and others. *
* All Rights Reserved. *
*******************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "brkeng.h"
#include "dictbe.h"
#include "triedict.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
#include "unicode/ures.h"
#include "unicode/udata.h"
#include "unicode/putil.h"
#include "uvector.h"
#include "mutex.h"
#include "uresimp.h"
#include "ubrkimpl.h"
U_NAMESPACE_BEGIN
/*
******************************************************************
*/
// Constructor of the abstract engine base class; the body is intentionally empty.
LanguageBreakEngine::LanguageBreakEngine()
{
}

// Out-of-line definition of the destructor; the body is intentionally empty.
LanguageBreakEngine::~LanguageBreakEngine()
{
}
/*
******************************************************************
*/
// Constructor of the abstract factory base class; the body is intentionally empty.
LanguageBreakFactory::LanguageBreakFactory()
{
}

// Out-of-line definition of the destructor; the body is intentionally empty.
LanguageBreakFactory::~LanguageBreakFactory()
{
}
/*
******************************************************************
*/
/*
 * Start with every per-break-type slot empty (null); sets are created
 * on demand by handleCharacter(). The status argument is neither read
 * nor written here.
 */
UnhandledEngine::UnhandledEngine(UErrorCode &status) {
    int32_t slot = 0;
    while (slot < sizeof(fHandled)/sizeof(fHandled[0])) {
        fHandled[slot] = 0;
        ++slot;
    }
}
/*
 * Release every UnicodeSet that was allocated for a break type.
 * Slots that were never populated are skipped.
 */
UnhandledEngine::~UnhandledEngine() {
    int32_t slot = 0;
    while (slot < sizeof(fHandled)/sizeof(fHandled[0])) {
        UnicodeSet *owned = fHandled[slot];
        if (owned != 0) {
            delete owned;
        }
        ++slot;
    }
}
/*
 * Report whether character c is in the set recorded for breakType.
 * An out-of-range breakType, or a break type whose set has not been
 * created yet, yields FALSE.
 */
UBool
UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
    if (breakType < 0) {
        return FALSE;
    }
    if (!(breakType < sizeof(fHandled)/sizeof(fHandled[0]))) {
        return FALSE;
    }
    const UnicodeSet *handledSet = fHandled[breakType];
    if (handledSet == 0) {
        return FALSE;
    }
    return (UBool)(handledSet->contains(c) ? TRUE : FALSE);
}
/*
 * Skip over a run of characters that this engine has been told to handle.
 *
 * Starting from the iterator's current position, the iterator is moved
 * backward (reverse) while its index is greater than startPos, or forward
 * while its index is less than endPos, for as long as the current
 * character belongs to the set recorded for breakType.
 *
 * This engine never reports any breaks: foundBreaks is not touched and
 * the return value is always 0.
 *
 * If breakType is out of range, or no set has been created for it yet,
 * the iterator is left where it was. The original code dereferenced the
 * slot without checking it for null, unlike handles(); that is fixed here.
 */
int32_t
UnhandledEngine::findBreaks( CharacterIterator *text,
                             int32_t startPos,
                             int32_t endPos,
                             UBool reverse,
                             int32_t breakType,
                             UStack &foundBreaks ) const {
    if (breakType >= 0 && breakType < sizeof(fHandled)/sizeof(fHandled[0])) {
        const UnicodeSet *handledSet = fHandled[breakType];
        if (handledSet == 0) {
            // Nothing was ever registered for this break type.
            return 0;
        }
        UChar32 c = text->current32();
        if (reverse) {
            while (text->getIndex() > startPos && handledSet->contains(c)) {
                c = text->previous32();
            }
        }
        else {
            while (text->getIndex() < endPos && handledSet->contains(c)) {
                c = text->next32();
            }
        }
    }
    return 0;
}
/*
 * Register character c (and every other character of its script) as
 * handled by this engine for the given break type.
 *
 * An out-of-range breakType is ignored. The per-type set is allocated on
 * first use; if that allocation fails the call silently does nothing.
 *
 * UnicodeSet::applyIntPropertyValue() replaces the contents of the set it
 * is called on, so calling it directly on the accumulated set (as the
 * original code did) discarded every script registered earlier. The
 * script set is therefore built in a temporary and merged with addAll().
 */
void
UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
    if (breakType >= 0 && breakType < sizeof(fHandled)/sizeof(fHandled[0])) {
        if (fHandled[breakType] == 0) {
            fHandled[breakType] = new UnicodeSet();
            if (fHandled[breakType] == 0) {
                return;
            }
        }
        if (!fHandled[breakType]->contains(c)) {
            UErrorCode status = U_ZERO_ERROR;
            // Collect the entire script of the character...
            int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
            UnicodeSet scriptSet;
            scriptSet.applyIntPropertyValue(UCHAR_SCRIPT, script, status);
            // ...and add it to what is already handled, leaving the
            // accumulated set untouched if the property lookup failed.
            if (U_SUCCESS(status)) {
                fHandled[breakType]->addAll(scriptSet);
            }
        }
    }
}
/*
******************************************************************
*/
/*
 * The engine stack starts out null; getEngineFor() builds it on first use.
 * The status argument is neither read nor written here.
 */
ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &status)
{
    fEngines = NULL;
}
/*
 * Destroy the engine stack, if one was ever created.
 */
ICULanguageBreakFactory::~ICULanguageBreakFactory()
{
    if (fEngines == 0) {
        return;
    }
    delete fEngines;
}
U_NAMESPACE_END
U_CDECL_BEGIN
// Element deleter handed to the engine UStack: destroys one stored
// LanguageBreakEngine through its base-class pointer.
static void U_CALLCONV _deleteEngine(void *obj) {
    const LanguageBreakEngine *engine = (const LanguageBreakEngine *) obj;
    delete engine;
}
U_CDECL_END
U_NAMESPACE_BEGIN
/*
 * Return an engine from this factory that handles character c for the
 * given break type, or NULL if there is none.
 *
 * On the first call the engine stack is built: the name of the Thai
 * dictionary is read from the "dictionaries" table of the root bundle in
 * the brkitr tree, the dictionary data is opened, and a ThaiBreakEngine
 * wrapping it is pushed onto a new stack. The stack is then published
 * into fEngines under the global mutex; if fEngines was set in the
 * meantime, the locally built stack is discarded.
 *
 * Changes relative to the original code:
 *  - the UDataMemory handle returned by udata_open() is now closed when
 *    no engine ended up using it (construction failed, or the locally
 *    built stack was discarded); previously it was never closed;
 *  - the dictionary file-name buffer is initialised to an empty string
 *    instead of being passed on uninitialised when the lookup fails.
 *
 * NOTE(review): when the stack is successfully published, the data handle
 * stays open for the life of the process because nothing visible here
 * owns it afterwards - confirm whether the dictionary should adopt it.
 */
const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
    UBool needsInit;
    UErrorCode status = U_ZERO_ERROR;
    umtx_lock(NULL);
    needsInit = (UBool)(fEngines == NULL);
    umtx_unlock(NULL);

    if (needsInit) {
        UStack *engines = new UStack(_deleteEngine, NULL, status);
        if (U_SUCCESS(status) && engines == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        // TODO: add locale parameter, check "dictionaries" in locale
        // TODO: generalize once we can figure out how to parameterize engines
        // instead of having different subclasses. Right now it needs to check
        // for the key of each particular subclass.

        // Open root from brkitr tree.
        UResourceBundle dictBundleStack;
        UResourceBundle dictNameStack;
        UResourceBundle *dictBundle = &dictBundleStack;
        UResourceBundle *dictName = &dictNameStack;
        char dictnbuff[256];
        dictnbuff[0] = 0;
        ures_initStackObject(dictBundle);
        ures_initStackObject(dictName);
        UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
        dictBundle = ures_getByKeyWithFallback(b, "dictionaries", dictBundle, &status);
        dictName = ures_getByKeyWithFallback(dictBundle, "Thai", dictName, &status);
        const UChar *dictfname = NULL;
        int32_t dictnlength = 0;
        dictfname = ures_getString(dictName, &dictnlength, &status);
        // The name plus its terminator must fit in dictnbuff.
        if (U_SUCCESS(status) && (size_t)dictnlength >= sizeof(dictnbuff)) {
            dictnlength = 0;
            status = U_BUFFER_OVERFLOW_ERROR;
        }
        if (U_SUCCESS(status) && dictfname) {
            u_UCharsToChars(dictfname, dictnbuff, dictnlength+1);
        }
        ures_close(dictName);
        ures_close(dictBundle);
        ures_close(b);

        UDataMemory *file = udata_open(U_ICUDATA_BRKITR, "ctd", dictnbuff, &status);
        // TRUE once an engine built on top of 'file' sits in 'engines'.
        UBool dataInUse = FALSE;
        if (U_SUCCESS(status)) {
            const CompactTrieDictionary *dict = new CompactTrieDictionary(
                (const TrieWordDictionary *)udata_getMemory(file), status);
            if (U_SUCCESS(status) && dict == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
            }
            if (U_FAILURE(status)) {
                delete dict;
                dict = NULL;
            }
            const ThaiBreakEngine *thai = new ThaiBreakEngine(dict, status);
            if (thai == NULL) {
                delete dict;
                if (U_SUCCESS(status)) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                }
            }
            if (U_SUCCESS(status)) {
                engines->push((void *)thai, status);
                dataInUse = (UBool)(U_SUCCESS(status) ? TRUE : FALSE);
            }
            else {
                delete thai;
            }
        }
        if (!dataInUse) {
            // No reachable engine references the dictionary data, so
            // release the handle. udata_close() accepts NULL.
            udata_close(file);
            file = NULL;
        }

        // Publish the stack unless fEngines was set while we were building.
        umtx_lock(NULL);
        if (fEngines == NULL) {
            fEngines = engines;
            engines = NULL;
        }
        umtx_unlock(NULL);
        if (engines != NULL) {
            // Our stack was not installed: destroy it (and its engines)
            // first, then release the data those engines were built on.
            delete engines;
            udata_close(file);
        }
    }

    if (fEngines == NULL) {
        return NULL;
    }
    // Search from the top of the stack downward for the first engine
    // that claims the character.
    int32_t i = fEngines->size();
    const LanguageBreakEngine *lbe = NULL;
    while (--i >= 0) {
        lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
        if (lbe != NULL && lbe->handles(c, breakType)) {
            break;
        }
        lbe = NULL;
    }
    return lbe;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View file

@ -0,0 +1,265 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and others. *
* All Rights Reserved. *
*******************************************************************************
*/
#ifndef BRKENG_H
#define BRKENG_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
U_NAMESPACE_BEGIN
class CharacterIterator;
class UnicodeSet;
class UStack;
/*******************************************************************
* LanguageBreakEngine
*/
/**
* <p>LanguageBreakEngines implement language-specific knowledge for
* finding text boundaries within a run of characters belonging to a
* specific set. The boundaries will be of a specific kind, e.g. word,
* line, etc.</p>
*
* <p>LanguageBreakEngines should normally be implemented so as to
* be shared between threads without locking.</p>
*
* <p>This class is an abstract interface: handles() and findBreaks()
* are pure virtual and must be provided by subclasses.</p>
*/
class U_COMMON_API LanguageBreakEngine : public UMemory {
public:
/**
* <p>Default constructor.</p>
*
*/
LanguageBreakEngine();
/**
* <p>Virtual destructor.</p>
*/
virtual ~LanguageBreakEngine();
/**
* <p>Indicate whether this engine handles a particular character for
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param breakType The type of text break which the caller wants to determine
* @return TRUE if this engine handles the particular character and break
* type.
*/
virtual UBool handles(UChar32 c, int32_t breakType) const = 0;
/**
* <p>Find any breaks within a run in the supplied text.</p>
*
* @param text A CharacterIterator representing the text (TODO: UText). The
* iterator is left at the end of the run of characters which the engine
* is capable of handling.
* @param startPos The start of the run within the supplied text.
* @param endPos The end of the run within the supplied text.
* @param reverse Whether the caller is looking for breaks in a reverse
* direction.
* @param breakType The type of break desired, or -1.
* @param foundBreaks A UStack, supplied by the caller and passed by
* reference, for the breaks found, if any. (It is a UStack, not a C array.)
* @return The number of breaks found.
*/
virtual int32_t findBreaks( CharacterIterator *text,
int32_t startPos,
int32_t endPos,
UBool reverse,
int32_t breakType,
UStack &foundBreaks ) const = 0;
};
/*******************************************************************
* LanguageBreakFactory
*/
/**
* <p>LanguageBreakFactorys find and return a LanguageBreakEngine
* that can determine breaks for characters in a specific set, if
* such an object can be found.</p>
*
* <p>If a LanguageBreakFactory is to be shared between threads,
* appropriate synchronization must be used; there is none internal
* to the factory.</p>
*
* <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
* normally be shared between threads without synchronization, unless
* the specific subclass of LanguageBreakFactory indicates otherwise.</p>
*
* <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
* it returns when it itself is deleted, unless the specific subclass of
* LanguageBreakFactory indicates otherwise. Naturally, the factory should
* not be deleted until the LanguageBreakEngines it has returned are no
* longer needed.</p>
*
* <p>This class is an abstract interface: getEngineFor() is pure virtual.</p>
*/
class U_COMMON_API LanguageBreakFactory : public UMemory {
public:
/**
* <p>Default constructor.</p>
*
*/
LanguageBreakFactory();
/**
* <p>Virtual destructor.</p>
*/
virtual ~LanguageBreakFactory();
/**
* <p>Find and return a LanguageBreakEngine that can find the desired
* kind of break for the set of characters to which the supplied
* character belongs. It is up to the set of available engines to
* determine what the sets of characters are.</p>
*
* <p>Note that this method is not const, so an implementation is free
* to modify the factory when it is called.</p>
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
* @param breakType The kind of text break for which a LanguageBreakEngine is
* sought.
* @return A LanguageBreakEngine with the desired characteristics, or 0.
*/
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0;
};
/*******************************************************************
* UnhandledEngine
*/
/**
* <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
* handles characters that no other LanguageBreakEngine is available to
* handle. It is told the character and the type of break; at its
* discretion it may handle more than the specified character (e.g.,
* the entire script to which that character belongs).</p>
*
* <p>UnhandledEngines may not be shared between threads without
* external synchronization.</p>
*/
class U_COMMON_API UnhandledEngine : public LanguageBreakEngine {
private:
/**
* The sets of characters handled, for each break type. The break type
* is used directly as the index, so only break types 0 through 3 can
* be recorded. A null entry means no set exists for that break type.
* @internal
*/
UnicodeSet *fHandled[4];
public:
/**
* <p>Default constructor. All per-break-type sets start out null.</p>
*
* @param status Error code parameter; the implementation neither reads
* nor sets it.
*/
UnhandledEngine(UErrorCode &status);
/**
* <p>Virtual destructor. Deletes any sets that were created.</p>
*/
virtual ~UnhandledEngine();
/**
* <p>Indicate whether this engine handles a particular character for
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param breakType The type of text break which the caller wants to determine
* @return TRUE if this engine handles the particular character and break
* type. FALSE if breakType is outside the supported range or nothing has
* been registered for it.
*/
virtual UBool handles(UChar32 c, int32_t breakType) const;
/**
* <p>Find any breaks within a run in the supplied text. This engine
* only advances the iterator over the characters it handles; it does
* not record any breaks.</p>
*
* @param text A CharacterIterator representing the text (TODO: UText). The
* iterator is left at the end of the run of characters which the engine
* is capable of handling.
* @param startPos The start of the run within the supplied text.
* @param endPos The end of the run within the supplied text.
* @param reverse Whether the caller is looking for breaks in a reverse
* direction.
* @param breakType The type of break desired, or -1.
* @param foundBreaks A UStack supplied by the caller (not a C array);
* this engine's implementation does not modify it.
* @return The number of breaks found; this engine's implementation
* always returns 0.
*/
virtual int32_t findBreaks( CharacterIterator *text,
int32_t startPos,
int32_t endPos,
UBool reverse,
int32_t breakType,
UStack &foundBreaks ) const;
/**
* <p>Tell the engine to handle a particular character and break type.</p>
*
* <p>The implementation works with the whole script of the character
* rather than the single character. A breakType outside the supported
* range is ignored.</p>
*
* @param c A character which the engine should handle
* @param breakType The type of text break for which the engine should handle c
*/
virtual void handleCharacter(UChar32 c, int32_t breakType);
};
/*******************************************************************
* ICULanguageBreakFactory
*/
/**
* <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
* ICU. It creates dictionary-based LanguageBreakEngines from dictionary
* data in the ICU data file.</p>
*
* <p>The engines are not created by the constructor; the implementation
* of getEngineFor() builds them the first time it is called, and takes
* the global ICU mutex while checking and publishing the engine stack.</p>
*/
class U_COMMON_API ICULanguageBreakFactory : public LanguageBreakFactory {
private:
/**
* The stack of break engines created by this factory. Null until the
* first call to getEngineFor(); deleted by the destructor.
* @internal
*/
UStack *fEngines;
public:
/**
* <p>Standard constructor.</p>
*
* @param status Error code parameter; the implementation neither reads
* nor sets it.
*/
ICULanguageBreakFactory(UErrorCode &status);
/**
* <p>Virtual destructor.</p>
*/
virtual ~ICULanguageBreakFactory();
/**
* <p>Find and return a LanguageBreakEngine that can find the desired
* kind of break for the set of characters to which the supplied
* character belongs. It is up to the set of available engines to
* determine what the sets of characters are.</p>
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
* @param breakType The kind of text break for which a LanguageBreakEngine is
* sought.
* @return A LanguageBreakEngine with the desired characteristics, or 0.
*/
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType);
};
U_NAMESPACE_END
/* BRKENG_H */
#endif

View file

@ -22,7 +22,7 @@
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/dbbi.h"
#include "unicode/rbbi.h"
#include "unicode/brkiter.h"
#include "unicode/udata.h"
#include "unicode/ures.h"
@ -33,6 +33,7 @@
#include "locbased.h"
#include "uresimp.h"
#include "uassert.h"
#include "ubrkimpl.h"
// *****************************************************************************
// class BreakIterator
@ -46,7 +47,7 @@ U_NAMESPACE_BEGIN
// -------------------------------------
BreakIterator*
BreakIterator::buildInstance(const Locale& loc, const char *type, UBool dict, UErrorCode &status)
BreakIterator::buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode &status)
{
char fnbuff[256];
char actualLocale[ULOC_FULLNAME_CAPACITY];
@ -56,7 +57,7 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UBool dict, UE
UResourceBundle brkNameStack;
UResourceBundle *brkRules = &brkRulesStack;
UResourceBundle *brkName = &brkNameStack;
BreakIterator *result = NULL;
RuleBasedBreakIterator *result = NULL;
if (U_FAILURE(status))
return NULL;
@ -65,7 +66,7 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UBool dict, UE
ures_initStackObject(brkName);
// Get the locale
UResourceBundle *b = ures_open(NULL, loc.getName(), &status);
UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, loc.getName(), &status);
// Get the "boundaries" array.
if (U_SUCCESS(status)) {
@ -94,45 +95,20 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UBool dict, UE
ures_close(brkRules);
ures_close(brkName);
UDataMemory* file = udata_open(NULL, "brk", fnbuff, &status);
UDataMemory* file = udata_open(U_ICUDATA_BRKITR, "brk", fnbuff, &status);
if (U_FAILURE(status)) {
ures_close(b);
return NULL;
}
// We found the break rules; now see if a dictionary is needed
if (dict)
{
UErrorCode localStatus = U_ZERO_ERROR;
brkName = &brkNameStack;
ures_initStackObject(brkName);
brkName = ures_getByKeyWithFallback(b, "BreakDictionaryData", brkName, &localStatus);
#if 0
if (U_SUCCESS(localStatus)) {
brkfname = ures_getString(&brkname, &size, &localStatus);
}
#endif
if (U_SUCCESS(localStatus)) {
#if 0
// TODO: if this code is ever enabled, need to add a bounds check for fnbuff.
u_UCharsToChars(brkfname, fnbuff, size);
fnbuff[size] = '\0';
#endif
result = new DictionaryBasedBreakIterator(file, "thaidict.brk", status);
}
ures_close(brkName);
}
// If there is still no result but we haven't had an error, no dictionary,
// so make a non-dictionary break iterator
if (U_SUCCESS(status) && result == NULL) {
result = new RuleBasedBreakIterator(file, status);
}
// Create a RuleBasedBreakIterator
result = new RuleBasedBreakIterator(file, status);
// If there is a result, set the valid locale and actual locale
// If there is a result, set the valid locale and actual locale, and the kind
if (U_SUCCESS(status) && result != NULL) {
U_LOCALE_BASED(locBased, *result);
locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status), actualLocale);
result->setBreakType(kind);
}
ures_close(b);
@ -372,7 +348,7 @@ BreakIterator::getAvailableLocales(void)
// -------------------------------------
BreakIterator*
BreakIterator::createInstance(const Locale& loc, UBreakIteratorType kind, UErrorCode& status)
BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& status)
{
if (U_FAILURE(status)) {
return NULL;
@ -419,19 +395,19 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
BreakIterator *result = NULL;
switch (kind) {
case UBRK_CHARACTER:
result = BreakIterator::buildInstance(loc, "grapheme", FALSE, status);
result = BreakIterator::buildInstance(loc, "grapheme", kind, status);
break;
case UBRK_WORD:
result = BreakIterator::buildInstance(loc, "word", TRUE, status);
result = BreakIterator::buildInstance(loc, "word", kind, status);
break;
case UBRK_LINE:
result = BreakIterator::buildInstance(loc, "line", TRUE, status);
result = BreakIterator::buildInstance(loc, "line", kind, status);
break;
case UBRK_SENTENCE:
result = BreakIterator::buildInstance(loc, "sentence", FALSE, status);
result = BreakIterator::buildInstance(loc, "sentence", kind, status);
break;
case UBRK_TITLE:
result = BreakIterator::buildInstance(loc, "title", FALSE, status);
result = BreakIterator::buildInstance(loc, "title", kind, status);
break;
default:
status = U_ILLEGAL_ARGUMENT_ERROR;

View file

@ -1,637 +0,0 @@
/*
**********************************************************************
* Copyright (C) 1999-2005 IBM Corp. All rights reserved.
**********************************************************************
* Date Name Description
* 12/1/99 rgillam Complete port from Java.
* 01/13/2000 helena Added UErrorCode to ctors.
**********************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/dbbi.h"
#include "unicode/schriter.h"
#include "dbbi_tbl.h"
#include "uvector.h"
#include "cmemory.h"
#include "uassert.h"
U_NAMESPACE_BEGIN
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(DictionaryBasedBreakIterator)
//------------------------------------------------------------------------------
//
// constructors
//
//------------------------------------------------------------------------------
// Default constructor: an iterator with no tables and no cached breaks.
DictionaryBasedBreakIterator::DictionaryBasedBreakIterator()
    : RuleBasedBreakIterator()
{
    init();
}
// Construct from rule data plus a dictionary file name.
// If status already reports failure after the base class is built, no
// tables are created. If creating the tables fails, any partially built
// tables object is released and fTables is left NULL. A NULL allocation
// with an otherwise successful status is reported as
// U_MEMORY_ALLOCATION_ERROR.
DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(UDataMemory* rbbiData,
                                                           const char* dictionaryFilename,
                                                           UErrorCode& status)
    : RuleBasedBreakIterator(rbbiData, status)
{
    init();
    if (U_SUCCESS(status)) {
        fTables = new DictionaryBasedBreakIteratorTables(dictionaryFilename, status);
        if (U_FAILURE(status)) {
            if (fTables != NULL) {
                fTables->removeReference();
                fTables = NULL;
            }
        }
        else if (fTables == 0) {
            /* allocation failed */
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
// Copy constructor: shares the other iterator's reference-counted tables.
// The cached break positions are not copied; the copy starts with an
// empty cache.
DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(const DictionaryBasedBreakIterator &other)
    : RuleBasedBreakIterator(other)
{
    init();
    if (other.fTables == NULL) {
        return;
    }
    fTables = other.fTables;
    fTables->addReference();
}
//------------------------------------------------------------------------------
//
//    Destructor.  Frees the break-position cache and gives up this
//                 iterator's reference to the shared tables.
//
//------------------------------------------------------------------------------
DictionaryBasedBreakIterator::~DictionaryBasedBreakIterator()
{
    uprv_free(cachedBreakPositions);
    cachedBreakPositions = NULL;
    if (fTables != NULL) {
        fTables->removeReference();
    }
}
//------------------------------------------------------------------------------
//
//    Assignment operator.  Makes this iterator behave like, and iterate
//                          over the same text as, the one passed in.
//                          Self-assignment is a no-op.
//
//------------------------------------------------------------------------------
DictionaryBasedBreakIterator&
DictionaryBasedBreakIterator::operator=(const DictionaryBasedBreakIterator& that) {
    if (this != &that) {
        // Drop any cached break positions before taking on the new state.
        reset();
        RuleBasedBreakIterator::operator=(that);

        // Swap table references only when the two iterators differ.
        if (fTables != that.fTables) {
            if (fTables != NULL) {
                fTables->removeReference();
            }
            fTables = that.fTables;
            if (fTables != NULL) {
                fTables->addReference();
            }
        }
    }
    return *this;
}
//------------------------------------------------------------------------------
//
//    clone()    Returns a heap-allocated copy of this iterator, made with
//               the copy constructor.
//
//------------------------------------------------------------------------------
BreakIterator*
DictionaryBasedBreakIterator::clone() const {
    DictionaryBasedBreakIterator *copy = new DictionaryBasedBreakIterator(*this);
    return copy;
}
//=======================================================================
// BreakIterator overrides
//=======================================================================
/**
 * Advances the iterator one step backwards.
 *
 * If cached break positions exist and the current position is not the
 * first one in the cache, this simply steps back one entry. Otherwise the
 * cache is dropped and the inherited previous() is used; that call may
 * refill the cache, in which case the returned position is located in it.
 *
 * The search of the refilled cache is bounded by numCachedBreakPositions.
 * The original loop had no bound and could read past the end of the
 * array before its "something has gone wrong" check was reached.
 *
 * @return The position of the last boundary position before the
 * current iteration position
 */
int32_t
DictionaryBasedBreakIterator::previous()
{
    // Fast path: still inside the range covered by the cache.
    if (cachedBreakPositions != NULL && positionInCache > 0) {
        --positionInCache;
        fText->setIndex(cachedBreakPositions[positionInCache]);
        return cachedBreakPositions[positionInCache];
    }

    // Otherwise, dump the cache and use the inherited previous() method to
    // move backward. This may fill up the cache with new break positions,
    // in which case we have to mark our position in the cache.
    reset();
    int32_t result = RuleBasedBreakIterator::previous();
    if (cachedBreakPositions != NULL) {
        positionInCache = 0;
        while (positionInCache < numCachedBreakPositions
               && cachedBreakPositions[positionInCache] != result) {
            ++positionInCache;
        }
        U_ASSERT(positionInCache < numCachedBreakPositions);
        if (positionInCache >= numCachedBreakPositions) {
            // Something has gone wrong. Dump the cache.
            reset();
        }
    }
    return result;
}
/**
 * Moves the iterator to the last boundary before the given offset.
 * Returns DONE when there is no text or the offset lies past the end of
 * the text, and the text's start index when the offset lies before it.
 * @param offset The position to begin searching from
 * @return The position of the last boundary before "offset"
 */
int32_t
DictionaryBasedBreakIterator::preceding(int32_t offset)
{
    if (fText == NULL || offset > fText->endIndex()) {
        return BreakIterator::DONE;
    }
    if (offset < fText->startIndex()) {
        return fText->startIndex();
    }

    // The cache can answer only when it exists and "offset" falls in
    // (first cached position, last cached position].
    const UBool cacheCoversOffset =
        cachedBreakPositions != NULL
        && offset > cachedBreakPositions[0]
        && offset <= cachedBreakPositions[numCachedBreakPositions - 1];

    if (!cacheCoversOffset) {
        // Let the inherited routine do the work; it will eventually call
        // other routines in this class that may refresh the cache.
        reset();
        return RuleBasedBreakIterator::preceding(offset);
    }

    // Find the first cached position that is not before "offset", then
    // step back one entry to get the last break before it.
    for (positionInCache = 0;
         positionInCache < numCachedBreakPositions
             && offset > cachedBreakPositions[positionInCache];
         ++positionInCache) {
    }
    --positionInCache;
    fText->setIndex(cachedBreakPositions[positionInCache]);
    return fText->getIndex();
}
/**
 * Moves the iterator to the first boundary after the given offset.
 * Returns DONE when there is no text or the offset lies past the end of
 * the text, and the text's start index when the offset lies before it.
 * @param offset The position to begin searching forward from
 * @return The position of the first boundary after "offset"
 */
int32_t
DictionaryBasedBreakIterator::following(int32_t offset)
{
    if (fText == NULL || offset > fText->endIndex()) {
        return BreakIterator::DONE;
    }
    if (offset < fText->startIndex()) {
        return fText->startIndex();
    }

    // The cache can answer only when it exists and "offset" falls in
    // [first cached position, last cached position).
    const UBool cacheCoversOffset =
        cachedBreakPositions != NULL
        && offset >= cachedBreakPositions[0]
        && offset < cachedBreakPositions[numCachedBreakPositions - 1];

    if (!cacheCoversOffset) {
        // Dump the cache and call the inherited following(); that will
        // call other methods in this class that may refresh the cache.
        reset();
        return RuleBasedBreakIterator::following(offset);
    }

    // Walk the cache to the first position strictly after "offset".
    for (positionInCache = 0;
         positionInCache < numCachedBreakPositions
             && offset >= cachedBreakPositions[positionInCache];
         ++positionInCache) {
    }
    fText->setIndex(cachedBreakPositions[positionInCache]);
    return fText->getIndex();
}
/**
 * This is the implementation function for next().
 *
 * When there is no cache, or the current position is the last cached
 * one, the inherited handleNext() supplies a tentative boundary. If more
 * than one dictionary character was passed over and the span is longer
 * than one, divideUpDictionaryRange() is asked to rebuild the cache for
 * that span; otherwise the tentative boundary is returned as is.
 */
int32_t
DictionaryBasedBreakIterator::handleNext()
{
    const UBool cacheUsedUp =
        (cachedBreakPositions == NULL
         || positionInCache == numCachedBreakPositions - 1);

    if (cacheUsedUp) {
        // fDictionaryCharCount reports how many dictionary characters the
        // inherited handleNext() passed over on its way to the tentative
        // return value.
        const int32_t startPos = fText->getIndex();
        fDictionaryCharCount = 0;
        const int32_t result = RuleBasedBreakIterator::handleNext();

        const UBool spansDictionaryText =
            (fDictionaryCharCount > 1 && result - startPos > 1);
        if (!spansDictionaryText) {
            // The value from the inherited function is our return value,
            // and we can dump the cache.
            reset();
            return result;
        }

        UErrorCode status = U_ZERO_ERROR;
        divideUpDictionaryRange(startPos, result, status);
        U_ASSERT(U_SUCCESS(status));
        if (U_FAILURE(status)) {
            // Something went badly wrong, an internal error.
            // We have no way from here to report it to the caller.
            // Treat it as if the dictionary did not apply to the range.
            reset();
            return result;
        }
    }

    if (cachedBreakPositions == NULL) {
        return -9999;   // SHOULD NEVER GET HERE!
    }

    // The cache has been regenerated (or existed all along): advance to
    // the next break position in it and return that.
    ++positionInCache;
    fText->setIndex(cachedBreakPositions[positionInCache]);
    return cachedBreakPositions[positionInCache];
}
// Throw away the cached break positions and zero the cache bookkeeping.
void
DictionaryBasedBreakIterator::reset()
{
    uprv_free(cachedBreakPositions);
    cachedBreakPositions    = NULL;
    numCachedBreakPositions = 0;
    positionInCache         = 0;
    fDictionaryCharCount    = 0;
}
//------------------------------------------------------------------------------
//
//    init()    Shared by the constructors: puts every dictionary-specific
//              member into its empty state. Unlike reset(), it does not
//              free anything.
//
//------------------------------------------------------------------------------
void DictionaryBasedBreakIterator::init() {
    fTables                 = NULL;
    cachedBreakPositions    = NULL;
    numCachedBreakPositions = 0;
    positionInCache         = 0;
    fDictionaryCharCount    = 0;
}
//------------------------------------------------------------------------------
//
//    BufferClone    Clone this iterator into a caller-supplied buffer,
//                   falling back to the heap when the buffer is missing
//                   or too small.
//
//    stackBuffer    Caller's buffer, or NULL to force heap allocation.
//    bufferSize     Size of stackBuffer in bytes. If 0 on entry, this is
//                   a preflight call: the required size is stored here
//                   and NULL is returned.
//    status         Set to U_SAFECLONE_ALLOCATED_WARNING when the clone
//                   had to be heap allocated, or to
//                   U_MEMORY_ALLOCATION_ERROR when that allocation failed.
//
//    Fix relative to the original: the usable size is an unsigned value,
//    and subtracting the alignment padding from a buffer smaller than the
//    padding (or starting from a negative bufferSize) wrapped around to a
//    huge number, so an undersized buffer was accepted and then
//    overwritten. Such buffers are now treated as size 0, which forces
//    heap allocation.
//
//------------------------------------------------------------------------------
BreakIterator *  DictionaryBasedBreakIterator::createBufferClone(void *stackBuffer,
                                   int32_t &bufferSize,
                                   UErrorCode &status)
{
    if (U_FAILURE(status)){
        return NULL;
    }

    //
    //  If user buffer size is zero this is a preflight operation to
    //    obtain the needed buffer size, allowing for worst case misalignment.
    //
    if (bufferSize == 0) {
        bufferSize = sizeof(DictionaryBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
        return NULL;
    }

    //
    //  Check the alignment and size of the user supplied buffer.
    //  Allocate heap memory if the user supplied memory is insufficient.
    //
    char    *buf   = (char *)stackBuffer;
    uint32_t s     = (bufferSize > 0) ? (uint32_t)bufferSize : 0;

    if (stackBuffer == NULL) {
        s = 0;   // Ignore size, force allocation if user didn't give us a buffer.
    }
    if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
        uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
        if (s > offsetUp) {
            s   -= offsetUp;
            buf += offsetUp;
        }
        else {
            // Nothing is left once the buffer is aligned.
            s = 0;
        }
    }
    if (s < sizeof(DictionaryBasedBreakIterator)) {
        buf = (char *) new DictionaryBasedBreakIterator();
        if (buf == 0) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        status = U_SAFECLONE_ALLOCATED_WARNING;
    }

    //
    //  Initialize the clone object.
    //  TODO:  using an overloaded C++ "operator new" to directly initialize the
    //         copy in the user's buffer would be better, but it doesn't seem
    //         to get along with namespaces.  Investigate why.
    //
    //         The memcpy is only safe with an empty (default constructed)
    //         break iterator.  Use on others can screw up reference counts
    //         to data.  memcpy-ing objects is not really a good idea...
    //
    DictionaryBasedBreakIterator localIter;        // Empty break iterator, source for memcpy
    DictionaryBasedBreakIterator *clone = (DictionaryBasedBreakIterator *)buf;
    uprv_memcpy(clone, &localIter, sizeof(DictionaryBasedBreakIterator)); // clone = empty, but initialized, iterator.
    *clone = *this;      // clone = the real one we want.
    if (status != U_SAFECLONE_ALLOCATED_WARNING) {
        // The clone lives in the caller's buffer, so flag it as such.
        clone->fBufferClone = TRUE;
    }

    return clone;
}
/**
 * This is the function that actually implements the dictionary-based
 * algorithm.  Given the endpoints of a range of text, it uses the
 * dictionary to determine the positions of any boundaries in this
 * range.  It stores all the boundary positions it discovers in
 * cachedBreakPositions so that we only have to do this work once
 * for each time we enter the range.
 *
 * On success the cache holds startPos, then every boundary found, with
 * endPos as the final entry; numCachedBreakPositions is updated to match
 * and positionInCache is reset to 0.
 *
 * @param startPos  Index of the start of the range; becomes the first
 *                  cached break position.
 * @param endPos    Index of the end of the range; becomes the last
 *                  cached break position.
 * @param status    Error code.  If it already indicates failure, or a stack
 *                  operation fails, the function returns before the cache is
 *                  replaced.  Set to U_MEMORY_ALLOCATION_ERROR if the new
 *                  cache array cannot be allocated.
 */
void
DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t endPos, UErrorCode &status)
{
    // the range we're dividing may begin or end with non-dictionary characters
    // (i.e., for line breaking, we may have leading or trailing punctuation
    // that needs to be kept with the word).  Seek from the beginning of the
    // range to the first dictionary character
    fText->setIndex(startPos);
    UChar32 c = fText->current32();
    while (isDictionaryChar(c) == FALSE) {
        c = fText->next32();
    }

    if (U_FAILURE(status)) {
        return; // UStack below overwrites the status error codes
    }

    // initialize.  We maintain two stacks: currentBreakPositions contains
    // the list of break positions that will be returned if we successfully
    // finish traversing the whole range now.  possibleBreakPositions lists
    // all other possible word ends we've passed along the way.  (Whenever
    // we reach an error [a sequence of characters that can't begin any word
    // in the dictionary], we back up, possibly delete some breaks from
    // currentBreakPositions, move a break from possibleBreakPositions
    // to currentBreakPositions, and start over from there.  This process
    // continues in this way until we either successfully make it all the way
    // across the range, or exhaust all of our combinations of break
    // positions.)  wrongBreakPositions is used to keep track of paths we've
    // tried on previous iterations.  As the iterator backs up further and
    // further, this saves us from having to follow each possible path
    // through the text all the way to the error (hopefully avoiding many
    // future recursive calls as well).
    // there can be only one kind of error in UStack and UVector, so we'll
    // just let the error fall through
    UStack currentBreakPositions(status);
    UStack possibleBreakPositions(status);
    UVector wrongBreakPositions(status);

    // the dictionary is implemented as a trie, which is treated as a state
    // machine.  -1 represents the end of a legal word.  Every word in the
    // dictionary is represented by a path from the root node to -1.  A path
    // that ends in state 0 is an illegal combination of characters.
    int16_t state = 0;

    // these two variables are used for error handling.  We keep track of the
    // farthest we've gotten through the range being divided, and the combination
    // of breaks that got us that far.  If we use up all possible break
    // combinations, the text contains an error or a word that's not in the
    // dictionary.  In this case, we "bless" the break positions that got us the
    // farthest as real break positions, and then start over from scratch with
    // the character where the error occurred.
    int32_t farthestEndPoint = fText->getIndex();
    UStack bestBreakPositions(status);
    UBool bestBreakPositionsInitialized = FALSE;

    if (U_FAILURE(status)) {
        return;
    }

    // initialize (we always exit the loop with a break statement)
    c = fText->current32();
    for (;;) {
        // The dictionary implementation doesn't do supplementary chars.
        // Put them through as an unpaired surrogate, which
        // will end any dictionary match in progress.
        // The test is >= so that U+10000 itself, the first supplementary
        // code point, is covered too; otherwise the (UChar) cast below
        // would silently truncate it to U+0000.
        // With any luck, this dictionary implementation will be retired soon.
        if (c >= 0x10000) {
            c = 0xd800;
        }

        // if we can transition to state "-1" from our current state, we're
        // on the last character of a legal word.  Push that position onto
        // the possible-break-positions stack
        if (fTables->fDictionary->at(state, (int32_t)0) == -1) {
            possibleBreakPositions.push(fText->getIndex(), status);
            if (U_FAILURE(status)) {
                return;
            }
        }

        // look up the new state to transition to in the dictionary
        state = fTables->fDictionary->at(state, (UChar)c);

        // if the character we're sitting on causes us to transition to
        // the "end of word" state, then it was a non-dictionary character
        // and we've successfully traversed the whole range.  Drop out
        // of the loop.
        if (state == -1) {
            currentBreakPositions.push(fText->getIndex(), status);
            if (U_FAILURE(status)) {
                return;
            }
            break;
        }

        // if the character we're sitting on causes us to transition to
        // the error state, or if we've gone off the end of the range
        // without transitioning to the "end of word" state, we've hit
        // an error...
        else if (state == 0 || fText->getIndex() >= endPos) {

            // if this is the farthest we've gotten, take note of it in
            // case there's an error in the text
            if (fText->getIndex() > farthestEndPoint) {
                farthestEndPoint = fText->getIndex();
                bestBreakPositions.removeAllElements();
                bestBreakPositionsInitialized = TRUE;
                for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
                    bestBreakPositions.push(currentBreakPositions.elementAti(i), status);
                }
            }

            // wrongBreakPositions is a list of all break positions we've tried starting
            // that didn't allow us to traverse all the way through the text.  Every time
            // we pop a break position off of currentBreakPositions, we put it into
            // wrongBreakPositions to avoid trying it again later.  If we make it to this
            // spot, we're either going to back up to a break in possibleBreakPositions
            // and try starting over from there, or we've exhausted all possible break
            // positions and are going to do the fallback procedure.  This loop prevents
            // us from messing with anything in possibleBreakPositions that didn't work as
            // a starting point the last time we tried it (this is to prevent a bunch of
            // repetitive checks from slowing down some extreme cases)
            while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
                        possibleBreakPositions.peeki())) {
                possibleBreakPositions.popi();
            }

            // if we've used up all possible break-position combinations, there's
            // an error or an unknown word in the text.  In this case, we start
            // over, treating the farthest character we've reached as the beginning
            // of the range, and "blessing" the break positions that got us that
            // far as real break positions
            if (possibleBreakPositions.isEmpty()) {
                if (bestBreakPositionsInitialized) {
                    currentBreakPositions.removeAllElements();
                    for (int32_t i = 0; i < bestBreakPositions.size(); i++) {
                        currentBreakPositions.push(bestBreakPositions.elementAti(i), status);
                        if (U_FAILURE(status)) {
                            return;
                        }
                    }
                    bestBreakPositions.removeAllElements();
                    if (farthestEndPoint < endPos) {
                        fText->setIndex(farthestEndPoint);
                        fText->next32();
                    }
                    else {
                        break;
                    }
                }
                else {
                    if ((currentBreakPositions.isEmpty()
                            || currentBreakPositions.peeki() != fText->getIndex())
                            && fText->getIndex() != startPos) {
                        currentBreakPositions.push(fText->getIndex(), status);
                        if (U_FAILURE(status)) {
                            return;
                        }
                    }
                    fText->next32();
                    currentBreakPositions.push(fText->getIndex(), status);
                    if (U_FAILURE(status)) {
                        return;
                    }
                }
            }

            // if we still have more break positions we can try, then promote the
            // last break in possibleBreakPositions into currentBreakPositions,
            // and get rid of all entries in currentBreakPositions that come after
            // it.  Then back up to that position and start over from there (i.e.,
            // treat that position as the beginning of a new word)
            else {
                int32_t temp = possibleBreakPositions.popi();
                int32_t temp2 = 0;
                while (!currentBreakPositions.isEmpty() && temp <
                       currentBreakPositions.peeki()) {
                    temp2 = currentBreakPositions.popi();
                    wrongBreakPositions.addElement(temp2, status);
                }
                currentBreakPositions.push(temp, status);
                fText->setIndex(currentBreakPositions.peeki());
            }

            // re-sync "c" for the next go-round, and drop out of the loop if
            // we've made it off the end of the range
            c = fText->current32();
            if (fText->getIndex() >= endPos) {
                break;
            }
        }

        // if we didn't hit any exceptional conditions on this last iteration,
        // just advance to the next character and loop
        else {
            c = fText->next32();
        }
    }

    // dump the last break position in the list, and replace it with the actual
    // end of the range (which may be the same character, or may be further on
    // because the range actually ended with non-dictionary characters we want to
    // keep with the word)
    if (!currentBreakPositions.isEmpty()) {
        currentBreakPositions.popi();
    }
    currentBreakPositions.push(endPos, status);
    if (U_FAILURE(status)) {
        return;
    }

    // create a regular array to hold the break positions and copy
    // the break positions from the stack to the array (in addition,
    // our starting position goes into this array as a break position).
    // This array becomes the cache of break positions used by next()
    // and previous(), so this is where we actually refresh the cache.
    if (cachedBreakPositions != NULL) {
        uprv_free(cachedBreakPositions);
    }
    cachedBreakPositions = (int32_t *)uprv_malloc((currentBreakPositions.size() + 1) * sizeof(int32_t));
    /* Test for NULL */
    if(cachedBreakPositions == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    numCachedBreakPositions = currentBreakPositions.size() + 1;
    cachedBreakPositions[0] = startPos;
    for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
        cachedBreakPositions[i + 1] = currentBreakPositions.elementAti(i);
    }
    positionInCache = 0;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
/* eof */

View file

@ -1,59 +0,0 @@
/*
**********************************************************************
* Copyright (C) 1999-2002 IBM Corp. All rights reserved.
**********************************************************************
* Date Name Description
* 12/1/99 rgillam Complete port from Java.
* 01/13/2000 helena Added UErrorCode to ctors.
* 06/14/2002 andy Gutted for new RBBI impl.
**********************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "dbbi_tbl.h"
#include "unicode/dbbi.h"
#include "umutex.h"
U_NAMESPACE_BEGIN
//=======================================================================
// constructor
//=======================================================================
// Creates the BreakDictionary named by dictionaryFilename and starts the
// reference count at 1.
//
// @param dictionaryFilename  Passed through to the BreakDictionary constructor.
// @param status              Error code; also passed to the BreakDictionary
//                            constructor.  Set to U_MEMORY_ALLOCATION_ERROR
//                            if the dictionary object could not be allocated.
DictionaryBasedBreakIteratorTables::DictionaryBasedBreakIteratorTables(
        const char* dictionaryFilename,
        UErrorCode &status) {
    fDictionary = new BreakDictionary(dictionaryFilename, status);
    // NOTE(review): assumes BreakDictionary is allocated with a non-throwing
    // operator new (so `new` can yield NULL) -- confirm against brkdict.h.
    // Without this check a NULL dictionary would be left behind while status
    // still reports success.
    if (fDictionary == NULL && U_SUCCESS(status)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    fRefCount = 1;
}
// Records one more holder of this tables object by incrementing the
// reference count through umtx_atomic_inc().
void DictionaryBasedBreakIteratorTables::addReference()
{
    (void) umtx_atomic_inc(&fRefCount);
}
// Drops one reference.  When the decremented count reaches zero this object
// deletes itself, so callers must not touch it after releasing their reference.
void DictionaryBasedBreakIteratorTables::removeReference()
{
    if (umtx_atomic_dec(&fRefCount) != 0) {
        return;     // still referenced elsewhere
    }
    delete this;
}
/**
 * Destructor.  Deletes the dictionary created by the constructor and
 * clears the pointer.
 */
DictionaryBasedBreakIteratorTables::~DictionaryBasedBreakIteratorTables()
{
    if (fDictionary != NULL) {
        delete fDictionary;
        fDictionary = NULL;
    }
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
/* eof */

View file

@ -1,90 +0,0 @@
/*
**********************************************************************
* Copyright (C) 1999-2000 IBM Corp. All rights reserved.
**********************************************************************
* Date Name Description
* 12/1/99 rgillam Complete port from Java.
* 01/13/2000 helena Added UErrorCode to ctors.
**********************************************************************
*/
#ifndef DBBI_TBL_H
#define DBBI_TBL_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/udata.h"
#include "brkdict.h"
U_NAMESPACE_BEGIN
/* forward declaration */
class DictionaryBasedBreakIterator;
//
// DictionaryBasedBreakIteratorTables
//
// This class sits between instances of DictionaryBasedBreakIterator
// and the dictionary data itself, which is of type BreakDictionary.
// It provides reference counting, allowing multiple copies of a
// DictionaryBasedBreakIterator to share a single instance of
// BreakDictionary.
//
// TODO: it'd probably be cleaner to add the reference counting to
// BreakDictionary and get rid of this class, but doing it this way
// was a convenient transition from earlier code, and time is short...
//
class DictionaryBasedBreakIteratorTables : public UMemory {
private:
// Number of outstanding references to this object; adjusted only through
// addReference() and removeReference().
int32_t fRefCount;
public:
//=======================================================================
// constructor
//=======================================================================
/**
* Constructor.
* @param dictionaryFilename The name of the dictionary file
* @param status The error code
**/
DictionaryBasedBreakIteratorTables(const char* dictionaryFilename,
UErrorCode& status);
// The shared dictionary data.  Public so that the break iterator can read
// it directly; deleted by the destructor.
BreakDictionary *fDictionary;
// Adds one reference to this object.
void addReference();
// Releases one reference to this object.
void removeReference();
/**
* Destructor. Should not be used directly. Use removeReference() instead.
* (Not private to avoid compiler warnings.)
*/
virtual ~DictionaryBasedBreakIteratorTables();
private:
/**
* The copy constructor is declared private and not implemented.
* THIS CLASS MAY NOT BE COPIED.
* @param that The DictionaryBasedBreakIteratorTables to be copied.
*/
DictionaryBasedBreakIteratorTables(const DictionaryBasedBreakIteratorTables& that);
//=======================================================================
// boilerplate
//=======================================================================
/**
* The assignment operator is declared private and not implemented.
* THIS CLASS MAY NOT BE COPIED.
* Call addReference() and share an existing copy instead.
* @param that The object to be copied
* @return a reference to this object.
*/
DictionaryBasedBreakIteratorTables& operator=(
const DictionaryBasedBreakIteratorTables& that);
};
U_NAMESPACE_END
#endif

View file

@ -0,0 +1,413 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and others. *
* All Rights Reserved. *
*******************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "brkeng.h"
#include "dictbe.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
#include "unicode/ubrk.h"
#include "uvector.h"
#include "triedict.h"
U_NAMESPACE_BEGIN
/*
******************************************************************
*/
// Default constructor: the break-type bitmap starts out empty.
DictionaryBreakEngine::DictionaryBreakEngine()
    : fTypes(0)
{
}
// Constructor taking the bitmap of break types this engine handles
// (bit n set means break type n is handled).
DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes)
    : fTypes(breakTypes)
{
}
// Destructor.  Nothing to release explicitly; members are destroyed automatically.
DictionaryBreakEngine::~DictionaryBreakEngine()
{
}
// Reports whether this engine handles character c for the given break type.
//
// @param c          A character which begins a run the engine might handle.
// @param breakType  The type of break the caller wants; must be in [0, 31]
//                   and have its bit set in fTypes to be considered handled.
// @return TRUE only when the break type is one this engine was configured
//         for AND c is in the engine's character set.
//
// The break-type test is the same one findBreaks() applies before dividing a
// range, so an engine no longer claims a character for a break type it will
// then refuse to process.
UBool
DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
    // Range-check first: shifting by a negative or >= 32 count is undefined.
    if (breakType < 0 || breakType >= 32) {
        return FALSE;
    }
    if ((((uint32_t)1 << breakType) & fTypes) == 0) {
        return FALSE;
    }
    return fSet.contains(c);
}
// Finds the run of characters from fSet around the iterator's current
// position and, when breakType is one of the types in fTypes, hands that run
// to divideUpDictionaryRange().
//
// @param text         Text iterator, positioned where the scan should begin.
// @param startPos     Lower limit for a reverse scan.
// @param endPos       Upper limit for a forward scan.
// @param reverse      TRUE to scan backwards from the current position.
// @param breakType    Kind of break wanted; tested against the fTypes bitmap.
// @param foundBreaks  Passed through to divideUpDictionaryRange().
// @return Whatever divideUpDictionaryRange() returns, or 0 when breakType is
//         not handled by this engine.
int32_t
DictionaryBreakEngine::findBreaks( CharacterIterator *text,
                                   int32_t startPos,
                                   int32_t endPos,
                                   UBool reverse,
                                   int32_t breakType,
                                   UStack &foundBreaks ) const
{
    const int32_t initialPos = text->getIndex();
    UChar32 ch = text->current32();

    int32_t pos;            // iterator index at which the scan stopped
    int32_t rangeStart;
    int32_t rangeEnd;

    if (!reverse) {
        // Walk forward while still inside the limit and inside the set.
        for (;;) {
            pos = text->getIndex();
            if (pos >= endPos || !fSet.contains(ch)) {
                break;
            }
            ch = text->next32();
        }
        rangeStart = initialPos;
        rangeEnd = pos;
    }
    else {
        // Walk backward while still above the limit and inside the set.
        UBool inSet = fSet.contains(ch);
        for (;;) {
            pos = text->getIndex();
            if (pos <= startPos || !inSet) {
                break;
            }
            ch = text->previous32();
            inSet = fSet.contains(ch);
        }
        if (pos < startPos) {
            rangeStart = startPos;
        }
        else if (inSet) {
            rangeStart = pos;
        }
        else {
            // Stopped on a character outside the set; the run begins after it.
            rangeStart = pos + 1;
        }
        rangeEnd = initialPos + 1;
    }

    // Only divide the run if this engine was configured for the break type.
    const UBool typeHandled = (UBool)(breakType >= 0 && breakType < 32
            && ((((uint32_t)1 << breakType) & fTypes) != 0));
    if (!typeHandled) {
        return 0;
    }

    const int32_t breakCount = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
    text->setIndex(pos);    // put the iterator back where the scan stopped
    return breakCount;
}
// Replaces the engine's character set with a copy of the given set.
void
DictionaryBreakEngine::setCharacters( UnicodeSet &set )
{
    this->fSet = set;
}
// Replaces the bitmap of break types handled by this engine.
void
DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes )
{
    this->fTypes = breakTypes;
}
/*
******************************************************************
*/
// Helper class for improving readability of the Thai word break
// algorithm. The implementation is completely inline.
// List size, limited by the maximum number of words in the dictionary
// that form a nested sequence.
#define POSSIBLE_WORD_LIST_MAX 20
class PossibleWord {
private:
// list of word candidate lengths, in increasing length order
int32_t lengths[POSSIBLE_WORD_LIST_MAX];
int count; // Count of candidates
int32_t prefix; // The longest match with a dictionary word
int32_t offset; // Offset in the text of these candidates
int mark; // The preferred candidate's offset
int current; // The candidate we're currently looking at
public:
PossibleWord();
~PossibleWord();
// Fill the list of candidates if needed, select the longest, and return the number found
int candidates( CharacterIterator *text, const TrieWordDictionary *dict, int32_t rangeEnd );
// Select the currently marked candidate, point after it in the text, and invalidate self
int32_t acceptMarked( CharacterIterator *text );
// Back up from the current candidate to the next shorter one; return TRUE if that exists
// and point the text after it
UBool backUp( CharacterIterator *text );
// Return the longest prefix this candidate location shares with a dictionary word
int32_t longestPrefix();
// Mark the current candidate as the one we like
void markCurrent();
};
inline
PossibleWord::PossibleWord() {
offset = -1;
}
inline
PossibleWord::~PossibleWord() {
}
inline int
PossibleWord::candidates( CharacterIterator *text, const TrieWordDictionary *dict, int32_t rangeEnd ) {
// TODO: If getIndex is too slow, use offset < 0 and add discardAll()
int32_t start = text->getIndex();
if (start != offset) {
offset = start;
prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
// Dictionary leaves text after longest prefix, not longest word. Back up.
if (count <= 0) {
text->setIndex(start);
}
}
if (count > 0) {
text->setIndex(start+lengths[count-1]);
}
current = count-1;
mark = current;
return count;
}
inline int32_t
PossibleWord::acceptMarked( CharacterIterator *text ) {
text->setIndex(offset + lengths[mark]);
return lengths[mark];
}
inline UBool
PossibleWord::backUp( CharacterIterator *text ) {
if (current > 0) {
text->setIndex(offset + lengths[--current]);
return TRUE;
}
return FALSE;
}
inline int32_t
PossibleWord::longestPrefix() {
return prefix;
}
inline void
PossibleWord::markCurrent() {
mark = current;
}
// How many words in a row are "good enough"?
// (Also the number of PossibleWord look-ahead slots used when dividing a range.)
#define THAI_LOOKAHEAD 3
// Will not combine a non-word with a preceding dictionary word longer than this
#define THAI_ROOT_COMBINE_THRESHOLD 3
// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
#define THAI_PREFIX_COMBINE_THRESHOLD 3
// Elision character (U+0E2F THAI CHARACTER PAIYANNOI)
#define THAI_PAIYANNOI 0x0E2F
// Repeat character (U+0E46 THAI CHARACTER MAIYAMOK)
#define THAI_MAIYAMOK 0x0E46
// Minimum word size
#define THAI_MIN_WORD 2
// Minimum number of characters for two words; shorter ranges are not divided
#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)
// Constructs a Thai engine that handles word and line breaks.
//
// @param adoptDictionary  Dictionary to adopt; deleted by the destructor.
// @param status           Error code; receives any failure from parsing the
//                         set patterns below.
ThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    // Characters this engine claims: Thai script with line-break class SA.
    const UnicodeString thaiPattern("[[:Thai:]&[:LineBreak=SA:]]", -1, US_INV);
    fThaiWordSet.applyPattern(thaiPattern, status);
    if (!U_FAILURE(status)) {
        setCharacters(fThaiWordSet);
    }

    // The subset of those characters that are marks.
    const UnicodeString markPattern("[[:Thai:]&[:LineBreak=SA:]&[:M:]]", -1, US_INV);
    fMarkSet.applyPattern(markPattern, status);

    // Characters that may end a word: everything in the Thai set except...
    fEndWordSet = fThaiWordSet;
    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI

    // Characters that may begin a word.
    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI

    // Characters treated as word suffixes.
    fSuffixSet.add(THAI_PAIYANNOI);
    fSuffixSet.add(THAI_MAIYAMOK);
}
// Destructor: frees the dictionary that was adopted at construction.
ThaiBreakEngine::~ThaiBreakEngine()
{
    if (fDictionary != NULL) {
        delete fDictionary;
    }
}
// Divide a run of Thai characters into words using the dictionary plus
// heuristics, pushing each break position found onto foundBreaks.
//
// @param text         Iterator over the text; repositioned freely during the scan.
// @param rangeStart   Index of the first character of the run.
// @param rangeEnd     Index just past the run.
// @param foundBreaks  Stack that receives the break positions (text indices).
//                     A break that falls on rangeEnd itself is removed again
//                     before returning.
// @return The number of words found, less one if a trailing break at rangeEnd
//         was removed.  Returns 0 without scanning when the run is shorter
//         than THAI_MIN_WORD_SPAN.
//
// NOTE(review): `status` below is local, so a failed foundBreaks.push() ends
// the loop but is not reported to the caller.
int32_t
ThaiBreakEngine::divideUpDictionaryRange( CharacterIterator *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const {
if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
return 0; // Not enough characters for two words
}
uint32_t wordsFound = 0;
int32_t wordLength;
int32_t current;
UErrorCode status = U_ZERO_ERROR;
// Ring of look-ahead slots; the slot for the word being decided is
// words[wordsFound % THAI_LOOKAHEAD], and the following slots hold the
// candidates for the words after it.
PossibleWord words[THAI_LOOKAHEAD];
UChar32 uc;
text->setIndex(rangeStart);
// Each iteration decides one word starting at `current`.
while (U_SUCCESS(status) && (current = text->getIndex()) < rangeEnd) {
wordLength = 0;
// Look for candidate words at the current position
int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
// If we found exactly one, use that
if (candidates == 1) {
wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
wordsFound += 1;
}
// If there was more than one, see which one can take us forward the most words
else if (candidates > 1) {
// If we're already at the end of the range, we're done
if (text->getIndex() >= rangeEnd) {
goto foundBest;
}
// Try the first-word candidates from longest to shortest.
do {
int wordsMatched = 1;
if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
// NOTE(review): wordsMatched was set to 1 just above, so this test
// is always true here.
if (wordsMatched < 2) {
// Followed by another dictionary word; mark first word as a good candidate
words[wordsFound%THAI_LOOKAHEAD].markCurrent();
wordsMatched = 2;
}
// If we're already at the end of the range, we're done
if (text->getIndex() >= rangeEnd) {
goto foundBest;
}
// See if any of the possible second words is followed by a third word
do {
// If we find a third word, stop right away
if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
words[wordsFound%THAI_LOOKAHEAD].markCurrent();
goto foundBest;
}
}
while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(text));
}
}
while (words[wordsFound%THAI_LOOKAHEAD].backUp(text));
foundBest:
// Commit to whichever first-word candidate ended up marked.
wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
wordsFound += 1;
}
// We come here after having either found a word or not. We look ahead to the
// next word. If it's not a dictionary word, we will combine it with the word we
// just found (if there is one), but only if the preceding word does not exceed
// the threshold.
// The text iterator should now be positioned at the end of the word we found.
if (text->getIndex() < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
// if it is a dictionary word, do nothing. If it isn't, then if there is
// no preceding word, or the non-word shares less than the minimum threshold
// of characters with a dictionary word, then scan to resynchronize
if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
&& (wordLength == 0
|| words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
// Look for a plausible word boundary
//TODO: This section will need a rework for UText.
int32_t remaining = rangeEnd - (current+wordLength);
UChar32 pc = text->current32();
int32_t chars = 0;
while (TRUE) {
uc = text->next32();
// TODO: Here we're counting on the fact that the SA languages are all
// in the BMP. This should get fixed with the UText rework.
chars += 1;
if (--remaining <= 0) {
break;
}
if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
// Maybe. See if it's in the dictionary.
// NOTE: In the original Apple code, checked that the next
// two characters after uc were not 0x0E4C THANTHAKHAT before
// checking the dictionary. That is just a performance filter,
// but it's not clear it's faster than checking the trie.
int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
// candidates() moved the iterator; put it back on the character
// being examined.
text->setIndex(current+wordLength+chars);
if (candidates > 0) {
break;
}
}
pc = uc;
}
// Bump the word count if there wasn't already one
if (wordLength <= 0) {
wordsFound += 1;
}
// Update the length with the passed-over characters
wordLength += chars;
}
else {
// Back up to where we were for next iteration
text->setIndex(current+wordLength);
}
}
// Never stop before a combining mark.
int32_t currPos;
while ((currPos = text->getIndex()) < rangeEnd && fMarkSet.contains(text->current32())) {
wordLength += text->move32(1, CharacterIterator::kCurrent) - currPos;
}
// Look ahead for possible suffixes if a dictionary word does not follow.
// We do this in code rather than using a rule so that the heuristic
// resynch continues to function. For example, one of the suffix characters
// could be a typo in the middle of a word.
if (text->getIndex() < rangeEnd && wordLength > 0) {
if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
&& fSuffixSet.contains(uc = text->current32())) {
if (uc == THAI_PAIYANNOI) {
// Attach PAIYANNOI only when the preceding character is not itself a suffix.
if (!fSuffixSet.contains(text->previous32())) {
// Skip over previous end and PAIYANNOI
text->move32(2, CharacterIterator::kCurrent);
wordLength += 1; // Add PAIYANNOI to word
uc = text->current32(); // Fetch next character
}
else {
// Restore prior position
text->move32(1, CharacterIterator::kCurrent);
}
}
if (uc == THAI_MAIYAMOK) {
// Attach MAIYAMOK only when the preceding character is not also MAIYAMOK.
if (text->previous32() != THAI_MAIYAMOK) {
// Skip over previous end and MAIYAMOK
text->move32(2, CharacterIterator::kCurrent);
wordLength += 1; // Add MAIYAMOK to word
}
else {
// Restore prior position
text->move32(1, CharacterIterator::kCurrent);
}
}
}
else {
text->setIndex(current+wordLength);
}
}
// Did we find a word on this iteration? If so, push it on the break stack
if (wordLength > 0) {
foundBreaks.push((current+wordLength), status);
}
}
// Don't return a break for the end of the dictionary range if there is one there.
if (foundBreaks.peeki() >= rangeEnd) {
(void) foundBreaks.popi();
wordsFound -= 1;
}
return wordsFound;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View file

@ -0,0 +1,192 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and others. *
* All Rights Reserved. *
*******************************************************************************
*/
#ifndef DICTBE_H
#define DICTBE_H
#include "unicode/utypes.h"
#include "unicode/uniset.h"
#include "brkeng.h"
U_NAMESPACE_BEGIN
class CharacterIterator;
class TrieWordDictionary;
/*******************************************************************
* DictionaryBreakEngine
*/
/**
* <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
* dictionary to determine language-specific breaks.</p>
*
* <p>After it is constructed a DictionaryBreakEngine may be shared between
* threads without synchronization.</p>
*/
class U_COMMON_API DictionaryBreakEngine : public LanguageBreakEngine {
private:
/**
* The set of characters handled by this engine
* @internal
*/
UnicodeSet fSet;
/**
* The set of break types handled by this engine, as a bitmap:
* bit n set means break type n is handled.
* @internal
*/
uint32_t fTypes;
public:
/**
* <p>Default constructor.</p>
*
*/
DictionaryBreakEngine();
/**
* <p>Constructor setting the break types handled.</p>
*
* @param breakTypes A bitmap of types handled by the engine.
*/
DictionaryBreakEngine( uint32_t breakTypes );
/**
* <p>Virtual destructor.</p>
*/
virtual ~DictionaryBreakEngine();
/**
* <p>Indicate whether this engine handles a particular character for
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param breakType The type of text break which the caller wants to determine
* @return TRUE if this engine handles the particular character and break
* type.
*/
virtual UBool handles( UChar32 c, int32_t breakType ) const;
/**
* <p>Find any breaks within a run in the supplied text.</p>
*
* @param text A CharacterIterator representing the text (TODO: UText). The
* iterator is left at the end of the run of characters which the engine
* is capable of handling.
* @param startPos The start of the run within the supplied text.
* @param endPos The end of the run within the supplied text.
* @param reverse Whether the caller is looking for breaks in a reverse
* direction.
* @param breakType The type of break desired, or -1.
* @param foundBreaks A stack that receives the break positions found, if
* any; it is passed through to divideUpDictionaryRange().
* @return The number of breaks found.
*/
virtual int32_t findBreaks( CharacterIterator *text,
int32_t startPos,
int32_t endPos,
UBool reverse,
int32_t breakType,
UStack &foundBreaks ) const;
protected:
/**
* <p>Set the character set handled by this engine.</p>
*
* @param set A UnicodeSet of the set of characters handled by the engine
*/
virtual void setCharacters( UnicodeSet &set );
/**
* <p>Set the break types handled by this engine.</p>
*
* @param breakTypes A bitmap of types handled by the engine.
*/
virtual void setBreakTypes( uint32_t breakTypes );
/**
* <p>Divide up a range of known dictionary characters.</p>
* <p>Pure virtual: each concrete engine supplies its own algorithm.</p>
*
* @param text A CharacterIterator representing the text
* @param rangeStart The start of the range of dictionary characters
* @param rangeEnd The end of the range of dictionary characters
* @param foundBreaks Output stack that receives the break positions found
* @return The number of breaks found
*/
virtual int32_t divideUpDictionaryRange( CharacterIterator *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const = 0;
};
/*******************************************************************
* ThaiBreakEngine
*/
/**
* <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
* TrieWordDictionary and heuristics to determine Thai-specific breaks.</p>
*
* <p>After it is constructed a ThaiBreakEngine may be shared between
* threads without synchronization.</p>
*/
class U_COMMON_API ThaiBreakEngine : public DictionaryBreakEngine {
private:
/**
* The set of characters handled by this engine
* @internal
*/
UnicodeSet fThaiWordSet;
/** Characters after which a word boundary is considered plausible. @internal */
UnicodeSet fEndWordSet;
/** Characters before which a word boundary is considered plausible. @internal */
UnicodeSet fBeginWordSet;
/** Suffix characters that may be attached to the preceding word. @internal */
UnicodeSet fSuffixSet;
/** Mark characters; a word is never ended immediately before one. @internal */
UnicodeSet fMarkSet;
/** The adopted word dictionary; owned by this engine. @internal */
const TrieWordDictionary *fDictionary;
public:
/**
* <p>Constructor.</p>
*
* @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
* engine is deleted.
* @param status Error code; receives any failure from engine set-up.
*/
ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status);
/**
* <p>Virtual destructor.</p>
*/
virtual ~ThaiBreakEngine();
protected:
/**
* <p>Divide up a range of known dictionary characters.</p>
*
* @param text A CharacterIterator representing the text
* @param rangeStart The start of the range of dictionary characters
* @param rangeEnd The end of the range of dictionary characters
* @param foundBreaks Output stack that receives the break positions found
* @return The number of breaks found
*/
virtual int32_t divideUpDictionaryRange( CharacterIterator *text,
int32_t rangeStart,
int32_t rangeEnd,
UStack &foundBreaks ) const;
};
U_NAMESPACE_END
/* DICTBE_H */
#endif

View file

@ -22,8 +22,12 @@
#include "rbbirb.h"
#include "cmemory.h"
#include "cstring.h"
#include "mutex.h"
#include "ucln_cmn.h"
#include "brkeng.h"
#include "uassert.h"
#include "uvector.h"
U_NAMESPACE_BEGIN
@ -133,6 +137,18 @@ RuleBasedBreakIterator::~RuleBasedBreakIterator() {
fData->removeReference();
fData = NULL;
}
if (fCachedBreakPositions) {
uprv_free(fCachedBreakPositions);
fCachedBreakPositions = NULL;
}
if (fLanguageBreakEngines) {
delete fLanguageBreakEngines;
fLanguageBreakEngines = NULL;
}
if (fUnhandledBreakEngine) {
delete fUnhandledBreakEngine;
fUnhandledBreakEngine = NULL;
}
}
/**
@ -144,6 +160,13 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
if (this == &that) {
return *this;
}
reset(); // Delete break cache information
fBreakType = that.fBreakType;
if (fLanguageBreakEngines != NULL) {
delete fLanguageBreakEngines;
fLanguageBreakEngines = NULL; // Just rebuild for now
}
// TODO: clone fLanguageBreakEngines from "that"
delete fText;
fText = NULL;
if (that.fText != NULL) {
@ -178,6 +201,13 @@ void RuleBasedBreakIterator::init() {
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = TRUE;
fDictionaryCharCount = 0;
fBreakType = -1;
fCachedBreakPositions = NULL;
fLanguageBreakEngines = NULL;
fUnhandledBreakEngine = NULL;
fNumCachedBreakPositions = 0;
fPositionInCache = 0;
#ifdef RBBI_DEBUG
static UBool debugInitDone = FALSE;
@ -374,7 +404,7 @@ int32_t RuleBasedBreakIterator::last(void) {
int32_t RuleBasedBreakIterator::next(int32_t n) {
int32_t result = current();
while (n > 0) {
result = handleNext();
result = next();
--n;
}
while (n < 0) {
@ -389,7 +419,25 @@ int32_t RuleBasedBreakIterator::next(int32_t n) {
* @return The position of the first boundary after this one.
*/
int32_t RuleBasedBreakIterator::next(void) {
return handleNext();
// if we have cached break positions and we're still in the range
// covered by them, just move one step forward in the cache
if (fCachedBreakPositions != NULL) {
if (fPositionInCache < fNumCachedBreakPositions - 1) {
++fPositionInCache;
fText->setIndex(fCachedBreakPositions[fPositionInCache]);
return fCachedBreakPositions[fPositionInCache];
}
else {
reset();
}
}
int32_t startPos = current();
int32_t result = handleNext(fData->fForwardTable);
if (fDictionaryCharCount > 0) {
result = checkDictionary(startPos, result, FALSE);
}
return result;
}
/**
@ -397,15 +445,35 @@ int32_t RuleBasedBreakIterator::next(void) {
* @return The position of the last boundary position preceding this one.
*/
int32_t RuleBasedBreakIterator::previous(void) {
int32_t result;
int32_t startPos;
// if we have cached break positions and we're still in the range
// covered by them, just move one step backward in the cache
if (fCachedBreakPositions != NULL) {
if (fPositionInCache > 0) {
--fPositionInCache;
fText->setIndex(fCachedBreakPositions[fPositionInCache]);
return fCachedBreakPositions[fPositionInCache];
}
else {
reset();
}
}
// if we're already sitting at the beginning of the text, return DONE
if (fText == NULL || current() == fText->startIndex()) {
if (fText == NULL || (startPos = current()) == fText->startIndex()) {
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = TRUE;
return BreakIterator::DONE;
}
if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
return handlePrevious(fData->fReverseTable);
result = handlePrevious(fData->fReverseTable);
if (fDictionaryCharCount > 0) {
result = checkDictionary(result, startPos, TRUE);
}
return result;
}
// old rule syntax
@ -424,7 +492,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
lastResult = fText->startIndex();
fText->setIndex(lastResult);
}
int32_t result = lastResult;
result = lastResult;
int32_t lastTag = 0;
UBool breakTagValid = FALSE;
@ -433,7 +501,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
// point is our return value
for (;;) {
result = handleNext();
result = next();
if (result == BreakIterator::DONE || result >= start) {
break;
}
@ -445,7 +513,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
// fLastBreakTag wants to have the value for section of text preceding
// the result position that we are to return (in lastResult.) If
// the backwards rules overshot and the above loop had to do two or more
// handleNext()s to move up to the desired return position, we will have a valid
// next()s to move up to the desired return position, we will have a valid
// tag value. But, if handlePrevious() took us to exactly the correct result position,
// we won't have a tag value for that position, which is only set by handleNext().
@ -454,6 +522,10 @@ int32_t RuleBasedBreakIterator::previous(void) {
fText->setIndex(lastResult);
fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
fLastStatusIndexValid = breakTagValid;
// No need to check the dictionary; it will have been handled by
// next()
return lastResult;
}
@ -464,6 +536,25 @@ int32_t RuleBasedBreakIterator::previous(void) {
* @return The position of the first break after the current position.
*/
int32_t RuleBasedBreakIterator::following(int32_t offset) {
// if we have cached break positions and offset is in the range
// covered by them, use them
// TODO: could use binary search
// TODO: what if offset is outside range, but break is not?
if (fCachedBreakPositions != NULL) {
if (offset >= fCachedBreakPositions[0]
&& offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
fPositionInCache = 0;
// We are guaranteed not to leave the array due to range test above
while (offset >= fCachedBreakPositions[fPositionInCache])
++fPositionInCache;
fText->setIndex(fCachedBreakPositions[fPositionInCache]);
return fCachedBreakPositions[fPositionInCache];
}
else {
reset();
}
}
// if the offset passed in is already past the end of the text,
// just return DONE; if it's before the beginning, return the
// text's starting offset
@ -533,7 +624,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
fText->setIndex(offset);
if (offset == fText->startIndex()) {
return handleNext();
return next();
}
result = previous();
@ -551,6 +642,26 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
* @return The position of the last boundary before the starting position.
*/
int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
// if we have cached break positions and offset is in the range
// covered by them, use them
if (fCachedBreakPositions != NULL) {
// TODO: binary search?
// TODO: What if offset is outside range, but break is not?
if (offset > fCachedBreakPositions[0]
&& offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
fPositionInCache = 0;
while (fPositionInCache < fNumCachedBreakPositions
&& offset > fCachedBreakPositions[fPositionInCache])
++fPositionInCache;
--fPositionInCache;
fText->setIndex(fCachedBreakPositions[fPositionInCache]);
return fCachedBreakPositions[fPositionInCache];
}
else {
reset();
}
}
// if the offset passed in is already past the end of the text,
// just return DONE; if it's before the beginning, return the
@ -688,20 +799,6 @@ enum RBBIRunMode {
};
//-----------------------------------------------------------------------------------
//
// handleNext(void) All forward iteration vectors through this function.
// NOTE: This function is overridden by the dictionary base break iterator.
// User level API functions go to the dbbi implementation
// when the break iterator type is dbbi.
// The DBBI implementation sometimes explicitly calls back to here,
// its inherited handleNext().
//
//-----------------------------------------------------------------------------------
int32_t RuleBasedBreakIterator::handleNext() {
    // Argument-less convenience form: run the state machine over the
    // forward table held in the rule data and hand back its result.
    const int32_t boundary = handleNext(fData->fForwardTable);
    return boundary;
}
//-----------------------------------------------------------------------------------
//
// handleNext(stateTable)
@ -1125,8 +1222,13 @@ continueOn:
void
RuleBasedBreakIterator::reset()
{
// Discard the dictionary break cache: free the cached position array (if
// any) and return all cache bookkeeping, including the dictionary
// character count, to its empty state.
if (fCachedBreakPositions) {
uprv_free(fCachedBreakPositions);
}
// Clear the pointer so later "fCachedBreakPositions != NULL" tests see
// that there is no cache.
fCachedBreakPositions = NULL;
fNumCachedBreakPositions = 0;
fDictionaryCharCount = 0;
fPositionInCache = 0;
}
@ -1155,6 +1257,9 @@ void RuleBasedBreakIterator::makeRuleStatusValid() {
// Not at start of text. Find status the tedious way.
int32_t pa = current();
previous();
if (fNumCachedBreakPositions > 0) {
reset(); // Blow off the dictionary cache
}
int32_t pb = next();
if (pa != pb) {
// note: the if (pa != pb) test is here only to eliminate warnings for
@ -1306,7 +1411,6 @@ BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
}
//-------------------------------------------------------------------------------
//
// isDictionaryChar Return true if the category lookup for this char
@ -1327,6 +1431,305 @@ UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {
}
//-------------------------------------------------------------------------------
//
// checkDictionary This function handles all processing of characters in
// the "dictionary" set. It will determine the appropriate
// course of action, and possibly set up a cache in the
// process.
//
//-------------------------------------------------------------------------------
/**
 * Subdivide a span proposed by the rule state tables using the language
 * (dictionary) break engines, and cache any breaks that are found.
 *
 * @param startPos  The lower offset of the proposed span.
 * @param endPos    The upper offset of the proposed span.
 * @param reverse   TRUE when iterating backwards (the caller is positioned at
 *                  endPos and wants the last boundary before it); FALSE when
 *                  iterating forwards (the caller is positioned at startPos
 *                  and wants the first boundary after it).
 * @return The boundary to report: startPos (reverse) or endPos (forward) if
 *         no dictionary breaks apply, otherwise the nearest cached break in
 *         the direction of travel.
 */
int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
                                                int32_t endPos,
                                                UBool reverse) {
    // Reset the old break cache first. reset() also zeroes
    // fDictionaryCharCount, so capture its value before calling it.
    uint32_t dictionaryCount = fDictionaryCharCount;
    reset();

    // Nothing to subdivide: at most one dictionary character was seen, or
    // the span is at most one code unit long.
    if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
        return (reverse ? startPos : endPos);
    }

    // Starting from the starting point, scan towards the proposed result,
    // looking for the first dictionary character (which may be the one
    // we're on, if we're starting in the middle of a range).
    fText->setIndex(reverse ? endPos : startPos);
    if (reverse) {
        fText->move32(-1, CharacterIterator::kCurrent);
    }

    int32_t rangeStart = startPos;
    int32_t rangeEnd = endPos;

    uint16_t    category;
    int32_t     current;
    UErrorCode  status = U_ZERO_ERROR;
    UStack      breaks(status);
    int32_t     foundBreakCount = 0;
    UChar32     c = fText->current32();

    UTRIE_GET16(&fData->fTrie, c, category);

    // Is the character we're starting on a dictionary character? If so, we
    // need to back up to include the entire run; otherwise the results of
    // the break algorithm will differ depending on where we start. Since
    // the result is cached and there is typically a non-dictionary break
    // within a small number of words, there should be little performance impact.
    if (category & 0x4000) {
        if (reverse) {
            // Extend the range forward to the end of the dictionary run.
            do {
                c = fText->next32();
                UTRIE_GET16(&fData->fTrie, c, category);
            }
            while (c != CharacterIterator::DONE && (category & 0x4000));
            // Back up to the last dictionary character
            rangeEnd = fText->getIndex();
            if (c == CharacterIterator::DONE) {
                c = fText->last32();
            }
            else {
                c = fText->previous32();
            }
        }
        else {
            // Extend the range backward to the start of the dictionary run.
            do {
                c = fText->previous32();
                UTRIE_GET16(&fData->fTrie, c, category);
            }
            while (c != CharacterIterator::DONE && (category & 0x4000));
            // Back up to the last dictionary character
            if (c == CharacterIterator::DONE) {
                c = fText->first32();
            }
            else {
                c = fText->next32();
            }
            rangeStart = fText->getIndex();
        }
        UTRIE_GET16(&fData->fTrie, c, category);
    }

    // Loop through the text, looking for ranges of dictionary characters.
    // For each span, find the appropriate break engine, and ask it to find
    // any breaks within the span.
    while(U_SUCCESS(status)) {
        if (reverse) {
            while((current = fText->getIndex()) > rangeStart && (category & 0x4000) == 0) {
                c = fText->previous32();
                UTRIE_GET16(&fData->fTrie, c, category);
            }
            if (current <= rangeStart) {
                break;
            }
        }
        else {
            while((current = fText->getIndex()) < rangeEnd && (category & 0x4000) == 0) {
                c = fText->next32();
                UTRIE_GET16(&fData->fTrie, c, category);
            }
            if (current >= rangeEnd) {
                break;
            }
        }

        // We now have a dictionary character. Get the appropriate language object
        // to deal with it.
        const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);

        // Ask the language object if there are any breaks. It will leave the text
        // pointer on the other side of its range, ready to search for the next one.
        if (lbe != NULL) {
            foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, reverse, fBreakType, breaks);
        }

        // Reload the loop variables for the next go-round
        c = fText->current32();
        UTRIE_GET16(&fData->fTrie, c, category);
    }

    // If we found breaks, build a new break cache. The first and last entries must
    // be the original starting and ending position.
    if (foundBreakCount > 0) {
        int32_t totalBreaks = foundBreakCount;
        if (startPos < breaks.elementAti(0)) {
            totalBreaks += 1;
        }
        if (endPos > breaks.peeki()) {
            totalBreaks += 1;
        }
        fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
        if (fCachedBreakPositions != NULL) {
            int32_t out = 0;
            fNumCachedBreakPositions = totalBreaks;
            if (startPos < breaks.elementAti(0)) {
                fCachedBreakPositions[out++] = startPos;
            }
            for (int32_t i = 0; i < foundBreakCount; ++i) {
                fCachedBreakPositions[out++] = breaks.elementAti(i);
            }
            if (endPos > fCachedBreakPositions[out-1]) {
                fCachedBreakPositions[out] = endPos;
            }
            // If there are breaks, then by definition, we are replacing the original
            // proposed break by one of the breaks we found. Use following() and
            // preceding() to do the work. They should never recurse in this case.
            if (reverse) {
                // preceding(offset) yields the last cached break strictly
                // below offset, so pass endPos itself. Passing endPos - 1
                // would skip a break located exactly at endPos - 1. endPos is
                // above the first cache entry and not beyond the last, so the
                // cached path in preceding() is taken.
                return preceding(endPos);
            }
            else {
                return following(startPos);
            }
        }
        // If the allocation failed, just fall through to the "no breaks found" case.
    }

    // If we get here, there were no language-based breaks. As a result, the
    // text pointer should be back to where it started, but set it just to
    // make sure.
    fText->setIndex(reverse ? startPos : endPos);
    return (reverse ? startPos : endPos);
}
static UStack *gLanguageBreakFactories = NULL;
U_NAMESPACE_END
// defined in ucln_cmn.h
/**
* Release all static memory held by breakiterator.
*/
U_CDECL_BEGIN
static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
    // Library-cleanup callback: drop the global factory stack. Deleting a
    // null pointer is a no-op, so no explicit guard is required.
    delete gLanguageBreakFactories;
    gLanguageBreakFactories = NULL;
    return TRUE;
}
U_CDECL_END
U_CDECL_BEGIN
static void U_CALLCONV _deleteFactory(void *obj) {
    // Element deleter handed to the factory UStack: elements are stored as
    // void *, so restore the real type before destroying the object.
    LanguageBreakFactory *factory = static_cast<LanguageBreakFactory *>(obj);
    delete factory;
}
U_CDECL_END
U_NAMESPACE_BEGIN
/**
 * Ask the registered LanguageBreakFactory objects for an engine that can
 * handle character c for the given break type, creating the global factory
 * list on first use.
 *
 * @param c          The character that needs a language break engine.
 * @param breakType  The kind of break iteration being performed.
 * @return An engine supplied by one of the factories, or NULL if the factory
 *         list could not be created or no factory handles c.
 */
static const LanguageBreakEngine*
getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
{
    UBool       needsInit;
    UErrorCode  status = U_ZERO_ERROR;
    umtx_lock(NULL);
    needsInit = (UBool)(gLanguageBreakFactories == NULL);
    umtx_unlock(NULL);

    if (needsInit) {
        // Build the list outside the lock; it is only published below if no
        // other thread got there first.
        UStack  *factories = new UStack(_deleteFactory, NULL, status);
        // Allocation can fail without touching status, so test the pointer
        // as well before dereferencing it.
        if (factories != NULL && U_SUCCESS(status)) {
            ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
            // Never store a NULL factory: the lookup loop below
            // dereferences every element unconditionally.
            if (builtIn != NULL) {
                factories->push(builtIn, status);
            }
#ifdef U_LOCAL_SERVICE_HOOK
            LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
            if (extra != NULL) {
                factories->push(extra, status);
            }
#endif
        }
        umtx_lock(NULL);
        if (gLanguageBreakFactories == NULL) {
            gLanguageBreakFactories = factories;
            factories = NULL;
            ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
        }
        umtx_unlock(NULL);
        // Non-NULL only if another thread published its list first.
        delete factories;
    }

    if (gLanguageBreakFactories == NULL) {
        return NULL;
    }

    // Walk from the top of the stack down, so the most recently pushed
    // factory gets the first chance to supply an engine.
    int32_t i = gLanguageBreakFactories->size();
    const LanguageBreakEngine *lbe = NULL;
    while (--i >= 0) {
        LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
        lbe = factory->getEngineFor(c, breakType);
        if (lbe != NULL) {
            break;
        }
    }
    return lbe;
}
//-------------------------------------------------------------------------------
//
// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for
// the character c.
//
//-------------------------------------------------------------------------------
/**
 * Find a LanguageBreakEngine for character c: first among the engines this
 * iterator already knows, then from the global factories, and finally by
 * registering c with the catch-all "unhandled" engine.
 *
 * @param c  The dictionary character that needs an engine.
 * @return The engine to use for c, or NULL if the required bookkeeping
 *         objects could not be created.
 */
const LanguageBreakEngine *
RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
    const LanguageBreakEngine *lbe = NULL;
    UErrorCode status = U_ZERO_ERROR;

    if (fLanguageBreakEngines == NULL) {
        fLanguageBreakEngines = new UStack(status);
        // Allocation can fail without setting status, so check the pointer
        // too; otherwise size() below would dereference NULL.
        if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
            delete fLanguageBreakEngines;
            fLanguageBreakEngines = 0;
            return NULL;
        }
    }

    // Search the engines we already have, most recently added first.
    int32_t i = fLanguageBreakEngines->size();
    while (--i >= 0) {
        lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
        if (lbe->handles(c, fBreakType)) {
            return lbe;
        }
    }

    // No existing dictionary took the character. See if a factory wants to
    // give us a new LanguageBreakEngine for this character.
    lbe = getLanguageBreakEngineFromFactory(c, fBreakType);

    // If we got one, use it and push it on our stack.
    if (lbe != NULL) {
        fLanguageBreakEngines->push((void *)lbe, status);
        // Even if we can't remember it, we can keep looking it up, so
        // return it even if the push fails.
        return lbe;
    }

    // No engine is forthcoming for this character. Add it to the
    // reject set. Create the reject break engine if needed.
    if (fUnhandledBreakEngine == NULL) {
        fUnhandledBreakEngine = new UnhandledEngine(status);
        if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        // Put it last so that scripts for which we have an engine get tried
        // first. Only insert an engine that was created successfully, so the
        // stack never holds a pointer that is deleted just below.
        if (U_SUCCESS(status)) {
            fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
        }
        // If we can't insert it, or creation failed, get rid of it
        if (U_FAILURE(status)) {
            delete fUnhandledBreakEngine;
            fUnhandledBreakEngine = 0;
            return NULL;
        }
    }

    // Tell the reject engine about the character; at its discretion, it may
    // add more than just the one character.
    fUnhandledBreakEngine->handleCharacter(c, fBreakType);

    return fUnhandledBreakEngine;
}
//-------------------------------------------------------------------------------
//
// UText functions As a temporary implementation, create a type of CharacterIterator
@ -1580,6 +1983,15 @@ UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const
return result;
}
int32_t RuleBasedBreakIterator::getBreakType() const {
    // Plain accessor for the break type stored by setBreakType().
    return this->fBreakType;
}
void RuleBasedBreakIterator::setBreakType(int32_t type) {
    // Record the new break type, then call reset() so that any breaks
    // cached under the previous type are discarded.
    this->fBreakType = type;
    this->reset();
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,338 @@
/**
*******************************************************************************
* Copyright (C) 2006, International Business Machines Corporation and others. *
* All Rights Reserved. *
*******************************************************************************
*/
#ifndef TRIEDICT_H
#define TRIEDICT_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
struct UEnumeration;
struct UDataSwapper;
/**
* <p>UDataSwapFn function for use in swapping a compact dictionary.</p>
*
* @param ds Pointer to UDataSwapper containing global data about the
* transformation and function pointers for handling primitive
* types.
* @param inData Pointer to the input data to be transformed or examined.
* @param length Length of the data, counting bytes. May be -1 for preflighting.
* If length>=0, then transform the data.
* If length==-1, then only determine the length of the data.
* The length cannot be determined from the data itself for all
* types of data (e.g., not for simple arrays of integers).
* @param outData Pointer to the output data buffer.
* If length>=0 (transformation), then the output buffer must
* have a capacity of at least length.
* If length==-1, then outData will not be used and can be NULL.
* @param pErrorCode ICU UErrorCode parameter, must not be NULL and must
* fulfill U_SUCCESS on input.
* @return The actual length of the data.
*
* @see UDataSwapper
*/
U_CAPI int32_t U_EXPORT2
triedict_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
U_NAMESPACE_BEGIN
class CharacterIterator;
class UCharCharacterIterator;
class StringEnumeration;
struct CompactTrieHeader;
/*******************************************************************
* TrieWordDictionary
*/
/**
* <p>TrieWordDictionary is an abstract class that represents a word
* dictionary based on a trie. The base protocol is read-only.
* Subclasses may allow writing.</p>
*/
class U_COMMON_API TrieWordDictionary : public UMemory {
public:
/**
* <p>Default constructor.</p>
*
*/
TrieWordDictionary();
/**
* <p>Virtual destructor.</p>
*/
virtual ~TrieWordDictionary();
/**
* <p>Find dictionary words that match the text.</p>
*
* @param text A CharacterIterator representing the text (TODO: UText). The
* iterator is left after the longest prefix match in the dictionary.
* Matching presumably begins at the iterator's current position,
* since no start offset is passed -- confirm against the implementations.
* @param maxLength The maximum number of code units to match.
* @param lengths An array that is filled with the lengths of words that matched.
* @param count Filled with the number of elements output in lengths.
* @param limit The size of the lengths array; this limits the number of words output.
* @return The number of characters in text that were matched.
*/
virtual int32_t matches( CharacterIterator *text,
int32_t maxLength,
int32_t *lengths,
int &count,
int limit ) const = 0;
/**
* <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
*
* @param status A status code recording the success of the call.
* @return A StringEnumeration that will iterate through the whole dictionary.
* The caller is responsible for closing it. The order is unspecified.
*/
virtual StringEnumeration *openWords( UErrorCode &status ) const = 0;
};
/*******************************************************************
* MutableTrieDictionary
*/
/**
* <p>MutableTrieDictionary is a TrieWordDictionary that allows words to be
* added.</p>
*/
struct TernaryNode; // Forwards declaration
class U_COMMON_API MutableTrieDictionary : public TrieWordDictionary {
private:
/**
* The root node of the trie
* @internal
*/
TernaryNode *fTrie;
/**
* A UCharCharacterIterator for internal use
* @internal
*/
UCharCharacterIterator *fIter;
friend class CompactTrieDictionary; // For fast conversion
public:
/**
* <p>Constructor.</p>
*
* @param median A UChar around which to balance the trie. Ideally, it should
* begin at least one word that is near the median of the set in the dictionary
* @param status A status code recording the success of the call.
*/
MutableTrieDictionary( UChar median, UErrorCode &status );
/**
* <p>Virtual destructor.</p>
*/
virtual ~MutableTrieDictionary();
/**
* <p>Find dictionary words that match the text.</p>
*
* @param text A CharacterIterator representing the text (TODO: UText). The
* iterator is left after the longest prefix match in the dictionary.
* @param maxLength The maximum number of code units to match.
* @param lengths An array that is filled with the lengths of words that matched.
* @param count Filled with the number of elements output in lengths.
* @param limit The size of the lengths array; this limits the number of words output.
* @return The number of characters in text that were matched.
*/
virtual int32_t matches( CharacterIterator *text,
int32_t maxLength,
int32_t *lengths,
int &count,
int limit ) const;
/**
* <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
*
* @param status A status code recording the success of the call.
* @return A StringEnumeration that will iterate through the whole dictionary.
* The caller is responsible for closing it. The order is unspecified.
*/
virtual StringEnumeration *openWords( UErrorCode &status ) const;
/**
* <p>Add one word to the dictionary.</p>
*
* @param word A UChar buffer containing the word.
* @param length The length of the word.
* @param status The resultant status
*/
virtual void addWord( const UChar *word,
int32_t length,
UErrorCode &status);
/**
* <p>Add all strings from a UEnumeration to the dictionary.</p>
*
* @param words A UEnumeration that will return the desired words.
* @param status The resultant status
*/
virtual void addWords( UEnumeration *words, UErrorCode &status );
protected:
/**
* <p>Search the dictionary for matches.</p>
*
* @param text A CharacterIterator representing the text (TODO: UText). The
* iterator is left after the longest prefix match in the dictionary.
* @param maxLength The maximum number of code units to match.
* @param lengths An array that is filled with the lengths of words that matched.
* @param count Filled with the number of elements output in lengths.
* @param limit The size of the lengths array; this limits the number of words output.
* @param parent The parent of the current node
* @param pMatched The returned parent node matched the input
* @return The number of characters in text that were matched.
*/
virtual int32_t search( CharacterIterator *text,
int32_t maxLength,
int32_t *lengths,
int &count,
int limit,
TernaryNode *&parent,
UBool &pMatched ) const;
private:
/**
* <p>Private constructor. The root node is not allocated.</p>
*
* @param status A status code recording the success of the call.
*/
MutableTrieDictionary( UErrorCode &status );
};
/*******************************************************************
* CompactTrieDictionary
*/
/**
* <p>CompactTrieDictionary is a TrieWordDictionary that has been compacted
* to save space.</p>
*/
class U_COMMON_API CompactTrieDictionary : public TrieWordDictionary {
private:
/**
* Pointer to the compact trie data, a single blob that starts with a
* CompactTrieHeader
* @internal
*/
const CompactTrieHeader *fData;
/**
* A UBool indicating whether or not we own the data.
* @internal
*/
UBool fOwnData;
public:
/**
* <p>Construct a dictionary from raw saved data.</p>
*
* @param data A pointer to the raw data, which is still owned by the caller
* @param status A status code giving the result of the constructor
*/
CompactTrieDictionary( const void *data, UErrorCode &status );
/**
* <p>Construct a dictionary from a MutableTrieDictionary.</p>
*
* @param dict The dictionary to use as input.
* @param status A status code recording the success of the call.
*/
CompactTrieDictionary( const MutableTrieDictionary &dict, UErrorCode &status );
/**
* <p>Virtual destructor.</p>
*/
virtual ~CompactTrieDictionary();
/**
* <p>Find dictionary words that match the text.</p>
*
* @param text A CharacterIterator representing the text (TODO: UText). The
* iterator is left after the longest prefix match in the dictionary.
* @param rangeEnd Corresponds to maxLength in TrieWordDictionary::matches,
* which documents it as the maximum number of code units to match.
* NOTE(review): the name suggests an end offset rather than a length --
* confirm against the implementation.
* @param lengths An array that is filled with the lengths of words that matched.
* @param count Filled with the number of elements output in lengths.
* @param limit The size of the lengths array; this limits the number of words output.
* @return The number of characters in text that were matched.
*/
virtual int32_t matches( CharacterIterator *text,
int32_t rangeEnd,
int32_t *lengths,
int &count,
int limit ) const;
/**
* <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
*
* @param status A status code recording the success of the call.
* @return A StringEnumeration that will iterate through the whole dictionary.
* The caller is responsible for closing it. The order is unspecified.
*/
virtual StringEnumeration *openWords( UErrorCode &status ) const;
/**
* <p>Return the size of the compact data.</p>
*
* @return The size of the dictionary's compact data.
*/
virtual uint32_t dataSize() const;
/**
* <p>Return a void * pointer to the compact data, platform-endian.</p>
*
* @return The data for the compact dictionary, suitable for passing to the
* constructor.
*/
virtual const void *data() const;
/**
* <p>Return a MutableTrieDictionary clone of this dictionary.</p>
*
* @param status A status code recording the success of the call.
* @return A MutableTrieDictionary with the same data as this dictionary
*/
virtual MutableTrieDictionary *cloneMutable( UErrorCode &status ) const;
private:
/**
* <p>Convert a MutableTrieDictionary into a compact data blob.</p>
*
* @param dict The dictionary to convert.
* @param status A status code recording the success of the call.
* @return A single data blob starting with a CompactTrieHeader.
*/
static CompactTrieHeader *compactMutableTrieDictionary( const MutableTrieDictionary &dict,
UErrorCode &status );
};
U_NAMESPACE_END
/* TRIEDICT_H */
#endif

View file

@ -0,0 +1,13 @@
/*
**********************************************************************
* Copyright (C) 2006, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#ifndef UBRKIMPL_H
#define UBRKIMPL_H
#define U_ICUDATA_BRKITR U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "brkitr"
#endif /*UBRKIMPL_H*/

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
* *
* Copyright (C) 2001-2005, International Business Machines *
* Copyright (C) 2001-2006, International Business Machines *
* Corporation and others. All Rights Reserved. *
* *
******************************************************************************
@ -35,6 +35,7 @@ typedef enum ECleanupCommonType {
UCLN_COMMON_START = -1,
UCLN_COMMON_USPREP,
UCLN_COMMON_BREAKITERATOR,
UCLN_COMMON_BREAKITERATOR_DICT,
UCLN_COMMON_SERVICE,
UCLN_COMMON_URES,
UCLN_COMMON_LOCALE,

View file

@ -515,8 +515,8 @@ public:
const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
private:
static BreakIterator* buildInstance(const Locale& loc, const char *type, UBool dict, UErrorCode& status);
static BreakIterator* createInstance(const Locale& loc, UBreakIteratorType kind, UErrorCode& status);
static BreakIterator* buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode& status);
static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);
friend class ICUBreakIteratorFactory;

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2005 IBM Corp. All rights reserved.
* Copyright (C) 1999-2006 IBM Corp. All rights reserved.
**********************************************************************
* Date Name Description
* 12/1/99 rgillam Complete port from Java.
@ -22,253 +22,17 @@
U_NAMESPACE_BEGIN
/* forward declaration */
class DictionaryBasedBreakIteratorTables;
/**
* A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
* to further subdivide ranges of text beyond what is possible using just the
* state-table-based algorithm. This is necessary, for example, to handle
* word and line breaking in Thai, which doesn't use spaces between words. The
* state-table-based algorithm used by RuleBasedBreakIterator is used to divide
* up text as far as possible, and then contiguous ranges of letters are
* repeatedly compared against a list of known words (i.e., the dictionary)
* to divide them up into words.
*
* <p>Applications do not normally need to include this header.</p>
*
* <p>This class will probably be deprecated in a future release of ICU, and replaced
* with a more flexible and capable dictionary based break iterator. This change
* should be invisible to applications, because creation and use of instances of
* DictionaryBasedBreakIterator is through the factories and abstract
* API on class BreakIterator, which will remain stable.</p>
*
* <p>This class is not intended to be subclassed.</p>
*
*
* DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
* but adds one more special substitution name: &lt;dictionary&gt;. This substitution
* name is used to identify characters in words in the dictionary. The idea is that
* if the iterator passes over a chunk of text that includes two or more characters
* in a row that are included in &lt;dictionary&gt;, it goes back through that range and
* derives additional break positions (if possible) using the dictionary.
*
* DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
* file. It follows a prescribed search path to locate the dictionary (right now,
* it looks for it in /com/ibm/text/resources in each directory in the classpath,
* and won't find it in JAR files, but this location is likely to change). The
* dictionary file is in a serialized binary format. We have a very primitive (and
* slow) BuildDictionaryFile utility for creating dictionary files, but aren't
* currently making it public. Contact us for help.
* <p>
* <b> NOTE </b> The DictionaryBasedIterator class is still under development. The
* APIs are not in stable condition yet.
* An obsolete subclass of RuleBasedBreakIterator. Handling of dictionary-
* based break iteration has been folded into the base class. This class
* is deprecated as of ICU 3.6.
*/
class U_COMMON_API DictionaryBasedBreakIterator : public RuleBasedBreakIterator {
#ifndef U_HIDE_DEPRECATED_API
private:
typedef RuleBasedBreakIterator DictionaryBasedBreakIterator;
/**
* when a range of characters is divided up using the dictionary, the break
* positions that are discovered are stored here, preventing us from having
* to use either the dictionary or the state table again until the iterator
* leaves this range of text
*/
int32_t* cachedBreakPositions;
/**
* The number of elements in cachedBreakPositions
*/
int32_t numCachedBreakPositions;
/**
* if cachedBreakPositions is not null, this indicates which item in the
* cache the current iteration position refers to
*/
int32_t positionInCache;
DictionaryBasedBreakIteratorTables *fTables;
/**=======================================================================
* Create a dictionary based break boundary detection iterator.
* @param tablesImage The location for the dictionary to be loaded into memory
* @param dictionaryFilename The name of the dictionary file
* @param status the error code status
* @return A dictionary based break detection iterator. The UErrorCode& status
* parameter is used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(err). If you wish more detailed information, you
* can check for informational error results which still indicate success. For example,
* U_FILE_ACCESS_ERROR will be returned if the file does not exist.
* The caller owns the returned object and is responsible for deleting it.
======================================================================= */
DictionaryBasedBreakIterator(UDataMemory* tablesImage, const char* dictionaryFilename, UErrorCode& status);
public:
//=======================================================================
// boilerplate
//=======================================================================
/**
* Destructor
* @stable ICU 2.0
*/
virtual ~DictionaryBasedBreakIterator();
/**
* Default constructor. Creates an "empty" break iterator.
* Such an iterator can subsequently be assigned to.
* @return the newly created DictionaryBaseBreakIterator.
* @stable ICU 2.0
*/
DictionaryBasedBreakIterator();
/**
* Copy constructor.
* @param other The DictionaryBasedBreakIterator to be copied.
* @return the newly created DictionaryBasedBreakIterator.
* @stable ICU 2.0
*/
DictionaryBasedBreakIterator(const DictionaryBasedBreakIterator &other);
/**
* Assignment operator.
* @param that The object to be copied.
* @return the newly set DictionaryBasedBreakIterator.
* @stable ICU 2.0
*/
DictionaryBasedBreakIterator& operator=(const DictionaryBasedBreakIterator& that);
/**
* Returns a newly-constructed RuleBasedBreakIterator with the same
* behavior, and iterating over the same text, as this one.
* @return Returns a newly-constructed RuleBasedBreakIterator.
* @stable ICU 2.0
*/
virtual BreakIterator* clone(void) const;
//=======================================================================
// BreakIterator overrides
//=======================================================================
/**
* Advances the iterator backwards, to the last boundary preceding this one.
* @return The position of the last boundary position preceding this one.
* @stable ICU 2.0
*/
virtual int32_t previous(void);
/**
* Sets the iterator to refer to the first boundary position following
* the specified position.
* @param offset The position from which to begin searching for a break position.
* @return The position of the first break after the current position.
* @stable ICU 2.0
*/
virtual int32_t following(int32_t offset);
/**
* Sets the iterator to refer to the last boundary position before the
* specified position.
* @param offset The position to begin searching for a break from.
* @return The position of the last boundary before the starting position.
* @stable ICU 2.0
*/
virtual int32_t preceding(int32_t offset);
/**
* Returns the class ID for this class. This is useful only for
* comparing to a return value from getDynamicClassID(). For example:
*
* Base* polymorphic_pointer = createPolymorphicObject();
* if (polymorphic_pointer->getDynamicClassID() ==
* Derived::getStaticClassID()) ...
*
* @return The class ID for all objects of this class.
* @stable ICU 2.0
*/
static UClassID U_EXPORT2 getStaticClassID(void);
/**
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
* This method is to implement a simple version of RTTI, since not all
* C++ compilers support genuine RTTI. Polymorphic operator==() and
* clone() methods call this method.
*
* @return The class ID for this object. All objects of a
* given class have the same class ID. Objects of
* other classes have different class IDs.
* @stable ICU 2.0
*/
virtual UClassID getDynamicClassID(void) const;
protected:
//=======================================================================
// implementation
//=======================================================================
/**
* This method is the actual implementation of the next() method. All iteration
* vectors through here. This method initializes the state machine to state 1
* and advances through the text character by character until we reach the end
* of the text or the state machine transitions to state 0. We update our return
* value every time the state machine passes through a possible end state.
* @internal
*/
virtual int32_t handleNext(void);
/**
* removes the cache of break positions (usually in response to a change in
* position of some sort)
* @internal
*/
virtual void reset(void);
/**
* init Initialize a dbbi. Common routine for use by constructors.
* @internal
*/
void init();
/**
* @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
* If buffer is not large enough, new memory will be allocated.
* @param BufferSize reference to size of allocated space.
* If BufferSize == 0, a sufficient size for use in cloning will
* be returned ('pre-flighting')
* If BufferSize is not enough for a stack-based safe clone,
* new memory will be allocated.
* @param status to indicate whether the operation went on smoothly or there were errors
* An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were
* necessary.
* @return pointer to the new clone
* @internal
*/
virtual BreakIterator * createBufferClone(void *stackBuffer,
int32_t &BufferSize,
UErrorCode &status);
private:
/**
* This is the function that actually implements the dictionary-based
* algorithm. Given the endpoints of a range of text, it uses the
* dictionary to determine the positions of any boundaries in this
* range. It stores all the boundary positions it discovers in
* cachedBreakPositions so that we only have to do this work once
* for each time we enter the range.
* @param startPos The start position of a range of text
* @param endPos The end position of a range of text
* @param status The error code status
*/
void divideUpDictionaryRange(int32_t startPos, int32_t endPos, UErrorCode &status);
/*
* HSYS : Please revisit with Rich, the ctors of the DBBI class are currently
* marked as private.
*/
friend class DictionaryBasedBreakIteratorTables;
friend class BreakIterator;
};
#endif
U_NAMESPACE_END

View file

@ -37,6 +37,9 @@ struct RBBIDataHeader;
class RuleBasedBreakIteratorTables;
class BreakIterator;
class RBBIDataWrapper;
class UStack;
class LanguageBreakEngine;
class UnhandledEngine;
struct RBBIStateTable;
@ -86,13 +89,58 @@ protected:
/**
* Counter for the number of characters encountered with the "dictionary"
* flag set. Normal RBBI iterators don't use it, although the code
* for updating it is live. Dictionary Based break iterators (a subclass
* of us) access this field directly.
* flag set.
* @internal
*/
uint32_t fDictionaryCharCount;
uint32_t fDictionaryCharCount;
/**
* When a range of characters is divided up using the dictionary, the break
* positions that are discovered are stored here, preventing us from having
* to use either the dictionary or the state table again until the iterator
* leaves this range of text. Has the most impact for line breaking.
* @internal
*/
int32_t* fCachedBreakPositions;
/**
* The number of elements in fCachedBreakPositions
* @internal
*/
int32_t fNumCachedBreakPositions;
/**
* if fCachedBreakPositions is not null, this indicates which item in the
* cache the current iteration position refers to
* @internal
*/
int32_t fPositionInCache;
/**
*
* If present, UStack of LanguageBreakEngine objects that might handle
* dictionary characters. Searched from top to bottom to find an object to
* handle a given character.
* @internal
*/
UStack *fLanguageBreakEngines;
/**
*
* If present, the special LanguageBreakEngine used for handling
* characters that are in the dictionary set, but not handled by any
* LanguageBreakEngine.
* @internal
*/
UnhandledEngine *fUnhandledBreakEngine;
/**
*
* The type of the break iterator, or -1 if it has not been set.
* @internal
*/
int32_t fBreakType;
/**
* Debugging flag. Trace operation of state machine when true.
* @internal
@ -117,7 +165,7 @@ protected:
*/
RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
/** @internal */
friend class RBBIRuleBuilder;
/** @internal */
friend class BreakIterator;
@ -506,20 +554,9 @@ protected:
//=======================================================================
// implementation
//=======================================================================
/**
* This method is the actual implementation of the next() method. All iteration
* vectors through here. This method initializes the state machine to state 1
* and advances through the text character by character until we reach the end
* of the text or the state machine transitions to state 0. We update our return
* value every time the state machine passes through a possible end state.
* @internal
*/
virtual int32_t handleNext(void);
/**
* Dumps caches and performs other actions associated with a complete change
* in text or iteration position. This function is a no-op in RuleBasedBreakIterator,
* but subclasses can and do override it.
* in text or iteration position.
* @internal
*/
virtual void reset(void);
@ -534,6 +571,20 @@ protected:
*/
virtual UBool isDictionaryChar(UChar32);
/**
* Get the type of the break iterator.
* @internal
*/
virtual int32_t getBreakType() const;
/** @internal */
/**
* Set the type of the break iterator.
* @internal
*/
virtual void setBreakType(int32_t type);
/** @internal */
/**
* Common initialization function, used by constructors and bufferClone.
* (Also used by DictionaryBasedBreakIterator::createBufferClone().)
@ -565,6 +616,30 @@ private:
*/
int32_t handleNext(const RBBIStateTable *statetable);
/**
* This is the function that actually implements dictionary-based
* breaking. Covering at least the range from startPos to endPos,
* it checks for dictionary characters, and if it finds them determines
* the appropriate object to deal with them. It may cache found breaks in
* fCachedBreakPositions as it goes. It may well also look at text outside
* the range startPos to endPos.
* If going forward, endPos is the normal Unicode break result, and
* if going in reverse, startPos is the normal Unicode break result.
* @param startPos The start position of a range of text
* @param endPos The end position of a range of text
* @param reverse The call is for the reverse direction
* @internal
*/
int32_t checkDictionary(int32_t startPos, int32_t endPos, UBool reverse);
/**
* This function returns the appropriate LanguageBreakEngine for a
* given character c.
* @param c A character in the dictionary set
* @internal
*/
const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
/**
* @internal
*/

View file

@ -7685,7 +7685,7 @@ then
CXXFLAGS="$CXXFLAGS \$(THREADSCXXFLAGS)"
fi
ac_config_files="$ac_config_files icudefs.mk Makefile data/icupkg.inc config/Makefile.inc data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genuca/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/gennames/Makefile tools/gentest/Makefile tools/gennorm/Makefile tools/genprops/Makefile tools/gencase/Makefile tools/genbidi/Makefile tools/genpname/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icupkg/Makefile tools/pkgdata/Makefile tools/toolutil/Makefile tools/dumpce/Makefile test/Makefile test/testdata/Makefile test/testdata/pkgdata.inc test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/thaitest/Makefile test/testmap/Makefile test/letest/Makefile test/threadtest/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/csdet/Makefile samples/layout/Makefile common/unicode/platform.h"
ac_config_files="$ac_config_files icudefs.mk Makefile data/icupkg.inc config/Makefile.inc data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genuca/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/genctd/Makefile tools/gennames/Makefile tools/gentest/Makefile tools/gennorm/Makefile tools/genprops/Makefile tools/gencase/Makefile tools/genbidi/Makefile tools/genpname/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icupkg/Makefile tools/pkgdata/Makefile tools/toolutil/Makefile tools/dumpce/Makefile test/Makefile test/testdata/Makefile test/testdata/pkgdata.inc test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/thaitest/Makefile test/testmap/Makefile test/letest/Makefile test/threadtest/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/csdet/Makefile samples/layout/Makefile common/unicode/platform.h"
cat >confcache <<\_ACEOF
# This file is a shell script that caches the results of configure
# tests run on this system so they can be shared between configure
@ -8233,6 +8233,7 @@ do
"tools/genccode/Makefile" ) CONFIG_FILES="$CONFIG_FILES tools/genccode/Makefile" ;;
"tools/gencmn/Makefile" ) CONFIG_FILES="$CONFIG_FILES tools/gencmn/Makefile" ;;
"tools/gencnval/Makefile" ) CONFIG_FILES="$CONFIG_FILES tools/gencnval/Makefile" ;;
"tools/genctd/Makefile" ) CONFIG_FILES="$CONFIG_FILES tools/genctd/Makefile" ;;
"tools/gennames/Makefile" ) CONFIG_FILES="$CONFIG_FILES tools/gennames/Makefile" ;;
"tools/gentest/Makefile" ) CONFIG_FILES="$CONFIG_FILES tools/gentest/Makefile" ;;
"tools/gennorm/Makefile" ) CONFIG_FILES="$CONFIG_FILES tools/gennorm/Makefile" ;;

View file

@ -1010,6 +1010,7 @@ AC_OUTPUT([icudefs.mk \
tools/genccode/Makefile \
tools/gencmn/Makefile \
tools/gencnval/Makefile \
tools/genctd/Makefile \
tools/gennames/Makefile \
tools/gentest/Makefile \
tools/gennorm/Makefile \

View file

@ -73,11 +73,12 @@ TRANSLITSRCDIR=$(SRCDATADIR)/translit
TRANSLITBLDDIR=$(BUILDDIR)/translit
MISCSRCDIR=$(SRCDATADIR)/misc
BRKSRCDIR=$(SRCDATADIR)/brkitr
BRKBLDDIR=$(BUILDDIR)/brkitr
MISCSRCDIR=$(SRCDATADIR)/misc
UCMSRCDIR=$(SRCDATADIR)/mappings
COMINCDIR=$(top_srcdir)/common/unicode
SRCLISTDEPS=Makefile $(srcdir)/Makefile.in
BUILD_DIRS=$(OUTDIR) $(BUILDDIR) $(COLBLDDIR) $(RBNFBLDDIR) $(TRANSLITBLDDIR) $(TESTOUTDIR) $(TESTBUILDDIR) $(OUTTMPDIR) $(OUTTMPDIR_390STUB) $(OUTTMPDIR)/$(COLLATION_TREE) $(OUTTMPDIR)/$(RBNF_TREE) $(OUTTMPDIR)/$(TRANSLIT_TREE)
BUILD_DIRS=$(OUTDIR) $(BUILDDIR) $(BRKBLDDIR) $(COLBLDDIR) $(RBNFBLDDIR) $(TRANSLITBLDDIR) $(TESTOUTDIR) $(TESTBUILDDIR) $(OUTTMPDIR) $(OUTTMPDIR_390STUB) $(OUTTMPDIR)/$(COLLATION_TREE) $(OUTTMPDIR)/$(RBNF_TREE) $(OUTTMPDIR)/$(TRANSLIT_TREE) $(OUTTMPDIR)/$(BREAK_TREE)
# relative lib links from pkgdata are the same as for tmp
TOOLDIR=$(top_builddir)/tools
@ -209,11 +210,19 @@ DAT_FILES_SHORT=pnames.icu unames.icu cnvalias.icu ucadata.icu invuca.icu uidna.
DAT_FILES=$(DAT_FILES_SHORT:%=$(BUILDDIR)/%)
## BRK files
BREAK_TREE=brkitr
-include $(BRKSRCDIR)/brkfiles.mk
-include $(BRKSRCDIR)/brklocal.mk
ALL_BRK_SOURCE=char.txt title.txt word.txt $(BRK_SOURCE) $(BRK_SOURCE_LOCAL)
BRK_FILES_SHORT=$(ALL_BRK_SOURCE:%.txt=%.brk)
BRK_FILES=$(BRK_FILES_SHORT:%=$(BUILDDIR)/%)
BRK_FILES_SHORT=$(ALL_BRK_SOURCE:%.txt=$(BREAK_TREE)/%.brk)
BRK_FILES=$(ALL_BRK_SOURCE:%.txt=$(BRKBLDDIR)/%.brk)
## CTD files
-include $(BRKSRCDIR)/ctdfiles.mk
-include $(BRKSRCDIR)/ctdlocal.mk
ALL_CTD_SOURCE=$(CTD_SOURCE) $(CTD_SOURCE_LOCAL)
CTD_FILES_SHORT=$(ALL_CTD_SOURCE:%.txt=$(BREAK_TREE)/%.ctd)
CTD_FILES=$(ALL_CTD_SOURCE:%.txt=$(BRKBLDDIR)/%.ctd)
## UCM files
-include $(UCMSRCDIR)/ucmcore.mk
@ -228,10 +237,12 @@ CNV_FILES_SHORT = $(ALL_UCM_SOURCE:%.ucm=%.cnv)
## RES files
-include $(LOCSRCDIR)/resfiles.mk
-include $(COLSRCDIR)/colfiles.mk
-include $(BRKSRCDIR)/brsfiles.mk
-include $(RBNFSRCDIR)/rbnffiles.mk
-include $(TRANSLITSRCDIR)/trnsfiles.mk
-include $(LOCSRCDIR)/reslocal.mk
-include $(COLSRCDIR)/collocal.mk
-include $(BRKSRCDIR)/brslocal.mk
-include $(RBNFSRCDIR)/rbnflocal.mk
-include $(TRANSLITSRCDIR)/trnslocal.mk
ifdef GENRB_SOURCE
@ -244,6 +255,11 @@ COL_SRC= root.txt $(COLLATION_SOURCE) $(COLLATION_ALIAS_SOURCE) $(COLLATION_SOUR
COL_SRC_FILES = $(COL_SRC:%=$(COLSRCDIR)/%)
INSTALLED_COL_FILES = $(COLLATION_SOURCE:%.txt=%) $(COLLATION_SOURCE_LOCAL:%.txt=%)
endif
ifdef BREAKRES_SOURCE
BRS_SRC= root.txt $(BREAKRES_SOURCE) $(BREAKRES_SOURCE_LOCAL)
BRS_SRC_FILES = $(BRS_SRC:%=$(BRKSRCDIR)/%)
INSTALLED_BRS_FILES = $(BREAKRES_SOURCE:%.txt=%) $(BREAKRES_SOURCE_LOCAL:%.txt=%)
endif
ifdef RBNF_SOURCE
RBNF_SRC= root.txt $(RBNF_SOURCE) $(RBNF_ALIAS_SOURCE) $(RBNF_SOURCE_LOCAL)
RBNF_SRC_FILES = $(RBNF_SRC:%=$(RBNFSRCDIR)/%)
@ -286,6 +302,12 @@ COLLATION_INDEX_RES_SHORT=$(COLLATION_TREE)/$(INDEX_NAME).res
COLLATION_FILES = $(COL_SRC:%.txt=$(COLBLDDIR)/%.res) $(COLLATION_INDEX_RES)
COLLATION_FILES_SHORT = $(COL_SRC:%.txt=$(COLLATION_TREE)/%.res)
BREAKRES_INDEX_FILE=$(OUTTMPDIR)/$(BREAK_TREE)/$(INDEX_NAME).txt
BREAKRES_INDEX_RES=$(BRKBLDDIR)/$(INDEX_NAME).res
BREAKRES_INDEX_RES_SHORT=$(BREAK_TREE)/$(INDEX_NAME).res
BREAKRES_FILES = $(BRS_SRC:%.txt=$(BRKBLDDIR)/%.res) $(BREAKRES_INDEX_RES)
BREAKRES_FILES_SHORT = $(BRS_SRC:%.txt=$(BREAK_TREE)/%.res)
RBNF_TREE=rbnf
RBNF_INDEX_FILE=$(OUTTMPDIR)/$(RBNF_TREE)/$(INDEX_NAME).txt
RBNF_INDEX_RES=$(RBNFBLDDIR)/$(INDEX_NAME).res
@ -301,9 +323,9 @@ TRANSLIT_FILES = $(TRANSLIT_SRC:%.txt=$(TRANSLITBLDDIR)/%.res)
TRANSLIT_FILES_SHORT = $(TRANSLIT_SRC:%.txt=$(TRANSLIT_TREE)/%.res)
## All generated files
ALL_FILES = $(DAT_FILES) $(CNV_FILES) $(BRK_FILES) $(RES_FILES) $(INDEX_RES_FILE) $(COLLATION_FILES) $(RBNF_FILES) $(TRANSLIT_FILES)
ALL_FILES = $(DAT_FILES) $(CNV_FILES) $(BRK_FILES) $(CTD_FILES) $(RES_FILES) $(INDEX_RES_FILE) $(COLLATION_FILES) $(BREAKRES_FILES) $(RBNF_FILES) $(TRANSLIT_FILES)
# a list to use in the .lst files (package-relative)
ALL_FILES_LIST = $(DAT_FILES_SHORT) $(CNV_FILES_SHORT) $(BRK_FILES_SHORT) $(RES_FILES_SHORT) $(INDEX_RES_FILE_SHORT) $(COLLATION_FILES_SHORT) $(COLLATION_INDEX_RES_SHORT) $(RBNF_FILES_SHORT) $(RBNF_INDEX_RES_SHORT) $(TRANSLIT_FILES_SHORT)
ALL_FILES_LIST = $(DAT_FILES_SHORT) $(CNV_FILES_SHORT) $(BRK_FILES_SHORT) $(CTD_FILES_SHORT) $(RES_FILES_SHORT) $(INDEX_RES_FILE_SHORT) $(COLLATION_FILES_SHORT) $(COLLATION_INDEX_RES_SHORT) $(BREAKRES_FILES_SHORT) $(BREAKRES_INDEX_RES_SHORT) $(RBNF_FILES_SHORT) $(RBNF_INDEX_RES_SHORT) $(TRANSLIT_FILES_SHORT)
#####################################################
# General data build rules
@ -387,12 +409,18 @@ $(BUILDDIR)/uidna.spp: $(MISCSRCDIR)/NamePrepProfile.txt $(BINDIR)/gensprep$(EXE
#################################################### BRK
# BRK FILES
thaidict.brk: $(SRCDATADIR)/thaidict.brk
$(RMV) $@ && ln -s $(BUILDDIR) $@
#thaidict.brk: $(SRCDATADIR)/thaidict.brk
# $(RMV) $@ && ln -s $(BUILDDIR) $@
$(BUILDDIR)/%.brk: $(BRKSRCDIR)/%.txt $(BINDIR)/genbrk$(EXEEXT) $(DAT_FILES)
$(BRKBLDDIR)/%.brk: $(BRKSRCDIR)/%.txt $(BINDIR)/genbrk$(EXEEXT) $(DAT_FILES)
$(INVOKE) $(BINDIR)/genbrk -c -i $(BUILDDIR) -r $< -o $@
#################################################### CTD
# CTD FILES
$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(BINDIR)/genctd$(EXEEXT) $(DAT_FILES)
$(INVOKE) $(BINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<
#################################################### CNV
# CNV FILES
$(BUILDDIR)/%.cnv: $(UCMSRCDIR)/%.ucm $(BINDIR)/makeconv$(EXEEXT)
@ -420,6 +448,25 @@ $(OUTTMPDIR)/$(COLLATION_TREE)/$(INDEX_NAME).txt: $(SRCLISTDEPS)
echo " }" >> $@; \
echo "}" >> $@;
### brk res
$(BRKBLDDIR)/%.res: $(BRKSRCDIR)/%.txt $(BINDIR)/genrb$(EXEEXT) $(DAT_FILES)
$(INVOKE) $(BINDIR)/genrb $(GENRBOPTS) -i $(BUILDDIR) -s $(BRKSRCDIR) -d $(BRKBLDDIR) $(<F)
$(BRKBLDDIR)/$(INDEX_NAME).res: $(OUTTMPDIR)/$(BREAK_TREE)/$(INDEX_NAME).txt $(BINDIR)/genrb$(EXEEXT)
$(INVOKE) $(BINDIR)/genrb $(GENRBOPTS) -i $(BUILDDIR) -s $(OUTTMPDIR)/$(BREAK_TREE) -d $(BRKBLDDIR) $(INDEX_NAME).txt
$(OUTTMPDIR)/$(BREAK_TREE)/$(INDEX_NAME).txt: $(SRCLISTDEPS)
@echo "generating $@ (list of installed break locales)"; \
$(RMV) $@; \
echo "// Warning this file is automatically generated" > $@; \
echo "$(INDEX_NAME):table(nofallback) {" >> $@; \
echo " InstalledLocales {" >> $@; \
for file in $(INSTALLED_BRS_FILES); do \
echo " $$file {\"\"}" >> $@; \
done; \
echo " }" >> $@; \
echo "}" >> $@;
### RBNF res
$(RBNFBLDDIR)/%.res: $(RBNFSRCDIR)/%.txt $(BINDIR)/genrb$(EXEEXT) $(DAT_FILES)
$(INVOKE) $(BINDIR)/genrb $(GENRBOPTS) -i $(BUILDDIR) -s $(RBNFSRCDIR) -d $(RBNFBLDDIR) $(<F)

View file

@ -1,4 +1,4 @@
# * Copyright (C) 1997-2004, International Business Machines
# * Copyright (C) 1997-2006, International Business Machines
# * Corporation and others. All Rights Reserved.
# A list of txt's to build
# Note:
@ -28,4 +28,4 @@
# char.txt, title.txt and word.txt are not included so that more tests pass by default,
# and so that the makefile rules are simpler.
BRK_SOURCE = \
line.txt sent.txt line_th.txt word_th.txt word_ja.txt word_POSIX.txt
line.txt sent.txt word_ja.txt word_POSIX.txt

View file

@ -0,0 +1,27 @@
# * Copyright (C) 2006, International Business Machines
# * Corporation and others. All Rights Reserved.
# A list of txt's to build
# Note:
#
# If you are thinking of modifying this file, READ THIS.
#
# Instead of changing this file [unless you want to check it back in],
# you should consider creating a 'brslocal.mk' file in this same directory.
# Then, you can have your local changes remain even if you upgrade or
# reconfigure ICU.
#
# Example 'brslocal.mk' files:
#
# * To add an additional locale to the list:
# _____________________________________________________
# | BREAKRES_SOURCE_LOCAL = myLocale.txt ...
#
# * To REPLACE the default list and only build with a few
# locales:
# _____________________________________________________
# | BREAKRES_SOURCE = ar.txt ar_AE.txt en.txt de.txt zh.txt
#
#
# Ordinary resources
BREAKRES_SOURCE = ja.txt en.txt en_US.txt en_US_POSIX.txt

View file

@ -0,0 +1,27 @@
# * Copyright (C) 2006, International Business Machines
# * Corporation and others. All Rights Reserved.
# A list of txt's to build
# Note:
#
# If you are thinking of modifying this file, READ THIS.
#
# Instead of changing this file [unless you want to check it back in],
# you should consider creating a 'ctdlocal.mk' file in this same directory.
# Then, you can have your local changes remain even if you upgrade or
# reconfigure ICU.
#
# Example 'ctdlocal.mk' files:
#
# * To add an additional dictionary to the list:
# _____________________________________________________
# | CTD_SOURCE_LOCAL = myDict.txt ...
#
# * To REPLACE the default list and only build with a different
# dictionary:
# _____________________________________________________
# | CTD_SOURCE = myDict.txt
#
#
CTD_SOURCE = \
thaidict.txt

View file

@ -0,0 +1,14 @@
// ***************************************************************************
// *
// * Copyright (C) 2006 International Business Machines
// * Corporation and others. All Rights Reserved.
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
// * Source File:<path>/common/main/en.xml
// *
// ***************************************************************************
/**
* ICU <specials> source: <path>/xml/main/en.xml
*/
en{
Version{"1.36"}
}

View file

@ -0,0 +1,14 @@
// ***************************************************************************
// *
// * Copyright (C) 2006 International Business Machines
// * Corporation and others. All Rights Reserved.
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
// * Source File:<path>/common/main/en_US.xml
// *
// ***************************************************************************
/**
* ICU <specials> source: <path>/xml/main/en_US.xml
*/
en_US{
Version{"1.36"}
}

View file

@ -0,0 +1,17 @@
// ***************************************************************************
// *
// * Copyright (C) 2006 International Business Machines
// * Corporation and others. All Rights Reserved.
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
// * Source File:<path>/common/main/en_US_POSIX.xml
// *
// ***************************************************************************
/**
* ICU <specials> source: <path>/xml/main/en_US_POSIX.xml
*/
en_US_POSIX{
Version{"1.36"}
boundaries{
word{"word_POSIX"}
}
}

View file

@ -0,0 +1,17 @@
// ***************************************************************************
// *
// * Copyright (C) 2006 International Business Machines
// * Corporation and others. All Rights Reserved.
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
// * Source File:<path>/common/main/ja.xml
// *
// ***************************************************************************
/**
* ICU <specials> source: <path>/xml/main/ja.xml
*/
ja{
Version{"1.68"}
boundaries{
word{"word_ja"}
}
}

View file

@ -1,4 +1,4 @@
# Copyright (c) 2002-2005 International Business Machines Corporation and
# Copyright (c) 2002-2006 International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line.txt
@ -89,6 +89,12 @@ $WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.
$dictionary = [:LineBreak = Complex_Context:];
#
# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
@ -551,6 +557,9 @@ $SP+ $CM* $B2;
($CM* ($IS | $SY))+ $CM* $NU;
$CL $CM* ($NU | $IS | $SY);
# For dictionary-based break
$dictionary $dictionary;
## -------------------------------------------------
!!safe_forward;
@ -564,5 +573,6 @@ $CL $CM* ($NU | $IS | $SY);
# turn off rule chaining. We don't want to move more
# than necessary.
#
[$CM $OP $QU $CL $B2 $PR $HY $SP]+ [^$CM $OP $QU $CL $B2 $PR $HY];
[$CM $OP $QU $CL $B2 $PR $HY $SP $dictionary]+ [^$CM $OP $QU $CL $B2 $PR $HY $dictionary];
$dictionary $dictionary;

View file

@ -1,173 +0,0 @@
# Copyright (c) 2002-2006, International Business Machines Corporation and
# others. All Rights Reserved.
#
# file: line.txt
#
# Line Breaking Rules for ICU rules based break iteration.
# Implement default line breaking as defined by Unicode TR 14.
#
# TODO: Rework the rules not pertaining to Thai to be based on the
# default line break rules. Not done yet because of interactions
# between exact reverse rules and the Dictionary code.
#
# These rules, in their current form, do not conform to TR-14 for
# non-Thai breaks.
#
$LF = [\p{LineBreak = LF}];
$IN = [\p{LineBreak = IN}];
$SY = [\p{LineBreak = SY}];
$EX = [\p{LineBreak = EX}];
$BA = [\p{LineBreak = BA}];
$IS = [\p{LineBreak = IS}];
$BB = [\p{LineBreak = BB}];
$SA = [\p{LineBreak = SA}];
$CB = [\p{LineBreak = CB}];
$XX = [\p{LineBreak = XX}];
$HY = [\p{LineBreak = HY}];
$AI = [\p{LineBreak = AI}];
$ZW = [\p{LineBreak = ZW}];
$SG = [\p{LineBreak = SG}];
$AL = [\p{LineBreak = AL}];
$OP = [\p{LineBreak = OP}];
$BK = [\p{LineBreak = BK}];
$PO = [\p{LineBreak = PO}];
$NS = [\p{LineBreak = NS}];
$CL = [\p{LineBreak = CL}];
$NU = [\p{LineBreak = NU}];
$CM = [\p{LineBreak = CM}];
$PR = [\p{LineBreak = PR}];
$B2 = [\p{LineBreak = B2}];
$ID = [\p{LineBreak = ID}];
$SP = [\p{LineBreak = SP}];
$QU = [\p{LineBreak = QU}];
$CR = [\p{LineBreak = CR}];
$GL = [\p{LineBreak = GL}];
$JL = [\p{LineBreak = JL}];
$JV = [\p{LineBreak = JV}];
$JT = [\p{LineBreak = JT}];
$H2 = [\p{LineBreak = H2}];
$H3 = [\p{LineBreak = H3}];
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
#
# Thai Dictionary related definitions and rules
#
$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e]; # this rule breaks the iterator with mixed Thai and English
$paiyannoi = [\u0e2f];
$maiyamok = [\u0e46];
$thai_etc = $paiyannoi \u0e25 $paiyannoi;
#
# Rule LB1. By default, treat AI (characters with ambiguous east Asian width) and
# SA (South East Asian: Thai, Lao, Khmer) as $AL (Alphabetic)
#
$ALPlus = $AL | $AI | [$SA - $dictionary];
#
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
# TODO: This is going to produce some odd results, because of the non-combining
# chars that are included in $CM. Use $Extend instead, where possible.
#
$ALcm = $ALPlus $CM*;
$IDcm = $ID $CM*;
$NUcm = $NU $Extend*;
$HYcm = $HY $Extend*;
$SPcm = $SP $Extend*;
$QUcm = $QU $Extend*;
$POcm = $PO $Extend*;
$OPcm = $OP $Extend*;
$BAcm = $BA $Extend*;
$BBcm = $BB $Extend*;
$NScm = $NS $Extend*;
$GLcm = $GL $Extend*;
$B2cm = $B2 $Extend*;
$INcm = $IN $Extend*;
# New Lines. Always break after, never break before.
# Rule LB 3
#
# Endings. NewLine or Zero Width Space, or both. Rules 4, 5
# Because we never break before these things, $Endings
# appears at the end of line break rule.
#
$NLF = $BK | $CR | $LF | $CR $LF;
$Endings = $SPcm* $ZW* $NLF?;
$EndingsMandatory = $SPcm* $NLF | $SPcm* $ZW $NLF?;
#
# Openings Sequences that can precede Words, and that should not be separated from them.
# Rules LB 9, 10
#
$Openings = (($QUcm $SPcm*)? $OPcm $SPcm*)*;
#
# Closings Seqences that follow words, and that should not be separated from them,
# Rule LB 8, 11, 15
$Closings = ($SPcm*( ($CL ($SPcm* $NScm)? | $EX | $IS | $SY) $Extend*) | $BAcm | $HYcm | $NScm | $maiyamok)*;
#
# Words. Includes mixed Alpha-numerics.
# Rules 11a, 16, 17, 19, more or less.
#
$NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+;
$Number = $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?; # Fancy Number 18
$Word = (($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?)); # Alpha-numeric. 16, 17
$Dashes = (($B2cm $SPcm*)*); # Dashes 11a
$ThaiRange = $dictionary+ | $thai_etc;
$WordLikeThing = $Number | $Word | $Dashes | $ThaiRange;
$Word15 = ($BBcm* ($WordLikeThing)? ($BAcm | $HYcm | $NScm)*) | # Rule 15. Stuff sticks around words.
[^[:Cc:] $BK $CR $LF $ZW $SP $GL] $Extend* | # Allow characters that don't meet the
[^$BK $CR $LF $ZW $SP $GL ]; # more elaborate definitions for WORD to be glued.
$GluedWord = ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together.
# Rules 13, 14
#
# The actual rules, a combination of everything defined above.
#
$Openings $GluedWord $Closings $paiyannoi? $EndingsMandatory;
$Openings $GluedWord $Closings $Endings;
$Openings $GluedWord $Closings $paiyannoi /
([^\u0e25 $Extend] | \u0e25[^$paiyannoi $Extend]);
#"$word($nbsp+$word)*$paiyannoi/([^[\u0e25$_ignore_]]|"
# + "\u0e25[^$paiyannoi$_ignore_]);"
#
# LB 18b. Do not break a Korean syllable
#
$JL+ $JV* $JT* $Extend*;
$JV+ $JT* $Extend*;
$JT+ $Extend*;
$H2 $JV* $JT* $Extend*;
$H3 $JT* $Extend*;
#
# Reverse Rules.
#
# Back up to a hard break or a space that will cause a boundary.
# Not all spaces cause line breaks. $SpaceGlue represents a sequence
# containing a space that may inhibit a break from occuring.
#
$SpaceGlue = ([$ZW $CL $IS $NS $OP] ($Extend* $SP)) | (($Extend* $SP)+ $OP);
$ClumpingChars = [^$SP $BK $CR $LF];
!. . $ClumpingChars* ($SpaceGlue $ClumpingChars*)* (. | $LF $CR)?;

View file

@ -0,0 +1,24 @@
// ***************************************************************************
// *
// * Copyright (C) 2006 International Business Machines
// * Corporation and others. All Rights Reserved.
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
// * Source File:<path>/common/main/root.xml
// *
// ***************************************************************************
/**
* ICU <specials> source: <path>/xml/main/root.xml
*/
root{
Version{"1.00"}
boundaries{
grapheme{"char"}
line{"line"}
sentence{"sent"}
title{"title"}
word{"word"}
}
dictionaries{
Thai{"thaidict"}
}
}

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,5 @@
#
# Copyright (C) 2002-2005, International Business Machines Corporation
# Copyright (C) 2002-2006, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word.txt
@ -38,12 +38,21 @@ $LF = \u000a;
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
$Control = [\p{Grapheme_Cluster_Break = Control}];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.
$dictionary = [:LineBreak = Complex_Context:];
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];
#
# Rules 3 Grapheme Clusters behave like their first char.
# Rule 4 Ignore trailing Format characters (Also see note in TR 29)
#
$KatakanaEx = $Katakana $Extend* $Format*;
$ALetterEx = $ALetter $Extend* $Format*;
$ALetterEx = $ALetterPlus $Extend* $Format*;
$MidLetterEx = $MidLetter $Extend* $Format*;
$MidNumEx = $MidNum $Extend* $Format*;
$NumericEx = $Numeric $Extend* $Format*;
@ -125,7 +134,7 @@ $ExtendNumLetEx $KatakanaEx {300}; # (13b)
!!reverse;
$BackALetterEx = $Format* $Extend* $ALetter;
$BackALetterEx = $Format* $Extend* $ALetterPlus;
$BackNumericEx = $Format* $Extend* $Numeric;
$BackMidNumEx = $Format* $Extend* $MidNum;
$BackMidLetterEx = $Format* $Extend* $MidLetter;
@ -190,6 +199,9 @@ $MidLetter $BackALetterEx;
# rule 11
$MidNum $BackNumericEx;
# For dictionary-based break
$dictionary $dictionary;
## -------------------------------------------------
!!safe_forward;
@ -218,3 +230,5 @@ $MidLetterEx $ALetterEx;
# rule 11
$MidNumEx $NumericEx;
# For dictionary-based break
$dictionary $dictionary;

View file

@ -1,5 +1,5 @@
#
# Copyright (C) 2002-2005, International Business Machines Corporation
# Copyright (C) 2002-2006, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word.txt
@ -39,12 +39,21 @@ $LF = \u000a;
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
$Control = [\p{Grapheme_Cluster_Break = Control}];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.
$dictionary = [:LineBreak = Complex_Context:];
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];
#
# Rules 3 Grapheme Clusters behave like their first char.
# Rule 4 Ignore trailing Format characters (Also see note in TR 29)
#
$KatakanaEx = $Katakana $Extend* $Format*;
$ALetterEx = $ALetter $Extend* $Format*;
$ALetterEx = $ALetterPlus $Extend* $Format*;
$MidLetterEx = $MidLetter $Extend* $Format*;
$MidNumEx = $MidNum $Extend* $Format*;
$NumericEx = $Numeric $Extend* $Format*;
@ -126,7 +135,7 @@ $ExtendNumLetEx $KatakanaEx {300}; # (13b)
!!reverse;
$BackALetterEx = $Format* $Extend* $ALetter;
$BackALetterEx = $Format* $Extend* $ALetterPlus;
$BackNumericEx = $Format* $Extend* $Numeric;
$BackMidNumEx = $Format* $Extend* $MidNum;
$BackMidLetterEx = $Format* $Extend* $MidLetter;
@ -191,6 +200,9 @@ $MidLetter $BackALetterEx;
# rule 11
$MidNum $BackNumericEx;
# For dictionary-based break
$dictionary $dictionary;
## -------------------------------------------------
!!safe_forward;
@ -219,3 +231,5 @@ $MidLetterEx $ALetterEx;
# rule 11
$MidNumEx $NumericEx;
# For dictionary-based break
$dictionary $dictionary;

View file

@ -1,5 +1,5 @@
#
# Copyright (C) 2002-2005, International Business Machines Corporation
# Copyright (C) 2002-2006, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word_ja.txt
@ -38,12 +38,21 @@ $LF = \u000a;
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
$Control = [\p{Grapheme_Cluster_Break = Control}];
# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.
$dictionary = [:LineBreak = Complex_Context:];
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];
#
# Rules 3 Grapheme Clusters behave like their first char.
# Rule 4 Ignore trailing Format characters (Also see note in TR 29)
#
$KatakanaEx = $Katakana $Extend* $Format*;
$ALetterEx = $ALetter $Extend* $Format*;
$ALetterEx = $ALetterPlus $Extend* $Format*;
$MidLetterEx = $MidLetter $Extend* $Format*;
$MidNumEx = $MidNum $Extend* $Format*;
$NumericEx = $Numeric $Extend* $Format*;
@ -127,7 +136,7 @@ $ExtendNumLetEx $KatakanaEx {300}; # (13b)
!!reverse;
$BackALetterEx = $Format* $Extend* $ALetter;
$BackALetterEx = $Format* $Extend* $ALetterPlus;
$BackNumericEx = $Format* $Extend* $Numeric;
$BackMidNumEx = $Format* $Extend* $MidNum;
$BackMidLetterEx = $Format* $Extend* $MidLetter;
@ -196,6 +205,9 @@ $MidLetter $BackALetterEx;
# rule 11
$MidNum $BackNumericEx;
# For dictionary-based break
$dictionary $dictionary;
## -------------------------------------------------
!!safe_forward;
@ -228,3 +240,5 @@ $MidLetterEx $ALetterEx;
# rule 11
$MidNumEx $NumericEx;
# For dictionary-based break
$dictionary $dictionary;

View file

@ -1,72 +0,0 @@
# Copyright (c) 2002-2005, International Business Machines Corporation and
# others. All Rights Reserved.
#
# word.txt Word Breaking Rules for ICU Rules Based Break Iterator.
#
# TODO: Shift this over to being based on the current default (non-Thai)
# word rules, including exact reverse rules. Postponed
# because of interactions with dictionary implementation.
$Katakana = [\p{Word_Break = Katakana}];
$ALetter = [\p{Word_Break = ALetter}];
$MidLetter = [\p{Word_Break = MidLetter}];
$Numeric = [\p{Line_Break = Numeric}];
$MidNum = [\p{Word_Break = MidNum}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$Hiragana = [\p{Hiragana}];
$Control = [^\p{Grapheme_Cluster_Break = Control}];
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
$ALetterEx = $ALetter $Extend*;
$NumericEx = $Numeric $Extend*;
$MidLetterEx = $MidLetter $Extend*;
$MidNumEx = $MidNum $Extend*;
$ExtendNumLetEx = $ExtendNumLet $Extend*;
#
# Thai Dictionary Related Rules. Identify runs that will be subdivided into words
# using the dictionary.
#
$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e]; # this rule breaks the iterator with mixed Thai and English
$paiyannoi = [\u0e2f];
$maiyamok = [\u0e46];
$thai_etc = $paiyannoi \u0e25 $paiyannoi;
$dictionary+ ($paiyannoi? $maiyamok)?;
$dictionary+ $paiyannoi / ([^\u0e25 $maiyamok $Extend] | \u0e25[^$paiyannoi $Extend]);
$thai_etc;
#
# The Big Rule. Gloms Non-Thai words together.
#
$NumericClump = $NumericEx ($MidNumEx? $NumericEx)*;
$AlphaClump = $ALetterEx ($MidLetterEx? $ALetterEx)*;
($AlphaClump | $NumericClump | $ExtendNumLetEx)+;
#
# Lesser rules
#
($Hiragana $Extend*)*;
($Katakana $Extend*)*;
[^$Control] $Extend*;
\r\n;
.;
#
# Reverse Rules. Back up over any of the chars that can group together.
# (Reverse rules do not need to be exact; they can back up a bit too far,
# but must back up at least enough.)
#
! ( $ALetter | $MidLetter | $Numeric | $ExtendNumLet | $MidNum | $Extend )*;
! ($Hiragana | $Extend)*;
! ($Katakana | $Extend)*;
! $Extend* .;
! \n\r;
! ($dictionary | $paiyannoi | $maiyamok | \u0e25)*;

View file

@ -1,6 +1,6 @@
// ***************************************************************************
// *
// * Copyright (C) 2005 International Business Machines
// * Copyright (C) 2005-2006 International Business Machines
// * Corporation and others. All Rights Reserved.
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
// * Source File:<path>/common/main/en_US_POSIX.xml
@ -31,7 +31,4 @@ en_US_POSIX{
"0.000000E+000",
}
Version{"1.36"}
boundaries{
word{"word_POSIX"}
}
}

View file

@ -1,6 +1,6 @@
// ***************************************************************************
// *
// * Copyright (C) 2005 International Business Machines
// * Copyright (C) 2005-2006 International Business Machines
// * Corporation and others. All Rights Reserved.
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
// * Source File:<path>/common/main/ja.xml
@ -1678,9 +1678,6 @@ ja{
REVISED{"改訂版"}
}
Version{"1.68"}
boundaries{
word{"word_ja"}
}
calendar{
gregorian{
AmPmMarkers{

View file

@ -1,6 +1,6 @@
// ***************************************************************************
// *
// * Copyright (C) 2005 International Business Machines
// * Copyright (C) 2005-2006 International Business Machines
// * Corporation and others. All Rights Reserved.
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
// * Source File:<path>/common/main/root.xml
@ -66,13 +66,6 @@ root{
210,
}
Version{"1.63"}
boundaries{
grapheme{"char"}
line{"line"}
sentence{"sent"}
title{"title"}
word{"word"}
}
calendar{
buddhist{
DateTimePatterns{

View file

@ -1,6 +1,6 @@
// ***************************************************************************
// *
// * Copyright (C) 2005 International Business Machines
// * Copyright (C) 2005-2006 International Business Machines
// * Corporation and others. All Rights Reserved.
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
// * Source File:<path>/common/main/th.xml
@ -10,7 +10,6 @@
* ICU <specials> source: <path>/xml/main/th.xml
*/
th{
BreakDictionaryData:import{"../brkitr/thaidict.brk"}
Countries{
001{"โลก"}
002{"แอฟริกา"}
@ -558,10 +557,6 @@ th{
Thai{"ไทย"}
}
Version{"1.56"}
boundaries{
line{"line_th"}
word{"word_th"}
}
calendar{
buddhist{
DateTimePatterns{

View file

@ -2155,6 +2155,8 @@ static void TestResourceLevelAliasing(void) {
log_err("Referencing alias didn't get the right string\n");
}
#if 0
/* TODO: Needs to be replaced as this data is no longer present! */
/* check whether the binary collation data is properly referenced by an alias */
uk = ures_findResource("th/BreakDictionaryData", uk, &status);
binSequence = ures_getBinary(uk, &binSeqLen, &status);
@ -2167,6 +2169,7 @@ static void TestResourceLevelAliasing(void) {
} else if(binSeqLen != binLen || memcmp(binSequence, binary, binSeqLen) != 0) {
log_err("Referencing alias didn't get the right data\n");
}
#endif
/* simple alias */
testtypes = ures_open(testdatapath, "testtypes", &status);

View file

@ -26,6 +26,7 @@
#include "filestrm.h"
#include "udatamem.h"
#include "cintltst.h"
#include "ubrkimpl.h"
#include <sys/types.h>
#include <sys/stat.h>
@ -1100,6 +1101,12 @@ static void TestICUDataName()
/* test data swapping ------------------------------------------------------- */
/* Unfortunately, trie dictionaries are in a C++ header */
int32_t
triedict_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
/* test cases for maximum data swapping code coverage */
static const struct {
const char *name, *type;
@ -1156,6 +1163,7 @@ static const struct {
#if !UCONFIG_NO_BREAK_ITERATION
{"char", "brk", ubrk_swap},
{"thaidict", "ctd", triedict_swap},
#endif
/* the last item should not be #if'ed so that it can reliably omit the last comma */
@ -1459,6 +1467,11 @@ TestSwapData() {
pkg=loadTestData(&errorCode);
nm=swapCases[i].name+1;
uprv_strcpy(name, "testdata");
} else if (uprv_strcmp(swapCases[i].type, "brk")==0
|| uprv_strcmp(swapCases[i].type, "ctd")==0) {
pkg=U_ICUDATA_BRKITR;
nm=swapCases[i].name;
uprv_strcpy(name, U_ICUDATA_BRKITR);
} else {
pkg=NULL;
nm=swapCases[i].name;

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1999-2005, International Business Machines Corporation and
* Copyright (c) 1999-2006, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/************************************************************************
@ -20,6 +20,7 @@
#include "rbbiapts.h"
#include "rbbidata.h"
#include "cstring.h"
#include "ubrkimpl.h"
#include "unicode/ustring.h"
#include "unicode/utext.h"
@ -147,8 +148,8 @@ void RBBIAPITest::TestCloneEquals()
void RBBIAPITest::TestBoilerPlate()
{
UErrorCode status = U_ZERO_ERROR;
BreakIterator* a = BreakIterator::createLineInstance(Locale("hi"), status);
BreakIterator* b = BreakIterator::createLineInstance(Locale("hi_IN"),status);
BreakIterator* a = BreakIterator::createWordInstance(Locale("hi"), status);
BreakIterator* b = BreakIterator::createWordInstance(Locale("hi_IN"),status);
if (U_FAILURE(status)) {
errln("Creation of break iterator failed %s", u_errorName(status));
return;
@ -156,7 +157,7 @@ void RBBIAPITest::TestBoilerPlate()
if(*a!=*b){
errln("Failed: boilerplate method operator!= does not return correct results");
}
BreakIterator* c = BreakIterator::createLineInstance(Locale("th"),status);
BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status);
if(a && c){
if(*c==*a){
errln("Failed: boilerplate method opertator== does not return correct results");
@ -864,17 +865,17 @@ void RBBIAPITest::TestBug2190() {
void RBBIAPITest::TestRegistration() {
#if !UCONFIG_NO_SERVICE
UErrorCode status = U_ZERO_ERROR;
BreakIterator* thai_word = BreakIterator::createWordInstance("th_TH", status);
BreakIterator* ja_word = BreakIterator::createWordInstance("ja_JP", status);
// ok to not delete these if we exit because of error?
BreakIterator* thai_char = BreakIterator::createCharacterInstance("th_TH", status);
BreakIterator* ja_char = BreakIterator::createCharacterInstance("ja_JP", status);
BreakIterator* root_word = BreakIterator::createWordInstance("", status);
BreakIterator* root_char = BreakIterator::createCharacterInstance("", status);
URegistryKey key = BreakIterator::registerInstance(thai_word, "xx", UBRK_WORD, status);
URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD, status);
{
if (thai_word && *thai_word == *root_word) {
errln("thai not different from root");
if (ja_word && *ja_word == *root_word) {
errln("japan not different from root");
}
}
@ -882,7 +883,7 @@ void RBBIAPITest::TestRegistration() {
BreakIterator* result = BreakIterator::createWordInstance("xx_XX", status);
UBool fail = TRUE;
if(result){
fail = *result != *thai_word;
fail = *result != *ja_word;
}
delete result;
if (fail) {
@ -891,14 +892,14 @@ void RBBIAPITest::TestRegistration() {
}
{
BreakIterator* result = BreakIterator::createCharacterInstance("th_TH", status);
BreakIterator* result = BreakIterator::createCharacterInstance("ja_JP", status);
UBool fail = TRUE;
if(result){
fail = *result != *thai_char;
fail = *result != *ja_char;
}
delete result;
if (fail) {
errln("bad result for th_TH/char");
errln("bad result for ja_JP/char");
}
}
@ -983,8 +984,8 @@ void RBBIAPITest::TestRegistration() {
}
// that_word was adopted by factory
delete thai_char;
// ja_word was adopted by factory
delete ja_char;
delete root_word;
delete root_char;
#endif
@ -995,7 +996,7 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) {
UParseError parseError;
parseError.line = 0;
parseError.offset = 0;
UDataMemory *data = udata_open(NULL, "brk", dataFile, &status);
UDataMemory *data = udata_open(U_ICUDATA_BRKITR, "brk", dataFile, &status);
uint32_t length;
const UChar *builtSource;
const uint8_t *rbbiRules;

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1999-2005, International Business Machines Corporation and
* Copyright (c) 1999-2006, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/************************************************************************
@ -493,9 +493,11 @@ void RBBITest::TestMaiyamok()
ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07\\u0e40\\u0e17\\u0e1e", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35\\u0e22\\u0e07", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
@ -2055,7 +2057,10 @@ RBBIWordMonkey::RBBIWordMonkey() : fGCFMatcher(0)
fSets = new UVector(status);
fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]", status);
fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
"[\\p{Line_Break = Complex_Context}"
"-\\p{Grapheme_Cluster_Break = Extend}"
"-\\p{Grapheme_Cluster_Break = Control}]]", status);
fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]", status);
fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]", status);
fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]", status);
@ -2063,6 +2068,7 @@ RBBIWordMonkey::RBBIWordMonkey() : fGCFMatcher(0)
fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]", status);
fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]", status);
fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]", status);
fOtherSet = new UnicodeSet();
if(U_FAILURE(status)) {
deferredStatus = status;

View file

@ -261,7 +261,7 @@ void UObjectTest::testIDs()
#if !UCONFIG_NO_BREAK_ITERATION
/* TESTCLASSID_ABSTRACT(BreakIterator); No staticID! */
TESTCLASSID_FACTORY(RuleBasedBreakIterator, BreakIterator::createLineInstance("mt",status));
TESTCLASSID_FACTORY(DictionaryBasedBreakIterator, BreakIterator::createLineInstance("th",status));
//TESTCLASSID_FACTORY(DictionaryBasedBreakIterator, BreakIterator::createLineInstance("th",status));
#endif
//TESTCLASSID_DEFAULT(EscapeTransliterator);

View file

@ -1,4 +1,4 @@
# Copyright (c) 2001-2005 International Business Machines
# Copyright (c) 2001-2006 International Business Machines
# Corporation and others. All Rights Reserved.
#
# RBBI Test Data
@ -512,14 +512,14 @@ What is the proper use of the abbreviation pp.•? •Yes, I am definatelly 12"
# Test data originally from the test code source file
# // @suwit -- Thai sample data from GVT Guideline
#
<data>•\u0E2B\u0E19\u0E36\u0E48\u0E07•\u0E04\u0E33•\u0E44\u0E17\u0E22•\
\u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16•\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A•\
\u0E14\u0E49\u0E27\u0E22•\u0e2b\u0e25\u0e32\u0e22•\
\u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c</data>
<data>•\u0E2B\u0E19\u0E36\u0E48\u0E07<200>\u0E04\u0E33<200>\u0E44\u0E17\u0E22<200>\
\u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<200>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<200>\
\u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\
\u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200></data>
#
# Jitterbug 3671 Test Case
#
<data>•สวัสดี•ครับ•สบาย•ดี•ไหม• •ครับ•</data>
<data>•สวัสดี<200>ครับ<200>สบาย<200>ดี<200>ไหม<200> •ครับ<200></data>

View file

@ -1,5 +1,5 @@
บท ที่พายุ ไซโคลน
โดโรธี อาศัย อยู่ ท่าม กลาง ทุ่ง ใหญ่ ใน แคนซัส กับ ลุง เฮนรี ชาว ไร่ และ ป้า เอ็ม ภรรยา ชาวไร่
บท ที่พายุ ไซโคลน
โด โรธี อาศัย อยู่ ท่ามกลาง ทุ่งใหญ่ ใน แคนซัส กับ ลุง เฮ นรี ชาวไร่ และ ป้า เอ็ม ภรรยา ชาวไร่
บ้าน ของ พวก เขา หลัง เล็ก เพราะ ไม้ สร้าง บ้าน ต้อง ขน มา ด้วย เกวียน เป็น
@ -19,52 +19,52 @@
แล้ว ก็ มี เตียง นอน
ลุง เฮนรี กับ ป้า เอ็ม มี เตียง นอน ใหญ่ อยู่ ที่ มุม หนึ่ง
ลุง เฮ นรี กับ ป้า เอ็ม มี เตียง นอน ใหญ่ อยู่ ที่ มุม หนึ่ง
ส่วน โดโรธี มี เตียง เล็ก อีก ที่ มุม หนึ่ง
ส่วน โด โร ธีมี เตียง เล็ก อีก ที่ มุม หนึ่ง
ไม่ มี ห้อง ใต้ เพดาน เลย ห้อง ใต้ถุน ก็ ไม่ มี
ไม่มี ห้อง ใต้ เพดาน เลย ห้อง ใต้ถุน ก็ ไม่มี
เว้น แต่ มี โพรง เล็กๆ
เว้น แต่ มี โพ รง เล็กๆ
ที่ ขุด ไป ใต้
พื้น เรียก ว่า
" โพรง ไซโคลน "
"โพรง ไซโคลน"
เป็น ที่ ครอบครัว นี้ จะ มุด เข้า ไป เมื่อ เกิด ลม มหาภัย
เป็น ที่ ครอบครัว นี้ จะ มุด เข้าไป เมื่อ เกิด ลม มหา ภัย
ซึ่ง กระโชก แรง จน บด ขยี้ สิ่ง ก่อ สร้าง ใด
ซึ่ง กระโชก แรง จน บดขยี้ สิ่ง ก่อ สร้าง ใดๆ
ที่ ขวาง ทาง มัน ได้ ตรง กลาง พื้น มี ฝา เปิด เข้า ไป
ที่ ขวาง ทาง มัน ได้ ตรง กลาง พื้น มี ฝา เปิด เข้าไป
จาก นั้น มี บันได ลง ไป ถึง โพรง มืด เล็ก
จาก นั้น มี บันได ลง ไป ถึง โพรง มืด เล็กๆ
เมื่อ โดโรธี ยืน ที่ ปาก ประตู และ มอง ไป รอบ
เมื่อ โด โรธี ยืน ที่ ปาก ประตู และ มอง ไป รอบๆ
เธอ ไม่ เห็น อะไร นอก จาก ท้อง ทุ่ง กว้าง สี เทา หม่น ทั่ว ทุก ด้าน
เธอ ไม่ เห็น อะไร นอกจาก ท้อง ทุ่ง กว้าง สี เทา หม่น ทั่ว ทุก ด้าน
ไม่ มี แม้ ต้นไม้ สัก ต้น หรือ บ้าน สัก หลัง ที่ โผล่ พ้น ภูมิ ประเทศ อัน ราบ เรียบ
ไม่มี แม้ ต้นไม้ สัก ต้น หรือ บ้าน สัก หลัง ที่ โผล่ พ้น ภูมิประเทศ อัน ราบ เรียบ
แผ่ ไป ไกล จน จด ขอบ ฟ้า ทั่ว ทุก ทิศ
ดวง ตะวัน เผา ผืน ดิน ที่ ไถ แล้ว จน กลาย เป็น แผ่น มหึมา สี ดำ
ดวงตะวัน เผา ผืน ดิน ที่ ไถ แล้ว จน กลาย เป็น แผ่น มหึมา สี ดำ
มี รอย แตกระแหง อยู่ ตลอด
มี รอย แตก ระแหง อยู่ ตลอด
แม้แต่ หญ้า ก็ ไม่ เขียว
เพราะ ดวง ตะวัน เผา ยอด ใบ ยาว เสีย จน เป็น สี เทา หม่น มอง เห็น อยู่ ทั่ว ไป
เพราะ ดวงตะวัน เผา ยอด ใบ ยาว เสีย จน เป็น สี เทา หม่น มอง เห็น อยู่ ทั่วไป
ครั้ง หนึ่ง เคย ทา สี บ้านเอาไว้
ครั้ง หนึ่ง เคย ทาสี บ้าน เอา ไว้
แต่ ก็ ถูก ดวง ตะวัน เผา เสีย จน สี พอง
แต่ ก็ ถูก ดวงตะวัน เผา เสีย จน สี พอง
แล้ว ฝน ก็ ชะมัน หลุด ไป จน หมด
แล้ว ฝน ก็ ชะ มัน หลุด ไป จน หมด
และ ตอน นี้ บ้าน จึง ดู หม่นหมอง เป็น สี เทา เหมือน สิ่ง อื่น
และ ตอน นี้ บ้าน จึง ดู หม่นหมอง เป็น สี เทา เหมือน สิ่ง อื่นๆ
ด้วย
@ -75,13 +75,13 @@
เป็น ภรรยา ที่ งดงาม
แล้ว แดด และ ลม ก็ ได้ เปลี่ยน เธอ ไป
แล้ว แดด และ ลม ก็ได้ เปลี่ยน เธอ ไป
เอา ประกาย ไป จาก ดวงตา เธอ ปล่อย ไว้ แต่ ความ สุขุม อย่าง หม่นหมอง
เอา สี แดง จาก แก้ม และ ริมฝีปาก เธอ ไป
เอา สี แดง จาก แก้ม และ ริม ฝีปาก เธอ ไป
กลาย เป็น สี หม่น
กลาย เป็น สี หม่นๆ
เหมือน กัน
@ -89,62 +89,62 @@
และ เดี๋ยว นี้ ไม่ เคย ยิ้ม เลย
เมื่อ โดโรธี ซึ่ง เป็น เด็ก กำพร้า มา อยู่ กับ เธอ ตอน แรก
เมื่อ โด โรธี ซึ่ง เป็น เด็ก กำพร้า มา อยู่ กับ เธอ ตอน แรก
ป้า เอ็ม ตื่น เต้น กับ เสียง หัวเราะ ของ เด็ก น้อย มาก
เธอ จะส่ง เสียง ร้อง แล้ว เอา มือ ทาบ อก ทุก ครั้ง ที่ เสียง อัน ร่าเริง ของ โดโรธี เข้า หู เธอ
เธอ จะ ส่ง เสียง ร้อง แล้ว เอา มือ ทาบ อก ทุก ครั้ง ที่ เสียง อัน ร่าเริง ของ โด โรธี เข้าหู เธอ
และ เธอ เฝ้า มอง เด็ก หญิง น้อย
และ เธอ เฝ้า มอง เด็ก หญิง น้อยๆ
ด้วย ความ ประหลาด ใจ
ด้วย ยัง หา อะไร มา เป็น เรื่อง หัวเราะ ได้
ลุง เฮนรี ไม่ เคย หัวเราะ
ลุง เฮ นรี ไม่ เคย หัวเราะ
ลุง ทำงาน หนัก จาก เช้า ยัน ค่ำ
และ ไม่ เคย รู้จัก ว่า ความ ร่าเริง คือ อะไร
ลุง ดู หม่นหมอง ไป หมด ตั้ง แต่ เครา ยาว จน จด รองเท้า บูต อัน หยาบ
ลุง ดู หม่นหมอง ไป หมด ตั้งแต่ เครา ยาว จน จด รองเท้า บูต อัน หยาบ
แล้ว ลุง ก็ ดู เคร่งขรึม น่า เกรงขาม ไม่ ค่อย จะ พูด
แล้ว ลุง ก็ ดู เคร่งขรึม น่า เกรง ขาม ไม่ ค่อย จะ พูด
มี โตโต้ ที่ ทำ ให้ โดโรธี หัวเราะ ได้
มี โต โต้ ที่ ทำให้ โด โรธี หัวเราะ ได้
และ ช่วย เธอ ให้ พ้น จาก การ กลาย เป็น สี เทา หม่นเหมือน กับ สิ่ง รอบ ตัว อื่น
และ ช่วย เธอ ให้ พ้น จาก การก ลาย เป็น สี เทา หม่น เหมือน กับ สิ่ง รอบ ตัว อื่นๆ
โตโต้ สี ไม่ เทา หม่น
โต โต้ สี ไม่ เทา หม่น
แต่ มัน เป็น หมา สี ดำ ตัว น้อย
แต่ มัน เป็น หมา สี ดำ ตัว น้อยๆ
ขน ยาว ปุย ราว กับ ไหม
ขน ยาว ปุย ราวกับ ไหม
มี ตา ดำ เล็ก เป็น ประกาย รื่นเริง อยู่ สอง ข้าง จมูก เล็ก อัน น่า ขัน ของ มัน
โตโต้ เล่น ทั้ง วัน
โต โต้ เล่น ทั้ง วัน
และ โดโรธี ก็ เล่น กับ มัน
และ โด โรธี ก็ เล่น กับ มัน
และ รัก มัน เหลือ เกิน
อย่างไร ก็ ตาม
อย่างไร ก็ตาม
วัน นี้ ทั้ง คู่ ไม่ ได้ เล่น
ลุง เฮนรี นั่ง อยู่ ที่ บันได ประตู และ เฝ้า กังวล จ้อง ดู ท้อง ฟ้า สี เทา หม่น ผิด ปกติ
ลุง เฮ นรี นั่ง อยู่ ที่ บันได ประตู และ เฝ้า กังวล จ้อง ดู ท้องฟ้า สี เทา หม่น ผิด ปกติ
โดโรธี ยืน ที่ ประตู
โด โรธี ยืน ที่ ประตู
กอด โตโต้ ไว้ ใน อ้อม แขน
กอด โต โต้ ไว้ ใน อ้อม แขน
และ ก็ มอง ดู ท้อง ฟ้า อยู่ เหมือน กัน
และ ก็ มอง ดู ท้องฟ้า อยู่ เหมือน กัน
ป้า เอ็ม กำลัง ล้าง ชาม อยู่
ป้า เอ็ มกำ ลัง ล้าง ชาม อยู่
@ -152,25 +152,25 @@
มี เสียง ลม คราง แผ่ว เบา ได้ยิน มา
ลุง เฮนรี และ โดโรธี เห็น ต้น หญ้า สูง เอน เป็น คลื่น ก่อน ที่ พายุ จะ มา ถึง
ลุง เฮ นรี และ โด โรธี เห็น ต้น หญ้า สูง เอน เป็น คลื่น ก่อน ที่ พายุ จะ มา ถึง
แล้ว ก็ มี เสียง หวีดหวิว ชัดเจน มา จาก บรรยากาศ ทาง ใต้
แล้ว ก็ มี เสียง หวีด หวิว ชัดเจน มา จาก บรรยากาศ ทาง ใต้
และ เมื่อ เหลือบ ตา ไป ทาง ด้าน นั้น ก็ เห็น คลื่น หญ้า มา ทาง ด้าน นั้น ด้วย
ลุง เฮนรี ผุด ลุก ขึ้น ทัน ใด
ลุง เฮ นรี ผุด ลุก ขึ้น ทันใด
" ลม ไซโคลน มา
"ลม ไซโคลน มา
เอ็ม "
เอ็ม"
ลุง ร้อง บอก ภรรยา
" ข้า จะ ไป ดู สัตว์ เลี้ยง หน่อย "
"ข้า จะ ไป ดู สัตว์ เลี้ยง หน่อย"
แล้ว ลุง ก็ วิ่ง ไป ยัง เพิง ที่ วัว และ ม้า อาศัย อยู่
@ -184,31 +184,31 @@
" เร็ว โดโรธี ! "
"เร็ว โด โรธี!"
ป้า ตะโกน
" วิ่ง ไป ห้อง ใต้ถุน "
"วิ่ง ไป ห้อง ใต้ถุน"
โตโต้ ผลุน กระโดด ลง จาก อ้อมแขน โดโรธี
โต โต้ ผลุน กระโดด ลง จาก อ้อม แขน โด โรธี
แล้ว เข้า ไป ซ่อน อยู่ ใต้ เตียง
แล้ว เข้าไป ซ่อน อยู่ ใต้ เตียง
เด็ก หญิง น้อย เข้า ไป ดึง มัน ออก มา
เด็ก หญิง น้อย เข้าไป ดึง มัน ออก มา
ป้า เอ็ม กระชาก ฝา ที่ พื้น ออก อย่าง อก สั่น ขวัญ หาย
ป้า เอ็ มก ระ ชาก ฝา ที่ พื้น ออก อย่าง อก สั่น ขวัญ หาย
ปีน บันได ไม้ ลง ไป ใน โพรง เล็ก อัน มืด ทึบ
โดโรธี จับ โตโต้ ได้ ใน ที่ สุด
โด โรธี จับ โต โต้ ได้ ใน ที่สุด
และ วิ่ง ตาม ป้า เธอ ไป
เมื่อ เธอ มา ได้ ครึ่ง ห้อง ก็ มี เสียง หวีดหวือ
เมื่อ เธอ มา ได้ ครึ่ง ห้อง ก็ มี เสียง หวีด หวือ
ส่วน บ้าน ก็ สั่น อย่าง แรง จน เธอ หก คะมำ นั่ง จ้ำเบ้า อยู่ กับ พื้น
@ -218,39 +218,39 @@
บ้าน หมุน ไป หมุน มา สอง สาม รอบ
บ้าน หมุน ไป หมุน มาส อง สาม รอบ
แล้ว ก็ ลอย ขึ้น สู่ อากาศ อย่าง ช้า
แล้ว ก็ ลอย ขึ้น สู่ อากาศ อย่าง ช้าๆ
โดโรธี รู้สึก ราว กับ ว่า เธอ ได้ ขึ้น ไป กับ ลูก บอลลูน
โด โร ธีรู้ สึก ราวกับ ว่า เธอ ได้ ขึ้น ไป กับ ลูก บอลลูน
พายุ เหนือ กับ พายุ ใต้ มา พบ กัน ตรง ที่ บ้าน พอ ดี
พายุ เหนือ กับ พายุ ใต้ มา พบ กัน ตรง ที่ บ้าน พอดี
และ ทำ ให้ ตรง นั้น เป็น จุด ศูนย์ กลาง ของ พายุ ไซโคลน
และ ทำให้ ตรง นั้น เป็น จุดศูนย์กลาง ของ พายุ ไซโคลน
ตาม ปกติ ตรง กลาง พายุ ไซโคลน อากาศ จะ นิ่ง
แต่ ความ กดดัน อย่าง หนัก ของ ลม ทุก ด้าน รอบ บ้าน
ทำ ให้ บ้าน ลอย สูง ขึ้น
ทำให้ บ้าน ลอย สูง ขึ้นๆ
จน กระทั่ง ขึ้น ไป อยู่ สุด ยอด ของ พายุ ไซโคลน
และ จาก ตรง นั้น ก็ ถูก หอบ ไป หลาย ไมล์
ง่าย ดาย ราว กับ หอบ ขน นก
ง่ายดาย ราวกับ หอบ ขน นก
มืด มาก แล้ว
ลม ยัง ส่ง เสียง หวีดหวือ น่า กลัว อยู่ รอบ ตัว เธอ
ลม ยัง ส่ง เสียง หวีด หวือ น่า กลัว อยู่ รอบ ตัว เธอ
แต่ โดโรธี เห็น ว่า เธอ สามารถ นั่ง ไป ได้ อย่า งง่าย ดาย นัก
แต่ โด โรธี เห็น ว่า เธอ สามารถ นั่ง ไป ได้ อย่าง ง่ายดาย นัก
ครั้ง หนึ่ง หลัง จาก ที่ บ้าน สะดุด อย่าง แรง และ หมุน ไป รอบ
ครั้ง หนึ่ง หลัง จาก ที่ บ้าน สะดุด อย่าง แรง และ หมุน ไป รอบๆ
สอง สาม ครั้ง ใน ตอน แรก
@ -258,17 +258,17 @@
โตโต้ ไม่ ชอบ ใจ เลย
โต โต้ ไม่ ชอบใจ เลย
มัน วิ่ง ไป วิ่ง มา รอบ ห้อง
มัน วิ่ง ไป วิ่ง มาร อบ ห้อง
ทาง โน้น ที ทาง นี้ ที ส่ง เสียง เห่า ดัง ก้อง
แต่ โดโรธี นั่ง นิ่ง อยู่ บน พื้น เฝ้า คอย ดู ว่า จะ เกิด อะไร ขึ้น
แต่ โด โรธี นั่ง นิ่ง อยู่ บน พื้น เฝ้า คอย ดู ว่า จะ เกิด อะไร ขึ้น
ครั้ง หนึ่ง โตโต้ เข้า ไป ใกล้ ฝา ที่ พื้น มาก ไป
ครั้ง หนึ่ง โต โต้ เข้าไป ใกล้ ฝา ที่ พื้น มาก ไป
เลย พลัด ตกลง ไป
@ -276,9 +276,9 @@
แต่ ชั่ว ครู่ เธอ ก็ เห็น หู ของ มัน โผล่ ขึ้น มา จาก ช่อง นั้น
ทั้ง นี้ เพราะ แรง กด อย่าง หนัก ของ อากาศ ทำ ให้ โตโต้ ไม่ ตกลง ไป ข้าง ล่าง
ทั้งนี้ เพราะ แรง กด อย่าง หนัก ของ อากาศ ทำให้ โต โต้ ไม่ ตกลง ไป ข้าง ล่าง
โดโรธี คลาน ไป ที่ ช่อง นั้น จับ หู โตโต้ ไว้ ได้
โด โรธี คลาน ไป ที่ ช่อง นั้น จับ หู โต โต้ ไว้ ได้
และ ลาก มัน มา ที่ ห้อง อีก
@ -288,33 +288,33 @@
ชั่วโมง แล้ว ชั่วโมง เล่า ผ่าน ไป
โดโรธี ค่อย
โด โรธี ค่อยๆ
หาย กลัว
แต่ เธอ รู้สึก เหงา เหลือ เกิน
และ ลม ก็ ส่ง เสียง หวีดหวือ ดัง เสีย จน เธอ แทบ จะ หู หนวก
และ ลม ก็ ส่ง เสียง หวีด หวือ ดัง เสีย จน เธอ แทบ จะ หู หนวก
ที แรก เธอ สงสัย ว่า คง จะ ถูก ฉีก กระชาก ออก เป็น ชิ้น เล็ก ชิ้น น้อย เมื่อ บ้าน เอน ล้ม ลง อีก ครั้ง
ที แรก เธอ สงสัย ว่า คงจะ ถูก ฉีก กระชาก ออก เป็น ชิ้น เล็ก ชิ้น น้อย เมื่อ บ้าน เอน ล้ม ลง อีก ครั้ง
แต่ หลาย ชั่วโมง ผ่าน ไป ก็ ไม่ มี อะไร เกิด ขึ้น เธอ เลย เลิก วิตก และ ตัดสิน ใจ คอย ดู อย่าง สงบ
แต่ หลาย ชั่วโมง ผ่าน ไป ก็ ไม่มี อะไร เกิด ขึ้น เธอ เลย เลิก วิตก และ ตัดสิน ใจ คอย ดู อย่าง สงบ
และ รอ ว่า อนาคต จะ เป็น อย่างไร
ใน ที่ สุด เธอ คลาน จาก พื้น ห้อง ที่ โยก ไป มา ขึ้น ไป บน เตียง
ใน ที่สุด เธอ คลาน จาก พื้น ห้อง ที่ โยก ไป มา ขึ้น ไป บน เตียง
แล้ว ก็ นอน ลง
โตโต้ ตาม ติด มา นอน ลง ใกล้
โต โต้ ตาม ติด มา นอน ลง ใกล้ๆ
เธอ
ไม่ ช้า โดโรธี ก็ ปิด ตา ลง หลับ ผล็อย ไป อย่าง สนิท ทั้ง
ไม่ ช้า โด โรธี ก็ ปิด ตา ลง หลับ ผล็อย ไป อย่าง สนิท ทั้งๆ
ที่ บ้าน โยก ไป มา และ ลม ก็ คราง หวีดหวือ
ที่ บ้าน โยก ไป มา และ ลม ก็ คราง หวีด หวือ

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
* Copyright (C) 1998-2003, International Business Machines Corporation and *
* others. All Rights Reserved. *
* Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
* and others. All Rights Reserved. *
******************************************************************************
*/
@ -15,6 +15,8 @@
#include "unicode/brkiter.h"
#include "unicode/locid.h"
#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/ustring.h"
/*
* This program takes a Unicode text file containing Thai text with
@ -68,6 +70,9 @@ private:
// current space count
int32_t fSpaceCount;
// UnicodeSet of SA characters
UnicodeSet fComplexContext;
// true when fBreakIter has returned DONE
UBool fDone;
@ -386,6 +391,47 @@ const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count,
return noSpaces;
}
/*
* Generate a text file with spaces in it from a file without.
*/
int generateFile(const UChar *chars, int32_t length) {
Locale root("");
UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);
UErrorCode status = U_ZERO_ERROR;
UnicodeString saSet("[:LineBreak=SA:]", -1, US_INV);
UnicodeSet complexContext(saSet, status);
BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);
breakIter->adoptText(noSpaceIter);
char outbuf[1024];
int32_t strlength;
UChar bom = 0xFEFF;
printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));
int32_t prevbreak = 0;
while (U_SUCCESS(status)) {
int32_t nextbreak = breakIter->next();
if (nextbreak == BreakIterator::DONE) {
break;
}
printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],
nextbreak-prevbreak, &status));
if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])
&& complexContext.contains(chars[nextbreak])) {
printf(" ");
}
prevbreak = nextbreak;
}
if (U_FAILURE(status)) {
fprintf(stderr, "generate failed: %s\n", u_errorName(status));
return status;
}
else {
return 0;
}
}
/*
* The main routine. Read the command line arguments, read the text file,
* remove the spaces, do the comparison and report the final results
@ -395,6 +441,12 @@ int main(int argc, char **argv)
char *fileName = "space.txt";
int arg = 1;
UBool verbose = FALSE;
UBool generate = FALSE;
if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {
generate = TRUE;
arg += 1;
}
if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
verbose = TRUE;
@ -418,6 +470,10 @@ int main(int argc, char **argv)
if (spaces == 0) {
return 1;
}
if (generate) {
return generateFile(spaces, spaceCount);
}
noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
@ -441,11 +497,13 @@ int main(int argc, char **argv)
SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
: fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)
{
UnicodeString saSet("[:LineBreak=SA:]", -1, US_INV);
UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
UErrorCode status = U_ZERO_ERROR;
Locale us("us");
fComplexContext.applyPattern(saSet, status);
Locale root("");
fBreakIter = BreakIterator::createWordInstance(us, status);
fBreakIter = BreakIterator::createWordInstance(root, status);
fBreakIter->adoptText(iter);
}
@ -471,12 +529,17 @@ int32_t SpaceBreakIterator::next()
return BreakIterator::DONE;
}
int32_t nextBreak = fBreakIter->next();
if (nextBreak == BreakIterator::DONE) {
fDone = TRUE;
return BreakIterator::DONE;
int32_t nextBreak;
do {
nextBreak = fBreakIter->next();
if (nextBreak == BreakIterator::DONE) {
fDone = TRUE;
return BreakIterator::DONE;
}
}
while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])
&& fComplexContext.contains(fText[nextBreak]));
int32_t result = nextBreak - fSpaceCount;

View file

@ -15,7 +15,7 @@ subdir = tools
SUBDIRS = toolutil ctestfw makeconv genrb genuca genbrk \
gennames genpname gencnval gensprep genccode gencmn icupkg pkgdata \
gentest genprops gencase genbidi gennorm
gentest genprops gencase genbidi gennorm genctd
## List of phony targets
.PHONY : all all-local all-recursive install install-local \

View file

@ -0,0 +1,95 @@
## Makefile.in for ICU - tools/genctd
## Copyright (c) 2002-2006 International Business Machines Corporation and
## others. All Rights Reserved.
## Source directory information
srcdir = @srcdir@
top_srcdir = @top_srcdir@
top_builddir = ../..
include $(top_builddir)/icudefs.mk
## Build directory information
subdir = tools/genctd
TARGET_STUB_NAME = genctd
SECTION = 1
MAN_FILES = $(TARGET_STUB_NAME).$(SECTION)
## Extra files to remove for 'make clean'
CLEANFILES = *~ $(DEPS) $(MAN_FILES)
## Target information
TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
## When building outside the source tree, also search the build tree's
## common directory for headers
ifneq ($(top_builddir),$(top_srcdir))
CPPFLAGS += -I$(top_builddir)/common
endif
CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil
LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
OBJECTS = genctd.o
DEPS = $(OBJECTS:.o=.d)
## List of phony targets
.PHONY : all all-local install install-local clean clean-local \
distclean distclean-local dist dist-local check check-local install-man
## Clear suffix list
.SUFFIXES :
## List of standard targets
all: all-local
install: install-local
clean: clean-local
distclean : distclean-local
dist: dist-local
check: all check-local
## The default build produces the tool binary and its man page
all-local: $(TARGET) $(MAN_FILES)
install-local: all-local install-man
$(MKINSTALLDIRS) $(DESTDIR)$(bindir)
$(INSTALL) $(TARGET) $(DESTDIR)$(bindir)
## Install the man page(s) into the man$(SECTION) directory
install-man: $(MAN_FILES)
$(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION)
$(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION)
dist-local:
clean-local:
test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
$(RMV) $(TARGET) $(OBJECTS)
distclean-local: clean-local
$(RMV) Makefile
check-local: all-local
## Regenerate this Makefile through config.status when Makefile.in or
## config.status changes
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
cd $(top_builddir) \
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
## Link the tool from its object files
$(TARGET) : $(OBJECTS)
$(LINK.cc) $(OUTOPT)$@ $^ $(LIBS)
## Generate the man page from its .in template through config.status
%.$(SECTION): $(srcdir)/%.$(SECTION).in
cd $(top_builddir) \
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
## Include the dependency files when no goal is given, or when at least one
## requested goal is not a *clean target
ifeq (,$(MAKECMDGOALS))
-include $(DEPS)
else
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
-include $(DEPS)
endif
endif

View file

@ -0,0 +1,111 @@
.\" Hey, Emacs! This is -*-nroff-*- you know...
.\"
.\" genctd.1: manual page for the genctd utility
.\"
.\" Copyright (C) 2006 IBM, Inc. and others.
.\"
.TH GENCTD 1 "8 March 2006" "ICU MANPAGE" "ICU @VERSION@ Manual"
.SH NAME
.B genctd
\- Compiles word list into ICU compact trie dictionary
.SH SYNOPSIS
.B genctd
[
.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
]
[
.BR "\-V\fP, \fB\-\-version"
]
[
.BR "\-c\fP, \fB\-\-copyright"
]
[
.BR "\-v\fP, \fB\-\-verbose"
]
[
.BI "\-d\fP, \fB\-\-destdir" " destination"
]
[
.BI "\-i\fP, \fB\-\-icudatadir" " directory"
]
.BI "\-o\fP, \fB\-\-out" " output\-file"
.IR " dictionary\-file"
.SH DESCRIPTION
.B genctd
reads the word list from
.I dictionary-file
and creates a compact trie dictionary file. Normally this data file has the
.B .ctd
extension.
.PP
Words begin at the beginning of a line and are terminated by the first whitespace.
Lines that begin with whitespace are ignored.
.SH OPTIONS
.TP
.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
Print help about usage and exit.
.TP
.BR "\-V\fP, \fB\-\-version"
Print the version of
.B genctd
and exit.
.TP
.BR "\-c\fP, \fB\-\-copyright"
Embeds the standard ICU copyright into the
.IR output-file .
.TP
.BR "\-v\fP, \fB\-\-verbose"
Display extra informative messages during execution.
.TP
.BI "\-d\fP, \fB\-\-destdir" " destination"
Set the destination directory of the
.IR output-file
to
.IR destination .
.TP
.BI "\-i\fP, \fB\-\-icudatadir" " directory"
Look for any necessary ICU data files in
.IR directory .
For example, the file
.B pnames.icu
must be located when ICU's data is not built as a shared library.
The default ICU data directory is specified by the environment variable
.BR ICU_DATA .
Most configurations of ICU do not require this argument.
.TP
.BI " dictionary\-file"
The source file to read.
.TP
.BI "\-o\fP, \fB\-\-out" " output\-file"
The output data file to write.
.SH CAVEATS
When the
.IR dictionary-file
contains a byte order mark (BOM) at the beginning of the file, which is the Unicode character
.B U+FEFF,
then the
.IR dictionary-file
is interpreted as Unicode. Without the BOM,
the file is interpreted in the current operating system default codepage.
To eliminate any ambiguity about the encoding of the
.IR dictionary-file ,
it is recommended that you write this file in UTF-8
with the BOM.
.SH ENVIRONMENT
.TP 10
.B ICU_DATA
Specifies the directory containing ICU data. Defaults to
.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ .
Some tools in ICU depend on the presence of the trailing slash. It is thus
important to make sure that it is present if
.B ICU_DATA
is set.
.SH AUTHORS
Deborah Goldsmith
.SH VERSION
1.0
.SH COPYRIGHT
Copyright (C) 2006 IBM, Inc. and others.
.SH SEE ALSO
.BR http://icu.sourceforge.net/userguide/boundaryAnalysis.html

View file

@ -0,0 +1,393 @@
/*
**********************************************************************
* Copyright (C) 2002-2006, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
* File genctd.c
*/
//--------------------------------------------------------------------
//
// Tool for generating CompactTrieDictionary data files (.ctd files).
//
// Usage: genctd [options] -o output-file.ctd input-file
//
// options: -v verbose
// -? or -h help
//
// The input file is a plain text file containing words, one per line.
// Words end at the first whitespace; lines beginning with whitespace
// are ignored.
// The file can be encoded as UTF-8, or UTF-16 (either endian), or
// in the default code page (platform dependent). UTF-encoded
// files must include a BOM.
//
//--------------------------------------------------------------------
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/uclean.h"
#include "unicode/udata.h"
#include "unicode/putil.h"
#include "uoptions.h"
#include "unewdata.h"
#include "ucmndata.h"
#include "rbbidata.h"
#include "triedict.h"
#include "cmemory.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Name this program was invoked with (argv[0]); set at the start of main()
// and printed by usageAndDie().
static char *progName;
// Command-line option table handed to u_parseArgs().
// The table is positional: main() reads entries as options[N], so the
// numbers in the trailing comments must stay in step with the entry order.
// NOTE(review): the usage text advertises "-V or --version", but this table
// has no corresponding entry - confirm whether one was intended.
static UOption options[]={
UOPTION_HELP_H, /* 0 */
UOPTION_HELP_QUESTION_MARK, /* 1 */
UOPTION_VERBOSE, /* 2 */
{ "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 3 */
UOPTION_ICUDATADIR, /* 4 */
UOPTION_DESTDIR, /* 5 */
UOPTION_COPYRIGHT, /* 6 */
};
// Print the command-line synopsis and the option summary on stdout, then
// terminate the process using retCode as the exit status.
// This function does not return.
void usageAndDie(int retCode) {
    const char *toolName       = progName;
    const char *defaultDataDir = u_getDataDirectory();

    // Synopsis line.
    fprintf(stdout,
            "Usage: %s [-v] [-options] -o output-file dictionary-file\n",
            toolName);

    // Option summary; the single %s receives the current ICU data directory.
    fprintf(stdout,
            "\tRead in word list and write out compact trie dictionary\n"
            "options:\n"
            "\t-h or -? or --help this usage text\n"
            "\t-V or --version show a version message\n"
            "\t-c or --copyright include a copyright notice\n"
            "\t-v or --verbose turn on verbose output\n"
            "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
            "\t followed by path, defaults to %s\n"
            "\t-d or --destdir destination directory, followed by the path\n",
            defaultDataDir);

    exit(retCode);
}
// Exactly one of the two data headers below is compiled, depending on whether
// break iteration has been configured out of this ICU build.
#if UCONFIG_NO_BREAK_ITERATION
/* dummy UDataInfo cf. udata.h */
/* Used by main() when it writes the placeholder output file: the data */
/* format and both version fields are all zero.                        */
static UDataInfo dummyDataInfo = {
sizeof(UDataInfo),
0,
U_IS_BIG_ENDIAN,
U_CHARSET_FAMILY,
U_SIZEOF_UCHAR,
0,
{ 0, 0, 0, 0 }, /* dummy dataFormat */
{ 0, 0, 0, 0 }, /* dummy formatVersion */
{ 0, 0, 0, 0 } /* dummy dataVersion */
};
#else
//
// Set up the ICU data header, defined in ucmndata.h
//
// main() passes dh.info to udata_create() when it writes the real
// compact trie dictionary file.
//
DataHeader dh ={
{sizeof(DataHeader), // Struct MappedData
0xda, // presumably the two MappedData magic bytes - confirm in ucmndata.h
0x27},
{ // struct UDataInfo
sizeof(UDataInfo), // size
0, // reserved
U_IS_BIG_ENDIAN, // properties of the platform this tool is built for
U_CHARSET_FAMILY,
U_SIZEOF_UCHAR,
0, // reserved
{ 0x54, 0x72, 0x44, 0x63 }, // "TrDc" Trie Dictionary
{ 1, 0, 0, 0 }, // 1.0.0.0
{ 0, 0, 0, 0 }, // Irrelevant for this data type
}};
#endif
//----------------------------------------------------------------------------
//
//  main for genctd
//
//  Parses the command line, reads the word list named by the first
//  non-option argument, converts it to UTF-16, inserts every word into a
//  MutableTrieDictionary, compacts that into a CompactTrieDictionary and
//  writes the compact form as an ICU data file.
//
//  When UCONFIG_NO_BREAK_ITERATION is set, only a placeholder data file
//  containing an explanatory message is written.
//
//  Returns 0 on success.  On failure the process exits with a non-zero
//  status (an ICU UErrorCode value, 1 or -1 depending on the failure) after
//  printing a message on stderr.
//
//----------------------------------------------------------------------------
int main(int argc, char **argv) {
    UErrorCode  status = U_ZERO_ERROR;
    const char *wordFileName;
    const char *outFileName;
    const char *outDir = NULL;
    const char *copyright = NULL;

    //
    // Pick up and check the command line arguments,
    // using the standard ICU tool utils option handling.
    //
    U_MAIN_INIT_ARGS(argc, argv);
    progName = argv[0];
    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
    if(argc<0) {
        // Unrecognized option
        fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }

    if(options[0].doesOccur || options[1].doesOccur) {
        // -? or -h for help.
        usageAndDie(0);
    }

    // The output file comes from -o (options[3]); the input file is the
    // first remaining non-option argument.
    if (!options[3].doesOccur || argc < 2) {
        fprintf(stderr, "input and output file must both be specified.\n");
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }
    outFileName  = options[3].value;
    wordFileName = argv[1];

    if (options[4].doesOccur) {
        u_setDataDirectory(options[4].value);
    }

    /* Initialize ICU */
    u_init(&status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
            argv[0], u_errorName(status));
        exit(1);
    }
    status = U_ZERO_ERROR;

    /* Combine the directory with the file name */
    if(options[5].doesOccur) {
        outDir = options[5].value;
    }
    if (options[6].doesOccur) {
        copyright = U_COPYRIGHT_STRING;
    }

#if UCONFIG_NO_BREAK_ITERATION

    UNewDataMemory *pData;
    char msg[1024];

    /* write message with just the name */
    /* The %.900s precision bounds the file name so that the formatted */
    /* message can never overflow msg, however long the -o argument is. */
    sprintf(msg, "genctd writes dummy %.900s because of UCONFIG_NO_BREAK_ITERATION, see uconfig.h", outFileName);
    fprintf(stderr, "%s\n", msg);

    /* write the dummy data file */
    pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
    udata_writeBlock(pData, msg, strlen(msg));
    udata_finish(pData, &status);
    return (int)status;

#else

    //
    // Read in the dictionary source file
    //
    long  result;
    long  wordFileSize;
    FILE *file;
    char *wordBufferC;

    file = fopen(wordFileName, "rb");
    if( file == 0 ) {
        fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
        exit(-1);
    }

    // Determine the file size by seeking to the end.  Every step is checked:
    // ftell() returns -1 on failure, and an unchecked -1 would yield an
    // undersized buffer and an enormous fread() length.
    if (fseek(file, 0, SEEK_END) != 0 ||
        (wordFileSize = ftell(file)) < 0 ||
        fseek(file, 0, SEEK_SET) != 0) {
        fprintf(stderr, "Could not determine the size of file \"%s\"\n", wordFileName);
        exit(-1);
    }

    wordBufferC = new char[wordFileSize+10];

    result = (long)fread(wordBufferC, 1, wordFileSize, file);
    if (result != wordFileSize)  {
        fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
        exit (-1);
    }
    wordBufferC[wordFileSize]=0;
    fclose(file);

    //
    // Look for a Unicode Signature (BOM) on the word file
    //
    int32_t     signatureLength;
    const char *wordSourceC = wordBufferC;
    const char *encoding = ucnv_detectUnicodeSignature(
                               wordSourceC, wordFileSize, &signatureLength, &status);
    if (U_FAILURE(status)) {
        exit(status);
    }
    if(encoding!=NULL ){
        // Skip over the signature so that it is not converted as text.
        wordSourceC  += signatureLength;
        wordFileSize -= signatureLength;
    }

    //
    // Open a converter to take the word file to UTF-16.
    // When no signature was found, encoding is NULL.
    //
    UConverter* conv;
    conv = ucnv_open(encoding, &status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
        exit(status);
    }

    //
    // Convert the words to UChar.
    // Preflight first to determine required buffer size.
    //
    uint32_t destCap = ucnv_toUChars(conv,
                                     NULL,           // dest,
                                     0,              // destCapacity,
                                     wordSourceC,
                                     wordFileSize,
                                     &status);
    // With a zero-capacity destination the expected outcome is an overflow
    // error; anything else is treated as a failure.
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
        exit(status);
    }

    status = U_ZERO_ERROR;
    // One extra UChar of capacity; the scanning loops below stop on a zero UChar.
    UChar *wordSourceU = new UChar[destCap+1];
    ucnv_toUChars(conv,
                  wordSourceU,    // dest,
                  destCap+1,
                  wordSourceC,
                  wordFileSize,
                  &status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
        exit(status);
    }
    ucnv_close(conv);

    // Get rid of the original file buffer
    delete[] wordBufferC;

    // Create a MutableTrieDictionary, and loop through all the lines, inserting
    // words.

    // First, pick a median character: start in the middle of the text and
    // move forward to the first character of the next line that does not
    // begin with whitespace.
    UChar *current = wordSourceU + (destCap/2);
    UChar uc = *current++;
    UnicodeSet breaks;
    breaks.add(0x000A);     // Line Feed
    breaks.add(0x000D);     // Carriage Return
    breaks.add(0x2028);     // Line Separator
    breaks.add(0x2029);     // Paragraph Separator

    do {
        // Look for line break
        while (uc && !breaks.contains(uc)) {
            uc = *current++;
        }
        // Now skip to first non-line-break
        while (uc && breaks.contains(uc)) {
            uc = *current++;
        }
    }
    while (uc && (breaks.contains(uc) || u_isspace(uc)));

    MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);
    // Guard against a NULL result that was not reported through status.
    if (mtd == NULL && U_SUCCESS(status)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    if (U_FAILURE(status)) {
        fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
        exit(status);
    }

    // Now add the words. Words are non-space characters at the beginning of
    // lines, and must be at least one UChar.
    current = wordSourceU;
    UChar *candidate = current;     // start of the current line
    uc = *current++;
    int32_t length = 0;             // length of the word at the line start

    while (uc) {
        // Measure the run of non-space characters at the start of the line.
        while (uc && !u_isspace(uc)) {
            ++length;
            uc = *current++;
        }
        // A line that starts with whitespace gives length 0 and is skipped.
        if (length > 0) {
            mtd->addWord(candidate, length, status);
            if (U_FAILURE(status)) {
                fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",
                        u_errorName(status));
                exit(status);
            }
        }
        // Find beginning of next line
        while (uc && !breaks.contains(uc)) {
            uc = *current++;
        }
        while (uc && breaks.contains(uc)) {
            uc = *current++;
        }
        // current is one past uc, so current-1 points at the line's first UChar.
        candidate = current-1;
        length = 0;
    }

    // Get rid of the Unicode text buffer
    delete[] wordSourceU;

    // Now, create a CompactTrieDictionary from the mutable dictionary
    CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
    // Guard against a NULL result that was not reported through status.
    if (ctd == NULL && U_SUCCESS(status)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    if (U_FAILURE(status)) {
        fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
        exit(status);
    }

    // Get rid of the MutableTrieDictionary
    delete mtd;

    //
    // Get the binary data from the dictionary.
    //
    uint32_t        outDataSize = ctd->dataSize();
    const uint8_t  *outData = (const uint8_t *)ctd->data();

    //
    // Create the output file
    //
    size_t bytesWritten;
    UNewDataMemory *pData;
    pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
    if(U_FAILURE(status)) {
        fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n",
                outFileName, u_errorName(status));
        exit(status);
    }

    // Write the data itself.
    udata_writeBlock(pData, outData, outDataSize);
    // finish up
    bytesWritten = udata_finish(pData, &status);
    if(U_FAILURE(status)) {
        fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status));
        exit(status);
    }

    if (bytesWritten != outDataSize) {
        fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
        exit(-1);
    }

    // Get rid of the CompactTrieDictionary
    delete ctd;

    u_cleanup();

    printf("genctd: tool completed successfully.\n");
    return 0;

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
}