mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-5117 Thai break should work in all locales
X-SVN-Rev: 19408
This commit is contained in:
parent
7dce112ff1
commit
490cb834fa
53 changed files with 30817 additions and 1524 deletions
1
.gitattributes
vendored
1
.gitattributes
vendored
|
@ -48,7 +48,6 @@ README text !eol
|
|||
*.spp -text
|
||||
*.tri2 -text
|
||||
|
||||
icu4c/source/data/brkitr/thaidict.brk -text
|
||||
icu4c/source/data/unidata/UCARules.txt -text
|
||||
icu4c/source/samples/ucnv/data02.bin -text
|
||||
icu4c/source/test/testdata/icu26_testtypes.res -text
|
||||
|
|
|
@ -75,7 +75,7 @@ normlzr.o unorm.o unormcmp.o unorm_it.o chariter.o schriter.o uchriter.o uiter.o
|
|||
uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwrt.o ubidiln.o ushape.o unames.o \
|
||||
uscript.o usc_impl.o uvector.o ustack.o uvectr32.o ucmp8.o \
|
||||
uarrsort.o utrie.o uset.o uset_props.o uniset.o uniset_props.o ruleiter.o caniter.o unifilt.o unifunct.o usetiter.o \
|
||||
brkiter.o brkdict.o ubrk.o dbbi.o dbbi_tbl.o \
|
||||
brkiter.o ubrk.o brkeng.o dictbe.o triedict.o \
|
||||
rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \
|
||||
serv.o servnotf.o servls.o servlk.o servlkf.o servrbf.o servslkf.o locutil.o \
|
||||
uenum.o ustrenum.o uidna.o usprep.o punycode.o \
|
||||
|
|
229
icu4c/source/common/brkeng.cpp
Normal file
229
icu4c/source/common/brkeng.cpp
Normal file
|
@ -0,0 +1,229 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and others. *
|
||||
* All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "brkeng.h"
|
||||
#include "dictbe.h"
|
||||
#include "triedict.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/chariter.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "uvector.h"
|
||||
#include "mutex.h"
|
||||
#include "uresimp.h"
|
||||
#include "ubrkimpl.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/*
|
||||
******************************************************************
|
||||
*/
|
||||
|
||||
// Default constructor. The class declares no data members, so there is
// nothing to initialise here.
LanguageBreakEngine::LanguageBreakEngine() {
|
||||
}
|
||||
|
||||
// Destructor (declared virtual in brkeng.h); defined out of line with an
// empty body.
LanguageBreakEngine::~LanguageBreakEngine() {
|
||||
}
|
||||
|
||||
/*
|
||||
******************************************************************
|
||||
*/
|
||||
|
||||
// Default constructor. The class declares no data members, so there is
// nothing to initialise here.
LanguageBreakFactory::LanguageBreakFactory() {
|
||||
}
|
||||
|
||||
// Destructor (declared virtual in brkeng.h); defined out of line with an
// empty body.
LanguageBreakFactory::~LanguageBreakFactory() {
|
||||
}
|
||||
|
||||
/*
|
||||
******************************************************************
|
||||
*/
|
||||
|
||||
// Construct an engine with no character sets yet: every per-break-type slot
// of fHandled starts out null. The status argument is not used here.
UnhandledEngine::UnhandledEngine(UErrorCode &status) {
    const int32_t slotCount = (int32_t)(sizeof(fHandled) / sizeof(fHandled[0]));
    int32_t slot = 0;
    while (slot < slotCount) {
        fHandled[slot] = 0;
        ++slot;
    }
}
|
||||
|
||||
// Release every per-break-type set that handleCharacter() allocated.
UnhandledEngine::~UnhandledEngine() {
    UnicodeSet **const pastLast = fHandled + sizeof(fHandled) / sizeof(fHandled[0]);
    for (UnicodeSet **slot = fHandled; slot != pastLast; ++slot) {
        if (*slot == 0) {
            continue;   // this break type was never used
        }
        delete *slot;
    }
}
|
||||
|
||||
// Report whether character c has been registered (through handleCharacter)
// for the given break type. Out-of-range break types and break types with
// no set allocated yet are reported as not handled.
UBool
UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
    const int32_t kindCount = (int32_t)(sizeof(fHandled) / sizeof(fHandled[0]));
    if (breakType < 0 || breakType >= kindCount) {
        return FALSE;
    }
    const UnicodeSet *known = fHandled[breakType];
    if (known == 0) {
        return FALSE;
    }
    return known->contains(c) ? TRUE : FALSE;
}
|
||||
|
||||
int32_t
|
||||
UnhandledEngine::findBreaks( CharacterIterator *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UBool reverse,
|
||||
int32_t breakType,
|
||||
UStack &foundBreaks ) const {
|
||||
if (breakType >= 0 && breakType < sizeof(fHandled)/sizeof(fHandled[0])) {
|
||||
UChar32 c = text->current32();
|
||||
if (reverse) {
|
||||
while(text->getIndex() > startPos && fHandled[breakType]->contains(c)) {
|
||||
c = text->previous32();
|
||||
}
|
||||
}
|
||||
else {
|
||||
while(text->getIndex() < endPos && fHandled[breakType]->contains(c)) {
|
||||
c = text->next32();
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void
|
||||
UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
|
||||
if (breakType >= 0 && breakType < sizeof(fHandled)/sizeof(fHandled[0])) {
|
||||
if (fHandled[breakType] == 0) {
|
||||
fHandled[breakType] = new UnicodeSet();
|
||||
if (fHandled[breakType] == 0) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (!fHandled[breakType]->contains(c)) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
// Apply the entire script of the character.
|
||||
int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
|
||||
fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
******************************************************************
|
||||
*/
|
||||
|
||||
// Construct a factory with no engine stack; getEngineFor() builds the stack
// on first use. The status argument is not used here.
ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &status)
    : fEngines(0) {
}
|
||||
|
||||
// Destroy the engine stack, if one was ever created. The stack is built
// with _deleteEngine as its element deleter (see getEngineFor).
ICULanguageBreakFactory::~ICULanguageBreakFactory() {
    if (fEngines == 0) {
        return;
    }
    delete fEngines;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
U_CDECL_BEGIN
|
||||
// Element deleter handed to the UStack of engines: treats the untyped
// element as a LanguageBreakEngine and deletes it.
static void U_CALLCONV _deleteEngine(void *obj) {
    const LanguageBreakEngine *engine = static_cast<const LanguageBreakEngine *>(obj);
    delete engine;
}
|
||||
U_CDECL_END
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
 * Find an engine that handles character c for the given break type.
 *
 * On first use the engine stack is built: the "dictionaries" / "Thai" entry
 * of the root bundle in the brkitr tree names a dictionary data item, which
 * is opened and wrapped in a ThaiBreakEngine. The finished stack is stored
 * in fEngines between umtx_lock(NULL) / umtx_unlock(NULL); if fEngines was
 * set in the meantime, the locally built stack is discarded.
 *
 * @param c         A character that begins a run for which an engine is sought.
 * @param breakType The kind of text break for which an engine is sought.
 * @return The most recently pushed engine whose handles() accepts
 *         (c, breakType), or NULL if there is none or the stack could not
 *         be allocated.
 */
const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
    UBool needsInit;
    UErrorCode status = U_ZERO_ERROR;
    umtx_lock(NULL);
    needsInit = (UBool)(fEngines == NULL);
    umtx_unlock(NULL);

    if (needsInit) {
        UStack *engines = new UStack(_deleteEngine, NULL, status);
        if (U_SUCCESS(status) && engines == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        // TODO: add locale parameter, check "dictionaries" in locale
        // TODO: generalize once we can figure out how to parameterize engines
        // instead of having different subclasses. Right now it needs to check
        // for the key of each particular subclass.

        // Open root from brkitr tree.
        UResourceBundle dictBundleStack;
        UResourceBundle dictNameStack;
        UResourceBundle *dictBundle = &dictBundleStack;
        UResourceBundle *dictName = &dictNameStack;
        char dictnbuff[256];
        dictnbuff[0] = 0;   // never hand an uninitialised name to udata_open
        ures_initStackObject(dictBundle);
        ures_initStackObject(dictName);

        UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);
        dictBundle = ures_getByKeyWithFallback(b, "dictionaries", dictBundle, &status);
        dictName = ures_getByKeyWithFallback(dictBundle, "Thai", dictName, &status);
        const UChar *dictfname = NULL;
        int32_t dictnlength = 0;
        dictfname = ures_getString(dictName, &dictnlength, &status);
        // The name plus its terminator must fit in dictnbuff.
        if (U_SUCCESS(status) && (size_t)dictnlength >= sizeof(dictnbuff)) {
            dictnlength = 0;
            status = U_BUFFER_OVERFLOW_ERROR;
        }
        if (U_SUCCESS(status) && dictfname) {
            u_UCharsToChars(dictfname, dictnbuff, dictnlength+1);
        }
        ures_close(dictName);
        ures_close(dictBundle);
        ures_close(b);
        UDataMemory *file = udata_open(U_ICUDATA_BRKITR, "ctd", dictnbuff, &status);
        if (U_SUCCESS(status)) {
            const CompactTrieDictionary *dict = new CompactTrieDictionary(
                (const TrieWordDictionary *)udata_getMemory(file), status);
            if (U_SUCCESS(status) && dict == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
            }
            if (U_FAILURE(status)) {
                delete dict;
                dict = NULL;
            }
            const ThaiBreakEngine *thai = new ThaiBreakEngine(dict, status);
            if (thai == NULL) {
                delete dict;
                if (U_SUCCESS(status)) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                }
            }
            if (U_SUCCESS(status)) {
                engines->push((void *)thai, status);
            }
            else {
                delete thai;
                // Nothing built from the data survives on this path, so the
                // data handle must be closed here or it is leaked.
                udata_close(file);
                file = NULL;
            }
        }
        umtx_lock(NULL);
        if (fEngines == NULL) {
            fEngines = engines;
            engines = NULL;
        }
        umtx_unlock(NULL);
        if (engines != NULL) {
            // The local stack was not published. Destroy it and then close
            // the data handle that only its engines were using.
            delete engines;
            if (file != NULL) {
                udata_close(file);
            }
        }
    }

    if (fEngines == NULL) {
        return NULL;
    }
    // Search from the top of the stack down.
    int32_t i = fEngines->size();
    const LanguageBreakEngine *lbe = NULL;
    while (--i >= 0) {
        lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
        if (lbe != NULL && lbe->handles(c, breakType)) {
            break;
        }
        lbe = NULL;
    }
    return lbe;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
265
icu4c/source/common/brkeng.h
Normal file
265
icu4c/source/common/brkeng.h
Normal file
|
@ -0,0 +1,265 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and others. *
|
||||
* All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
#ifndef BRKENG_H
|
||||
#define BRKENG_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class CharacterIterator;
|
||||
class UnicodeSet;
|
||||
class UStack;
|
||||
|
||||
/*******************************************************************
|
||||
* LanguageBreakEngine
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>LanguageBreakEngines implement language-specific knowledge for
|
||||
* finding text boundaries within a run of characters belonging to a
|
||||
* specific set. The boundaries will be of a specific kind, e.g. word,
|
||||
* line, etc.</p>
|
||||
*
|
||||
* <p>LanguageBreakEngines should normally be implemented so as to
|
||||
* be shared between threads without locking.</p>
|
||||
*/
|
||||
class U_COMMON_API LanguageBreakEngine : public UMemory {
|
||||
public:
|
||||
|
||||
/**
 * <p>Default constructor.</p>
 */
|
||||
LanguageBreakEngine();
|
||||
|
||||
/**
 * <p>Virtual destructor.</p>
 */
|
||||
virtual ~LanguageBreakEngine();
|
||||
|
||||
/**
 * <p>Indicate whether this engine handles a particular character for
 * a particular kind of break.</p>
 *
 * @param c A character which begins a run that the engine might handle
 * @param breakType The type of text break which the caller wants to determine
 * @return TRUE if this engine handles the particular character and break
 * type.
 */
|
||||
virtual UBool handles(UChar32 c, int32_t breakType) const = 0;
|
||||
|
||||
/**
 * <p>Find any breaks within a run in the supplied text.</p>
 *
 * @param text A CharacterIterator representing the text (TODO: UText). The
 * iterator is left at the end of the run of characters which the engine
 * is capable of handling.
 * @param startPos The start of the run within the supplied text.
 * @param endPos The end of the run within the supplied text.
 * @param reverse Whether the caller is looking for breaks in a reverse
 * direction.
 * @param breakType The type of break desired, or -1.
 * @param foundBreaks A caller-supplied UStack (passed by reference, not a
 * C array); presumably the breaks found, if any, are pushed onto it -
 * confirm against the concrete engines.
 * @return The number of breaks found.
 */
|
||||
virtual int32_t findBreaks( CharacterIterator *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UBool reverse,
|
||||
int32_t breakType,
|
||||
UStack &foundBreaks ) const = 0;
|
||||
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* LanguageBreakFactory
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>LanguageBreakFactorys find and return a LanguageBreakEngine
|
||||
* that can determine breaks for characters in a specific set, if
|
||||
* such an object can be found.</p>
|
||||
*
|
||||
* <p>If a LanguageBreakFactory is to be shared between threads,
|
||||
* appropriate synchronization must be used; there is none internal
|
||||
* to the factory.</p>
|
||||
*
|
||||
* <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
|
||||
* normally be shared between threads without synchronization, unless
|
||||
* the specific subclass of LanguageBreakFactory indicates otherwise.</p>
|
||||
*
|
||||
* <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
|
||||
* it returns when it itself is deleted, unless the specific subclass of
|
||||
* LanguageBreakFactory indicates otherwise. Naturally, the factory should
|
||||
* not be deleted until the LanguageBreakEngines it has returned are no
|
||||
* longer needed.</p>
|
||||
*/
|
||||
class U_COMMON_API LanguageBreakFactory : public UMemory {
|
||||
public:
|
||||
|
||||
/**
 * <p>Default constructor.</p>
 */
|
||||
LanguageBreakFactory();
|
||||
|
||||
/**
 * <p>Virtual destructor.</p>
 */
|
||||
virtual ~LanguageBreakFactory();
|
||||
|
||||
/**
 * <p>Find and return a LanguageBreakEngine that can find the desired
 * kind of break for the set of characters to which the supplied
 * character belongs. It is up to the set of available engines to
 * determine what the sets of characters are.</p>
 *
 * <p>The method is non-const, so an implementation may change the
 * factory's state when it is called.</p>
 *
 * @param c A character that begins a run for which a LanguageBreakEngine is
 * sought.
 * @param breakType The kind of text break for which a LanguageBreakEngine is
 * sought.
 * @return A LanguageBreakEngine with the desired characteristics, or 0.
 */
|
||||
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0;
|
||||
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* UnhandledEngine
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
|
||||
* handles characters that no other LanguageBreakEngine is available to
|
||||
* handle. It is told the character and the type of break; at its
|
||||
* discretion it may handle more than the specified character (e.g.,
|
||||
* the entire script to which that character belongs.</p>
|
||||
*
|
||||
* <p>UnhandledEngines may not be shared between threads without
|
||||
* external synchronization.</p>
|
||||
*/
|
||||
|
||||
class U_COMMON_API UnhandledEngine : public LanguageBreakEngine {
|
||||
private:
|
||||
|
||||
/**
 * The sets of characters handled, one slot per break type (indexed by the
 * breakType argument of the methods below). A slot is null until
 * handleCharacter() is first called for that break type.
 * @internal
 */
|
||||
|
||||
UnicodeSet *fHandled[4];
|
||||
|
||||
public:
|
||||
|
||||
/**
 * <p>Constructor. Clears every slot of fHandled.</p>
 *
 * @param status Error code; the implementation in brkeng.cpp does not
 * read or set it.
 */
|
||||
UnhandledEngine(UErrorCode &status);
|
||||
|
||||
/**
 * <p>Virtual destructor. Deletes any sets that were allocated.</p>
 */
|
||||
virtual ~UnhandledEngine();
|
||||
|
||||
/**
 * <p>Indicate whether this engine handles a particular character for
 * a particular kind of break.</p>
 *
 * @param c A character which begins a run that the engine might handle
 * @param breakType The type of text break which the caller wants to determine
 * @return TRUE if this engine handles the particular character and break
 * type.
 */
|
||||
virtual UBool handles(UChar32 c, int32_t breakType) const;
|
||||
|
||||
/**
 * <p>Find any breaks within a run in the supplied text. The
 * implementation in brkeng.cpp only moves the iterator past the run and
 * always returns 0.</p>
 *
 * @param text A CharacterIterator representing the text (TODO: UText). The
 * iterator is left at the end of the run of characters which the engine
 * is capable of handling.
 * @param startPos The start of the run within the supplied text.
 * @param endPos The end of the run within the supplied text.
 * @param reverse Whether the caller is looking for breaks in a reverse
 * direction.
 * @param breakType The type of break desired, or -1.
 * @param foundBreaks A caller-supplied UStack (passed by reference, not a
 * C array); this engine does not touch it.
 * @return The number of breaks found.
 */
|
||||
virtual int32_t findBreaks( CharacterIterator *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UBool reverse,
|
||||
int32_t breakType,
|
||||
UStack &foundBreaks ) const;
|
||||
|
||||
/**
 * <p>Tell the engine to handle a particular character and break type.</p>
 *
 * @param c A character which the engine should handle
 * @param breakType The type of text break for which the engine should handle c
 */
|
||||
virtual void handleCharacter(UChar32 c, int32_t breakType);
|
||||
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* ICULanguageBreakFactory
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
|
||||
* ICU. It creates dictionary-based LanguageBreakEngines from dictionary
|
||||
* data in the ICU data file.</p>
|
||||
*/
|
||||
class U_COMMON_API ICULanguageBreakFactory : public LanguageBreakFactory {
|
||||
private:
|
||||
|
||||
/**
 * The stack of break engines created by this factory. Null until the
 * first call to getEngineFor(), which builds it; deleted by the
 * destructor.
 * @internal
 */
|
||||
|
||||
UStack *fEngines;
|
||||
|
||||
public:
|
||||
|
||||
/**
 * <p>Standard constructor. Leaves the engine stack unset.</p>
 *
 * @param status Error code; the implementation in brkeng.cpp does not
 * read or set it.
 */
|
||||
ICULanguageBreakFactory(UErrorCode &status);
|
||||
|
||||
/**
 * <p>Virtual destructor.</p>
 */
|
||||
virtual ~ICULanguageBreakFactory();
|
||||
|
||||
/**
 * <p>Find and return a LanguageBreakEngine that can find the desired
 * kind of break for the set of characters to which the supplied
 * character belongs. It is up to the set of available engines to
 * determine what the sets of characters are.</p>
 *
 * @param c A character that begins a run for which a LanguageBreakEngine is
 * sought.
 * @param breakType The kind of text break for which a LanguageBreakEngine is
 * sought.
 * @return A LanguageBreakEngine with the desired characteristics, or 0.
 */
|
||||
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType);
|
||||
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
/* BRKENG_H */
|
||||
#endif
|
|
@ -22,7 +22,7 @@
|
|||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/dbbi.h"
|
||||
#include "unicode/rbbi.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/ures.h"
|
||||
|
@ -33,6 +33,7 @@
|
|||
#include "locbased.h"
|
||||
#include "uresimp.h"
|
||||
#include "uassert.h"
|
||||
#include "ubrkimpl.h"
|
||||
|
||||
// *****************************************************************************
|
||||
// class BreakIterator
|
||||
|
@ -46,7 +47,7 @@ U_NAMESPACE_BEGIN
|
|||
// -------------------------------------
|
||||
|
||||
BreakIterator*
|
||||
BreakIterator::buildInstance(const Locale& loc, const char *type, UBool dict, UErrorCode &status)
|
||||
BreakIterator::buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode &status)
|
||||
{
|
||||
char fnbuff[256];
|
||||
char actualLocale[ULOC_FULLNAME_CAPACITY];
|
||||
|
@ -56,7 +57,7 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UBool dict, UE
|
|||
UResourceBundle brkNameStack;
|
||||
UResourceBundle *brkRules = &brkRulesStack;
|
||||
UResourceBundle *brkName = &brkNameStack;
|
||||
BreakIterator *result = NULL;
|
||||
RuleBasedBreakIterator *result = NULL;
|
||||
|
||||
if (U_FAILURE(status))
|
||||
return NULL;
|
||||
|
@ -65,7 +66,7 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UBool dict, UE
|
|||
ures_initStackObject(brkName);
|
||||
|
||||
// Get the locale
|
||||
UResourceBundle *b = ures_open(NULL, loc.getName(), &status);
|
||||
UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, loc.getName(), &status);
|
||||
|
||||
// Get the "boundaries" array.
|
||||
if (U_SUCCESS(status)) {
|
||||
|
@ -94,45 +95,20 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UBool dict, UE
|
|||
ures_close(brkRules);
|
||||
ures_close(brkName);
|
||||
|
||||
UDataMemory* file = udata_open(NULL, "brk", fnbuff, &status);
|
||||
UDataMemory* file = udata_open(U_ICUDATA_BRKITR, "brk", fnbuff, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
ures_close(b);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// We found the break rules; now see if a dictionary is needed
|
||||
if (dict)
|
||||
{
|
||||
UErrorCode localStatus = U_ZERO_ERROR;
|
||||
brkName = &brkNameStack;
|
||||
ures_initStackObject(brkName);
|
||||
brkName = ures_getByKeyWithFallback(b, "BreakDictionaryData", brkName, &localStatus);
|
||||
#if 0
|
||||
if (U_SUCCESS(localStatus)) {
|
||||
brkfname = ures_getString(&brkname, &size, &localStatus);
|
||||
}
|
||||
#endif
|
||||
if (U_SUCCESS(localStatus)) {
|
||||
#if 0
|
||||
// TODO: if this code is ever enabled, need to add a bounds check for fnbuff.
|
||||
u_UCharsToChars(brkfname, fnbuff, size);
|
||||
fnbuff[size] = '\0';
|
||||
#endif
|
||||
result = new DictionaryBasedBreakIterator(file, "thaidict.brk", status);
|
||||
}
|
||||
ures_close(brkName);
|
||||
}
|
||||
|
||||
// If there is still no result but we haven't had an error, no dictionary,
|
||||
// so make a non-dictionary break iterator
|
||||
if (U_SUCCESS(status) && result == NULL) {
|
||||
result = new RuleBasedBreakIterator(file, status);
|
||||
}
|
||||
// Create a RuleBasedBreakIterator
|
||||
result = new RuleBasedBreakIterator(file, status);
|
||||
|
||||
// If there is a result, set the valid locale and actual locale
|
||||
// If there is a result, set the valid locale and actual locale, and the kind
|
||||
if (U_SUCCESS(status) && result != NULL) {
|
||||
U_LOCALE_BASED(locBased, *result);
|
||||
locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status), actualLocale);
|
||||
result->setBreakType(kind);
|
||||
}
|
||||
|
||||
ures_close(b);
|
||||
|
@ -372,7 +348,7 @@ BreakIterator::getAvailableLocales(void)
|
|||
// -------------------------------------
|
||||
|
||||
BreakIterator*
|
||||
BreakIterator::createInstance(const Locale& loc, UBreakIteratorType kind, UErrorCode& status)
|
||||
BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& status)
|
||||
{
|
||||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
|
@ -419,19 +395,19 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
|
|||
BreakIterator *result = NULL;
|
||||
switch (kind) {
|
||||
case UBRK_CHARACTER:
|
||||
result = BreakIterator::buildInstance(loc, "grapheme", FALSE, status);
|
||||
result = BreakIterator::buildInstance(loc, "grapheme", kind, status);
|
||||
break;
|
||||
case UBRK_WORD:
|
||||
result = BreakIterator::buildInstance(loc, "word", TRUE, status);
|
||||
result = BreakIterator::buildInstance(loc, "word", kind, status);
|
||||
break;
|
||||
case UBRK_LINE:
|
||||
result = BreakIterator::buildInstance(loc, "line", TRUE, status);
|
||||
result = BreakIterator::buildInstance(loc, "line", kind, status);
|
||||
break;
|
||||
case UBRK_SENTENCE:
|
||||
result = BreakIterator::buildInstance(loc, "sentence", FALSE, status);
|
||||
result = BreakIterator::buildInstance(loc, "sentence", kind, status);
|
||||
break;
|
||||
case UBRK_TITLE:
|
||||
result = BreakIterator::buildInstance(loc, "title", FALSE, status);
|
||||
result = BreakIterator::buildInstance(loc, "title", kind, status);
|
||||
break;
|
||||
default:
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
|
|
|
@ -1,637 +0,0 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2005 IBM Corp. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 12/1/99 rgillam Complete port from Java.
|
||||
* 01/13/2000 helena Added UErrorCode to ctors.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/dbbi.h"
|
||||
#include "unicode/schriter.h"
|
||||
#include "dbbi_tbl.h"
|
||||
#include "uvector.h"
|
||||
#include "cmemory.h"
|
||||
#include "uassert.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(DictionaryBasedBreakIterator)
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// constructors
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
// Default constructor: default-constructs the RuleBasedBreakIterator base
// and then runs init() (defined outside this view) on this object.
DictionaryBasedBreakIterator::DictionaryBasedBreakIterator() :
|
||||
RuleBasedBreakIterator() {
|
||||
init();
|
||||
}
|
||||
|
||||
|
||||
// Construct from break-rule data plus the name of a dictionary file.
// The base class consumes rbbiData; if that succeeded, the dictionary
// tables are loaded. On any failure fTables is left unset and status
// carries the error (U_MEMORY_ALLOCATION_ERROR when the tables object
// could not be allocated).
DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(UDataMemory* rbbiData,
                                                           const char* dictionaryFilename,
                                                           UErrorCode& status)
    : RuleBasedBreakIterator(rbbiData, status)
{
    init();
    if (U_SUCCESS(status)) {
        fTables = new DictionaryBasedBreakIteratorTables(dictionaryFilename, status);
        if (U_SUCCESS(status)) {
            /* test for NULL */
            if (fTables == 0) {
                status = U_MEMORY_ALLOCATION_ERROR;
            }
        }
        else if (fTables != NULL) {
            // Loading failed after allocation: give the tables back.
            fTables->removeReference();
            fTables = NULL;
        }
    }
}
|
||||
|
||||
|
||||
// Copy constructor: copies the base iterator, resets this object's own
// state through init(), and shares the other iterator's dictionary tables
// by taking an additional reference on them.
DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(const DictionaryBasedBreakIterator &other) :
    RuleBasedBreakIterator(other)
{
    init();
    if (other.fTables == NULL) {
        return;
    }
    fTables = other.fTables;
    fTables->addReference();
}
|
||||
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// Destructor
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
DictionaryBasedBreakIterator::~DictionaryBasedBreakIterator()
{
    // Free the cached break positions, then drop this iterator's
    // reference on the dictionary tables, if it holds one.
    uprv_free(cachedBreakPositions);
    cachedBreakPositions = NULL;
    if (fTables == NULL) {
        return;
    }
    fTables->removeReference();
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// Assignment operator. Sets this iterator to have the same behavior,
|
||||
// and iterate over the same text, as the one passed in.
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
DictionaryBasedBreakIterator&
DictionaryBasedBreakIterator::operator=(const DictionaryBasedBreakIterator& that) {
    if (this != &that) {
        reset();    // clears out cached break positions.
        RuleBasedBreakIterator::operator=(that);
        DictionaryBasedBreakIteratorTables *incoming = that.fTables;
        if (this->fTables != incoming) {
            // Swap table references: release ours first, then take theirs.
            if (this->fTables != NULL) {
                this->fTables->removeReference();
            }
            this->fTables = incoming;
            if (this->fTables != NULL) {
                this->fTables->addReference();
            }
        }
    }
    return *this;
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// Clone() Returns a newly-constructed RuleBasedBreakIterator with the same
|
||||
// behavior, and iterating over the same text, as this one.
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
// Returns a new iterator produced by the copy constructor; the returned
// object is allocated with new.
// NOTE(review): presumably the caller owns and deletes the result, and the
// pointer may be null if allocation fails - confirm against the API docs.
BreakIterator*
|
||||
DictionaryBasedBreakIterator::clone() const {
|
||||
return new DictionaryBasedBreakIterator(*this);
|
||||
}
|
||||
|
||||
//=======================================================================
|
||||
// BreakIterator overrides
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Advances the iterator one step backwards.
|
||||
* @return The position of the last boundary position before the
|
||||
* current iteration position
|
||||
*/
|
||||
int32_t
|
||||
DictionaryBasedBreakIterator::previous()
|
||||
{
|
||||
// if we have cached break positions and we're still in the range
|
||||
// covered by them, just move one step backward in the cache
|
||||
if (cachedBreakPositions != NULL && positionInCache > 0) {
|
||||
--positionInCache;
|
||||
fText->setIndex(cachedBreakPositions[positionInCache]);
|
||||
return cachedBreakPositions[positionInCache];
|
||||
}
|
||||
|
||||
// otherwise, dump the cache and use the inherited previous() method to move
|
||||
// backward. This may fill up the cache with new break positions, in which
|
||||
// case we have to mark our position in the cache
|
||||
else {
|
||||
reset();
|
||||
int32_t result = RuleBasedBreakIterator::previous();
|
||||
if (cachedBreakPositions != NULL) {
|
||||
for (positionInCache=0;
|
||||
cachedBreakPositions[positionInCache] != result;
|
||||
positionInCache++);
|
||||
U_ASSERT(positionInCache < numCachedBreakPositions);
|
||||
if (positionInCache >= numCachedBreakPositions) {
|
||||
// Something has gone wrong. Dump the cache.
|
||||
reset();
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the current iteration position to the last boundary position
|
||||
* before the specified position.
|
||||
* @param offset The position to begin searching from
|
||||
* @return The position of the last boundary before "offset"
|
||||
*/
|
||||
int32_t
|
||||
DictionaryBasedBreakIterator::preceding(int32_t offset)
|
||||
{
|
||||
// if the offset passed in is already past the end of the text,
|
||||
// just return DONE; if it's before the beginning, return the
|
||||
// text's starting offset
|
||||
if (fText == NULL || offset > fText->endIndex()) {
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
else if (offset < fText->startIndex()) {
|
||||
return fText->startIndex();
|
||||
}
|
||||
|
||||
// if we have no cached break positions, or "offset" is outside the
|
||||
// range covered by the cache, we can just call the inherited routine
|
||||
// (which will eventually call other routines in this class that may
|
||||
// refresh the cache)
|
||||
if (cachedBreakPositions == NULL || offset <= cachedBreakPositions[0] ||
|
||||
offset > cachedBreakPositions[numCachedBreakPositions - 1]) {
|
||||
reset();
|
||||
return RuleBasedBreakIterator::preceding(offset);
|
||||
}
|
||||
|
||||
// on the other hand, if "offset" is within the range covered by the cache,
|
||||
// then all we have to do is search the cache for the last break position
|
||||
// before "offset"
|
||||
else {
|
||||
positionInCache = 0;
|
||||
while (positionInCache < numCachedBreakPositions
|
||||
&& offset > cachedBreakPositions[positionInCache])
|
||||
++positionInCache;
|
||||
--positionInCache;
|
||||
fText->setIndex(cachedBreakPositions[positionInCache]);
|
||||
return fText->getIndex();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the current iteration position to the first boundary position after
|
||||
* the specified position.
|
||||
* @param offset The position to begin searching forward from
|
||||
* @return The position of the first boundary after "offset"
|
||||
*/
|
||||
int32_t
|
||||
DictionaryBasedBreakIterator::following(int32_t offset)
|
||||
{
|
||||
// if the offset passed in is already past the end of the text,
|
||||
// just return DONE; if it's before the beginning, return the
|
||||
// text's starting offset
|
||||
if (fText == NULL || offset > fText->endIndex()) {
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
else if (offset < fText->startIndex()) {
|
||||
return fText->startIndex();
|
||||
}
|
||||
|
||||
// if we have no cached break positions, or if "offset" is outside the
|
||||
// range covered by the cache, then dump the cache and call our
|
||||
// inherited following() method. This will call other methods in this
|
||||
// class that may refresh the cache.
|
||||
if (cachedBreakPositions == NULL || offset < cachedBreakPositions[0] ||
|
||||
offset >= cachedBreakPositions[numCachedBreakPositions - 1]) {
|
||||
reset();
|
||||
return RuleBasedBreakIterator::following(offset);
|
||||
}
|
||||
|
||||
// on the other hand, if "offset" is within the range covered by the
|
||||
// cache, then just search the cache for the first break position
|
||||
// after "offset"
|
||||
else {
|
||||
positionInCache = 0;
|
||||
while (positionInCache < numCachedBreakPositions
|
||||
&& offset >= cachedBreakPositions[positionInCache])
|
||||
++positionInCache;
|
||||
fText->setIndex(cachedBreakPositions[positionInCache]);
|
||||
return fText->getIndex();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This is the implementation function for next().
|
||||
*/
|
||||
int32_t
|
||||
DictionaryBasedBreakIterator::handleNext()
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
// if there are no cached break positions, or if we've just moved
|
||||
// off the end of the range covered by the cache, we have to dump
|
||||
// and possibly regenerate the cache
|
||||
if (cachedBreakPositions == NULL || positionInCache == numCachedBreakPositions - 1) {
|
||||
|
||||
// start by using the inherited handleNext() to find a tentative return
|
||||
// value. dictionaryCharCount tells us how many dictionary characters
|
||||
// we passed over on our way to the tentative return value
|
||||
int32_t startPos = fText->getIndex();
|
||||
fDictionaryCharCount = 0;
|
||||
int32_t result = RuleBasedBreakIterator::handleNext();
|
||||
|
||||
// if we passed over more than one dictionary character, then we use
|
||||
// divideUpDictionaryRange() to regenerate the cached break positions
|
||||
// for the new range
|
||||
if (fDictionaryCharCount > 1 && result - startPos > 1) {
|
||||
divideUpDictionaryRange(startPos, result, status);
|
||||
U_ASSERT(U_SUCCESS(status));
|
||||
if (U_FAILURE(status)) {
|
||||
// Something went badly wrong, an internal error.
|
||||
// We have no way from here to report it to caller.
|
||||
// Treat as if this is if the dictionary did not apply to range.
|
||||
reset();
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
// otherwise, the value we got back from the inherited fuction
|
||||
// is our return value, and we can dump the cache
|
||||
else {
|
||||
reset();
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
// if the cache of break positions has been regenerated (or existed all
|
||||
// along), then just advance to the next break position in the cache
|
||||
// and return it
|
||||
if (cachedBreakPositions != NULL) {
|
||||
++positionInCache;
|
||||
fText->setIndex(cachedBreakPositions[positionInCache]);
|
||||
return cachedBreakPositions[positionInCache];
|
||||
}
|
||||
return -9999; // SHOULD NEVER GET HERE!
|
||||
}
|
||||
|
||||
void
|
||||
DictionaryBasedBreakIterator::reset()
|
||||
{
|
||||
uprv_free(cachedBreakPositions);
|
||||
cachedBreakPositions = NULL;
|
||||
numCachedBreakPositions = 0;
|
||||
fDictionaryCharCount = 0;
|
||||
positionInCache = 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// init() Common initialization routine, for use by constructors, etc.
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
void DictionaryBasedBreakIterator::init() {
|
||||
cachedBreakPositions = NULL;
|
||||
fTables = NULL;
|
||||
numCachedBreakPositions = 0;
|
||||
fDictionaryCharCount = 0;
|
||||
positionInCache = 0;
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// BufferClone
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
BreakIterator * DictionaryBasedBreakIterator::createBufferClone(void *stackBuffer,
|
||||
int32_t &bufferSize,
|
||||
UErrorCode &status)
|
||||
{
|
||||
if (U_FAILURE(status)){
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//
|
||||
// If user buffer size is zero this is a preflight operation to
|
||||
// obtain the needed buffer size, allowing for worst case misalignment.
|
||||
//
|
||||
if (bufferSize == 0) {
|
||||
bufferSize = sizeof(DictionaryBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//
|
||||
// Check the alignment and size of the user supplied buffer.
|
||||
// Allocate heap memory if the user supplied memory is insufficient.
|
||||
//
|
||||
char *buf = (char *)stackBuffer;
|
||||
uint32_t s = bufferSize;
|
||||
|
||||
if (stackBuffer == NULL) {
|
||||
s = 0; // Ignore size, force allocation if user didn't give us a buffer.
|
||||
}
|
||||
if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
|
||||
int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(buf);
|
||||
s -= offsetUp;
|
||||
buf += offsetUp;
|
||||
}
|
||||
if (s < sizeof(DictionaryBasedBreakIterator)) {
|
||||
buf = (char *) new DictionaryBasedBreakIterator();
|
||||
if (buf == 0) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
status = U_SAFECLONE_ALLOCATED_WARNING;
|
||||
}
|
||||
|
||||
//
|
||||
// Initialize the clone object.
|
||||
// TODO: using an overloaded C++ "operator new" to directly initialize the
|
||||
// copy in the user's buffer would be better, but it doesn't seem
|
||||
// to get along with namespaces. Investigate why.
|
||||
//
|
||||
// The memcpy is only safe with an empty (default constructed)
|
||||
// break iterator. Use on others can screw up reference counts
|
||||
// to data. memcpy-ing objects is not really a good idea...
|
||||
//
|
||||
DictionaryBasedBreakIterator localIter; // Empty break iterator, source for memcpy
|
||||
DictionaryBasedBreakIterator *clone = (DictionaryBasedBreakIterator *)buf;
|
||||
uprv_memcpy(clone, &localIter, sizeof(DictionaryBasedBreakIterator)); // clone = empty, but initialized, iterator.
|
||||
*clone = *this; // clone = the real one we want.
|
||||
if (status != U_SAFECLONE_ALLOCATED_WARNING) {
|
||||
clone->fBufferClone = TRUE;
|
||||
}
|
||||
return clone;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* This is the function that actually implements the dictionary-based
|
||||
* algorithm. Given the endpoints of a range of text, it uses the
|
||||
* dictionary to determine the positions of any boundaries in this
|
||||
* range. It stores all the boundary positions it discovers in
|
||||
* cachedBreakPositions so that we only have to do this work once
|
||||
* for each time we enter the range.
|
||||
*/
|
||||
void
|
||||
DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t endPos, UErrorCode &status)
|
||||
{
|
||||
// the range we're dividing may begin or end with non-dictionary characters
|
||||
// (i.e., for line breaking, we may have leading or trailing punctuation
|
||||
// that needs to be kept with the word). Seek from the beginning of the
|
||||
// range to the first dictionary character
|
||||
fText->setIndex(startPos);
|
||||
UChar32 c = fText->current32();
|
||||
while (isDictionaryChar(c) == FALSE) {
|
||||
c = fText->next32();
|
||||
}
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return; // UStack below overwrites the status error codes
|
||||
}
|
||||
|
||||
// initialize. We maintain two stacks: currentBreakPositions contains
|
||||
// the list of break positions that will be returned if we successfully
|
||||
// finish traversing the whole range now. possibleBreakPositions lists
|
||||
// all other possible word ends we've passed along the way. (Whenever
|
||||
// we reach an error [a sequence of characters that can't begin any word
|
||||
// in the dictionary], we back up, possibly delete some breaks from
|
||||
// currentBreakPositions, move a break from possibleBreakPositions
|
||||
// to currentBreakPositions, and start over from there. This process
|
||||
// continues in this way until we either successfully make it all the way
|
||||
// across the range, or exhaust all of our combinations of break
|
||||
// positions.) wrongBreakPositions is used to keep track of paths we've
|
||||
// tried on previous iterations. As the iterator backs up further and
|
||||
// further, this saves us from having to follow each possible path
|
||||
// through the text all the way to the error (hopefully avoiding many
|
||||
// future recursive calls as well).
|
||||
// there can be only one kind of error in UStack and UVector, so we'll
|
||||
// just let the error fall through
|
||||
UStack currentBreakPositions(status);
|
||||
UStack possibleBreakPositions(status);
|
||||
UVector wrongBreakPositions(status);
|
||||
|
||||
// the dictionary is implemented as a trie, which is treated as a state
|
||||
// machine. -1 represents the end of a legal word. Every word in the
|
||||
// dictionary is represented by a path from the root node to -1. A path
|
||||
// that ends in state 0 is an illegal combination of characters.
|
||||
int16_t state = 0;
|
||||
|
||||
// these two variables are used for error handling. We keep track of the
|
||||
// farthest we've gotten through the range being divided, and the combination
|
||||
// of breaks that got us that far. If we use up all possible break
|
||||
// combinations, the text contains an error or a word that's not in the
|
||||
// dictionary. In this case, we "bless" the break positions that got us the
|
||||
// farthest as real break positions, and then start over from scratch with
|
||||
// the character where the error occurred.
|
||||
int32_t farthestEndPoint = fText->getIndex();
|
||||
UStack bestBreakPositions(status);
|
||||
UBool bestBreakPositionsInitialized = FALSE;
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
// initialize (we always exit the loop with a break statement)
|
||||
c = fText->current32();
|
||||
for (;;) {
|
||||
// The dictionary implementation doesn't do supplementary chars.
|
||||
// Put them through as an unpaired surrogate, which
|
||||
// will end any dictionary match in progress.
|
||||
// With any luck, this dictionary implementation will be retired soon.
|
||||
if (c>0x10000) {
|
||||
c = 0xd800;
|
||||
}
|
||||
|
||||
// if we can transition to state "-1" from our current state, we're
|
||||
// on the last character of a legal word. Push that position onto
|
||||
// the possible-break-positions stack
|
||||
if (fTables->fDictionary->at(state, (int32_t)0) == -1) {
|
||||
possibleBreakPositions.push(fText->getIndex(), status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// look up the new state to transition to in the dictionary
|
||||
state = fTables->fDictionary->at(state, (UChar)c);
|
||||
|
||||
// if the character we're sitting on causes us to transition to
|
||||
// the "end of word" state, then it was a non-dictionary character
|
||||
// and we've successfully traversed the whole range. Drop out
|
||||
// of the loop.
|
||||
if (state == -1) {
|
||||
currentBreakPositions.push(fText->getIndex(), status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// if the character we're sitting on causes us to transition to
|
||||
// the error state, or if we've gone off the end of the range
|
||||
// without transitioning to the "end of word" state, we've hit
|
||||
// an error...
|
||||
else if (state == 0 || fText->getIndex() >= endPos) {
|
||||
|
||||
// if this is the farthest we've gotten, take note of it in
|
||||
// case there's an error in the text
|
||||
if (fText->getIndex() > farthestEndPoint) {
|
||||
farthestEndPoint = fText->getIndex();
|
||||
bestBreakPositions.removeAllElements();
|
||||
bestBreakPositionsInitialized = TRUE;
|
||||
for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
|
||||
bestBreakPositions.push(currentBreakPositions.elementAti(i), status);
|
||||
}
|
||||
}
|
||||
|
||||
// wrongBreakPositions is a list of all break positions we've tried starting
|
||||
// that didn't allow us to traverse all the way through the text. Every time
|
||||
// we pop a break position off of currentBreakPositions, we put it into
|
||||
// wrongBreakPositions to avoid trying it again later. If we make it to this
|
||||
// spot, we're either going to back up to a break in possibleBreakPositions
|
||||
// and try starting over from there, or we've exhausted all possible break
|
||||
// positions and are going to do the fallback procedure. This loop prevents
|
||||
// us from messing with anything in possibleBreakPositions that didn't work as
|
||||
// a starting point the last time we tried it (this is to prevent a bunch of
|
||||
// repetitive checks from slowing down some extreme cases)
|
||||
while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
|
||||
possibleBreakPositions.peeki())) {
|
||||
possibleBreakPositions.popi();
|
||||
}
|
||||
|
||||
// if we've used up all possible break-position combinations, there's
|
||||
// an error or an unknown word in the text. In this case, we start
|
||||
// over, treating the farthest character we've reached as the beginning
|
||||
// of the range, and "blessing" the break positions that got us that
|
||||
// far as real break positions
|
||||
if (possibleBreakPositions.isEmpty()) {
|
||||
if (bestBreakPositionsInitialized) {
|
||||
currentBreakPositions.removeAllElements();
|
||||
for (int32_t i = 0; i < bestBreakPositions.size(); i++) {
|
||||
currentBreakPositions.push(bestBreakPositions.elementAti(i), status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
bestBreakPositions.removeAllElements();
|
||||
if (farthestEndPoint < endPos) {
|
||||
fText->setIndex(farthestEndPoint);
|
||||
fText->next32();
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if ((currentBreakPositions.isEmpty()
|
||||
|| currentBreakPositions.peeki() != fText->getIndex())
|
||||
&& fText->getIndex() != startPos) {
|
||||
currentBreakPositions.push(fText->getIndex(), status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
fText->next32();
|
||||
currentBreakPositions.push(fText->getIndex(), status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if we still have more break positions we can try, then promote the
|
||||
// last break in possibleBreakPositions into currentBreakPositions,
|
||||
// and get rid of all entries in currentBreakPositions that come after
|
||||
// it. Then back up to that position and start over from there (i.e.,
|
||||
// treat that position as the beginning of a new word)
|
||||
else {
|
||||
int32_t temp = possibleBreakPositions.popi();
|
||||
int32_t temp2 = 0;
|
||||
while (!currentBreakPositions.isEmpty() && temp <
|
||||
currentBreakPositions.peeki()) {
|
||||
temp2 = currentBreakPositions.popi();
|
||||
wrongBreakPositions.addElement(temp2, status);
|
||||
}
|
||||
currentBreakPositions.push(temp, status);
|
||||
fText->setIndex(currentBreakPositions.peeki());
|
||||
}
|
||||
|
||||
// re-sync "c" for the next go-round, and drop out of the loop if
|
||||
// we've made it off the end of the range
|
||||
c = fText->current32();
|
||||
if (fText->getIndex() >= endPos) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// if we didn't hit any exceptional conditions on this last iteration,
|
||||
// just advance to the next character and loop
|
||||
else {
|
||||
c = fText->next32();
|
||||
}
|
||||
}
|
||||
|
||||
// dump the last break position in the list, and replace it with the actual
|
||||
// end of the range (which may be the same character, or may be further on
|
||||
// because the range actually ended with non-dictionary characters we want to
|
||||
// keep with the word)
|
||||
if (!currentBreakPositions.isEmpty()) {
|
||||
currentBreakPositions.popi();
|
||||
}
|
||||
currentBreakPositions.push(endPos, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// create a regular array to hold the break positions and copy
|
||||
// the break positions from the stack to the array (in addition,
|
||||
// our starting position goes into this array as a break position).
|
||||
// This array becomes the cache of break positions used by next()
|
||||
// and previous(), so this is where we actually refresh the cache.
|
||||
if (cachedBreakPositions != NULL) {
|
||||
uprv_free(cachedBreakPositions);
|
||||
}
|
||||
cachedBreakPositions = (int32_t *)uprv_malloc((currentBreakPositions.size() + 1) * sizeof(int32_t));
|
||||
/* Test for NULL */
|
||||
if(cachedBreakPositions == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
numCachedBreakPositions = currentBreakPositions.size() + 1;
|
||||
cachedBreakPositions[0] = startPos;
|
||||
|
||||
for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
|
||||
cachedBreakPositions[i + 1] = currentBreakPositions.elementAti(i);
|
||||
}
|
||||
positionInCache = 0;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
||||
/* eof */
|
|
@ -1,59 +0,0 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2002 IBM Corp. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 12/1/99 rgillam Complete port from Java.
|
||||
* 01/13/2000 helena Added UErrorCode to ctors.
|
||||
* 06/14/2002 andy Gutted for new RBBI impl.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "dbbi_tbl.h"
|
||||
#include "unicode/dbbi.h"
|
||||
#include "umutex.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
//=======================================================================
|
||||
// constructor
|
||||
//=======================================================================
|
||||
|
||||
// Builds the tables object around a newly created BreakDictionary and starts
// the reference count at 1 (the creator's reference).
//
// @param dictionaryFilename  Passed through to the BreakDictionary constructor.
// @param status              Passed through to the BreakDictionary constructor;
//                            additionally set to U_MEMORY_ALLOCATION_ERROR if
//                            the dictionary object itself cannot be allocated.
DictionaryBasedBreakIteratorTables::DictionaryBasedBreakIteratorTables(
                                             const char* dictionaryFilename,
                                             UErrorCode &status) {
    fRefCount = 1;
    fDictionary = new BreakDictionary(dictionaryFilename, status);
    // Report a failed allocation instead of silently leaving a NULL
    // dictionary behind; an already-recorded error is not overwritten.
    if (fDictionary == NULL && U_SUCCESS(status)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
|
||||
|
||||
|
||||
void DictionaryBasedBreakIteratorTables::addReference() {
|
||||
umtx_atomic_inc(&fRefCount);
|
||||
}
|
||||
|
||||
|
||||
void DictionaryBasedBreakIteratorTables::removeReference() {
|
||||
if (umtx_atomic_dec(&fRefCount) == 0) {
|
||||
delete this;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
DictionaryBasedBreakIteratorTables::~DictionaryBasedBreakIteratorTables() {
|
||||
delete fDictionary;
|
||||
fDictionary = NULL;
|
||||
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
||||
/* eof */
|
|
@ -1,90 +0,0 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2000 IBM Corp. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 12/1/99 rgillam Complete port from Java.
|
||||
* 01/13/2000 helena Added UErrorCode to ctors.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#ifndef DBBI_TBL_H
|
||||
#define DBBI_TBL_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "brkdict.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/* forward declaration */
|
||||
class DictionaryBasedBreakIterator;
|
||||
|
||||
//
|
||||
// DictionaryBasedBreakIteratorTables
|
||||
//
|
||||
// This class sits between instances of DictionaryBasedBreakIterator
|
||||
// and the dictionary data itself, which is of type BreakDictionary.
|
||||
// It provides reference counting, allowing multiple copies of a
|
||||
// DictionaryBasedBreakIterator to share a single instance of
|
||||
// BreakDictionary.
|
||||
//
|
||||
// TODO: it'd probably be cleaner to add the reference counting to
|
||||
// BreakDictionary and get rid of this class, but doing it this way
|
||||
// was a convenient transition from earlier code, and time is short...
|
||||
//
|
||||
class DictionaryBasedBreakIteratorTables : public UMemory {
|
||||
|
||||
private:
|
||||
int32_t fRefCount;
|
||||
|
||||
|
||||
public:
|
||||
//=======================================================================
|
||||
// constructor
|
||||
//=======================================================================
|
||||
/* @param dictionaryFilename The name of the dictionary file
|
||||
* @param status The error code
|
||||
* @return the newly created DictionaryBasedBreakIteratorTables
|
||||
**/
|
||||
DictionaryBasedBreakIteratorTables(const char* dictionaryFilename,
|
||||
UErrorCode& status);
|
||||
|
||||
BreakDictionary *fDictionary;
|
||||
void addReference();
|
||||
void removeReference();
|
||||
/**
|
||||
* Destructor. Should not be used directly. Use removeReference() istead.
|
||||
* (Not private to avoid compiler warnings.)
|
||||
*/
|
||||
virtual ~DictionaryBasedBreakIteratorTables();
|
||||
|
||||
private:
|
||||
/**
|
||||
* The copy constructor is declared private and not implemented.
|
||||
* THIS CLASS MAY NOT BE COPIED.
|
||||
* @param that The DictionaryBasedBreakIteratorTables to be copied.
|
||||
* @return the newly constructed DictionaryBasedBreakIteratorTables.
|
||||
*/
|
||||
DictionaryBasedBreakIteratorTables(const DictionaryBasedBreakIteratorTables& that);
|
||||
|
||||
//=======================================================================
|
||||
// boilerplate
|
||||
//=======================================================================
|
||||
|
||||
|
||||
/**
|
||||
* The assignment operator is declared private and not implemented.
|
||||
* THIS CLASS MAY NOT BE COPIED.
|
||||
* Call addReference() and share an existing copy instead.
|
||||
* @that The object to be copied
|
||||
* @return the newly created DictionaryBasedBreakIteratorTables.
|
||||
*/
|
||||
DictionaryBasedBreakIteratorTables& operator=(
|
||||
const DictionaryBasedBreakIteratorTables& that);
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
413
icu4c/source/common/dictbe.cpp
Normal file
413
icu4c/source/common/dictbe.cpp
Normal file
|
@ -0,0 +1,413 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and others. *
|
||||
* All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "brkeng.h"
|
||||
#include "dictbe.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/chariter.h"
|
||||
#include "unicode/ubrk.h"
|
||||
#include "uvector.h"
|
||||
#include "triedict.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/*
|
||||
******************************************************************
|
||||
*/
|
||||
|
||||
// Default constructor: the engine starts with no break types enabled.
DictionaryBreakEngine::DictionaryBreakEngine()
    : fTypes(0)
{
}
|
||||
|
||||
// Constructs an engine for the given set of break types.
// breakTypes is a bit mask: findBreaks() tests bit number "breakType" in it.
DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes)
    : fTypes(breakTypes)
{
}
|
||||
|
||||
// Destructor: nothing to release at this level.
DictionaryBreakEngine::~DictionaryBreakEngine()
{
}
|
||||
|
||||
// Reports whether this engine's character set contains c.
// NOTE(review): breakType is accepted but not consulted here; only the
// character set decides the answer.
UBool
DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const
{
    const UBool inSet = fSet.contains(c);
    return inSet;
}
|
||||
|
||||
// Finds the run of characters belonging to this engine's set, starting at the
// text's current position, and (if the break type is one this engine serves)
// asks divideUpDictionaryRange() to break it up.
//
// @param text        Text iterator, positioned where the scan should begin.
// @param startPos    Lower limit of the scan when going backwards.
// @param endPos      Upper limit of the scan when going forwards.
// @param reverse     TRUE to scan backwards from the current position.
// @param breakType   Bit number looked up in fTypes.
// @param foundBreaks Handed on to divideUpDictionaryRange().
// @return            Whatever divideUpDictionaryRange() returns, or 0 if the
//                    break type is not handled.
int32_t
DictionaryBreakEngine::findBreaks( CharacterIterator *text,
                                 int32_t startPos,
                                 int32_t endPos,
                                 UBool reverse,
                                 int32_t breakType,
                                 UStack &foundBreaks ) const {
    // Find the span of characters included in the set.
    const int32_t origin = text->getIndex();
    int32_t scanPos;
    int32_t spanStart;
    int32_t spanEnd;
    UChar32 ch = text->current32();

    if (!reverse) {
        // Walk forward while inside the limit and inside the set.
        for (;;) {
            scanPos = text->getIndex();
            if (scanPos >= endPos || !fSet.contains(ch)) {
                break;
            }
            ch = text->next32();
        }
        spanStart = origin;
        spanEnd = scanPos;
    }
    else {
        // Walk backward while above the limit and still on a set character.
        UBool inSet = fSet.contains(ch);
        for (;;) {
            scanPos = text->getIndex();
            if (scanPos <= startPos || !inSet) {
                break;
            }
            ch = text->previous32();
            inSet = fSet.contains(ch);
        }
        if (scanPos < startPos) {
            spanStart = startPos;
        }
        else {
            // When the scan stopped on a character outside the set, the
            // span begins one past it.
            spanStart = scanPos + (inSet ? 0 : 1);
        }
        spanEnd = origin + 1;
    }

    int32_t found = 0;
    const UBool typeHandled =
        breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes);
    if (typeHandled) {
        found = divideUpDictionaryRange(text, spanStart, spanEnd, foundBreaks);
        // Leave the text where the scan stopped.
        text->setIndex(scanPos);
    }

    return found;
}
|
||||
|
||||
void
|
||||
DictionaryBreakEngine::setCharacters( UnicodeSet &set ) {
|
||||
fSet = set;
|
||||
}
|
||||
|
||||
void
|
||||
DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) {
|
||||
fTypes = breakTypes;
|
||||
}
|
||||
|
||||
/*
|
||||
******************************************************************
|
||||
*/
|
||||
|
||||
|
||||
// Helper class for improving readability of the Thai word break
|
||||
// algorithm. The implementation is completely inline.
|
||||
|
||||
// List size, limited by the maximum number of words in the dictionary
|
||||
// that form a nested sequence.
|
||||
#define POSSIBLE_WORD_LIST_MAX 20
|
||||
|
||||
class PossibleWord {
|
||||
private:
|
||||
// list of word candidate lengths, in increasing length order
|
||||
int32_t lengths[POSSIBLE_WORD_LIST_MAX];
|
||||
int count; // Count of candidates
|
||||
int32_t prefix; // The longest match with a dictionary word
|
||||
int32_t offset; // Offset in the text of these candidates
|
||||
int mark; // The preferred candidate's offset
|
||||
int current; // The candidate we're currently looking at
|
||||
|
||||
public:
|
||||
PossibleWord();
|
||||
~PossibleWord();
|
||||
|
||||
// Fill the list of candidates if needed, select the longest, and return the number found
|
||||
int candidates( CharacterIterator *text, const TrieWordDictionary *dict, int32_t rangeEnd );
|
||||
|
||||
// Select the currently marked candidate, point after it in the text, and invalidate self
|
||||
int32_t acceptMarked( CharacterIterator *text );
|
||||
|
||||
// Back up from the current candidate to the next shorter one; return TRUE if that exists
|
||||
// and point the text after it
|
||||
UBool backUp( CharacterIterator *text );
|
||||
|
||||
// Return the longest prefix this candidate location shares with a dictionary word
|
||||
int32_t longestPrefix();
|
||||
|
||||
// Mark the current candidate as the one we like
|
||||
void markCurrent();
|
||||
};
|
||||
|
||||
inline
|
||||
PossibleWord::PossibleWord() {
|
||||
offset = -1;
|
||||
}
|
||||
|
||||
inline
|
||||
PossibleWord::~PossibleWord() {
|
||||
}
|
||||
|
||||
inline int
|
||||
PossibleWord::candidates( CharacterIterator *text, const TrieWordDictionary *dict, int32_t rangeEnd ) {
|
||||
// TODO: If getIndex is too slow, use offset < 0 and add discardAll()
|
||||
int32_t start = text->getIndex();
|
||||
if (start != offset) {
|
||||
offset = start;
|
||||
prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
|
||||
// Dictionary leaves text after longest prefix, not longest word. Back up.
|
||||
if (count <= 0) {
|
||||
text->setIndex(start);
|
||||
}
|
||||
}
|
||||
if (count > 0) {
|
||||
text->setIndex(start+lengths[count-1]);
|
||||
}
|
||||
current = count-1;
|
||||
mark = current;
|
||||
return count;
|
||||
}
|
||||
|
||||
// Moves the text to just after the marked candidate and returns that
// candidate's length.
inline int32_t
PossibleWord::acceptMarked( CharacterIterator *text )
{
    const int32_t chosenLength = lengths[mark];
    text->setIndex(offset + chosenLength);
    return chosenLength;
}
|
||||
|
||||
// Steps to the next shorter candidate.  Returns FALSE (and leaves the text
// alone) when the current candidate is already the shortest.
inline UBool
PossibleWord::backUp( CharacterIterator *text )
{
    if (current <= 0) {
        return FALSE;
    }
    --current;
    text->setIndex(offset + lengths[current]);
    return TRUE;
}
|
||||
|
||||
// Returns the prefix value stored by the most recent dictionary lookup in
// candidates().
inline int32_t
PossibleWord::longestPrefix()
{
    return this->prefix;
}
|
||||
|
||||
inline void
|
||||
PossibleWord::markCurrent() {
|
||||
mark = current;
|
||||
}
|
||||
|
||||
// How many words in a row are "good enough"?
|
||||
#define THAI_LOOKAHEAD 3
|
||||
|
||||
// Will not combine a non-word with a preceding dictionary word longer than this
|
||||
#define THAI_ROOT_COMBINE_THRESHOLD 3
|
||||
|
||||
// Will not combine a non-word that shares at least this much prefix with a
|
||||
// dictionary word, with a preceding word
|
||||
#define THAI_PREFIX_COMBINE_THRESHOLD 3
|
||||
|
||||
// Elision character
|
||||
#define THAI_PAIYANNOI 0x0E2F
|
||||
|
||||
// Repeat character
|
||||
#define THAI_MAIYAMOK 0x0E46
|
||||
|
||||
// Minimum word size
|
||||
#define THAI_MIN_WORD 2
|
||||
|
||||
// Minimum number of characters for two words
|
||||
#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)
|
||||
|
||||
// Constructs the Thai engine for word and line breaking.
//
// @param adoptDictionary  Dictionary to use; the destructor deletes it.
// @param status           Receives any error from applying the set patterns.
ThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status)
    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
      fDictionary(adoptDictionary)
{
    // Sets built from fixed code points.
    fSuffixSet.add(THAI_PAIYANNOI);
    fSuffixSet.add(THAI_MAIYAMOK);
    fBeginWordSet.add(0x0E01, 0x0E2E);          // KO KAI through HO NOKHUK
    fBeginWordSet.add(0x0E40, 0x0E44);          // SARA E through SARA AI MAIMALAI

    // Sets built from patterns.  The Thai word set becomes the engine's
    // character set only if its pattern applied cleanly.
    UnicodeString thaiSet("[[:Thai:]&[:LineBreak=SA:]]", -1, US_INV);
    UnicodeString markSet("[[:Thai:]&[:LineBreak=SA:]&[:M:]]", -1, US_INV);
    fThaiWordSet.applyPattern(thaiSet, status);
    if (U_SUCCESS(status)) {
        setCharacters(fThaiWordSet);
    }
    fMarkSet.applyPattern(markSet, status);

    // Characters that may end a word: the Thai word set minus a few.
    fEndWordSet = fThaiWordSet;
    fEndWordSet.remove(0x0E31);                 // MAI HAN-AKAT
    fEndWordSet.remove(0x0E40, 0x0E44);         // SARA E through SARA AI MAIMALAI
}
|
||||
|
||||
// Deletes the dictionary handed to the constructor.
ThaiBreakEngine::~ThaiBreakEngine()
{
    delete this->fDictionary;
}
|
||||
|
||||
int32_t
|
||||
ThaiBreakEngine::divideUpDictionaryRange( CharacterIterator *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UStack &foundBreaks ) const {
|
||||
if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
|
||||
return 0; // Not enough characters for two words
|
||||
}
|
||||
|
||||
uint32_t wordsFound = 0;
|
||||
int32_t wordLength;
|
||||
int32_t current;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
PossibleWord words[THAI_LOOKAHEAD];
|
||||
UChar32 uc;
|
||||
|
||||
text->setIndex(rangeStart);
|
||||
|
||||
while (U_SUCCESS(status) && (current = text->getIndex()) < rangeEnd) {
|
||||
wordLength = 0;
|
||||
|
||||
// Look for candidate words at the current position
|
||||
int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
|
||||
|
||||
// If we found exactly one, use that
|
||||
if (candidates == 1) {
|
||||
wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
|
||||
wordsFound += 1;
|
||||
}
|
||||
|
||||
// If there was more than one, see which one can take us forward the most words
|
||||
else if (candidates > 1) {
|
||||
// If we're already at the end of the range, we're done
|
||||
if (text->getIndex() >= rangeEnd) {
|
||||
goto foundBest;
|
||||
}
|
||||
do {
|
||||
int wordsMatched = 1;
|
||||
if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
|
||||
if (wordsMatched < 2) {
|
||||
// Followed by another dictionary word; mark first word as a good candidate
|
||||
words[wordsFound%THAI_LOOKAHEAD].markCurrent();
|
||||
wordsMatched = 2;
|
||||
}
|
||||
|
||||
// If we're already at the end of the range, we're done
|
||||
if (text->getIndex() >= rangeEnd) {
|
||||
goto foundBest;
|
||||
}
|
||||
|
||||
// See if any of the possible second words is followed by a third word
|
||||
do {
|
||||
// If we find a third word, stop right away
|
||||
if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
|
||||
words[wordsFound%THAI_LOOKAHEAD].markCurrent();
|
||||
goto foundBest;
|
||||
}
|
||||
}
|
||||
while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(text));
|
||||
}
|
||||
}
|
||||
while (words[wordsFound%THAI_LOOKAHEAD].backUp(text));
|
||||
foundBest:
|
||||
wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
|
||||
wordsFound += 1;
|
||||
}
|
||||
|
||||
// We come here after having either found a word or not. We look ahead to the
|
||||
// next word. If it's not a dictionary word, we will combine it with the word we
|
||||
// just found (if there is one), but only if the preceding word does not exceed
|
||||
// the threshold.
|
||||
// The text iterator should now be positioned at the end of the word we found.
|
||||
if (text->getIndex() < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
|
||||
// if it is a dictionary word, do nothing. If it isn't, then if there is
|
||||
// no preceding word, or the non-word shares less than the minimum threshold
|
||||
// of characters with a dictionary word, then scan to resynchronize
|
||||
if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
|
||||
&& (wordLength == 0
|
||||
|| words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
|
||||
// Look for a plausible word boundary
|
||||
//TODO: This section will need a rework for UText.
|
||||
int32_t remaining = rangeEnd - (current+wordLength);
|
||||
UChar32 pc = text->current32();
|
||||
int32_t chars = 0;
|
||||
while (TRUE) {
|
||||
uc = text->next32();
|
||||
// TODO: Here we're counting on the fact that the SA languages are all
|
||||
// in the BMP. This should get fixed with the UText rework.
|
||||
chars += 1;
|
||||
if (--remaining <= 0) {
|
||||
break;
|
||||
}
|
||||
if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
|
||||
// Maybe. See if it's in the dictionary.
|
||||
// NOTE: In the original Apple code, checked that the next
|
||||
// two characters after uc were not 0x0E4C THANTHAKHAT before
|
||||
// checking the dictionary. That is just a performance filter,
|
||||
// but it's not clear it's faster than checking the trie.
|
||||
int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
|
||||
text->setIndex(current+wordLength+chars);
|
||||
if (candidates > 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
pc = uc;
|
||||
}
|
||||
|
||||
// Bump the word count if there wasn't already one
|
||||
if (wordLength <= 0) {
|
||||
wordsFound += 1;
|
||||
}
|
||||
|
||||
// Update the length with the passed-over characters
|
||||
wordLength += chars;
|
||||
}
|
||||
else {
|
||||
// Back up to where we were for next iteration
|
||||
text->setIndex(current+wordLength);
|
||||
}
|
||||
}
|
||||
|
||||
// Never stop before a combining mark.
|
||||
int32_t currPos;
|
||||
while ((currPos = text->getIndex()) < rangeEnd && fMarkSet.contains(text->current32())) {
|
||||
wordLength += text->move32(1, CharacterIterator::kCurrent) - currPos;
|
||||
}
|
||||
|
||||
// Look ahead for possible suffixes if a dictionary word does not follow.
|
||||
// We do this in code rather than using a rule so that the heuristic
|
||||
// resynch continues to function. For example, one of the suffix characters
|
||||
// could be a typo in the middle of a word.
|
||||
if (text->getIndex() < rangeEnd && wordLength > 0) {
|
||||
if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
|
||||
&& fSuffixSet.contains(uc = text->current32())) {
|
||||
if (uc == THAI_PAIYANNOI) {
|
||||
if (!fSuffixSet.contains(text->previous32())) {
|
||||
// Skip over previous end and PAIYANNOI
|
||||
text->move32(2, CharacterIterator::kCurrent);
|
||||
wordLength += 1; // Add PAIYANNOI to word
|
||||
uc = text->current32(); // Fetch next character
|
||||
}
|
||||
else {
|
||||
// Restore prior position
|
||||
text->move32(1, CharacterIterator::kCurrent);
|
||||
}
|
||||
}
|
||||
if (uc == THAI_MAIYAMOK) {
|
||||
if (text->previous32() != THAI_MAIYAMOK) {
|
||||
// Skip over previous end and MAIYAMOK
|
||||
text->move32(2, CharacterIterator::kCurrent);
|
||||
wordLength += 1; // Add MAIYAMOK to word
|
||||
}
|
||||
else {
|
||||
// Restore prior position
|
||||
text->move32(1, CharacterIterator::kCurrent);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
text->setIndex(current+wordLength);
|
||||
}
|
||||
}
|
||||
|
||||
// Did we find a word on this iteration? If so, push it on the break stack
|
||||
if (wordLength > 0) {
|
||||
foundBreaks.push((current+wordLength), status);
|
||||
}
|
||||
}
|
||||
|
||||
// Don't return a break for the end of the dictionary range if there is one there.
|
||||
if (foundBreaks.peeki() >= rangeEnd) {
|
||||
(void) foundBreaks.popi();
|
||||
wordsFound -= 1;
|
||||
}
|
||||
|
||||
return wordsFound;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
192
icu4c/source/common/dictbe.h
Normal file
192
icu4c/source/common/dictbe.h
Normal file
|
@ -0,0 +1,192 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and others. *
|
||||
* All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
#ifndef DICTBE_H
|
||||
#define DICTBE_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "brkeng.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class CharacterIterator;
|
||||
class TrieWordDictionary;
|
||||
|
||||
/*******************************************************************
|
||||
* DictionaryBreakEngine
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
|
||||
* dictionary to determine language-specific breaks.</p>
|
||||
*
|
||||
* <p>After it is constructed a DictionaryBreakEngine may be shared between
|
||||
* threads without synchronization.</p>
|
||||
*/
|
||||
class U_COMMON_API DictionaryBreakEngine : public LanguageBreakEngine {
|
||||
private:
|
||||
/**
|
||||
* The set of characters handled by this engine
|
||||
* @internal
|
||||
*/
|
||||
|
||||
UnicodeSet fSet;
|
||||
|
||||
/**
|
||||
* The set of break types handled by this engine (a bitmap, see the
* breakTypes constructor parameter)
|
||||
* @internal
|
||||
*/
|
||||
|
||||
uint32_t fTypes;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Default constructor.</p>
|
||||
*
|
||||
*/
|
||||
DictionaryBreakEngine();
|
||||
|
||||
/**
|
||||
* <p>Constructor setting the break types handled.</p>
|
||||
*
|
||||
* @param breakTypes A bitmap of types handled by the engine.
|
||||
*/
|
||||
DictionaryBreakEngine( uint32_t breakTypes );
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~DictionaryBreakEngine();
|
||||
|
||||
/**
|
||||
* <p>Indicate whether this engine handles a particular character for
|
||||
* a particular kind of break.</p>
|
||||
*
|
||||
* @param c A character which begins a run that the engine might handle
|
||||
* @param breakType The type of text break which the caller wants to determine
|
||||
* @return TRUE if this engine handles the particular character and break
|
||||
* type.
|
||||
*/
|
||||
virtual UBool handles( UChar32 c, int32_t breakType ) const;
|
||||
|
||||
/**
|
||||
* <p>Find any breaks within a run in the supplied text.</p>
|
||||
*
|
||||
* @param text A CharacterIterator representing the text (TODO: UText). The
|
||||
* iterator is left at the end of the run of characters which the engine
|
||||
* is capable of handling.
|
||||
* @param startPos The start of the run within the supplied text.
|
||||
* @param endPos The end of the run within the supplied text.
|
||||
* @param reverse Whether the caller is looking for breaks in a reverse
|
||||
* direction.
|
||||
* @param breakType The type of break desired, or -1.
|
||||
* @param foundBreaks A UStack (not a C array) for the breaks found, if any
|
||||
* @return The number of breaks found.
|
||||
*/
|
||||
virtual int32_t findBreaks( CharacterIterator *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UBool reverse,
|
||||
int32_t breakType,
|
||||
UStack &foundBreaks ) const;
|
||||
|
||||
protected:
|
||||
|
||||
/**
|
||||
* <p>Set the character set handled by this engine.</p>
|
||||
*
|
||||
* @param set A UnicodeSet of the set of characters handled by the engine
|
||||
*/
|
||||
virtual void setCharacters( UnicodeSet &set );
|
||||
|
||||
/**
|
||||
* <p>Set the break types handled by this engine.</p>
|
||||
*
|
||||
* @param breakTypes A bitmap of types handled by the engine.
|
||||
*/
|
||||
virtual void setBreakTypes( uint32_t breakTypes );
|
||||
|
||||
/**
|
||||
* <p>Divide up a range of known dictionary characters.</p>
|
||||
*
|
||||
* <p>Pure virtual: every concrete subclass must supply its own
* implementation.</p>
*
* @param text A CharacterIterator representing the text
|
||||
* @param rangeStart The start of the range of dictionary characters
|
||||
* @param rangeEnd The end of the range of dictionary characters
|
||||
* @param foundBreaks Output UStack of int32_t break positions
|
||||
* @return The number of breaks found
|
||||
*/
|
||||
virtual int32_t divideUpDictionaryRange( CharacterIterator *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UStack &foundBreaks ) const = 0;
|
||||
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* ThaiBreakEngine
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
|
||||
* TrieWordDictionary and heuristics to determine Thai-specific breaks.</p>
|
||||
*
|
||||
* <p>After it is constructed a ThaiBreakEngine may be shared between
|
||||
* threads without synchronization.</p>
|
||||
*/
|
||||
class U_COMMON_API ThaiBreakEngine : public DictionaryBreakEngine {
|
||||
private:
|
||||
/**
|
||||
* Character sets and the word dictionary consulted when dividing up a
* range of Thai text. As used by divideUpDictionaryRange():
* fEndWordSet and fBeginWordSet are tested against a pair of adjacent
* characters to spot a plausible word boundary; a break is never left
* in front of a character in fMarkSet; fSuffixSet holds the suffix
* characters that may be attached to the preceding word; fDictionary
* is the dictionary searched for candidate words.
* NOTE(review): the role of fThaiWordSet is not visible from this header
* - confirm against the constructor in dictbe.cpp.
|
||||
* @internal
|
||||
*/
|
||||
|
||||
UnicodeSet fThaiWordSet;
|
||||
UnicodeSet fEndWordSet;
|
||||
UnicodeSet fBeginWordSet;
|
||||
UnicodeSet fSuffixSet;
|
||||
UnicodeSet fMarkSet;
|
||||
const TrieWordDictionary *fDictionary;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Constructor taking the dictionary to use.</p>
|
||||
*
|
||||
* @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
|
||||
* engine is deleted.
|
||||
* @param status Error code reference supplied by the caller.
*/
|
||||
ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~ThaiBreakEngine();
|
||||
|
||||
protected:
|
||||
/**
|
||||
* <p>Divide up a range of known dictionary characters.</p>
|
||||
*
|
||||
* @param text A CharacterIterator representing the text
|
||||
* @param rangeStart The start of the range of dictionary characters
|
||||
* @param rangeEnd The end of the range of dictionary characters
|
||||
* @param foundBreaks Output UStack onto which int32_t break positions are pushed
|
||||
* @return The number of breaks found
|
||||
*/
|
||||
virtual int32_t divideUpDictionaryRange( CharacterIterator *text,
|
||||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UStack &foundBreaks ) const;
|
||||
|
||||
};
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
/* DICTBE_H */
|
||||
#endif
|
|
@ -22,8 +22,12 @@
|
|||
#include "rbbirb.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "mutex.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "brkeng.h"
|
||||
|
||||
#include "uassert.h"
|
||||
#include "uvector.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -133,6 +137,18 @@ RuleBasedBreakIterator::~RuleBasedBreakIterator() {
|
|||
fData->removeReference();
|
||||
fData = NULL;
|
||||
}
|
||||
if (fCachedBreakPositions) {
|
||||
uprv_free(fCachedBreakPositions);
|
||||
fCachedBreakPositions = NULL;
|
||||
}
|
||||
if (fLanguageBreakEngines) {
|
||||
delete fLanguageBreakEngines;
|
||||
fLanguageBreakEngines = NULL;
|
||||
}
|
||||
if (fUnhandledBreakEngine) {
|
||||
delete fUnhandledBreakEngine;
|
||||
fUnhandledBreakEngine = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -144,6 +160,13 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
|
|||
if (this == &that) {
|
||||
return *this;
|
||||
}
|
||||
reset(); // Delete break cache information
|
||||
fBreakType = that.fBreakType;
|
||||
if (fLanguageBreakEngines != NULL) {
|
||||
delete fLanguageBreakEngines;
|
||||
fLanguageBreakEngines = NULL; // Just rebuild for now
|
||||
}
|
||||
// TODO: clone fLanguageBreakEngines from "that"
|
||||
delete fText;
|
||||
fText = NULL;
|
||||
if (that.fText != NULL) {
|
||||
|
@ -178,6 +201,13 @@ void RuleBasedBreakIterator::init() {
|
|||
fLastRuleStatusIndex = 0;
|
||||
fLastStatusIndexValid = TRUE;
|
||||
fDictionaryCharCount = 0;
|
||||
fBreakType = -1;
|
||||
|
||||
fCachedBreakPositions = NULL;
|
||||
fLanguageBreakEngines = NULL;
|
||||
fUnhandledBreakEngine = NULL;
|
||||
fNumCachedBreakPositions = 0;
|
||||
fPositionInCache = 0;
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
static UBool debugInitDone = FALSE;
|
||||
|
@ -374,7 +404,7 @@ int32_t RuleBasedBreakIterator::last(void) {
|
|||
int32_t RuleBasedBreakIterator::next(int32_t n) {
|
||||
int32_t result = current();
|
||||
while (n > 0) {
|
||||
result = handleNext();
|
||||
result = next();
|
||||
--n;
|
||||
}
|
||||
while (n < 0) {
|
||||
|
@ -389,7 +419,25 @@ int32_t RuleBasedBreakIterator::next(int32_t n) {
|
|||
* @return The position of the first boundary after this one.
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::next(void) {
    // Advance to the boundary following the current position and return it.
    //
    // The pre-change body ("return handleNext();") must not precede this
    // logic: left in place it returns unconditionally and makes the cache
    // lookup and the dictionary check below unreachable.

    // if we have cached break positions and we're still in the range
    // covered by them, just move one step forward in the cache
    if (fCachedBreakPositions != NULL) {
        if (fPositionInCache < fNumCachedBreakPositions - 1) {
            ++fPositionInCache;
            fText->setIndex(fCachedBreakPositions[fPositionInCache]);
            return fCachedBreakPositions[fPositionInCache];
        }
        else {
            // Walked off the end of the cached range; discard the cache and
            // fall back to the rule-based state machine.
            reset();
        }
    }

    int32_t startPos = current();
    int32_t result = handleNext(fData->fForwardTable);
    // Only consult the dictionary machinery when the rule-based step
    // reported dictionary characters.
    if (fDictionaryCharCount > 0) {
        result = checkDictionary(startPos, result, FALSE);
    }
    return result;
}
|
||||
|
||||
/**
|
||||
|
@ -397,15 +445,35 @@ int32_t RuleBasedBreakIterator::next(void) {
|
|||
* @return The position of the last boundary position preceding this one.
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::previous(void) {
|
||||
int32_t result;
|
||||
int32_t startPos;
|
||||
|
||||
// if we have cached break positions and we're still in the range
|
||||
// covered by them, just move one step backward in the cache
|
||||
if (fCachedBreakPositions != NULL) {
|
||||
if (fPositionInCache > 0) {
|
||||
--fPositionInCache;
|
||||
fText->setIndex(fCachedBreakPositions[fPositionInCache]);
|
||||
return fCachedBreakPositions[fPositionInCache];
|
||||
}
|
||||
else {
|
||||
reset();
|
||||
}
|
||||
}
|
||||
|
||||
// if we're already sitting at the beginning of the text, return DONE
|
||||
if (fText == NULL || current() == fText->startIndex()) {
|
||||
if (fText == NULL || (startPos = current()) == fText->startIndex()) {
|
||||
fLastRuleStatusIndex = 0;
|
||||
fLastStatusIndexValid = TRUE;
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
|
||||
if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
|
||||
return handlePrevious(fData->fReverseTable);
|
||||
result = handlePrevious(fData->fReverseTable);
|
||||
if (fDictionaryCharCount > 0) {
|
||||
result = checkDictionary(result, startPos, TRUE);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// old rule syntax
|
||||
|
@ -424,7 +492,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
|||
lastResult = fText->startIndex();
|
||||
fText->setIndex(lastResult);
|
||||
}
|
||||
int32_t result = lastResult;
|
||||
result = lastResult;
|
||||
int32_t lastTag = 0;
|
||||
UBool breakTagValid = FALSE;
|
||||
|
||||
|
@ -433,7 +501,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
|||
// point is our return value
|
||||
|
||||
for (;;) {
|
||||
result = handleNext();
|
||||
result = next();
|
||||
if (result == BreakIterator::DONE || result >= start) {
|
||||
break;
|
||||
}
|
||||
|
@ -445,7 +513,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
|||
// fLastBreakTag wants to have the value for section of text preceding
|
||||
// the result position that we are to return (in lastResult.) If
|
||||
// the backwards rules overshot and the above loop had to do two or more
|
||||
// handleNext()s to move up to the desired return position, we will have a valid
|
||||
// next()s to move up to the desired return position, we will have a valid
|
||||
// tag value. But, if handlePrevious() took us to exactly the correct result position,
|
||||
// we won't have a tag value for that position, which is only set by handleNext().
|
||||
|
||||
|
@ -454,6 +522,10 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
|||
fText->setIndex(lastResult);
|
||||
fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
|
||||
fLastStatusIndexValid = breakTagValid;
|
||||
|
||||
// No need to check the dictionary; it will have been handled by
|
||||
// next()
|
||||
|
||||
return lastResult;
|
||||
}
|
||||
|
||||
|
@ -464,6 +536,25 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
|||
* @return The position of the first break after the current position.
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::following(int32_t offset) {
|
||||
// if we have cached break positions and offset is in the range
|
||||
// covered by them, use them
|
||||
// TODO: could use binary search
|
||||
// TODO: what if offset is outside range, but break is not?
|
||||
if (fCachedBreakPositions != NULL) {
|
||||
if (offset >= fCachedBreakPositions[0]
|
||||
&& offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
|
||||
fPositionInCache = 0;
|
||||
// We are guaranteed not to leave the array due to range test above
|
||||
while (offset >= fCachedBreakPositions[fPositionInCache])
|
||||
++fPositionInCache;
|
||||
fText->setIndex(fCachedBreakPositions[fPositionInCache]);
|
||||
return fCachedBreakPositions[fPositionInCache];
|
||||
}
|
||||
else {
|
||||
reset();
|
||||
}
|
||||
}
|
||||
|
||||
// if the offset passed in is already past the end of the text,
|
||||
// just return DONE; if it's before the beginning, return the
|
||||
// text's starting offset
|
||||
|
@ -533,7 +624,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
|
|||
|
||||
fText->setIndex(offset);
|
||||
if (offset == fText->startIndex()) {
|
||||
return handleNext();
|
||||
return next();
|
||||
}
|
||||
result = previous();
|
||||
|
||||
|
@ -551,6 +642,26 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
|
|||
* @return The position of the last boundary before the starting position.
|
||||
*/
|
||||
int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
|
||||
// if we have cached break positions and offset is in the range
|
||||
// covered by them, use them
|
||||
if (fCachedBreakPositions != NULL) {
|
||||
// TODO: binary search?
|
||||
// TODO: What if offset is outside range, but break is not?
|
||||
if (offset > fCachedBreakPositions[0]
|
||||
&& offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
|
||||
fPositionInCache = 0;
|
||||
while (fPositionInCache < fNumCachedBreakPositions
|
||||
&& offset > fCachedBreakPositions[fPositionInCache])
|
||||
++fPositionInCache;
|
||||
--fPositionInCache;
|
||||
fText->setIndex(fCachedBreakPositions[fPositionInCache]);
|
||||
return fCachedBreakPositions[fPositionInCache];
|
||||
}
|
||||
else {
|
||||
reset();
|
||||
}
|
||||
}
|
||||
|
||||
// if the offset passed in is already past the end of the text,
|
||||
// just return DONE; if it's before the beginning, return the
|
||||
|
||||
|
@ -688,20 +799,6 @@ enum RBBIRunMode {
|
|||
};
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// handleNext(void) All forward iteration vectors through this function.
|
||||
// NOTE: This function is overridden by the dictionary base break iterator.
|
||||
// User level API functions go to the dbbi implementation
|
||||
// when the break iterator type is dbbi.
|
||||
// The DBBI implementation sometimes explicitly calls back to here,
|
||||
// its inherited handleNext().
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
int32_t RuleBasedBreakIterator::handleNext() {
|
||||
// Argument-less convenience form: delegates to the state-table overload,
// passing fData->fForwardTable, and returns whatever that overload returns.
return handleNext(fData->fForwardTable);
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// handleNext(stateTable)
|
||||
|
@ -1125,8 +1222,13 @@ continueOn:
|
|||
void
|
||||
RuleBasedBreakIterator::reset()
|
||||
{
|
||||
// Base-class version of this function is a no-op.
|
||||
// Subclasses may override with their own reset behavior.
|
||||
if (fCachedBreakPositions) {
|
||||
uprv_free(fCachedBreakPositions);
|
||||
}
|
||||
fCachedBreakPositions = NULL;
|
||||
fNumCachedBreakPositions = 0;
|
||||
fDictionaryCharCount = 0;
|
||||
fPositionInCache = 0;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1155,6 +1257,9 @@ void RuleBasedBreakIterator::makeRuleStatusValid() {
|
|||
// Not at start of text. Find status the tedious way.
|
||||
int32_t pa = current();
|
||||
previous();
|
||||
if (fNumCachedBreakPositions > 0) {
|
||||
reset(); // Blow off the dictionary cache
|
||||
}
|
||||
int32_t pb = next();
|
||||
if (pa != pb) {
|
||||
// note: the if (pa != pb) test is here only to eliminate warnings for
|
||||
|
@ -1306,7 +1411,6 @@ BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
|
|||
}
|
||||
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// isDictionaryChar Return true if the category lookup for this char
|
||||
|
@ -1327,6 +1431,305 @@ UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {
|
|||
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// checkDictionary This function handles all processing of characters in
|
||||
// the "dictionary" set. It will determine the appropriate
|
||||
// course of action, and possibly set up a cache in the
|
||||
// process.
|
||||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
                            int32_t endPos,
                            UBool reverse) {
    // startPos / endPos  The lower and upper text offsets of the span the
    //                    rule-based step just covered (startPos < endPos for
    //                    both directions; see the callers next()/previous()).
    // reverse            TRUE when called from backwards iteration.
    // Returns the boundary to report: the proposed one (endPos going
    // forward, startPos going backward) when no language breaks are found,
    // otherwise a position taken from the freshly built break cache.

    // Reset the old break cache first.
    uint32_t dictionaryCount = fDictionaryCharCount;
    reset();

    if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
        return (reverse ? startPos : endPos);
    }

    // Starting from the starting point, scan towards the proposed result,
    // looking for the first dictionary character (which may be the one
    // we're on, if we're starting in the middle of a range).
    fText->setIndex(reverse ? endPos : startPos);
    if (reverse) {
        fText->move32(-1, CharacterIterator::kCurrent);
    }

    int32_t rangeStart = startPos;
    int32_t rangeEnd = endPos;

    uint16_t    category;
    int32_t     current;
    UErrorCode  status = U_ZERO_ERROR;
    UStack      breaks(status);
    int32_t     foundBreakCount = 0;
    UChar32     c = fText->current32();

    UTRIE_GET16(&fData->fTrie, c, category);

    // Is the character we're starting on a dictionary character? If so, we
    // need to back up to include the entire run; otherwise the results of
    // the break algorithm will differ depending on where we start. Since
    // the result is cached and there is typically a non-dictionary break
    // within a small number of words, there should be little performance impact.
    // (The 0x4000 bit of the trie category is what flags a dictionary character.)
    if (category & 0x4000) {
        if (reverse) {
            do {
                c = fText->next32();
                UTRIE_GET16(&fData->fTrie, c, category);
            }
            while (c != CharacterIterator::DONE && (category & 0x4000));
            // Back up to the last dictionary character
            rangeEnd = fText->getIndex();
            if (c == CharacterIterator::DONE) {
                c = fText->last32();
            }
            else {
                c = fText->previous32();
            }
        }
        else {
            do {
                c = fText->previous32();
                UTRIE_GET16(&fData->fTrie, c, category);
            }
            while (c != CharacterIterator::DONE && (category & 0x4000));
            // Back up to the last dictionary character
            if (c == CharacterIterator::DONE) {
                c = fText->first32();
            }
            else {
                c = fText->next32();
            }
            rangeStart = fText->getIndex();
        }
        UTRIE_GET16(&fData->fTrie, c, category);
    }

    // Loop through the text, looking for ranges of dictionary characters.
    // For each span, find the appropriate break engine, and ask it to find
    // any breaks within the span.
    while(U_SUCCESS(status)) {
        if (reverse) {
            while((current = fText->getIndex()) > rangeStart && (category & 0x4000) == 0) {
                c = fText->previous32();
                UTRIE_GET16(&fData->fTrie, c, category);
            }
            if (current <= rangeStart) {
                break;
            }
        }
        else {
            while((current = fText->getIndex()) < rangeEnd && (category & 0x4000) == 0) {
                c = fText->next32();
                UTRIE_GET16(&fData->fTrie, c, category);
            }
            if (current >= rangeEnd) {
                break;
            }
        }

        // We now have a dictionary character. Get the appropriate language object
        // to deal with it.
        const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);

        // Without an engine nothing moves the text position and nothing
        // changes status, so looping again would spin forever on the same
        // character. Give up on the remainder of the range and use whatever
        // breaks were collected so far.
        if (lbe == NULL) {
            break;
        }

        // Ask the language object if there are any breaks. It will leave the text
        // pointer on the other side of its range, ready to search for the next one.
        foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, reverse, fBreakType, breaks);

        // Reload the loop variables for the next go-round
        c = fText->current32();
        UTRIE_GET16(&fData->fTrie, c, category);
    }

    // If we found breaks, build a new break cache. The first and last entries must
    // be the original starting and ending position.
    if (foundBreakCount > 0) {
        int32_t totalBreaks = foundBreakCount;
        if (startPos < breaks.elementAti(0)) {
            totalBreaks += 1;
        }
        if (endPos > breaks.peeki()) {
            totalBreaks += 1;
        }
        fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
        if (fCachedBreakPositions != NULL) {
            int32_t out = 0;
            fNumCachedBreakPositions = totalBreaks;
            if (startPos < breaks.elementAti(0)) {
                fCachedBreakPositions[out++] = startPos;
            }
            for (int32_t i = 0; i < foundBreakCount; ++i) {
                fCachedBreakPositions[out++] = breaks.elementAti(i);
            }
            if (endPos > fCachedBreakPositions[out-1]) {
                fCachedBreakPositions[out] = endPos;
            }
            // If there are breaks, then by definition, we are replacing the original
            // proposed break by one of the breaks we found. Use following() and
            // preceding() to do the work. They should never recurse in this case.
            if (reverse) {
                return preceding(endPos - 1);
            }
            else {
                return following(startPos);
            }
        }
        // If the allocation failed, just fall through to the "no breaks found" case.
    }

    // If we get here, there were no language-based breaks. As a result, the
    // text pointer should be back to where it started, but set it just to
    // make sure.
    fText->setIndex(reverse ? startPos : endPos);
    return (reverse ? startPos : endPos);
}
|
||||
|
||||
static UStack *gLanguageBreakFactories = NULL;
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
// defined in ucln_cmn.h
|
||||
|
||||
/**
|
||||
* Release all static memory held by breakiterator.
|
||||
*/
|
||||
U_CDECL_BEGIN
|
||||
static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
    // Nothing was ever created: report success straight away.
    if (gLanguageBreakFactories == NULL) {
        return TRUE;
    }
    // Destroy the factory stack and clear the global pointer.
    delete gLanguageBreakFactories;
    gLanguageBreakFactories = NULL;
    return TRUE;
}
|
||||
U_CDECL_END
|
||||
|
||||
U_CDECL_BEGIN
|
||||
static void U_CALLCONV _deleteFactory(void *obj) {
    // Element deleter handed to the UStack of language break factories:
    // recover the static type so delete runs on a LanguageBreakFactory.
    LanguageBreakFactory *factory = static_cast<LanguageBreakFactory *>(obj);
    delete factory;
}
|
||||
U_CDECL_END
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Ask the registered language break factories for an engine that handles
// character c for the given break type. The global factory list is created
// on first use.
//
// Returns the engine supplied by the first factory (searching from the most
// recently pushed one) that accepts the character, or NULL if no factory
// does or the factory list could not be created.
static const LanguageBreakEngine*
getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
{
    UBool       needsInit;
    UErrorCode  status = U_ZERO_ERROR;
    umtx_lock(NULL);
    needsInit = (UBool)(gLanguageBreakFactories == NULL);
    umtx_unlock(NULL);

    if (needsInit) {
        // Build the list outside the lock; it is only published below if no
        // other caller installed one in the meantime.
        UStack  *factories = new UStack(_deleteFactory, NULL, status);
        // The allocation itself may yield NULL without touching status, so
        // test the pointer as well before using it.
        if (factories != NULL && U_SUCCESS(status)) {
            ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
            // Never store a NULL factory: the lookup loop below calls
            // through every element.
            if (builtIn != NULL) {
                factories->push(builtIn, status);
            }
#ifdef U_LOCAL_SERVICE_HOOK
            LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
            if (extra != NULL) {
                factories->push(extra, status);
            }
#endif
        }
        umtx_lock(NULL);
        if (gLanguageBreakFactories == NULL && factories != NULL) {
            gLanguageBreakFactories = factories;
            factories = NULL;
            ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
        }
        umtx_unlock(NULL);
        // Non-NULL only if the list was not published above.
        delete factories;
    }

    if (gLanguageBreakFactories == NULL) {
        return NULL;
    }

    // Most recently pushed factory gets the first chance.
    int32_t i = gLanguageBreakFactories->size();
    const LanguageBreakEngine *lbe = NULL;
    while (--i >= 0) {
        LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
        lbe = factory->getEngineFor(c, breakType);
        if (lbe != NULL) {
            break;
        }
    }
    return lbe;
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
|
||||
// character c.
|
||||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
// Returns an engine able to handle character c for this iterator's break
// type: an engine already on this iterator's stack, else one obtained from
// the global factories, else the per-iterator "unhandled" engine (which is
// told about c). Returns NULL only when a required allocation fails.
const LanguageBreakEngine *
RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
    const LanguageBreakEngine *lbe = NULL;
    UErrorCode status = U_ZERO_ERROR;

    if (fLanguageBreakEngines == NULL) {
        fLanguageBreakEngines = new UStack(status);
        // A NULL result from new leaves status untouched, so the pointer
        // has to be checked explicitly before it is dereferenced below.
        if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
            delete fLanguageBreakEngines;
            fLanguageBreakEngines = 0;
            return NULL;
        }
    }

    // Search from the top of the stack down, so index 0 is tried last.
    int32_t i = fLanguageBreakEngines->size();
    while (--i >= 0) {
        lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
        if (lbe->handles(c, fBreakType)) {
            return lbe;
        }
    }

    // No existing dictionary took the character. See if a factory wants to
    // give us a new LanguageBreakEngine for this character.
    lbe = getLanguageBreakEngineFromFactory(c, fBreakType);

    // If we got one, use it and push it on our stack.
    if (lbe != NULL) {
        fLanguageBreakEngines->push((void *)lbe, status);
        // Even if we can't remember it, we can keep looking it up, so
        // return it even if the push fails.
        return lbe;
    }

    // No engine is forthcoming for this character. Add it to the
    // reject set. Create the reject break engine if needed.
    if (fUnhandledBreakEngine == NULL) {
        fUnhandledBreakEngine = new UnhandledEngine(status);
        if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        // Put it last so that scripts for which we have an engine get tried
        // first. Only attempt the insertion when creation succeeded, so a
        // failed or NULL engine can never end up stored in the stack.
        if (U_SUCCESS(status)) {
            fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
        }
        // If we can't insert it, or creation failed, get rid of it
        if (U_FAILURE(status)) {
            delete fUnhandledBreakEngine;
            fUnhandledBreakEngine = 0;
            return NULL;
        }
    }

    // Tell the reject engine about the character; at its discretion, it may
    // add more than just the one character.
    fUnhandledBreakEngine->handleCharacter(c, fBreakType);

    return fUnhandledBreakEngine;
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// UText functions As a temporary implementation, create a type of CharacterIterator
|
||||
|
@ -1580,6 +1983,15 @@ UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const
|
|||
return result;
|
||||
}
|
||||
|
||||
// Report the break type currently stored in fBreakType.
int32_t RuleBasedBreakIterator::getBreakType() const {
    return this->fBreakType;
}
|
||||
|
||||
// Store the new break type, then call reset(). The type is stored before
// the reset so that reset() runs with the new value in place.
void RuleBasedBreakIterator::setBreakType(int32_t type) {
    this->fBreakType = type;
    this->reset();
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
|
1380
icu4c/source/common/triedict.cpp
Normal file
1380
icu4c/source/common/triedict.cpp
Normal file
File diff suppressed because it is too large
Load diff
338
icu4c/source/common/triedict.h
Normal file
338
icu4c/source/common/triedict.h
Normal file
|
@ -0,0 +1,338 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2006, International Business Machines Corporation and others. *
|
||||
* All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
#ifndef TRIEDICT_H
|
||||
#define TRIEDICT_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
|
||||
struct UEnumeration;
|
||||
struct UDataSwapper;
|
||||
|
||||
/**
|
||||
* <p>UDataSwapFn function for use in swapping a compact dictionary.</p>
|
||||
*
|
||||
* @param ds Pointer to UDataSwapper containing global data about the
|
||||
* transformation and function pointers for handling primitive
|
||||
* types.
|
||||
* @param inData Pointer to the input data to be transformed or examined.
|
||||
* @param length Length of the data, counting bytes. May be -1 for preflighting.
|
||||
* If length>=0, then transform the data.
|
||||
* If length==-1, then only determine the length of the data.
|
||||
* The length cannot be determined from the data itself for all
|
||||
* types of data (e.g., not for simple arrays of integers).
|
||||
* @param outData Pointer to the output data buffer.
|
||||
* If length>=0 (transformation), then the output buffer must
|
||||
* have a capacity of at least length.
|
||||
* If length==-1, then outData will not be used and can be NULL.
|
||||
* @param pErrorCode ICU UErrorCode parameter, must not be NULL and must
|
||||
* fulfill U_SUCCESS on input.
|
||||
* @return The actual length of the data.
|
||||
*
|
||||
* @see UDataSwapper
|
||||
*/
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
triedict_swap(const UDataSwapper *ds,
|
||||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class CharacterIterator;
|
||||
class UCharCharacterIterator;
|
||||
class StringEnumeration;
|
||||
struct CompactTrieHeader;
|
||||
|
||||
/*******************************************************************
|
||||
* TrieWordDictionary
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>TrieWordDictionary is an abstract class that represents a word
|
||||
* dictionary based on a trie. The base protocol is read-only.
|
||||
* Subclasses may allow writing.</p>
|
||||
*/
|
||||
class U_COMMON_API TrieWordDictionary : public UMemory {
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Default constructor.</p>
|
||||
*
|
||||
*/
|
||||
TrieWordDictionary();
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~TrieWordDictionary();
|
||||
|
||||
/**
|
||||
* <p>Find dictionary words that match the text.</p>
|
||||
*
|
||||
* @param text A CharacterIterator representing the text (TODO: UText). The
|
||||
* iterator is left after the longest prefix match in the dictionary.
|
||||
* Matching begins at the iterator's current position; there is no separate start parameter.
|
||||
* @param maxLength The maximum number of code units to match.
|
||||
* @param lengths An array that is filled with the lengths of words that matched.
|
||||
* @param count Filled with the number of elements output in lengths.
|
||||
* @param limit The size of the lengths array; this limits the number of words output.
|
||||
* @return The number of characters in text that were matched.
|
||||
*/
|
||||
virtual int32_t matches( CharacterIterator *text,
|
||||
int32_t maxLength,
|
||||
int32_t *lengths,
|
||||
int &count,
|
||||
int limit ) const = 0;
|
||||
|
||||
/**
|
||||
* <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
|
||||
*
|
||||
* @param status A status code recording the success of the call.
|
||||
* @return A StringEnumeration that will iterate through the whole dictionary.
|
||||
* The caller is responsible for closing it. The order is unspecified.
|
||||
*/
|
||||
virtual StringEnumeration *openWords( UErrorCode &status ) const = 0;
|
||||
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* MutableTrieDictionary
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>MutableTrieDictionary is a TrieWordDictionary that allows words to be
|
||||
* added.</p>
|
||||
*/
|
||||
|
||||
struct TernaryNode; // Forwards declaration
|
||||
|
||||
class U_COMMON_API MutableTrieDictionary : public TrieWordDictionary {
|
||||
private:
|
||||
/**
|
||||
* The root node of the trie
|
||||
* @internal
|
||||
*/
|
||||
|
||||
TernaryNode *fTrie;
|
||||
|
||||
/**
|
||||
* A UCharCharacterIterator for internal use
|
||||
* @internal
|
||||
*/
|
||||
|
||||
UCharCharacterIterator *fIter;
|
||||
|
||||
friend class CompactTrieDictionary; // For fast conversion
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Constructor.</p>
|
||||
*
|
||||
* @param median A UChar around which to balance the trie. Ideally, it should
|
||||
* begin at least one word that is near the median of the set in the dictionary
|
||||
* @param status A status code recording the success of the call.
|
||||
*/
|
||||
MutableTrieDictionary( UChar median, UErrorCode &status );
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~MutableTrieDictionary();
|
||||
|
||||
/**
|
||||
* <p>Find dictionary words that match the text.</p>
|
||||
*
|
||||
* @param text A CharacterIterator representing the text (TODO: UText). The
|
||||
* iterator is left after the longest prefix match in the dictionary.
|
||||
* @param maxLength The maximum number of code units to match.
|
||||
* @param lengths An array that is filled with the lengths of words that matched.
|
||||
* @param count Filled with the number of elements output in lengths.
|
||||
* @param limit The size of the lengths array; this limits the number of words output.
|
||||
* @return The number of characters in text that were matched.
|
||||
*/
|
||||
virtual int32_t matches( CharacterIterator *text,
|
||||
int32_t maxLength,
|
||||
int32_t *lengths,
|
||||
int &count,
|
||||
int limit ) const;
|
||||
|
||||
/**
|
||||
* <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
|
||||
*
|
||||
* @param status A status code recording the success of the call.
|
||||
* @return A StringEnumeration that will iterate through the whole dictionary.
|
||||
* The caller is responsible for closing it. The order is unspecified.
|
||||
*/
|
||||
virtual StringEnumeration *openWords( UErrorCode &status ) const;
|
||||
|
||||
/**
|
||||
* <p>Add one word to the dictionary.</p>
|
||||
*
|
||||
* @param word A UChar buffer containing the word.
|
||||
* @param length The length of the word.
|
||||
* @param status The resultant status
|
||||
*/
|
||||
virtual void addWord( const UChar *word,
|
||||
int32_t length,
|
||||
UErrorCode &status);
|
||||
|
||||
/**
|
||||
* <p>Add all strings from a UEnumeration to the dictionary.</p>
|
||||
*
|
||||
* @param words A UEnumeration that will return the desired words.
|
||||
* @param status The resultant status
|
||||
*/
|
||||
virtual void addWords( UEnumeration *words, UErrorCode &status );
|
||||
|
||||
protected:
|
||||
/**
|
||||
* <p>Search the dictionary for matches.</p>
|
||||
*
|
||||
* @param text A CharacterIterator representing the text (TODO: UText). The
|
||||
* iterator is left after the longest prefix match in the dictionary.
|
||||
* @param maxLength The maximum number of code units to match.
|
||||
* @param lengths An array that is filled with the lengths of words that matched.
|
||||
* @param count Filled with the number of elements output in lengths.
|
||||
* @param limit The size of the lengths array; this limits the number of words output.
|
||||
* @param parent The parent of the current node
|
||||
* @param pMatched The returned parent node matched the input
|
||||
* @return The number of characters in text that were matched.
|
||||
*/
|
||||
virtual int32_t search( CharacterIterator *text,
|
||||
int32_t maxLength,
|
||||
int32_t *lengths,
|
||||
int &count,
|
||||
int limit,
|
||||
TernaryNode *&parent,
|
||||
UBool &pMatched ) const;
|
||||
|
||||
private:
|
||||
/**
|
||||
* <p>Private constructor. The root node is not allocated.</p>
|
||||
*
|
||||
* @param status A status code recording the success of the call.
|
||||
*/
|
||||
MutableTrieDictionary( UErrorCode &status );
|
||||
};
|
||||
|
||||
/*******************************************************************
|
||||
* CompactTrieDictionary
|
||||
*/
|
||||
|
||||
/**
|
||||
* <p>CompactTrieDictionary is a TrieWordDictionary that has been compacted
|
||||
* to save space.</p>
|
||||
*/
|
||||
class U_COMMON_API CompactTrieDictionary : public TrieWordDictionary {
|
||||
private:
|
||||
/**
|
||||
* Pointer to the header of the compact trie data
|
||||
* @internal
|
||||
*/
|
||||
|
||||
const CompactTrieHeader *fData;
|
||||
|
||||
/**
|
||||
* A UBool indicating whether or not we own the data.
|
||||
* @internal
|
||||
*/
|
||||
|
||||
UBool fOwnData;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* <p>Construct a dictionary from raw saved data.</p>
|
||||
*
|
||||
* @param data A pointer to the raw data, which is still owned by the caller
|
||||
* @param status A status code giving the result of the constructor
|
||||
*/
|
||||
CompactTrieDictionary( const void *data, UErrorCode &status );
|
||||
|
||||
/**
|
||||
* <p>Construct a dictionary from a MutableTrieDictionary.</p>
|
||||
*
|
||||
* @param dict The dictionary to use as input.
|
||||
* @param status A status code recording the success of the call.
|
||||
*/
|
||||
CompactTrieDictionary( const MutableTrieDictionary &dict, UErrorCode &status );
|
||||
|
||||
/**
|
||||
* <p>Virtual destructor.</p>
|
||||
*/
|
||||
virtual ~CompactTrieDictionary();
|
||||
|
||||
/**
|
||||
* <p>Find dictionary words that match the text.</p>
|
||||
*
|
||||
* @param text A CharacterIterator representing the text (TODO: UText). The
|
||||
* iterator is left after the longest prefix match in the dictionary.
|
||||
* @param maxLength The maximum number of code units to match.
|
||||
* @param lengths An array that is filled with the lengths of words that matched.
|
||||
* @param count Filled with the number of elements output in lengths.
|
||||
* @param limit The size of the lengths array; this limits the number of words output.
|
||||
* @return The number of characters in text that were matched.
|
||||
*/
|
||||
virtual int32_t matches( CharacterIterator *text,
|
||||
int32_t rangeEnd,
|
||||
int32_t *lengths,
|
||||
int &count,
|
||||
int limit ) const;
|
||||
|
||||
/**
|
||||
* <p>Return a StringEnumeration for iterating all the words in the dictionary.</p>
|
||||
*
|
||||
* @param status A status code recording the success of the call.
|
||||
* @return A StringEnumeration that will iterate through the whole dictionary.
|
||||
* The caller is responsible for closing it. The order is unspecified.
|
||||
*/
|
||||
virtual StringEnumeration *openWords( UErrorCode &status ) const;
|
||||
|
||||
/**
|
||||
* <p>Return the size of the compact data.</p>
|
||||
*
|
||||
* @return The size of the dictionary's compact data.
|
||||
*/
|
||||
virtual uint32_t dataSize() const;
|
||||
|
||||
/**
|
||||
* <p>Return a void * pointer to the compact data, platform-endian.</p>
|
||||
*
|
||||
* @return The data for the compact dictionary, suitable for passing to the
|
||||
* constructor.
|
||||
*/
|
||||
virtual const void *data() const;
|
||||
|
||||
/**
|
||||
* <p>Return a MutableTrieDictionary clone of this dictionary.</p>
|
||||
*
|
||||
* @param status A status code recording the success of the call.
|
||||
* @return A MutableTrieDictionary with the same data as this dictionary
|
||||
*/
|
||||
virtual MutableTrieDictionary *cloneMutable( UErrorCode &status ) const;
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* <p>Convert a MutableTrieDictionary into a compact data blob.</p>
|
||||
*
|
||||
* @param dict The dictionary to convert.
|
||||
* @param status A status code recording the success of the call.
|
||||
* @return A single data blob starting with a CompactTrieHeader.
|
||||
*/
|
||||
static CompactTrieHeader *compactMutableTrieDictionary( const MutableTrieDictionary &dict,
|
||||
UErrorCode &status );
|
||||
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
/* TRIEDICT_H */
|
||||
#endif
|
13
icu4c/source/common/ubrkimpl.h
Normal file
13
icu4c/source/common/ubrkimpl.h
Normal file
|
@ -0,0 +1,13 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2006, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#ifndef UBRKIMPL_H
|
||||
#define UBRKIMPL_H
|
||||
|
||||
#define U_ICUDATA_BRKITR U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "brkitr"
|
||||
|
||||
#endif /*UBRKIMPL_H*/
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
* *
|
||||
* Copyright (C) 2001-2005, International Business Machines *
|
||||
* Copyright (C) 2001-2006, International Business Machines *
|
||||
* Corporation and others. All Rights Reserved. *
|
||||
* *
|
||||
******************************************************************************
|
||||
|
@ -35,6 +35,7 @@ typedef enum ECleanupCommonType {
|
|||
UCLN_COMMON_START = -1,
|
||||
UCLN_COMMON_USPREP,
|
||||
UCLN_COMMON_BREAKITERATOR,
|
||||
UCLN_COMMON_BREAKITERATOR_DICT,
|
||||
UCLN_COMMON_SERVICE,
|
||||
UCLN_COMMON_URES,
|
||||
UCLN_COMMON_LOCALE,
|
||||
|
|
|
@ -515,8 +515,8 @@ public:
|
|||
const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
|
||||
|
||||
private:
|
||||
static BreakIterator* buildInstance(const Locale& loc, const char *type, UBool dict, UErrorCode& status);
|
||||
static BreakIterator* createInstance(const Locale& loc, UBreakIteratorType kind, UErrorCode& status);
|
||||
static BreakIterator* buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode& status);
|
||||
static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
|
||||
static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);
|
||||
|
||||
friend class ICUBreakIteratorFactory;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2005 IBM Corp. All rights reserved.
|
||||
* Copyright (C) 1999-2006 IBM Corp. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 12/1/99 rgillam Complete port from Java.
|
||||
|
@ -22,253 +22,17 @@
|
|||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/* forward declaration */
|
||||
class DictionaryBasedBreakIteratorTables;
|
||||
|
||||
/**
|
||||
* A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
|
||||
* to further subdivide ranges of text beyond what is possible using just the
|
||||
* state-table-based algorithm. This is necessary, for example, to handle
|
||||
* word and line breaking in Thai, which doesn't use spaces between words. The
|
||||
* state-table-based algorithm used by RuleBasedBreakIterator is used to divide
|
||||
* up text as far as possible, and then contiguous ranges of letters are
|
||||
* repeatedly compared against a list of known words (i.e., the dictionary)
|
||||
* to divide them up into words.
|
||||
*
|
||||
* <p>Applications do not normally need to include this header.</p>
|
||||
*
|
||||
* <p>This class will probably be deprecated in a future release of ICU, and replaced
|
||||
* with a more flexible and capable dictionary based break iterator. This change
|
||||
* should be invisible to applications, because creation and use of instances of
|
||||
* DictionaryBasedBreakIterator is through the factories and abstract
|
||||
* API on class BreakIterator, which will remain stable.</p>
|
||||
*
|
||||
* <p>This class is not intended to be subclassed.</p>
|
||||
*
|
||||
*
|
||||
* DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
|
||||
* but adds one more special substitution name: <dictionary>. This substitution
|
||||
* name is used to identify characters in words in the dictionary. The idea is that
|
||||
* if the iterator passes over a chunk of text that includes two or more characters
|
||||
* in a row that are included in <dictionary>, it goes back through that range and
|
||||
* derives additional break positions (if possible) using the dictionary.
|
||||
*
|
||||
* DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
|
||||
* file. It follows a prescribed search path to locate the dictionary (right now,
|
||||
* it looks for it in /com/ibm/text/resources in each directory in the classpath,
|
||||
* and won't find it in JAR files, but this location is likely to change). The
|
||||
* dictionary file is in a serialized binary format. We have a very primitive (and
|
||||
* slow) BuildDictionaryFile utility for creating dictionary files, but aren't
|
||||
* currently making it public. Contact us for help.
|
||||
* <p>
|
||||
* <b> NOTE </b> The DictionaryBasedIterator class is still under development. The
|
||||
* APIs are not in stable condition yet.
|
||||
* An obsolete subclass of RuleBasedBreakIterator. Handling of dictionary-
|
||||
* based break iteration has been folded into the base class. This class
|
||||
* is deprecated as of ICU 3.6.
|
||||
*/
|
||||
class U_COMMON_API DictionaryBasedBreakIterator : public RuleBasedBreakIterator {
|
||||
|
||||
#ifndef U_HIDE_DEPRECATED_API
|
||||
|
||||
private:
|
||||
typedef RuleBasedBreakIterator DictionaryBasedBreakIterator;
|
||||
|
||||
/**
|
||||
* when a range of characters is divided up using the dictionary, the break
|
||||
* positions that are discovered are stored here, preventing us from having
|
||||
* to use either the dictionary or the state table again until the iterator
|
||||
* leaves this range of text
|
||||
*/
|
||||
int32_t* cachedBreakPositions;
|
||||
|
||||
/**
|
||||
* The number of elements in cachedBreakPositions
|
||||
*/
|
||||
int32_t numCachedBreakPositions;
|
||||
|
||||
/**
|
||||
* if cachedBreakPositions is not null, this indicates which item in the
|
||||
* cache the current iteration position refers to
|
||||
*/
|
||||
int32_t positionInCache;
|
||||
|
||||
DictionaryBasedBreakIteratorTables *fTables;
|
||||
|
||||
/**=======================================================================
|
||||
* Create a dictionary based break boundary detection iterator.
|
||||
* @param tablesImage The location for the dictionary to be loaded into memory
|
||||
* @param dictionaryFilename The name of the dictionary file
|
||||
* @param status the error code status
|
||||
* @return A dictionary based break detection iterator. The UErrorCode& status
|
||||
* parameter is used to return status information to the user.
|
||||
* To check whether the construction succeeded or not, you should check
|
||||
* the value of U_SUCCESS(err). If you wish more detailed information, you
|
||||
* can check for informational error results which still indicate success. For example,
|
||||
* U_FILE_ACCESS_ERROR will be returned if the file does not exist.
|
||||
* The caller owns the returned object and is responsible for deleting it.
|
||||
======================================================================= */
|
||||
DictionaryBasedBreakIterator(UDataMemory* tablesImage, const char* dictionaryFilename, UErrorCode& status);
|
||||
|
||||
public:
|
||||
//=======================================================================
|
||||
// boilerplate
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
virtual ~DictionaryBasedBreakIterator();
|
||||
|
||||
/**
|
||||
* Default constructor. Creates an "empty" break iterator.
|
||||
* Such an iterator can subsequently be assigned to.
|
||||
* @return the newly created DictionaryBaseBreakIterator.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
DictionaryBasedBreakIterator();
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
* @param other The DictionaryBasedBreakIterator to be copied.
|
||||
* @return the newly created DictionaryBasedBreakIterator.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
DictionaryBasedBreakIterator(const DictionaryBasedBreakIterator &other);
|
||||
|
||||
/**
|
||||
* Assignment operator.
|
||||
* @param that The object to be copied.
|
||||
* @return the newly set DictionaryBasedBreakIterator.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
DictionaryBasedBreakIterator& operator=(const DictionaryBasedBreakIterator& that);
|
||||
|
||||
/**
|
||||
* Returns a newly-constructed RuleBasedBreakIterator with the same
|
||||
* behavior, and iterating over the same text, as this one.
|
||||
* @return Returns a newly-constructed RuleBasedBreakIterator.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
virtual BreakIterator* clone(void) const;
|
||||
|
||||
//=======================================================================
|
||||
// BreakIterator overrides
|
||||
//=======================================================================
|
||||
/**
|
||||
* Advances the iterator backwards, to the last boundary preceding this one.
|
||||
* @return The position of the last boundary position preceding this one.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
virtual int32_t previous(void);
|
||||
|
||||
/**
|
||||
* Sets the iterator to refer to the first boundary position following
|
||||
* the specified position.
|
||||
* @param offset The position from which to begin searching for a break position.
|
||||
* @return The position of the first break after the current position.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
virtual int32_t following(int32_t offset);
|
||||
|
||||
/**
|
||||
* Sets the iterator to refer to the last boundary position before the
|
||||
* specified position.
|
||||
* @param offset The position to begin searching for a break from.
|
||||
* @return The position of the last boundary before the starting position.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
virtual int32_t preceding(int32_t offset);
|
||||
|
||||
/**
|
||||
* Returns the class ID for this class. This is useful only for
|
||||
* comparing to a return value from getDynamicClassID(). For example:
|
||||
*
|
||||
* Base* polymorphic_pointer = createPolymorphicObject();
|
||||
* if (polymorphic_pointer->getDynamicClassID() ==
|
||||
* Derived::getStaticClassID()) ...
|
||||
*
|
||||
* @return The class ID for all objects of this class.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
static UClassID U_EXPORT2 getStaticClassID(void);
|
||||
|
||||
/**
|
||||
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
|
||||
* This method is to implement a simple version of RTTI, since not all
|
||||
* C++ compilers support genuine RTTI. Polymorphic operator==() and
|
||||
* clone() methods call this method.
|
||||
*
|
||||
* @return The class ID for this object. All objects of a
|
||||
* given class have the same class ID. Objects of
|
||||
* other classes have different class IDs.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
virtual UClassID getDynamicClassID(void) const;
|
||||
|
||||
protected:
|
||||
//=======================================================================
|
||||
// implementation
|
||||
//=======================================================================
|
||||
/**
|
||||
* This method is the actual implementation of the next() method. All iteration
|
||||
* vectors through here. This method initializes the state machine to state 1
|
||||
* and advances through the text character by character until we reach the end
|
||||
* of the text or the state machine transitions to state 0. We update our return
|
||||
* value every time the state machine passes through a possible end state.
|
||||
* @internal
|
||||
*/
|
||||
virtual int32_t handleNext(void);
|
||||
|
||||
/**
|
||||
* removes the cache of break positions (usually in response to a change in
|
||||
* position of some sort)
|
||||
* @internal
|
||||
*/
|
||||
virtual void reset(void);
|
||||
|
||||
/**
|
||||
* init Initialize a dbbi. Common routine for use by constructors.
|
||||
* @internal
|
||||
*/
|
||||
void init();
|
||||
|
||||
/**
|
||||
* @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
|
||||
* If buffer is not large enough, new memory will be allocated.
|
||||
* @param BufferSize reference to size of allocated space.
|
||||
* If BufferSize == 0, a sufficient size for use in cloning will
|
||||
* be returned ('pre-flighting')
|
||||
* If BufferSize is not enough for a stack-based safe clone,
|
||||
* new memory will be allocated.
|
||||
* @param status to indicate whether the operation went on smoothly or there were errors
|
||||
* An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were
|
||||
* necessary.
|
||||
* @return pointer to the new clone
|
||||
* @internal
|
||||
*/
|
||||
virtual BreakIterator * createBufferClone(void *stackBuffer,
|
||||
int32_t &BufferSize,
|
||||
UErrorCode &status);
|
||||
|
||||
|
||||
private:
|
||||
/**
|
||||
* This is the function that actually implements the dictionary-based
|
||||
* algorithm. Given the endpoints of a range of text, it uses the
|
||||
* dictionary to determine the positions of any boundaries in this
|
||||
* range. It stores all the boundary positions it discovers in
|
||||
* cachedBreakPositions so that we only have to do this work once
|
||||
* for each time we enter the range.
|
||||
* @param startPos The start position of a range of text
|
||||
* @param endPos The end position of a range of text
|
||||
* @param status The error code status
|
||||
*/
|
||||
void divideUpDictionaryRange(int32_t startPos, int32_t endPos, UErrorCode &status);
|
||||
|
||||
|
||||
/*
|
||||
* HSYS : Please revisit with Rich, the ctors of the DBBI class is currently
|
||||
* marked as private.
|
||||
*/
|
||||
friend class DictionaryBasedBreakIteratorTables;
|
||||
friend class BreakIterator;
|
||||
};
|
||||
#endif
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -37,6 +37,9 @@ struct RBBIDataHeader;
|
|||
class RuleBasedBreakIteratorTables;
|
||||
class BreakIterator;
|
||||
class RBBIDataWrapper;
|
||||
class UStack;
|
||||
class LanguageBreakEngine;
|
||||
class UnhandledEngine;
|
||||
struct RBBIStateTable;
|
||||
|
||||
|
||||
|
@ -86,13 +89,58 @@ protected:
|
|||
|
||||
/**
|
||||
* Counter for the number of characters encountered with the "dictionary"
|
||||
* flag set. Normal RBBI iterators don't use it, although the code
|
||||
* for updating it is live. Dictionary Based break iterators (a subclass
|
||||
* of us) access this field directly.
|
||||
* flag set.
|
||||
* @internal
|
||||
*/
|
||||
uint32_t fDictionaryCharCount;
|
||||
uint32_t fDictionaryCharCount;
|
||||
|
||||
/**
|
||||
* When a range of characters is divided up using the dictionary, the break
|
||||
* positions that are discovered are stored here, preventing us from having
|
||||
* to use either the dictionary or the state table again until the iterator
|
||||
* leaves this range of text. Has the most impact for line breaking.
|
||||
* @internal
|
||||
*/
|
||||
int32_t* fCachedBreakPositions;
|
||||
|
||||
/**
|
||||
* The number of elements in fCachedBreakPositions
|
||||
* @internal
|
||||
*/
|
||||
int32_t fNumCachedBreakPositions;
|
||||
|
||||
/**
|
||||
* if fCachedBreakPositions is not null, this indicates which item in the
|
||||
* cache the current iteration position refers to
|
||||
* @internal
|
||||
*/
|
||||
int32_t fPositionInCache;
|
||||
|
||||
/**
|
||||
*
|
||||
* If present, UStack of LanguageBreakEngine objects that might handle
|
||||
* dictionary characters. Searched from top to bottom to find an object to
|
||||
* handle a given character.
|
||||
* @internal
|
||||
*/
|
||||
UStack *fLanguageBreakEngines;
|
||||
|
||||
/**
|
||||
*
|
||||
* If present, the special LanguageBreakEngine used for handling
|
||||
* characters that are in the dictionary set, but not handled by any
|
||||
* LanguageBreakEngine.
|
||||
* @internal
|
||||
*/
|
||||
UnhandledEngine *fUnhandledBreakEngine;
|
||||
|
||||
/**
|
||||
*
|
||||
* The type of the break iterator, or -1 if it has not been set.
|
||||
* @internal
|
||||
*/
|
||||
int32_t fBreakType;
|
||||
|
||||
/**
|
||||
* Debugging flag. Trace operation of state machine when true.
|
||||
* @internal
|
||||
|
@ -117,7 +165,7 @@ protected:
|
|||
*/
|
||||
RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
|
||||
|
||||
/** @internal */
|
||||
|
||||
friend class RBBIRuleBuilder;
|
||||
/** @internal */
|
||||
friend class BreakIterator;
|
||||
|
@ -506,20 +554,9 @@ protected:
|
|||
//=======================================================================
|
||||
// implementation
|
||||
//=======================================================================
|
||||
/**
|
||||
* This method is the actual implementation of the next() method. All iteration
|
||||
* vectors through here. This method initializes the state machine to state 1
|
||||
* and advances through the text character by character until we reach the end
|
||||
* of the text or the state machine transitions to state 0. We update our return
|
||||
* value every time the state machine passes through a possible end state.
|
||||
* @internal
|
||||
*/
|
||||
virtual int32_t handleNext(void);
|
||||
|
||||
/**
|
||||
* Dumps caches and performs other actions associated with a complete change
|
||||
* in text or iteration position. This function is a no-op in RuleBasedBreakIterator,
|
||||
* but subclasses can and do override it.
|
||||
* in text or iteration position.
|
||||
* @internal
|
||||
*/
|
||||
virtual void reset(void);
|
||||
|
@ -534,6 +571,20 @@ protected:
|
|||
*/
|
||||
virtual UBool isDictionaryChar(UChar32);
|
||||
|
||||
/**
|
||||
* Get the type of the break iterator.
|
||||
* @internal
|
||||
*/
|
||||
virtual int32_t getBreakType() const;
|
||||
/** @internal */
|
||||
|
||||
/**
|
||||
* Set the type of the break iterator.
|
||||
* @internal
|
||||
*/
|
||||
virtual void setBreakType(int32_t type);
|
||||
/** @internal */
|
||||
|
||||
/**
|
||||
* Common initialization function, used by constructors and bufferClone.
|
||||
* (Also used by DictionaryBasedBreakIterator::createBufferClone().)
|
||||
|
@ -565,6 +616,30 @@ private:
|
|||
*/
|
||||
int32_t handleNext(const RBBIStateTable *statetable);
|
||||
|
||||
/**
|
||||
* This is the function that actually implements dictionary-based
|
||||
* breaking. Covering at least the range from startPos to endPos,
|
||||
* it checks for dictionary characters, and if it finds them determines
|
||||
* the appropriate object to deal with them. It may cache found breaks in
|
||||
* fCachedBreakPositions as it goes. It may well also look at text outside
|
||||
* the range startPos to endPos.
|
||||
* If going forward, endPos is the normal Unicode break result, and
|
||||
* if going in reverse, startPos is the normal Unicode break result
|
||||
* @param startPos The start position of a range of text
|
||||
* @param endPos The end position of a range of text
|
||||
* @param reverse The call is for the reverse direction
|
||||
* @internal
|
||||
*/
|
||||
int32_t checkDictionary(int32_t startPos, int32_t endPos, UBool reverse);
|
||||
|
||||
/**
|
||||
* This function returns the appropriate LanguageBreakEngine for a
|
||||
* given character c.
|
||||
* @param c A character in the dictionary set
|
||||
* @internal
|
||||
*/
|
||||
const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
|
||||
|
||||
/**
|
||||
* @internal
|
||||
*/
|
||||
|
|
3
icu4c/source/configure
vendored
3
icu4c/source/configure
vendored
|
@ -7685,7 +7685,7 @@ then
|
|||
CXXFLAGS="$CXXFLAGS \$(THREADSCXXFLAGS)"
|
||||
fi
|
||||
|
||||
ac_config_files="$ac_config_files icudefs.mk Makefile data/icupkg.inc config/Makefile.inc data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genuca/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/gennames/Makefile tools/gentest/Makefile tools/gennorm/Makefile tools/genprops/Makefile tools/gencase/Makefile tools/genbidi/Makefile tools/genpname/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icupkg/Makefile tools/pkgdata/Makefile tools/toolutil/Makefile tools/dumpce/Makefile test/Makefile test/testdata/Makefile test/testdata/pkgdata.inc test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/thaitest/Makefile test/testmap/Makefile test/letest/Makefile test/threadtest/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/csdet/Makefile samples/layout/Makefile common/unicode/platform.h"
|
||||
ac_config_files="$ac_config_files icudefs.mk Makefile data/icupkg.inc config/Makefile.inc data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genuca/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/genctd/Makefile tools/gennames/Makefile tools/gentest/Makefile tools/gennorm/Makefile tools/genprops/Makefile tools/gencase/Makefile tools/genbidi/Makefile tools/genpname/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icupkg/Makefile tools/pkgdata/Makefile tools/toolutil/Makefile tools/dumpce/Makefile test/Makefile test/testdata/Makefile test/testdata/pkgdata.inc test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/thaitest/Makefile test/testmap/Makefile test/letest/Makefile test/threadtest/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/csdet/Makefile samples/layout/Makefile common/unicode/platform.h"
|
||||
cat >confcache <<\_ACEOF
|
||||
# This file is a shell script that caches the results of configure
|
||||
# tests run on this system so they can be shared between configure
|
||||
|
@ -8233,6 +8233,7 @@ do
|
|||
"tools/genccode/Makefile" ) CONFIG_FILES="$CONFIG_FILES tools/genccode/Makefile" ;;
|
||||
"tools/gencmn/Makefile" ) CONFIG_FILES="$CONFIG_FILES tools/gencmn/Makefile" ;;
|
||||
"tools/gencnval/Makefile" ) CONFIG_FILES="$CONFIG_FILES tools/gencnval/Makefile" ;;
|
||||
"tools/genctd/Makefile" ) CONFIG_FILES="$CONFIG_FILES tools/genctd/Makefile" ;;
|
||||
"tools/gennames/Makefile" ) CONFIG_FILES="$CONFIG_FILES tools/gennames/Makefile" ;;
|
||||
"tools/gentest/Makefile" ) CONFIG_FILES="$CONFIG_FILES tools/gentest/Makefile" ;;
|
||||
"tools/gennorm/Makefile" ) CONFIG_FILES="$CONFIG_FILES tools/gennorm/Makefile" ;;
|
||||
|
|
|
@ -1010,6 +1010,7 @@ AC_OUTPUT([icudefs.mk \
|
|||
tools/genccode/Makefile \
|
||||
tools/gencmn/Makefile \
|
||||
tools/gencnval/Makefile \
|
||||
tools/genctd/Makefile \
|
||||
tools/gennames/Makefile \
|
||||
tools/gentest/Makefile \
|
||||
tools/gennorm/Makefile \
|
||||
|
|
|
@ -73,11 +73,12 @@ TRANSLITSRCDIR=$(SRCDATADIR)/translit
|
|||
TRANSLITBLDDIR=$(BUILDDIR)/translit
|
||||
MISCSRCDIR=$(SRCDATADIR)/misc
|
||||
BRKSRCDIR=$(SRCDATADIR)/brkitr
|
||||
BRKBLDDIR=$(BUILDDIR)/brkitr
|
||||
MISCSRCDIR=$(SRCDATADIR)/misc
|
||||
UCMSRCDIR=$(SRCDATADIR)/mappings
|
||||
COMINCDIR=$(top_srcdir)/common/unicode
|
||||
SRCLISTDEPS=Makefile $(srcdir)/Makefile.in
|
||||
BUILD_DIRS=$(OUTDIR) $(BUILDDIR) $(COLBLDDIR) $(RBNFBLDDIR) $(TRANSLITBLDDIR) $(TESTOUTDIR) $(TESTBUILDDIR) $(OUTTMPDIR) $(OUTTMPDIR_390STUB) $(OUTTMPDIR)/$(COLLATION_TREE) $(OUTTMPDIR)/$(RBNF_TREE) $(OUTTMPDIR)/$(TRANSLIT_TREE)
|
||||
BUILD_DIRS=$(OUTDIR) $(BUILDDIR) $(BRKBLDDIR) $(COLBLDDIR) $(RBNFBLDDIR) $(TRANSLITBLDDIR) $(TESTOUTDIR) $(TESTBUILDDIR) $(OUTTMPDIR) $(OUTTMPDIR_390STUB) $(OUTTMPDIR)/$(COLLATION_TREE) $(OUTTMPDIR)/$(RBNF_TREE) $(OUTTMPDIR)/$(TRANSLIT_TREE) $(OUTTMPDIR)/$(BREAK_TREE)
|
||||
|
||||
# relative lib links from pkgdata are the same as for tmp
|
||||
TOOLDIR=$(top_builddir)/tools
|
||||
|
@ -209,11 +210,19 @@ DAT_FILES_SHORT=pnames.icu unames.icu cnvalias.icu ucadata.icu invuca.icu uidna.
|
|||
DAT_FILES=$(DAT_FILES_SHORT:%=$(BUILDDIR)/%)
|
||||
|
||||
## BRK files
|
||||
BREAK_TREE=brkitr
|
||||
-include $(BRKSRCDIR)/brkfiles.mk
|
||||
-include $(BRKSRCDIR)/brklocal.mk
|
||||
ALL_BRK_SOURCE=char.txt title.txt word.txt $(BRK_SOURCE) $(BRK_SOURCE_LOCAL)
|
||||
BRK_FILES_SHORT=$(ALL_BRK_SOURCE:%.txt=%.brk)
|
||||
BRK_FILES=$(BRK_FILES_SHORT:%=$(BUILDDIR)/%)
|
||||
BRK_FILES_SHORT=$(ALL_BRK_SOURCE:%.txt=$(BREAK_TREE)/%.brk)
|
||||
BRK_FILES=$(ALL_BRK_SOURCE:%.txt=$(BRKBLDDIR)/%.brk)
|
||||
|
||||
## CTD files
|
||||
-include $(BRKSRCDIR)/ctdfiles.mk
|
||||
-include $(BRKSRCDIR)/ctdlocal.mk
|
||||
ALL_CTD_SOURCE=$(CTD_SOURCE) $(CTD_SOURCE_LOCAL)
|
||||
CTD_FILES_SHORT=$(ALL_CTD_SOURCE:%.txt=$(BREAK_TREE)/%.ctd)
|
||||
CTD_FILES=$(ALL_CTD_SOURCE:%.txt=$(BRKBLDDIR)/%.ctd)
|
||||
|
||||
## UCM files
|
||||
-include $(UCMSRCDIR)/ucmcore.mk
|
||||
|
@ -228,10 +237,12 @@ CNV_FILES_SHORT = $(ALL_UCM_SOURCE:%.ucm=%.cnv)
|
|||
## RES files
|
||||
-include $(LOCSRCDIR)/resfiles.mk
|
||||
-include $(COLSRCDIR)/colfiles.mk
|
||||
-include $(BRKSRCDIR)/brsfiles.mk
|
||||
-include $(RBNFSRCDIR)/rbnffiles.mk
|
||||
-include $(TRANSLITSRCDIR)/trnsfiles.mk
|
||||
-include $(LOCSRCDIR)/reslocal.mk
|
||||
-include $(COLSRCDIR)/collocal.mk
|
||||
-include $(BRKSRCDIR)/brslocal.mk
|
||||
-include $(RBNFSRCDIR)/rbnflocal.mk
|
||||
-include $(TRANSLITSRCDIR)/trnslocal.mk
|
||||
ifdef GENRB_SOURCE
|
||||
|
@ -244,6 +255,11 @@ COL_SRC= root.txt $(COLLATION_SOURCE) $(COLLATION_ALIAS_SOURCE) $(COLLATION_SOUR
|
|||
COL_SRC_FILES = $(COL_SRC:%=$(COLSRCDIR)/%)
|
||||
INSTALLED_COL_FILES = $(COLLATION_SOURCE:%.txt=%) $(COLLATION_SOURCE_LOCAL:%.txt=%)
|
||||
endif
|
||||
ifdef BREAKRES_SOURCE
|
||||
BRS_SRC= root.txt $(BREAKRES_SOURCE) $(BREAKRES_SOURCE_LOCAL)
|
||||
BRS_SRC_FILES = $(BRS_SRC:%=$(BRKSRCDIR)/%)
|
||||
INSTALLED_BRS_FILES = $(BREAKRES_SOURCE:%.txt=%) $(BREAKRES_SOURCE_LOCAL:%.txt=%)
|
||||
endif
|
||||
ifdef RBNF_SOURCE
|
||||
RBNF_SRC= root.txt $(RBNF_SOURCE) $(RBNF_ALIAS_SOURCE) $(RBNF_SOURCE_LOCAL)
|
||||
RBNF_SRC_FILES = $(RBNF_SRC:%=$(RBNFSRCDIR)/%)
|
||||
|
@ -286,6 +302,12 @@ COLLATION_INDEX_RES_SHORT=$(COLLATION_TREE)/$(INDEX_NAME).res
|
|||
COLLATION_FILES = $(COL_SRC:%.txt=$(COLBLDDIR)/%.res) $(COLLATION_INDEX_RES)
|
||||
COLLATION_FILES_SHORT = $(COL_SRC:%.txt=$(COLLATION_TREE)/%.res)
|
||||
|
||||
BREAKRES_INDEX_FILE=$(OUTTMPDIR)/$(BREAK_TREE)/$(INDEX_NAME).txt
|
||||
BREAKRES_INDEX_RES=$(BRKBLDDIR)/$(INDEX_NAME).res
|
||||
BREAKRES_INDEX_RES_SHORT=$(BREAK_TREE)/$(INDEX_NAME).res
|
||||
BREAKRES_FILES = $(BRS_SRC:%.txt=$(BRKBLDDIR)/%.res) $(BREAKRES_INDEX_RES)
|
||||
BREAKRES_FILES_SHORT = $(BRS_SRC:%.txt=$(BREAK_TREE)/%.res)
|
||||
|
||||
RBNF_TREE=rbnf
|
||||
RBNF_INDEX_FILE=$(OUTTMPDIR)/$(RBNF_TREE)/$(INDEX_NAME).txt
|
||||
RBNF_INDEX_RES=$(RBNFBLDDIR)/$(INDEX_NAME).res
|
||||
|
@ -301,9 +323,9 @@ TRANSLIT_FILES = $(TRANSLIT_SRC:%.txt=$(TRANSLITBLDDIR)/%.res)
|
|||
TRANSLIT_FILES_SHORT = $(TRANSLIT_SRC:%.txt=$(TRANSLIT_TREE)/%.res)
|
||||
|
||||
## All generated files
|
||||
ALL_FILES = $(DAT_FILES) $(CNV_FILES) $(BRK_FILES) $(RES_FILES) $(INDEX_RES_FILE) $(COLLATION_FILES) $(RBNF_FILES) $(TRANSLIT_FILES)
|
||||
ALL_FILES = $(DAT_FILES) $(CNV_FILES) $(BRK_FILES) $(CTD_FILES) $(RES_FILES) $(INDEX_RES_FILE) $(COLLATION_FILES) $(BREAKRES_FILES) $(RBNF_FILES) $(TRANSLIT_FILES)
|
||||
# a list to use in the .lst files (package-relative)
|
||||
ALL_FILES_LIST = $(DAT_FILES_SHORT) $(CNV_FILES_SHORT) $(BRK_FILES_SHORT) $(RES_FILES_SHORT) $(INDEX_RES_FILE_SHORT) $(COLLATION_FILES_SHORT) $(COLLATION_INDEX_RES_SHORT) $(RBNF_FILES_SHORT) $(RBNF_INDEX_RES_SHORT) $(TRANSLIT_FILES_SHORT)
|
||||
ALL_FILES_LIST = $(DAT_FILES_SHORT) $(CNV_FILES_SHORT) $(BRK_FILES_SHORT) $(CTD_FILES_SHORT) $(RES_FILES_SHORT) $(INDEX_RES_FILE_SHORT) $(COLLATION_FILES_SHORT) $(COLLATION_INDEX_RES_SHORT) $(BREAKRES_FILES_SHORT) $(BREAKRES_INDEX_RES_SHORT) $(RBNF_FILES_SHORT) $(RBNF_INDEX_RES_SHORT) $(TRANSLIT_FILES_SHORT)
|
||||
|
||||
#####################################################
|
||||
# General data build rules
|
||||
|
@ -387,12 +409,18 @@ $(BUILDDIR)/uidna.spp: $(MISCSRCDIR)/NamePrepProfile.txt $(BINDIR)/gensprep$(EXE
|
|||
#################################################### BRK
|
||||
# BRK FILES
|
||||
|
||||
thaidict.brk: $(SRCDATADIR)/thaidict.brk
|
||||
$(RMV) $@ && ln -s $(BUILDDIR) $@
|
||||
#thaidict.brk: $(SRCDATADIR)/thaidict.brk
|
||||
# $(RMV) $@ && ln -s $(BUILDDIR) $@
|
||||
|
||||
$(BUILDDIR)/%.brk: $(BRKSRCDIR)/%.txt $(BINDIR)/genbrk$(EXEEXT) $(DAT_FILES)
|
||||
$(BRKBLDDIR)/%.brk: $(BRKSRCDIR)/%.txt $(BINDIR)/genbrk$(EXEEXT) $(DAT_FILES)
|
||||
$(INVOKE) $(BINDIR)/genbrk -c -i $(BUILDDIR) -r $< -o $@
|
||||
|
||||
#################################################### CTD
|
||||
# CTD FILES
|
||||
|
||||
$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(BINDIR)/genctd$(EXEEXT) $(DAT_FILES)
|
||||
$(INVOKE) $(BINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<
|
||||
|
||||
#################################################### CNV
|
||||
# CNV FILES
|
||||
$(BUILDDIR)/%.cnv: $(UCMSRCDIR)/%.ucm $(BINDIR)/makeconv$(EXEEXT)
|
||||
|
@ -420,6 +448,25 @@ $(OUTTMPDIR)/$(COLLATION_TREE)/$(INDEX_NAME).txt: $(SRCLISTDEPS)
|
|||
echo " }" >> $@; \
|
||||
echo "}" >> $@;
|
||||
|
||||
### brk res
|
||||
$(BRKBLDDIR)/%.res: $(BRKSRCDIR)/%.txt $(BINDIR)/genrb$(EXEEXT) $(DAT_FILES)
|
||||
$(INVOKE) $(BINDIR)/genrb $(GENRBOPTS) -i $(BUILDDIR) -s $(BRKSRCDIR) -d $(BRKBLDDIR) $(<F)
|
||||
|
||||
$(BRKBLDDIR)/$(INDEX_NAME).res: $(OUTTMPDIR)/$(BREAK_TREE)/$(INDEX_NAME).txt $(BINDIR)/genrb$(EXEEXT)
|
||||
$(INVOKE) $(BINDIR)/genrb $(GENRBOPTS) -i $(BUILDDIR) -s $(OUTTMPDIR)/$(BREAK_TREE) -d $(BRKBLDDIR) $(INDEX_NAME).txt
|
||||
|
||||
$(OUTTMPDIR)/$(BREAK_TREE)/$(INDEX_NAME).txt: $(SRCLISTDEPS)
|
||||
@echo "generating $@ (list of installed break locales)"; \
|
||||
$(RMV) $@; \
|
||||
echo "// Warning this file is automatically generated" > $@; \
|
||||
echo "$(INDEX_NAME):table(nofallback) {" >> $@; \
|
||||
echo " InstalledLocales {" >> $@; \
|
||||
for file in $(INSTALLED_BRS_FILES); do \
|
||||
echo " $$file {\"\"}" >> $@; \
|
||||
done; \
|
||||
echo " }" >> $@; \
|
||||
echo "}" >> $@;
|
||||
|
||||
### RBNF res
|
||||
$(RBNFBLDDIR)/%.res: $(RBNFSRCDIR)/%.txt $(BINDIR)/genrb$(EXEEXT) $(DAT_FILES)
|
||||
$(INVOKE) $(BINDIR)/genrb $(GENRBOPTS) -i $(BUILDDIR) -s $(RBNFSRCDIR) -d $(RBNFBLDDIR) $(<F)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# * Copyright (C) 1997-2004, International Business Machines
|
||||
# * Copyright (C) 1997-2006, International Business Machines
|
||||
# * Corporation and others. All Rights Reserved.
|
||||
# A list of txt's to build
|
||||
# Note:
|
||||
|
@ -28,4 +28,4 @@
|
|||
# char.txt, title.txt and word.txt are not included so that more tests pass by default,
|
||||
# and so that the makefile rules are simpler.
|
||||
BRK_SOURCE = \
|
||||
line.txt sent.txt line_th.txt word_th.txt word_ja.txt word_POSIX.txt
|
||||
line.txt sent.txt word_ja.txt word_POSIX.txt
|
||||
|
|
27
icu4c/source/data/brkitr/brsfiles.mk
Normal file
27
icu4c/source/data/brkitr/brsfiles.mk
Normal file
|
@ -0,0 +1,27 @@
|
|||
# * Copyright (C) 2006, International Business Machines
|
||||
# * Corporation and others. All Rights Reserved.
|
||||
# A list of txt's to build
|
||||
# Note:
|
||||
#
|
||||
# If you are thinking of modifying this file, READ THIS.
|
||||
#
|
||||
# Instead of changing this file [unless you want to check it back in],
|
||||
# you should consider creating a 'brslocal.mk' file in this same directory.
|
||||
# Then, you can have your local changes remain even if you upgrade or
|
||||
# reconfigure ICU.
|
||||
#
|
||||
# Example 'brslocal.mk' files:
|
||||
#
|
||||
# * To add an additional locale to the list:
|
||||
# _____________________________________________________
|
||||
# | BREAKRES_SOURCE_LOCAL = myLocale.txt ...
|
||||
#
|
||||
# * To REPLACE the default list and only build with a few
|
||||
# locales:
|
||||
# _____________________________________________________
|
||||
# | BREAKRES_SOURCE = ar.txt ar_AE.txt en.txt de.txt zh.txt
|
||||
#
|
||||
#
|
||||
|
||||
# Ordinary resources
|
||||
BREAKRES_SOURCE = ja.txt en.txt en_US.txt en_US_POSIX.txt
|
27
icu4c/source/data/brkitr/ctdfiles.mk
Normal file
27
icu4c/source/data/brkitr/ctdfiles.mk
Normal file
|
@ -0,0 +1,27 @@
|
|||
# * Copyright (C) 2006, International Business Machines
|
||||
# * Corporation and others. All Rights Reserved.
|
||||
# A list of txt's to build
|
||||
# Note:
|
||||
#
|
||||
# If you are thinking of modifying this file, READ THIS.
|
||||
#
|
||||
# Instead of changing this file [unless you want to check it back in],
|
||||
# you should consider creating a 'ctdlocal.mk' file in this same directory.
|
||||
# Then, you can have your local changes remain even if you upgrade or
|
||||
# reconfigure ICU.
|
||||
#
|
||||
# Example 'ctdlocal.mk' files:
|
||||
#
|
||||
# * To add an additional dictionary to the list:
|
||||
# _____________________________________________________
|
||||
# | CTD_SOURCE_LOCAL = myDict.txt ...
|
||||
#
|
||||
# * To REPLACE the default list and only build with a different
|
||||
# dictionary:
|
||||
# _____________________________________________________
|
||||
# | CTD_SOURCE = myDict.txt
|
||||
#
|
||||
#
|
||||
|
||||
CTD_SOURCE = \
|
||||
thaidict.txt
|
14
icu4c/source/data/brkitr/en.txt
Normal file
14
icu4c/source/data/brkitr/en.txt
Normal file
|
@ -0,0 +1,14 @@
|
|||
// ***************************************************************************
|
||||
// *
|
||||
// * Copyright (C) 2006 International Business Machines
|
||||
// * Corporation and others. All Rights Reserved.
|
||||
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
|
||||
// * Source File:<path>/common/main/en.xml
|
||||
// *
|
||||
// ***************************************************************************
|
||||
/**
|
||||
* ICU <specials> source: <path>/xml/main/en.xml
|
||||
*/
|
||||
en{
|
||||
Version{"1.36"}
|
||||
}
|
14
icu4c/source/data/brkitr/en_US.txt
Normal file
14
icu4c/source/data/brkitr/en_US.txt
Normal file
|
@ -0,0 +1,14 @@
|
|||
// ***************************************************************************
|
||||
// *
|
||||
// * Copyright (C) 2006 International Business Machines
|
||||
// * Corporation and others. All Rights Reserved.
|
||||
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
|
||||
// * Source File:<path>/common/main/en_US.xml
|
||||
// *
|
||||
// ***************************************************************************
|
||||
/**
|
||||
* ICU <specials> source: <path>/xml/main/en_US.xml
|
||||
*/
|
||||
en_US{
|
||||
Version{"1.36"}
|
||||
}
|
17
icu4c/source/data/brkitr/en_US_POSIX.txt
Normal file
17
icu4c/source/data/brkitr/en_US_POSIX.txt
Normal file
|
@ -0,0 +1,17 @@
|
|||
// ***************************************************************************
|
||||
// *
|
||||
// * Copyright (C) 2006 International Business Machines
|
||||
// * Corporation and others. All Rights Reserved.
|
||||
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
|
||||
// * Source File:<path>/common/main/en_US_POSIX.xml
|
||||
// *
|
||||
// ***************************************************************************
|
||||
/**
|
||||
* ICU <specials> source: <path>/xml/main/en_US_POSIX.xml
|
||||
*/
|
||||
en_US_POSIX{
|
||||
Version{"1.36"}
|
||||
boundaries{
|
||||
word{"word_POSIX"}
|
||||
}
|
||||
}
|
17
icu4c/source/data/brkitr/ja.txt
Normal file
17
icu4c/source/data/brkitr/ja.txt
Normal file
|
@ -0,0 +1,17 @@
|
|||
// ***************************************************************************
|
||||
// *
|
||||
// * Copyright (C) 2006 International Business Machines
|
||||
// * Corporation and others. All Rights Reserved.
|
||||
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
|
||||
// * Source File:<path>/common/main/ja.xml
|
||||
// *
|
||||
// ***************************************************************************
|
||||
/**
|
||||
* ICU <specials> source: <path>/xml/main/ja.xml
|
||||
*/
|
||||
ja{
|
||||
Version{"1.68"}
|
||||
boundaries{
|
||||
word{"word_ja"}
|
||||
}
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (c) 2002-2005 International Business Machines Corporation and
|
||||
# Copyright (c) 2002-2006 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
# file: line.txt
|
||||
|
@ -89,6 +89,12 @@ $WJ = [:LineBreak = Word_Joiner:];
|
|||
$XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
# 5.0 or later as the definition of Complex_Context was corrected to include all
|
||||
# characters requiring dictionary break.
|
||||
|
||||
$dictionary = [:LineBreak = Complex_Context:];
|
||||
|
||||
#
|
||||
# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
|
||||
|
@ -551,6 +557,9 @@ $SP+ $CM* $B2;
|
|||
($CM* ($IS | $SY))+ $CM* $NU;
|
||||
$CL $CM* ($NU | $IS | $SY);
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
@ -564,5 +573,6 @@ $CL $CM* ($NU | $IS | $SY);
|
|||
# turn off rule chaining. We don't want to move more
|
||||
# than necessary.
|
||||
#
|
||||
[$CM $OP $QU $CL $B2 $PR $HY $SP]+ [^$CM $OP $QU $CL $B2 $PR $HY];
|
||||
[$CM $OP $QU $CL $B2 $PR $HY $SP $dictionary]+ [^$CM $OP $QU $CL $B2 $PR $HY $dictionary];
|
||||
$dictionary $dictionary;
|
||||
|
||||
|
|
|
@ -1,173 +0,0 @@
|
|||
# Copyright (c) 2002-2006, International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
# file: line.txt
|
||||
#
|
||||
# Line Breaking Rules for ICU rules based break iteration.
|
||||
# Implement default line breaking as defined by Unicode TR 14.
|
||||
#
|
||||
# TODO: Rework the rules not pertaining to Thai to be based on the
|
||||
# default line break rules. Not done yet because of interactions
|
||||
# between exact reverse rules and the Dictionary code.
|
||||
#
|
||||
# These rules, in their current form, do not conform to TR-14 for
|
||||
# non-Thai breaks.
|
||||
#
|
||||
|
||||
$LF = [\p{LineBreak = LF}];
|
||||
$IN = [\p{LineBreak = IN}];
|
||||
$SY = [\p{LineBreak = SY}];
|
||||
$EX = [\p{LineBreak = EX}];
|
||||
$BA = [\p{LineBreak = BA}];
|
||||
$IS = [\p{LineBreak = IS}];
|
||||
$BB = [\p{LineBreak = BB}];
|
||||
$SA = [\p{LineBreak = SA}];
|
||||
$CB = [\p{LineBreak = CB}];
|
||||
$XX = [\p{LineBreak = XX}];
|
||||
$HY = [\p{LineBreak = HY}];
|
||||
$AI = [\p{LineBreak = AI}];
|
||||
$ZW = [\p{LineBreak = ZW}];
|
||||
$SG = [\p{LineBreak = SG}];
|
||||
$AL = [\p{LineBreak = AL}];
|
||||
$OP = [\p{LineBreak = OP}];
|
||||
$BK = [\p{LineBreak = BK}];
|
||||
$PO = [\p{LineBreak = PO}];
|
||||
$NS = [\p{LineBreak = NS}];
|
||||
$CL = [\p{LineBreak = CL}];
|
||||
$NU = [\p{LineBreak = NU}];
|
||||
$CM = [\p{LineBreak = CM}];
|
||||
$PR = [\p{LineBreak = PR}];
|
||||
$B2 = [\p{LineBreak = B2}];
|
||||
$ID = [\p{LineBreak = ID}];
|
||||
$SP = [\p{LineBreak = SP}];
|
||||
$QU = [\p{LineBreak = QU}];
|
||||
$CR = [\p{LineBreak = CR}];
|
||||
$GL = [\p{LineBreak = GL}];
|
||||
|
||||
$JL = [\p{LineBreak = JL}];
|
||||
$JV = [\p{LineBreak = JV}];
|
||||
$JT = [\p{LineBreak = JT}];
|
||||
$H2 = [\p{LineBreak = H2}];
|
||||
$H3 = [\p{LineBreak = H3}];
|
||||
|
||||
|
||||
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
|
||||
|
||||
|
||||
#
|
||||
# Thai Dictionary related definitions and rules
|
||||
#
|
||||
|
||||
$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e]; # this rule breaks the iterator with mixed Thai and English
|
||||
$paiyannoi = [\u0e2f];
|
||||
$maiyamok = [\u0e46];
|
||||
$thai_etc = $paiyannoi \u0e25 $paiyannoi;
|
||||
|
||||
|
||||
|
||||
#
|
||||
# Rule LB1. By default, treat AI (characters with ambiguous east Asian width) and
|
||||
# SA (South East Asian: Thai, Lao, Khmer) as $AL (Alphabetic)
|
||||
#
|
||||
$ALPlus = $AL | $AI | [$SA - $dictionary];
|
||||
|
||||
#
|
||||
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
|
||||
# TODO: This is going to produce some odd results, because of the non-combining
|
||||
# chars that are included in $CM. Use $Extend instead, where possible.
|
||||
#
|
||||
$ALcm = $ALPlus $CM*;
|
||||
$IDcm = $ID $CM*;
|
||||
$NUcm = $NU $Extend*;
|
||||
$HYcm = $HY $Extend*;
|
||||
$SPcm = $SP $Extend*;
|
||||
$QUcm = $QU $Extend*;
|
||||
$POcm = $PO $Extend*;
|
||||
$OPcm = $OP $Extend*;
|
||||
$BAcm = $BA $Extend*;
|
||||
$BBcm = $BB $Extend*;
|
||||
$NScm = $NS $Extend*;
|
||||
$GLcm = $GL $Extend*;
|
||||
$B2cm = $B2 $Extend*;
|
||||
$INcm = $IN $Extend*;
|
||||
|
||||
|
||||
# New Lines. Always break after, never break before.
|
||||
# Rule LB 3
|
||||
#
|
||||
# Endings. NewLine or Zero Width Space, or both. Rules 4, 5
|
||||
# Because we never break before these things, $Endings
|
||||
# appears at the end of line break rule.
|
||||
#
|
||||
$NLF = $BK | $CR | $LF | $CR $LF;
|
||||
$Endings = $SPcm* $ZW* $NLF?;
|
||||
$EndingsMandatory = $SPcm* $NLF | $SPcm* $ZW $NLF?;
|
||||
|
||||
|
||||
#
|
||||
# Openings Sequences that can precede Words, and that should not be separated from them.
|
||||
# Rules LB 9, 10
|
||||
#
|
||||
$Openings = (($QUcm $SPcm*)? $OPcm $SPcm*)*;
|
||||
|
||||
#
|
||||
# Closings Seqences that follow words, and that should not be separated from them,
|
||||
# Rule LB 8, 11, 15
|
||||
$Closings = ($SPcm*( ($CL ($SPcm* $NScm)? | $EX | $IS | $SY) $Extend*) | $BAcm | $HYcm | $NScm | $maiyamok)*;
|
||||
|
||||
#
|
||||
# Words. Includes mixed Alpha-numerics.
|
||||
# Rules 11a, 16, 17, 19, more or less.
|
||||
#
|
||||
$NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+;
|
||||
$Number = $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?; # Fancy Number 18
|
||||
$Word = (($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?)); # Alpha-numeric. 16, 17
|
||||
$Dashes = (($B2cm $SPcm*)*); # Dashes 11a
|
||||
$ThaiRange = $dictionary+ | $thai_etc;
|
||||
$WordLikeThing = $Number | $Word | $Dashes | $ThaiRange;
|
||||
|
||||
|
||||
|
||||
|
||||
$Word15 = ($BBcm* ($WordLikeThing)? ($BAcm | $HYcm | $NScm)*) | # Rule 15. Stuff sticks around words.
|
||||
[^[:Cc:] $BK $CR $LF $ZW $SP $GL] $Extend* | # Allow characters that don't meet the
|
||||
[^$BK $CR $LF $ZW $SP $GL ]; # more elaborate definitions for WORD to be glued.
|
||||
|
||||
|
||||
$GluedWord = ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together.
|
||||
# Rules 13, 14
|
||||
|
||||
#
|
||||
# The actual rules, a combination of everything defined above.
|
||||
#
|
||||
$Openings $GluedWord $Closings $paiyannoi? $EndingsMandatory;
|
||||
$Openings $GluedWord $Closings $Endings;
|
||||
|
||||
$Openings $GluedWord $Closings $paiyannoi /
|
||||
([^\u0e25 $Extend] | \u0e25[^$paiyannoi $Extend]);
|
||||
|
||||
|
||||
#"$word($nbsp+$word)*$paiyannoi/([^[\u0e25$_ignore_]]|"
|
||||
# + "\u0e25[^$paiyannoi$_ignore_]);"
|
||||
|
||||
#
|
||||
# LB 18b. Do not break a Korean syllable
|
||||
#
|
||||
$JL+ $JV* $JT* $Extend*;
|
||||
$JV+ $JT* $Extend*;
|
||||
$JT+ $Extend*;
|
||||
$H2 $JV* $JT* $Extend*;
|
||||
$H3 $JT* $Extend*;
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
#
|
||||
# Back up to a hard break or a space that will cause a boundary.
|
||||
# Not all spaces cause line breaks. $SpaceGlue represents a sequence
|
||||
# containing a space that may inhibit a break from occuring.
|
||||
#
|
||||
$SpaceGlue = ([$ZW $CL $IS $NS $OP] ($Extend* $SP)) | (($Extend* $SP)+ $OP);
|
||||
$ClumpingChars = [^$SP $BK $CR $LF];
|
||||
|
||||
!. . $ClumpingChars* ($SpaceGlue $ClumpingChars*)* (. | $LF $CR)?;
|
||||
|
24
icu4c/source/data/brkitr/root.txt
Normal file
24
icu4c/source/data/brkitr/root.txt
Normal file
|
@ -0,0 +1,24 @@
|
|||
// ***************************************************************************
|
||||
// *
|
||||
// * Copyright (C) 2006 International Business Machines
|
||||
// * Corporation and others. All Rights Reserved.
|
||||
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
|
||||
// * Source File:<path>/common/main/root.xml
|
||||
// *
|
||||
// ***************************************************************************
|
||||
/**
|
||||
* ICU <specials> source: <path>/xml/main/root.xml
|
||||
*/
|
||||
root{
|
||||
Version{"1.00"}
|
||||
boundaries{
|
||||
grapheme{"char"}
|
||||
line{"line"}
|
||||
sentence{"sent"}
|
||||
title{"title"}
|
||||
word{"word"}
|
||||
}
|
||||
dictionaries{
|
||||
Thai{"thaidict"}
|
||||
}
|
||||
}
|
Binary file not shown.
26359
icu4c/source/data/brkitr/thaidict.txt
Normal file
26359
icu4c/source/data/brkitr/thaidict.txt
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1,5 +1,5 @@
|
|||
#
|
||||
# Copyright (C) 2002-2005, International Business Machines Corporation
|
||||
# Copyright (C) 2002-2006, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# file: word.txt
|
||||
|
@ -38,12 +38,21 @@ $LF = \u000a;
|
|||
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
# 5.0 or later as the definition of Complex_Context was corrected to include all
|
||||
# characters requiring dictionary break.
|
||||
|
||||
$dictionary = [:LineBreak = Complex_Context:];
|
||||
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];
|
||||
|
||||
|
||||
#
|
||||
# Rules 3 Grapheme Clusters behave like their first char.
|
||||
# Rule 4 Ignore trailing Format characters (Also see note in TR 29)
|
||||
#
|
||||
$KatakanaEx = $Katakana $Extend* $Format*;
|
||||
$ALetterEx = $ALetter $Extend* $Format*;
|
||||
$ALetterEx = $ALetterPlus $Extend* $Format*;
|
||||
$MidLetterEx = $MidLetter $Extend* $Format*;
|
||||
$MidNumEx = $MidNum $Extend* $Format*;
|
||||
$NumericEx = $Numeric $Extend* $Format*;
|
||||
|
@ -125,7 +134,7 @@ $ExtendNumLetEx $KatakanaEx {300}; # (13b)
|
|||
|
||||
!!reverse;
|
||||
|
||||
$BackALetterEx = $Format* $Extend* $ALetter;
|
||||
$BackALetterEx = $Format* $Extend* $ALetterPlus;
|
||||
$BackNumericEx = $Format* $Extend* $Numeric;
|
||||
$BackMidNumEx = $Format* $Extend* $MidNum;
|
||||
$BackMidLetterEx = $Format* $Extend* $MidLetter;
|
||||
|
@ -190,6 +199,9 @@ $MidLetter $BackALetterEx;
|
|||
# rule 11
|
||||
$MidNum $BackNumericEx;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
@ -218,3 +230,5 @@ $MidLetterEx $ALetterEx;
|
|||
# rule 11
|
||||
$MidNumEx $NumericEx;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#
|
||||
# Copyright (C) 2002-2005, International Business Machines Corporation
|
||||
# Copyright (C) 2002-2006, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# file: word.txt
|
||||
|
@ -39,12 +39,21 @@ $LF = \u000a;
|
|||
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
# 5.0 or later as the definition of Complex_Context was corrected to include all
|
||||
# characters requiring dictionary break.
|
||||
|
||||
$dictionary = [:LineBreak = Complex_Context:];
|
||||
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];
|
||||
|
||||
|
||||
#
|
||||
# Rules 3 Grapheme Clusters behave like their first char.
|
||||
# Rule 4 Ignore trailing Format characters (Also see note in TR 29)
|
||||
#
|
||||
$KatakanaEx = $Katakana $Extend* $Format*;
|
||||
$ALetterEx = $ALetter $Extend* $Format*;
|
||||
$ALetterEx = $ALetterPlus $Extend* $Format*;
|
||||
$MidLetterEx = $MidLetter $Extend* $Format*;
|
||||
$MidNumEx = $MidNum $Extend* $Format*;
|
||||
$NumericEx = $Numeric $Extend* $Format*;
|
||||
|
@ -126,7 +135,7 @@ $ExtendNumLetEx $KatakanaEx {300}; # (13b)
|
|||
|
||||
!!reverse;
|
||||
|
||||
$BackALetterEx = $Format* $Extend* $ALetter;
|
||||
$BackALetterEx = $Format* $Extend* $ALetterPlus;
|
||||
$BackNumericEx = $Format* $Extend* $Numeric;
|
||||
$BackMidNumEx = $Format* $Extend* $MidNum;
|
||||
$BackMidLetterEx = $Format* $Extend* $MidLetter;
|
||||
|
@ -191,6 +200,9 @@ $MidLetter $BackALetterEx;
|
|||
# rule 11
|
||||
$MidNum $BackNumericEx;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
@ -219,3 +231,5 @@ $MidLetterEx $ALetterEx;
|
|||
# rule 11
|
||||
$MidNumEx $NumericEx;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#
|
||||
# Copyright (C) 2002-2005, International Business Machines Corporation
|
||||
# Copyright (C) 2002-2006, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# file: word_ja.txt
|
||||
|
@ -38,12 +38,21 @@ $LF = \u000a;
|
|||
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
|
||||
# 5.0 or later as the definition of Complex_Context was corrected to include all
|
||||
# characters requiring dictionary break.
|
||||
|
||||
$dictionary = [:LineBreak = Complex_Context:];
|
||||
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];
|
||||
|
||||
|
||||
#
|
||||
# Rule 3 Grapheme Clusters behave like their first char.
|
||||
# Rule 4 Ignore trailing Format characters (Also see note in TR 29)
|
||||
#
|
||||
$KatakanaEx = $Katakana $Extend* $Format*;
|
||||
$ALetterEx = $ALetter $Extend* $Format*;
|
||||
$ALetterEx = $ALetterPlus $Extend* $Format*;
|
||||
$MidLetterEx = $MidLetter $Extend* $Format*;
|
||||
$MidNumEx = $MidNum $Extend* $Format*;
|
||||
$NumericEx = $Numeric $Extend* $Format*;
|
||||
|
@ -127,7 +136,7 @@ $ExtendNumLetEx $KatakanaEx {300}; # (13b)
|
|||
|
||||
!!reverse;
|
||||
|
||||
$BackALetterEx = $Format* $Extend* $ALetter;
|
||||
$BackALetterEx = $Format* $Extend* $ALetterPlus;
|
||||
$BackNumericEx = $Format* $Extend* $Numeric;
|
||||
$BackMidNumEx = $Format* $Extend* $MidNum;
|
||||
$BackMidLetterEx = $Format* $Extend* $MidLetter;
|
||||
|
@ -196,6 +205,9 @@ $MidLetter $BackALetterEx;
|
|||
# rule 11
|
||||
$MidNum $BackNumericEx;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
|
@ -228,3 +240,5 @@ $MidLetterEx $ALetterEx;
|
|||
# rule 11
|
||||
$MidNumEx $NumericEx;
|
||||
|
||||
# For dictionary-based break
|
||||
$dictionary $dictionary;
|
||||
|
|
|
@ -1,72 +0,0 @@
|
|||
# Copyright (c) 2002-2005, International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
# word.txt Word Breaking Rules for ICU Rules Based Break Iterator.
|
||||
#
|
||||
# TODO: Shift this over to being based on the current default (non-Thai)
|
||||
# word rules, including exact reverse rules. Postponed
|
||||
# because of interactions with dictionary implementation.
|
||||
|
||||
|
||||
$Katakana = [\p{Word_Break = Katakana}];
|
||||
$ALetter = [\p{Word_Break = ALetter}];
|
||||
$MidLetter = [\p{Word_Break = MidLetter}];
|
||||
$Numeric = [\p{Line_Break = Numeric}];
|
||||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
|
||||
$Hiragana = [\p{Hiragana}];
|
||||
|
||||
$Control = [^\p{Grapheme_Cluster_Break = Control}];
|
||||
$Extend = [\p{Grapheme_Cluster_Break = Extend}];
|
||||
$ALetterEx = $ALetter $Extend*;
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$MidLetterEx = $MidLetter $Extend*;
|
||||
$MidNumEx = $MidNum $Extend*;
|
||||
$ExtendNumLetEx = $ExtendNumLet $Extend*;
|
||||
|
||||
|
||||
|
||||
#
|
||||
# Thai Dictionary Related Rules. Identify runs that will be subdivided into words
|
||||
# using the dictionary.
|
||||
#
|
||||
$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e]; # this rule breaks the iterator with mixed Thai and English
|
||||
$paiyannoi = [\u0e2f];
|
||||
$maiyamok = [\u0e46];
|
||||
$thai_etc = $paiyannoi \u0e25 $paiyannoi;
|
||||
|
||||
|
||||
$dictionary+ ($paiyannoi? $maiyamok)?;
|
||||
$dictionary+ $paiyannoi / ([^\u0e25 $maiyamok $Extend] | \u0e25[^$paiyannoi $Extend]);
|
||||
$thai_etc;
|
||||
|
||||
|
||||
#
|
||||
# The Big Rule. Gloms Non-Thai words together.
|
||||
#
|
||||
$NumericClump = $NumericEx ($MidNumEx? $NumericEx)*;
|
||||
$AlphaClump = $ALetterEx ($MidLetterEx? $ALetterEx)*;
|
||||
($AlphaClump | $NumericClump | $ExtendNumLetEx)+;
|
||||
|
||||
#
|
||||
# Lesser rules
|
||||
#
|
||||
($Hiragana $Extend*)*;
|
||||
($Katakana $Extend*)*;
|
||||
[^$Control] $Extend*;
|
||||
\r\n;
|
||||
.;
|
||||
|
||||
#
|
||||
# Reverse Rules. Back up over any of the chars that can group together.
|
||||
# (Reverse rules do not need to be exact; they can back up a bit too far,
|
||||
# but must back up at least enough.)
|
||||
#
|
||||
! ( $ALetter | $MidLetter | $Numeric | $ExtendNumLet | $MidNum | $Extend )*;
|
||||
! ($Hiragana | $Extend)*;
|
||||
! ($Katakana | $Extend)*;
|
||||
! $Extend* .;
|
||||
! \n\r;
|
||||
|
||||
! ($dictionary | $paiyannoi | $maiyamok | \u0e25)*;
|
|
@ -1,6 +1,6 @@
|
|||
// ***************************************************************************
|
||||
// *
|
||||
// * Copyright (C) 2005 International Business Machines
|
||||
// * Copyright (C) 2005-2006 International Business Machines
|
||||
// * Corporation and others. All Rights Reserved.
|
||||
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
|
||||
// * Source File:<path>/common/main/en_US_POSIX.xml
|
||||
|
@ -31,7 +31,4 @@ en_US_POSIX{
|
|||
"0.000000E+000",
|
||||
}
|
||||
Version{"1.36"}
|
||||
boundaries{
|
||||
word{"word_POSIX"}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
// ***************************************************************************
|
||||
// *
|
||||
// * Copyright (C) 2005 International Business Machines
|
||||
// * Copyright (C) 2005-2006 International Business Machines
|
||||
// * Corporation and others. All Rights Reserved.
|
||||
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
|
||||
// * Source File:<path>/common/main/ja.xml
|
||||
|
@ -1678,9 +1678,6 @@ ja{
|
|||
REVISED{"改訂版"}
|
||||
}
|
||||
Version{"1.68"}
|
||||
boundaries{
|
||||
word{"word_ja"}
|
||||
}
|
||||
calendar{
|
||||
gregorian{
|
||||
AmPmMarkers{
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
// ***************************************************************************
|
||||
// *
|
||||
// * Copyright (C) 2005 International Business Machines
|
||||
// * Copyright (C) 2005-2006 International Business Machines
|
||||
// * Corporation and others. All Rights Reserved.
|
||||
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
|
||||
// * Source File:<path>/common/main/root.xml
|
||||
|
@ -66,13 +66,6 @@ root{
|
|||
210,
|
||||
}
|
||||
Version{"1.63"}
|
||||
boundaries{
|
||||
grapheme{"char"}
|
||||
line{"line"}
|
||||
sentence{"sent"}
|
||||
title{"title"}
|
||||
word{"word"}
|
||||
}
|
||||
calendar{
|
||||
buddhist{
|
||||
DateTimePatterns{
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
// ***************************************************************************
|
||||
// *
|
||||
// * Copyright (C) 2005 International Business Machines
|
||||
// * Copyright (C) 2005-2006 International Business Machines
|
||||
// * Corporation and others. All Rights Reserved.
|
||||
// * Tool: com.ibm.icu.dev.tool.cldr.LDML2ICUConverter.java
|
||||
// * Source File:<path>/common/main/th.xml
|
||||
|
@ -10,7 +10,6 @@
|
|||
* ICU <specials> source: <path>/xml/main/th.xml
|
||||
*/
|
||||
th{
|
||||
BreakDictionaryData:import{"../brkitr/thaidict.brk"}
|
||||
Countries{
|
||||
001{"โลก"}
|
||||
002{"แอฟริกา"}
|
||||
|
@ -558,10 +557,6 @@ th{
|
|||
Thai{"ไทย"}
|
||||
}
|
||||
Version{"1.56"}
|
||||
boundaries{
|
||||
line{"line_th"}
|
||||
word{"word_th"}
|
||||
}
|
||||
calendar{
|
||||
buddhist{
|
||||
DateTimePatterns{
|
||||
|
|
|
@ -2155,6 +2155,8 @@ static void TestResourceLevelAliasing(void) {
|
|||
log_err("Referencing alias didn't get the right string\n");
|
||||
}
|
||||
|
||||
#if 0
|
||||
/* TODO: Needs to be replaced as this data is no longer present! */
|
||||
/* check whether the binary break dictionary data is properly referenced by an alias */
|
||||
uk = ures_findResource("th/BreakDictionaryData", uk, &status);
|
||||
binSequence = ures_getBinary(uk, &binSeqLen, &status);
|
||||
|
@ -2167,6 +2169,7 @@ static void TestResourceLevelAliasing(void) {
|
|||
} else if(binSeqLen != binLen || memcmp(binSequence, binary, binSeqLen) != 0) {
|
||||
log_err("Referencing alias didn't get the right data\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
/* simple alias */
|
||||
testtypes = ures_open(testdatapath, "testtypes", &status);
|
||||
|
|
|
@ -26,6 +26,7 @@
|
|||
#include "filestrm.h"
|
||||
#include "udatamem.h"
|
||||
#include "cintltst.h"
|
||||
#include "ubrkimpl.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
|
@ -1100,6 +1101,12 @@ static void TestICUDataName()
|
|||
|
||||
/* test data swapping ------------------------------------------------------- */
|
||||
|
||||
/* Unfortunately, trie dictionaries are in a C++ header */
|
||||
int32_t
|
||||
triedict_swap(const UDataSwapper *ds,
|
||||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/* test cases for maximum data swapping code coverage */
|
||||
static const struct {
|
||||
const char *name, *type;
|
||||
|
@ -1156,6 +1163,7 @@ static const struct {
|
|||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
{"char", "brk", ubrk_swap},
|
||||
{"thaidict", "ctd", triedict_swap},
|
||||
#endif
|
||||
|
||||
/* the last item should not be #if'ed so that it can reliably omit the last comma */
|
||||
|
@ -1459,6 +1467,11 @@ TestSwapData() {
|
|||
pkg=loadTestData(&errorCode);
|
||||
nm=swapCases[i].name+1;
|
||||
uprv_strcpy(name, "testdata");
|
||||
} else if (uprv_strcmp(swapCases[i].type, "brk")==0
|
||||
|| uprv_strcmp(swapCases[i].type, "ctd")==0) {
|
||||
pkg=U_ICUDATA_BRKITR;
|
||||
nm=swapCases[i].name;
|
||||
uprv_strcpy(name, U_ICUDATA_BRKITR);
|
||||
} else {
|
||||
pkg=NULL;
|
||||
nm=swapCases[i].name;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1999-2005, International Business Machines Corporation and
|
||||
* Copyright (c) 1999-2006, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/************************************************************************
|
||||
|
@ -20,6 +20,7 @@
|
|||
#include "rbbiapts.h"
|
||||
#include "rbbidata.h"
|
||||
#include "cstring.h"
|
||||
#include "ubrkimpl.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/utext.h"
|
||||
|
||||
|
@ -147,8 +148,8 @@ void RBBIAPITest::TestCloneEquals()
|
|||
void RBBIAPITest::TestBoilerPlate()
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
BreakIterator* a = BreakIterator::createLineInstance(Locale("hi"), status);
|
||||
BreakIterator* b = BreakIterator::createLineInstance(Locale("hi_IN"),status);
|
||||
BreakIterator* a = BreakIterator::createWordInstance(Locale("hi"), status);
|
||||
BreakIterator* b = BreakIterator::createWordInstance(Locale("hi_IN"),status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("Creation of break iterator failed %s", u_errorName(status));
|
||||
return;
|
||||
|
@ -156,7 +157,7 @@ void RBBIAPITest::TestBoilerPlate()
|
|||
if(*a!=*b){
|
||||
errln("Failed: boilerplate method operator!= does not return correct results");
|
||||
}
|
||||
BreakIterator* c = BreakIterator::createLineInstance(Locale("th"),status);
|
||||
BreakIterator* c = BreakIterator::createWordInstance(Locale("ja"),status);
|
||||
if(a && c){
|
||||
if(*c==*a){
|
||||
errln("Failed: boilerplate method opertator== does not return correct results");
|
||||
|
@ -864,17 +865,17 @@ void RBBIAPITest::TestBug2190() {
|
|||
void RBBIAPITest::TestRegistration() {
|
||||
#if !UCONFIG_NO_SERVICE
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
BreakIterator* thai_word = BreakIterator::createWordInstance("th_TH", status);
|
||||
BreakIterator* ja_word = BreakIterator::createWordInstance("ja_JP", status);
|
||||
|
||||
// ok to not delete these if we exit because of error?
|
||||
BreakIterator* thai_char = BreakIterator::createCharacterInstance("th_TH", status);
|
||||
BreakIterator* ja_char = BreakIterator::createCharacterInstance("ja_JP", status);
|
||||
BreakIterator* root_word = BreakIterator::createWordInstance("", status);
|
||||
BreakIterator* root_char = BreakIterator::createCharacterInstance("", status);
|
||||
|
||||
URegistryKey key = BreakIterator::registerInstance(thai_word, "xx", UBRK_WORD, status);
|
||||
URegistryKey key = BreakIterator::registerInstance(ja_word, "xx", UBRK_WORD, status);
|
||||
{
|
||||
if (thai_word && *thai_word == *root_word) {
|
||||
errln("thai not different from root");
|
||||
if (ja_word && *ja_word == *root_word) {
|
||||
errln("japan not different from root");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -882,7 +883,7 @@ void RBBIAPITest::TestRegistration() {
|
|||
BreakIterator* result = BreakIterator::createWordInstance("xx_XX", status);
|
||||
UBool fail = TRUE;
|
||||
if(result){
|
||||
fail = *result != *thai_word;
|
||||
fail = *result != *ja_word;
|
||||
}
|
||||
delete result;
|
||||
if (fail) {
|
||||
|
@ -891,14 +892,14 @@ void RBBIAPITest::TestRegistration() {
|
|||
}
|
||||
|
||||
{
|
||||
BreakIterator* result = BreakIterator::createCharacterInstance("th_TH", status);
|
||||
BreakIterator* result = BreakIterator::createCharacterInstance("ja_JP", status);
|
||||
UBool fail = TRUE;
|
||||
if(result){
|
||||
fail = *result != *thai_char;
|
||||
fail = *result != *ja_char;
|
||||
}
|
||||
delete result;
|
||||
if (fail) {
|
||||
errln("bad result for th_TH/char");
|
||||
errln("bad result for ja_JP/char");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -983,8 +984,8 @@ void RBBIAPITest::TestRegistration() {
|
|||
}
|
||||
|
||||
|
||||
// that_word was adopted by factory
|
||||
delete thai_char;
|
||||
// ja_word was adopted by factory
|
||||
delete ja_char;
|
||||
delete root_word;
|
||||
delete root_char;
|
||||
#endif
|
||||
|
@ -995,7 +996,7 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) {
|
|||
UParseError parseError;
|
||||
parseError.line = 0;
|
||||
parseError.offset = 0;
|
||||
UDataMemory *data = udata_open(NULL, "brk", dataFile, &status);
|
||||
UDataMemory *data = udata_open(U_ICUDATA_BRKITR, "brk", dataFile, &status);
|
||||
uint32_t length;
|
||||
const UChar *builtSource;
|
||||
const uint8_t *rbbiRules;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1999-2005, International Business Machines Corporation and
|
||||
* Copyright (c) 1999-2006, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/************************************************************************
|
||||
|
@ -493,9 +493,11 @@ void RBBITest::TestMaiyamok()
|
|||
ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07\\u0e40\\u0e17\\u0e1e", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35\\u0e22\\u0e07", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
|
||||
ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
|
||||
|
||||
RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
|
||||
|
@ -2055,7 +2057,10 @@ RBBIWordMonkey::RBBIWordMonkey() : fGCFMatcher(0)
|
|||
|
||||
fSets = new UVector(status);
|
||||
|
||||
fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}]", status);
|
||||
fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
|
||||
"[\\p{Line_Break = Complex_Context}"
|
||||
"-\\p{Grapheme_Cluster_Break = Extend}"
|
||||
"-\\p{Grapheme_Cluster_Break = Control}]]", status);
|
||||
fKatakanaSet = new UnicodeSet("[\\p{Word_Break = Katakana}]", status);
|
||||
fMidLetterSet = new UnicodeSet("[\\p{Word_Break = MidLetter}]", status);
|
||||
fMidNumSet = new UnicodeSet("[\\p{Word_Break = MidNum}]", status);
|
||||
|
@ -2063,6 +2068,7 @@ RBBIWordMonkey::RBBIWordMonkey() : fGCFMatcher(0)
|
|||
fFormatSet = new UnicodeSet("[\\p{Word_Break = Format}]", status);
|
||||
fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]", status);
|
||||
fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]", status);
|
||||
|
||||
fOtherSet = new UnicodeSet();
|
||||
if(U_FAILURE(status)) {
|
||||
deferredStatus = status;
|
||||
|
|
|
@ -261,7 +261,7 @@ void UObjectTest::testIDs()
|
|||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
/* TESTCLASSID_ABSTRACT(BreakIterator); No staticID! */
|
||||
TESTCLASSID_FACTORY(RuleBasedBreakIterator, BreakIterator::createLineInstance("mt",status));
|
||||
TESTCLASSID_FACTORY(DictionaryBasedBreakIterator, BreakIterator::createLineInstance("th",status));
|
||||
//TESTCLASSID_FACTORY(DictionaryBasedBreakIterator, BreakIterator::createLineInstance("th",status));
|
||||
#endif
|
||||
|
||||
//TESTCLASSID_DEFAULT(EscapeTransliterator);
|
||||
|
|
12
icu4c/source/test/testdata/rbbitst.txt
vendored
12
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -1,4 +1,4 @@
|
|||
# Copyright (c) 2001-2005 International Business Machines
|
||||
# Copyright (c) 2001-2006 International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# RBBI Test Data
|
||||
|
@ -512,14 +512,14 @@ What is the proper use of the abbreviation pp.•? •Yes, I am definatelly 12"
|
|||
# Test data originally from the test code source file
|
||||
# // @suwit -- Thai sample data from GVT Guideline
|
||||
#
|
||||
<data>•\u0E2B\u0E19\u0E36\u0E48\u0E07•\u0E04\u0E33•\u0E44\u0E17\u0E22•\
|
||||
\u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16•\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A•\
|
||||
\u0E14\u0E49\u0E27\u0E22•\u0e2b\u0e25\u0e32\u0e22•\
|
||||
\u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c•</data>
|
||||
<data>•\u0E2B\u0E19\u0E36\u0E48\u0E07<200>\u0E04\u0E33<200>\u0E44\u0E17\u0E22<200>\
|
||||
\u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<200>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<200>\
|
||||
\u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\
|
||||
\u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200></data>
|
||||
|
||||
#
|
||||
# Jitterbug 3671 Test Case
|
||||
#
|
||||
<data>•สวัสดี•ครับ•สบาย•ดี•ไหม• •ครับ•</data>
|
||||
<data>•สวัสดี<200>ครับ<200>สบาย<200>ดี<200>ไหม<200> •ครับ<200></data>
|
||||
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
บท ที่ ๑ พายุ ไซโคลน
|
||||
โดโรธี อาศัย อยู่ ท่าม กลาง ทุ่ง ใหญ่ ใน แคนซัส กับ ลุง เฮนรี ชาว ไร่ และ ป้า เอ็ม ภรรยา ชาวไร่
|
||||
บท ที่๑พายุ ไซโคลน
|
||||
โด โรธี อาศัย อยู่ ท่ามกลาง ทุ่งใหญ่ ใน แคนซัส กับ ลุง เฮ นรี ชาวไร่ และ ป้า เอ็ม ภรรยา ชาวไร่
|
||||
|
||||
บ้าน ของ พวก เขา หลัง เล็ก เพราะ ไม้ สร้าง บ้าน ต้อง ขน มา ด้วย เกวียน เป็น
|
||||
|
||||
|
@ -19,52 +19,52 @@
|
|||
|
||||
แล้ว ก็ มี เตียง นอน
|
||||
|
||||
ลุง เฮนรี กับ ป้า เอ็ม มี เตียง นอน ใหญ่ อยู่ ที่ มุม หนึ่ง
|
||||
ลุง เฮ นรี กับ ป้า เอ็ม มี เตียง นอน ใหญ่ อยู่ ที่ มุม หนึ่ง
|
||||
|
||||
ส่วน โดโรธี มี เตียง เล็ก อีก ที่ มุม หนึ่ง
|
||||
ส่วน โด โร ธีมี เตียง เล็ก อีก ที่ มุม หนึ่ง
|
||||
|
||||
ไม่ มี ห้อง ใต้ เพดาน เลย ห้อง ใต้ถุน ก็ ไม่ มี
|
||||
ไม่มี ห้อง ใต้ เพดาน เลย ห้อง ใต้ถุน ก็ ไม่มี
|
||||
|
||||
เว้น แต่ มี โพรง เล็กๆ
|
||||
เว้น แต่ มี โพ รง เล็กๆ
|
||||
|
||||
ที่ ขุด ไป ใต้
|
||||
|
||||
พื้น เรียก ว่า
|
||||
|
||||
" โพรง ไซโคลน "
|
||||
"โพรง ไซโคลน"
|
||||
|
||||
เป็น ที่ ครอบครัว นี้ จะ มุด เข้า ไป เมื่อ เกิด ลม มหาภัย
|
||||
เป็น ที่ ครอบครัว นี้ จะ มุด เข้าไป เมื่อ เกิด ลม มหา ภัย
|
||||
|
||||
ซึ่ง กระโชก แรง จน บด ขยี้ สิ่ง ก่อ สร้าง ใด ๆ
|
||||
ซึ่ง กระโชก แรง จน บดขยี้ สิ่ง ก่อ สร้าง ใดๆ
|
||||
|
||||
ที่ ขวาง ทาง มัน ได้ ตรง กลาง พื้น มี ฝา เปิด เข้า ไป
|
||||
ที่ ขวาง ทาง มัน ได้ ตรง กลาง พื้น มี ฝา เปิด เข้าไป
|
||||
|
||||
จาก นั้น มี บันได ลง ไป ถึง โพรง มืด เล็ก ๆ
|
||||
จาก นั้น มี บันได ลง ไป ถึง โพรง มืด เล็กๆ
|
||||
|
||||
|
||||
เมื่อ โดโรธี ยืน ที่ ปาก ประตู และ มอง ไป รอบ ๆ
|
||||
เมื่อ โด โรธี ยืน ที่ ปาก ประตู และ มอง ไป รอบๆ
|
||||
|
||||
เธอ ไม่ เห็น อะไร นอก จาก ท้อง ทุ่ง กว้าง สี เทา หม่น ทั่ว ทุก ด้าน
|
||||
เธอ ไม่ เห็น อะไร นอกจาก ท้อง ทุ่ง กว้าง สี เทา หม่น ทั่ว ทุก ด้าน
|
||||
|
||||
ไม่ มี แม้ ต้นไม้ สัก ต้น หรือ บ้าน สัก หลัง ที่ โผล่ พ้น ภูมิ ประเทศ อัน ราบ เรียบ
|
||||
ไม่มี แม้ ต้นไม้ สัก ต้น หรือ บ้าน สัก หลัง ที่ โผล่ พ้น ภูมิประเทศ อัน ราบ เรียบ
|
||||
|
||||
แผ่ ไป ไกล จน จด ขอบ ฟ้า ทั่ว ทุก ทิศ
|
||||
|
||||
ดวง ตะวัน เผา ผืน ดิน ที่ ไถ แล้ว จน กลาย เป็น แผ่น มหึมา สี ดำ
|
||||
ดวงตะวัน เผา ผืน ดิน ที่ ไถ แล้ว จน กลาย เป็น แผ่น มหึมา สี ดำ
|
||||
|
||||
มี รอย แตกระแหง อยู่ ตลอด
|
||||
มี รอย แตก ระแหง อยู่ ตลอด
|
||||
|
||||
แม้แต่ หญ้า ก็ ไม่ เขียว
|
||||
|
||||
เพราะ ดวง ตะวัน เผา ยอด ใบ ยาว เสีย จน เป็น สี เทา หม่น มอง เห็น อยู่ ทั่ว ไป
|
||||
เพราะ ดวงตะวัน เผา ยอด ใบ ยาว เสีย จน เป็น สี เทา หม่น มอง เห็น อยู่ ทั่วไป
|
||||
|
||||
ครั้ง หนึ่ง เคย ทา สี บ้านเอาไว้
|
||||
ครั้ง หนึ่ง เคย ทาสี บ้าน เอา ไว้
|
||||
|
||||
แต่ ก็ ถูก ดวง ตะวัน เผา เสีย จน สี พอง
|
||||
แต่ ก็ ถูก ดวงตะวัน เผา เสีย จน สี พอง
|
||||
|
||||
แล้ว ฝน ก็ ชะมัน หลุด ไป จน หมด
|
||||
แล้ว ฝน ก็ ชะ มัน หลุด ไป จน หมด
|
||||
|
||||
และ ตอน นี้ บ้าน จึง ดู หม่นหมอง เป็น สี เทา เหมือน สิ่ง อื่น ๆ
|
||||
และ ตอน นี้ บ้าน จึง ดู หม่นหมอง เป็น สี เทา เหมือน สิ่ง อื่นๆ
|
||||
|
||||
ด้วย
|
||||
|
||||
|
@ -75,13 +75,13 @@
|
|||
|
||||
เป็น ภรรยา ที่ งดงาม
|
||||
|
||||
แล้ว แดด และ ลม ก็ ได้ เปลี่ยน เธอ ไป
|
||||
แล้ว แดด และ ลม ก็ได้ เปลี่ยน เธอ ไป
|
||||
|
||||
เอา ประกาย ไป จาก ดวงตา เธอ ปล่อย ไว้ แต่ ความ สุขุม อย่าง หม่นหมอง
|
||||
|
||||
เอา สี แดง จาก แก้ม และ ริมฝีปาก เธอ ไป
|
||||
เอา สี แดง จาก แก้ม และ ริม ฝีปาก เธอ ไป
|
||||
|
||||
กลาย เป็น สี หม่น ๆ
|
||||
กลาย เป็น สี หม่นๆ
|
||||
|
||||
เหมือน กัน
|
||||
|
||||
|
@ -89,62 +89,62 @@
|
|||
|
||||
และ เดี๋ยว นี้ ไม่ เคย ยิ้ม เลย
|
||||
|
||||
เมื่อ โดโรธี ซึ่ง เป็น เด็ก กำพร้า มา อยู่ กับ เธอ ตอน แรก
|
||||
เมื่อ โด โรธี ซึ่ง เป็น เด็ก กำพร้า มา อยู่ กับ เธอ ตอน แรก
|
||||
|
||||
ป้า เอ็ม ตื่น เต้น กับ เสียง หัวเราะ ของ เด็ก น้อย มาก
|
||||
|
||||
เธอ จะส่ง เสียง ร้อง แล้ว เอา มือ ทาบ อก ทุก ครั้ง ที่ เสียง อัน ร่าเริง ของ โดโรธี เข้า หู เธอ
|
||||
เธอ จะ ส่ง เสียง ร้อง แล้ว เอา มือ ทาบ อก ทุก ครั้ง ที่ เสียง อัน ร่าเริง ของ โด โรธี เข้าหู เธอ
|
||||
|
||||
และ เธอ เฝ้า มอง เด็ก หญิง น้อย ๆ
|
||||
และ เธอ เฝ้า มอง เด็ก หญิง น้อยๆ
|
||||
|
||||
ด้วย ความ ประหลาด ใจ
|
||||
|
||||
ด้วย ยัง หา อะไร มา เป็น เรื่อง หัวเราะ ได้
|
||||
|
||||
|
||||
ลุง เฮนรี ไม่ เคย หัวเราะ
|
||||
ลุง เฮ นรี ไม่ เคย หัวเราะ
|
||||
|
||||
ลุง ทำงาน หนัก จาก เช้า ยัน ค่ำ
|
||||
|
||||
และ ไม่ เคย รู้จัก ว่า ความ ร่าเริง คือ อะไร
|
||||
|
||||
ลุง ดู หม่นหมอง ไป หมด ตั้ง แต่ เครา ยาว จน จด รองเท้า บูต อัน หยาบ
|
||||
ลุง ดู หม่นหมอง ไป หมด ตั้งแต่ เครา ยาว จน จด รองเท้า บูต อัน หยาบ
|
||||
|
||||
แล้ว ลุง ก็ ดู เคร่งขรึม น่า เกรงขาม ไม่ ค่อย จะ พูด
|
||||
แล้ว ลุง ก็ ดู เคร่งขรึม น่า เกรง ขาม ไม่ ค่อย จะ พูด
|
||||
|
||||
|
||||
มี โตโต้ ที่ ทำ ให้ โดโรธี หัวเราะ ได้
|
||||
มี โต โต้ ที่ ทำให้ โด โรธี หัวเราะ ได้
|
||||
|
||||
และ ช่วย เธอ ให้ พ้น จาก การ กลาย เป็น สี เทา หม่นเหมือน กับ สิ่ง รอบ ตัว อื่น ๆ
|
||||
และ ช่วย เธอ ให้ พ้น จาก การก ลาย เป็น สี เทา หม่น เหมือน กับ สิ่ง รอบ ตัว อื่นๆ
|
||||
|
||||
โตโต้ สี ไม่ เทา หม่น
|
||||
โต โต้ สี ไม่ เทา หม่น
|
||||
|
||||
แต่ มัน เป็น หมา สี ดำ ตัว น้อย ๆ
|
||||
แต่ มัน เป็น หมา สี ดำ ตัว น้อยๆ
|
||||
|
||||
ขน ยาว ปุย ราว กับ ไหม
|
||||
ขน ยาว ปุย ราวกับ ไหม
|
||||
|
||||
มี ตา ดำ เล็ก เป็น ประกาย รื่นเริง อยู่ สอง ข้าง จมูก เล็ก อัน น่า ขัน ของ มัน
|
||||
|
||||
โตโต้ เล่น ทั้ง วัน
|
||||
โต โต้ เล่น ทั้ง วัน
|
||||
|
||||
และ โดโรธี ก็ เล่น กับ มัน
|
||||
และ โด โรธี ก็ เล่น กับ มัน
|
||||
|
||||
และ รัก มัน เหลือ เกิน
|
||||
|
||||
|
||||
อย่างไร ก็ ตาม
|
||||
อย่างไร ก็ตาม
|
||||
|
||||
วัน นี้ ทั้ง คู่ ไม่ ได้ เล่น
|
||||
|
||||
ลุง เฮนรี นั่ง อยู่ ที่ บันได ประตู และ เฝ้า กังวล จ้อง ดู ท้อง ฟ้า สี เทา หม่น ผิด ปกติ
|
||||
ลุง เฮ นรี นั่ง อยู่ ที่ บันได ประตู และ เฝ้า กังวล จ้อง ดู ท้องฟ้า สี เทา หม่น ผิด ปกติ
|
||||
|
||||
โดโรธี ยืน ที่ ประตู
|
||||
โด โรธี ยืน ที่ ประตู
|
||||
|
||||
กอด โตโต้ ไว้ ใน อ้อม แขน
|
||||
กอด โต โต้ ไว้ ใน อ้อม แขน
|
||||
|
||||
และ ก็ มอง ดู ท้อง ฟ้า อยู่ เหมือน กัน
|
||||
และ ก็ มอง ดู ท้องฟ้า อยู่ เหมือน กัน
|
||||
|
||||
ป้า เอ็ม กำลัง ล้าง ชาม อยู่
|
||||
ป้า เอ็ มกำ ลัง ล้าง ชาม อยู่
|
||||
|
||||
|
||||
|
||||
|
@ -152,25 +152,25 @@
|
|||
|
||||
มี เสียง ลม คราง แผ่ว เบา ได้ยิน มา
|
||||
|
||||
ลุง เฮนรี และ โดโรธี เห็น ต้น หญ้า สูง เอน เป็น คลื่น ก่อน ที่ พายุ จะ มา ถึง
|
||||
ลุง เฮ นรี และ โด โรธี เห็น ต้น หญ้า สูง เอน เป็น คลื่น ก่อน ที่ พายุ จะ มา ถึง
|
||||
|
||||
แล้ว ก็ มี เสียง หวีดหวิว ชัดเจน มา จาก บรรยากาศ ทาง ใต้
|
||||
แล้ว ก็ มี เสียง หวีด หวิว ชัดเจน มา จาก บรรยากาศ ทาง ใต้
|
||||
|
||||
และ เมื่อ เหลือบ ตา ไป ทาง ด้าน นั้น ก็ เห็น คลื่น หญ้า มา ทาง ด้าน นั้น ด้วย
|
||||
|
||||
|
||||
|
||||
ลุง เฮนรี ผุด ลุก ขึ้น ทัน ใด
|
||||
ลุง เฮ นรี ผุด ลุก ขึ้น ทันใด
|
||||
|
||||
|
||||
|
||||
" ลม ไซโคลน มา
|
||||
"ลม ไซโคลน มา
|
||||
|
||||
เอ็ม "
|
||||
เอ็ม"
|
||||
|
||||
ลุง ร้อง บอก ภรรยา
|
||||
|
||||
" ข้า จะ ไป ดู สัตว์ เลี้ยง หน่อย "
|
||||
"ข้า จะ ไป ดู สัตว์ เลี้ยง หน่อย"
|
||||
|
||||
แล้ว ลุง ก็ วิ่ง ไป ยัง เพิง ที่ วัว และ ม้า อาศัย อยู่
|
||||
|
||||
|
@ -184,31 +184,31 @@
|
|||
|
||||
|
||||
|
||||
" เร็ว โดโรธี ! "
|
||||
"เร็ว โด โรธี!"
|
||||
|
||||
ป้า ตะโกน
|
||||
|
||||
|
||||
|
||||
" วิ่ง ไป ห้อง ใต้ถุน "
|
||||
"วิ่ง ไป ห้อง ใต้ถุน"
|
||||
|
||||
|
||||
|
||||
โตโต้ ผลุน กระโดด ลง จาก อ้อมแขน โดโรธี
|
||||
โต โต้ ผลุน กระโดด ลง จาก อ้อม แขน โด โรธี
|
||||
|
||||
แล้ว เข้า ไป ซ่อน อยู่ ใต้ เตียง
|
||||
แล้ว เข้าไป ซ่อน อยู่ ใต้ เตียง
|
||||
|
||||
เด็ก หญิง น้อย เข้า ไป ดึง มัน ออก มา
|
||||
เด็ก หญิง น้อย เข้าไป ดึง มัน ออก มา
|
||||
|
||||
ป้า เอ็ม กระชาก ฝา ที่ พื้น ออก อย่าง อก สั่น ขวัญ หาย
|
||||
ป้า เอ็ มก ระ ชาก ฝา ที่ พื้น ออก อย่าง อก สั่น ขวัญ หาย
|
||||
|
||||
ปีน บันได ไม้ ลง ไป ใน โพรง เล็ก อัน มืด ทึบ
|
||||
|
||||
โดโรธี จับ โตโต้ ได้ ใน ที่ สุด
|
||||
โด โรธี จับ โต โต้ ได้ ใน ที่สุด
|
||||
|
||||
และ วิ่ง ตาม ป้า เธอ ไป
|
||||
|
||||
เมื่อ เธอ มา ได้ ครึ่ง ห้อง ก็ มี เสียง หวีดหวือ
|
||||
เมื่อ เธอ มา ได้ ครึ่ง ห้อง ก็ มี เสียง หวีด หวือ
|
||||
|
||||
ส่วน บ้าน ก็ สั่น อย่าง แรง จน เธอ หก คะมำ นั่ง จ้ำเบ้า อยู่ กับ พื้น
|
||||
|
||||
|
@ -218,39 +218,39 @@
|
|||
|
||||
|
||||
|
||||
บ้าน หมุน ไป หมุน มา สอง สาม รอบ
|
||||
บ้าน หมุน ไป หมุน มาส อง สาม รอบ
|
||||
|
||||
แล้ว ก็ ลอย ขึ้น สู่ อากาศ อย่าง ช้า ๆ
|
||||
แล้ว ก็ ลอย ขึ้น สู่ อากาศ อย่าง ช้าๆ
|
||||
|
||||
โดโรธี รู้สึก ราว กับ ว่า เธอ ได้ ขึ้น ไป กับ ลูก บอลลูน
|
||||
โด โร ธีรู้ สึก ราวกับ ว่า เธอ ได้ ขึ้น ไป กับ ลูก บอลลูน
|
||||
|
||||
|
||||
|
||||
พายุ เหนือ กับ พายุ ใต้ มา พบ กัน ตรง ที่ บ้าน พอ ดี
|
||||
พายุ เหนือ กับ พายุ ใต้ มา พบ กัน ตรง ที่ บ้าน พอดี
|
||||
|
||||
และ ทำ ให้ ตรง นั้น เป็น จุด ศูนย์ กลาง ของ พายุ ไซโคลน
|
||||
และ ทำให้ ตรง นั้น เป็น จุดศูนย์กลาง ของ พายุ ไซโคลน
|
||||
|
||||
ตาม ปกติ ตรง กลาง พายุ ไซโคลน อากาศ จะ นิ่ง
|
||||
|
||||
แต่ ความ กดดัน อย่าง หนัก ของ ลม ทุก ด้าน รอบ บ้าน
|
||||
|
||||
ทำ ให้ บ้าน ลอย สูง ขึ้น ๆ
|
||||
ทำให้ บ้าน ลอย สูง ขึ้นๆ
|
||||
|
||||
จน กระทั่ง ขึ้น ไป อยู่ สุด ยอด ของ พายุ ไซโคลน
|
||||
|
||||
และ จาก ตรง นั้น ก็ ถูก หอบ ไป หลาย ไมล์
|
||||
|
||||
ง่าย ดาย ราว กับ หอบ ขน นก
|
||||
ง่ายดาย ราวกับ หอบ ขน นก
|
||||
|
||||
|
||||
|
||||
มืด มาก แล้ว
|
||||
|
||||
ลม ยัง ส่ง เสียง หวีดหวือ น่า กลัว อยู่ รอบ ตัว เธอ
|
||||
ลม ยัง ส่ง เสียง หวีด หวือ น่า กลัว อยู่ รอบ ตัว เธอ
|
||||
|
||||
แต่ โดโรธี เห็น ว่า เธอ สามารถ นั่ง ไป ได้ อย่า งง่าย ดาย นัก
|
||||
แต่ โด โรธี เห็น ว่า เธอ สามารถ นั่ง ไป ได้ อย่าง ง่ายดาย นัก
|
||||
|
||||
ครั้ง หนึ่ง หลัง จาก ที่ บ้าน สะดุด อย่าง แรง และ หมุน ไป รอบ ๆ
|
||||
ครั้ง หนึ่ง หลัง จาก ที่ บ้าน สะดุด อย่าง แรง และ หมุน ไป รอบๆ
|
||||
|
||||
สอง สาม ครั้ง ใน ตอน แรก
|
||||
|
||||
|
@ -258,17 +258,17 @@
|
|||
|
||||
|
||||
|
||||
โตโต้ ไม่ ชอบ ใจ เลย
|
||||
โต โต้ ไม่ ชอบใจ เลย
|
||||
|
||||
มัน วิ่ง ไป วิ่ง มา รอบ ห้อง
|
||||
มัน วิ่ง ไป วิ่ง มาร อบ ห้อง
|
||||
|
||||
ทาง โน้น ที ทาง นี้ ที ส่ง เสียง เห่า ดัง ก้อง
|
||||
|
||||
แต่ โดโรธี นั่ง นิ่ง อยู่ บน พื้น เฝ้า คอย ดู ว่า จะ เกิด อะไร ขึ้น
|
||||
แต่ โด โรธี นั่ง นิ่ง อยู่ บน พื้น เฝ้า คอย ดู ว่า จะ เกิด อะไร ขึ้น
|
||||
|
||||
|
||||
|
||||
ครั้ง หนึ่ง โตโต้ เข้า ไป ใกล้ ฝา ที่ พื้น มาก ไป
|
||||
ครั้ง หนึ่ง โต โต้ เข้าไป ใกล้ ฝา ที่ พื้น มาก ไป
|
||||
|
||||
เลย พลัด ตกลง ไป
|
||||
|
||||
|
@ -276,9 +276,9 @@
|
|||
|
||||
แต่ ชั่ว ครู่ เธอ ก็ เห็น หู ของ มัน โผล่ ขึ้น มา จาก ช่อง นั้น
|
||||
|
||||
ทั้ง นี้ เพราะ แรง กด อย่าง หนัก ของ อากาศ ทำ ให้ โตโต้ ไม่ ตกลง ไป ข้าง ล่าง
|
||||
ทั้งนี้ เพราะ แรง กด อย่าง หนัก ของ อากาศ ทำให้ โต โต้ ไม่ ตกลง ไป ข้าง ล่าง
|
||||
|
||||
โดโรธี คลาน ไป ที่ ช่อง นั้น จับ หู โตโต้ ไว้ ได้
|
||||
โด โรธี คลาน ไป ที่ ช่อง นั้น จับ หู โต โต้ ไว้ ได้
|
||||
|
||||
และ ลาก มัน มา ที่ ห้อง อีก
|
||||
|
||||
|
@ -288,33 +288,33 @@
|
|||
|
||||
ชั่วโมง แล้ว ชั่วโมง เล่า ผ่าน ไป
|
||||
|
||||
โดโรธี ค่อย ๆ
|
||||
โด โรธี ค่อยๆ
|
||||
|
||||
หาย กลัว
|
||||
|
||||
แต่ เธอ รู้สึก เหงา เหลือ เกิน
|
||||
|
||||
และ ลม ก็ ส่ง เสียง หวีดหวือ ดัง เสีย จน เธอ แทบ จะ หู หนวก
|
||||
และ ลม ก็ ส่ง เสียง หวีด หวือ ดัง เสีย จน เธอ แทบ จะ หู หนวก
|
||||
|
||||
ที แรก เธอ สงสัย ว่า คง จะ ถูก ฉีก กระชาก ออก เป็น ชิ้น เล็ก ชิ้น น้อย เมื่อ บ้าน เอน ล้ม ลง อีก ครั้ง
|
||||
ที แรก เธอ สงสัย ว่า คงจะ ถูก ฉีก กระชาก ออก เป็น ชิ้น เล็ก ชิ้น น้อย เมื่อ บ้าน เอน ล้ม ลง อีก ครั้ง
|
||||
|
||||
แต่ หลาย ชั่วโมง ผ่าน ไป ก็ ไม่ มี อะไร เกิด ขึ้น เธอ เลย เลิก วิตก และ ตัดสิน ใจ คอย ดู อย่าง สงบ
|
||||
แต่ หลาย ชั่วโมง ผ่าน ไป ก็ ไม่มี อะไร เกิด ขึ้น เธอ เลย เลิก วิตก และ ตัดสิน ใจ คอย ดู อย่าง สงบ
|
||||
|
||||
และ รอ ว่า อนาคต จะ เป็น อย่างไร
|
||||
|
||||
ใน ที่ สุด เธอ คลาน จาก พื้น ห้อง ที่ โยก ไป มา ขึ้น ไป บน เตียง
|
||||
ใน ที่สุด เธอ คลาน จาก พื้น ห้อง ที่ โยก ไป มา ขึ้น ไป บน เตียง
|
||||
|
||||
แล้ว ก็ นอน ลง
|
||||
|
||||
โตโต้ ตาม ติด มา นอน ลง ใกล้ ๆ
|
||||
โต โต้ ตาม ติด มา นอน ลง ใกล้ๆ
|
||||
|
||||
เธอ
|
||||
|
||||
|
||||
|
||||
ไม่ ช้า โดโรธี ก็ ปิด ตา ลง หลับ ผล็อย ไป อย่าง สนิท ทั้ง ๆ
|
||||
ไม่ ช้า โด โรธี ก็ ปิด ตา ลง หลับ ผล็อย ไป อย่าง สนิท ทั้งๆ
|
||||
|
||||
ที่ บ้าน โยก ไป มา และ ลม ก็ คราง หวีดหวือ
|
||||
ที่ บ้าน โยก ไป มา และ ลม ก็ คราง หวีด หวือ
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1998-2003, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
* Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
|
||||
* and others. All Rights Reserved. *
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
|
@ -15,6 +15,8 @@
|
|||
#include "unicode/brkiter.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/ustring.h"
|
||||
|
||||
/*
|
||||
* This program takes a Unicode text file containing Thai text with
|
||||
|
@ -68,6 +70,9 @@ private:
|
|||
|
||||
// current space count
|
||||
int32_t fSpaceCount;
|
||||
|
||||
// UnicodeSet of SA characters
|
||||
UnicodeSet fComplexContext;
|
||||
|
||||
// true when fBreakIter has returned DONE
|
||||
UBool fDone;
|
||||
|
@ -386,6 +391,47 @@ const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count,
|
|||
return noSpaces;
|
||||
}
|
||||
|
||||
/*
|
||||
* Generate a text file with spaces in it from a file without.
|
||||
*/
|
||||
int generateFile(const UChar *chars, int32_t length) {
|
||||
Locale root("");
|
||||
UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
UnicodeString saSet("[:LineBreak=SA:]", -1, US_INV);
|
||||
UnicodeSet complexContext(saSet, status);
|
||||
BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);
|
||||
breakIter->adoptText(noSpaceIter);
|
||||
char outbuf[1024];
|
||||
int32_t strlength;
|
||||
UChar bom = 0xFEFF;
|
||||
|
||||
printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));
|
||||
int32_t prevbreak = 0;
|
||||
while (U_SUCCESS(status)) {
|
||||
int32_t nextbreak = breakIter->next();
|
||||
if (nextbreak == BreakIterator::DONE) {
|
||||
break;
|
||||
}
|
||||
printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],
|
||||
nextbreak-prevbreak, &status));
|
||||
if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])
|
||||
&& complexContext.contains(chars[nextbreak])) {
|
||||
printf(" ");
|
||||
}
|
||||
prevbreak = nextbreak;
|
||||
}
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
fprintf(stderr, "generate failed: %s\n", u_errorName(status));
|
||||
return status;
|
||||
}
|
||||
else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* The main routine. Read the command line arguments, read the text file,
|
||||
* remove the spaces, do the comparison and report the final results
|
||||
|
@ -395,6 +441,12 @@ int main(int argc, char **argv)
|
|||
char *fileName = "space.txt";
|
||||
int arg = 1;
|
||||
UBool verbose = FALSE;
|
||||
UBool generate = FALSE;
|
||||
|
||||
if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {
|
||||
generate = TRUE;
|
||||
arg += 1;
|
||||
}
|
||||
|
||||
if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
|
||||
verbose = TRUE;
|
||||
|
@ -418,6 +470,10 @@ int main(int argc, char **argv)
|
|||
if (spaces == 0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (generate) {
|
||||
return generateFile(spaces, spaceCount);
|
||||
}
|
||||
|
||||
noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
|
||||
|
||||
|
@ -441,11 +497,13 @@ int main(int argc, char **argv)
|
|||
SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
|
||||
: fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)
|
||||
{
|
||||
UnicodeString saSet("[:LineBreak=SA:]", -1, US_INV);
|
||||
UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Locale us("us");
|
||||
fComplexContext.applyPattern(saSet, status);
|
||||
Locale root("");
|
||||
|
||||
fBreakIter = BreakIterator::createWordInstance(us, status);
|
||||
fBreakIter = BreakIterator::createWordInstance(root, status);
|
||||
fBreakIter->adoptText(iter);
|
||||
}
|
||||
|
||||
|
@ -471,12 +529,17 @@ int32_t SpaceBreakIterator::next()
|
|||
return BreakIterator::DONE;
|
||||
}
|
||||
|
||||
int32_t nextBreak = fBreakIter->next();
|
||||
|
||||
if (nextBreak == BreakIterator::DONE) {
|
||||
fDone = TRUE;
|
||||
return BreakIterator::DONE;
|
||||
int32_t nextBreak;
|
||||
do {
|
||||
nextBreak = fBreakIter->next();
|
||||
|
||||
if (nextBreak == BreakIterator::DONE) {
|
||||
fDone = TRUE;
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
}
|
||||
while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])
|
||||
&& fComplexContext.contains(fText[nextBreak]));
|
||||
|
||||
int32_t result = nextBreak - fSpaceCount;
|
||||
|
||||
|
|
|
@ -15,7 +15,7 @@ subdir = tools
|
|||
|
||||
SUBDIRS = toolutil ctestfw makeconv genrb genuca genbrk \
|
||||
gennames genpname gencnval gensprep genccode gencmn icupkg pkgdata \
|
||||
gentest genprops gencase genbidi gennorm
|
||||
gentest genprops gencase genbidi gennorm genctd
|
||||
|
||||
## List of phony targets
|
||||
.PHONY : all all-local all-recursive install install-local \
|
||||
|
|
95
icu4c/source/tools/genctd/Makefile.in
Normal file
95
icu4c/source/tools/genctd/Makefile.in
Normal file
|
@ -0,0 +1,95 @@
|
|||
## Makefile.in for ICU - tools/genctd
|
||||
## Copyright (c) 2002-2006 International Business Machines Corporation and
|
||||
## others. All Rights Reserved.
|
||||
|
||||
## Source directory information
|
||||
srcdir = @srcdir@
|
||||
top_srcdir = @top_srcdir@
|
||||
|
||||
top_builddir = ../..
|
||||
|
||||
include $(top_builddir)/icudefs.mk
|
||||
|
||||
## Build directory information
|
||||
subdir = tools/genctd
|
||||
|
||||
TARGET_STUB_NAME = genctd
|
||||
|
||||
SECTION = 1
|
||||
|
||||
MAN_FILES = $(TARGET_STUB_NAME).$(SECTION)
|
||||
|
||||
|
||||
## Extra files to remove for 'make clean'
|
||||
CLEANFILES = *~ $(DEPS) $(MAN_FILES)
|
||||
|
||||
## Target information
|
||||
TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
|
||||
|
||||
ifneq ($(top_builddir),$(top_srcdir))
|
||||
CPPFLAGS += -I$(top_builddir)/common
|
||||
endif
|
||||
CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil
|
||||
LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
|
||||
|
||||
OBJECTS = genctd.o
|
||||
|
||||
DEPS = $(OBJECTS:.o=.d)
|
||||
|
||||
## List of phony targets
|
||||
.PHONY : all all-local install install-local clean clean-local \
|
||||
distclean distclean-local dist dist-local check check-local install-man
|
||||
|
||||
## Clear suffix list
|
||||
.SUFFIXES :
|
||||
|
||||
## List of standard targets
|
||||
all: all-local
|
||||
install: install-local
|
||||
clean: clean-local
|
||||
distclean : distclean-local
|
||||
dist: dist-local
|
||||
check: all check-local
|
||||
|
||||
all-local: $(TARGET) $(MAN_FILES)
|
||||
|
||||
install-local: all-local install-man
|
||||
$(MKINSTALLDIRS) $(DESTDIR)$(bindir)
|
||||
$(INSTALL) $(TARGET) $(DESTDIR)$(bindir)
|
||||
|
||||
install-man: $(MAN_FILES)
|
||||
$(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION)
|
||||
$(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION)
|
||||
|
||||
dist-local:
|
||||
|
||||
clean-local:
|
||||
test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
|
||||
$(RMV) $(TARGET) $(OBJECTS)
|
||||
|
||||
distclean-local: clean-local
|
||||
$(RMV) Makefile
|
||||
|
||||
check-local: all-local
|
||||
|
||||
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
cd $(top_builddir) \
|
||||
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
|
||||
|
||||
$(TARGET) : $(OBJECTS)
|
||||
$(LINK.cc) $(OUTOPT)$@ $^ $(LIBS)
|
||||
|
||||
|
||||
%.$(SECTION): $(srcdir)/%.$(SECTION).in
|
||||
cd $(top_builddir) \
|
||||
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
|
||||
|
||||
|
||||
ifeq (,$(MAKECMDGOALS))
|
||||
-include $(DEPS)
|
||||
else
|
||||
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
|
||||
-include $(DEPS)
|
||||
endif
|
||||
endif
|
||||
|
111
icu4c/source/tools/genctd/genctd.1.in
Normal file
111
icu4c/source/tools/genctd/genctd.1.in
Normal file
|
@ -0,0 +1,111 @@
|
|||
.\" Hey, Emacs! This is -*-nroff-*- you know...
|
||||
.\"
|
||||
.\" genctd.1: manual page for the genctd utility
|
||||
.\"
|
||||
.\" Copyright (C) 2006 IBM, Inc. and others.
|
||||
.\"
|
||||
.TH GENCTD 1 "8 March 2006" "ICU MANPAGE" "ICU @VERSION@ Manual"
|
||||
.SH NAME
|
||||
.B genctd
|
||||
\- Compiles word list into ICU compact trie dictionary
|
||||
.SH SYNOPSIS
|
||||
.B genctd
|
||||
[
|
||||
.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
|
||||
]
|
||||
[
|
||||
.BR "\-V\fP, \fB\-\-version"
|
||||
]
|
||||
[
|
||||
.BR "\-c\fP, \fB\-\-copyright"
|
||||
]
|
||||
[
|
||||
.BR "\-v\fP, \fB\-\-verbose"
|
||||
]
|
||||
[
|
||||
.BI "\-d\fP, \fB\-\-destdir" " destination"
|
||||
]
|
||||
[
|
||||
.BI "\-i\fP, \fB\-\-icudatadir" " directory"
|
||||
]
|
||||
.BI "\-o\fP, \fB\-\-out" " output\-file"
|
||||
.IR " dictionary\-file"
|
||||
.SH DESCRIPTION
|
||||
.B genctd
|
||||
reads the word list from
|
||||
.I dictionary-file
|
||||
and creates a compact trie dictionary file. Normally this data file has the
|
||||
.B .ctd
|
||||
extension.
|
||||
.PP
|
||||
Words begin at the beginning of a line and are terminated by the first whitespace.
|
||||
Lines that begin with whitespace are ignored.
|
||||
.SH OPTIONS
|
||||
.TP
|
||||
.BR "\-h\fP, \fB\-?\fP, \fB\-\-help"
|
||||
Print help about usage and exit.
|
||||
.TP
|
||||
.BR "\-V\fP, \fB\-\-version"
|
||||
Print the version of
|
||||
.B genctd
|
||||
and exit.
|
||||
.TP
|
||||
.BR "\-c\fP, \fB\-\-copyright"
|
||||
Embeds the standard ICU copyright into the
|
||||
.IR output-file .
|
||||
.TP
|
||||
.BR "\-v\fP, \fB\-\-verbose"
|
||||
Display extra informative messages during execution.
|
||||
.TP
|
||||
.BI "\-d\fP, \fB\-\-destdir" " destination"
|
||||
Set the destination directory of the
|
||||
.IR output-file
|
||||
to
|
||||
.IR destination .
|
||||
.TP
|
||||
.BI "\-i\fP, \fB\-\-icudatadir" " directory"
|
||||
Look for any necessary ICU data files in
|
||||
.IR directory .
|
||||
For example, the file
|
||||
.B pnames.icu
|
||||
must be located when ICU's data is not built as a shared library.
|
||||
The default ICU data directory is specified by the environment variable
|
||||
.BR ICU_DATA .
|
||||
Most configurations of ICU do not require this argument.
|
||||
.TP
|
||||
.BI " dictionary\-file"
|
||||
The source file to read.
|
||||
.TP
|
||||
.BI "\-o\fP, \fB\-\-out" " output\-file"
|
||||
The output data file to write.
|
||||
.SH CAVEATS
|
||||
When the
|
||||
.IR dictionary-file
|
||||
contains a byte order mark (BOM) at the beginning of the file, which is the Unicode character
|
||||
.B U+FEFF,
|
||||
then the
|
||||
.IR dictionary-file
|
||||
is interpreted as Unicode. Without the BOM,
|
||||
the file is interpreted in the current operating system default codepage.
|
||||
In order to eliminate any ambiguity of the encoding for how the
|
||||
.IR dictionary-file
|
||||
was written, it is recommended that you write this file in UTF-8
|
||||
with the BOM.
|
||||
.SH ENVIRONMENT
|
||||
.TP 10
|
||||
.B ICU_DATA
|
||||
Specifies the directory containing ICU data. Defaults to
|
||||
.BR @thepkgicudatadir@/@PACKAGE@/@VERSION@/ .
|
||||
Some tools in ICU depend on the presence of the trailing slash. It is thus
|
||||
important to make sure that it is present if
|
||||
.B ICU_DATA
|
||||
is set.
|
||||
.SH AUTHORS
|
||||
Deborah Goldsmith
|
||||
.SH VERSION
|
||||
1.0
|
||||
.SH COPYRIGHT
|
||||
Copyright (C) 2006 IBM, Inc. and others.
|
||||
.SH SEE ALSO
|
||||
.BR http://icu.sourceforge.net/userguide/boundaryAnalysis.html
|
||||
|
393
icu4c/source/tools/genctd/genctd.cpp
Normal file
393
icu4c/source/tools/genctd/genctd.cpp
Normal file
|
@ -0,0 +1,393 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2002-2006, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*
|
||||
* File genctd.cpp
|
||||
*/
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
//
|
||||
// Tool for generating CompactTrieDictionary data files (.ctd files).
|
||||
//
|
||||
// Usage: genctd [options] -o output-file.ctd input-file
|
||||
//
|
||||
// options: -v verbose
|
||||
// -? or -h help
|
||||
//
|
||||
// The input file is a plain text file containing words, one per line.
|
||||
// Words end at the first whitespace; lines beginning with whitespace
|
||||
// are ignored.
|
||||
// The file can be encoded as utf-8, or utf-16 (either endian), or
|
||||
// in the default code page (platform dependent). UTF-encoded
|
||||
// files must include a BOM.
|
||||
//
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ucnv.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uclean.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/putil.h"
|
||||
|
||||
#include "uoptions.h"
|
||||
#include "unewdata.h"
|
||||
#include "ucmndata.h"
|
||||
#include "rbbidata.h"
|
||||
#include "triedict.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
// Name this program was invoked as; main() sets it from argv[0] and
// usageAndDie() prints it in the usage line.
static char *progName;
|
||||
// Command-line option table handed to u_parseArgs() in main().
// main() addresses entries by the numeric index noted on each line, so the
// order of the entries must stay in step with those indices.
// NOTE(review): the man page lists -V/--version, but this table has no
// entry for it.
static UOption options[]={
|
||||
UOPTION_HELP_H, /* 0 */
|
||||
UOPTION_HELP_QUESTION_MARK, /* 1 */
|
||||
UOPTION_VERBOSE, /* 2 */
|
||||
{ "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 3 */
|
||||
UOPTION_ICUDATADIR, /* 4 */
|
||||
UOPTION_DESTDIR, /* 5 */
|
||||
UOPTION_COPYRIGHT, /* 6 */
|
||||
};
|
||||
|
||||
//
//  usageAndDie   Print the command-line summary and terminate the process.
//
//      retCode   exit status for the process. 0 means help was asked for,
//                so the text goes to stdout; any other value is an error
//                and the text goes to stderr.
//
//  Only options that the options[] table defines are listed; the table has
//  no version option, so none is advertised here.
//
void usageAndDie(int retCode) {
    FILE *out = (retCode == 0) ? stdout : stderr;

    fprintf(out, "Usage: %s [-v] [-options] -o output-file dictionary-file\n", progName);
    fprintf(out,
        "\tRead in word list and write out compact trie dictionary\n"
        "options:\n"
        "\t-h or -? or --help this usage text\n"
        "\t-c or --copyright include a copyright notice\n"
        "\t-v or --verbose turn on verbose output\n"
        "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
        "\t followed by path, defaults to %s\n"
        "\t-d or --destdir destination directory, followed by the path\n",
        u_getDataDirectory());
    exit (retCode);
}
|
||||
|
||||
|
||||
// Header information for the output data file. Exactly one of the two
// variants below is compiled: dummyDataInfo when break iteration is
// configured out (main() then writes only a dummy file), otherwise dh,
// whose info member main() passes to udata_create().
#if UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
/* dummy UDataInfo cf. udata.h */
|
||||
static UDataInfo dummyDataInfo = {
|
||||
sizeof(UDataInfo),
|
||||
0,
|
||||
|
||||
U_IS_BIG_ENDIAN,
|
||||
U_CHARSET_FAMILY,
|
||||
U_SIZEOF_UCHAR,
|
||||
0,
|
||||
|
||||
{ 0, 0, 0, 0 }, /* dummy dataFormat */
|
||||
{ 0, 0, 0, 0 }, /* dummy formatVersion */
|
||||
{ 0, 0, 0, 0 } /* dummy dataVersion */
|
||||
};
|
||||
|
||||
#else
|
||||
|
||||
//
|
||||
// Set up the ICU data header, defined in ucmndata.h
|
||||
//
|
||||
DataHeader dh ={
|
||||
{sizeof(DataHeader), // Struct MappedData
|
||||
// NOTE(review): 0xda, 0x27 are presumably the fixed ICU data magic bytes;
// confirm against ucmndata.h.
0xda,
|
||||
0x27},
|
||||
|
||||
{ // struct UDataInfo
|
||||
sizeof(UDataInfo), // size
|
||||
0, // reserved
|
||||
U_IS_BIG_ENDIAN,
|
||||
U_CHARSET_FAMILY,
|
||||
U_SIZEOF_UCHAR,
|
||||
0, // reserved
|
||||
|
||||
// The three arrays below follow the same order as in dummyDataInfo:
// dataFormat, formatVersion, dataVersion.
{ 0x54, 0x72, 0x44, 0x63 }, // "TrDc" Trie Dictionary
|
||||
{ 1, 0, 0, 0 }, // 1.0.0.0
|
||||
{ 0, 0, 0, 0 }, // Irrelevant for this data type
|
||||
}};
|
||||
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//
|
||||
// main for genctd
|
||||
//
|
||||
//----------------------------------------------------------------------------
|
||||
int main(int argc, char **argv) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
const char *wordFileName;
|
||||
const char *outFileName;
|
||||
const char *outDir = NULL;
|
||||
const char *copyright = NULL;
|
||||
|
||||
//
|
||||
// Pick up and check the command line arguments,
|
||||
// using the standard ICU tool utils option handling.
|
||||
//
|
||||
U_MAIN_INIT_ARGS(argc, argv);
|
||||
progName = argv[0];
|
||||
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
|
||||
if(argc<0) {
|
||||
// Unrecognized option
|
||||
fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
|
||||
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
|
||||
}
|
||||
|
||||
if(options[0].doesOccur || options[1].doesOccur) {
|
||||
// -? or -h for help.
|
||||
usageAndDie(0);
|
||||
}
|
||||
|
||||
if (!options[3].doesOccur || argc < 2) {
|
||||
fprintf(stderr, "input and output file must both be specified.\n");
|
||||
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
|
||||
}
|
||||
outFileName = options[3].value;
|
||||
wordFileName = argv[1];
|
||||
|
||||
if (options[4].doesOccur) {
|
||||
u_setDataDirectory(options[4].value);
|
||||
}
|
||||
|
||||
/* Initialize ICU */
|
||||
u_init(&status);
|
||||
if (U_FAILURE(status)) {
|
||||
fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
|
||||
argv[0], u_errorName(status));
|
||||
exit(1);
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
|
||||
/* Combine the directory with the file name */
|
||||
if(options[5].doesOccur) {
|
||||
outDir = options[5].value;
|
||||
}
|
||||
if (options[6].doesOccur) {
|
||||
copyright = U_COPYRIGHT_STRING;
|
||||
}
|
||||
|
||||
#if UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
UNewDataMemory *pData;
|
||||
char msg[1024];
|
||||
|
||||
/* write message with just the name */
|
||||
sprintf(msg, "genctd writes dummy %s because of UCONFIG_NO_BREAK_ITERATION, see uconfig.h", outFileName);
|
||||
fprintf(stderr, "%s\n", msg);
|
||||
|
||||
/* write the dummy data file */
|
||||
pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
|
||||
udata_writeBlock(pData, msg, strlen(msg));
|
||||
udata_finish(pData, &status);
|
||||
return (int)status;
|
||||
|
||||
#else
|
||||
|
||||
//
|
||||
// Read in the dictionary source file
|
||||
//
|
||||
long result;
|
||||
long wordFileSize;
|
||||
FILE *file;
|
||||
char *wordBufferC;
|
||||
|
||||
file = fopen(wordFileName, "rb");
|
||||
if( file == 0 ) {
|
||||
fprintf(stderr, "Could not open file \"%s\"\n", wordFileName);
|
||||
exit(-1);
|
||||
}
|
||||
fseek(file, 0, SEEK_END);
|
||||
wordFileSize = ftell(file);
|
||||
fseek(file, 0, SEEK_SET);
|
||||
wordBufferC = new char[wordFileSize+10];
|
||||
|
||||
result = (long)fread(wordBufferC, 1, wordFileSize, file);
|
||||
if (result != wordFileSize) {
|
||||
fprintf(stderr, "Error reading file \"%s\"\n", wordFileName);
|
||||
exit (-1);
|
||||
}
|
||||
wordBufferC[wordFileSize]=0;
|
||||
fclose(file);
|
||||
|
||||
//
|
||||
// Look for a Unicode Signature (BOM) on the word file
|
||||
//
|
||||
int32_t signatureLength;
|
||||
const char * wordSourceC = wordBufferC;
|
||||
const char* encoding = ucnv_detectUnicodeSignature(
|
||||
wordSourceC, wordFileSize, &signatureLength, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
exit(status);
|
||||
}
|
||||
if(encoding!=NULL ){
|
||||
wordSourceC += signatureLength;
|
||||
wordFileSize -= signatureLength;
|
||||
}
|
||||
|
||||
//
|
||||
// Open a converter to take the rule file to UTF-16
|
||||
//
|
||||
UConverter* conv;
|
||||
conv = ucnv_open(encoding, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
|
||||
exit(status);
|
||||
}
|
||||
|
||||
//
|
||||
// Convert the words to UChar.
|
||||
// Preflight first to determine required buffer size.
|
||||
//
|
||||
uint32_t destCap = ucnv_toUChars(conv,
|
||||
NULL, // dest,
|
||||
0, // destCapacity,
|
||||
wordSourceC,
|
||||
wordFileSize,
|
||||
&status);
|
||||
if (status != U_BUFFER_OVERFLOW_ERROR) {
|
||||
fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
|
||||
exit(status);
|
||||
};
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
UChar *wordSourceU = new UChar[destCap+1];
|
||||
ucnv_toUChars(conv,
|
||||
wordSourceU, // dest,
|
||||
destCap+1,
|
||||
wordSourceC,
|
||||
wordFileSize,
|
||||
&status);
|
||||
if (U_FAILURE(status)) {
|
||||
fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
|
||||
exit(status);
|
||||
};
|
||||
ucnv_close(conv);
|
||||
|
||||
// Get rid of the original file buffer
|
||||
delete[] wordBufferC;
|
||||
|
||||
// Create a MutableTrieDictionary, and loop through all the lines, inserting
|
||||
// words.
|
||||
|
||||
// First, pick a median character.
|
||||
UChar *current = wordSourceU + (destCap/2);
|
||||
UChar uc = *current++;
|
||||
UnicodeSet breaks;
|
||||
breaks.add(0x000A); // Line Feed
|
||||
breaks.add(0x000D); // Carriage Return
|
||||
breaks.add(0x2028); // Line Separator
|
||||
breaks.add(0x2029); // Paragraph Separator
|
||||
|
||||
do {
|
||||
// Look for line break
|
||||
while (uc && !breaks.contains(uc)) {
|
||||
uc = *current++;
|
||||
}
|
||||
// Now skip to first non-line-break
|
||||
while (uc && breaks.contains(uc)) {
|
||||
uc = *current++;
|
||||
}
|
||||
}
|
||||
while (uc && (breaks.contains(uc) || u_isspace(uc)));
|
||||
|
||||
MutableTrieDictionary *mtd = new MutableTrieDictionary(uc, status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
fprintf(stderr, "new MutableTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
|
||||
exit(status);
|
||||
}
|
||||
|
||||
// Now add the words. Words are non-space characters at the beginning of
|
||||
// lines, and must be at least one UChar.
|
||||
current = wordSourceU;
|
||||
UChar *candidate = current;
|
||||
uc = *current++;
|
||||
int32_t length = 0;
|
||||
|
||||
while (uc) {
|
||||
while (uc && !u_isspace(uc)) {
|
||||
++length;
|
||||
uc = *current++;
|
||||
}
|
||||
if (length > 0) {
|
||||
mtd->addWord(candidate, length, status);
|
||||
if (U_FAILURE(status)) {
|
||||
fprintf(stderr, "MutableTrieDictionary::addWord: ICU Error \"%s\"\n",
|
||||
u_errorName(status));
|
||||
exit(status);
|
||||
}
|
||||
}
|
||||
// Find beginning of next line
|
||||
while (uc && !breaks.contains(uc)) {
|
||||
uc = *current++;
|
||||
}
|
||||
while (uc && breaks.contains(uc)) {
|
||||
uc = *current++;
|
||||
}
|
||||
candidate = current-1;
|
||||
length = 0;
|
||||
}
|
||||
|
||||
// Get rid of the Unicode text buffer
|
||||
delete[] wordSourceU;
|
||||
|
||||
// Now, create a CompactTrieDictionary from the mutable dictionary
|
||||
CompactTrieDictionary *ctd = new CompactTrieDictionary(*mtd, status);
|
||||
if (U_FAILURE(status)) {
|
||||
fprintf(stderr, "new CompactTrieDictionary: ICU Error \"%s\"\n", u_errorName(status));
|
||||
exit(status);
|
||||
}
|
||||
|
||||
// Get rid of the MutableTrieDictionary
|
||||
delete mtd;
|
||||
|
||||
//
|
||||
// Get the binary data from the dictionary.
|
||||
//
|
||||
uint32_t outDataSize = ctd->dataSize();
|
||||
const uint8_t *outData = (const uint8_t *)ctd->data();
|
||||
|
||||
//
|
||||
// Create the output file
|
||||
//
|
||||
size_t bytesWritten;
|
||||
UNewDataMemory *pData;
|
||||
pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
|
||||
if(U_FAILURE(status)) {
|
||||
fprintf(stderr, "genctd: Could not open output file \"%s\", \"%s\"\n",
|
||||
outFileName, u_errorName(status));
|
||||
exit(status);
|
||||
}
|
||||
|
||||
|
||||
// Write the data itself.
|
||||
udata_writeBlock(pData, outData, outDataSize);
|
||||
// finish up
|
||||
bytesWritten = udata_finish(pData, &status);
|
||||
if(U_FAILURE(status)) {
|
||||
fprintf(stderr, "genctd: error \"%s\" writing the output file\n", u_errorName(status));
|
||||
exit(status);
|
||||
}
|
||||
|
||||
if (bytesWritten != outDataSize) {
|
||||
fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
// Get rid of the CompactTrieDictionary
|
||||
delete ctd;
|
||||
|
||||
u_cleanup();
|
||||
|
||||
printf("genctd: tool completed successfully.\n");
|
||||
return 0;
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
}
|
||||
|
Loading…
Add table
Reference in a new issue