mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-45 new builder for RBBI rules, initial checkin
X-SVN-Rev: 8939
This commit is contained in:
parent
f6d8f01f27
commit
32c09250b7
57 changed files with 8436 additions and 989 deletions
|
@ -189,6 +189,24 @@ Package=<4>
|
|||
|
||||
###############################################################################
|
||||
|
||||
Project: "genbrk"=..\tools\genbrk\genbrk.dsp - Package Owner=<4>
|
||||
|
||||
Package=<5>
|
||||
{{{
|
||||
}}}
|
||||
|
||||
Package=<4>
|
||||
{{{
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name common
|
||||
End Project Dependency
|
||||
Begin Project Dependency
|
||||
Project_Dep_Name toolutil
|
||||
End Project Dependency
|
||||
}}}
|
||||
|
||||
###############################################################################
|
||||
|
||||
Project: "derb"=..\TOOLS\GENRB\derb.dsp - Package Owner=<4>
|
||||
|
||||
Package=<5>
|
||||
|
|
|
@ -62,7 +62,8 @@ unistr.o utf_impl.o ustring.o ustrcase.o cstring.o ustrfmt.o ustrtrns.o \
|
|||
normlzr.o unorm.o chariter.o schriter.o uchriter.o uiter.o \
|
||||
uchar.o uprops.o bidi.o ubidi.o ubidiwrt.o ubidiln.o ushape.o unames.o \
|
||||
ucln_cmn.o uscript.o umemstrm.o ucmp8.o uvector.o digitlst.o \
|
||||
brkiter.o brkdict.o ubrk.o dbbi.o dbbi_tbl.o rbbi.o rbbi_tbl.o \
|
||||
brkiter.o brkdict.o ubrk.o dbbi.o dbbi_tbl.o \
|
||||
rbbi.o rbbidata.o rbbinode.o rbbirb.o rbbiscan.o rbbisetb.o rbbistbl.o rbbitblb.o \
|
||||
unicode.o scsu.o convert.o utrie.o uset.o \
|
||||
unifilt.o unifunct.o uniset.o upropset.o usetiter.o util.o
|
||||
|
||||
|
|
|
@ -63,7 +63,7 @@ BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
|
|||
result = new DictionaryBasedBreakIterator(file, filename, status);
|
||||
}
|
||||
else {
|
||||
result = new RuleBasedBreakIterator(file);
|
||||
result = new RuleBasedBreakIterator(file, status);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -97,7 +97,7 @@ BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)
|
|||
result = new DictionaryBasedBreakIterator(file, filename, status);
|
||||
}
|
||||
else {
|
||||
result = new RuleBasedBreakIterator(file);
|
||||
result = new RuleBasedBreakIterator(file, status);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -121,7 +121,7 @@ BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status)
|
|||
UDataMemory* file = udata_open(NULL, "brk", filename, &status);
|
||||
|
||||
if (!U_FAILURE(status)) {
|
||||
result = new RuleBasedBreakIterator(file);
|
||||
result = new RuleBasedBreakIterator(file, status);
|
||||
}
|
||||
|
||||
return result;
|
||||
|
@ -144,7 +144,7 @@ BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)
|
|||
UDataMemory* file = udata_open(NULL, "brk", filename, &status);
|
||||
|
||||
if (!U_FAILURE(status)) {
|
||||
result = new RuleBasedBreakIterator(file);
|
||||
result = new RuleBasedBreakIterator(file, status);
|
||||
}
|
||||
|
||||
return result;
|
||||
|
@ -167,7 +167,7 @@ BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
|
|||
UDataMemory* file = udata_open(NULL, "brk", filename, &status);
|
||||
|
||||
if (!U_FAILURE(status)) {
|
||||
result = new RuleBasedBreakIterator(file);
|
||||
result = new RuleBasedBreakIterator(file, status);
|
||||
}
|
||||
|
||||
return result;
|
||||
|
|
|
@ -220,7 +220,31 @@ SOURCE=.\rbbi.cpp
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbi_tbl.cpp
|
||||
SOURCE=.\rbbidata.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbinode.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbirb.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbiscan.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbisetb.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbistbl.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbitblb.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
|
@ -817,24 +841,39 @@ InputPath=.\unicode\normlzr.h
|
|||
|
||||
!ELSEIF "$(CFG)" == "common - Win64 Release"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\unicode\normlzr.h
|
||||
|
||||
"..\..\include\unicode\normlzr.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy $(InputPath) ..\..\include\unicode
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ELSEIF "$(CFG)" == "common - Win64 Debug"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\unicode\normlzr.h
|
||||
!ENDIF
|
||||
|
||||
"..\..\include\unicode\normlzr.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unicode\parseerr.h
|
||||
|
||||
!IF "$(CFG)" == "common - Win32 Release"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\unicode\parseerr.h
|
||||
|
||||
"..\..\include\unicode\parseerr.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy $(InputPath) ..\..\include\unicode
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ELSEIF "$(CFG)" == "common - Win32 Debug"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\unicode\parseerr.h
|
||||
|
||||
"..\..\include\unicode\parseerr.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy $(InputPath) ..\..\include\unicode
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ELSEIF "$(CFG)" == "common - Win64 Release"
|
||||
|
||||
!ELSEIF "$(CFG)" == "common - Win64 Debug"
|
||||
|
||||
!ENDIF
|
||||
|
||||
# End Source File
|
||||
|
@ -894,6 +933,37 @@ SOURCE=.\unicode\putil.h
|
|||
# Begin Custom Build
|
||||
InputPath=.\unicode\putil.h
|
||||
|
||||
"..\..\include\unicode\normlzr.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy $(InputPath) ..\..\include\unicode
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ELSEIF "$(CFG)" == "common - Win32 Debug"
|
||||
|
||||
!ELSEIF "$(CFG)" == "common - Win64 Release"
|
||||
|
||||
!ELSEIF "$(CFG)" == "common - Win64 Debug"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\unicode\putil.h
|
||||
|
||||
"..\..\include\unicode\normlzr.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy $(InputPath) ..\..\include\unicode
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ENDIF
|
||||
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unicode\putil.h
|
||||
|
||||
!IF "$(CFG)" == "common - Win32 Release"
|
||||
|
||||
# Begin Custom Build
|
||||
InputPath=.\unicode\putil.h
|
||||
|
||||
"..\..\include\unicode\putil.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy $(InputPath) ..\..\include\unicode
|
||||
|
||||
|
@ -1028,7 +1098,31 @@ InputPath=.\unicode\rbbi.h
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbi_tbl.h
|
||||
SOURCE=.\rbbidata.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbinode.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbirb.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbirpt.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbiscan.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbisetb.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbitblb.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
|
|
|
@ -19,54 +19,86 @@ U_NAMESPACE_BEGIN
|
|||
|
||||
const char DictionaryBasedBreakIterator::fgClassID = 0;
|
||||
|
||||
//=======================================================================
|
||||
// constructors
|
||||
//=======================================================================
|
||||
|
||||
DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(UDataMemory* tablesImage,
|
||||
const char* dictionaryFilename,
|
||||
UErrorCode& status)
|
||||
: RuleBasedBreakIterator((UDataMemory*)NULL),
|
||||
dictionaryCharCount(0),
|
||||
cachedBreakPositions(NULL),
|
||||
numCachedBreakPositions(0),
|
||||
positionInCache(0)
|
||||
{
|
||||
tables = new DictionaryBasedBreakIteratorTables(tablesImage, dictionaryFilename, status);
|
||||
if (U_FAILURE(status)) {
|
||||
delete tables;
|
||||
return;
|
||||
}
|
||||
tables->addReference();
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// constructors
|
||||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
|
||||
DictionaryBasedBreakIterator::DictionaryBasedBreakIterator() :
|
||||
RuleBasedBreakIterator() {
|
||||
init();
|
||||
}
|
||||
|
||||
//=======================================================================
|
||||
// boilerplate
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(UDataMemory* rbbiData,
|
||||
const char* dictionaryFilename,
|
||||
UErrorCode& status)
|
||||
: RuleBasedBreakIterator(rbbiData, status)
|
||||
{
|
||||
init();
|
||||
fTables = new DictionaryBasedBreakIteratorTables(dictionaryFilename, status);
|
||||
if (U_FAILURE(status)) {
|
||||
fTables->removeReference();
|
||||
fTables = NULL;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(const DictionaryBasedBreakIterator &other) :
|
||||
RuleBasedBreakIterator(other)
|
||||
{
|
||||
init();
|
||||
if (other.fTables != NULL) {
|
||||
fTables = other.fTables;
|
||||
fTables->addReference();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// Destructor
|
||||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
DictionaryBasedBreakIterator::~DictionaryBasedBreakIterator()
|
||||
{
|
||||
uprv_free(cachedBreakPositions);
|
||||
cachedBreakPositions = NULL;
|
||||
if (fTables != NULL) {fTables->removeReference();};
|
||||
}
|
||||
|
||||
/**
|
||||
* Assignment operator. Sets this iterator to have the same behavior,
|
||||
* and iterate over the same text, as the one passed in.
|
||||
*/
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// Assignment operator. Sets this iterator to have the same behavior,
|
||||
// and iterate over the same text, as the one passed in.
|
||||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
DictionaryBasedBreakIterator&
|
||||
DictionaryBasedBreakIterator::operator=(const DictionaryBasedBreakIterator& that) {
|
||||
reset();
|
||||
if (this == &that) {
|
||||
return *this;
|
||||
}
|
||||
reset(); // clears out cached break positions.
|
||||
RuleBasedBreakIterator::operator=(that);
|
||||
if (this->fTables != that.fTables) {
|
||||
if (this->fTables != NULL) {this->fTables->removeReference();};
|
||||
this->fTables = that.fTables;
|
||||
if (this->fTables != NULL) {this->fTables->addReference();};
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a newly-constructed RuleBasedBreakIterator with the same
|
||||
* behavior, and iterating over the same text, as this one.
|
||||
*/
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// Clone() Returns a newly-constructed RuleBasedBreakIterator with the same
|
||||
// behavior, and iterating over the same text, as this one.
|
||||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
BreakIterator*
|
||||
DictionaryBasedBreakIterator::clone() const {
|
||||
return new DictionaryBasedBreakIterator(*this);
|
||||
|
@ -88,7 +120,7 @@ DictionaryBasedBreakIterator::previous()
|
|||
// covered by them, just move one step backward in the cache
|
||||
if (cachedBreakPositions != NULL && positionInCache > 0) {
|
||||
--positionInCache;
|
||||
text->setIndex(cachedBreakPositions[positionInCache]);
|
||||
fText->setIndex(cachedBreakPositions[positionInCache]);
|
||||
return cachedBreakPositions[positionInCache];
|
||||
}
|
||||
|
||||
|
@ -117,11 +149,11 @@ DictionaryBasedBreakIterator::preceding(int32_t offset)
|
|||
// if the offset passed in is already past the end of the text,
|
||||
// just return DONE; if it's before the beginning, return the
|
||||
// text's starting offset
|
||||
if (text == NULL || offset > text->endIndex()) {
|
||||
if (fText == NULL || offset > fText->endIndex()) {
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
else if (offset < text->startIndex()) {
|
||||
return text->startIndex();
|
||||
else if (offset < fText->startIndex()) {
|
||||
return fText->startIndex();
|
||||
}
|
||||
|
||||
// if we have no cached break positions, or "offset" is outside the
|
||||
|
@ -143,8 +175,8 @@ DictionaryBasedBreakIterator::preceding(int32_t offset)
|
|||
&& offset > cachedBreakPositions[positionInCache])
|
||||
++positionInCache;
|
||||
--positionInCache;
|
||||
text->setIndex(cachedBreakPositions[positionInCache]);
|
||||
return text->getIndex();
|
||||
fText->setIndex(cachedBreakPositions[positionInCache]);
|
||||
return fText->getIndex();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -160,11 +192,11 @@ DictionaryBasedBreakIterator::following(int32_t offset)
|
|||
// if the offset passed in is already past the end of the text,
|
||||
// just return DONE; if it's before the beginning, return the
|
||||
// text's starting offset
|
||||
if (text == NULL || offset > text->endIndex()) {
|
||||
if (fText == NULL || offset > fText->endIndex()) {
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
else if (offset < text->startIndex()) {
|
||||
return text->startIndex();
|
||||
else if (offset < fText->startIndex()) {
|
||||
return fText->startIndex();
|
||||
}
|
||||
|
||||
// if we have no cached break positions, or if "offset" is outside the
|
||||
|
@ -185,8 +217,8 @@ DictionaryBasedBreakIterator::following(int32_t offset)
|
|||
while (positionInCache < numCachedBreakPositions
|
||||
&& offset >= cachedBreakPositions[positionInCache])
|
||||
++positionInCache;
|
||||
text->setIndex(cachedBreakPositions[positionInCache]);
|
||||
return text->getIndex();
|
||||
fText->setIndex(cachedBreakPositions[positionInCache]);
|
||||
return fText->getIndex();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -205,14 +237,14 @@ DictionaryBasedBreakIterator::handleNext()
|
|||
// start by using the inherited handleNext() to find a tentative return
|
||||
// value. dictionaryCharCount tells us how many dictionary characters
|
||||
// we passed over on our way to the tentative return value
|
||||
int32_t startPos = text->getIndex();
|
||||
dictionaryCharCount = 0;
|
||||
int32_t startPos = fText->getIndex();
|
||||
fDictionaryCharCount = 0;
|
||||
int32_t result = RuleBasedBreakIterator::handleNext();
|
||||
|
||||
// if we passed over more than one dictionary character, then we use
|
||||
// divideUpDictionaryRange() to regenerate the cached break positions
|
||||
// for the new range
|
||||
if (dictionaryCharCount > 1 && result - startPos > 1) {
|
||||
if (fDictionaryCharCount > 1 && result - startPos > 1) {
|
||||
divideUpDictionaryRange(startPos, result, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return -9999; // SHOULD NEVER GET HERE!
|
||||
|
@ -232,7 +264,7 @@ DictionaryBasedBreakIterator::handleNext()
|
|||
// and return it
|
||||
if (cachedBreakPositions != NULL) {
|
||||
++positionInCache;
|
||||
text->setIndex(cachedBreakPositions[positionInCache]);
|
||||
fText->setIndex(cachedBreakPositions[positionInCache]);
|
||||
return cachedBreakPositions[positionInCache];
|
||||
}
|
||||
return -9999; // SHOULD NEVER GET HERE!
|
||||
|
@ -244,108 +276,95 @@ DictionaryBasedBreakIterator::reset()
|
|||
uprv_free(cachedBreakPositions);
|
||||
cachedBreakPositions = NULL;
|
||||
numCachedBreakPositions = 0;
|
||||
dictionaryCharCount = 0;
|
||||
fDictionaryCharCount = 0;
|
||||
positionInCache = 0;
|
||||
}
|
||||
|
||||
|
||||
// internal type for BufferClone
|
||||
struct bufferCloneStructUChar
|
||||
{
|
||||
uint8_t bi [sizeof(DictionaryBasedBreakIterator)] ;
|
||||
uint8_t text [sizeof(UCharCharacterIterator)] ;
|
||||
};
|
||||
|
||||
struct bufferCloneStructString
|
||||
{
|
||||
uint8_t bi [sizeof(DictionaryBasedBreakIterator)] ;
|
||||
uint8_t text [sizeof(StringCharacterIterator)] ;
|
||||
};
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// init() Common initialization routine, for use by constructors, etc.
|
||||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
void DictionaryBasedBreakIterator::init() {
|
||||
cachedBreakPositions = NULL;
|
||||
fTables = NULL;
|
||||
numCachedBreakPositions = 0;
|
||||
fDictionaryCharCount = 0;
|
||||
positionInCache = 0;
|
||||
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// BufferClone
|
||||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
BreakIterator * DictionaryBasedBreakIterator::createBufferClone(void *stackBuffer,
|
||||
int32_t &BufferSize,
|
||||
int32_t &bufferSize,
|
||||
UErrorCode &status)
|
||||
{
|
||||
DictionaryBasedBreakIterator * localIterator;
|
||||
int32_t bufferSizeNeeded = 0;
|
||||
UBool IterIsUChar = FALSE;
|
||||
UBool IterIsString = FALSE;
|
||||
char *stackBufferChars = (char *)stackBuffer;
|
||||
|
||||
if (U_FAILURE(status)){
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Pointers on 64-bit platforms need to be aligned
|
||||
* on a 64-bit boundry in memory.
|
||||
*/
|
||||
//
|
||||
// If user buffer size is zero this is a preflight operation to
|
||||
// obtain the needed buffer size, allowing for worst case misalignment.
|
||||
//
|
||||
if (bufferSize == 0) {
|
||||
bufferSize = sizeof(DictionaryBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//
|
||||
// Check the alignment and size of the user supplied buffer.
|
||||
// Allocate heap memory if the user supplied memory is insufficient.
|
||||
//
|
||||
char *buf = (char *)stackBuffer;
|
||||
int32_t s = bufferSize;
|
||||
|
||||
if (stackBuffer == NULL) {
|
||||
s = 0; // Ignore size, force allocation if user didn't give us a buffer.
|
||||
}
|
||||
if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
|
||||
int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
|
||||
BufferSize -= offsetUp;
|
||||
stackBufferChars += offsetUp;
|
||||
int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(buf);
|
||||
s -= offsetUp;
|
||||
buf += offsetUp;
|
||||
}
|
||||
if (s < sizeof(DictionaryBasedBreakIterator)) {
|
||||
buf = (char *) new DictionaryBasedBreakIterator();
|
||||
if (buf == 0) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
status = U_SAFECLONE_ALLOCATED_WARNING;
|
||||
}
|
||||
stackBuffer = (void *)stackBufferChars;
|
||||
|
||||
if (text == NULL)
|
||||
{
|
||||
bufferSizeNeeded = (int32_t) sizeof(DictionaryBasedBreakIterator);
|
||||
//
|
||||
// Initialize the clone object.
|
||||
// TODO: using an overloaded C++ "operator new" to directly initialize the
|
||||
// copy in the user's buffer would be better, but it doesn't seem
|
||||
// to get along with namespaces. Investigate why.
|
||||
//
|
||||
// The memcpy is only safe with an empty (default constructed)
|
||||
// break iterator. Use on others can screw up reference counts
|
||||
// to data. memcpy-ing objects is not really a good idea...
|
||||
//
|
||||
DictionaryBasedBreakIterator localIter; // Empty break iterator, source for memcpy
|
||||
DictionaryBasedBreakIterator *clone = (DictionaryBasedBreakIterator *)buf;
|
||||
uprv_memcpy(clone, &localIter, sizeof(DictionaryBasedBreakIterator)); // clone = empty, but initialized, iterator.
|
||||
*clone = *this; // clone = the real one we want.
|
||||
if (status != U_SAFECLONE_ALLOCATED_WARNING) {
|
||||
clone->fBufferClone = TRUE;
|
||||
}
|
||||
else if (text->getDynamicClassID() == StringCharacterIterator::getStaticClassID())
|
||||
{
|
||||
bufferSizeNeeded = (int32_t) sizeof(struct bufferCloneStructString);
|
||||
IterIsString = TRUE;
|
||||
}
|
||||
else if (text->getDynamicClassID() == UCharCharacterIterator::getStaticClassID())
|
||||
{
|
||||
bufferSizeNeeded = (int32_t) sizeof(struct bufferCloneStructUChar);
|
||||
IterIsUChar = TRUE;
|
||||
}
|
||||
else
|
||||
{
|
||||
// code has changed - time to make a real CharacterIterator::CreateBufferClone()
|
||||
}
|
||||
if (BufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
|
||||
BufferSize = bufferSizeNeeded;
|
||||
return 0;
|
||||
}
|
||||
if (BufferSize < bufferSizeNeeded || !stackBuffer)
|
||||
{
|
||||
/* allocate one here...*/
|
||||
localIterator = new DictionaryBasedBreakIterator(*this);
|
||||
status = U_SAFECLONE_ALLOCATED_ERROR;
|
||||
return localIterator;
|
||||
}
|
||||
if (IterIsUChar) {
|
||||
struct bufferCloneStructUChar * localClone
|
||||
= (struct bufferCloneStructUChar *)stackBuffer;
|
||||
localIterator = (DictionaryBasedBreakIterator *)&localClone->bi;
|
||||
uprv_memcpy(localIterator, this, sizeof(DictionaryBasedBreakIterator));
|
||||
uprv_memcpy(&localClone->text, text, sizeof(UCharCharacterIterator));
|
||||
localIterator->text = (CharacterIterator *) &localClone->text;
|
||||
} else if (IterIsString) {
|
||||
struct bufferCloneStructString * localClone
|
||||
= (struct bufferCloneStructString *)stackBuffer;
|
||||
localIterator = (DictionaryBasedBreakIterator *)&localClone->bi;
|
||||
uprv_memcpy(localIterator, this, sizeof(DictionaryBasedBreakIterator));
|
||||
uprv_memcpy(&localClone->text, text, sizeof(StringCharacterIterator));
|
||||
localIterator->text = (CharacterIterator *)&localClone->text;
|
||||
} else {
|
||||
DictionaryBasedBreakIterator * localClone
|
||||
= (DictionaryBasedBreakIterator *)stackBuffer;
|
||||
localIterator = localClone;
|
||||
uprv_memcpy(localIterator, this, sizeof(DictionaryBasedBreakIterator));
|
||||
}
|
||||
// must not use (or delete) the copy of the old cache if it exists - not threadsafe
|
||||
localIterator->fBufferClone = TRUE;
|
||||
localIterator->cachedBreakPositions = NULL;
|
||||
localIterator->numCachedBreakPositions = 0;
|
||||
localIterator->positionInCache = 0;
|
||||
|
||||
return localIterator;
|
||||
return clone;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* This is the function that actually implements the dictionary-based
|
||||
* algorithm. Given the endpoints of a range of text, it uses the
|
||||
|
@ -357,23 +376,17 @@ BreakIterator * DictionaryBasedBreakIterator::createBufferClone(void *stackBuff
|
|||
void
|
||||
DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t endPos, UErrorCode &status)
|
||||
{
|
||||
// to avoid casts throughout the rest of this function
|
||||
DictionaryBasedBreakIteratorTables* dictionaryTables
|
||||
= (DictionaryBasedBreakIteratorTables*)(this->tables);
|
||||
|
||||
// the range we're dividing may begin or end with non-dictionary characters
|
||||
// (i.e., for line breaking, we may have leading or trailing punctuation
|
||||
// that needs to be kept with the word). Seek from the beginning of the
|
||||
// range to the first dictionary character
|
||||
text->setIndex(startPos);
|
||||
UChar c = text->current();
|
||||
int category = dictionaryTables->lookupCategory(c, this);
|
||||
while (category == UBRK_IGNORE || !dictionaryTables->categoryFlags[category]) {
|
||||
c = text->next();
|
||||
category = dictionaryTables->lookupCategory(c, this);
|
||||
fText->setIndex(startPos);
|
||||
UChar c = fText->current();
|
||||
while (isDictionaryChar(c) == FALSE) {
|
||||
c = fText->next();
|
||||
}
|
||||
|
||||
|
||||
|
||||
// initialize. We maintain two stacks: currentBreakPositions contains
|
||||
// the list of break positions that will be returned if we successfully
|
||||
// finish traversing the whole range now. possibleBreakPositions lists
|
||||
|
@ -406,7 +419,7 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
|
|||
// dictionary. In this case, we "bless" the break positions that got us the
|
||||
// farthest as real break positions, and then start over from scratch with
|
||||
// the character where the error occurred.
|
||||
int32_t farthestEndPoint = text->getIndex();
|
||||
int32_t farthestEndPoint = fText->getIndex();
|
||||
UStack bestBreakPositions(status);
|
||||
UBool bestBreakPositionsInitialized = FALSE;
|
||||
|
||||
|
@ -414,25 +427,25 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
|
|||
return;
|
||||
}
|
||||
// initialize (we always exit the loop with a break statement)
|
||||
c = text->current();
|
||||
c = fText->current();
|
||||
for (;;) {
|
||||
|
||||
// if we can transition to state "-1" from our current state, we're
|
||||
// on the last character of a legal word. Push that position onto
|
||||
// the possible-break-positions stack
|
||||
if (dictionaryTables->dictionary.at(state, (int32_t)0) == -1) {
|
||||
possibleBreakPositions.push(text->getIndex(), status);
|
||||
if (fTables->fDictionary->at(state, (int32_t)0) == -1) {
|
||||
possibleBreakPositions.push(fText->getIndex(), status);
|
||||
}
|
||||
|
||||
// look up the new state to transition to in the dictionary
|
||||
state = dictionaryTables->dictionary.at(state, c);
|
||||
state = fTables->fDictionary->at(state, c);
|
||||
|
||||
// if the character we're sitting on causes us to transition to
|
||||
// the "end of word" state, then it was a non-dictionary character
|
||||
// and we've successfully traversed the whole range. Drop out
|
||||
// of the loop.
|
||||
if (state == -1) {
|
||||
currentBreakPositions.push(text->getIndex(), status);
|
||||
currentBreakPositions.push(fText->getIndex(), status);
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -440,12 +453,12 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
|
|||
// the error state, or if we've gone off the end of the range
|
||||
// without transitioning to the "end of word" state, we've hit
|
||||
// an error...
|
||||
else if (state == 0 || text->getIndex() >= endPos) {
|
||||
else if (state == 0 || fText->getIndex() >= endPos) {
|
||||
|
||||
// if this is the farthest we've gotten, take note of it in
|
||||
// case there's an error in the text
|
||||
if (text->getIndex() > farthestEndPoint) {
|
||||
farthestEndPoint = text->getIndex();
|
||||
if (fText->getIndex() > farthestEndPoint) {
|
||||
farthestEndPoint = fText->getIndex();
|
||||
bestBreakPositions.removeAllElements();
|
||||
bestBreakPositionsInitialized = TRUE;
|
||||
for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
|
||||
|
@ -481,7 +494,7 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
|
|||
}
|
||||
bestBreakPositions.removeAllElements();
|
||||
if (farthestEndPoint < endPos) {
|
||||
text->setIndex(farthestEndPoint + 1);
|
||||
fText->setIndex(farthestEndPoint + 1);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
|
@ -489,12 +502,12 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
|
|||
}
|
||||
else {
|
||||
if ((currentBreakPositions.isEmpty()
|
||||
|| currentBreakPositions.peeki() != text->getIndex())
|
||||
&& text->getIndex() != startPos) {
|
||||
currentBreakPositions.push(text->getIndex(), status);
|
||||
|| currentBreakPositions.peeki() != fText->getIndex())
|
||||
&& fText->getIndex() != startPos) {
|
||||
currentBreakPositions.push(fText->getIndex(), status);
|
||||
}
|
||||
text->next();
|
||||
currentBreakPositions.push(text->getIndex(), status);
|
||||
fText->next();
|
||||
currentBreakPositions.push(fText->getIndex(), status);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -512,13 +525,13 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
|
|||
wrongBreakPositions.addElement(temp2, status);
|
||||
}
|
||||
currentBreakPositions.push(temp, status);
|
||||
text->setIndex(currentBreakPositions.peeki());
|
||||
fText->setIndex(currentBreakPositions.peeki());
|
||||
}
|
||||
|
||||
// re-sync "c" for the next go-round, and drop out of the loop if
|
||||
// we've made it off the end of the range
|
||||
c = text->current();
|
||||
if (text->getIndex() >= endPos) {
|
||||
c = fText->current();
|
||||
if (fText->getIndex() >= endPos) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -526,7 +539,7 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
|
|||
// if we didn't hit any exceptional conditions on this last iteration,
|
||||
// just advance to the next character and loop
|
||||
else {
|
||||
c = text->next();
|
||||
c = fText->next();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,73 +1,53 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2000 IBM Corp. All rights reserved.
|
||||
* Copyright (C) 1999-2002 IBM Corp. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 12/1/99 rgillam Complete port from Java.
|
||||
* 01/13/2000 helena Added UErrorCode to ctors.
|
||||
* 06/14/2002 andy Gutted for new RBBI impl.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "ucmp8.h"
|
||||
#include "dbbi_tbl.h"
|
||||
#include "unicode/dbbi.h"
|
||||
#include "umutex.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
||||
//=======================================================================
|
||||
// constructor
|
||||
//=======================================================================
|
||||
|
||||
DictionaryBasedBreakIteratorTables::DictionaryBasedBreakIteratorTables(
|
||||
UDataMemory* tablesMemory,
|
||||
const char* dictionaryFilename,
|
||||
UErrorCode &status)
|
||||
: RuleBasedBreakIteratorTables(tablesMemory),
|
||||
dictionary(dictionaryFilename, status)
|
||||
{
|
||||
if(tablesMemory != 0) {
|
||||
const void* tablesImage = udata_getMemory(tablesMemory);
|
||||
if(tablesImage != 0) {
|
||||
if (U_FAILURE(status)) return;
|
||||
const int32_t* tablesIdx = (int32_t*) tablesImage;
|
||||
const int8_t* dbbiImage = ((const int8_t*)tablesImage + tablesIdx[8]);
|
||||
// we know the offset into the memory image where the DBBI stuff
|
||||
// starts is stored in element 8 of the array. There should be
|
||||
// a way for the RBBI constructor to give us this, but there's
|
||||
// isn't a good one.
|
||||
const int32_t* dbbiIdx = (const int32_t*)dbbiImage;
|
||||
|
||||
categoryFlags = (int8_t*)((const int8_t*)dbbiImage + (int32_t)dbbiIdx[0]);
|
||||
}
|
||||
UErrorCode &status) {
|
||||
fDictionary = new BreakDictionary(dictionaryFilename, status);
|
||||
fRefCount = 1;
|
||||
}
|
||||
|
||||
|
||||
void DictionaryBasedBreakIteratorTables::addReference() {
|
||||
umtx_atomic_inc(&fRefCount);
|
||||
}
|
||||
|
||||
|
||||
void DictionaryBasedBreakIteratorTables::removeReference() {
|
||||
if (umtx_atomic_dec(&fRefCount) == 0) {
|
||||
delete this;
|
||||
}
|
||||
}
|
||||
|
||||
//=======================================================================
|
||||
// boilerplate
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
DictionaryBasedBreakIteratorTables::~DictionaryBasedBreakIteratorTables() {
|
||||
if (ownTables)
|
||||
delete [] categoryFlags;
|
||||
delete fDictionary;
|
||||
fDictionary = NULL;
|
||||
}
|
||||
|
||||
int32_t
|
||||
DictionaryBasedBreakIteratorTables::lookupCategory(UChar c,
|
||||
BreakIterator* bi) const {
|
||||
// this override of lookupCategory() exists only to keep track of whether we've
|
||||
// passed over any dictionary characters. It calls the inherited lookupCategory()
|
||||
// to do the real work, and then checks whether its return value is one of the
|
||||
// categories represented in the dictionary. If it is, bump the dictionary-
|
||||
// character count.
|
||||
int32_t result = RuleBasedBreakIteratorTables::lookupCategory(c, bi);
|
||||
if (result != RuleBasedBreakIterator::UBRK_IGNORE && categoryFlags[result]) {
|
||||
((DictionaryBasedBreakIterator*)bi)->bumpDictionaryCharCount();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -11,7 +11,6 @@
|
|||
#ifndef DBBI_TBL_H
|
||||
#define DBBI_TBL_H
|
||||
|
||||
#include "rbbi_tbl.h"
|
||||
#include "brkdict.h"
|
||||
#include "unicode/udata.h"
|
||||
|
||||
|
@ -20,38 +19,42 @@ U_NAMESPACE_BEGIN
|
|||
/* forward declaration */
|
||||
class DictionaryBasedBreakIterator;
|
||||
|
||||
/**
|
||||
* This subclass of RuleBasedBreakIteratorTables contains the additional
|
||||
* static data that is used by DictionaryBasedBreakIterator. This comprises
|
||||
* the dictionary itself and an array of flags that indicate which characters
|
||||
* are in the dictionary.
|
||||
*
|
||||
* @author Richard Gillam
|
||||
*/
|
||||
class DictionaryBasedBreakIteratorTables : public RuleBasedBreakIteratorTables {
|
||||
//
|
||||
// DictionaryBasedBreakIteratorTables
|
||||
//
|
||||
// This class sits between instances of DictionaryBasedBreakIterator
|
||||
// and the dictionary data itself, which is of type BreakDictionary.
|
||||
// It provides reference counting, allowing multiple copies of a
|
||||
// DictionaryBasedBreakIterator to share a single instance of
|
||||
// BreakDictionary.
|
||||
//
|
||||
// TODO: it'd probably be cleaner to add the reference counting to
|
||||
// BreakDictionary and get rid of this class, but doing it this way
|
||||
// was a convenient transition from earlier code, and time is short...
|
||||
//
|
||||
class DictionaryBasedBreakIteratorTables {
|
||||
|
||||
private:
|
||||
/**
|
||||
* a list of known words that is used to divide up contiguous ranges of letters,
|
||||
* stored in a compressed, indexed, format that offers fast access
|
||||
*/
|
||||
BreakDictionary dictionary;
|
||||
int32_t fRefCount;
|
||||
|
||||
/**
|
||||
* a list of flags indicating which character categories are contained in
|
||||
* the dictionary file (this is used to determine which ranges of characters
|
||||
* to apply the dictionary to)
|
||||
*/
|
||||
int8_t* categoryFlags;
|
||||
|
||||
public:
|
||||
//=======================================================================
|
||||
// constructor
|
||||
//=======================================================================
|
||||
DictionaryBasedBreakIteratorTables(const char* dictionaryFilename,
|
||||
UErrorCode& status);
|
||||
|
||||
DictionaryBasedBreakIteratorTables(UDataMemory* tablesMemory,
|
||||
const char* dictionaryFilename,
|
||||
UErrorCode& status);
|
||||
|
||||
BreakDictionary *fDictionary;
|
||||
void addReference();
|
||||
void removeReference();
|
||||
/**
|
||||
* Destructor. Should not be used directly. Use removeReference() istead.
|
||||
* (Not private to avoid compiler warnings.)
|
||||
*/
|
||||
virtual ~DictionaryBasedBreakIteratorTables();
|
||||
|
||||
private:
|
||||
/**
|
||||
* The copy constructor is declared private and not implemented.
|
||||
* THIS CLASS MAY NOT BE COPIED.
|
||||
|
@ -62,26 +65,15 @@ private:
|
|||
// boilerplate
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
virtual ~DictionaryBasedBreakIteratorTables();
|
||||
|
||||
/**
|
||||
* The assignment operator is declared private and not implemented.
|
||||
* THIS CLASS MAY NOT BE COPIED.
|
||||
* Call addReference() and share an existing copy instead.
|
||||
*/
|
||||
DictionaryBasedBreakIteratorTables& operator=(
|
||||
const DictionaryBasedBreakIteratorTables& that);
|
||||
|
||||
protected:
|
||||
/**
|
||||
* Looks up a character's category (i.e., its category for breaking purposes,
|
||||
* not its Unicode category)
|
||||
*/
|
||||
virtual int32_t lookupCategory(UChar c, BreakIterator* bi) const;
|
||||
|
||||
friend class DictionaryBasedBreakIterator;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -31,7 +31,7 @@
|
|||
* 06/28/99 stephen Removed mutex locking in u_isBigEndian().
|
||||
* 08/04/99 jeffrey R. Added OS/2 changes
|
||||
* 11/15/99 helena Integrated S/390 IEEE support.
|
||||
* 04/26/01 Barry N. OS/400 support for uprv_getDefaultLocaleIDM
|
||||
* 04/26/01 Barry N. OS/400 support for uprv_getDefaultLocaleID
|
||||
* 08/15/01 Steven H. OS/400 support for uprv_getDefaultCodepage
|
||||
******************************************************************************
|
||||
*/
|
||||
|
@ -1811,6 +1811,22 @@ _uFmtErrorName[U_FMT_PARSE_ERROR_LIMIT - U_FMT_PARSE_ERROR_START] = {
|
|||
"U_UNSUPPORTED_ATTRIBUTE"
|
||||
};
|
||||
|
||||
static const char * const
|
||||
_uBrkErrorName[U_BRK_ERROR_LIMIT - U_BRK_ERROR_START] = {
|
||||
"U_BRK_ERROR_START",
|
||||
"U_BRK_INTERNAL_ERROR",
|
||||
"U_BRK_HEX_DIGITS_EXPECTED",
|
||||
"U_BRK_SEMICOLON_EXPECTED",
|
||||
"U_BRK_RULE_SYNTAX",
|
||||
"U_BRK_UNCLOSED_SET",
|
||||
"U_BRK_ASSIGN_ERROR",
|
||||
"U_BRK_VARIABLE_REDFINITION",
|
||||
"U_BRK_MISMATCHED_PAREN",
|
||||
"U_BRK_NEW_LINE_IN_QUOTED_STRING",
|
||||
"U_BRK_UNDEFINED_VARIABLE",
|
||||
};
|
||||
|
||||
|
||||
U_CAPI const char * U_EXPORT2
|
||||
u_errorName(UErrorCode code) {
|
||||
if(U_ZERO_ERROR <= code && code < U_STANDARD_ERROR_LIMIT) {
|
||||
|
@ -1821,6 +1837,8 @@ u_errorName(UErrorCode code) {
|
|||
return _uTransErrorName[code - U_PARSE_ERROR_START];
|
||||
} else if(U_FMT_PARSE_ERROR_START <= code && code < U_FMT_PARSE_ERROR_LIMIT){
|
||||
return _uFmtErrorName[code - U_FMT_PARSE_ERROR_START];
|
||||
} else if (U_BRK_ERROR_START <= code && code < U_BRK_ERROR_LIMIT){
|
||||
return _uBrkErrorName[code - U_BRK_ERROR_START];
|
||||
} else {
|
||||
return "[BOGUS UErrorCode]";
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load diff
305
icu4c/source/common/rbbicst.pl
Executable file
305
icu4c/source/common/rbbicst.pl
Executable file
|
@ -0,0 +1,305 @@
|
|||
#
|
||||
# rbbicst Compile the RBBI rule paser state table data into initialized C data.
|
||||
#
|
||||
|
||||
$num_states = 1; # Always the state number for the line being compiled.
|
||||
$line_num = 0; # The line number in the input file.
|
||||
|
||||
$states{"pop"} = 255; # Add the "pop" to the list of defined state names.
|
||||
# This prevents any state from being labelled with "pop",
|
||||
# and resolves references to "pop" in the next state field.
|
||||
|
||||
line_loop: while (<>) {
|
||||
chomp();
|
||||
$line = $_;
|
||||
@fields = split();
|
||||
$line_num++;
|
||||
|
||||
# Remove # comments, which are any fields beginning with a #, plus all
|
||||
# that follow on the line.
|
||||
for ($i=0; $i<@fields; $i++) {
|
||||
if ($fields[$i] =~ /^#/) {
|
||||
@fields = @fields[0 .. $i-1];
|
||||
last;
|
||||
}
|
||||
}
|
||||
# ignore blank lines, and those with no fields left after stripping comments..
|
||||
if (@fields == 0) {
|
||||
next;
|
||||
}
|
||||
|
||||
#
|
||||
# State Label: handling.
|
||||
# Does the first token end with a ":"? If so, it's the name of a state.
|
||||
# Put in a hash, together with the current state number,
|
||||
# so that we can later look up the number from the name.
|
||||
#
|
||||
if (@fields[0] =~ /.*:$/) {
|
||||
$state_name = @fields[0];
|
||||
$state_name =~ s/://; # strip off the colon from the state name.
|
||||
|
||||
if ($states{$state_name} != 0) {
|
||||
print " rbbicst: at line $line-num duplicate definition of state $state_name\n";
|
||||
}
|
||||
$states{$state_name} = $num_states;
|
||||
$stateNames[$num_states] = $state_name;
|
||||
|
||||
# if the label was the only thing on this line, go on to the next line,
|
||||
# otherwise assume that a state definition is on the same line and fall through.
|
||||
if (@fields == 1) {
|
||||
next line_loop;
|
||||
}
|
||||
shift @fields; # shift off label field in preparation
|
||||
# for handling the rest of the line.
|
||||
}
|
||||
|
||||
#
|
||||
# State Transition line.
|
||||
# syntax is this,
|
||||
# character [n] target-state [^push-state] [function-name]
|
||||
# where
|
||||
# [something] is an optional something
|
||||
# character is either a single quoted character e.g. '['
|
||||
# or a name of a character class, e.g. white_space
|
||||
#
|
||||
|
||||
$state_line_num[$num_states] = $line_num; # remember line number with each state
|
||||
# so we can make better error messages later.
|
||||
#
|
||||
# First field, character class or literal character for this transition.
|
||||
#
|
||||
if ($fields[0] =~ /^'.'$/) {
|
||||
# We've got a quoted literal character.
|
||||
$state_literal_chars[$num_states] = $fields[0];
|
||||
$state_literal_chars[$num_states] =~ s/'//g;
|
||||
} else {
|
||||
# We've got the name of a character class.
|
||||
$state_char_class[$num_states] = $fields[0];
|
||||
if ($fields[0] =~ /[\W]/) {
|
||||
print " rbbicsts: at line $line_num, bad character literal or character class name.\n";
|
||||
print " scanning $fields[0]\n";
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
shift @fields;
|
||||
|
||||
#
|
||||
# do the 'n' flag
|
||||
#
|
||||
$state_flag[$num_states] = "FALSE";
|
||||
if ($fields[0] eq "n") {
|
||||
$state_flag[$num_states] = "TRUE";
|
||||
shift @fields;
|
||||
}
|
||||
|
||||
#
|
||||
# do the destination state.
|
||||
#
|
||||
$state_dest_state[$num_states] = $fields[0];
|
||||
if ($fields[0] eq "") {
|
||||
print " rbbicsts: at line $line_num, destination state missing.\n";
|
||||
exit(-1);
|
||||
}
|
||||
shift @fields;
|
||||
|
||||
#
|
||||
# do the push state, if present.
|
||||
#
|
||||
if ($fields[0] =~ /^\^/) {
|
||||
$fields[0] =~ s/^\^//;
|
||||
$state_push_state[$num_states] = $fields[0];
|
||||
if ($fields[0] eq "" ) {
|
||||
print " rbbicsts: at line $line_num, expected state after ^ (no spaces).\n";
|
||||
exit(-1);
|
||||
}
|
||||
shift @fields;
|
||||
}
|
||||
|
||||
#
|
||||
# Lastly, do the optional action name.
|
||||
#
|
||||
if ($fields[0] ne "") {
|
||||
$state_func_name[$num_states] = $fields[0];
|
||||
shift @fields;
|
||||
}
|
||||
|
||||
#
|
||||
# There should be no fields left on the line at this point.
|
||||
#
|
||||
if (@fields > 0) {
|
||||
print " rbbicsts: at line $line_num, unexpected extra stuff on input line.\n";
|
||||
print " scanning $fields[0]\n";
|
||||
}
|
||||
$num_states++;
|
||||
}
|
||||
|
||||
#
|
||||
# We've read in the whole file, now go back and output the
|
||||
# C source code for the state transition table.
|
||||
#
|
||||
# We read all states first, before writing anything, so that the state numbers
|
||||
# for the destination states are all available to be written.
|
||||
#
|
||||
|
||||
#
|
||||
# Make hashes for the names of the character classes and
|
||||
# for the names of the actions that appeared.
|
||||
#
|
||||
for ($state=1; $state < $num_states; $state++) {
|
||||
if ($state_char_class[$state] ne "") {
|
||||
if ($charClasses{$state_char_class[$state]} == 0) {
|
||||
$charClasses{$state_char_class[$state]} = 1;
|
||||
}
|
||||
}
|
||||
if ($state_func_name[$state] eq "") {
|
||||
$state_func_name[$state] = "doNOP";
|
||||
}
|
||||
if ($actions{$state_action_name[$state]} == 0) {
|
||||
$actions{$state_func_name[$state]} = 1;
|
||||
}
|
||||
}
|
||||
|
||||
#
|
||||
# Check that all of the destination states have been defined
|
||||
#
|
||||
#
|
||||
$states{"exit"} = 0; # Predefined state name, terminates state machine.
|
||||
for ($state=1; $state<$num_states; $state++) {
|
||||
if ($states{$state_dest_state[$state]} == 0 && $state_dest_state[$state] ne "exit") {
|
||||
print "Error at line $state_line_num[$state]: target state \"$state_dest_state[$state]\" is not defined.\n";
|
||||
$errors++;
|
||||
}
|
||||
if ($state_push_state[$state] ne "" && $states{$state_push_state[$state]} == 0) {
|
||||
print "Error at line $state_line_num[$state]: target state \"$state_push_state[$state]\" is not defined.\n";
|
||||
$errors++;
|
||||
}
|
||||
}
|
||||
|
||||
die if ($errors>0);
|
||||
|
||||
print "//---------------------------------------------------------------------------------\n";
|
||||
print "//\n";
|
||||
print "// Generated Header File. Do not edit by hand.\n";
|
||||
print "// This file contains the state table for RBBI rule parser.\n";
|
||||
print "// It is generated by the Perl script \"rbbicst.pl\" from\n";
|
||||
print "// the rule parser state definitions file \"rbbirpt.txt\".\n";
|
||||
print "//\n";
|
||||
print "//---------------------------------------------------------------------------------\n";
|
||||
print "#ifndef RBBIRPT_H\n";
|
||||
print "#define RBBIRPT_H\n";
|
||||
print "\n";
|
||||
print "U_NAMESPACE_BEGIN\n";
|
||||
|
||||
#
|
||||
# Emit the constants for indicies of Unicode Sets
|
||||
# Define one constant for each of the character classes encountered.
|
||||
# At the same time, store the index corresponding to the set name back into hash.
|
||||
#
|
||||
print "//\n";
|
||||
print "// Character classes for RBBI rule scanning.\n";
|
||||
print "//\n";
|
||||
$i = 128; # State Table values for Unicode char sets range from 128-250.
|
||||
# Sets "default", "escaped", etc. get special handling.
|
||||
# They have no corresponding UnicodeSet object in the state machine,
|
||||
# but are handled by special case code. So we emit no reference
|
||||
# to a UnicodeSet object to them here.
|
||||
foreach $setName (keys %charClasses) {
|
||||
if ($setName eq "default") {
|
||||
$charClasses{$setName} = 255;}
|
||||
elsif ($setName eq "escaped") {
|
||||
$charClasses{$setName} = 254;}
|
||||
elsif ($setName eq "escapedP") {
|
||||
$charClasses{$setName} = 253;}
|
||||
elsif ($setName eq "eof") {
|
||||
$charClasses{$setName} = 252;}
|
||||
else {
|
||||
# Normal character class. Fill in array with a ptr to the corresponding UnicodeSet in the state machine.
|
||||
print " const uint8_t kRuleSet_$setName = $i;\n";
|
||||
$charClasses{$setName} = $i;
|
||||
$i++;
|
||||
}
|
||||
}
|
||||
print "\n\n";
|
||||
|
||||
#
|
||||
# Emit the enum for the actions to be performed.
|
||||
#
|
||||
print "enum RBBI_RuleParseAction {\n";
|
||||
foreach $act (keys %actions) {
|
||||
print " $act,\n";
|
||||
}
|
||||
print " rbbiLastAction};\n\n";
|
||||
|
||||
#
|
||||
# Emit the struct definition for transtion table elements.
|
||||
#
|
||||
print "//-------------------------------------------------------------------------------\n";
|
||||
print "//\n";
|
||||
print "// RBBIRuleTableEl represents the structure of a row in the transition table\n";
|
||||
print "// for the rule parser state machine.\n";
|
||||
print "//-------------------------------------------------------------------------------\n";
|
||||
print "struct RBBIRuleTableEl {\n";
|
||||
print " RBBI_RuleParseAction fAction;\n";
|
||||
print " uint8_t fCharClass; // 0-127: an individual ASCII character\n";
|
||||
print " // 128-255: character class index\n";
|
||||
print " uint8_t fNextState; // 0-250: normal next-stat numbers\n";
|
||||
print " // 255: pop next-state from stack.\n";
|
||||
print " uint8_t fPushState;\n";
|
||||
print " UBool fNextChar;\n";
|
||||
print "};\n\n";
|
||||
|
||||
#
|
||||
# emit the state transition table
|
||||
#
|
||||
print "struct RBBIRuleTableEl gRuleParseStateTable[] = {\n";
|
||||
print " {doNOP, 0, 0, 0, TRUE}\n"; # State 0 is a dummy. Real states start with index = 1.
|
||||
for ($state=1; $state < $num_states; $state++) {
|
||||
print " , {$state_func_name[$state],";
|
||||
if ($state_literal_chars[$state] ne "") {
|
||||
$c = $state_literal_chars[$state];
|
||||
printf(" %d /*$c*/,", ord($c)); #TODO: use numeric value, so EBCDIC machines are ok.
|
||||
}else {
|
||||
print " $charClasses{$state_char_class[$state]},";
|
||||
}
|
||||
print " $states{$state_dest_state[$state]},";
|
||||
|
||||
# The push-state field is optional. If omitted, fill field with a zero, which flags
|
||||
# the state machine that there is no push state.
|
||||
if ($state_push_state[$state] eq "") {
|
||||
print "0, ";
|
||||
} else {
|
||||
print " $states{$state_push_state[$state]},";
|
||||
}
|
||||
print " $state_flag[$state]} ";
|
||||
|
||||
# Put out a C++ comment showing the number (index) of this state row,
|
||||
# and, if this is the first row of the table for this state, the state name.
|
||||
print " // $state ";
|
||||
if ($stateNames[$state] ne "") {
|
||||
print " $stateNames[$state]";
|
||||
}
|
||||
print "\n";
|
||||
};
|
||||
print " };\n";
|
||||
|
||||
|
||||
#
|
||||
# emit a mapping array from state numbers to state names.
|
||||
#
|
||||
# This array is used for producing debugging output from the rule parser.
|
||||
#
|
||||
print "const char *RBBIRuleStateNames[] = {";
|
||||
for ($state=0; $state<$num_states; $state++) {
|
||||
if ($stateNames[$state] ne "") {
|
||||
print " \"$stateNames[$state]\",\n";
|
||||
} else {
|
||||
print " 0,\n";
|
||||
}
|
||||
}
|
||||
print " 0};\n\n";
|
||||
|
||||
print "U_NAMESPACE_END\n";
|
||||
print "#endif\n";
|
||||
|
||||
|
||||
|
226
icu4c/source/common/rbbidata.cpp
Normal file
226
icu4c/source/common/rbbidata.cpp
Normal file
|
@ -0,0 +1,226 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2002 International Business Machines Corporation *
|
||||
* and others. All rights reserved. *
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "cmemory.h"
|
||||
#include "rbbidata.h"
|
||||
#include "utrie.h"
|
||||
#include "udatamem.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// Constructors.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) {
|
||||
init(data, status);
|
||||
}
|
||||
|
||||
RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
|
||||
const RBBIDataHeader *d = (const RBBIDataHeader *)
|
||||
((char *)&(udm->pHeader->info) + udm->pHeader->info.size);
|
||||
init(d, status);
|
||||
fUDataMem = udm;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// Trie access folding function. Copied as-is from properties code in uchar.c
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
static int32_t U_CALLCONV
|
||||
getFoldingOffset(uint32_t data) {
|
||||
/* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */
|
||||
if(data&0x8000) {
|
||||
return (int32_t)(data&0x7fff);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// init(). Does most of the work of construction, shared between the
|
||||
// constructors.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
fHeader = data;
|
||||
if (fHeader->fMagic != 0xb1a0) {
|
||||
status = U_BRK_INTERNAL_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
fUDataMem = NULL;
|
||||
fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
|
||||
fReverseTable = NULL;
|
||||
if (data->fRTableLen != 0) {
|
||||
fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
|
||||
}
|
||||
|
||||
|
||||
utrie_unserialize(&fTrie,
|
||||
(uint8_t *)data + fHeader->fTrie,
|
||||
fHeader->fTrieLen,
|
||||
&status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
fTrie.getFoldingOffset=getFoldingOffset;
|
||||
|
||||
|
||||
fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource);
|
||||
fRuleString.setTo(TRUE, fRuleSource, -1);
|
||||
|
||||
fRefCount = 1;
|
||||
|
||||
char *debugEnv = getenv("U_RBBIDEBUG"); // TODO: make conditional on some compile time setting
|
||||
if (debugEnv && strstr(debugEnv, "data")) {this->printData();}
|
||||
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// Destructor. Don't call this - use removeReferenc() instead.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
RBBIDataWrapper::~RBBIDataWrapper() {
|
||||
assert(fRefCount == 0);
|
||||
if (fUDataMem) {
|
||||
udata_close(fUDataMem);
|
||||
} else {
|
||||
uprv_free((void *)fHeader);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// Operator == Consider two RBBIDataWrappers to be equal if they
|
||||
// refer to the same underlying data. Although
|
||||
// the data wrappers are normally shared between
|
||||
// iterator instances, it's possible to independently
|
||||
// open the same data twice, and get two instances, which
|
||||
// should still be ==.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const {
|
||||
if (fHeader == other.fHeader) {
|
||||
return TRUE;
|
||||
}
|
||||
if (fHeader->fLength != other.fHeader->fLength) {
|
||||
return FALSE;
|
||||
}
|
||||
if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) {
|
||||
return TRUE;
|
||||
}
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
int32_t RBBIDataWrapper::hashCode() {
|
||||
return fHeader->fFTableLen;
|
||||
;
|
||||
};
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// Reference Counting. A single RBBIDataWrapper object is shared among
|
||||
// however many RulesBasedBreakIterator instances are
|
||||
// referencing the same data.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBIDataWrapper::removeReference() {
|
||||
if (--fRefCount <= 0) { // TODO needs synchronization
|
||||
delete this;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
RBBIDataWrapper *RBBIDataWrapper::addReference() {
|
||||
++fRefCount; // TODO: needs synchronization
|
||||
return this;
|
||||
};
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// getRuleSourceString
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
const UnicodeString &RBBIDataWrapper::getRuleSourceString() {
|
||||
return fRuleString;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// print - debugging function to dump the runtime data tables.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBIDataWrapper::printData() {
|
||||
uint32_t c, s;
|
||||
|
||||
printf("RBBI Data at %x\n", fHeader);
|
||||
printf(" Version = %d\n", fHeader->fVersion);
|
||||
printf(" total length of data = %d\n", fHeader->fLength);
|
||||
printf(" number of character categories = %d\n\n", fHeader->fCatCount);
|
||||
|
||||
printf(" Forward State Transition Table\n");
|
||||
printf("State | Acc LA Tag");
|
||||
for (c=0; c<fHeader->fCatCount; c++) {printf("%3d ", c);};
|
||||
printf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {printf("----");}
|
||||
printf("\n");
|
||||
|
||||
for (s=0; s<fForwardTable->fNumStates; s++) {
|
||||
RBBIStateTableRow *row = (RBBIStateTableRow *)
|
||||
(fForwardTable->fTableData + (fForwardTable->fRowLen * s));
|
||||
printf("%4d | %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTag);
|
||||
for (c=0; c<fHeader->fCatCount; c++) {
|
||||
printf("%3d ", row->fNextState[c]);
|
||||
};
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("\nOrignal Rules source:\n");
|
||||
c = 0;
|
||||
for (;;) {
|
||||
if (fRuleSource[c] == 0)
|
||||
break;
|
||||
putchar(fRuleSource[c]);
|
||||
c++;
|
||||
}
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
134
icu4c/source/common/rbbidata.h
Normal file
134
icu4c/source/common/rbbidata.h
Normal file
|
@ -0,0 +1,134 @@
|
|||
// file: rbbidata.h
|
||||
//
|
||||
//**********************************************************************
|
||||
// Copyright (C) 1999 IBM Corp. All rights reserved.
|
||||
//**********************************************************************
|
||||
//
|
||||
// RBBI data formats Includes
|
||||
//
|
||||
// Structs that describes the format of the Binary RBBI data,
|
||||
// as it is stored in ICU's data file.
|
||||
//
|
||||
// RBBIDataWrapper - Instances of this class sit between the
|
||||
// raw data structs and the RulesBasedBreakIterator objects
|
||||
// that are created by applications. The wrapper class
|
||||
// provides reference counting for the underlying data,
|
||||
// and direct pointers to data that would not otherwise
|
||||
// be accessible without ugly pointer arithmetic. The
|
||||
// wrapper does not attempt to provide any higher level
|
||||
// abstractions for the data itself.
|
||||
//
|
||||
// There will be only one instance of RBBIDataWrapper for any
|
||||
// set of RBBI run time data being shared by instances
|
||||
// (clones) of RulesBasedBreakIterator.
|
||||
//
|
||||
|
||||
#ifndef __RBBIDATA_H__
|
||||
#define __RBBIDATA_H__
|
||||
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "utrie.h"
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
//
|
||||
// The following structs map exactly onto the raw data from ICU common data file.
|
||||
//
|
||||
struct RBBIDataHeader {
|
||||
uint32_t fMagic; // == 0xbla0
|
||||
uint32_t fVersion; // == 1
|
||||
uint32_t fLength; // Total length in bytes of this RBBI Data,
|
||||
// including all sections, not just the header.
|
||||
uint32_t fCatCount; // Number of character categories.
|
||||
|
||||
//
|
||||
// Offsets and sizes of each of the subsections within the RBBI data.
|
||||
// All offsets are bytes from the start of the RBBIDataHeader.
|
||||
// All sizes are in bytes.
|
||||
//
|
||||
uint32_t fFTable; // forward state transition table.
|
||||
uint32_t fFTableLen;
|
||||
uint32_t fRTable; // Offset to the reverse state transition table.
|
||||
uint32_t fRTableLen;
|
||||
uint32_t fTrie; // Offset to Trie data for character categories
|
||||
uint32_t fTrieLen;
|
||||
uint32_t fRuleSource; // Offset to the source for for the break
|
||||
uint32_t fRuleSourceLen; // rules. Stored UChar *.
|
||||
|
||||
uint32_t fReserved[8]; // Reserved for expansion
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
struct RBBIStateTableRow {
|
||||
int16_t fAccepting; // Non-zero if this row is for an accepting state.
|
||||
// Value is the {nnn} value to return to calling
|
||||
// application.
|
||||
int16_t fLookAhead; // Non-zero if this row is for a state that
|
||||
// corresponds to a '/' in the rule source.
|
||||
// Value is the same as the fAccepting
|
||||
// value for the rule (which will appear
|
||||
// in a different state.
|
||||
int16_t fTag; // Non-zero if this row covers a {tagged} position
|
||||
// from a rule. value is the tag number.
|
||||
int16_t fReserved;
|
||||
uint16_t fNextState[2]; // Next State, indexed by char category.
|
||||
// Array Size is fNumCols from the
|
||||
// state table header.
|
||||
// CAUTION: see RBBITableBuilder::getTableSize()
|
||||
// before changing anything here.
|
||||
};
|
||||
|
||||
|
||||
struct RBBIStateTable {
|
||||
uint32_t fNumStates; // Number of states.
|
||||
uint32_t fRowLen; // Length of a state table row, in bytes.
|
||||
char fTableData[4]; // First RBBIStateTableRow begins here.
|
||||
// (making it char[] simplifies ugly address
|
||||
// arithmetic for indexing variable length rows.)
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
// The reference counting wrapper class
|
||||
//
|
||||
class RBBIDataWrapper {
|
||||
public:
|
||||
RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status);
|
||||
RBBIDataWrapper(UDataMemory* udm, UErrorCode &status);
|
||||
RBBIDataWrapper(const RBBIDataWrapper &other);
|
||||
~RBBIDataWrapper();
|
||||
|
||||
void init(const RBBIDataHeader *data, UErrorCode &status);
|
||||
RBBIDataWrapper *addReference();
|
||||
void removeReference();
|
||||
UBool operator ==(const RBBIDataWrapper &other) const;
|
||||
int32_t hashCode();
|
||||
const UnicodeString &getRuleSourceString();
|
||||
void printData();
|
||||
|
||||
//
|
||||
// Pointers to items within the data
|
||||
//
|
||||
const RBBIDataHeader *fHeader;
|
||||
const RBBIStateTable *fForwardTable;
|
||||
const RBBIStateTable *fReverseTable;
|
||||
const UChar *fRuleSource;
|
||||
|
||||
UTrie fTrie;
|
||||
|
||||
|
||||
private:
|
||||
int32_t fRefCount;
|
||||
UDataMemory *fUDataMem;
|
||||
UnicodeString fRuleString;
|
||||
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
340
icu4c/source/common/rbbinode.cpp
Normal file
340
icu4c/source/common/rbbinode.cpp
Normal file
|
@ -0,0 +1,340 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2002 International Business Machines Corporation *
|
||||
* and others. All rights reserved. *
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
//
|
||||
// File: rbbinode.cpp
|
||||
//
|
||||
// Implementation of class RBBINode, which represents a node in the
|
||||
// tree generated when parsing the Rules Based Break Iterator rules.
|
||||
//
|
||||
// This "Class" is actually closer to a struct.
|
||||
// Code using it is expected to directly access fields much of the time.
|
||||
//
|
||||
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/parsepos.h"
|
||||
#include "uvector.h"
|
||||
|
||||
#include "rbbirb.h"
|
||||
#include "rbbinode.h"
|
||||
|
||||
#include "assert.h"
|
||||
|
||||
#include <stdio.h> // TODO - getrid of this.
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
int RBBINode::gLastSerial = 0;
|
||||
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
//
|
||||
// Constructor. Just set the fields to reasonable default values.
|
||||
//
|
||||
//-------------------------------------------------------------------------
|
||||
RBBINode::RBBINode(NodeType t) {
|
||||
fSerialNum = ++gLastSerial;
|
||||
fType = t;
|
||||
fParent = NULL;
|
||||
fLeftChild = NULL;
|
||||
fRightChild = NULL;
|
||||
fInputSet = NULL;
|
||||
fFirstPos = 0;
|
||||
fLastPos = 0;
|
||||
fNullable = FALSE;
|
||||
fLookAheadEnd = FALSE;
|
||||
fVal = 0;
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
fFirstPosSet = new UVector(status); // TODO - get a real status from somewhere
|
||||
fLastPosSet = new UVector(status);
|
||||
fFollowPos = new UVector(status);
|
||||
if (t==opCat) {fPrecedence = precOpCat;}
|
||||
else if (t==opOr) {fPrecedence = precOpOr;}
|
||||
else if (t==opStart) {fPrecedence = precStart;}
|
||||
else if (t= opLParen) {fPrecedence = precLParen;}
|
||||
|
||||
};
|
||||
|
||||
|
||||
RBBINode::RBBINode(const RBBINode &other) {
|
||||
fSerialNum = ++gLastSerial;
|
||||
fType = other.fType;
|
||||
fParent = NULL;
|
||||
fLeftChild = NULL;
|
||||
fRightChild = NULL;
|
||||
fInputSet = other.fInputSet;
|
||||
fPrecedence = other.fPrecedence;
|
||||
fText = other.fText;
|
||||
fFirstPos = other.fFirstPos;
|
||||
fLastPos = other.fLastPos;
|
||||
fNullable = other.fNullable;
|
||||
fVal = other.fVal;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
fFirstPosSet = new UVector(status); // TODO - get a real status from somewhere
|
||||
fLastPosSet = new UVector(status);
|
||||
fFollowPos = new UVector(status);
|
||||
};
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
//
|
||||
// Destructor. Deletes both this node AND any child nodes,
|
||||
// except in the case of variable reference nodes. For
|
||||
// these, the l. child points back to the definition, which
|
||||
// is common for all references to the variable, meaning
|
||||
// it can't be deleted here.
|
||||
//
|
||||
//-------------------------------------------------------------------------
|
||||
RBBINode::~RBBINode() {
|
||||
// printf("deleting node %8x serial %4d\n", this, this->fSerialNum);
|
||||
delete fInputSet;
|
||||
fInputSet = NULL;
|
||||
|
||||
switch (this->fType) {
|
||||
case varRef:
|
||||
case setRef:
|
||||
// for these node types, multiple instances point to the same "children"
|
||||
// Storage ownership of children handled elsewhere. Don't delete here.
|
||||
break;
|
||||
|
||||
case uset:
|
||||
delete fLeftChild;
|
||||
// For usets, don't delete the right child; it's used to form a linked list of usets.
|
||||
break;
|
||||
|
||||
default:
|
||||
delete fLeftChild;
|
||||
fLeftChild = NULL;
|
||||
delete fRightChild;
|
||||
fRightChild = NULL;
|
||||
}
|
||||
|
||||
|
||||
delete fFirstPosSet;
|
||||
delete fLastPosSet;
|
||||
delete fFollowPos;
|
||||
|
||||
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
//
|
||||
// cloneTree Make a copy of the subtree rooted at this node.
|
||||
// Discard any variable references encountered along the way,
|
||||
// and replace with copies of the variable's definitions.
|
||||
// Used to replicate the expression underneath variable
|
||||
// references in preparation for generating the DFA tables.
|
||||
//
|
||||
//-------------------------------------------------------------------------
|
||||
RBBINode *RBBINode::cloneTree() {
|
||||
RBBINode *n;
|
||||
|
||||
if (fType == RBBINode::varRef) {
|
||||
// If the current node is a variable reference, skip over it
|
||||
// and clone the definition of the variable instead.
|
||||
n = fLeftChild->cloneTree();
|
||||
} else if (fType == RBBINode::uset) {
|
||||
n = this;
|
||||
} else {
|
||||
n = new RBBINode(*this);
|
||||
if (fLeftChild != NULL) {
|
||||
n->fLeftChild = fLeftChild->cloneTree();
|
||||
n->fLeftChild->fParent = n;
|
||||
}
|
||||
if (fRightChild != NULL) {
|
||||
n->fRightChild = fRightChild->cloneTree();
|
||||
n->fRightChild->fParent = n;
|
||||
}
|
||||
}
|
||||
return n;
|
||||
};
|
||||
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
//
|
||||
// flattenVariables Walk a parse tree, replacing any variable
|
||||
// references with a copy of the variable's definition.
|
||||
// Aside from variables, the tree is not changed.
|
||||
//
|
||||
// This function works by recursively walking the tree
|
||||
// without doing anything until a variable reference is
|
||||
// found, then calling cloneTree() at that point. Any
|
||||
// nested references are handled by cloneTree(), not here.
|
||||
//
|
||||
//-------------------------------------------------------------------------
|
||||
void RBBINode::flattenVariables() {
|
||||
assert(fType != varRef);
|
||||
|
||||
if (fLeftChild != NULL) {
|
||||
if (fLeftChild->fType==varRef) {
|
||||
RBBINode *oldChild = fLeftChild;
|
||||
fLeftChild = oldChild->cloneTree();
|
||||
fLeftChild->fParent = this;
|
||||
delete oldChild;
|
||||
} else {
|
||||
fLeftChild->flattenVariables();
|
||||
}
|
||||
}
|
||||
|
||||
if (fRightChild != NULL) {
|
||||
if (fRightChild->fType==varRef) {
|
||||
RBBINode *oldChild = fRightChild;
|
||||
fRightChild = oldChild->cloneTree();
|
||||
fRightChild->fParent = this;
|
||||
delete oldChild;
|
||||
} else {
|
||||
fRightChild->flattenVariables();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
//
|
||||
// flattenSets Walk the parse tree, replacing any nodes of type setRef
|
||||
// with a copy of the expression tree for the set. A set's
|
||||
// equivalent expression tree is precomputed and saved as
|
||||
// the left child of the uset node.
|
||||
//
|
||||
//-------------------------------------------------------------------------
|
||||
void RBBINode::flattenSets() {
|
||||
assert(fType != setRef);
|
||||
|
||||
if (fLeftChild != NULL) {
|
||||
if (fLeftChild->fType==setRef) {
|
||||
RBBINode *setRefNode = fLeftChild;
|
||||
RBBINode *usetNode = setRefNode->fLeftChild;
|
||||
RBBINode *replTree = usetNode->fLeftChild;
|
||||
fLeftChild = replTree->cloneTree();
|
||||
fLeftChild->fParent = this;
|
||||
delete setRefNode;
|
||||
} else {
|
||||
fLeftChild->flattenSets();
|
||||
}
|
||||
}
|
||||
|
||||
if (fRightChild != NULL) {
|
||||
if (fRightChild->fType==setRef) {
|
||||
RBBINode *setRefNode = fRightChild;
|
||||
RBBINode *usetNode = setRefNode->fLeftChild;
|
||||
RBBINode *replTree = usetNode->fLeftChild;
|
||||
fRightChild = replTree->cloneTree();
|
||||
fRightChild->fParent = this;
|
||||
delete setRefNode;
|
||||
} else {
|
||||
fRightChild->flattenSets();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
//
|
||||
// findNodes() Locate all the nodes of the specified type, starting
|
||||
// at the specified root.
|
||||
//
|
||||
//-------------------------------------------------------------------------
|
||||
void RBBINode::findNodes(UVector *dest, RBBINode::NodeType kind, UErrorCode &status) {
|
||||
if (fType == kind) {
|
||||
dest->addElement(this, status);
|
||||
}
|
||||
if (fLeftChild != NULL) {
|
||||
fLeftChild->findNodes(dest, kind, status);
|
||||
}
|
||||
if (fRightChild !=NULL && fType != RBBINode::uset) {
|
||||
fRightChild->findNodes(dest, kind, status);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
//
|
||||
// print. Print out a single node, for debugging.
|
||||
//
|
||||
//-------------------------------------------------------------------------
|
||||
static const char *nodeTypeNames[] = {
|
||||
"setRef",
|
||||
"uset",
|
||||
"varRef",
|
||||
"leafChar",
|
||||
"lookAhead",
|
||||
"tag",
|
||||
"endMark",
|
||||
"opStart",
|
||||
"opCat",
|
||||
"opOr",
|
||||
"opStar",
|
||||
"opPlus",
|
||||
"opQuestion",
|
||||
"opBreak",
|
||||
"opReverse",
|
||||
"opLParen"
|
||||
};
|
||||
|
||||
void RBBINode::print() {
|
||||
printf("%10x %12s %10x %10x %10x %4d %6d %d ",
|
||||
this, nodeTypeNames[fType], fParent, fLeftChild, fRightChild,
|
||||
fSerialNum, fFirstPos, fVal);
|
||||
if (fType == varRef) {
|
||||
printUnicodeString(fText);
|
||||
}
|
||||
putc('\n', stdout);
|
||||
}
|
||||
|
||||
|
||||
void RBBINode::printUnicodeString(const UnicodeString &s, int minWidth)
|
||||
{
|
||||
int i;
|
||||
for (i=0; i<s.length(); i++) {
|
||||
putc(s.charAt(i), stdout);
|
||||
}
|
||||
for (i=s.length(); i<minWidth; i++) {
|
||||
putc(' ', stdout);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
//
|
||||
// print. Print out the tree of nodes rooted at "this"
|
||||
//
|
||||
//-------------------------------------------------------------------------
|
||||
void RBBINode::printTree(UBool printHeading, UBool doVars) {
|
||||
if (printHeading) {
|
||||
printf( "-------------------------------------------------------------------\n"
|
||||
" Address type Parent LeftChild RightChild serial position value\n"
|
||||
);
|
||||
}
|
||||
this->print();
|
||||
// Only dump the definition under a variable reference if asked to.
|
||||
// Unconditinally dump children of all other node types.
|
||||
if (fType != varRef || doVars) {
|
||||
if (fLeftChild != NULL) {
|
||||
fLeftChild->printTree(FALSE);
|
||||
}
|
||||
|
||||
// Note: The right child field of uset nodes is borrowed to link them into a list
|
||||
// They are actually a leaf node as far as the tree is concerned.
|
||||
if (fRightChild != NULL && this->fType != RBBINode::uset) {
|
||||
fRightChild->printTree(FALSE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
|
103
icu4c/source/common/rbbinode.h
Normal file
103
icu4c/source/common/rbbinode.h
Normal file
|
@ -0,0 +1,103 @@
|
|||
#ifndef RBBINODE_H
|
||||
#define RBBINODE_H
|
||||
|
||||
|
||||
//
|
||||
// class RBBINode
|
||||
//
|
||||
// Represents a node in the parse tree generated when reading
|
||||
// a rule file.
|
||||
//
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class UnicodeSet;
|
||||
class UVector;
|
||||
|
||||
class RBBINode {
|
||||
public:
|
||||
enum NodeType {
|
||||
setRef,
|
||||
uset,
|
||||
varRef,
|
||||
leafChar,
|
||||
lookAhead,
|
||||
tag,
|
||||
endMark,
|
||||
opStart,
|
||||
opCat,
|
||||
opOr,
|
||||
opStar,
|
||||
opPlus,
|
||||
opQuestion,
|
||||
opBreak,
|
||||
opReverse,
|
||||
opLParen
|
||||
};
|
||||
|
||||
enum OpPrecedence {
|
||||
precZero,
|
||||
precStart,
|
||||
precLParen,
|
||||
precOpOr,
|
||||
precOpCat
|
||||
};
|
||||
|
||||
NodeType fType;
|
||||
RBBINode *fParent;
|
||||
RBBINode *fLeftChild;
|
||||
RBBINode *fRightChild;
|
||||
UnicodeSet *fInputSet; // For uset nodes only.
|
||||
OpPrecedence fPrecedence; // For binary ops only.
|
||||
|
||||
UnicodeString fText; // Text corresponding to this node.
|
||||
// May be lazily evaluated when (if) needed
|
||||
// for some node types.
|
||||
int fFirstPos; // Position in the rule source string of the
|
||||
// first text associated with the node.
|
||||
// If there's a left child, this will be the same
|
||||
// as that child's left pos.
|
||||
int fLastPos; // Last position in the rule source string
|
||||
// of any text associated with this node.
|
||||
// If there's a right child, this will be the same
|
||||
// as that child's last postion.
|
||||
|
||||
UBool fNullable; // See Aho.
|
||||
int32_t fVal; // For leafChar nodes, the value.
|
||||
// Values are the character category,
|
||||
// corresponds to columns in the final
|
||||
// state transition table.
|
||||
|
||||
UBool fLookAheadEnd; // For endMark nodes, set TRUE if
|
||||
// marking the end of a look-ahead rule.
|
||||
|
||||
UVector *fFirstPosSet;
|
||||
UVector *fLastPosSet; // TODO: rename fFirstPos & fLastPos to avoid confusion.
|
||||
UVector *fFollowPos;
|
||||
|
||||
|
||||
RBBINode(NodeType t);
|
||||
RBBINode(const RBBINode &other);
|
||||
~RBBINode();
|
||||
|
||||
RBBINode *cloneTree();
|
||||
void flattenVariables();
|
||||
void flattenSets();
|
||||
void findNodes(UVector *dest, RBBINode::NodeType kind, UErrorCode &status);
|
||||
|
||||
void print();
|
||||
void printTree(UBool withHeading=TRUE, UBool doVars=FALSE);
|
||||
static void printUnicodeString(const UnicodeString &s, int minWidth=0);
|
||||
|
||||
private:
|
||||
void operator = (const RBBINode &other); // No defs.
|
||||
UBool operator == (const RBBINode &other); // Private, so these functions won't accidently be used.
|
||||
|
||||
int fSerialNum; // Debugging aids.
|
||||
static int gLastSerial;
|
||||
|
||||
};
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
238
icu4c/source/common/rbbirb.cpp
Normal file
238
icu4c/source/common/rbbirb.cpp
Normal file
|
@ -0,0 +1,238 @@
|
|||
//
|
||||
// file: rbbirb.cpp
|
||||
//
|
||||
// Copyright (C) 2002, International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains the RBBIRuleBuilder class implementation. This is the main class for
|
||||
// building (compiling) break rules into the tables required by the runtime
|
||||
// RBBI engine.
|
||||
//
|
||||
|
||||
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/rbbi.h"
|
||||
#include "unicode/ubrk.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uchriter.h"
|
||||
#include "unicode/parsepos.h"
|
||||
#include "unicode/parseerr.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
#include "rbbirb.h"
|
||||
#include "rbbinode.h"
|
||||
|
||||
#include "rbbiscan.h"
|
||||
#include "rbbisetb.h"
|
||||
#include "rbbitblb.h"
|
||||
|
||||
#include <stdio.h> // TODO - getrid of this.
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// Forward Declarations.
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
static void U_EXPORT2 U_CALLCONV RBBISetTable_deleter(void *p);
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// Constructor.
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
|
||||
UParseError &parseErr,
|
||||
UErrorCode &status)
|
||||
: fRules(rules)
|
||||
{
|
||||
fStatus = &status;
|
||||
fParseError = &parseErr;
|
||||
fDebugEnv = getenv("U_RBBIDEBUG"); // TODO: make conditional on some compile time setting
|
||||
|
||||
fScanner = new RBBIRuleScanner(this);
|
||||
fSetBuilder = new RBBISetBuilder(this);
|
||||
fSetsListHead = NULL;
|
||||
fForwardTree = NULL;
|
||||
fReverseTree = NULL;
|
||||
fForwardTables = NULL;
|
||||
fReverseTables = NULL;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// Destructor
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
RBBIRuleBuilder::~RBBIRuleBuilder() {
|
||||
|
||||
// Delete the linked lest of USet nodes and the corresponding UnicodeSets.
|
||||
// (Deleting a node deletes its children, so deleting the head node of
|
||||
// this list will take out the whole list.)
|
||||
RBBINode *n, *nextN;
|
||||
for (n=fSetsListHead; n!=NULL; n=nextN) {
|
||||
nextN = n->fRightChild;
|
||||
delete n;
|
||||
}
|
||||
fSetsListHead = NULL;
|
||||
|
||||
|
||||
delete fSetBuilder;
|
||||
delete fForwardTables;
|
||||
delete fReverseTables;
|
||||
delete fForwardTree;
|
||||
delete fReverseTree;
|
||||
delete fScanner;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// flattenData() - Collect up the compiled RBBI rule data and put it into
|
||||
// the format for saving in ICU data files,
|
||||
// which is also the format needed by the RBBI runtime engine.
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;};
|
||||
RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Calculate the size of each section in the data.
|
||||
// Sizes here are padded up to a multiple of 8 for better memory alignment.
|
||||
// Sections sizes actually stored in the header are for the actual data
|
||||
// without the padding.
|
||||
//
|
||||
int32_t headerSize = align8(sizeof(RBBIDataHeader));
|
||||
int32_t forwardTableSize = align8(fForwardTables->getTableSize());
|
||||
int32_t reverseTableSize = align8(fReverseTables->getTableSize());
|
||||
int32_t trieSize = align8(fSetBuilder->getTrieSize());
|
||||
int32_t rulesSize = align8((fRules.length()+1) * sizeof(UChar));
|
||||
|
||||
int32_t totalSize = headerSize + forwardTableSize + reverseTableSize
|
||||
+ trieSize + rulesSize;
|
||||
RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
|
||||
if (data == NULL) {
|
||||
*fStatus = U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
uprv_memset(data, 0, totalSize);
|
||||
|
||||
|
||||
data->fMagic = 0xb1a0;
|
||||
data->fVersion = 1;
|
||||
data->fLength = totalSize;
|
||||
data->fCatCount = fSetBuilder->getNumCharCategories();
|
||||
|
||||
data->fFTable = headerSize;
|
||||
data->fFTableLen = forwardTableSize;
|
||||
data->fRTable = data->fFTable + forwardTableSize;
|
||||
data->fRTableLen = reverseTableSize;
|
||||
data->fTrie = data->fRTable + reverseTableSize;
|
||||
data->fTrieLen = fSetBuilder->getTrieSize();
|
||||
data->fRuleSource = data->fTrie + trieSize;
|
||||
data->fRuleSourceLen = fRules.length() * sizeof(UChar);
|
||||
|
||||
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
|
||||
|
||||
fForwardTables->exportTable((uint8_t *)data + data->fFTable);
|
||||
fReverseTables->exportTable((uint8_t *)data + data->fRTable);
|
||||
fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
|
||||
fRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//
|
||||
// RulesBasedBreakIterator, construct from source rules that are passed in
|
||||
// in a UnicodeString
|
||||
//
|
||||
BreakIterator *
|
||||
RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
|
||||
UParseError &parseError,
|
||||
UErrorCode &status)
|
||||
{
|
||||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//
|
||||
// Read the input rules, generate a parse tree, symbol table,
|
||||
// and list of all Unicode Sets referenced by the rules.
|
||||
//
|
||||
RBBIRuleBuilder builder(rules, parseError, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
}
|
||||
builder.fScanner->parse();
|
||||
|
||||
//
|
||||
// UnicodeSet processing.
|
||||
// Munge the Unicode Sets to create a set of character categories.
|
||||
// Generate the mapping tables (TRIE) from input 32-bit characters to
|
||||
// the character categories.
|
||||
//
|
||||
builder.fSetBuilder->build();
|
||||
|
||||
|
||||
//
|
||||
// Generate the DFA state transition table.
|
||||
//
|
||||
builder.fForwardTables = new RBBITableBuilder(&builder, builder.fForwardTree);
|
||||
builder.fReverseTables = new RBBITableBuilder(&builder, builder.fReverseTree);
|
||||
builder.fForwardTables->build();
|
||||
builder.fReverseTables->build();
|
||||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Package up the compiled data into a memory image
|
||||
// in the run-time format.
|
||||
//
|
||||
RBBIDataHeader *data;
|
||||
data = builder.flattenData();
|
||||
|
||||
|
||||
//
|
||||
// Clean up the compiler related stuff
|
||||
//
|
||||
|
||||
|
||||
//
|
||||
// Create a break iterator from the compiled rules.
|
||||
// (Identical to creation from stored pre-compiled rules)
|
||||
//
|
||||
RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
|
||||
if (U_FAILURE(status)) {
|
||||
delete This;
|
||||
This = NULL;
|
||||
}
|
||||
return This;
|
||||
}
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
160
icu4c/source/common/rbbirb.h
Normal file
160
icu4c/source/common/rbbirb.h
Normal file
|
@ -0,0 +1,160 @@
|
|||
//
|
||||
// rbbirb.h
|
||||
//
|
||||
// Copyright (C) 2002, International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains declarations for several from the Rule Based Break Iterator rule builder.
|
||||
//
|
||||
|
||||
|
||||
#ifndef RBBIRB_H
|
||||
#define RBBIRB_H
|
||||
|
||||
#include "unicode/rbbi.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/parseerr.h"
|
||||
#include "uhash.h"
|
||||
#include "uvector.h"
|
||||
#include "symtable.h" // For UnicodeSet parsing, is the interface that
|
||||
// looks up references to $variables within a set.
|
||||
// #include "rbbinode.h"
|
||||
// #include "rbbitblb.h"
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class RBBIRuleScanner;
|
||||
struct RBBIRuleTableEl;
|
||||
class RBBISetBuilder;
|
||||
class RBBINode;
|
||||
class RBBITableBuilder;
|
||||
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// RBBISymbolTable. Implements SymbolTable interface that is used by the
|
||||
// UnicodeSet parser to resolve references to $variables.
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
class RBBISymbolTableEntry { // The symbol table hash table contains one
|
||||
public: // of these structs for each entry.
|
||||
UnicodeString key;
|
||||
RBBINode *val;
|
||||
~RBBISymbolTableEntry();
|
||||
};
|
||||
|
||||
|
||||
class RBBISymbolTable : public SymbolTable {
|
||||
private:
|
||||
const UnicodeString &fRules;
|
||||
UHashtable *fHashTable;
|
||||
RBBIRuleScanner *fRuleScanner;
|
||||
|
||||
// These next two fields are part of the mechanism for passing references to
|
||||
// already-constructed UnicodeSets back to the UnicodeSet constructor
|
||||
// when the pattern includes $variable references.
|
||||
const UnicodeString ffffString; // = "/uffff"
|
||||
UnicodeSet *fCachedSetLookup;
|
||||
|
||||
public:
|
||||
// API inherited from class SymbolTable
|
||||
virtual const UnicodeString* lookup(const UnicodeString& s) const;
|
||||
virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const;
|
||||
virtual UnicodeString parseReference(const UnicodeString& text,
|
||||
ParsePosition& pos, int32_t limit) const;
|
||||
|
||||
// Additional Functions
|
||||
RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status);
|
||||
virtual ~RBBISymbolTable();
|
||||
|
||||
virtual RBBINode *lookupNode(const UnicodeString &key) const;
|
||||
virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err);
|
||||
|
||||
virtual void print() const;
|
||||
};
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// class RBBIRuleBuilder The top-level class handling RBBI rule compiling.
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
class RBBIRuleBuilder {
|
||||
public:
|
||||
|
||||
// Create a rule based break iterator from a set of rules.
|
||||
// This function is the main entry point into the rule builder. The
|
||||
// public ICU API for creating RBBIs uses this function to do the actual work.
|
||||
//
|
||||
static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules,
|
||||
UParseError &parseError,
|
||||
UErrorCode &status);
|
||||
|
||||
|
||||
public:
|
||||
// The "public" functions and data members that appear below are accessed
|
||||
// (and shared) by the various parts that make up the rule builder. They
|
||||
// are NOT intended to be accessed by anything outside of the
|
||||
// rule builder implementation.
|
||||
RBBIRuleBuilder(const UnicodeString &rules,
|
||||
UParseError &parseErr,
|
||||
UErrorCode &status
|
||||
);
|
||||
|
||||
virtual ~RBBIRuleBuilder();
|
||||
char *fDebugEnv; // controls debug trace output
|
||||
UErrorCode *fStatus; // Error reporting. Keeping status
|
||||
UParseError *fParseError; // here avoids passing it everywhere.
|
||||
const UnicodeString &fRules; // The rule string that we are compiling
|
||||
|
||||
RBBIRuleScanner *fScanner; // The scanner.
|
||||
RBBINode *fForwardTree; // The parse trees, generated by the scanner,
|
||||
RBBINode *fReverseTree; // then manipulated by subsequent steps.
|
||||
|
||||
RBBISetBuilder *fSetBuilder; // Set and Character Category builder.
|
||||
RBBINode *fSetsListHead; // Head of the linked list of UnicodeSets
|
||||
// (uset nodes.)
|
||||
|
||||
RBBITableBuilder *fForwardTables; // State transition tables
|
||||
RBBITableBuilder *fReverseTables;
|
||||
|
||||
RBBIDataHeader *flattenData(); // Create the flattened (runtime format)
|
||||
// data tables..
|
||||
|
||||
private:
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//
|
||||
// RBBISetTableEl is an entry in the hash table of UnicodeSets that have
|
||||
// been encountered. The val Node will be of nodetype uset
|
||||
// and contain pointers to the actual UnicodeSets.
|
||||
// The Key is the source string for initializing the set.
|
||||
//
|
||||
// The hash table is used to avoid creating duplicate
|
||||
// unnamed (not $var references) UnicodeSets.
|
||||
//
|
||||
// Memory Management:
|
||||
// The Hash Table owns these RBBISetTableEl structs and
|
||||
// the key strings. It does NOT own the val nodes.
|
||||
//
|
||||
//----------------------------------------------------------------------------
|
||||
struct RBBISetTableEl {
|
||||
UnicodeString *key;
|
||||
RBBINode *val;
|
||||
};
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif
|
||||
|
||||
|
||||
|
247
icu4c/source/common/rbbirpt.h
Normal file
247
icu4c/source/common/rbbirpt.h
Normal file
|
@ -0,0 +1,247 @@
|
|||
//---------------------------------------------------------------------------------
|
||||
//
|
||||
// Generated Header File. Do not edit by hand.
|
||||
// This file contains the state table for RBBI rule parser.
|
||||
// It is generated by the Perl script "rbbicst.pl" from
|
||||
// the rule parser state definitions file "rbbirpt.txt".
|
||||
//
|
||||
//---------------------------------------------------------------------------------
|
||||
#ifndef RBBIRPT_H
|
||||
#define RBBIRPT_H
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
//
|
||||
// Character classes for RBBI rule scanning.
|
||||
//
|
||||
const uint8_t kRuleSet_digit_char = 128;
|
||||
const uint8_t kRuleSet_rule_char = 129;
|
||||
const uint8_t kRuleSet_white_space = 130;
|
||||
const uint8_t kRuleSet_name_char = 131;
|
||||
const uint8_t kRuleSet_name_start_char = 132;
|
||||
|
||||
|
||||
enum RBBI_RuleParseAction {
|
||||
doExprOrOperator,
|
||||
doRuleErrorAssignExpr,
|
||||
doTagValue,
|
||||
doEndAssign,
|
||||
doRuleError,
|
||||
doVariableNameExpectedErr,
|
||||
doRuleChar,
|
||||
doLParen,
|
||||
doSlash,
|
||||
doStartTagValue,
|
||||
doDotAny,
|
||||
doExprFinished,
|
||||
doScanUnicodeSet,
|
||||
doExprRParen,
|
||||
doStartVariableName,
|
||||
doTagExpectedError,
|
||||
doTagDigit,
|
||||
doUnaryOpStar,
|
||||
doEndVariableName,
|
||||
doNOP,
|
||||
doUnaryOpQuestion,
|
||||
doExit,
|
||||
doStartAssign,
|
||||
doEndOfRule,
|
||||
doUnaryOpPlus,
|
||||
doExprStart,
|
||||
doExprCatOperator,
|
||||
doReverseDir,
|
||||
doCheckVarDef,
|
||||
rbbiLastAction};
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// RBBIRuleTableEl represents the structure of a row in the transition table
|
||||
// for the rule parser state machine.
|
||||
//-------------------------------------------------------------------------------
|
||||
struct RBBIRuleTableEl {
|
||||
RBBI_RuleParseAction fAction;
|
||||
uint8_t fCharClass; // 0-127: an individual ASCII character
|
||||
// 128-255: character class index
|
||||
uint8_t fNextState; // 0-250: normal next-stat numbers
|
||||
// 255: pop next-state from stack.
|
||||
uint8_t fPushState;
|
||||
UBool fNextChar;
|
||||
};
|
||||
|
||||
struct RBBIRuleTableEl gRuleParseStateTable[] = {
|
||||
{doNOP, 0, 0, 0, TRUE}
|
||||
, {doExprStart, 254, 12, 8, FALSE} // 1 start
|
||||
, {doNOP, 130, 1,0, TRUE} // 2
|
||||
, {doExprStart, 36 /*$*/, 70, 80, FALSE} // 3
|
||||
, {doReverseDir, 33 /*!*/, 11,0, TRUE} // 4
|
||||
, {doNOP, 59 /*;*/, 1,0, TRUE} // 5
|
||||
, {doNOP, 252, 0,0, FALSE} // 6
|
||||
, {doExprStart, 255, 12, 8, FALSE} // 7
|
||||
, {doEndOfRule, 59 /*;*/, 1,0, TRUE} // 8 break-rule-end
|
||||
, {doNOP, 130, 8,0, TRUE} // 9
|
||||
, {doRuleError, 255, 85,0, FALSE} // 10
|
||||
, {doExprStart, 255, 12, 8, FALSE} // 11 reverse-rule
|
||||
, {doRuleChar, 254, 21,0, TRUE} // 12 term
|
||||
, {doNOP, 130, 12,0, TRUE} // 13
|
||||
, {doRuleChar, 129, 21,0, TRUE} // 14
|
||||
, {doNOP, 91 /*[*/, 76, 21, FALSE} // 15
|
||||
, {doLParen, 40 /*(*/, 12, 21, TRUE} // 16
|
||||
, {doNOP, 36 /*$*/, 70, 20, FALSE} // 17
|
||||
, {doDotAny, 46 /*.*/, 21,0, TRUE} // 18
|
||||
, {doRuleError, 255, 85,0, FALSE} // 19
|
||||
, {doCheckVarDef, 255, 21,0, FALSE} // 20 term-var-ref
|
||||
, {doUnaryOpStar, 42 /***/, 25,0, TRUE} // 21 expr-mod
|
||||
, {doUnaryOpPlus, 43 /*+*/, 25,0, TRUE} // 22
|
||||
, {doUnaryOpQuestion, 63 /*?*/, 25,0, TRUE} // 23
|
||||
, {doNOP, 255, 25,0, FALSE} // 24
|
||||
, {doExprCatOperator, 254, 12,0, FALSE} // 25 expr-cont
|
||||
, {doNOP, 130, 25,0, TRUE} // 26
|
||||
, {doExprCatOperator, 129, 12,0, FALSE} // 27
|
||||
, {doExprCatOperator, 91 /*[*/, 12,0, FALSE} // 28
|
||||
, {doExprCatOperator, 40 /*(*/, 12,0, FALSE} // 29
|
||||
, {doExprCatOperator, 36 /*$*/, 12,0, FALSE} // 30
|
||||
, {doExprCatOperator, 46 /*.*/, 12,0, FALSE} // 31
|
||||
, {doExprCatOperator, 47 /*/*/, 37,0, FALSE} // 32
|
||||
, {doExprCatOperator, 123 /*{*/, 49,0, FALSE} // 33
|
||||
, {doExprOrOperator, 124 /*|*/, 12,0, TRUE} // 34
|
||||
, {doExprRParen, 41 /*)*/, 255,0, TRUE} // 35
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 36
|
||||
, {doSlash, 47 /*/*/, 39,0, TRUE} // 37 look-ahead
|
||||
, {doNOP, 255, 85,0, FALSE} // 38
|
||||
, {doExprCatOperator, 254, 12,0, FALSE} // 39 expr-cont-no-slash
|
||||
, {doNOP, 130, 25,0, TRUE} // 40
|
||||
, {doExprCatOperator, 129, 12,0, FALSE} // 41
|
||||
, {doExprCatOperator, 91 /*[*/, 12,0, FALSE} // 42
|
||||
, {doExprCatOperator, 40 /*(*/, 12,0, FALSE} // 43
|
||||
, {doExprCatOperator, 36 /*$*/, 12,0, FALSE} // 44
|
||||
, {doExprCatOperator, 46 /*.*/, 12,0, FALSE} // 45
|
||||
, {doExprOrOperator, 124 /*|*/, 12,0, TRUE} // 46
|
||||
, {doExprRParen, 41 /*)*/, 255,0, TRUE} // 47
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 48
|
||||
, {doNOP, 130, 49,0, TRUE} // 49 tag-open
|
||||
, {doStartTagValue, 128, 52,0, FALSE} // 50
|
||||
, {doTagExpectedError, 255, 85,0, FALSE} // 51
|
||||
, {doNOP, 130, 56,0, TRUE} // 52 tag-value
|
||||
, {doNOP, 125 /*}*/, 56,0, FALSE} // 53
|
||||
, {doTagDigit, 128, 52,0, TRUE} // 54
|
||||
, {doTagExpectedError, 255, 85,0, FALSE} // 55
|
||||
, {doNOP, 130, 56,0, TRUE} // 56 tag-close
|
||||
, {doTagValue, 125 /*}*/, 59,0, TRUE} // 57
|
||||
, {doTagExpectedError, 255, 85,0, FALSE} // 58
|
||||
, {doExprCatOperator, 254, 12,0, FALSE} // 59 expr-cont-no-tag
|
||||
, {doNOP, 130, 59,0, TRUE} // 60
|
||||
, {doExprCatOperator, 129, 12,0, FALSE} // 61
|
||||
, {doExprCatOperator, 91 /*[*/, 12,0, FALSE} // 62
|
||||
, {doExprCatOperator, 40 /*(*/, 12,0, FALSE} // 63
|
||||
, {doExprCatOperator, 36 /*$*/, 12,0, FALSE} // 64
|
||||
, {doExprCatOperator, 46 /*.*/, 12,0, FALSE} // 65
|
||||
, {doExprCatOperator, 47 /*/*/, 37,0, FALSE} // 66
|
||||
, {doExprOrOperator, 124 /*|*/, 12,0, TRUE} // 67
|
||||
, {doExprRParen, 41 /*)*/, 255,0, TRUE} // 68
|
||||
, {doExprFinished, 255, 255,0, FALSE} // 69
|
||||
, {doStartVariableName, 36 /*$*/, 72,0, TRUE} // 70 scan-var-name
|
||||
, {doNOP, 255, 85,0, FALSE} // 71
|
||||
, {doNOP, 132, 74,0, TRUE} // 72 scan-var-start
|
||||
, {doVariableNameExpectedErr, 255, 85,0, FALSE} // 73
|
||||
, {doNOP, 131, 74,0, TRUE} // 74 scan-var-body
|
||||
, {doEndVariableName, 255, 255,0, FALSE} // 75
|
||||
, {doScanUnicodeSet, 91 /*[*/, 255,0, TRUE} // 76 scan-unicode-set
|
||||
, {doScanUnicodeSet, 112 /*p*/, 255,0, TRUE} // 77
|
||||
, {doScanUnicodeSet, 80 /*P*/, 255,0, TRUE} // 78
|
||||
, {doNOP, 255, 85,0, FALSE} // 79
|
||||
, {doNOP, 130, 80,0, TRUE} // 80 assign-or-rule
|
||||
, {doStartAssign, 61 /*=*/, 12, 83, TRUE} // 81
|
||||
, {doNOP, 255, 20, 8, FALSE} // 82
|
||||
, {doEndAssign, 59 /*;*/, 1,0, TRUE} // 83 assign-end
|
||||
, {doRuleErrorAssignExpr, 255, 85,0, FALSE} // 84
|
||||
, {doExit, 255, 85,0, TRUE} // 85 errorDeath
|
||||
};
|
||||
const char *RBBIRuleStateNames[] = { 0,
|
||||
"start",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"break-rule-end",
|
||||
0,
|
||||
0,
|
||||
"reverse-rule",
|
||||
"term",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"term-var-ref",
|
||||
"expr-mod",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"expr-cont",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"look-ahead",
|
||||
0,
|
||||
"expr-cont-no-slash",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"tag-open",
|
||||
0,
|
||||
0,
|
||||
"tag-value",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"tag-close",
|
||||
0,
|
||||
0,
|
||||
"expr-cont-no-tag",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"scan-var-name",
|
||||
0,
|
||||
"scan-var-start",
|
||||
0,
|
||||
"scan-var-body",
|
||||
0,
|
||||
"scan-unicode-set",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"assign-or-rule",
|
||||
0,
|
||||
0,
|
||||
"assign-end",
|
||||
0,
|
||||
"errorDeath",
|
||||
0};
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif
|
296
icu4c/source/common/rbbirpt.txt
Normal file
296
icu4c/source/common/rbbirpt.txt
Normal file
|
@ -0,0 +1,296 @@
|
|||
|
||||
#*****************************************************************************
|
||||
#
|
||||
# Copyright (C) 2002, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
#*****************************************************************************
|
||||
#
|
||||
# file: rbbirpt.txt
|
||||
# ICU Break Iterator Rule Parser State Table
|
||||
#
|
||||
# This state table is used when reading and parsing a set of RBBI rules
|
||||
# The rule parser uses a state machine; the data in this file define the
|
||||
# state transitions that occur for each input character.
|
||||
#
|
||||
# *** This file defines the RBBI rule grammar. This is it.
|
||||
# *** The determination of what is accepted is here.
|
||||
#
|
||||
# This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
|
||||
# that are then built with the rule parser.
|
||||
#
|
||||
|
||||
#
|
||||
# Here is the syntax of the state definitions in this file:
|
||||
#
|
||||
#
|
||||
#StateName:
|
||||
# input-char n next-state ^push-state action
|
||||
# input-char n next-state ^push-state action
|
||||
# | | | | |
|
||||
# | | | | |--- action to be performed by state machine
|
||||
# | | | | See function RBBIRuleScanner::doParseActions()
|
||||
# | | | |
|
||||
# | | | |--- Push this named state onto the state stack.
|
||||
# | | | Later, when next state is specified as "pop",
|
||||
# | | | the pushed state will become the current state.
|
||||
# | | |
|
||||
# | | |--- Transition to this state if the current input character matches the input
|
||||
# | | character or char class in the left hand column. "pop" causes the next
|
||||
# | | state to be popped from the state stack.
|
||||
# | |
|
||||
# | |--- When making the state transition specified on this line, advance to the next
|
||||
# | character from the input only if 'n' appears here.
|
||||
# |
|
||||
# |--- Character or named character classes to test for. If the current character being scanned
|
||||
# matches, peform the actions and go to the state specified on this line.
|
||||
# The input character is tested sequentally, in the order written. The characters and
|
||||
# character classes tested for do not need to be mutually exclusive. The first match wins.
|
||||
#
|
||||
|
||||
|
||||
|
||||
|
||||
#
|
||||
# start state, scan position is at the beginning of the rules file, or in between two rules.
|
||||
#
|
||||
start:
|
||||
escaped term ^break-rule-end doExprStart
|
||||
white_space n start
|
||||
'$' scan-var-name ^assign-or-rule doExprStart
|
||||
'!' n reverse-rule doReverseDir
|
||||
';' n start # ignore empty rules.
|
||||
eof exit
|
||||
default term ^break-rule-end doExprStart
|
||||
|
||||
#
|
||||
# break-rule-end: Returned from doing a break-rule expression.
|
||||
#
|
||||
break-rule-end:
|
||||
';' n start doEndOfRule
|
||||
white_space n break-rule-end
|
||||
default errorDeath doRuleError
|
||||
|
||||
|
||||
#
|
||||
# Reverse Rule We've just scanned a '!', indicating a reverse direction rule.
|
||||
# A rule expression must follow.
|
||||
#
|
||||
reverse-rule:
|
||||
default term ^break-rule-end doExprStart
|
||||
|
||||
|
||||
#
|
||||
# term. Eat through a single rule character, or a composite thing, which
|
||||
# could be a parenthesized expression, a variable name, or a Unicode Set.
|
||||
#
|
||||
term:
|
||||
escaped n expr-mod doRuleChar
|
||||
white_space n term
|
||||
rule_char n expr-mod doRuleChar
|
||||
'[' scan-unicode-set ^expr-mod
|
||||
'(' n term ^expr-mod doLParen
|
||||
'$' scan-var-name ^term-var-ref
|
||||
'.' n expr-mod doDotAny
|
||||
default errorDeath doRuleError
|
||||
|
||||
|
||||
|
||||
#
|
||||
# term-var-ref We've just finished scanning a reference to a $variable.
|
||||
# Check that the variable was defined.
|
||||
# The variable name scanning is in common with assignment statements,
|
||||
# so the check can't be done there.
|
||||
term-var-ref:
|
||||
default expr-mod doCheckVarDef
|
||||
|
||||
|
||||
#
|
||||
# expr-mod We've just finished scanning a term, now look for the optional
|
||||
# trailing '*', '?', '+'
|
||||
#
|
||||
expr-mod:
|
||||
'*' n expr-cont doUnaryOpStar
|
||||
'+' n expr-cont doUnaryOpPlus
|
||||
'?' n expr-cont doUnaryOpQuestion
|
||||
default expr-cont
|
||||
|
||||
|
||||
#
|
||||
# expr-cont Expression, continuation. At a point where additional terms are
|
||||
# allowed, but not required.
|
||||
#
|
||||
expr-cont:
|
||||
escaped term doExprCatOperator
|
||||
white_space n expr-cont
|
||||
rule_char term doExprCatOperator
|
||||
'[' term doExprCatOperator
|
||||
'(' term doExprCatOperator
|
||||
'$' term doExprCatOperator
|
||||
'.' term doExprCatOperator
|
||||
'/' look-ahead doExprCatOperator
|
||||
'{' tag-open doExprCatOperator
|
||||
'|' n term doExprOrOperator
|
||||
')' n pop doExprRParen
|
||||
default pop doExprFinished
|
||||
|
||||
|
||||
#
|
||||
# look-ahead Scanning a '/', which identifies a break point, assuming that the
|
||||
# remainder of the expression matches.
|
||||
#
|
||||
# Generate a parse tree as if this was a special kind of input symbol
|
||||
# appearing in an otherwise normal concatenation expression.
|
||||
#
|
||||
look-ahead:
|
||||
'/' n expr-cont-no-slash doSlash
|
||||
default errorDeath
|
||||
|
||||
|
||||
#
|
||||
# expr-cont-no-slash Expression, continuation. At a point where additional terms are
|
||||
# allowed, but not required. Just like
|
||||
# expr-cont, above, except that no '/'
|
||||
# look-ahead symbol is permitted.
|
||||
#
|
||||
expr-cont-no-slash:
|
||||
escaped term doExprCatOperator
|
||||
white_space n expr-cont
|
||||
rule_char term doExprCatOperator
|
||||
'[' term doExprCatOperator
|
||||
'(' term doExprCatOperator
|
||||
'$' term doExprCatOperator
|
||||
'.' term doExprCatOperator
|
||||
'|' n term doExprOrOperator
|
||||
')' n pop doExprRParen
|
||||
default pop doExprFinished
|
||||
|
||||
|
||||
#
|
||||
# tags scanning a '{', the opening delimiter for a tag that identifies
|
||||
# the kind of match. Scan the whole {dddd} tag, where d=digit
|
||||
#
|
||||
tag-open:
|
||||
white_space n tag-open
|
||||
digit_char tag-value doStartTagValue
|
||||
default errorDeath doTagExpectedError
|
||||
|
||||
tag-value:
|
||||
white_space n tag-close
|
||||
'}' tag-close
|
||||
digit_char n tag-value doTagDigit
|
||||
default errorDeath doTagExpectedError
|
||||
|
||||
tag-close:
|
||||
white_space n tag-close
|
||||
'}' n expr-cont-no-tag doTagValue
|
||||
default errorDeath doTagExpectedError
|
||||
|
||||
|
||||
|
||||
#
|
||||
# expr-cont-no-tag Expression, continuation. At a point where additional terms are
|
||||
# allowed, but not required. Just like
|
||||
# expr-cont, above, except that no "{ddd}"
|
||||
# tagging is permitted.
|
||||
#
|
||||
expr-cont-no-tag:
|
||||
escaped term doExprCatOperator
|
||||
white_space n expr-cont-no-tag
|
||||
rule_char term doExprCatOperator
|
||||
'[' term doExprCatOperator
|
||||
'(' term doExprCatOperator
|
||||
'$' term doExprCatOperator
|
||||
'.' term doExprCatOperator
|
||||
'/' look-ahead doExprCatOperator
|
||||
'|' n term doExprOrOperator
|
||||
')' n pop doExprRParen
|
||||
default pop doExprFinished
|
||||
|
||||
|
||||
|
||||
|
||||
#
|
||||
# Variable Name Scanning.
|
||||
#
|
||||
# The state that branched to here must have pushed a return state
|
||||
# to go to after completion of the variable name scanning.
|
||||
#
|
||||
# The current input character must be the $ that introduces the name.
|
||||
# The $ is consummed here rather than in the state that first detected it
|
||||
# so that the doStartVariableName action only needs to happen in one
|
||||
# place (here), and the other states don't need to worry about it.
|
||||
#
|
||||
scan-var-name:
|
||||
'$' n scan-var-start doStartVariableName
|
||||
default errorDeath
|
||||
|
||||
|
||||
scan-var-start:
|
||||
name_start_char n scan-var-body
|
||||
default errorDeath doVariableNameExpectedErr
|
||||
|
||||
scan-var-body:
|
||||
name_char n scan-var-body
|
||||
default pop doEndVariableName
|
||||
|
||||
|
||||
|
||||
#
|
||||
# scan-unicode-set Unicode Sets are parsed by the the UnicodeSet class.
|
||||
# Within the RBBI parser, after finding the first character
|
||||
# of a Unicode Set, we just hand the rule input at that
|
||||
# point of to the Unicode Set constructor, then pick
|
||||
# up parsing after the close of the set.
|
||||
#
|
||||
# The action for this state invokes the UnicodeSet parser.
|
||||
#
|
||||
scan-unicode-set:
|
||||
'[' n pop doScanUnicodeSet
|
||||
'p' n pop doScanUnicodeSet
|
||||
'P' n pop doScanUnicodeSet
|
||||
default errorDeath
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#
|
||||
# assign-or-rule. A $variable was encountered at the start of something, could be
|
||||
# either an assignment statement or a rule, depending on whether an '='
|
||||
# follows the variable name. We get to this state when the variable name
|
||||
# scanning does a return.
|
||||
#
|
||||
assign-or-rule:
|
||||
white_space n assign-or-rule
|
||||
'=' n term ^assign-end doStartAssign # variable was target of assignment
|
||||
default term-var-ref ^break-rule-end # variable was a term in a rule
|
||||
|
||||
|
||||
|
||||
#
|
||||
# assign-end This state is entered when the end of the expression on the
|
||||
# right hand side of an assignment is found. We get here via
|
||||
# a pop; this state is pushed when the '=' in an assignment is found.
|
||||
#
|
||||
# The only thing allowed at this point is a ';'. The RHS of an
|
||||
# assignment must look like a rule expression, and we come here
|
||||
# when what is being scanned no longer looks like an expression.
|
||||
#
|
||||
assign-end:
|
||||
';' n start doEndAssign
|
||||
default errorDeath doRuleErrorAssignExpr
|
||||
|
||||
|
||||
|
||||
#
|
||||
# errorDeath. This state is specified as the next state whenever a syntax error
|
||||
# in the source rules is detected. Barring bugs, the state machine will never
|
||||
# actually get here, but will stop because of the action associated with the error.
|
||||
# But, just in case, this state asks the state machine to exit.
|
||||
errorDeath:
|
||||
default n errorDeath doExit
|
||||
|
||||
|
1079
icu4c/source/common/rbbiscan.cpp
Normal file
1079
icu4c/source/common/rbbiscan.cpp
Normal file
File diff suppressed because it is too large
Load diff
153
icu4c/source/common/rbbiscan.h
Normal file
153
icu4c/source/common/rbbiscan.h
Normal file
|
@ -0,0 +1,153 @@
|
|||
//
|
||||
// rbbiscan.h
|
||||
//
|
||||
// Copyright (C) 2002, International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains declarations for class RBBIRuleScanner
|
||||
//
|
||||
|
||||
|
||||
#ifndef RBBISCAN_H
|
||||
#define RBBISCAN_H
|
||||
|
||||
#include "unicode/rbbi.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/parseerr.h"
|
||||
#include "uhash.h"
|
||||
#include "uvector.h"
|
||||
#include "symtable.h" // For UnicodeSet parsing, is the interface that
|
||||
// looks up references to $variables within a set.
|
||||
#include "rbbinode.h"
|
||||
//#include "rbbitblb.h"
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class RBBIRuleBuilder;
|
||||
class RBBISymbolTable;
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// class RBBIRuleScanner does the lowest level, character-at-a-time
|
||||
// scanning of break iterator rules.
|
||||
//
|
||||
// The output of the scanner is parse trees for
|
||||
// the rule expressions and a list of all Unicode Sets
|
||||
// encountered.
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
static const int kStackSize = 100; // The size of the state stack for
|
||||
// rules parsing. Corresponds roughly
|
||||
// to the depth of parentheses nesting
|
||||
// that is allowed in the rules.
|
||||
|
||||
enum EParseAction {dummy01, dummy02}; // Placeholder enum for the specifier for
|
||||
// actions that are specified in the
|
||||
// rule parsing state table.
|
||||
|
||||
class RBBIRuleScanner {
|
||||
public:
|
||||
|
||||
struct RBBIRuleChar {
|
||||
UChar32 fChar;
|
||||
UBool fEscaped;
|
||||
};
|
||||
|
||||
RBBIRuleScanner(RBBIRuleBuilder *rb);
|
||||
|
||||
|
||||
virtual ~RBBIRuleScanner();
|
||||
|
||||
void nextChar(RBBIRuleChar &c); // Get the next char from the input stream.
|
||||
// Return false if at end.
|
||||
|
||||
UBool push(const RBBIRuleChar &c); // Push (unget) one character.
|
||||
// Only a single character may be pushed.
|
||||
|
||||
void parse(); // Parse the rules, generating two parse
|
||||
// trees, one each for the forward and
|
||||
// reverse rules,
|
||||
// and a list of UnicodeSets encountered.
|
||||
|
||||
|
||||
|
||||
|
||||
private:
|
||||
|
||||
UBool doParseActions(EParseAction a, RBBIRuleChar &c);
|
||||
void error(UErrorCode e); // error reporting convenience function.
|
||||
void fixOpStack(RBBINode::OpPrecedence p);
|
||||
// a character.
|
||||
void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);
|
||||
|
||||
UChar32 nextCharLL();
|
||||
void printNodeStack(const char *title);
|
||||
RBBINode *pushNewNode(RBBINode::NodeType t);
|
||||
void scanSet();
|
||||
|
||||
|
||||
RBBIRuleBuilder *fRB; // The rule builder that we are part of.
|
||||
|
||||
int32_t fScanIndex; // Index of current character being processed
|
||||
// in the rule input string.
|
||||
int32_t fNextIndex; // Index of the next character, which
|
||||
// is the first character not yet scanned.
|
||||
UBool fQuoteMode; // Scan is in a 'quoted region'
|
||||
int fLineNum; // Line number in input file.
|
||||
int fCharNum; // Char position within the line.
|
||||
UChar32 fLastChar; // Previous char, needed to count CR-LF
|
||||
// as a single line, not two.
|
||||
|
||||
RBBIRuleChar fC; // Current char for parse state machine
|
||||
// processing.
|
||||
UnicodeString fVarName; // $variableName, valid when we've just
|
||||
// scanned one.
|
||||
|
||||
RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule
|
||||
// parsing. index by p[state][char-class]
|
||||
|
||||
uint16_t fStack[kStackSize]; // State stack, holds state pushes
|
||||
int fStackPtr; // and pops as specified in the state
|
||||
// transition rules.
|
||||
|
||||
RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created
|
||||
// during the parse of a rule
|
||||
int fNodeStackPtr;
|
||||
|
||||
|
||||
UBool fReverseRule; // True if the rule currently being scanned
|
||||
// is a reverse direction rule (if it
|
||||
// starts with a '!')
|
||||
|
||||
UBool fLookAheadRule; // True if the rule includes a '/'
|
||||
// somewhere within it.
|
||||
|
||||
RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of
|
||||
// $variable symbols.
|
||||
|
||||
UHashtable *fSetTable; // UnicocodeSet hash table, holds indexes to
|
||||
// the sets created while parsing rules.
|
||||
// The key is the string used for creating
|
||||
// the set.
|
||||
|
||||
UnicodeSet *fRuleSets[10]; // Unicode Sets that are needed during
|
||||
// the scanning of RBBI rules. The
|
||||
// indicies for these are assigned by the
|
||||
// perl script that builds the state tables.
|
||||
// See rbbirpt.h.
|
||||
|
||||
int32_t fRuleNum; // Counts each rule as it is scanned.
|
||||
|
||||
UnicodeSet *gRuleSet_rule_char;
|
||||
UnicodeSet *gRuleSet_white_space;
|
||||
UnicodeSet *gRuleSet_name_char;
|
||||
UnicodeSet *gRuleSet_name_start_char;
|
||||
};
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
557
icu4c/source/common/rbbisetb.cpp
Normal file
557
icu4c/source/common/rbbisetb.cpp
Normal file
|
@ -0,0 +1,557 @@
|
|||
//
|
||||
// rbbisetb.cpp
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
//
|
||||
// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules.
|
||||
//
|
||||
// Starting with the rules parse tree from the scanner,
|
||||
//
|
||||
// - Enumerate the set of UnicodeSets that are referenced
|
||||
// by the RBBI rules.
|
||||
// - compute a set of non-overlapping character ranges
|
||||
// with all characters within a range belonging to the same
|
||||
// set of input uniocde sets.
|
||||
// - Derive a set of non-overlapping UnicodeSet (like things)
|
||||
// that will correspond to columns in the state table for
|
||||
// the RBBI execution engine. All characters within one
|
||||
// of these sets belong to the same set of the original
|
||||
// UnicodeSets from the user's rules.
|
||||
// - construct the trie table that maps input characters
|
||||
// to the index of the matching non-overlapping set of set from
|
||||
// the previous step.
|
||||
//
|
||||
|
||||
#include "unicode/uniset.h"
|
||||
#include "utrie.h"
|
||||
#include "cmemory.h"
|
||||
#include "uvector.h"
|
||||
#include "assert.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#include "rbbisetb.h"
|
||||
#include "rbbinode.h"
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// Constructor
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
|
||||
{
|
||||
fRB = rb;
|
||||
fStatus = rb->fStatus;
|
||||
fRangeList = 0;
|
||||
fTrie = 0;
|
||||
fTrieSize = 0;
|
||||
fGroupCount = 0;
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// Destructor
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
RBBISetBuilder::~RBBISetBuilder()
|
||||
{
|
||||
RangeDescriptor *nextRangeDesc;
|
||||
|
||||
// Walk through & delete the linked list of RangeDescriptors
|
||||
for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) {
|
||||
RangeDescriptor *r = nextRangeDesc;
|
||||
nextRangeDesc = r->fNext;
|
||||
delete r;
|
||||
}
|
||||
|
||||
utrie_close(fTrie);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// getFoldedRBBIValue Call-back function used during building of Trie table.
|
||||
// Folding value: just store the offset (16 bits)
|
||||
// if there is any non-0 entry.
|
||||
// (It'd really be nice if the Trie builder would provide a
|
||||
// simple default, so this function could go away from here.)
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
/* folding value: just store the offset (16 bits) if there is any non-0 entry */
|
||||
U_CAPI uint32_t U_EXPORT2
|
||||
getFoldedRBBIValue(UNewTrie *trie, UChar32 start, int32_t offset) {
|
||||
uint32_t value;
|
||||
UChar32 limit;
|
||||
UBool inBlockZero;
|
||||
|
||||
limit=start+0x400;
|
||||
while(start<limit) {
|
||||
value=utrie_get32(trie, start, &inBlockZero);
|
||||
if(inBlockZero) {
|
||||
start+=UTRIE_DATA_BLOCK_LENGTH;
|
||||
} else if(value!=0) {
|
||||
return (uint32_t)(offset|0x8000);
|
||||
} else {
|
||||
++start;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */
|
||||
static int32_t U_CALLCONV
|
||||
getFoldingRBBIOffset(uint32_t data) {
|
||||
if(data&0x8000) {
|
||||
return (int32_t)(data&0x7fff);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// build Build the list of non-overlapping character ranges
|
||||
// from the Unicode Sets.
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
void RBBISetBuilder::build() {
|
||||
RBBINode *usetNode;
|
||||
RangeDescriptor *rlRange;
|
||||
|
||||
if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "usets")) {printSets();}
|
||||
|
||||
//
|
||||
// Initialize the process by creating a single range encompassing all characters
|
||||
// that is in no sets.
|
||||
//
|
||||
fRangeList = new RangeDescriptor(*fStatus);
|
||||
fRangeList->fStartChar = 0;
|
||||
fRangeList->fEndChar = 0x10ffff;
|
||||
|
||||
|
||||
//
|
||||
// Find the set of non-overlapping ranges of characters
|
||||
//
|
||||
for (usetNode=fRB->fSetsListHead; usetNode!=NULL; usetNode=usetNode->fRightChild) {
|
||||
UnicodeSet *inputSet = usetNode->fInputSet;
|
||||
int32_t inputSetRangeCount = inputSet->getRangeCount();
|
||||
int inputSetRangeIndex = 0;
|
||||
rlRange = fRangeList;
|
||||
|
||||
for (;;) {
|
||||
if (inputSetRangeIndex >= inputSetRangeCount) {
|
||||
break;
|
||||
}
|
||||
UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex);
|
||||
UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex);
|
||||
|
||||
// skip over ranges from the range list that are completely
|
||||
// below the current range from the input unicode set.
|
||||
while (rlRange->fEndChar < inputSetRangeBegin) {
|
||||
rlRange = rlRange->fNext;
|
||||
}
|
||||
|
||||
// If the start of the range from the range list is before with
|
||||
// the start of the range from the unicode set, split the range list range
|
||||
// in two, with one part being before (wholly outside of) the unicode set
|
||||
// and the other containing the rest.
|
||||
// Then continue the loop; the post-split current range will then be skipped
|
||||
// over
|
||||
if (rlRange->fStartChar < inputSetRangeBegin) {
|
||||
rlRange->split(inputSetRangeBegin, *fStatus);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Same thing at the end of the ranges...
|
||||
// If the end of the range from the range list doesn't coincide with
|
||||
// the end of the range from the unicode set, split the range list
|
||||
// range in two. The first part of the split range will be
|
||||
// wholly inside the Unicode set.
|
||||
if (rlRange->fEndChar > inputSetRangeEnd) {
|
||||
rlRange->split(inputSetRangeEnd+1, *fStatus);
|
||||
}
|
||||
|
||||
// The current rlRange is now entirely within the UnicodeSet range.
|
||||
// Add this unicode set to the list of sets for this rlRange
|
||||
if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
|
||||
rlRange->fIncludesSets->addElement(usetNode, *fStatus);
|
||||
}
|
||||
|
||||
// Advance over ranges that we are finished with.
|
||||
if (inputSetRangeEnd == rlRange->fEndChar) {
|
||||
inputSetRangeIndex++;
|
||||
}
|
||||
rlRange = rlRange->fNext;
|
||||
}
|
||||
}
|
||||
|
||||
if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "range")) { printRanges();}
|
||||
|
||||
//
|
||||
// Group the above ranges, with each group consisting of one or more
|
||||
// ranges that are in exactly the same set of original UnicodeSets.
|
||||
// The groups are numbered, and these group numbers are the set of
|
||||
// input symbols recognized by the run-time state machine.
|
||||
//
|
||||
RangeDescriptor *rlSearchRange;
|
||||
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
|
||||
for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
|
||||
if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
|
||||
rlRange->fNum = rlSearchRange->fNum;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (rlRange->fNum == 0) {
|
||||
fGroupCount ++;
|
||||
rlRange->fNum = fGroupCount;
|
||||
rlRange->setDictionaryFlag();
|
||||
addValToSets(rlRange->fIncludesSets, fGroupCount);
|
||||
}
|
||||
}
|
||||
|
||||
if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
|
||||
if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "esets")) {printSets();}
|
||||
|
||||
//
|
||||
// Build the Trie table for mapping UChar32 values to the corresponding
|
||||
// range group number
|
||||
//
|
||||
fTrie = utrie_open(NULL, // Pre-existing trie to be filled in
|
||||
NULL, // Data array (utrie will allocate one)
|
||||
100000, // Max Data Length
|
||||
0, // Initial value for all code points
|
||||
TRUE); // Keep Latin 1 in separately
|
||||
|
||||
|
||||
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
|
||||
utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// getTrieSize() Return the size that will be required to serialize the Trie.
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
int32_t RBBISetBuilder::getTrieSize() {
|
||||
fTrieSize = utrie_serialize(fTrie,
|
||||
NULL, // Buffer
|
||||
0, // Capacity
|
||||
getFoldedRBBIValue,
|
||||
TRUE, // Reduce to 16 bits
|
||||
fStatus);
|
||||
// printf("Trie table size is %d\n", trieSize);
|
||||
return fTrieSize;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// serializeTrie() Put the serialized trie at the specified address.
|
||||
// Trust the caller to have given us enough memory.
|
||||
// getTrieSize() MUST be called first.
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
void RBBISetBuilder::serializeTrie(uint8_t *where) {
|
||||
utrie_serialize(fTrie,
|
||||
where, // Buffer
|
||||
fTrieSize, // Capacity
|
||||
getFoldedRBBIValue,
|
||||
TRUE, // Reduce to 16 bits
|
||||
fStatus);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// addValToSets Add a runtime-mapped input value to each uset from a
|
||||
// list of uset nodes.
|
||||
// For each of the original Unicode sets - which correspond
|
||||
// directly to uset nodes - a logically equivalent expression
|
||||
// is constructed in terms of the remapped runtime input
|
||||
// symbol set. This function adds one runtime input symbol to
|
||||
// a list of sets.
|
||||
//
|
||||
// The "logically equivalent expression" is the tree for an
|
||||
// or-ing together of all of the symbols that go into the set.
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
|
||||
int32_t ix;
|
||||
|
||||
for (ix=0; ix<sets->size(); ix++) {
|
||||
RBBINode *usetNode = (RBBINode *)sets->elementAt(ix);
|
||||
RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
|
||||
leafNode->fVal = (unsigned short)val;
|
||||
if (usetNode->fLeftChild == NULL) {
|
||||
usetNode->fLeftChild = leafNode;
|
||||
leafNode->fParent = usetNode;
|
||||
} else {
|
||||
// There are already input symbols present for this set.
|
||||
// Set up an OR node, with the previous stuff as the left child
|
||||
// and the new value as the right child.
|
||||
RBBINode *orNode = new RBBINode(RBBINode::opOr);
|
||||
orNode->fLeftChild = usetNode->fLeftChild;
|
||||
orNode->fRightChild = leafNode;
|
||||
orNode->fLeftChild->fParent = orNode;
|
||||
orNode->fRightChild->fParent = orNode;
|
||||
usetNode->fLeftChild = orNode;
|
||||
orNode->fParent = usetNode;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// getNumOutputSets
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
int32_t RBBISetBuilder::getNumCharCategories() {
|
||||
return fGroupCount + 1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// printRanges A debugging function.
|
||||
// dump out all of the range definitions.
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
void RBBISetBuilder::printRanges() {
|
||||
RangeDescriptor *rlRange;
|
||||
int i;
|
||||
|
||||
printf("\n\n Nonoverlapping Ranges ...\n");
|
||||
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
|
||||
printf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar);
|
||||
|
||||
for (i=0; i<rlRange->fIncludesSets->size(); i++) {
|
||||
RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
|
||||
UnicodeString setName = "anon"; // TODO: no string literals.
|
||||
RBBINode *setRef = usetNode->fParent;
|
||||
if (setRef != NULL) {
|
||||
RBBINode *varRef = setRef->fParent;
|
||||
if (varRef != NULL && varRef->fType == RBBINode::varRef) {
|
||||
setName = varRef->fText;
|
||||
}
|
||||
}
|
||||
RBBINode::printUnicodeString(setName); printf(" ");
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// printRangeGroups A debugging function.
|
||||
// dump out all of the range groups.
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
void RBBISetBuilder::printRangeGroups() {
|
||||
RangeDescriptor *rlRange;
|
||||
RangeDescriptor *tRange;
|
||||
int i;
|
||||
int lastPrintedGroupNum = 0;
|
||||
|
||||
printf("\nRanges grouped by Unicode Set Membership...\n");
|
||||
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
|
||||
int groupNum = rlRange->fNum & 0xbfff;
|
||||
if (groupNum > lastPrintedGroupNum) {
|
||||
lastPrintedGroupNum = groupNum;
|
||||
printf("%2i ", groupNum);
|
||||
|
||||
if (rlRange->fNum & 0x4000) { printf(" <DICT> ");};
|
||||
|
||||
for (i=0; i<rlRange->fIncludesSets->size(); i++) {
|
||||
RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
|
||||
UnicodeString setName = "anon";
|
||||
RBBINode *setRef = usetNode->fParent;
|
||||
if (setRef != NULL) {
|
||||
RBBINode *varRef = setRef->fParent;
|
||||
if (varRef != NULL && varRef->fType == RBBINode::varRef) {
|
||||
setName = varRef->fText;
|
||||
}
|
||||
}
|
||||
RBBINode::printUnicodeString(setName); printf(" ");
|
||||
}
|
||||
|
||||
i = 0;
|
||||
for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) {
|
||||
if (tRange->fNum == rlRange->fNum) {
|
||||
if (i++ % 5 == 0) {
|
||||
printf("\n ");
|
||||
}
|
||||
printf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// printSets A debugging function.
|
||||
// dump out all of the set definitions.
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
void RBBISetBuilder::printSets() {
|
||||
RBBINode *usetNode;
|
||||
int i;
|
||||
UnicodeSet inputSet;
|
||||
|
||||
printf("\n\nUnicode Sets List\n------------------\n");
|
||||
i = 0;
|
||||
for (usetNode=fRB->fSetsListHead; usetNode!=NULL; usetNode=usetNode->fRightChild) {
|
||||
RBBINode *setRef;
|
||||
RBBINode *varRef;
|
||||
UnicodeString setName;
|
||||
|
||||
i++;
|
||||
printf("%3d ", i);
|
||||
setName = "anonymous";
|
||||
setRef = usetNode->fParent;
|
||||
if (setRef != NULL) {
|
||||
varRef = setRef->fParent;
|
||||
if (varRef != NULL && varRef->fType == RBBINode::varRef) {
|
||||
setName = varRef->fText;
|
||||
}
|
||||
}
|
||||
RBBINode::printUnicodeString(setName);
|
||||
printf(" ");
|
||||
RBBINode::printUnicodeString(usetNode->fText);
|
||||
printf("\n");
|
||||
if (usetNode->fLeftChild != NULL) {
|
||||
usetNode->fLeftChild->printTree();
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------------
|
||||
//
|
||||
// RangeDesriptor copy constructor
|
||||
//
|
||||
//-------------------------------------------------------------------------------------
|
||||
RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) {
|
||||
int i;
|
||||
|
||||
this->fStartChar = other.fStartChar;
|
||||
this->fEndChar = other.fEndChar;
|
||||
this->fNum = other.fNum;
|
||||
this->fNext = NULL;
|
||||
this->fIncludesSets = new UVector(status);
|
||||
for (i=0; i<other.fIncludesSets->size(); i++) {
|
||||
this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------------
|
||||
//
|
||||
// RangeDesriptor default constructor
|
||||
//
|
||||
//-------------------------------------------------------------------------------------
|
||||
RangeDescriptor::RangeDescriptor(UErrorCode &status) {
|
||||
this->fStartChar = 0;
|
||||
this->fEndChar = 0;
|
||||
this->fNum = 0;
|
||||
this->fNext = NULL;
|
||||
this->fIncludesSets = new UVector(status);
|
||||
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------------
|
||||
//
|
||||
// RangeDesriptor Destructor
|
||||
//
|
||||
//-------------------------------------------------------------------------------------
|
||||
RangeDescriptor::~RangeDescriptor() {
|
||||
delete fIncludesSets;
|
||||
fIncludesSets = NULL;
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------------------------------
|
||||
//
|
||||
// RangeDesriptor::split()
|
||||
//
|
||||
//-------------------------------------------------------------------------------------
|
||||
void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
|
||||
assert(where>fStartChar && where<=fEndChar);
|
||||
RangeDescriptor *nr = new RangeDescriptor(*this, status);
|
||||
// RangeDescriptor copy constructor copies all fields.
|
||||
// Only need to update those that are different after the split.
|
||||
nr->fStartChar = where;
|
||||
this->fEndChar = where-1;
|
||||
nr->fNext = this->fNext;
|
||||
this->fNext = nr;
|
||||
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------------
|
||||
//
|
||||
// RangeDescriptor::setDictionaryFlag
|
||||
//
|
||||
// Character Category Numbers that include characters from
|
||||
// the original Unicode Set named "dictionary" have bit 14
|
||||
// set to 1. The RBBI runtime engine uses this to trigger
|
||||
// use of the word dictionary.
|
||||
//
|
||||
// This function looks through the Unicode Sets that it
|
||||
// (the range) includes, and sets the bit in fNum when
|
||||
// "dictionary" is among them.
|
||||
//
|
||||
// TODO: a faster way would be to find the set node for
|
||||
// "dictionary" just once, rather than looking it
|
||||
// up by name every time.
|
||||
//
|
||||
//-------------------------------------------------------------------------------------
|
||||
void RangeDescriptor::setDictionaryFlag() {
|
||||
int i;
|
||||
|
||||
for (i=0; i<this->fIncludesSets->size(); i++) {
|
||||
RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i);
|
||||
UnicodeString setName;
|
||||
RBBINode *setRef = usetNode->fParent;
|
||||
if (setRef != NULL) {
|
||||
RBBINode *varRef = setRef->fParent;
|
||||
if (varRef != NULL && varRef->fType == RBBINode::varRef) {
|
||||
setName = varRef->fText;
|
||||
}
|
||||
}
|
||||
if (setName.compare("dictionary") == 0) { // TODO: no string literals.
|
||||
this->fNum |= 0x4000;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
110
icu4c/source/common/rbbisetb.h
Normal file
110
icu4c/source/common/rbbisetb.h
Normal file
|
@ -0,0 +1,110 @@
|
|||
//
|
||||
// rbbisetb.h
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#ifndef RBBISETB_H
|
||||
#define RBBISETB_H
|
||||
|
||||
#include "rbbirb.h"
|
||||
#include "uvector.h"
|
||||
#include "uhash.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
//
|
||||
// RBBISetBuilder Derives the character categories used by the runtime RBBI engine
|
||||
// from the Unicode Sets appearing in the source RBBI rules, and
|
||||
// creates the TRIE table used to map from Unicode to the
|
||||
// character categories.
|
||||
//
|
||||
|
||||
|
||||
//
|
||||
// RangeDescriptor
|
||||
//
|
||||
// Each of the non-overlapping character ranges gets one of these descriptors.
|
||||
// All of them are strung together in a linked list, which is kept in order
|
||||
// (by character)
|
||||
//
|
||||
struct RangeDescriptor {
|
||||
UChar32 fStartChar; // Start of range, unicode 32 bit value.
|
||||
UChar32 fEndChar; // End of range, unicode 32 bit value.
|
||||
int32_t fNum; // runtime-mapped input value for this range.
|
||||
UVector *fIncludesSets; // vector of the the original
|
||||
// Unicode sets that include this range.
|
||||
// (Contains ptrs to uset nodes)
|
||||
RangeDescriptor *fNext; // Next RangeDescriptor in the linked list.
|
||||
|
||||
RangeDescriptor(UErrorCode &status);
|
||||
RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);
|
||||
~RangeDescriptor();
|
||||
void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with
|
||||
// where appearing in the second (higher) part.
|
||||
void setDictionaryFlag(); // Check whether this range appears as part of
|
||||
// the Unicode set named "dictionary"
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules.
|
||||
//
|
||||
// Starting with the rules parse tree from the scanner,
|
||||
//
|
||||
// - Enumerate the set of UnicodeSets that are referenced
|
||||
// by the RBBI rules.
|
||||
// - compute a derived set of non-overlapping UnicodeSets
|
||||
// that will correspond to columns in the state table for
|
||||
// the RBBI execution engine.
|
||||
// - construct the trie table that maps input characters
|
||||
// to set numbers in the non-overlapping set of sets.
|
||||
//
|
||||
|
||||
|
||||
class RBBISetBuilder {
|
||||
public:
|
||||
RBBISetBuilder(RBBIRuleBuilder *rb);
|
||||
~RBBISetBuilder();
|
||||
|
||||
void build(); // TODO: needs an out parameter for the TRIE.
|
||||
void addValToSets(UVector *sets, uint32_t val);
|
||||
int32_t getNumCharCategories(); // CharCategories are the same as input symbol set to the
|
||||
// runtime state machine, which are the same as
|
||||
// columns in the DFA state table
|
||||
int32_t getTrieSize(); // Size in bytes of the serialized Trie.
|
||||
void serializeTrie(uint8_t *where); // write out the serialized Trie.
|
||||
void printSets();
|
||||
void printRanges();
|
||||
void printRangeGroups();
|
||||
|
||||
|
||||
private:
|
||||
RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us.
|
||||
UErrorCode *fStatus;
|
||||
|
||||
RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
|
||||
|
||||
UNewTrie *fTrie; // The mapping TRIE that is the end result of processin
|
||||
uint32_t fTrieSize; // the Unicode Sets.
|
||||
|
||||
// Groups correspond to character categories -
|
||||
// groups of ranges that are in the same original UnicodeSets.
|
||||
// fGroupCount is the index of the last used group.
|
||||
// The value is also the number of columns in the RBBI state table being compiled.
|
||||
// Index 0 is not used. Funny counting.
|
||||
int32_t fGroupCount;
|
||||
|
||||
|
||||
|
||||
private:
|
||||
void numberSets();
|
||||
};
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif
|
263
icu4c/source/common/rbbistbl.cpp
Normal file
263
icu4c/source/common/rbbistbl.cpp
Normal file
|
@ -0,0 +1,263 @@
|
|||
//
|
||||
// file: rbbistbl.cpp Implementation of the ICU RBBISymbolTable class
|
||||
//
|
||||
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2001, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/parsepos.h"
|
||||
|
||||
#include "umutex.h"
|
||||
|
||||
#include "rbbirb.h"
|
||||
#include "rbbinode.h"
|
||||
|
||||
#include <stdio.h> // TODO - getrid of this.
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
||||
//
|
||||
// Forward Declarations
|
||||
//
|
||||
static void U_EXPORT2 U_CALLCONV RBBISymbolTableEntry_deleter(void *p);
|
||||
|
||||
|
||||
|
||||
|
||||
RBBISymbolTable::RBBISymbolTable(RBBIRuleScanner *rs, const UnicodeString &rules, UErrorCode &status)
|
||||
:fRuleScanner(rs), fRules(rules), ffffString(UChar(0xffff))
|
||||
{
|
||||
fHashTable = NULL;
|
||||
fCachedSetLookup = NULL;
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
||||
fHashTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, &status);
|
||||
uhash_setValueDeleter(fHashTable, RBBISymbolTableEntry_deleter);
|
||||
};
|
||||
|
||||
|
||||
|
||||
RBBISymbolTable::~RBBISymbolTable()
|
||||
{
|
||||
uhash_close(fHashTable);
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
// RBBISymbolTable::lookup This function from the abstract symbol table inteface
|
||||
// looks up a variable name and returns a UnicodeString
|
||||
// containing the substitution text.
|
||||
//
|
||||
// The variable name does NOT include the leading $.
|
||||
//
|
||||
const UnicodeString *RBBISymbolTable::lookup(const UnicodeString& s) const
|
||||
{
|
||||
RBBISymbolTableEntry *el;
|
||||
RBBINode *varRefNode;
|
||||
RBBINode *exprNode;
|
||||
RBBINode *usetNode;
|
||||
const UnicodeString *retString;
|
||||
RBBISymbolTable *This = (RBBISymbolTable *)this; // cast off const
|
||||
|
||||
el = (RBBISymbolTableEntry *)uhash_get(fHashTable, &s);
|
||||
if (el == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
varRefNode = el->val;
|
||||
exprNode = varRefNode->fLeftChild; // Root node of expression for variable
|
||||
if (exprNode->fType == RBBINode::setRef) {
|
||||
// The $variable refers to a single UnicodeSet
|
||||
// return the ffffString, which will subsequently be interpreted as a
|
||||
// stand-in character for the set by RBBISymbolTable::lookupMatcher()
|
||||
usetNode = exprNode->fLeftChild;
|
||||
This->fCachedSetLookup = usetNode->fInputSet;
|
||||
retString = &ffffString;
|
||||
}
|
||||
else
|
||||
{
|
||||
// The variable refers to something other than just a set.
|
||||
// return the original source string for the expression
|
||||
retString = &exprNode->fText;
|
||||
This->fCachedSetLookup = NULL;
|
||||
}
|
||||
return retString;
|
||||
};
|
||||
|
||||
|
||||
|
||||
//
|
||||
// RBBISymbolTable::lookupMatcher This function from the abstract symbol table
|
||||
// interface maps a single stand-in character to a
|
||||
// pointer to a Unicode Set. The Unicode Set code uses this
|
||||
// mechanism to get all references to the same $variable
|
||||
// name to refer to a single common Unicode Set instance.
|
||||
//
|
||||
// This implementation cheats a little, and does not maintain a map of stand-in chars
|
||||
// to sets. Instead, it takes advantage of the fact that the UnicodeSet
|
||||
// constructor will always call this function right after calling lookup(),
|
||||
// and we just need to remember what set to return between these two calls.
|
||||
const UnicodeFunctor *RBBISymbolTable::lookupMatcher(UChar32 ch) const
|
||||
{
|
||||
UnicodeSet *retVal = NULL;
|
||||
RBBISymbolTable *This = (RBBISymbolTable *)this; // cast off const
|
||||
if (ch == 0xffff) {
|
||||
retVal = fCachedSetLookup;
|
||||
This->fCachedSetLookup = 0;
|
||||
}
|
||||
return retVal;
|
||||
};
|
||||
|
||||
//
|
||||
// RBBISymbolTable::parseReference This function from the abstract symbol table interface
|
||||
// looks for a $variable name in the source text.
|
||||
// It does not look it up, only scans for it.
|
||||
// It is used by the UnicodeSet parser.
|
||||
//
|
||||
// This implementation is lifted pretty much verbatim
|
||||
// from the rules based transliterator implementation.
|
||||
// I didn't see an obvious way of sharing it.
|
||||
//
|
||||
UnicodeString RBBISymbolTable::parseReference(const UnicodeString& text,
|
||||
ParsePosition& pos, int32_t limit) const
|
||||
{
|
||||
int32_t start = pos.getIndex();
|
||||
int32_t i = start;
|
||||
UnicodeString result;
|
||||
while (i < limit) {
|
||||
UChar c = text.charAt(i);
|
||||
if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
|
||||
break;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
if (i == start) { // No valid name chars
|
||||
return result; // Indicate failure with empty string
|
||||
}
|
||||
pos.setIndex(i);
|
||||
text.extractBetween(start, i, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// RBBISymbolTable::lookupNode Given a key (a variable name), return the
|
||||
// corresponding RBBI Node. If there is no entry
|
||||
// in the table for this name, return NULL.
|
||||
//
|
||||
RBBINode *RBBISymbolTable::lookupNode(const UnicodeString &key) const{
|
||||
|
||||
RBBINode *retNode = NULL;
|
||||
RBBISymbolTableEntry *el;
|
||||
|
||||
el = (RBBISymbolTableEntry *)uhash_get(fHashTable, &key);
|
||||
if (el != NULL) {
|
||||
retNode = el->val;
|
||||
}
|
||||
return retNode;
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
// RBBISymbolTable::addEntry Add a new entry to the symbol table.
|
||||
// Indicate an error if the name already exists -
|
||||
// this will only occur in the case of duplicate
|
||||
// variable assignments.
|
||||
//
|
||||
void RBBISymbolTable::addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err) {
|
||||
RBBISymbolTableEntry *e;
|
||||
|
||||
e = (RBBISymbolTableEntry *)uhash_get(fHashTable, &key);
|
||||
if (e != NULL) {
|
||||
err = U_BRK_VARIABLE_REDFINITION;
|
||||
return;
|
||||
}
|
||||
|
||||
e = new RBBISymbolTableEntry;
|
||||
if (e == NULL) {
|
||||
err = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
};
|
||||
e->key = key;
|
||||
e->val = val;
|
||||
uhash_put( fHashTable, &e->key, e, &err);
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
// RBBISymbolTableEntry_deleter Used by the UHashTable to delete the contents
|
||||
// when the hash table is deleted.
|
||||
//
|
||||
static void U_EXPORT2 U_CALLCONV RBBISymbolTableEntry_deleter(void *p) {
|
||||
RBBISymbolTableEntry *px = (RBBISymbolTableEntry *)p;
|
||||
delete px;
|
||||
};
|
||||
|
||||
RBBISymbolTableEntry::~RBBISymbolTableEntry() {
|
||||
// The "val" of a symbol table entry is a variable reference node.
|
||||
// The l. child of the val is the rhs expression from the assignment.
|
||||
// Unlike other node types, children of variable reference nodes are not
|
||||
// automatically recursively deleted. We do it manually here.
|
||||
delete val->fLeftChild;
|
||||
val->fLeftChild = NULL;
|
||||
|
||||
delete val;
|
||||
|
||||
// Note: the key UnicodeString is destructed by virtue of being in the object by value.
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
// RBBISymbolTable::print Debugging function, dump out the symbol table contents.
|
||||
//
|
||||
void RBBISymbolTable::print() const {
|
||||
printf("Variable Definitions\n"
|
||||
"Name Node Val String Val\n"
|
||||
"----------------------------------------------------------------------\n");
|
||||
|
||||
int32_t pos = -1;
|
||||
const UHashElement *e = NULL;
|
||||
for (;;) {
|
||||
e = uhash_nextElement(fHashTable, &pos);
|
||||
if (e == NULL ) {
|
||||
break;
|
||||
}
|
||||
RBBISymbolTableEntry *s = (RBBISymbolTableEntry *)e->value.pointer;
|
||||
|
||||
RBBINode::printUnicodeString(s->key, 15);
|
||||
printf(" %8x ", s->val);
|
||||
RBBINode::printUnicodeString(s->val->fLeftChild->fText);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("\nParsed Variable Definitions\n");
|
||||
pos = -1;
|
||||
for (;;) {
|
||||
e = uhash_nextElement(fHashTable, &pos);
|
||||
if (e == NULL ) {
|
||||
break;
|
||||
}
|
||||
RBBISymbolTableEntry *s = (RBBISymbolTableEntry *)e->value.pointer;
|
||||
RBBINode::printUnicodeString(s->key);
|
||||
s->val->fLeftChild->printTree();
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
730
icu4c/source/common/rbbitblb.cpp
Normal file
730
icu4c/source/common/rbbitblb.cpp
Normal file
|
@ -0,0 +1,730 @@
|
|||
//
|
||||
// rbbitblb.cpp
|
||||
//
|
||||
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "rbbitblb.h"
|
||||
#include "rbbirb.h"
|
||||
#include "rbbisetb.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
|
||||
|
||||
RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode *&rootNode) :
|
||||
fTree(rootNode) {
|
||||
fRB = rb;
|
||||
fStatus = fRB->fStatus;
|
||||
fDStates = new UVector(*fStatus);
|
||||
}
|
||||
|
||||
|
||||
|
||||
RBBITableBuilder::~RBBITableBuilder() {
|
||||
int i;
|
||||
for (i=0; i<fDStates->size(); i++) {
|
||||
delete (RBBIStateDescriptor *)fDStates->elementAt(i);
|
||||
}
|
||||
delete fDStates;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// RBBITableBuilder::build - This is the main function for building the DFA state transtion
|
||||
// table from the RBBI rules parse tree.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::build() {
|
||||
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// If there were no rules, just return. This situation can easily arise
|
||||
// for the reverse rules.
|
||||
if (fTree==NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
//
|
||||
// Walk through the tree, replacing any references to $variables with a copy of the
|
||||
// parse tree for the substition expression.
|
||||
//
|
||||
fTree->flattenVariables();
|
||||
if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "ftree")) {
|
||||
printf("Parse tree after flattening variable references.\n");
|
||||
fTree->printTree(TRUE);
|
||||
}
|
||||
|
||||
//
|
||||
// Add a unique right-end marker to the expression.
|
||||
// Appears as a cat-node, left child being the original tree,
|
||||
// right child being the end marker.
|
||||
//
|
||||
RBBINode *cn = new RBBINode(RBBINode::opCat);
|
||||
cn->fLeftChild = fTree;
|
||||
fTree->fParent = cn;
|
||||
cn->fRightChild = new RBBINode(RBBINode::endMark);
|
||||
cn->fRightChild->fParent = cn;
|
||||
fTree = cn;
|
||||
|
||||
//
|
||||
// Replace all references to UnicodeSets with the tree for the equivalent
|
||||
// expression.
|
||||
//
|
||||
fTree->flattenSets();
|
||||
if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "stree")) {
|
||||
printf("Parse tree after flattening Unicode Set references.\n");
|
||||
fTree->printTree(TRUE);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// calculate the functions nullable, firstpos, lastpos and followpos on
|
||||
// nodes in the parse tree.
|
||||
// See the alogrithm description in Aho.
|
||||
// Understanding how this works by looking at the code alone will be
|
||||
// nearly impossible.
|
||||
//
|
||||
calcNullable(fTree);
|
||||
calcFirstPos(fTree);
|
||||
calcLastPos(fTree);
|
||||
calcFollowPos(fTree);
|
||||
if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "pos")) {
|
||||
printf("\n\n");
|
||||
printPosSets(fTree);
|
||||
}
|
||||
|
||||
//
|
||||
// Build the DFA state transition tables.
|
||||
//
|
||||
buildStateTable();
|
||||
flagAcceptingStates();
|
||||
flagLookAheadStates();
|
||||
if (fRB->fDebugEnv && strstr(fRB->fDebugEnv, "states")) {printStates();};
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// calcNullable. Impossible to explain succinctly. See Aho, section 3.9
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::calcNullable(RBBINode *n) {
|
||||
if (n == NULL) {
|
||||
return;
|
||||
}
|
||||
if (n->fType == RBBINode::setRef ||
|
||||
n->fType == RBBINode::endMark ) {
|
||||
// These are non-empty leaf node types.
|
||||
n->fNullable = FALSE;
|
||||
return;
|
||||
}
|
||||
|
||||
if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
|
||||
// Lookahead marker node. It's a leaf, so no recursion on children.
|
||||
// It's nullable because it does not match any literal text from the input stream.
|
||||
n->fNullable = TRUE;
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// The node is not a leaf.
|
||||
// Calculate nullable on its children.
|
||||
calcNullable(n->fLeftChild);
|
||||
calcNullable(n->fRightChild);
|
||||
|
||||
// Apply functions from table 3.40 in Aho
|
||||
if (n->fType == RBBINode::opOr) {
|
||||
n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
|
||||
}
|
||||
else if (n->fType == RBBINode::opCat) {
|
||||
n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
|
||||
}
|
||||
else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
|
||||
n->fNullable = TRUE;
|
||||
}
|
||||
else {
|
||||
n->fNullable = FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::calcFirstPos(RBBINode *n) {
|
||||
if (n == NULL) {
|
||||
return;
|
||||
}
|
||||
if (n->fType == RBBINode::leafChar ||
|
||||
n->fType == RBBINode::endMark ||
|
||||
n->fType == RBBINode::lookAhead ||
|
||||
n->fType == RBBINode::tag) {
|
||||
// These are non-empty leaf node types.
|
||||
n->fFirstPosSet->addElement(n, *fStatus);
|
||||
return;
|
||||
}
|
||||
|
||||
// The node is not a leaf.
|
||||
// Calculate firstPos on its children.
|
||||
calcFirstPos(n->fLeftChild);
|
||||
calcFirstPos(n->fRightChild);
|
||||
|
||||
// Apply functions from table 3.40 in Aho
|
||||
if (n->fType == RBBINode::opOr) {
|
||||
setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
|
||||
setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
|
||||
}
|
||||
else if (n->fType == RBBINode::opCat) {
|
||||
setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
|
||||
if (n->fLeftChild->fNullable) {
|
||||
setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
|
||||
}
|
||||
}
|
||||
else if (n->fType == RBBINode::opStar ||
|
||||
n->fType == RBBINode::opQuestion ||
|
||||
n->fType == RBBINode::opPlus) {
|
||||
setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::calcLastPos(RBBINode *n) {
|
||||
if (n == NULL) {
|
||||
return;
|
||||
}
|
||||
if (n->fType == RBBINode::leafChar ||
|
||||
n->fType == RBBINode::endMark ||
|
||||
n->fType == RBBINode::lookAhead ||
|
||||
n->fType == RBBINode::tag) {
|
||||
// These are non-empty leaf node types.
|
||||
n->fLastPosSet->addElement(n, *fStatus);
|
||||
return;
|
||||
}
|
||||
|
||||
// The node is not a leaf.
|
||||
// Calculate lastPos on its children.
|
||||
calcLastPos(n->fLeftChild);
|
||||
calcLastPos(n->fRightChild);
|
||||
|
||||
// Apply functions from table 3.40 in Aho
|
||||
if (n->fType == RBBINode::opOr) {
|
||||
setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
|
||||
setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
|
||||
}
|
||||
else if (n->fType == RBBINode::opCat) {
|
||||
setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
|
||||
if (n->fRightChild->fNullable) {
|
||||
setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
|
||||
}
|
||||
}
|
||||
else if (n->fType == RBBINode::opStar ||
|
||||
n->fType == RBBINode::opQuestion ||
|
||||
n->fType == RBBINode::opPlus) {
|
||||
setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::calcFollowPos(RBBINode *n) {
|
||||
if (n == NULL ||
|
||||
n->fType == RBBINode::leafChar ||
|
||||
n->fType == RBBINode::endMark) {
|
||||
return;
|
||||
}
|
||||
|
||||
calcFollowPos(n->fLeftChild);
|
||||
calcFollowPos(n->fRightChild);
|
||||
|
||||
// Aho rule #1
|
||||
if (n->fType == RBBINode::opCat) {
|
||||
RBBINode *i; // is 'i' in Aho's description
|
||||
uint32_t ix;
|
||||
|
||||
UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
|
||||
UVector *FirstPosOfRightChild = n->fRightChild->fFirstPosSet;
|
||||
|
||||
for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
|
||||
i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
|
||||
setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
|
||||
}
|
||||
}
|
||||
|
||||
// Aho rule #2
|
||||
if (n->fType == RBBINode::opStar ||
|
||||
n->fType == RBBINode::opPlus) {
|
||||
RBBINode *i; // again, n and i are the names from Aho's description.
|
||||
uint32_t ix;
|
||||
|
||||
for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
|
||||
i = (RBBINode *)n->fLastPosSet->elementAt(ix);
|
||||
setAdd(i->fFollowPos, n->fFirstPosSet);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// buildStateTable() Determine the set of runtime DFA states and the
|
||||
// transition tables for these states, by the algorithm
|
||||
// of fig. 3.44 in Aho.
|
||||
//
|
||||
// Most of the comments are quotes of Aho's psuedo-code.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::buildStateTable() {
|
||||
//
|
||||
// Add a dummy state 0 - the stop state. Not from Aho.
|
||||
int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
|
||||
RBBIStateDescriptor *failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
|
||||
failState->fPositions = new UVector(*fStatus);
|
||||
fDStates->addElement(failState, *fStatus);
|
||||
|
||||
// initially, the only unmarked state in Dstates is firstpos(root),
|
||||
// where toot is the root of the syntax tree for (r)#;
|
||||
RBBIStateDescriptor *initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
|
||||
initialState->fPositions = new UVector(*fStatus);
|
||||
setAdd(initialState->fPositions, fTree->fFirstPosSet);
|
||||
fDStates->addElement(initialState, *fStatus);
|
||||
|
||||
// while there is an unmarked state T in Dstates do begin
|
||||
for (;;) {
|
||||
RBBIStateDescriptor *T = NULL;
|
||||
int32_t tx;
|
||||
for (tx=1; tx<fDStates->size(); tx++) {
|
||||
RBBIStateDescriptor *temp;
|
||||
temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
|
||||
if (temp->fMarked == FALSE) {
|
||||
T = temp;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (T == NULL) {
|
||||
break;
|
||||
}
|
||||
|
||||
// mark T;
|
||||
T->fMarked = TRUE;
|
||||
|
||||
// for each input symbol a do begin
|
||||
int32_t a;
|
||||
for (a = 1; a<=lastInputSymbol; a++) {
|
||||
// let U be the set of positions that are in followpos(p)
|
||||
// for some position p in T
|
||||
// such that the symbol at position p is a;
|
||||
UVector *U = NULL;
|
||||
RBBINode *p;
|
||||
int32_t px;
|
||||
for (px=0; px<T->fPositions->size(); px++) {
|
||||
p = (RBBINode *)T->fPositions->elementAt(px);
|
||||
if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
|
||||
if (U == NULL) {
|
||||
U = new UVector(*fStatus);
|
||||
}
|
||||
setAdd(U, p->fFollowPos);
|
||||
}
|
||||
}
|
||||
|
||||
// if U is not empty and not in DStates then
|
||||
int32_t ux;
|
||||
UBool UinDstates = FALSE;
|
||||
if (U != NULL) {
|
||||
assert(U->size() > 0);
|
||||
int ix;
|
||||
for (ix=0; ix<fDStates->size(); ix++) {
|
||||
RBBIStateDescriptor *temp2;
|
||||
temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
|
||||
if (setEquals(U, temp2->fPositions)) {
|
||||
delete U;
|
||||
U = temp2->fPositions;
|
||||
ux = ix;
|
||||
UinDstates = TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Add U as an unmarked state to Dstates
|
||||
if (!UinDstates)
|
||||
{
|
||||
RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
|
||||
newState->fPositions = U;
|
||||
fDStates->addElement(newState, *fStatus);
|
||||
ux = fDStates->size()-1;
|
||||
}
|
||||
|
||||
// Dtran[T, a] := U;
|
||||
T->fDtran->setElementAt(ux, a);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// flagAcceptingStates Identify accepting states.
|
||||
// TODO: implementation for tagging of rule match values
|
||||
// will probably end up here.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::flagAcceptingStates() {
|
||||
UVector endMarkerNodes(*fStatus);
|
||||
RBBINode *endMarker;
|
||||
int32_t i;
|
||||
int32_t n;
|
||||
|
||||
fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
|
||||
|
||||
for (i=0; i<endMarkerNodes.size(); i++) {
|
||||
endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
|
||||
for (n=0; n<fDStates->size(); n++) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
|
||||
if (sd->fPositions->indexOf(endMarker) >= 0) {
|
||||
// Any non-zero value for fAccepting means this is an accepting node.
|
||||
// The value is what will be returned to the user as the break status.
|
||||
// If no other value was specified, force it to -1.
|
||||
sd->fAccepting = endMarker->fVal;
|
||||
if (sd->fAccepting == 0) {
|
||||
sd->fAccepting = -1;
|
||||
}
|
||||
|
||||
// If the end marker node is from a look-ahead rule, set
|
||||
// the fLookAhead field or this state also.
|
||||
if (endMarker->fLookAheadEnd) {
|
||||
sd->fLookAhead = sd->fAccepting;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// flagLookAheadStates
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::flagLookAheadStates() {
|
||||
UVector lookAheadNodes(*fStatus);
|
||||
RBBINode *lookAheadNode;
|
||||
int32_t i;
|
||||
int32_t n;
|
||||
|
||||
fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
|
||||
for (i=0; i<lookAheadNodes.size(); i++) {
|
||||
lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
|
||||
|
||||
for (n=0; n<fDStates->size(); n++) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
|
||||
if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
|
||||
sd->fLookAhead = lookAheadNode->fVal;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// flagTaggedStates
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::flagTaggedStates() {
|
||||
UVector tagNodes(*fStatus);
|
||||
RBBINode *tagNode;
|
||||
int32_t i;
|
||||
int32_t n;
|
||||
|
||||
fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
|
||||
for (i=0; i<tagNodes.size(); i++) {
|
||||
tagNode = (RBBINode *)tagNodes.elementAt(i);
|
||||
|
||||
for (n=0; n<fDStates->size(); n++) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
|
||||
if (sd->fPositions->indexOf(tagNode) >= 0) {
|
||||
sd->fTagVal = tagNode->fVal;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// setAdd Set operation on UVector
|
||||
// dest = dest union source
|
||||
// Elements may only appear once. Order is unimportant.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
|
||||
int destOriginalSize = dest->size();
|
||||
int sourceSize = source->size();
|
||||
int32_t si, di;
|
||||
|
||||
for (si=0; si<sourceSize; si++) {
|
||||
void *elToAdd = source->elementAt(si);
|
||||
for (di=0; di<destOriginalSize; di++) {
|
||||
if (dest->elementAt(di) == elToAdd) {
|
||||
goto elementAlreadyInDest;
|
||||
}
|
||||
}
|
||||
dest->addElement(elToAdd, *fStatus);
|
||||
elementAlreadyInDest: ;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// setEqual Set operation on UVector.
|
||||
// Compare for equality.
|
||||
// Elements may appear only once.
|
||||
// Elements may appear in any order.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
|
||||
int32_t aSize = a->size();
|
||||
int32_t bSize = b->size();
|
||||
|
||||
if (aSize != bSize) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
int32_t ax;
|
||||
int32_t bx;
|
||||
int32_t firstBx = 0;
|
||||
void *aVal;
|
||||
void *bVal;
|
||||
|
||||
for (ax=0; ax<aSize; ax++) {
|
||||
aVal = a->elementAt(ax);
|
||||
for (bx=firstBx; bx<bSize; bx++) {
|
||||
bVal = b->elementAt(bx);
|
||||
if (aVal == bVal) {
|
||||
if (bx==firstBx) {
|
||||
firstBx++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (aVal != bVal) {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
|
||||
// for each node in the tree.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::printPosSets(RBBINode *n) {
|
||||
if (n==NULL) {
|
||||
return;
|
||||
}
|
||||
n->print();
|
||||
printf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE");
|
||||
|
||||
printf(" firstpos: ");
|
||||
printSet(n->fFirstPosSet);
|
||||
|
||||
printf(" lastpos: ");
|
||||
printSet(n->fLastPosSet);
|
||||
|
||||
printf(" followpos: ");
|
||||
printSet(n->fFollowPos);
|
||||
|
||||
printPosSets(n->fLeftChild);
|
||||
printPosSets(n->fRightChild);
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// getTableSize() Calculate the size of the runtime form of this
|
||||
// state transition table.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
int32_t RBBITableBuilder::getTableSize() {
|
||||
int32_t size = 0;
|
||||
int32_t numRows;
|
||||
int32_t numCols;
|
||||
int32_t rowSize;
|
||||
|
||||
if (fTree == NULL) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
size = sizeof(RBBIStateTable) - 4; // The header, with no rows to the table.
|
||||
|
||||
numRows = fDStates->size();
|
||||
numCols = fRB->fSetBuilder->getNumCharCategories();
|
||||
|
||||
// Note The declaration of RBBIStateTableRow is for a table of two columns.
|
||||
// Therefore we subtract two from numCols when determining
|
||||
// how much storage to add to a row for the total columns.
|
||||
rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
|
||||
size += numRows * rowSize;
|
||||
return size;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// exportTable() export the state transition table in the format required
|
||||
// by the runtime engine. getTableSize() bytes of memory
|
||||
// must be available at the output address "where".
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::exportTable(void *where) {
|
||||
RBBIStateTable *table = (RBBIStateTable *)where;
|
||||
uint32_t state;
|
||||
int col;
|
||||
|
||||
if (U_FAILURE(*fStatus) || fTree == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
|
||||
fDStates->size() > 0x7fff) {
|
||||
*fStatus = U_BRK_INTERNAL_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
table->fRowLen = sizeof(RBBIStateTableRow) +
|
||||
sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
|
||||
table->fNumStates = fDStates->size();
|
||||
|
||||
for (state=0; state<table->fNumStates; state++) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
|
||||
RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
|
||||
assert (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
|
||||
assert (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
|
||||
row->fAccepting = (int16_t)sd->fAccepting;
|
||||
row->fLookAhead = (int16_t)sd->fLookAhead;
|
||||
row->fTag = (int16_t)sd->fTagVal;
|
||||
for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
|
||||
row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// printSet Debug function. Print the contents of a UVector
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::printSet(UVector *s) {
|
||||
int32_t i;
|
||||
for (i=0; i<s->size(); i++) {
|
||||
void *v = s->elementAt(i);
|
||||
printf("%10x", v);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// printStates Debug Function. Dump the fully constructed state transition table.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::printStates() {
|
||||
|
||||
int c; // input "character"
|
||||
int n; // state number
|
||||
|
||||
printf("state | i n p u t s y m b o l s \n");
|
||||
printf(" | Acc LA Tag");
|
||||
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {printf(" %2d", c);};
|
||||
printf("\n");
|
||||
printf(" |---------------");
|
||||
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {printf("---");};
|
||||
printf("\n");
|
||||
|
||||
for (n=0; n<fDStates->size(); n++) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
|
||||
printf(" %3d | " , n);
|
||||
printf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagVal);
|
||||
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
|
||||
printf(" %2d", sd->fDtran->elementAti(c));
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n\n");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// RBBIStateDescriptor Methods. This is a very struct-like class
|
||||
// Most access is directly to the fields.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
|
||||
fMarked = FALSE;
|
||||
fAccepting = 0;
|
||||
fLookAhead = 0;
|
||||
fTagVal = 0;
|
||||
fPositions = NULL;
|
||||
fDtran = new UVector(lastInputSymbol+1, *fStatus);
|
||||
fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
|
||||
// It is indexed by input symbols, and will
|
||||
// hold the next state number for each
|
||||
// symbol.
|
||||
}
|
||||
|
||||
|
||||
RBBIStateDescriptor::~RBBIStateDescriptor() {
|
||||
delete fPositions;
|
||||
delete fDtran;
|
||||
fPositions = NULL;
|
||||
fDtran = NULL;
|
||||
}
|
107
icu4c/source/common/rbbitblb.h
Normal file
107
icu4c/source/common/rbbitblb.h
Normal file
|
@ -0,0 +1,107 @@
|
|||
//
|
||||
// rbbitblb.h
|
||||
//
|
||||
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#ifndef RBBITBLB_H
|
||||
#define RBBITBLB_H
|
||||
|
||||
|
||||
#include "unicode/rbbi.h"
|
||||
#include "rbbinode.h"
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class RBBIRuleScanner;
|
||||
|
||||
//
|
||||
// class RBBITableBuilder is part of the RBBI rule compiler.
|
||||
// It builds the state transition table used by the RBBI runtime
|
||||
// from the expression syntax tree generated by the rule scanner.
|
||||
//
|
||||
// This class is part of the RBBI implementation only.
|
||||
// There is no user-visible public API here.
|
||||
//
|
||||
|
||||
class RBBITableBuilder {
|
||||
public:
|
||||
// TODO: add a root node param to the constructor. We're going to have two
|
||||
// builders, one for the forward table, and one for the reverse table.
|
||||
RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode *&rootNode);
|
||||
~RBBITableBuilder();
|
||||
|
||||
void build();
|
||||
int32_t getTableSize(); // Return the runtime size in bytes of
|
||||
// the built state table
|
||||
void exportTable(void *where); // fill in the runtime state table.
|
||||
// Sufficient memory must exist at
|
||||
// the specified location.
|
||||
|
||||
// TODO: add getter function(s) for the built table.
|
||||
|
||||
private:
|
||||
void calcNullable(RBBINode *n);
|
||||
void calcFirstPos(RBBINode *n);
|
||||
void calcLastPos(RBBINode *n);
|
||||
void calcFollowPos(RBBINode *n);
|
||||
void buildStateTable();
|
||||
void flagAcceptingStates();
|
||||
void flagLookAheadStates();
|
||||
void flagTaggedStates();
|
||||
|
||||
// Set functions for UVector.
|
||||
// TODO: make a USet subclass of UVector
|
||||
|
||||
void setAdd(UVector *dest, UVector *source);
|
||||
UBool setEquals(UVector *a, UVector *b);
|
||||
|
||||
void printSet(UVector *s);
|
||||
void printPosSets(RBBINode *n = NULL);
|
||||
void printStates();
|
||||
|
||||
|
||||
private:
|
||||
RBBIRuleBuilder *fRB;
|
||||
RBBINode *&fTree; // The root node of the parse tree to build a
|
||||
// table for.
|
||||
UErrorCode *fStatus;
|
||||
|
||||
UVector *fDStates; // D states (Aho's terminology)
|
||||
// Index is state number
|
||||
// Contents are RBBIStateDescriptor pointers.
|
||||
|
||||
};
|
||||
|
||||
//
|
||||
// RBBIStateDescriptor - The DFA is constructed as a set of these descriptors,
|
||||
// one for each state.
|
||||
class RBBIStateDescriptor {
|
||||
public:
|
||||
UBool fMarked;
|
||||
int32_t fAccepting;
|
||||
int32_t fLookAhead;
|
||||
int32_t fTagVal;
|
||||
UVector *fPositions; // Set of parse tree positions associated
|
||||
// with this state. Unordered (it's a set).
|
||||
// UVector contents are RBBINode *
|
||||
|
||||
UVector *fDtran; // Transitions out of this state.
|
||||
// indexed by input character
|
||||
// contents is int index of dest state
|
||||
// in RBBITableBuilder.fDStates
|
||||
|
||||
RBBIStateDescriptor(int maxInputSymbol, UErrorCode *fStatus);
|
||||
~RBBIStateDescriptor();
|
||||
};
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif
|
|
@ -11,9 +11,17 @@
|
|||
#include "unicode/uloc.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/uchriter.h"
|
||||
#include "unicode/rbbi.h"
|
||||
#include "rbbirb.h"
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// ubrk_open Create a canned type of break iterator based on type (word, line, etc.)
|
||||
// and locale.
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
U_CAPI UBreakIterator* U_EXPORT2
|
||||
ubrk_open(UBreakIteratorType type,
|
||||
const char *locale,
|
||||
|
@ -58,9 +66,8 @@ ubrk_open(UBreakIteratorType type,
|
|||
return 0;
|
||||
}
|
||||
|
||||
int32_t textLen = (textLength == -1 ? u_strlen(text) : textLength);
|
||||
UCharCharacterIterator *iter = 0;
|
||||
iter = new UCharCharacterIterator(text, textLen);
|
||||
iter = new UCharCharacterIterator(text, textLength);
|
||||
if(iter == 0) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
delete result;
|
||||
|
@ -71,18 +78,45 @@ ubrk_open(UBreakIteratorType type,
|
|||
return (UBreakIterator*)result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// ubrk_openRules open a break iterator from a set of break rules.
|
||||
// Invokes the rule builder.
|
||||
//
|
||||
//----------------------------------------------------------------------------------------
|
||||
U_CAPI UBreakIterator* U_EXPORT2
|
||||
ubrk_openRules(const UChar *rules,
|
||||
int32_t rulesLength,
|
||||
const UChar *text,
|
||||
int32_t textLength,
|
||||
UErrorCode *status)
|
||||
{
|
||||
if(U_FAILURE(*status)) return 0;
|
||||
*status = U_UNSUPPORTED_ERROR;
|
||||
return 0;
|
||||
ubrk_openRules( const UChar *rules,
|
||||
int32_t rulesLength,
|
||||
const UChar *text,
|
||||
int32_t textLength,
|
||||
UParseError *parseErr,
|
||||
UErrorCode *status) {
|
||||
|
||||
BreakIterator *result = 0;
|
||||
|
||||
UnicodeString ruleString(rules, rulesLength);
|
||||
result = RBBIRuleBuilder::createRuleBasedBreakIterator(ruleString, *parseErr, *status);
|
||||
if(U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
UCharCharacterIterator *iter = 0;
|
||||
iter = new UCharCharacterIterator(text, textLength);
|
||||
if(iter == 0) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
delete result;
|
||||
return 0;
|
||||
}
|
||||
result->adoptText(iter);
|
||||
return (UBreakIterator *)result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
U_CAPI UBreakIterator * U_EXPORT2
|
||||
ubrk_safeClone(
|
||||
const UBreakIterator *bi,
|
||||
|
@ -101,13 +135,19 @@ ubrk_safeClone(
|
|||
createBufferClone(stackBuffer, *pBufferSize, *status));
|
||||
}
|
||||
|
||||
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ubrk_close(UBreakIterator *bi)
|
||||
{
|
||||
|
||||
if (bi && !((BreakIterator*) bi)->isBufferClone())
|
||||
{
|
||||
delete (BreakIterator*) bi;
|
||||
BreakIterator *ubi = (BreakIterator*) bi;
|
||||
if (ubi) {
|
||||
if (ubi->isBufferClone()) {
|
||||
ubi->~BreakIterator();
|
||||
*(uint32_t *)ubi = 0xdeadbeef;
|
||||
} else {
|
||||
delete ubi;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -465,7 +465,7 @@ public:
|
|||
virtual UChar32 next32(void) = 0;
|
||||
|
||||
/**
|
||||
* Advances to the previous code unit in the iteration rance
|
||||
* Advances to the previous code unit in the iteration range
|
||||
* (toward startIndex()), and returns that code unit. If there are
|
||||
* no more code units to return, returns DONE.
|
||||
* @stable
|
||||
|
@ -473,7 +473,7 @@ public:
|
|||
virtual UChar previous(void) = 0;
|
||||
|
||||
/**
|
||||
* Advances to the previous code point in the iteration rance
|
||||
* Advances to the previous code point in the iteration range
|
||||
* (toward startIndex()), and returns that code point. If there are
|
||||
* no more code points to return, returns DONE.
|
||||
* @stable
|
||||
|
|
|
@ -49,11 +49,6 @@ class DictionaryBasedBreakIteratorTables;
|
|||
class U_COMMON_API DictionaryBasedBreakIterator : public RuleBasedBreakIterator {
|
||||
|
||||
private:
|
||||
/**
|
||||
* a temporary hiding place for the number of dictionary characters in the
|
||||
* last range passed over by next()
|
||||
*/
|
||||
int32_t dictionaryCharCount;
|
||||
|
||||
/**
|
||||
* when a range of characters is divided up using the dictionary, the break
|
||||
|
@ -74,6 +69,8 @@ private:
|
|||
*/
|
||||
int32_t positionInCache;
|
||||
|
||||
DictionaryBasedBreakIteratorTables *fTables;
|
||||
|
||||
/**
|
||||
* Class ID
|
||||
*/
|
||||
|
@ -104,6 +101,17 @@ public:
|
|||
*/
|
||||
virtual ~DictionaryBasedBreakIterator();
|
||||
|
||||
/**
|
||||
* Default constructor. Creates an "empty" break iterator.
|
||||
* Such an iterator can subsequently be assigned to.
|
||||
*/
|
||||
DictionaryBasedBreakIterator();
|
||||
|
||||
/**
|
||||
* Copy constructor.
|
||||
*/
|
||||
DictionaryBasedBreakIterator(const DictionaryBasedBreakIterator &other);
|
||||
|
||||
/**
|
||||
* Assignment operator. Sets this iterator to have the same behavior,
|
||||
* and iterate over the same text, as the one passed in.
|
||||
|
@ -179,11 +187,16 @@ protected:
|
|||
virtual int32_t handleNext(void);
|
||||
|
||||
/**
|
||||
* dumps the cache of break positions (usually in response to a change in
|
||||
* removes the cache of break positions (usually in response to a change in
|
||||
* position of some sort)
|
||||
*/
|
||||
virtual void reset(void);
|
||||
|
||||
//
|
||||
// init Initialize a dbbi. Common routine for use by constructors.
|
||||
//
|
||||
void init();
|
||||
|
||||
virtual BreakIterator * createBufferClone(void *stackBuffer,
|
||||
int32_t &BufferSize,
|
||||
UErrorCode &status);
|
||||
|
@ -200,11 +213,6 @@ private:
|
|||
*/
|
||||
void divideUpDictionaryRange(int32_t startPos, int32_t endPos, UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Used by the tables object to increment the count of dictionary characters
|
||||
* during iteration
|
||||
*/
|
||||
void bumpDictionaryCharCount(void);
|
||||
|
||||
/*
|
||||
* HSYS : Please revisit with Rich, the ctors of the DBBI class is currently
|
||||
|
@ -222,9 +230,6 @@ inline UClassID DictionaryBasedBreakIterator::getStaticClassID(void) {
|
|||
return (UClassID)(&fgClassID);
|
||||
}
|
||||
|
||||
inline void DictionaryBasedBreakIterator::bumpDictionaryCharCount(void) {
|
||||
++dictionaryCharCount;
|
||||
}
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
|
|
@ -13,12 +13,18 @@
|
|||
#include "unicode/utypes.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/parseerr.h"
|
||||
#include "utrie.h"
|
||||
|
||||
#include "rbbidata.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class RuleBasedBreakIteratorTables;
|
||||
class BreakIterator;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
|
||||
*
|
||||
|
@ -177,72 +183,91 @@ class BreakIterator;
|
|||
* </table>
|
||||
* </blockquote>
|
||||
*
|
||||
* <p>For a more complete explanation, see <a
|
||||
* href="http://www.ibm.com/developerworks/unicode/library/boundaries/boundaries.html">http://www.ibm.com/developerworks/unicode/library/boundaries/boundaries.html</a>.
|
||||
* For examples, see the resource data (which is annotated).</p>
|
||||
*
|
||||
* @author Richard Gillam
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
class U_COMMON_API RuleBasedBreakIterator : public BreakIterator {
|
||||
|
||||
protected:
|
||||
/**
|
||||
* A token used as a character-category value to identify ignore characters
|
||||
*/
|
||||
static const int8_t UBRK_IGNORE;
|
||||
friend class DictionaryBasedBreakIteratorTables;
|
||||
|
||||
private:
|
||||
/**
|
||||
* The state number of the starting state
|
||||
*/
|
||||
static const int16_t START_STATE;
|
||||
|
||||
/**
|
||||
* The state-transition value indicating "stop"
|
||||
*/
|
||||
static const int16_t STOP_STATE;
|
||||
|
||||
protected:
|
||||
/**
|
||||
* The character iterator through which this BreakIterator accesses the text
|
||||
*/
|
||||
CharacterIterator* text;
|
||||
CharacterIterator* fText;
|
||||
|
||||
//
|
||||
// The rule data for this BreakIterator instance
|
||||
//
|
||||
RBBIDataWrapper *fData;
|
||||
UTrie *fCharMappings;
|
||||
int16_t fLastBreakStatus;
|
||||
|
||||
//
|
||||
// Counter for the number of characters encountered with the "dictionary"
|
||||
// flag set. Normal RBBI iterators don't use it, although the code
|
||||
// for updating it is live. Dictionary Based break iterators (a subclass
|
||||
// of us) access this field directly.
|
||||
//
|
||||
uint32_t fDictionaryCharCount;
|
||||
|
||||
//
|
||||
// Debugging flag.
|
||||
//
|
||||
static UBool fTrace;
|
||||
|
||||
|
||||
/**
|
||||
* The data tables this iterator uses to determine the break positions
|
||||
*/
|
||||
RuleBasedBreakIteratorTables* tables;
|
||||
|
||||
private:
|
||||
/**
|
||||
* Class ID
|
||||
*/
|
||||
static const char fgClassID;
|
||||
/*
|
||||
* HSYS: To be revisited, once the ctor are made public.
|
||||
*/
|
||||
protected:
|
||||
|
||||
protected:
|
||||
//=======================================================================
|
||||
// constructors
|
||||
//=======================================================================
|
||||
|
||||
// This constructor uses the udata interface to create a BreakIterator whose
|
||||
// internal tables live in a memory-mapped file. "image" is a pointer to the
|
||||
// beginning of that file.
|
||||
RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status);
|
||||
|
||||
// This constructor uses the udata interface to create a BreakIterator whose
|
||||
// internal tables live in a memory-mapped file. "image" is a pointer to the
|
||||
// beginning of that file.
|
||||
RuleBasedBreakIterator(UDataMemory* image);
|
||||
//
|
||||
// Constructor from a flattened set of RBBI data in malloced memory.
|
||||
// RulesBasedBreakIterators built from a custom set of rules
|
||||
// are created via this constructor; the rules are compiled
|
||||
// into memory, then the break iterator is constructed here.
|
||||
//
|
||||
// The break iterator adopts the memory, and will
|
||||
// uprv_free() it when done.
|
||||
RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
|
||||
|
||||
friend class RBBIRuleBuilder;
|
||||
friend class BreakIterator;
|
||||
|
||||
|
||||
|
||||
public:
|
||||
|
||||
/** Default constructor. Creates an empty shell of an iterator, with no
|
||||
* rules or text to iterate over. Object can subsequently be assigned.
|
||||
*/
|
||||
RuleBasedBreakIterator();
|
||||
|
||||
/**
|
||||
* Copy constructor. Will produce a collator with the same behavior,
|
||||
* Copy constructor. Will produce a break iterator with the same behavior,
|
||||
* and which iterates over the same text, as the one passed in.
|
||||
*/
|
||||
RuleBasedBreakIterator(const RuleBasedBreakIterator& that);
|
||||
|
||||
//=======================================================================
|
||||
// boilerplate
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
|
||||
*/
|
||||
RuleBasedBreakIterator( const UnicodeString &rules,
|
||||
UParseError &parseError,
|
||||
UErrorCode &status);
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
|
@ -269,8 +294,10 @@ RuleBasedBreakIterator(UDataMemory* image);
|
|||
/**
|
||||
* Returns a newly-constructed RuleBasedBreakIterator with the same
|
||||
* behavior, and iterating over the same text, as this one.
|
||||
* Differs from the copy constructor in that it is polymorphic, and
|
||||
* will correctly clone (copy) a derived class.
|
||||
*/
|
||||
virtual BreakIterator* clone(void) const;
|
||||
virtual BreakIterator* clone() const;
|
||||
|
||||
/**
|
||||
* Compute a hash code for this BreakIterator
|
||||
|
@ -296,28 +323,6 @@ RuleBasedBreakIterator(UDataMemory* image);
|
|||
*/
|
||||
virtual const CharacterIterator& getText(void) const;
|
||||
|
||||
#ifdef ICU_ENABLE_DEPRECATED_BREAKITERATOR
|
||||
/**
|
||||
* Returns a newly-created CharacterIterator that the caller is to take
|
||||
* ownership of.
|
||||
* @deprecated This will be removed after 2000-Dec-31.
|
||||
* THIS FUNCTION SHOULD NOT BE HERE. IT'S HERE BECAUSE BreakIterator DEFINES
|
||||
* IT AS PURE VIRTUAL, FORCING RBBI TO IMPLEMENT IT. IT SHOULD BE REMOVED
|
||||
* FROM *BOTH* CLASSES. Use getText() instead.
|
||||
*/
|
||||
virtual CharacterIterator* createText(void) const;
|
||||
|
||||
/**
|
||||
* Set the iterator to analyze a new piece of text. This function resets
|
||||
* the current iteration position to the beginning of the text.
|
||||
* @param newText The text to analyze.
|
||||
* @deprecated
|
||||
* THIS FUNCTION SHOULD NOT BE HERE. IT'S HERE BECAUSE BreakIterator DEFINES
|
||||
* IT AS PURE VIRTUAL, FORCING RBBI TO IMPLEMENT IT. IT SHOULD BE REMOVED
|
||||
* FROM *BOTH* CLASSES. Use the other setText() instead.
|
||||
*/
|
||||
virtual void setText(const UnicodeString* newText);
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Set the iterator to analyze a new piece of text. This function resets
|
||||
|
@ -402,6 +407,15 @@ RuleBasedBreakIterator(UDataMemory* image);
|
|||
*/
|
||||
virtual int32_t current(void) const;
|
||||
|
||||
|
||||
/**
|
||||
* Return the status from the break rule that determined the most recently
|
||||
* returned break position. The values appear in the rule source
|
||||
* within brackets, {123}, for example. For rules that do not specify a
|
||||
* status, a default value of 0 is returned.
|
||||
*/
|
||||
virtual int16_t getRuleStatus() const;
|
||||
|
||||
/**
|
||||
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
|
||||
* This method is to implement a simple version of RTTI, since not all
|
||||
|
@ -429,6 +443,22 @@ RuleBasedBreakIterator(UDataMemory* image);
|
|||
virtual BreakIterator * createBufferClone(void *stackBuffer,
|
||||
int32_t &BufferSize,
|
||||
UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Return the flattened form of compiled break rules,
|
||||
* which can then be used to create a new break iterator at some
|
||||
* time in the future. Creating a break iterator in this way
|
||||
* is much faster than building one from the source form of the
|
||||
* break rules.
|
||||
*
|
||||
* @return A pointer to the flattened rule data. The storage
|
||||
* belongs to the RulesBasedBreakIterator object, no the
|
||||
* caller, and must not be modified or deleted.
|
||||
*/
|
||||
virtual const uint8_t *getFlattenedData(uint32_t *length);
|
||||
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
void debugDumpTables() const;
|
||||
#endif
|
||||
|
@ -463,18 +493,30 @@ protected:
|
|||
*/
|
||||
virtual void reset(void);
|
||||
|
||||
private:
|
||||
/**
|
||||
* Return true if the category lookup for this char
|
||||
* indicates that it is in the set of dictionary lookup chars.
|
||||
* This function is intended for use by dictionary based break iterators.
|
||||
*/
|
||||
virtual UBool isDictionaryChar(UChar32);
|
||||
|
||||
/**
|
||||
* Constructs a RuleBasedBreakIterator that uses the already-created
|
||||
* tables object that is passed in as a parameter.
|
||||
*/
|
||||
RuleBasedBreakIterator(RuleBasedBreakIteratorTables* adoptTables);
|
||||
|
||||
friend class BreakIterator;
|
||||
* Common initialization function, used by constructors and bufferClone.
|
||||
* (Also used by DictionaryBasedBreakIterator::createBufferClone().)
|
||||
*/
|
||||
void init();
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------
|
||||
//
|
||||
// Inline Functions Definitions ...
|
||||
//
|
||||
//----------------------------------------------------------------------------------
|
||||
|
||||
inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const {
|
||||
return !operator==(that);
|
||||
}
|
||||
|
@ -487,6 +529,8 @@ inline UClassID RuleBasedBreakIterator::getStaticClassID(void) {
|
|||
return (UClassID)(&fgClassID);
|
||||
}
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
|
|
@ -7,6 +7,8 @@
|
|||
#define UBRK_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/parseerr.h"
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C API: BreakIterator
|
||||
|
@ -219,19 +221,23 @@ ubrk_open(UBreakIteratorType type,
|
|||
* The rule syntax is ... (TBD)
|
||||
* @param rules A set of rules specifying the text breaking conventions.
|
||||
* @param rulesLength The number of characters in rules, or -1 if null-terminated.
|
||||
* @param text The text to be iterated over.
|
||||
* @param text The text to be iterated over. May be null, in which case ubrk_setText() is
|
||||
* used to specify the text to be iterated.
|
||||
* @param textLength The number of characters in text, or -1 if null-terminated.
|
||||
* @param parseErr Receives position and context information for any syntax errors
|
||||
* detected while parsing the rules.
|
||||
* @param status A UErrorCode to receive any errors.
|
||||
* @return A UBreakIterator for the specified rules.
|
||||
* @see ubrk_open
|
||||
* @stable
|
||||
* @draft
|
||||
*/
|
||||
U_CAPI UBreakIterator* U_EXPORT2
|
||||
ubrk_openRules(const UChar *rules,
|
||||
int32_t rulesLength,
|
||||
const UChar *text,
|
||||
int32_t textLength,
|
||||
UErrorCode *status);
|
||||
ubrk_openRules(const UChar *rules,
|
||||
int32_t rulesLength,
|
||||
const UChar *text,
|
||||
int32_t textLength,
|
||||
UParseError *parseErr,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Thread safe cloning operation
|
||||
|
@ -397,4 +403,14 @@ ubrk_countAvailable(void);
|
|||
U_CAPI UBool U_EXPORT2
|
||||
ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
|
||||
|
||||
/**
|
||||
* Return the status from the break rule that determined the most recently
|
||||
* returned break position. The values appear in the rule source
|
||||
* within brackets, {123}, for example. For rules that do not specify a
|
||||
* status, a default value of 0 is returned.
|
||||
*/
|
||||
U_CAPI int16_t U_EXPORT2
|
||||
ubrk_getRuleStatus();
|
||||
|
||||
|
||||
#endif
|
||||
|
|
|
@ -921,6 +921,8 @@ private:
|
|||
friend class TransliteratorIDParser;
|
||||
friend class TransliterationRule;
|
||||
|
||||
friend class RBBIRuleScanner;
|
||||
|
||||
/**
|
||||
* Constructs a set from the given pattern. See the class description
|
||||
* for the syntax of the pattern language.
|
||||
|
|
|
@ -473,7 +473,23 @@ enum UErrorCode {
|
|||
U_UNSUPPORTED_ATTRIBUTE,
|
||||
U_FMT_PARSE_ERROR_LIMIT,
|
||||
|
||||
U_ERROR_LIMIT=U_FMT_PARSE_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
|
||||
/*
|
||||
* the error code range 0x10200 0x10300 are reserved for Break Iterator related error
|
||||
*/
|
||||
U_BRK_ERROR_START=0x10200,
|
||||
U_BRK_INTERNAL_ERROR,
|
||||
U_BRK_HEX_DIGITS_EXPECTED,
|
||||
U_BRK_SEMICOLON_EXPECTED,
|
||||
U_BRK_RULE_SYNTAX,
|
||||
U_BRK_UNCLOSED_SET,
|
||||
U_BRK_ASSIGN_ERROR,
|
||||
U_BRK_VARIABLE_REDFINITION,
|
||||
U_BRK_MISMATCHED_PAREN,
|
||||
U_BRK_NEW_LINE_IN_QUOTED_STRING,
|
||||
U_BRK_UNDEFINED_VARIABLE,
|
||||
U_BRK_ERROR_LIMIT,
|
||||
|
||||
U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
|
||||
};
|
||||
|
||||
#ifndef XP_CPLUSPLUS
|
||||
|
|
|
@ -113,7 +113,9 @@ void UVector::addElement(void* obj, UErrorCode &status) {
|
|||
|
||||
void UVector::addElement(int32_t elem, UErrorCode &status) {
|
||||
if (ensureCapacity(count + 1, status)) {
|
||||
elements[count++].integer = elem;
|
||||
elements[count].pointer = NULL; // Pointers may be bigger than ints.
|
||||
elements[count].integer = elem;
|
||||
count++;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -130,8 +132,10 @@ void UVector::setElementAt(void* obj, int32_t index) {
|
|||
void UVector::setElementAt(int32_t elem, int32_t index) {
|
||||
if (0 <= index && index < count) {
|
||||
if (elements[index].pointer != 0 && deleter != 0) {
|
||||
// TODO: this should be an error. mixing up ints and pointers.
|
||||
(*deleter)(elements[index].pointer);
|
||||
}
|
||||
elements[index].pointer = NULL;
|
||||
elements[index].integer = elem;
|
||||
}
|
||||
/* else index out of range */
|
||||
|
@ -226,6 +230,32 @@ void UVector::removeAllElements(void) {
|
|||
count = 0;
|
||||
}
|
||||
|
||||
UBool UVector::equals(const UVector &other) const {
|
||||
int i;
|
||||
|
||||
if (this->count != other.count) {
|
||||
return FALSE;
|
||||
}
|
||||
if (comparer == 0) {
|
||||
for (i=0; i<count; i++) {
|
||||
if (elements[i].pointer != other.elements[i].pointer) {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
UHashTok key;
|
||||
for (i=0; i<count; i++) {
|
||||
key.pointer = &other.elements[i];
|
||||
if (!(*comparer)(key, elements[i])) {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int32_t UVector::indexOf(void* obj, int32_t startIndex) const {
|
||||
UHashTok key;
|
||||
key.pointer = obj;
|
||||
|
@ -247,6 +277,12 @@ int32_t UVector::indexOf(UHashTok key, int32_t startIndex) const {
|
|||
return i;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (i=startIndex; i<count; ++i) {
|
||||
if (key.pointer == elements[i].pointer) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
|
|
@ -152,6 +152,8 @@ public:
|
|||
|
||||
int32_t elementAti(int32_t index) const;
|
||||
|
||||
UBool equals(const UVector &other) const;
|
||||
|
||||
void* firstElement(void) const;
|
||||
|
||||
void* lastElement(void) const;
|
||||
|
|
359
icu4c/source/configure
vendored
359
icu4c/source/configure
vendored
File diff suppressed because it is too large
Load diff
|
@ -4,7 +4,7 @@ dnl Copyright (c) 1999-2000, International Business Machines Corporation and
|
|||
dnl others. All Rights Reserved.
|
||||
dnl Stephen F. Booth, heavily modified by Yves and others
|
||||
|
||||
dnl $Id: configure.in,v 1.170 2002/05/31 23:16:07 grhoten-oss Exp $
|
||||
dnl $Id: configure.in,v 1.171 2002/06/25 17:23:02 aheninger-oss Exp $
|
||||
|
||||
dnl Process this file with autoconf to produce a configure script
|
||||
AC_INIT(common/unicode/utypes.h)
|
||||
|
@ -891,6 +891,7 @@ AC_OUTPUT([README icudefs.mk \
|
|||
tools/gentest/Makefile \
|
||||
tools/gennorm/Makefile \
|
||||
tools/genprops/Makefile \
|
||||
tools/genbrk/Makefile \
|
||||
tools/dumpce/Makefile \
|
||||
test/Makefile test/testdata/Makefile test/intltest/Makefile \
|
||||
test/cintltst/Makefile test/iotest/Makefile \
|
||||
|
|
|
@ -248,15 +248,8 @@ $(TESTBUILDDIR)/test.dat: $(TOOLDIR)/gentest/gentest$(EXEEXT)
|
|||
thaidict.brk: $(SRCDATADIR)/thaidict.brk
|
||||
$(RMV) $@ && ln -s $(BUILDDIR) $@
|
||||
|
||||
# copy the right endianness
|
||||
|
||||
ifeq (@U_IS_BIG_ENDIAN@,1)
|
||||
$(BUILDDIR)/%.brk: $(BRKSRCDIR)/%BE.brk
|
||||
cp $< $@
|
||||
else
|
||||
$(BUILDDIR)/%.brk: $(BRKSRCDIR)/%LE.brk
|
||||
cp $< $@
|
||||
endif
|
||||
$(BUILDDIR)/%.brk: $(BRKSRCDIR)/%.txt $(TOOLDIR)/genbrk/genbrk$(EXEEXT)
|
||||
ICU_DATA=$(BUILDDIR) $(INVOKE) $(TOOLDIR)/genbrk/genbrk -r $< -o $@
|
||||
|
||||
#################################################### CNV
|
||||
# CNV FILES
|
||||
|
|
130
icu4c/source/data/brkitr/char.txt
Normal file
130
icu4c/source/data/brkitr/char.txt
Normal file
|
@ -0,0 +1,130 @@
|
|||
#
|
||||
# Character Break Rules, also known as Grapheme Cluster Boundaries
|
||||
# See Unicode Technical Report #29.
|
||||
# These rules are based on the proposed draft dated 2001-03-11
|
||||
#
|
||||
#
|
||||
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
# The names are those from TR29.
|
||||
#
|
||||
$CR = \r;
|
||||
$LF = \n;
|
||||
$NotControl = [^[:Zl:] [:Zp:] [:Cc:]]; #Line Separator,
|
||||
#Paragraph Separtor,
|
||||
# General Category == Control
|
||||
|
||||
$CGJ = [\u034f]; #Combining Grapheme Joiner
|
||||
$Join_Control = [\u200d-\u200e]; # Zero Width Joiner, Zero Width Non-Joiner
|
||||
|
||||
#
|
||||
# Grapheme_Link, Grapheme_Extend, Grapheme_Base as determined by the UCD.
|
||||
# See http://www.unicode.org/Public/UNIDATA/PropList.txt
|
||||
#
|
||||
$Link = [\u094D \u09CD \u0A4D \u0ACD \u0B4D \u0BCD \u0C4D \u0CCD \u0D4D \u0DCA \u0E3A \u1039 \u17D2];
|
||||
|
||||
|
||||
$Extend = # From UNIDATA/DerivedCoreProperties.txt
|
||||
[\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
|
||||
\u05BB-\u05BD \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06DC
|
||||
\u06DE \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711 \u0730-\u074A
|
||||
\u07A6-\u07B0 \u0901-\u0902 \u0903 \u093C \u093E-\u0940 \u0941-\u0948
|
||||
\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981 \u0982-\u0983 \u09BC
|
||||
\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7 \u09E2-\u09E3
|
||||
\u0A02 \u0A3C \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
|
||||
\u0A70-\u0A71 \u0A81-\u0A82 \u0A83 \u0ABC \u0ABE-\u0AC0 \u0AC1-\u0AC5
|
||||
\u0AC7-\u0AC8 \u0AC9 \u0ACB-\u0ACC \u0B01 \u0B02-\u0B03 \u0B3C \u0B3E
|
||||
\u0B3F \u0B40 \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56 \u0B57
|
||||
\u0B82 \u0BBE-\u0BBF \u0BC0 \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
|
||||
\u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
|
||||
\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE \u0CBF \u0CC0-\u0CC4 \u0CC6
|
||||
\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
|
||||
\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57 \u0D82-\u0D83 \u0DCF-\u0DD1
|
||||
\u0DD2-\u0DD4 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E39
|
||||
\u0E47-\u0E4E \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
|
||||
\u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F \u0F80-\u0F84
|
||||
\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C \u102D-\u1030 \u1031
|
||||
\u1032 \u1036-\u1037 \u1038 \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
|
||||
\u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
|
||||
\u17BE-\u17C5 \u17C6 \u17C7-\u17C8 \u17C9-\u17D1 \u17D3 \u180B-\u180D
|
||||
\u18A9 \u20D0-\u20DC \u20DD-\u20E0 \u20E1 \u20E2-\u20E4 \u20E5-\u20EA
|
||||
\u302A-\u302F \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
|
||||
\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172
|
||||
\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
|
||||
|
||||
$Base = [^[:Cc:] [:Cf:] [:Cs:] [:Co:] [:Cn:] [:Zl:] [:Zp:] $Extend $Link $CGJ];
|
||||
|
||||
$LetterBase = [:L:];
|
||||
|
||||
#
|
||||
# Korean Syllable Sequences
|
||||
#
|
||||
$L = [\u1100-\u115f];
|
||||
$V = [\u1160-\u11a2];
|
||||
$T = [\u11a8-\u11f9];
|
||||
|
||||
$LV = [ \uac00 \uac1c \uac38 \uac54 \uac70 \uac8c \uaca8 \uacc4 \uace0 \uacfc \uad18 \uad34 \uad50 \uad6c \uad88 \uada4
|
||||
\uadc0 \uaddc \uadf8 \uae14 \uae30 \uae4c \uae68 \uae84 \uaea0 \uaebc \uaed8 \uaef4 \uaf10 \uaf2c \uaf48 \uaf64
|
||||
\uaf80 \uaf9c \uafb8 \uafd4 \uaff0 \ub00c \ub028 \ub044 \ub060 \ub07c \ub098 \ub0b4 \ub0d0 \ub0ec \ub108 \ub124
|
||||
\ub140 \ub15c \ub178 \ub194 \ub1b0 \ub1cc \ub1e8 \ub204 \ub220 \ub23c \ub258 \ub274 \ub290 \ub2ac \ub2c8 \ub2e4
|
||||
\ub300 \ub31c \ub338 \ub354 \ub370 \ub38c \ub3a8 \ub3c4 \ub3e0 \ub3fc \ub418 \ub434 \ub450 \ub46c \ub488 \ub4a4
|
||||
\ub4c0 \ub4dc \ub4f8 \ub514 \ub530 \ub54c \ub568 \ub584 \ub5a0 \ub5bc \ub5d8 \ub5f4 \ub610 \ub62c \ub648 \ub664
|
||||
\ub680 \ub69c \ub6b8 \ub6d4 \ub6f0 \ub70c \ub728 \ub744 \ub760 \ub77c \ub798 \ub7b4 \ub7d0 \ub7ec \ub808 \ub824
|
||||
\ub840 \ub85c \ub878 \ub894 \ub8b0 \ub8cc \ub8e8 \ub904 \ub920 \ub93c \ub958 \ub974 \ub990 \ub9ac \ub9c8 \ub9e4
|
||||
\uba00 \uba1c \uba38 \uba54 \uba70 \uba8c \ubaa8 \ubac4 \ubae0 \ubafc \ubb18 \ubb34 \ubb50 \ubb6c \ubb88 \ubba4
|
||||
\ubbc0 \ubbdc \ubbf8 \ubc14 \ubc30 \ubc4c \ubc68 \ubc84 \ubca0 \ubcbc \ubcd8 \ubcf4 \ubd10 \ubd2c \ubd48 \ubd64
|
||||
\ubd80 \ubd9c \ubdb8 \ubdd4 \ubdf0 \ube0c \ube28 \ube44 \ube60 \ube7c \ube98 \ubeb4 \ubed0 \ubeec \ubf08 \ubf24
|
||||
\ubf40 \ubf5c \ubf78 \ubf94 \ubfb0 \ubfcc \ubfe8 \uc004 \uc020 \uc03c \uc058 \uc074 \uc090 \uc0ac \uc0c8 \uc0e4
|
||||
\uc100 \uc11c \uc138 \uc154 \uc170 \uc18c \uc1a8 \uc1c4 \uc1e0 \uc1fc \uc218 \uc234 \uc250 \uc26c \uc288 \uc2a4
|
||||
\uc2c0 \uc2dc \uc2f8 \uc314 \uc330 \uc34c \uc368 \uc384 \uc3a0 \uc3bc \uc3d8 \uc3f4 \uc410 \uc42c \uc448 \uc464
|
||||
\uc480 \uc49c \uc4b8 \uc4d4 \uc4f0 \uc50c \uc528 \uc544 \uc560 \uc57c \uc598 \uc5b4 \uc5d0 \uc5ec \uc608 \uc624
|
||||
\uc640 \uc65c \uc678 \uc694 \uc6b0 \uc6cc \uc6e8 \uc704 \uc720 \uc73c \uc758 \uc774 \uc790 \uc7ac \uc7c8 \uc7e4
|
||||
\uc800 \uc81c \uc838 \uc854 \uc870 \uc88c \uc8a8 \uc8c4 \uc8e0 \uc8fc \uc918 \uc934 \uc950 \uc96c \uc988 \uc9a4
|
||||
\uc9c0 \uc9dc \uc9f8 \uca14 \uca30 \uca4c \uca68 \uca84 \ucaa0 \ucabc \ucad8 \ucaf4 \ucb10 \ucb2c \ucb48 \ucb64
|
||||
\ucb80 \ucb9c \ucbb8 \ucbd4 \ucbf0 \ucc0c \ucc28 \ucc44 \ucc60 \ucc7c \ucc98 \uccb4 \uccd0 \uccec \ucd08 \ucd24
|
||||
\ucd40 \ucd5c \ucd78 \ucd94 \ucdb0 \ucdcc \ucde8 \uce04 \uce20 \uce3c \uce58 \uce74 \uce90 \uceac \ucec8 \ucee4
|
||||
\ucf00 \ucf1c \ucf38 \ucf54 \ucf70 \ucf8c \ucfa8 \ucfc4 \ucfe0 \ucffc \ud018 \ud034 \ud050 \ud06c \ud088 \ud0a4
|
||||
\ud0c0 \ud0dc \ud0f8 \ud114 \ud130 \ud14c \ud168 \ud184 \ud1a0 \ud1bc \ud1d8 \ud1f4 \ud210 \ud22c \ud248 \ud264
|
||||
\ud280 \ud29c \ud2b8 \ud2d4 \ud2f0 \ud30c \ud328 \ud344 \ud360 \ud37c \ud398 \ud3b4 \ud3d0 \ud3ec \ud408 \ud424
|
||||
\ud440 \ud45c \ud478 \ud494 \ud4b0 \ud4cc \ud4e8 \ud504 \ud520 \ud53c \ud558 \ud574 \ud590 \ud5ac \ud5c8 \ud5e4
|
||||
\ud600 \ud61c \ud638 \ud654 \ud670 \ud68c \ud6a8 \ud6c4 \ud6e0 \ud6fc \ud718 \ud734 \ud750 \ud76c \ud788 ];
|
||||
$LVT = [[\uac00-\ud7a3] - $LV];
|
||||
|
||||
$Hangul_Sequence = ($L* $LV? $V* $T* ) | ($L* $LVT $T*);
|
||||
|
||||
#
|
||||
# Do not break between linking characters and letters, or before linking characters.
|
||||
# THis provides for Indic graphemes, where virama (halant) will link character
|
||||
# clusters together.
|
||||
#
|
||||
$LinkSequence = $Link+ $Extend* $Join_Control? $LetterBase;
|
||||
|
||||
#
|
||||
# Do not break around a Combining Grapheme Joiner
|
||||
$CGJSequence = $CGJ+ ($Base | $Hangul_Sequence);
|
||||
|
||||
# Do not break between a CR and LF.
|
||||
$CR $LF;
|
||||
|
||||
#
|
||||
# Here are the main rules. $NotControl is what matches most ordinary characters.
|
||||
#
|
||||
($NotControl | $Hangul_Sequence) $Extend* (($LinkSequence | $CGJSequence) $Extend*)*;
|
||||
(($LinkSequence | $CGJSequence) $Extend*)*;
|
||||
|
||||
|
||||
# Otherwise break after every character.
|
||||
# This matches control chars, which do not match the main rules.
|
||||
#
|
||||
.;
|
||||
|
||||
|
||||
#
|
||||
# Reverse Rules, find a safe point to back up to.
|
||||
#
|
||||
! [^$LetterBase]* $LetterBase ([^$LetterBase]* $Link+ [^$LetterBase]* $LetterBase)*;
|
||||
! $Extend* ($LVT | ($T* $V* $LV?) $L*);
|
||||
! $Extend* .;
|
||||
|
363
icu4c/source/data/brkitr/line.txt
Normal file
363
icu4c/source/data/brkitr/line.txt
Normal file
|
@ -0,0 +1,363 @@
|
|||
#
|
||||
# file: line.txt
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by Unicode TR 14.
|
||||
#
|
||||
|
||||
|
||||
#
|
||||
# Character Classes defined by TR 14.
|
||||
# These are generated by a script from the Unicode LineBreak derived
|
||||
# properties file.
|
||||
#
|
||||
|
||||
############ Start of Script-Generated Definitions #######################
|
||||
|
||||
$LF = [ \u000A];
|
||||
|
||||
$IN = [ \u2024-\u2026];
|
||||
|
||||
$SY = [ \u002F];
|
||||
|
||||
$EX = [ \u0021 \u003F \u2762-\u2763 \uFE56-\uFE57 \uFF01 \uFF1F];
|
||||
|
||||
$BA = [ \u0009 \u007C \u00AD \u058A \u0F0B \u1361 \u1680 \u17D5 \u2000-\u2006
|
||||
\u2008-\u200A \u2010 \u2012-\u2013 \u2027 \u205F];
|
||||
|
||||
$IS = [ \u002C \u002E \u003A-\u003B \u0589];
|
||||
|
||||
$BB = [ \u00B4 \u02C8 \u02CC \u1806];
|
||||
|
||||
$SA = [ \u0E01-\u0E30 \u0E32-\u0E33 \u0E40-\u0E46 \u0E81-\u0E82 \u0E84 \u0E87-\u0E88
|
||||
\u0E8A \u0E8D \u0E94-\u0E97 \u0E99-\u0E9F \u0EA1-\u0EA3 \u0EA5
|
||||
\u0EA7 \u0EAA-\u0EAB \u0EAD-\u0EB0 \u0EB2-\u0EB3 \u0EBD \u0EC0-\u0EC4
|
||||
\u0EC6 \u0EDC-\u0EDD \u1000-\u1021 \u1023-\u1027 \u1029-\u102A
|
||||
\u1050-\u1055 \u1780-\u17B3];
|
||||
|
||||
$CB = [ \uFFFC];
|
||||
|
||||
$XX = [ \uE000-\uF8FF \U000F0000-\U000FFFFD \U00100000-\U0010FFFD];
|
||||
|
||||
$HY = [ \u002D];
|
||||
|
||||
$AI = [ \u00A1 \u00A7-\u00A8 \u00AA \u00B2-\u00B3 \u00B6-\u00BA \u00BC-\u00BF
|
||||
\u00C6 \u00D0 \u00D7-\u00D8 \u00DE-\u00E1 \u00E6 \u00E8-\u00EA
|
||||
\u00EC-\u00ED \u00F0 \u00F2-\u00F3 \u00F7-\u00FA \u00FC \u00FE
|
||||
\u0101 \u0111 \u0113 \u011B \u0126-\u0127 \u012B \u0131-\u0133
|
||||
\u0138 \u013F-\u0142 \u0144 \u0148-\u014A \u014D \u0152-\u0153
|
||||
\u0166-\u0167 \u016B \u01CE \u01D0 \u01D2 \u01D4 \u01D6 \u01D8
|
||||
\u01DA \u01DC \u0251 \u0261 \u02C7 \u02C9-\u02CB \u02CD \u02D0
|
||||
\u02D8-\u02DB \u02DD \u0391-\u03A1 \u03A3-\u03A9 \u03B1-\u03C1
|
||||
\u03C3-\u03C9 \u0401 \u0410-\u044F \u0451 \u2015-\u2016 \u2020-\u2021
|
||||
\u203B \u2074 \u207F \u2081-\u2084 \u2105 \u2113 \u2121-\u2122
|
||||
\u212B \u2140 \u2154-\u2155 \u215B \u215E \u2160-\u216B \u2170-\u2179
|
||||
\u2190-\u2199 \u21D2 \u21D4 \u2200 \u2202-\u2203 \u2207-\u2208
|
||||
\u220B \u220F \u2211 \u2215 \u221A \u221D-\u2220 \u2223 \u2225
|
||||
\u2227-\u222C \u222E \u2234-\u2237 \u223C-\u223D \u2248 \u224C
|
||||
\u2252 \u2260-\u2261 \u2264-\u2267 \u226A-\u226B \u226E-\u226F
|
||||
\u2282-\u2283 \u2286-\u2287 \u2295 \u2299 \u22A5 \u22BF \u2312
|
||||
\u2460-\u24BF \u24D0-\u24E9 \u24EB-\u24FE \u2500-\u254B \u2550-\u2574
|
||||
\u2580-\u258F \u2592-\u2595 \u25A0-\u25A1 \u25A3-\u25A9 \u25B2-\u25B3
|
||||
\u25B6-\u25B7 \u25BC-\u25BD \u25C0-\u25C1 \u25C6-\u25C8 \u25CB
|
||||
\u25CE-\u25D1 \u25E2-\u25E5 \u25EF \u2605-\u2606 \u2609 \u260E-\u260F
|
||||
\u2616-\u2617 \u261C \u261E \u2640 \u2642 \u2660-\u2661 \u2663-\u2665
|
||||
\u2667-\u266A \u266C-\u266D \u266F \uFFFD];
|
||||
|
||||
$ZW = [ \u200B];
|
||||
|
||||
$SG = [ \uD800-\uDFFF];
|
||||
|
||||
$AL = [ \u0023 \u0026 \u002A \u003C-\u003E \u0040-\u005A \u005E-\u007A \u007E
|
||||
\u00A6 \u00A9 \u00AC \u00AE-\u00AF \u00B5 \u00C0-\u00C5 \u00C7-\u00CF
|
||||
\u00D1-\u00D6 \u00D9-\u00DD \u00E2-\u00E5 \u00E7 \u00EB \u00EE-\u00EF
|
||||
\u00F1 \u00F4-\u00F6 \u00FB \u00FD \u00FF-\u0100 \u0102-\u0110
|
||||
\u0112 \u0114-\u011A \u011C-\u0125 \u0128-\u012A \u012C-\u0130
|
||||
\u0134-\u0137 \u0139-\u013E \u0143 \u0145-\u0147 \u014B-\u014C
|
||||
\u014E-\u0151 \u0154-\u0165 \u0168-\u016A \u016C-\u01CD \u01CF
|
||||
\u01D1 \u01D3 \u01D5 \u01D7 \u01D9 \u01DB \u01DD-\u0220 \u0222-\u0233
|
||||
\u0250 \u0252-\u0260 \u0262-\u02AD \u02B0-\u02C6 \u02CE-\u02CF
|
||||
\u02D1-\u02D7 \u02DC \u02DE-\u02EE \u0374-\u0375 \u037A \u037E
|
||||
\u0384-\u038A \u038C \u038E-\u0390 \u03AA-\u03B0 \u03C2 \u03CA-\u03CE
|
||||
\u03D0-\u03F6 \u0400 \u0402-\u040F \u0450 \u0452-\u0482 \u048A-\u04CE
|
||||
\u04D0-\u04F5 \u04F8-\u04F9 \u0500-\u050F \u0531-\u0556 \u0559-\u055F
|
||||
\u0561-\u0587 \u05BE \u05C0 \u05C3 \u05D0-\u05EA \u05F0-\u05F4
|
||||
\u060C \u061B \u061F \u0621-\u063A \u0640-\u064A \u066A-\u066F
|
||||
\u0671-\u06D5 \u06E5-\u06E6 \u06E9 \u06FA-\u06FE \u0700-\u070D
|
||||
\u0710 \u0712-\u072C \u0780-\u07A5 \u07B1 \u0905-\u0939 \u093D
|
||||
\u0950 \u0958-\u0961 \u0964-\u0965 \u0970 \u0985-\u098C \u098F-\u0990
|
||||
\u0993-\u09A8 \u09AA-\u09B0 \u09B2 \u09B6-\u09B9 \u09DC-\u09DD
|
||||
\u09DF-\u09E1 \u09F0-\u09F1 \u09F4-\u09FA \u0A05-\u0A0A \u0A0F-\u0A10
|
||||
\u0A13-\u0A28 \u0A2A-\u0A30 \u0A32-\u0A33 \u0A35-\u0A36 \u0A38-\u0A39
|
||||
\u0A59-\u0A5C \u0A5E \u0A72-\u0A74 \u0A85-\u0A8B \u0A8D \u0A8F-\u0A91
|
||||
\u0A93-\u0AA8 \u0AAA-\u0AB0 \u0AB2-\u0AB3 \u0AB5-\u0AB9 \u0ABD
|
||||
\u0AD0 \u0AE0 \u0B05-\u0B0C \u0B0F-\u0B10 \u0B13-\u0B28 \u0B2A-\u0B30
|
||||
\u0B32-\u0B33 \u0B36-\u0B39 \u0B3D \u0B5C-\u0B5D \u0B5F-\u0B61
|
||||
\u0B70 \u0B83 \u0B85-\u0B8A \u0B8E-\u0B90 \u0B92-\u0B95 \u0B99-\u0B9A
|
||||
\u0B9C \u0B9E-\u0B9F \u0BA3-\u0BA4 \u0BA8-\u0BAA \u0BAE-\u0BB5
|
||||
\u0BB7-\u0BB9 \u0BF0-\u0BF2 \u0C05-\u0C0C \u0C0E-\u0C10 \u0C12-\u0C28
|
||||
\u0C2A-\u0C33 \u0C35-\u0C39 \u0C60-\u0C61 \u0C85-\u0C8C \u0C8E-\u0C90
|
||||
\u0C92-\u0CA8 \u0CAA-\u0CB3 \u0CB5-\u0CB9 \u0CDE \u0CE0-\u0CE1
|
||||
\u0D05-\u0D0C \u0D0E-\u0D10 \u0D12-\u0D28 \u0D2A-\u0D39 \u0D60-\u0D61
|
||||
\u0D85-\u0D96 \u0D9A-\u0DB1 \u0DB3-\u0DBB \u0DBD \u0DC0-\u0DC6
|
||||
\u0DF4 \u0E4F \u0F00-\u0F0A \u0F0D-\u0F17 \u0F1A-\u0F1F \u0F2A-\u0F34
|
||||
\u0F36 \u0F38 \u0F40-\u0F47 \u0F49-\u0F6A \u0F85 \u0F88-\u0F8B
|
||||
\u0FBE-\u0FC5 \u0FC7-\u0FCC \u0FCF \u104A-\u104F \u10A0-\u10C5
|
||||
\u10D0-\u10F8 \u10FB \u1200-\u1206 \u1208-\u1246 \u1248 \u124A-\u124D
|
||||
\u1250-\u1256 \u1258 \u125A-\u125D \u1260-\u1286 \u1288 \u128A-\u128D
|
||||
\u1290-\u12AE \u12B0 \u12B2-\u12B5 \u12B8-\u12BE \u12C0 \u12C2-\u12C5
|
||||
\u12C8-\u12CE \u12D0-\u12D6 \u12D8-\u12EE \u12F0-\u130E \u1310
|
||||
\u1312-\u1315 \u1318-\u131E \u1320-\u1346 \u1348-\u135A \u1362-\u1368
|
||||
\u1372-\u137C \u13A0-\u13F4 \u1401-\u1676 \u1681-\u169A \u16A0-\u16F0
|
||||
\u1700-\u170C \u170E-\u1711 \u1720-\u1731 \u1735-\u1736 \u1740-\u1751
|
||||
\u1760-\u176C \u176E-\u1770 \u17DC \u1800-\u1805 \u1807-\u180A
|
||||
\u1820-\u1877 \u1880-\u18A8 \u1E00-\u1E9B \u1EA0-\u1EF9 \u1F00-\u1F15
|
||||
\u1F18-\u1F1D \u1F20-\u1F45 \u1F48-\u1F4D \u1F50-\u1F57 \u1F59
|
||||
\u1F5B \u1F5D \u1F5F-\u1F7D \u1F80-\u1FB4 \u1FB6-\u1FC4 \u1FC6-\u1FD3
|
||||
\u1FD6-\u1FDB \u1FDD-\u1FEF \u1FF2-\u1FF4 \u1FF6-\u1FFE \u2017
|
||||
\u2022-\u2023 \u2038 \u203D-\u2043 \u2047-\u2052 \u2057 \u2061-\u2063
|
||||
\u2070-\u2071 \u2075-\u207C \u2080 \u2085-\u208C \u2100-\u2102
|
||||
\u2104 \u2106-\u2108 \u210A-\u2112 \u2114-\u2115 \u2117-\u2120
|
||||
\u2123-\u2125 \u2127-\u212A \u212C-\u213A \u213D-\u213F \u2141-\u214B
|
||||
\u2153 \u2156-\u215A \u215C-\u215D \u215F \u216C-\u216F \u217A-\u2183
|
||||
\u219A-\u21D1 \u21D3 \u21D5-\u21FF \u2201 \u2204-\u2206 \u2209-\u220A
|
||||
\u220C-\u220E \u2210 \u2214 \u2216-\u2219 \u221B-\u221C \u2221-\u2222
|
||||
\u2224 \u2226 \u222D \u222F-\u2233 \u2238-\u223B \u223E-\u2247
|
||||
\u2249-\u224B \u224D-\u2251 \u2253-\u225F \u2262-\u2263 \u2268-\u2269
|
||||
\u226C-\u226D \u2270-\u2281 \u2284-\u2285 \u2288-\u2294 \u2296-\u2298
|
||||
\u229A-\u22A4 \u22A6-\u22BE \u22C0-\u2311 \u2313-\u2328 \u232B-\u23B3
|
||||
\u23B7-\u23CE \u2400-\u2426 \u2440-\u244A \u24C0-\u24CF \u24EA
|
||||
\u254C-\u254F \u2575-\u257F \u2590-\u2591 \u2596-\u259F \u25A2
|
||||
\u25AA-\u25B1 \u25B4-\u25B5 \u25B8-\u25BB \u25BE-\u25BF \u25C2-\u25C5
|
||||
\u25C9-\u25CA \u25CC-\u25CD \u25D2-\u25E1 \u25E6-\u25EE \u25F0-\u2604
|
||||
\u2607-\u2608 \u260A-\u260D \u2610-\u2613 \u2619-\u261B \u261D
|
||||
\u261F-\u263F \u2641 \u2643-\u265F \u2662 \u2666 \u266B \u266E
|
||||
\u2670-\u267D \u2680-\u2689 \u2701-\u2704 \u2706-\u2709 \u270C-\u2727
|
||||
\u2729-\u274B \u274D \u274F-\u2752 \u2756 \u2758-\u275A \u2761
|
||||
\u2764-\u2767 \u2776-\u2794 \u2798-\u27AF \u27B1-\u27BE \u27D0-\u27E5
|
||||
\u27F0-\u2982 \u2999-\u29D7 \u29DC-\u29FB \u29FE-\u2AFF \uFB00-\uFB06
|
||||
\uFB13-\uFB17 \uFB1D \uFB1F-\uFB36 \uFB38-\uFB3C \uFB3E \uFB40-\uFB41
|
||||
\uFB43-\uFB44 \uFB46-\uFBB1 \uFBD3-\uFD3D \uFD50-\uFD8F \uFD92-\uFDC7
|
||||
\uFDF0-\uFDFB \uFE70-\uFE74 \uFE76-\uFEFC \uFF66 \uFF71-\uFF9D
|
||||
\uFFA0-\uFFBE \uFFC2-\uFFC7 \uFFCA-\uFFCF \uFFD2-\uFFD7 \uFFDA-\uFFDC
|
||||
\uFFE8-\uFFEE \U00010300-\U0001031E \U00010320-\U00010323 \U00010330-\U0001034A
|
||||
\U00010400-\U00010425 \U00010428-\U0001044D \U0001D000-\U0001D0F5
|
||||
\U0001D100-\U0001D126 \U0001D12A-\U0001D164 \U0001D16A-\U0001D16C
|
||||
\U0001D183-\U0001D184 \U0001D18C-\U0001D1A9 \U0001D1AE-\U0001D1DD
|
||||
\U0001D400-\U0001D454 \U0001D456-\U0001D49C \U0001D49E-\U0001D49F
|
||||
\U0001D4A2 \U0001D4A5-\U0001D4A6 \U0001D4A9-\U0001D4AC \U0001D4AE-\U0001D4B9
|
||||
\U0001D4BB \U0001D4BD-\U0001D4C0 \U0001D4C2-\U0001D4C3 \U0001D4C5-\U0001D505
|
||||
\U0001D507-\U0001D50A \U0001D50D-\U0001D514 \U0001D516-\U0001D51C
|
||||
\U0001D51E-\U0001D539 \U0001D53B-\U0001D53E \U0001D540-\U0001D544
|
||||
\U0001D546 \U0001D54A-\U0001D550 \U0001D552-\U0001D6A3 \U0001D6A8-\U0001D7C9];
|
||||
|
||||
$OP = [ \u0028 \u005B \u007B \u0F3A \u0F3C \u169B \u201A \u201E \u2045 \u207D
|
||||
\u208D \u2329 \u23B4 \u2768 \u276A \u276C \u276E \u2770 \u2772
|
||||
\u2774 \u27E6 \u27E8 \u27EA \u2983 \u2985 \u2987 \u2989 \u298B
|
||||
\u298D \u298F \u2991 \u2993 \u2995 \u2997 \u29D8 \u29DA \u29FC
|
||||
\u3008 \u300A \u300C \u300E \u3010 \u3014 \u3016 \u3018 \u301A
|
||||
\u301D \uFD3E \uFE35 \uFE37 \uFE39 \uFE3B \uFE3D \uFE3F \uFE41
|
||||
\uFE43 \uFE59 \uFE5B \uFE5D \uFF08 \uFF3B \uFF5B \uFF5F \uFF62];
|
||||
|
||||
$BK = [ \u000C \u2028-\u2029];
|
||||
|
||||
$PO = [ \u0025 \u00A2 \u00B0 \u2030-\u2037 \u20A7 \u2103 \u2109 \u2126 \uFDFC
|
||||
\uFE6A \uFF05 \uFFE0];
|
||||
|
||||
$NS = [ \u0E5A-\u0E5B \u17D4 \u17D6-\u17DA \u203C \u2044 \u3005 \u301C \u303B-\u303C
|
||||
\u3041 \u3043 \u3045 \u3047 \u3049 \u3063 \u3083 \u3085 \u3087
|
||||
\u308E \u3095-\u3096 \u309B-\u309E \u30A0-\u30A1 \u30A3 \u30A5
|
||||
\u30A7 \u30A9 \u30C3 \u30E3 \u30E5 \u30E7 \u30EE \u30F5-\u30F6
|
||||
\u30FB \u30FD \u31F0-\u31FF \uFE54-\uFE55 \uFF1A-\uFF1B \uFF65
|
||||
\uFF67-\uFF70 \uFF9E-\uFF9F];
|
||||
|
||||
$CL = [ \u0029 \u005D \u007D \u0F3B \u0F3D \u169C \u2046 \u207E \u208E \u232A
|
||||
\u23B5 \u2769 \u276B \u276D \u276F \u2771 \u2773 \u2775 \u27E7
|
||||
\u27E9 \u27EB \u2984 \u2986 \u2988 \u298A \u298C \u298E \u2990
|
||||
\u2992 \u2994 \u2996 \u2998 \u29D9 \u29DB \u29FD \u3001-\u3002
|
||||
\u3009 \u300B \u300D \u300F \u3011 \u3015 \u3017 \u3019 \u301B
|
||||
\u301E-\u301F \uFD3F \uFE36 \uFE38 \uFE3A \uFE3C \uFE3E \uFE40
|
||||
\uFE42 \uFE44 \uFE50 \uFE52 \uFE5A \uFE5C \uFE5E \uFF09 \uFF0C
|
||||
\uFF0E \uFF3D \uFF5D \uFF60-\uFF61 \uFF63-\uFF64];
|
||||
|
||||
$NU = [ \u0030-\u0039 \u0660-\u0669 \u06F0-\u06F9 \u0966-\u096F \u09E6-\u09EF
|
||||
\u0A66-\u0A6F \u0AE6-\u0AEF \u0B66-\u0B6F \u0BE7-\u0BEF \u0C66-\u0C6F
|
||||
\u0CE6-\u0CEF \u0D66-\u0D6F \u0E50-\u0E59 \u0ED0-\u0ED9 \u0F20-\u0F29
|
||||
\u1040-\u1049 \u1369-\u1371 \u17E0-\u17E9 \u1810-\u1819 \U0001D7CE-\U0001D7FF];
|
||||
|
||||
$CM = [ \u0000-\u0008 \u000B \u000E-\u001F \u007F-\u009F \u0300-\u034F \u0360-\u036F
|
||||
\u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9 \u05BB-\u05BD
|
||||
\u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06E4
|
||||
\u06E7-\u06E8 \u06EA-\u06ED \u070F \u0711 \u0730-\u074A \u07A6-\u07B0
|
||||
\u0901-\u0903 \u093C \u093E-\u094D \u0951-\u0954 \u0962-\u0963
|
||||
\u0981-\u0983 \u09BC \u09BE-\u09C4 \u09C7-\u09C8 \u09CB-\u09CD
|
||||
\u09D7 \u09E2-\u09E3 \u0A02 \u0A3C \u0A3E-\u0A42 \u0A47-\u0A48
|
||||
\u0A4B-\u0A4D \u0A70-\u0A71 \u0A81-\u0A83 \u0ABC \u0ABE-\u0AC5
|
||||
\u0AC7-\u0AC9 \u0ACB-\u0ACD \u0B01-\u0B03 \u0B3C \u0B3E-\u0B43
|
||||
\u0B47-\u0B48 \u0B4B-\u0B4D \u0B56-\u0B57 \u0B82 \u0BBE-\u0BC2
|
||||
\u0BC6-\u0BC8 \u0BCA-\u0BCD \u0BD7 \u0C01-\u0C03 \u0C3E-\u0C44
|
||||
\u0C46-\u0C48 \u0C4A-\u0C4D \u0C55-\u0C56 \u0C82-\u0C83 \u0CBE-\u0CC4
|
||||
\u0CC6-\u0CC8 \u0CCA-\u0CCD \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D43
|
||||
\u0D46-\u0D48 \u0D4A-\u0D4D \u0D57 \u0D82-\u0D83 \u0DCA \u0DCF-\u0DD4
|
||||
\u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E3A \u0E47-\u0E4E
|
||||
\u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
|
||||
\u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F84 \u0F86-\u0F87
|
||||
\u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C-\u1032 \u1036-\u1039
|
||||
\u1056-\u1059 \u1160-\u11A2 \u11A8-\u11F9 \u1712-\u1714 \u1732-\u1734
|
||||
\u1752-\u1753 \u1772-\u1773 \u17B4-\u17D3 \u180B-\u180E \u18A9
|
||||
\u200C-\u200F \u202A-\u202E \u206A-\u206F \u20D0-\u20EA \u302A-\u302F
|
||||
\u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFFF9-\uFFFB
|
||||
\U0001D165-\U0001D169 \U0001D16D-\U0001D182 \U0001D185-\U0001D18B
|
||||
\U0001D1AA-\U0001D1AD \U000E0001 \U000E0020-\U000E007F];
|
||||
|
||||
$PR = [ \u0024 \u002B \u005C \u00A3-\u00A5 \u00B1 \u09F2-\u09F3 \u0E3F \u17DB
|
||||
\u20A0-\u20A6 \u20A8-\u20B1 \u2116 \u2212-\u2213 \uFE69 \uFF04
|
||||
\uFFE1 \uFFE5-\uFFE6];
|
||||
|
||||
$B2 = [ \u2014];
|
||||
|
||||
$ID = [ \u1100-\u1159 \u115F \u2E80-\u2E99 \u2E9B-\u2EF3 \u2F00-\u2FD5 \u2FF0-\u2FFB
|
||||
\u3000 \u3003-\u3004 \u3006-\u3007 \u3012-\u3013 \u3020-\u3029
|
||||
\u3030-\u303A \u303D-\u303F \u3042 \u3044 \u3046 \u3048 \u304A-\u3062
|
||||
\u3064-\u3082 \u3084 \u3086 \u3088-\u308D \u308F-\u3094 \u309F
|
||||
\u30A2 \u30A4 \u30A6 \u30A8 \u30AA-\u30C2 \u30C4-\u30E2 \u30E4
|
||||
\u30E6 \u30E8-\u30ED \u30EF-\u30F4 \u30F7-\u30FA \u30FC \u30FE-\u30FF
|
||||
\u3105-\u312C \u3131-\u318E \u3190-\u31B7 \u3200-\u321C \u3220-\u3243
|
||||
\u3251-\u327B \u327F-\u32CB \u32D0-\u32FE \u3300-\u3376 \u337B-\u33DD
|
||||
\u33E0-\u33FE \u3400-\u4DB5 \u4E00-\u9FA5 \uA000-\uA48C \uA490-\uA4C6
|
||||
\uAC00-\uD7A3 \uF900-\uFA2D \uFA30-\uFA6A \uFE30-\uFE34 \uFE45-\uFE46
|
||||
\uFE49-\uFE4F \uFE51 \uFE58 \uFE5F-\uFE66 \uFE68 \uFE6B \uFF02-\uFF03
|
||||
\uFF06-\uFF07 \uFF0A-\uFF0B \uFF0D \uFF0F-\uFF19 \uFF1C-\uFF1E
|
||||
\uFF20-\uFF3A \uFF3C \uFF3E-\uFF5A \uFF5C \uFF5E \uFFE2-\uFFE4
|
||||
\U00020000-\U0002A6D6 \U0002F800-\U0002FA1D];
|
||||
|
||||
$SP = [ \u0020];
|
||||
|
||||
$QU = [ \u0022 \u0027 \u00AB \u00BB \u2018-\u2019 \u201B-\u201D \u201F \u2039-\u203A
|
||||
\u23B6 \u275B-\u275E];
|
||||
|
||||
$CR = [ \u000D];
|
||||
|
||||
$GL = [ \u00A0 \u0F0C \u2007 \u2011 \u202F \u2060 \uFEFF];
|
||||
|
||||
############ End of Script-Generated Definitions #######################
|
||||
|
||||
#
|
||||
# Character classes from TR 29. Needed for finding characters.
|
||||
#
|
||||
# $Extend is all combining characters, and none of the other cruft that
|
||||
# TR14 puts into $CM, which is its concept of combining marks.
|
||||
#
|
||||
$Extend = # From UNIDATA/DerivedCoreProperties.txt
|
||||
[\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
|
||||
\u05BB-\u05BD \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06DC
|
||||
\u06DE \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711 \u0730-\u074A
|
||||
\u07A6-\u07B0 \u0901-\u0902 \u0903 \u093C \u093E-\u0940 \u0941-\u0948
|
||||
\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981 \u0982-\u0983 \u09BC
|
||||
\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7 \u09E2-\u09E3
|
||||
\u0A02 \u0A3C \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
|
||||
\u0A70-\u0A71 \u0A81-\u0A82 \u0A83 \u0ABC \u0ABE-\u0AC0 \u0AC1-\u0AC5
|
||||
\u0AC7-\u0AC8 \u0AC9 \u0ACB-\u0ACC \u0B01 \u0B02-\u0B03 \u0B3C \u0B3E
|
||||
\u0B3F \u0B40 \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56 \u0B57
|
||||
\u0B82 \u0BBE-\u0BBF \u0BC0 \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
|
||||
\u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
|
||||
\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE \u0CBF \u0CC0-\u0CC4 \u0CC6
|
||||
\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
|
||||
\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57 \u0D82-\u0D83 \u0DCF-\u0DD1
|
||||
\u0DD2-\u0DD4 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E39
|
||||
\u0E47-\u0E4E \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
|
||||
\u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F \u0F80-\u0F84
|
||||
\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C \u102D-\u1030 \u1031
|
||||
\u1032 \u1036-\u1037 \u1038 \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
|
||||
\u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
|
||||
\u17BE-\u17C5 \u17C6 \u17C7-\u17C8 \u17C9-\u17D1 \u17D3 \u180B-\u180D
|
||||
\u18A9 \u20D0-\u20DC \u20DD-\u20E0 \u20E1 \u20E2-\u20E4 \u20E5-\u20EA
|
||||
\u302A-\u302F \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
|
||||
\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172
|
||||
\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
|
||||
|
||||
|
||||
#
|
||||
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
|
||||
# TODO: This is going to produce some odd results, because of the non-combining
|
||||
# chars that are included in $CM. Use $Extend instead, where possible.
|
||||
#
|
||||
$ALcm = $AL $CM*;
|
||||
$IDcm = $ID $CM*;
|
||||
$NUcm = $NU $Extend*;
|
||||
$HYcm = $HY $Extend*;
|
||||
$SPcm = $SP $Extend*;
|
||||
$QUcm = $QU $Extend*;
|
||||
$POcm = $PO $Extend*;
|
||||
$OPcm = $OP $Extend*;
|
||||
$BAcm = $BA $Extend*;
|
||||
$BBcm = $BB $Extend*;
|
||||
$NScm = $NS $Extend*;
|
||||
$GLcm = $GL $Extend*;
|
||||
$B2cm = $B2 $Extend*;
|
||||
$INcm = $IN $Extend*;
|
||||
|
||||
|
||||
# New Lines. Always break after, never break before.
|
||||
# Rule LB 3
|
||||
#
|
||||
# Endings. NewLine or Zero Width Space, or both. Rules 4, 5
|
||||
# Because we never break before these things, $Endings
|
||||
# appears at the end of line break rule.
|
||||
#
|
||||
$NLF = $BK | $CR | $LF | $CR $LF;
|
||||
$Endings = $SPcm* $ZW* $NLF?;
|
||||
|
||||
|
||||
#
|
||||
# Openings Sequences that can precede Words, and that should not be separated from them.
|
||||
# Rules LB 9, 10
|
||||
#
|
||||
$Openings = (($QUcm $SPcm*)? $OPcm $SPcm*)*;
|
||||
|
||||
#
|
||||
# Closings Seqences that follow words, and that should not be separated from them,
|
||||
# Rule LB 8, 11, 15
|
||||
$Closings = ($SPcm*( ($CL ($SPcm* $NScm)? | $EX | $IS | $SY) $Extend*) | $BAcm | $HYcm | $NScm)*;
|
||||
|
||||
#
|
||||
# Words. Includes mixed Alpha-numerics.
|
||||
# Rules 11a, 16, 17, 19, more or less.
|
||||
#
|
||||
$NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+;
|
||||
$Number = $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?; # Fancy Number 18
|
||||
$Word = (($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?)) ; # Alpha-numeric. 16, 17
|
||||
$Dashes = (($B2cm $SPcm*)*); # Dashes 11a
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
$Word15 = ($BBcm* ($Word | $Number | $Dashes)? ($BAcm | $HYcm | $NScm)*) | # Rule 15. Stuff sticks around words.
|
||||
[^[:Cc:] $BK $CR $LF $ZW $SP $GL] $Extend* | # Allow characters that don't meet the
|
||||
[^$BK $CR $LF $ZW $SP $GL ]; # more elaborate definitions for WORD
|
||||
# to be glued.
|
||||
|
||||
$GluedWord = ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together.
|
||||
# Rules 13, 14
|
||||
|
||||
#
|
||||
# The actual rule, a combination of everything defined above.
|
||||
#
|
||||
$Openings $GluedWord $Closings $Endings;
|
||||
# $GluedWord;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
#
|
||||
# Back up to a hard break.
|
||||
# TODO: make smarter reverse rules for better efficiency
|
||||
#
|
||||
! . . [^$BK | $CR | $LF]* (. | $LF $CR);
|
||||
! .*;
|
381
icu4c/source/data/brkitr/line_th.txt
Normal file
381
icu4c/source/data/brkitr/line_th.txt
Normal file
|
@ -0,0 +1,381 @@
|
|||
#
|
||||
# file: line.txt
|
||||
#
|
||||
# Line Breaking Rules for ICU rules based break iteration.
|
||||
# Implement default line breaking as defined by Unicode TR 14.
|
||||
#
|
||||
|
||||
|
||||
#
|
||||
# Character Classes defined by Unicode TR 14.
|
||||
# These are generated by a script from the Unicode LineBreak derived
|
||||
# properties file.
|
||||
#
|
||||
|
||||
############ Start of Script-Generated Definitions #######################
|
||||
|
||||
$LF = [ \u000A];
|
||||
|
||||
$IN = [ \u2024-\u2026];
|
||||
|
||||
$SY = [ \u002F];
|
||||
|
||||
$EX = [ \u0021 \u003F \u2762-\u2763 \uFE56-\uFE57 \uFF01 \uFF1F];
|
||||
|
||||
$BA = [ \u0009 \u007C \u00AD \u058A \u0F0B \u1361 \u1680 \u17D5 \u2000-\u2006
|
||||
\u2008-\u200A \u2010 \u2012-\u2013 \u2027 \u205F];
|
||||
|
||||
$IS = [ \u002C \u002E \u003A-\u003B \u0589];
|
||||
|
||||
$BB = [ \u00B4 \u02C8 \u02CC \u1806];
|
||||
|
||||
$SA = [ \u0E01-\u0E30 \u0E32-\u0E33 \u0E40-\u0E46 \u0E81-\u0E82 \u0E84 \u0E87-\u0E88
|
||||
\u0E8A \u0E8D \u0E94-\u0E97 \u0E99-\u0E9F \u0EA1-\u0EA3 \u0EA5
|
||||
\u0EA7 \u0EAA-\u0EAB \u0EAD-\u0EB0 \u0EB2-\u0EB3 \u0EBD \u0EC0-\u0EC4
|
||||
\u0EC6 \u0EDC-\u0EDD \u1000-\u1021 \u1023-\u1027 \u1029-\u102A
|
||||
\u1050-\u1055 \u1780-\u17B3];
|
||||
|
||||
$CB = [ \uFFFC];
|
||||
|
||||
$XX = [ \uE000-\uF8FF \U000F0000-\U000FFFFD \U00100000-\U0010FFFD];
|
||||
|
||||
$HY = [ \u002D];
|
||||
|
||||
$AI = [ \u00A1 \u00A7-\u00A8 \u00AA \u00B2-\u00B3 \u00B6-\u00BA \u00BC-\u00BF
|
||||
\u00C6 \u00D0 \u00D7-\u00D8 \u00DE-\u00E1 \u00E6 \u00E8-\u00EA
|
||||
\u00EC-\u00ED \u00F0 \u00F2-\u00F3 \u00F7-\u00FA \u00FC \u00FE
|
||||
\u0101 \u0111 \u0113 \u011B \u0126-\u0127 \u012B \u0131-\u0133
|
||||
\u0138 \u013F-\u0142 \u0144 \u0148-\u014A \u014D \u0152-\u0153
|
||||
\u0166-\u0167 \u016B \u01CE \u01D0 \u01D2 \u01D4 \u01D6 \u01D8
|
||||
\u01DA \u01DC \u0251 \u0261 \u02C7 \u02C9-\u02CB \u02CD \u02D0
|
||||
\u02D8-\u02DB \u02DD \u0391-\u03A1 \u03A3-\u03A9 \u03B1-\u03C1
|
||||
\u03C3-\u03C9 \u0401 \u0410-\u044F \u0451 \u2015-\u2016 \u2020-\u2021
|
||||
\u203B \u2074 \u207F \u2081-\u2084 \u2105 \u2113 \u2121-\u2122
|
||||
\u212B \u2140 \u2154-\u2155 \u215B \u215E \u2160-\u216B \u2170-\u2179
|
||||
\u2190-\u2199 \u21D2 \u21D4 \u2200 \u2202-\u2203 \u2207-\u2208
|
||||
\u220B \u220F \u2211 \u2215 \u221A \u221D-\u2220 \u2223 \u2225
|
||||
\u2227-\u222C \u222E \u2234-\u2237 \u223C-\u223D \u2248 \u224C
|
||||
\u2252 \u2260-\u2261 \u2264-\u2267 \u226A-\u226B \u226E-\u226F
|
||||
\u2282-\u2283 \u2286-\u2287 \u2295 \u2299 \u22A5 \u22BF \u2312
|
||||
\u2460-\u24BF \u24D0-\u24E9 \u24EB-\u24FE \u2500-\u254B \u2550-\u2574
|
||||
\u2580-\u258F \u2592-\u2595 \u25A0-\u25A1 \u25A3-\u25A9 \u25B2-\u25B3
|
||||
\u25B6-\u25B7 \u25BC-\u25BD \u25C0-\u25C1 \u25C6-\u25C8 \u25CB
|
||||
\u25CE-\u25D1 \u25E2-\u25E5 \u25EF \u2605-\u2606 \u2609 \u260E-\u260F
|
||||
\u2616-\u2617 \u261C \u261E \u2640 \u2642 \u2660-\u2661 \u2663-\u2665
|
||||
\u2667-\u266A \u266C-\u266D \u266F \uFFFD];
|
||||
|
||||
$ZW = [ \u200B];
|
||||
|
||||
$SG = [ \uD800-\uDFFF];
|
||||
|
||||
$AL = [ \u0023 \u0026 \u002A \u003C-\u003E \u0040-\u005A \u005E-\u007A \u007E
|
||||
\u00A6 \u00A9 \u00AC \u00AE-\u00AF \u00B5 \u00C0-\u00C5 \u00C7-\u00CF
|
||||
\u00D1-\u00D6 \u00D9-\u00DD \u00E2-\u00E5 \u00E7 \u00EB \u00EE-\u00EF
|
||||
\u00F1 \u00F4-\u00F6 \u00FB \u00FD \u00FF-\u0100 \u0102-\u0110
|
||||
\u0112 \u0114-\u011A \u011C-\u0125 \u0128-\u012A \u012C-\u0130
|
||||
\u0134-\u0137 \u0139-\u013E \u0143 \u0145-\u0147 \u014B-\u014C
|
||||
\u014E-\u0151 \u0154-\u0165 \u0168-\u016A \u016C-\u01CD \u01CF
|
||||
\u01D1 \u01D3 \u01D5 \u01D7 \u01D9 \u01DB \u01DD-\u0220 \u0222-\u0233
|
||||
\u0250 \u0252-\u0260 \u0262-\u02AD \u02B0-\u02C6 \u02CE-\u02CF
|
||||
\u02D1-\u02D7 \u02DC \u02DE-\u02EE \u0374-\u0375 \u037A \u037E
|
||||
\u0384-\u038A \u038C \u038E-\u0390 \u03AA-\u03B0 \u03C2 \u03CA-\u03CE
|
||||
\u03D0-\u03F6 \u0400 \u0402-\u040F \u0450 \u0452-\u0482 \u048A-\u04CE
|
||||
\u04D0-\u04F5 \u04F8-\u04F9 \u0500-\u050F \u0531-\u0556 \u0559-\u055F
|
||||
\u0561-\u0587 \u05BE \u05C0 \u05C3 \u05D0-\u05EA \u05F0-\u05F4
|
||||
\u060C \u061B \u061F \u0621-\u063A \u0640-\u064A \u066A-\u066F
|
||||
\u0671-\u06D5 \u06E5-\u06E6 \u06E9 \u06FA-\u06FE \u0700-\u070D
|
||||
\u0710 \u0712-\u072C \u0780-\u07A5 \u07B1 \u0905-\u0939 \u093D
|
||||
\u0950 \u0958-\u0961 \u0964-\u0965 \u0970 \u0985-\u098C \u098F-\u0990
|
||||
\u0993-\u09A8 \u09AA-\u09B0 \u09B2 \u09B6-\u09B9 \u09DC-\u09DD
|
||||
\u09DF-\u09E1 \u09F0-\u09F1 \u09F4-\u09FA \u0A05-\u0A0A \u0A0F-\u0A10
|
||||
\u0A13-\u0A28 \u0A2A-\u0A30 \u0A32-\u0A33 \u0A35-\u0A36 \u0A38-\u0A39
|
||||
\u0A59-\u0A5C \u0A5E \u0A72-\u0A74 \u0A85-\u0A8B \u0A8D \u0A8F-\u0A91
|
||||
\u0A93-\u0AA8 \u0AAA-\u0AB0 \u0AB2-\u0AB3 \u0AB5-\u0AB9 \u0ABD
|
||||
\u0AD0 \u0AE0 \u0B05-\u0B0C \u0B0F-\u0B10 \u0B13-\u0B28 \u0B2A-\u0B30
|
||||
\u0B32-\u0B33 \u0B36-\u0B39 \u0B3D \u0B5C-\u0B5D \u0B5F-\u0B61
|
||||
\u0B70 \u0B83 \u0B85-\u0B8A \u0B8E-\u0B90 \u0B92-\u0B95 \u0B99-\u0B9A
|
||||
\u0B9C \u0B9E-\u0B9F \u0BA3-\u0BA4 \u0BA8-\u0BAA \u0BAE-\u0BB5
|
||||
\u0BB7-\u0BB9 \u0BF0-\u0BF2 \u0C05-\u0C0C \u0C0E-\u0C10 \u0C12-\u0C28
|
||||
\u0C2A-\u0C33 \u0C35-\u0C39 \u0C60-\u0C61 \u0C85-\u0C8C \u0C8E-\u0C90
|
||||
\u0C92-\u0CA8 \u0CAA-\u0CB3 \u0CB5-\u0CB9 \u0CDE \u0CE0-\u0CE1
|
||||
\u0D05-\u0D0C \u0D0E-\u0D10 \u0D12-\u0D28 \u0D2A-\u0D39 \u0D60-\u0D61
|
||||
\u0D85-\u0D96 \u0D9A-\u0DB1 \u0DB3-\u0DBB \u0DBD \u0DC0-\u0DC6
|
||||
\u0DF4 \u0E4F \u0F00-\u0F0A \u0F0D-\u0F17 \u0F1A-\u0F1F \u0F2A-\u0F34
|
||||
\u0F36 \u0F38 \u0F40-\u0F47 \u0F49-\u0F6A \u0F85 \u0F88-\u0F8B
|
||||
\u0FBE-\u0FC5 \u0FC7-\u0FCC \u0FCF \u104A-\u104F \u10A0-\u10C5
|
||||
\u10D0-\u10F8 \u10FB \u1200-\u1206 \u1208-\u1246 \u1248 \u124A-\u124D
|
||||
\u1250-\u1256 \u1258 \u125A-\u125D \u1260-\u1286 \u1288 \u128A-\u128D
|
||||
\u1290-\u12AE \u12B0 \u12B2-\u12B5 \u12B8-\u12BE \u12C0 \u12C2-\u12C5
|
||||
\u12C8-\u12CE \u12D0-\u12D6 \u12D8-\u12EE \u12F0-\u130E \u1310
|
||||
\u1312-\u1315 \u1318-\u131E \u1320-\u1346 \u1348-\u135A \u1362-\u1368
|
||||
\u1372-\u137C \u13A0-\u13F4 \u1401-\u1676 \u1681-\u169A \u16A0-\u16F0
|
||||
\u1700-\u170C \u170E-\u1711 \u1720-\u1731 \u1735-\u1736 \u1740-\u1751
|
||||
\u1760-\u176C \u176E-\u1770 \u17DC \u1800-\u1805 \u1807-\u180A
|
||||
\u1820-\u1877 \u1880-\u18A8 \u1E00-\u1E9B \u1EA0-\u1EF9 \u1F00-\u1F15
|
||||
\u1F18-\u1F1D \u1F20-\u1F45 \u1F48-\u1F4D \u1F50-\u1F57 \u1F59
|
||||
\u1F5B \u1F5D \u1F5F-\u1F7D \u1F80-\u1FB4 \u1FB6-\u1FC4 \u1FC6-\u1FD3
|
||||
\u1FD6-\u1FDB \u1FDD-\u1FEF \u1FF2-\u1FF4 \u1FF6-\u1FFE \u2017
|
||||
\u2022-\u2023 \u2038 \u203D-\u2043 \u2047-\u2052 \u2057 \u2061-\u2063
|
||||
\u2070-\u2071 \u2075-\u207C \u2080 \u2085-\u208C \u2100-\u2102
|
||||
\u2104 \u2106-\u2108 \u210A-\u2112 \u2114-\u2115 \u2117-\u2120
|
||||
\u2123-\u2125 \u2127-\u212A \u212C-\u213A \u213D-\u213F \u2141-\u214B
|
||||
\u2153 \u2156-\u215A \u215C-\u215D \u215F \u216C-\u216F \u217A-\u2183
|
||||
\u219A-\u21D1 \u21D3 \u21D5-\u21FF \u2201 \u2204-\u2206 \u2209-\u220A
|
||||
\u220C-\u220E \u2210 \u2214 \u2216-\u2219 \u221B-\u221C \u2221-\u2222
|
||||
\u2224 \u2226 \u222D \u222F-\u2233 \u2238-\u223B \u223E-\u2247
|
||||
\u2249-\u224B \u224D-\u2251 \u2253-\u225F \u2262-\u2263 \u2268-\u2269
|
||||
\u226C-\u226D \u2270-\u2281 \u2284-\u2285 \u2288-\u2294 \u2296-\u2298
|
||||
\u229A-\u22A4 \u22A6-\u22BE \u22C0-\u2311 \u2313-\u2328 \u232B-\u23B3
|
||||
\u23B7-\u23CE \u2400-\u2426 \u2440-\u244A \u24C0-\u24CF \u24EA
|
||||
\u254C-\u254F \u2575-\u257F \u2590-\u2591 \u2596-\u259F \u25A2
|
||||
\u25AA-\u25B1 \u25B4-\u25B5 \u25B8-\u25BB \u25BE-\u25BF \u25C2-\u25C5
|
||||
\u25C9-\u25CA \u25CC-\u25CD \u25D2-\u25E1 \u25E6-\u25EE \u25F0-\u2604
|
||||
\u2607-\u2608 \u260A-\u260D \u2610-\u2613 \u2619-\u261B \u261D
|
||||
\u261F-\u263F \u2641 \u2643-\u265F \u2662 \u2666 \u266B \u266E
|
||||
\u2670-\u267D \u2680-\u2689 \u2701-\u2704 \u2706-\u2709 \u270C-\u2727
|
||||
\u2729-\u274B \u274D \u274F-\u2752 \u2756 \u2758-\u275A \u2761
|
||||
\u2764-\u2767 \u2776-\u2794 \u2798-\u27AF \u27B1-\u27BE \u27D0-\u27E5
|
||||
\u27F0-\u2982 \u2999-\u29D7 \u29DC-\u29FB \u29FE-\u2AFF \uFB00-\uFB06
|
||||
\uFB13-\uFB17 \uFB1D \uFB1F-\uFB36 \uFB38-\uFB3C \uFB3E \uFB40-\uFB41
|
||||
\uFB43-\uFB44 \uFB46-\uFBB1 \uFBD3-\uFD3D \uFD50-\uFD8F \uFD92-\uFDC7
|
||||
\uFDF0-\uFDFB \uFE70-\uFE74 \uFE76-\uFEFC \uFF66 \uFF71-\uFF9D
|
||||
\uFFA0-\uFFBE \uFFC2-\uFFC7 \uFFCA-\uFFCF \uFFD2-\uFFD7 \uFFDA-\uFFDC
|
||||
\uFFE8-\uFFEE \U00010300-\U0001031E \U00010320-\U00010323 \U00010330-\U0001034A
|
||||
\U00010400-\U00010425 \U00010428-\U0001044D \U0001D000-\U0001D0F5
|
||||
\U0001D100-\U0001D126 \U0001D12A-\U0001D164 \U0001D16A-\U0001D16C
|
||||
\U0001D183-\U0001D184 \U0001D18C-\U0001D1A9 \U0001D1AE-\U0001D1DD
|
||||
\U0001D400-\U0001D454 \U0001D456-\U0001D49C \U0001D49E-\U0001D49F
|
||||
\U0001D4A2 \U0001D4A5-\U0001D4A6 \U0001D4A9-\U0001D4AC \U0001D4AE-\U0001D4B9
|
||||
\U0001D4BB \U0001D4BD-\U0001D4C0 \U0001D4C2-\U0001D4C3 \U0001D4C5-\U0001D505
|
||||
\U0001D507-\U0001D50A \U0001D50D-\U0001D514 \U0001D516-\U0001D51C
|
||||
\U0001D51E-\U0001D539 \U0001D53B-\U0001D53E \U0001D540-\U0001D544
|
||||
\U0001D546 \U0001D54A-\U0001D550 \U0001D552-\U0001D6A3 \U0001D6A8-\U0001D7C9];
|
||||
|
||||
$OP = [ \u0028 \u005B \u007B \u0F3A \u0F3C \u169B \u201A \u201E \u2045 \u207D
|
||||
\u208D \u2329 \u23B4 \u2768 \u276A \u276C \u276E \u2770 \u2772
|
||||
\u2774 \u27E6 \u27E8 \u27EA \u2983 \u2985 \u2987 \u2989 \u298B
|
||||
\u298D \u298F \u2991 \u2993 \u2995 \u2997 \u29D8 \u29DA \u29FC
|
||||
\u3008 \u300A \u300C \u300E \u3010 \u3014 \u3016 \u3018 \u301A
|
||||
\u301D \uFD3E \uFE35 \uFE37 \uFE39 \uFE3B \uFE3D \uFE3F \uFE41
|
||||
\uFE43 \uFE59 \uFE5B \uFE5D \uFF08 \uFF3B \uFF5B \uFF5F \uFF62];
|
||||
|
||||
$BK = [ \u000C \u2028-\u2029];
|
||||
|
||||
$PO = [ \u0025 \u00A2 \u00B0 \u2030-\u2037 \u20A7 \u2103 \u2109 \u2126 \uFDFC
|
||||
\uFE6A \uFF05 \uFFE0];
|
||||
|
||||
$NS = [ \u0E5A-\u0E5B \u17D4 \u17D6-\u17DA \u203C \u2044 \u3005 \u301C \u303B-\u303C
|
||||
\u3041 \u3043 \u3045 \u3047 \u3049 \u3063 \u3083 \u3085 \u3087
|
||||
\u308E \u3095-\u3096 \u309B-\u309E \u30A0-\u30A1 \u30A3 \u30A5
|
||||
\u30A7 \u30A9 \u30C3 \u30E3 \u30E5 \u30E7 \u30EE \u30F5-\u30F6
|
||||
\u30FB \u30FD \u31F0-\u31FF \uFE54-\uFE55 \uFF1A-\uFF1B \uFF65
|
||||
\uFF67-\uFF70 \uFF9E-\uFF9F];
|
||||
|
||||
$CL = [ \u0029 \u005D \u007D \u0F3B \u0F3D \u169C \u2046 \u207E \u208E \u232A
|
||||
\u23B5 \u2769 \u276B \u276D \u276F \u2771 \u2773 \u2775 \u27E7
|
||||
\u27E9 \u27EB \u2984 \u2986 \u2988 \u298A \u298C \u298E \u2990
|
||||
\u2992 \u2994 \u2996 \u2998 \u29D9 \u29DB \u29FD \u3001-\u3002
|
||||
\u3009 \u300B \u300D \u300F \u3011 \u3015 \u3017 \u3019 \u301B
|
||||
\u301E-\u301F \uFD3F \uFE36 \uFE38 \uFE3A \uFE3C \uFE3E \uFE40
|
||||
\uFE42 \uFE44 \uFE50 \uFE52 \uFE5A \uFE5C \uFE5E \uFF09 \uFF0C
|
||||
\uFF0E \uFF3D \uFF5D \uFF60-\uFF61 \uFF63-\uFF64];
|
||||
|
||||
$NU = [ \u0030-\u0039 \u0660-\u0669 \u06F0-\u06F9 \u0966-\u096F \u09E6-\u09EF
|
||||
\u0A66-\u0A6F \u0AE6-\u0AEF \u0B66-\u0B6F \u0BE7-\u0BEF \u0C66-\u0C6F
|
||||
\u0CE6-\u0CEF \u0D66-\u0D6F \u0E50-\u0E59 \u0ED0-\u0ED9 \u0F20-\u0F29
|
||||
\u1040-\u1049 \u1369-\u1371 \u17E0-\u17E9 \u1810-\u1819 \U0001D7CE-\U0001D7FF];
|
||||
|
||||
$CM = [ \u0000-\u0008 \u000B \u000E-\u001F \u007F-\u009F \u0300-\u034F \u0360-\u036F
|
||||
\u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9 \u05BB-\u05BD
|
||||
\u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06E4
|
||||
\u06E7-\u06E8 \u06EA-\u06ED \u070F \u0711 \u0730-\u074A \u07A6-\u07B0
|
||||
\u0901-\u0903 \u093C \u093E-\u094D \u0951-\u0954 \u0962-\u0963
|
||||
\u0981-\u0983 \u09BC \u09BE-\u09C4 \u09C7-\u09C8 \u09CB-\u09CD
|
||||
\u09D7 \u09E2-\u09E3 \u0A02 \u0A3C \u0A3E-\u0A42 \u0A47-\u0A48
|
||||
\u0A4B-\u0A4D \u0A70-\u0A71 \u0A81-\u0A83 \u0ABC \u0ABE-\u0AC5
|
||||
\u0AC7-\u0AC9 \u0ACB-\u0ACD \u0B01-\u0B03 \u0B3C \u0B3E-\u0B43
|
||||
\u0B47-\u0B48 \u0B4B-\u0B4D \u0B56-\u0B57 \u0B82 \u0BBE-\u0BC2
|
||||
\u0BC6-\u0BC8 \u0BCA-\u0BCD \u0BD7 \u0C01-\u0C03 \u0C3E-\u0C44
|
||||
\u0C46-\u0C48 \u0C4A-\u0C4D \u0C55-\u0C56 \u0C82-\u0C83 \u0CBE-\u0CC4
|
||||
\u0CC6-\u0CC8 \u0CCA-\u0CCD \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D43
|
||||
\u0D46-\u0D48 \u0D4A-\u0D4D \u0D57 \u0D82-\u0D83 \u0DCA \u0DCF-\u0DD4
|
||||
\u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E3A \u0E47-\u0E4E
|
||||
\u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
|
||||
\u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F84 \u0F86-\u0F87
|
||||
\u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C-\u1032 \u1036-\u1039
|
||||
\u1056-\u1059 \u1160-\u11A2 \u11A8-\u11F9 \u1712-\u1714 \u1732-\u1734
|
||||
\u1752-\u1753 \u1772-\u1773 \u17B4-\u17D3 \u180B-\u180E \u18A9
|
||||
\u200C-\u200F \u202A-\u202E \u206A-\u206F \u20D0-\u20EA \u302A-\u302F
|
||||
\u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFFF9-\uFFFB
|
||||
\U0001D165-\U0001D169 \U0001D16D-\U0001D182 \U0001D185-\U0001D18B
|
||||
\U0001D1AA-\U0001D1AD \U000E0001 \U000E0020-\U000E007F];
|
||||
|
||||
$PR = [ \u0024 \u002B \u005C \u00A3-\u00A5 \u00B1 \u09F2-\u09F3 \u0E3F \u17DB
|
||||
\u20A0-\u20A6 \u20A8-\u20B1 \u2116 \u2212-\u2213 \uFE69 \uFF04
|
||||
\uFFE1 \uFFE5-\uFFE6];
|
||||
|
||||
$B2 = [ \u2014];
|
||||
|
||||
$ID = [ \u1100-\u1159 \u115F \u2E80-\u2E99 \u2E9B-\u2EF3 \u2F00-\u2FD5 \u2FF0-\u2FFB
|
||||
\u3000 \u3003-\u3004 \u3006-\u3007 \u3012-\u3013 \u3020-\u3029
|
||||
\u3030-\u303A \u303D-\u303F \u3042 \u3044 \u3046 \u3048 \u304A-\u3062
|
||||
\u3064-\u3082 \u3084 \u3086 \u3088-\u308D \u308F-\u3094 \u309F
|
||||
\u30A2 \u30A4 \u30A6 \u30A8 \u30AA-\u30C2 \u30C4-\u30E2 \u30E4
|
||||
\u30E6 \u30E8-\u30ED \u30EF-\u30F4 \u30F7-\u30FA \u30FC \u30FE-\u30FF
|
||||
\u3105-\u312C \u3131-\u318E \u3190-\u31B7 \u3200-\u321C \u3220-\u3243
|
||||
\u3251-\u327B \u327F-\u32CB \u32D0-\u32FE \u3300-\u3376 \u337B-\u33DD
|
||||
\u33E0-\u33FE \u3400-\u4DB5 \u4E00-\u9FA5 \uA000-\uA48C \uA490-\uA4C6
|
||||
\uAC00-\uD7A3 \uF900-\uFA2D \uFA30-\uFA6A \uFE30-\uFE34 \uFE45-\uFE46
|
||||
\uFE49-\uFE4F \uFE51 \uFE58 \uFE5F-\uFE66 \uFE68 \uFE6B \uFF02-\uFF03
|
||||
\uFF06-\uFF07 \uFF0A-\uFF0B \uFF0D \uFF0F-\uFF19 \uFF1C-\uFF1E
|
||||
\uFF20-\uFF3A \uFF3C \uFF3E-\uFF5A \uFF5C \uFF5E \uFFE2-\uFFE4
|
||||
\U00020000-\U0002A6D6 \U0002F800-\U0002FA1D];
|
||||
|
||||
$SP = [ \u0020];
|
||||
|
||||
$QU = [ \u0022 \u0027 \u00AB \u00BB \u2018-\u2019 \u201B-\u201D \u201F \u2039-\u203A
|
||||
\u23B6 \u275B-\u275E];
|
||||
|
||||
$CR = [ \u000D];
|
||||
|
||||
$GL = [ \u00A0 \u0F0C \u2007 \u2011 \u202F \u2060 \uFEFF];
|
||||
|
||||
############ End of Script-Generated Definitions #######################
|
||||
|
||||
|
||||
|
||||
#
|
||||
# Thai Dictionary related definitions and rules
|
||||
#
|
||||
|
||||
$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e]; # this rule breaks the iterator with mixed Thai and English
|
||||
$paiyannoi = [\u0e2f];
|
||||
$maiyamok = [\u0e46];
|
||||
$thai_etc = $paiyannoi \u0e25 $paiyannoi;
|
||||
|
||||
|
||||
|
||||
|
||||
#
|
||||
# Character classes from TR 29. Needed for finding characters.
|
||||
#
|
||||
# $Extend is all combining characters, and none of the other cruft that
|
||||
# TR14 puts into $CM, which is its concept of combining marks.
|
||||
#
|
||||
$Extend = # From UNIDATA/DerivedCoreProperties.txt
|
||||
[\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
|
||||
\u05BB-\u05BD \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06DC
|
||||
\u06DE \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711 \u0730-\u074A
|
||||
\u07A6-\u07B0 \u0901-\u0902 \u0903 \u093C \u093E-\u0940 \u0941-\u0948
|
||||
\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981 \u0982-\u0983 \u09BC
|
||||
\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7 \u09E2-\u09E3
|
||||
\u0A02 \u0A3C \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
|
||||
\u0A70-\u0A71 \u0A81-\u0A82 \u0A83 \u0ABC \u0ABE-\u0AC0 \u0AC1-\u0AC5
|
||||
\u0AC7-\u0AC8 \u0AC9 \u0ACB-\u0ACC \u0B01 \u0B02-\u0B03 \u0B3C \u0B3E
|
||||
\u0B3F \u0B40 \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56 \u0B57
|
||||
\u0B82 \u0BBE-\u0BBF \u0BC0 \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
|
||||
\u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
|
||||
\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE \u0CBF \u0CC0-\u0CC4 \u0CC6
|
||||
\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
|
||||
\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57 \u0D82-\u0D83 \u0DCF-\u0DD1
|
||||
\u0DD2-\u0DD4 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E39
|
||||
\u0E47-\u0E4E \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
|
||||
\u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F \u0F80-\u0F84
|
||||
\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C \u102D-\u1030 \u1031
|
||||
\u1032 \u1036-\u1037 \u1038 \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
|
||||
\u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
|
||||
\u17BE-\u17C5 \u17C6 \u17C7-\u17C8 \u17C9-\u17D1 \u17D3 \u180B-\u180D
|
||||
\u18A9 \u20D0-\u20DC \u20DD-\u20E0 \u20E1 \u20E2-\u20E4 \u20E5-\u20EA
|
||||
\u302A-\u302F \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
|
||||
\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172
|
||||
\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
|
||||
|
||||
|
||||
#
|
||||
# Combining Marks. X $CM* behaves as if it were X. Rule LB6.
|
||||
# TODO: This is going to produce some odd results, because of the non-combining
|
||||
# chars that are included in $CM. Use $Extend instead, where possible.
|
||||
#
|
||||
$ALcm = $AL $CM*;
|
||||
$IDcm = $ID $CM*;
|
||||
$NUcm = $NU $Extend*;
|
||||
$HYcm = $HY $Extend*;
|
||||
$SPcm = $SP $Extend*;
|
||||
$QUcm = $QU $Extend*;
|
||||
$POcm = $PO $Extend*;
|
||||
$OPcm = $OP $Extend*;
|
||||
$BAcm = $BA $Extend*;
|
||||
$BBcm = $BB $Extend*;
|
||||
$NScm = $NS $Extend*;
|
||||
$GLcm = $GL $Extend*;
|
||||
$B2cm = $B2 $Extend*;
|
||||
$INcm = $IN $Extend*;
|
||||
|
||||
|
||||
# New Lines. Always break after, never break before.
|
||||
# Rule LB 3
|
||||
#
|
||||
# Endings. NewLine or Zero Width Space, or both. Rules 4, 5
|
||||
# Because we never break before these things, $Endings
|
||||
# appears at the end of line break rule.
|
||||
#
|
||||
$NLF = $BK | $CR | $LF | $CR $LF;
|
||||
$Endings = $SPcm* $ZW* $NLF?;
|
||||
$EndingsMandatory = $SPcm* $NLF | $SPcm* $ZW $NLF?;
|
||||
|
||||
|
||||
#
|
||||
# Openings Sequences that can precede Words, and that should not be separated from them.
|
||||
# Rules LB 9, 10
|
||||
#
|
||||
$Openings = (($QUcm $SPcm*)? $OPcm $SPcm*)*;
|
||||
|
||||
#
|
||||
# Closings Seqences that follow words, and that should not be separated from them,
|
||||
# Rule LB 8, 11, 15
|
||||
$Closings = ($SPcm*( ($CL ($SPcm* $NScm)? | $EX | $IS | $SY) $Extend*) | $BAcm | $HYcm | $NScm | $maiyamok)*;
|
||||
|
||||
#
|
||||
# Words. Includes mixed Alpha-numerics.
|
||||
# Rules 11a, 16, 17, 19, more or less.
|
||||
#
|
||||
$NumberInterior = $IDcm | ($NUcm | $ALcm | $IS $NUcm)+;
|
||||
$Number = $PR? ($OPcm | $HYcm)? $NumberInterior $CL? $POcm?; # Fancy Number 18
|
||||
$Word = (($IDcm | ($ALcm | $NUcm)+) ($POcm? | $INcm?)); # Alpha-numeric. 16, 17
|
||||
$Dashes = (($B2cm $SPcm*)*); # Dashes 11a
|
||||
$ThaiRange = $dictionary+ | $thai_etc;
|
||||
$WordLikeThing = $Number | $Word | $Dashes | $ThaiRange;
|
||||
|
||||
|
||||
|
||||
|
||||
$Word15 = ($BBcm* ($WordLikeThing)? ($BAcm | $HYcm | $NScm)*) | # Rule 15. Stuff sticks around words.
|
||||
[^[:Cc:] $BK $CR $LF $ZW $SP $GL] $Extend* | # Allow characters that don't meet the
|
||||
[^$BK $CR $LF $ZW $SP $GL ]; # more elaborate definitions for WORD
|
||||
# to be glued.
|
||||
|
||||
$GluedWord = ($GLcm | $QUcm)? $Word15 (($GLcm | $QUcm) $Word15)*; # "Glue" will stick anything below it together.
|
||||
# Rules 13, 14
|
||||
|
||||
#
|
||||
# The actual rules, a combination of everything defined above.
|
||||
#
|
||||
$Openings $GluedWord $Closings $paiyannoi? $EndingsMandatory;
|
||||
$Openings $GluedWord $Closings $Endings;
|
||||
|
||||
$Openings $GluedWord $Closings $paiyannoi /
|
||||
([^\u0e25 $Extend] | \u0e25[^$paiyannoi $Extend]);
|
||||
|
||||
|
||||
#"$word($nbsp+$word)*$paiyannoi/([^[\u0e25$_ignore_]]|"
|
||||
# + "\u0e25[^$paiyannoi$_ignore_]);"
|
||||
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
#
|
||||
# Back up to a hard break.
|
||||
# TODO: make smarter reverse rules for better efficiency
|
||||
#
|
||||
! . . [^$BK | $CR | $LF]* (. | $LF $CR);
|
||||
! .*;
|
80
icu4c/source/data/brkitr/sent.txt
Normal file
80
icu4c/source/data/brkitr/sent.txt
Normal file
|
@ -0,0 +1,80 @@
|
|||
# file: sent.txt Sentence Boundary Rules.
|
||||
#
|
||||
|
||||
|
||||
# Separators are line or paragraph ends that will attach to the end of sentences.
|
||||
$Sep =[\n \r \u0085 \u2028 \u2029];
|
||||
$SepSeq = $Sep | \u000d\u000a;
|
||||
$Sp = [[:Zs:] - $Sep];
|
||||
|
||||
# $ATerm contains ambiguous terminators, characters that may or may not terminate
|
||||
# sentence depending on the context.
|
||||
# $Term contains $ATerm + all characters that unambiguously end sentences.
|
||||
#
|
||||
$ATerm = [\u002e \u0589 \u3001]; # same as Terminal_Punctuation2 from TR29
|
||||
$Term = [$ATerm \u0021 \u003f \u037e \u061f \u06d4 \u203c \u203d
|
||||
\u3002 \u2048 \u2049
|
||||
\u0964]; # TODO: these (this line) not yet decided in TR29.
|
||||
|
||||
$Lower = [[:Ll:] [:Sk:]];
|
||||
$Upper = [[:Lu:] [:Lt:]];
|
||||
$NotLetter = [^[:L:] $Term];
|
||||
$Open = [:Ps:];
|
||||
$Close = [[:Pe:] \" \'];
|
||||
|
||||
#
|
||||
# Combining chars. Copied from UNIDATA/DerivedCoreProperties.txt
|
||||
#
|
||||
$Extend =
|
||||
[\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
|
||||
\u05BB-\u05BD \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06DC
|
||||
\u06DE \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711 \u0730-\u074A
|
||||
\u07A6-\u07B0 \u0901-\u0902 \u0903 \u093C \u093E-\u0940 \u0941-\u0948
|
||||
\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981 \u0982-\u0983 \u09BC
|
||||
\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7 \u09E2-\u09E3
|
||||
\u0A02 \u0A3C \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
|
||||
\u0A70-\u0A71 \u0A81-\u0A82 \u0A83 \u0ABC \u0ABE-\u0AC0 \u0AC1-\u0AC5
|
||||
\u0AC7-\u0AC8 \u0AC9 \u0ACB-\u0ACC \u0B01 \u0B02-\u0B03 \u0B3C \u0B3E
|
||||
\u0B3F \u0B40 \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56 \u0B57
|
||||
\u0B82 \u0BBE-\u0BBF \u0BC0 \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
|
||||
\u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
|
||||
\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE \u0CBF \u0CC0-\u0CC4 \u0CC6
|
||||
\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
|
||||
\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57 \u0D82-\u0D83 \u0DCF-\u0DD1
|
||||
\u0DD2-\u0DD4 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E39
|
||||
\u0E47-\u0E4E \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
|
||||
\u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F \u0F80-\u0F84
|
||||
\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C \u102D-\u1030 \u1031
|
||||
\u1032 \u1036-\u1037 \u1038 \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
|
||||
\u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
|
||||
\u17BE-\u17C5 \u17C6 \u17C7-\u17C8 \u17C9-\u17D1 \u17D3 \u180B-\u180D
|
||||
\u18A9 \u20D0-\u20DC \u20DD-\u20E0 \u20E1 \u20E2-\u20E4 \u20E5-\u20EA
|
||||
\u302A-\u302F \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
|
||||
\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172
|
||||
\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
|
||||
|
||||
|
||||
$EndSequence = [^$Term]* $Term ($Close | $Term | $Extend)* $Sp* $SepSeq?;
|
||||
$LowerWordFollows = [^$Term]* $ATerm $Close* $Sp* $SepSeq? $NotLetter* $Lower;
|
||||
$UpperWordPrecedes = [^$Term]* $Upper ($Lower | $Extend)* $ATerm $Close* $Sp* $SepSeq?;
|
||||
|
||||
|
||||
($LowerWordFollows | $UpperWordPrecedes)* $EndSequence;
|
||||
|
||||
#
|
||||
# In cases where the input text ends without a normal end-of-sentence sequence,
|
||||
# this rule will match whatever text is there.
|
||||
#
|
||||
[^$Term]*;
|
||||
|
||||
|
||||
#
|
||||
# Reverse Rules
|
||||
#
|
||||
$RevEndSequence = [^$Term]* ($Term | $Close | $Extend)* [^$Term]*;
|
||||
$ReverseLowerWordFollows = $Lower ($Close | $Sp | $Sep | $Extend | $NotLetter)* $ATerm [^$Term]*;
|
||||
$ReverseUpperWordPrecedes = $ATerm ($Lower | $Extend)* $Upper [^$Term]*;
|
||||
|
||||
! $RevEndSequence? ($ReverseLowerWordFollows | $ReverseUpperWordPrecedes)* $Term?;
|
||||
!.;
|
||||
|
27
icu4c/source/data/brkitr/title.txt
Normal file
27
icu4c/source/data/brkitr/title.txt
Normal file
|
@ -0,0 +1,27 @@
|
|||
#
|
||||
# Title Casing Break Rules
|
||||
#
|
||||
|
||||
$CaseIgnorable = [[:Mn:][:Me:][:Cf:][:Lm:][:Sk:] \u0027 \u00AD \u2019];
|
||||
$OtherUpperCase = [\u2160-\u216f \u24b6-\u24cf];
|
||||
$OtherLowerCase = [\u02b0-\u02b8 \u02c0-\u02c1 \u02e0-\u02e4 \u0345\u037a \u2170-\u217f \u24d0-\u24e9];
|
||||
$Cased = [[:Lu:][:Lt:][:Ll:] $OtherUpperCase $OtherLowerCase - $CaseIgnorable];
|
||||
$NotCased = [^ $Cased $CaseIgnorable];
|
||||
|
||||
#
|
||||
# If the iterator was not stopped on a cased character, advance it to the first cased char
|
||||
#
|
||||
($NotCased | $CaseIgnorable)*;
|
||||
|
||||
#
|
||||
# If the iterator starts on a cased item, advance through all adjacent cased items plus
|
||||
# any non-cased stuff, to reach the start of the next word.
|
||||
#
|
||||
$Cased ($Cased | $CaseIgnorable)* $NotCased*;
|
||||
|
||||
|
||||
#
|
||||
# Reverse Rules
|
||||
#
|
||||
!$NotCased* ($Cased | $CaseIgnorable)* $NotCased*;
|
||||
|
160
icu4c/source/data/brkitr/word.txt
Normal file
160
icu4c/source/data/brkitr/word.txt
Normal file
|
@ -0,0 +1,160 @@
|
|||
#
|
||||
# word.txt Word Breaking Rules for ICU Rules Based Break Iterator.
|
||||
#
|
||||
|
||||
|
||||
$Hiragana = [[:L:] & [:Hira:]];
|
||||
$Katakana = [[:L:] & [:Kana:]];
|
||||
|
||||
#
|
||||
# Definition of $Ideographic is from TR14, Line Breaking.
|
||||
#
|
||||
$Ideographic =
|
||||
[ \u1100-\u1159 \u115F \u2E80-\u2E99 \u2E9B-\u2EF3 \u2F00-\u2FD5 \u2FF0-\u2FFB
|
||||
\u3000 \u3003-\u3004 \u3006-\u3007 \u3012-\u3013 \u3020-\u3029
|
||||
\u3030-\u303A \u303D-\u303F \u3042 \u3044 \u3046 \u3048 \u304A-\u3062
|
||||
\u3064-\u3082 \u3084 \u3086 \u3088-\u308D \u308F-\u3094 \u309F
|
||||
\u30A2 \u30A4 \u30A6 \u30A8 \u30AA-\u30C2 \u30C4-\u30E2 \u30E4
|
||||
\u30E6 \u30E8-\u30ED \u30EF-\u30F4 \u30F7-\u30FA \u30FC \u30FE-\u30FF
|
||||
\u3105-\u312C \u3131-\u318E \u3190-\u31B7 \u3200-\u321C \u3220-\u3243
|
||||
\u3251-\u327B \u327F-\u32CB \u32D0-\u32FE \u3300-\u3376 \u337B-\u33DD
|
||||
\u33E0-\u33FE \u3400-\u4DB5 \u4E00-\u9FA5 \uA000-\uA48C \uA490-\uA4C6
|
||||
\uAC00-\uD7A3 \uF900-\uFA2D \uFA30-\uFA6A \uFE30-\uFE34 \uFE45-\uFE46
|
||||
\uFE49-\uFE4F \uFE51 \uFE58 \uFE5F-\uFE66 \uFE68 \uFE6B \uFF02-\uFF03
|
||||
\uFF06-\uFF07 \uFF0A-\uFF0B \uFF0D \uFF0F-\uFF19 \uFF1C-\uFF1E
|
||||
\uFF20-\uFF3A \uFF3C \uFF3E-\uFF5A \uFF5C \uFF5E \uFFE2-\uFFE4
|
||||
\U00020000-\U0002A6D6 \U0002F800-\U0002FA1D];
|
||||
|
||||
#
|
||||
# These definitions are from the character break rules.
|
||||
#
|
||||
$CGJ = [\u034f]; #Combining Grapheme Joiner
|
||||
$Link = [\u094D \u09CD \u0A4D \u0ACD \u0B4D \u0BCD \u0C4D \u0CCD \u0D4D \u0DCA \u0E3A \u1039 \u17D2];
|
||||
$NotControl = [^[:Zl:] [:Zp:] [:Cc:]]; #Line Separator,
|
||||
#Paragraph Separtor,
|
||||
# General Category == Control
|
||||
$Extend = # From UNIDATA/DerivedCoreProperties.txt
|
||||
[\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
|
||||
\u05BB-\u05BD \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06DC
|
||||
\u06DE \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711 \u0730-\u074A
|
||||
\u07A6-\u07B0 \u0901-\u0902 \u0903 \u093C \u093E-\u0940 \u0941-\u0948
|
||||
\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981 \u0982-\u0983 \u09BC
|
||||
\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7 \u09E2-\u09E3
|
||||
\u0A02 \u0A3C \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
|
||||
\u0A70-\u0A71 \u0A81-\u0A82 \u0A83 \u0ABC \u0ABE-\u0AC0 \u0AC1-\u0AC5
|
||||
\u0AC7-\u0AC8 \u0AC9 \u0ACB-\u0ACC \u0B01 \u0B02-\u0B03 \u0B3C \u0B3E
|
||||
\u0B3F \u0B40 \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56 \u0B57
|
||||
\u0B82 \u0BBE-\u0BBF \u0BC0 \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
|
||||
\u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
|
||||
\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE \u0CBF \u0CC0-\u0CC4 \u0CC6
|
||||
\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
|
||||
\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57 \u0D82-\u0D83 \u0DCF-\u0DD1
|
||||
\u0DD2-\u0DD4 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E39
|
||||
\u0E47-\u0E4E \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
|
||||
\u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F \u0F80-\u0F84
|
||||
\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C \u102D-\u1030 \u1031
|
||||
\u1032 \u1036-\u1037 \u1038 \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
|
||||
\u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
|
||||
\u17BE-\u17C5 \u17C6 \u17C7-\u17C8 \u17C9-\u17D1 \u17D3 \u180B-\u180D
|
||||
\u18A9 \u20D0-\u20DC \u20DD-\u20E0 \u20E1 \u20E2-\u20E4 \u20E5-\u20EA
|
||||
\u302A-\u302F \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
|
||||
\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172
|
||||
\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
|
||||
|
||||
#
|
||||
# Korean, also taken from character break rules.
|
||||
#
|
||||
#
|
||||
# Korean Syllable Sequences
|
||||
#
|
||||
$L = [\u1100-\u115f];
|
||||
$V = [\u1160-\u11a2];
|
||||
$T = [\u11a8-\u11f9];
|
||||
$LV = [ \uac00 \uac1c \uac38 \uac54 \uac70 \uac8c \uaca8 \uacc4 \uace0 \uacfc \uad18 \uad34 \uad50 \uad6c \uad88 \uada4
|
||||
\uadc0 \uaddc \uadf8 \uae14 \uae30 \uae4c \uae68 \uae84 \uaea0 \uaebc \uaed8 \uaef4 \uaf10 \uaf2c \uaf48 \uaf64
|
||||
\uaf80 \uaf9c \uafb8 \uafd4 \uaff0 \ub00c \ub028 \ub044 \ub060 \ub07c \ub098 \ub0b4 \ub0d0 \ub0ec \ub108 \ub124
|
||||
\ub140 \ub15c \ub178 \ub194 \ub1b0 \ub1cc \ub1e8 \ub204 \ub220 \ub23c \ub258 \ub274 \ub290 \ub2ac \ub2c8 \ub2e4
|
||||
\ub300 \ub31c \ub338 \ub354 \ub370 \ub38c \ub3a8 \ub3c4 \ub3e0 \ub3fc \ub418 \ub434 \ub450 \ub46c \ub488 \ub4a4
|
||||
\ub4c0 \ub4dc \ub4f8 \ub514 \ub530 \ub54c \ub568 \ub584 \ub5a0 \ub5bc \ub5d8 \ub5f4 \ub610 \ub62c \ub648 \ub664
|
||||
\ub680 \ub69c \ub6b8 \ub6d4 \ub6f0 \ub70c \ub728 \ub744 \ub760 \ub77c \ub798 \ub7b4 \ub7d0 \ub7ec \ub808 \ub824
|
||||
\ub840 \ub85c \ub878 \ub894 \ub8b0 \ub8cc \ub8e8 \ub904 \ub920 \ub93c \ub958 \ub974 \ub990 \ub9ac \ub9c8 \ub9e4
|
||||
\uba00 \uba1c \uba38 \uba54 \uba70 \uba8c \ubaa8 \ubac4 \ubae0 \ubafc \ubb18 \ubb34 \ubb50 \ubb6c \ubb88 \ubba4
|
||||
\ubbc0 \ubbdc \ubbf8 \ubc14 \ubc30 \ubc4c \ubc68 \ubc84 \ubca0 \ubcbc \ubcd8 \ubcf4 \ubd10 \ubd2c \ubd48 \ubd64
|
||||
\ubd80 \ubd9c \ubdb8 \ubdd4 \ubdf0 \ube0c \ube28 \ube44 \ube60 \ube7c \ube98 \ubeb4 \ubed0 \ubeec \ubf08 \ubf24
|
||||
\ubf40 \ubf5c \ubf78 \ubf94 \ubfb0 \ubfcc \ubfe8 \uc004 \uc020 \uc03c \uc058 \uc074 \uc090 \uc0ac \uc0c8 \uc0e4
|
||||
\uc100 \uc11c \uc138 \uc154 \uc170 \uc18c \uc1a8 \uc1c4 \uc1e0 \uc1fc \uc218 \uc234 \uc250 \uc26c \uc288 \uc2a4
|
||||
\uc2c0 \uc2dc \uc2f8 \uc314 \uc330 \uc34c \uc368 \uc384 \uc3a0 \uc3bc \uc3d8 \uc3f4 \uc410 \uc42c \uc448 \uc464
|
||||
\uc480 \uc49c \uc4b8 \uc4d4 \uc4f0 \uc50c \uc528 \uc544 \uc560 \uc57c \uc598 \uc5b4 \uc5d0 \uc5ec \uc608 \uc624
|
||||
\uc640 \uc65c \uc678 \uc694 \uc6b0 \uc6cc \uc6e8 \uc704 \uc720 \uc73c \uc758 \uc774 \uc790 \uc7ac \uc7c8 \uc7e4
|
||||
\uc800 \uc81c \uc838 \uc854 \uc870 \uc88c \uc8a8 \uc8c4 \uc8e0 \uc8fc \uc918 \uc934 \uc950 \uc96c \uc988 \uc9a4
|
||||
\uc9c0 \uc9dc \uc9f8 \uca14 \uca30 \uca4c \uca68 \uca84 \ucaa0 \ucabc \ucad8 \ucaf4 \ucb10 \ucb2c \ucb48 \ucb64
|
||||
\ucb80 \ucb9c \ucbb8 \ucbd4 \ucbf0 \ucc0c \ucc28 \ucc44 \ucc60 \ucc7c \ucc98 \uccb4 \uccd0 \uccec \ucd08 \ucd24
|
||||
\ucd40 \ucd5c \ucd78 \ucd94 \ucdb0 \ucdcc \ucde8 \uce04 \uce20 \uce3c \uce58 \uce74 \uce90 \uceac \ucec8 \ucee4
|
||||
\ucf00 \ucf1c \ucf38 \ucf54 \ucf70 \ucf8c \ucfa8 \ucfc4 \ucfe0 \ucffc \ud018 \ud034 \ud050 \ud06c \ud088 \ud0a4
|
||||
\ud0c0 \ud0dc \ud0f8 \ud114 \ud130 \ud14c \ud168 \ud184 \ud1a0 \ud1bc \ud1d8 \ud1f4 \ud210 \ud22c \ud248 \ud264
|
||||
\ud280 \ud29c \ud2b8 \ud2d4 \ud2f0 \ud30c \ud328 \ud344 \ud360 \ud37c \ud398 \ud3b4 \ud3d0 \ud3ec \ud408 \ud424
|
||||
\ud440 \ud45c \ud478 \ud494 \ud4b0 \ud4cc \ud4e8 \ud504 \ud520 \ud53c \ud558 \ud574 \ud590 \ud5ac \ud5c8 \ud5e4
|
||||
\ud600 \ud61c \ud638 \ud654 \ud670 \ud68c \ud6a8 \ud6c4 \ud6e0 \ud6fc \ud718 \ud734 \ud750 \ud76c \ud788 ];
|
||||
$LVT = [[\uac00-\ud7a3] - $LV];
|
||||
$Hangul_Sequence = ((($L+ $LV?) | ($L* $LV)) $V* $T* ) | ($L* $LVT $T*);
|
||||
|
||||
|
||||
|
||||
$LineBreak = [$Ideographic $Hiragana $Katakana];
|
||||
$Letter = [[[:L:] [:Sk:]] & [^$LineBreak]];
|
||||
#$MidLetter = [\u0027 \u2019 \u0029 \u00ad \u05f3 \u05f4];
|
||||
$MidLetter = [\u0027 \u2019 \u003a \u0029 \u00ad \u05f3 \u05f4];
|
||||
|
||||
|
||||
|
||||
$Base = [^[:Cc:] [:Cf:] [:Cs:] [:Co:] [:Cn:] [:Zl:] [:Zp:] $Extend $Link $CGJ];
|
||||
$LetterBase = [:L:];
|
||||
$CGJSequence = $CGJ+ ($Base | $Hangul_Sequence);
|
||||
$Join_Control = [\u200d-\u200e]; # Zero Width Joiner, Zero Width Non-Joiner
|
||||
$LinkSequence = $Link+ $Extend* $Join_Control? $LetterBase;
|
||||
$LetterEx = ($Letter | $Hangul_Sequence) $Extend* ((($LinkSequence | $CGJSequence) $Extend*)*);
|
||||
|
||||
|
||||
|
||||
#
|
||||
# Numeric Definitions
|
||||
# TODO: More complete handling of $Extend combining chars.
|
||||
#
|
||||
$Numeric = [:Nd:]; #TODO remove FULL WIDTH
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$InfixNumeric = [\u002c \u002e \u003a \u003b \u0589];
|
||||
$PostfixNumeric = [\% \u00a2 \u00b0 \u2030 \u2031 \u2032-\u2037 \u20a7
|
||||
\u2103 \u2109 \u2126 \ufe6a \uff05 \uffe0];
|
||||
$PrefixNumeric = [[[:Sc:] \u002b \u005c \u00b1 \u2116 \u2212 \u2213 \-] - [$PostfixNumeric]];
|
||||
|
||||
$NumericPrefix = $PrefixNumeric $NumericEx ($InfixNumeric $NumericEx)?;
|
||||
$NumericInterior = $NumericEx ($InfixNumeric? $NumericEx)*;
|
||||
|
||||
|
||||
#
|
||||
# The Big Rule. Gloms everything together.
|
||||
#
|
||||
$NumericPrefix? (($LetterEx ($MidLetter $LetterEx)*)? $NumericInterior?)* ($NumericInterior $PostfixNumeric)?;
|
||||
|
||||
#
|
||||
# Lesser rules
|
||||
#
|
||||
($Hiragana $Extend*)*;
|
||||
($Katakana $Extend*)*;
|
||||
$NotControl $Extend*;
|
||||
\r\n;
|
||||
.;
|
||||
|
||||
#
|
||||
# Reverse Rules. Back up over any of the chars that can group together.
|
||||
# (Reverse rules do not need to be exact; they can back up a bit too far,
|
||||
# but must back up at least enough.)
|
||||
#
|
||||
! ( $Letter | $MidLetter | $Numeric | $PrefixNumeric | $Join_Control |
|
||||
$CGJ | $Link | $InfixNumeric | $PostfixNumeric | $Extend |
|
||||
$T | $V | $L | $LV | $LVT)*;
|
||||
! ($Hiragana | $Extend)*;
|
||||
! ($Katakana | $Extend)*;
|
||||
! $Extend* .;
|
||||
! \n\r;
|
||||
#!.*;
|
177
icu4c/source/data/brkitr/word_th.txt
Normal file
177
icu4c/source/data/brkitr/word_th.txt
Normal file
|
@ -0,0 +1,177 @@
|
|||
#
|
||||
# word.txt Word Breaking Rules for ICU Rules Based Break Iterator.
|
||||
#
|
||||
|
||||
|
||||
$Hiragana = [[:L:] & [:Hira:]];
|
||||
$Katakana = [[:L:] & [:Kana:]];
|
||||
|
||||
#
|
||||
# Definition of $Ideographic is from TR14, Line Breaking.
|
||||
#
|
||||
$Ideographic =
|
||||
[ \u1100-\u1159 \u115F \u2E80-\u2E99 \u2E9B-\u2EF3 \u2F00-\u2FD5 \u2FF0-\u2FFB
|
||||
\u3000 \u3003-\u3004 \u3006-\u3007 \u3012-\u3013 \u3020-\u3029
|
||||
\u3030-\u303A \u303D-\u303F \u3042 \u3044 \u3046 \u3048 \u304A-\u3062
|
||||
\u3064-\u3082 \u3084 \u3086 \u3088-\u308D \u308F-\u3094 \u309F
|
||||
\u30A2 \u30A4 \u30A6 \u30A8 \u30AA-\u30C2 \u30C4-\u30E2 \u30E4
|
||||
\u30E6 \u30E8-\u30ED \u30EF-\u30F4 \u30F7-\u30FA \u30FC \u30FE-\u30FF
|
||||
\u3105-\u312C \u3131-\u318E \u3190-\u31B7 \u3200-\u321C \u3220-\u3243
|
||||
\u3251-\u327B \u327F-\u32CB \u32D0-\u32FE \u3300-\u3376 \u337B-\u33DD
|
||||
\u33E0-\u33FE \u3400-\u4DB5 \u4E00-\u9FA5 \uA000-\uA48C \uA490-\uA4C6
|
||||
\uAC00-\uD7A3 \uF900-\uFA2D \uFA30-\uFA6A \uFE30-\uFE34 \uFE45-\uFE46
|
||||
\uFE49-\uFE4F \uFE51 \uFE58 \uFE5F-\uFE66 \uFE68 \uFE6B \uFF02-\uFF03
|
||||
\uFF06-\uFF07 \uFF0A-\uFF0B \uFF0D \uFF0F-\uFF19 \uFF1C-\uFF1E
|
||||
\uFF20-\uFF3A \uFF3C \uFF3E-\uFF5A \uFF5C \uFF5E \uFFE2-\uFFE4
|
||||
\U00020000-\U0002A6D6 \U0002F800-\U0002FA1D];
|
||||
|
||||
#
|
||||
# These definitions are from the character break rules.
|
||||
#
|
||||
$CGJ = [\u034f]; #Combining Grapheme Joiner
|
||||
$Link = [\u094D \u09CD \u0A4D \u0ACD \u0B4D \u0BCD \u0C4D \u0CCD \u0D4D \u0DCA \u0E3A \u1039 \u17D2];
|
||||
$NotControl = [^[:Zl:] [:Zp:] [:Cc:]]; #Line Separator,
|
||||
#Paragraph Separtor,
|
||||
# General Category == Control
|
||||
$Extend = # From UNIDATA/DerivedCoreProperties.txt
|
||||
[\u0300-\u034E \u0360-\u036F \u0483-\u0486 \u0488-\u0489 \u0591-\u05A1 \u05A3-\u05B9
|
||||
\u05BB-\u05BD \u05BF \u05C1-\u05C2 \u05C4 \u064B-\u0655 \u0670 \u06D6-\u06DC
|
||||
\u06DE \u06DF-\u06E4 \u06E7-\u06E8 \u06EA-\u06ED \u0711 \u0730-\u074A
|
||||
\u07A6-\u07B0 \u0901-\u0902 \u0903 \u093C \u093E-\u0940 \u0941-\u0948
|
||||
\u0949-\u094C \u0951-\u0954 \u0962-\u0963 \u0981 \u0982-\u0983 \u09BC
|
||||
\u09BE-\u09C0 \u09C1-\u09C4 \u09C7-\u09C8 \u09CB-\u09CC \u09D7 \u09E2-\u09E3
|
||||
\u0A02 \u0A3C \u0A3E-\u0A40 \u0A41-\u0A42 \u0A47-\u0A48 \u0A4B-\u0A4C
|
||||
\u0A70-\u0A71 \u0A81-\u0A82 \u0A83 \u0ABC \u0ABE-\u0AC0 \u0AC1-\u0AC5
|
||||
\u0AC7-\u0AC8 \u0AC9 \u0ACB-\u0ACC \u0B01 \u0B02-\u0B03 \u0B3C \u0B3E
|
||||
\u0B3F \u0B40 \u0B41-\u0B43 \u0B47-\u0B48 \u0B4B-\u0B4C \u0B56 \u0B57
|
||||
\u0B82 \u0BBE-\u0BBF \u0BC0 \u0BC1-\u0BC2 \u0BC6-\u0BC8 \u0BCA-\u0BCC \u0BD7
|
||||
\u0C01-\u0C03 \u0C3E-\u0C40 \u0C41-\u0C44 \u0C46-\u0C48 \u0C4A-\u0C4C
|
||||
\u0C55-\u0C56 \u0C82-\u0C83 \u0CBE \u0CBF \u0CC0-\u0CC4 \u0CC6
|
||||
\u0CC7-\u0CC8 \u0CCA-\u0CCB \u0CCC \u0CD5-\u0CD6 \u0D02-\u0D03 \u0D3E-\u0D40
|
||||
\u0D41-\u0D43 \u0D46-\u0D48 \u0D4A-\u0D4C \u0D57 \u0D82-\u0D83 \u0DCF-\u0DD1
|
||||
\u0DD2-\u0DD4 \u0DD6 \u0DD8-\u0DDF \u0DF2-\u0DF3 \u0E31 \u0E34-\u0E39
|
||||
\u0E47-\u0E4E \u0EB1 \u0EB4-\u0EB9 \u0EBB-\u0EBC \u0EC8-\u0ECD \u0F18-\u0F19
|
||||
\u0F35 \u0F37 \u0F39 \u0F3E-\u0F3F \u0F71-\u0F7E \u0F7F \u0F80-\u0F84
|
||||
\u0F86-\u0F87 \u0F90-\u0F97 \u0F99-\u0FBC \u0FC6 \u102C \u102D-\u1030 \u1031
|
||||
\u1032 \u1036-\u1037 \u1038 \u1056-\u1057 \u1058-\u1059 \u1712-\u1714
|
||||
\u1732-\u1734 \u1752-\u1753 \u1772-\u1773 \u17B4-\u17B6 \u17B7-\u17BD
|
||||
\u17BE-\u17C5 \u17C6 \u17C7-\u17C8 \u17C9-\u17D1 \u17D3 \u180B-\u180D
|
||||
\u18A9 \u20D0-\u20DC \u20DD-\u20E0 \u20E1 \u20E2-\u20E4 \u20E5-\u20EA
|
||||
\u302A-\u302F \u3099-\u309A \uFB1E \uFE00-\uFE0F \uFE20-\uFE23 \uFF9E-\uFF9F
|
||||
\U0001D165-\U0001D166 \U0001D167-\U0001D169 \U0001D16D-\U0001D172
|
||||
\U0001D17B-\U0001D182 \U0001D185-\U0001D18B \U0001D1AA-\U0001D1AD];
|
||||
|
||||
#
|
||||
# Korean, also taken from character break rules.
|
||||
#
|
||||
#
|
||||
# Korean Syllable Sequences
|
||||
#
|
||||
$L = [\u1100-\u115f];
|
||||
$V = [\u1160-\u11a2];
|
||||
$T = [\u11a8-\u11f9];
|
||||
$LV = [ \uac00 \uac1c \uac38 \uac54 \uac70 \uac8c \uaca8 \uacc4 \uace0 \uacfc \uad18 \uad34 \uad50 \uad6c \uad88 \uada4
|
||||
\uadc0 \uaddc \uadf8 \uae14 \uae30 \uae4c \uae68 \uae84 \uaea0 \uaebc \uaed8 \uaef4 \uaf10 \uaf2c \uaf48 \uaf64
|
||||
\uaf80 \uaf9c \uafb8 \uafd4 \uaff0 \ub00c \ub028 \ub044 \ub060 \ub07c \ub098 \ub0b4 \ub0d0 \ub0ec \ub108 \ub124
|
||||
\ub140 \ub15c \ub178 \ub194 \ub1b0 \ub1cc \ub1e8 \ub204 \ub220 \ub23c \ub258 \ub274 \ub290 \ub2ac \ub2c8 \ub2e4
|
||||
\ub300 \ub31c \ub338 \ub354 \ub370 \ub38c \ub3a8 \ub3c4 \ub3e0 \ub3fc \ub418 \ub434 \ub450 \ub46c \ub488 \ub4a4
|
||||
\ub4c0 \ub4dc \ub4f8 \ub514 \ub530 \ub54c \ub568 \ub584 \ub5a0 \ub5bc \ub5d8 \ub5f4 \ub610 \ub62c \ub648 \ub664
|
||||
\ub680 \ub69c \ub6b8 \ub6d4 \ub6f0 \ub70c \ub728 \ub744 \ub760 \ub77c \ub798 \ub7b4 \ub7d0 \ub7ec \ub808 \ub824
|
||||
\ub840 \ub85c \ub878 \ub894 \ub8b0 \ub8cc \ub8e8 \ub904 \ub920 \ub93c \ub958 \ub974 \ub990 \ub9ac \ub9c8 \ub9e4
|
||||
\uba00 \uba1c \uba38 \uba54 \uba70 \uba8c \ubaa8 \ubac4 \ubae0 \ubafc \ubb18 \ubb34 \ubb50 \ubb6c \ubb88 \ubba4
|
||||
\ubbc0 \ubbdc \ubbf8 \ubc14 \ubc30 \ubc4c \ubc68 \ubc84 \ubca0 \ubcbc \ubcd8 \ubcf4 \ubd10 \ubd2c \ubd48 \ubd64
|
||||
\ubd80 \ubd9c \ubdb8 \ubdd4 \ubdf0 \ube0c \ube28 \ube44 \ube60 \ube7c \ube98 \ubeb4 \ubed0 \ubeec \ubf08 \ubf24
|
||||
\ubf40 \ubf5c \ubf78 \ubf94 \ubfb0 \ubfcc \ubfe8 \uc004 \uc020 \uc03c \uc058 \uc074 \uc090 \uc0ac \uc0c8 \uc0e4
|
||||
\uc100 \uc11c \uc138 \uc154 \uc170 \uc18c \uc1a8 \uc1c4 \uc1e0 \uc1fc \uc218 \uc234 \uc250 \uc26c \uc288 \uc2a4
|
||||
\uc2c0 \uc2dc \uc2f8 \uc314 \uc330 \uc34c \uc368 \uc384 \uc3a0 \uc3bc \uc3d8 \uc3f4 \uc410 \uc42c \uc448 \uc464
|
||||
\uc480 \uc49c \uc4b8 \uc4d4 \uc4f0 \uc50c \uc528 \uc544 \uc560 \uc57c \uc598 \uc5b4 \uc5d0 \uc5ec \uc608 \uc624
|
||||
\uc640 \uc65c \uc678 \uc694 \uc6b0 \uc6cc \uc6e8 \uc704 \uc720 \uc73c \uc758 \uc774 \uc790 \uc7ac \uc7c8 \uc7e4
|
||||
\uc800 \uc81c \uc838 \uc854 \uc870 \uc88c \uc8a8 \uc8c4 \uc8e0 \uc8fc \uc918 \uc934 \uc950 \uc96c \uc988 \uc9a4
|
||||
\uc9c0 \uc9dc \uc9f8 \uca14 \uca30 \uca4c \uca68 \uca84 \ucaa0 \ucabc \ucad8 \ucaf4 \ucb10 \ucb2c \ucb48 \ucb64
|
||||
\ucb80 \ucb9c \ucbb8 \ucbd4 \ucbf0 \ucc0c \ucc28 \ucc44 \ucc60 \ucc7c \ucc98 \uccb4 \uccd0 \uccec \ucd08 \ucd24
|
||||
\ucd40 \ucd5c \ucd78 \ucd94 \ucdb0 \ucdcc \ucde8 \uce04 \uce20 \uce3c \uce58 \uce74 \uce90 \uceac \ucec8 \ucee4
|
||||
\ucf00 \ucf1c \ucf38 \ucf54 \ucf70 \ucf8c \ucfa8 \ucfc4 \ucfe0 \ucffc \ud018 \ud034 \ud050 \ud06c \ud088 \ud0a4
|
||||
\ud0c0 \ud0dc \ud0f8 \ud114 \ud130 \ud14c \ud168 \ud184 \ud1a0 \ud1bc \ud1d8 \ud1f4 \ud210 \ud22c \ud248 \ud264
|
||||
\ud280 \ud29c \ud2b8 \ud2d4 \ud2f0 \ud30c \ud328 \ud344 \ud360 \ud37c \ud398 \ud3b4 \ud3d0 \ud3ec \ud408 \ud424
|
||||
\ud440 \ud45c \ud478 \ud494 \ud4b0 \ud4cc \ud4e8 \ud504 \ud520 \ud53c \ud558 \ud574 \ud590 \ud5ac \ud5c8 \ud5e4
|
||||
\ud600 \ud61c \ud638 \ud654 \ud670 \ud68c \ud6a8 \ud6c4 \ud6e0 \ud6fc \ud718 \ud734 \ud750 \ud76c \ud788 ];
|
||||
$LVT = [[\uac00-\ud7a3] - $LV];
|
||||
$Hangul_Sequence = ((($L+ $LV?) | ($L* $LV)) $V* $T* ) | ($L* $LVT $T*);
|
||||
|
||||
|
||||
#
|
||||
# Thai Dictionary Related Rules
|
||||
#
|
||||
$dictionary = [\u0e01-\u0e2e \u0e30-\u0e3a \u0e40-\u0e44 \u0e47-\u0e4e]; # this rule breaks the iterator with mixed Thai and English
|
||||
$paiyannoi = [\u0e2f];
|
||||
$maiyamok = [\u0e46];
|
||||
$thai_etc = $paiyannoi \u0e25 $paiyannoi;
|
||||
|
||||
|
||||
$dictionary+ ($paiyannoi? $maiyamok)?;
|
||||
$dictionary+ $paiyannoi / ([^\u0e25 $maiyamok $Extend] | \u0e25[^$paiyannoi $Extend]);
|
||||
$thai_etc;
|
||||
|
||||
|
||||
#
|
||||
# Definitions for building up Letters, so that breaks will not occur
|
||||
# within a single letter (Grapheme Cluster). See the character break rules.
|
||||
#
|
||||
$LineBreak = [$Ideographic $Hiragana $Katakana];
|
||||
$Letter = [[[:L:] [:Sk:]] & [^$LineBreak $dictionary]];
|
||||
#$MidLetter = [\u0027 \u2019 \u0029 \u00ad \u05f3 \u05f4];
|
||||
$MidLetter = [\u0027 \u2019 \u003a \u0029 \u00ad \u05f3 \u05f4];
|
||||
|
||||
$Base = [^[:Cc:] [:Cf:] [:Cs:] [:Co:] [:Cn:] [:Zl:] [:Zp:] $Extend $Link $CGJ];
|
||||
$LetterBase = [:L:];
|
||||
$CGJSequence = $CGJ+ ($Base | $Hangul_Sequence);
|
||||
$Join_Control = [\u200d-\u200e]; # Zero Width Joiner, Zero Width Non-Joiner
|
||||
$LinkSequence = $Link+ $Extend* $Join_Control? $LetterBase;
|
||||
$LetterEx = ($Letter | $Hangul_Sequence) $Extend* ((($LinkSequence | $CGJSequence) $Extend*)*);
|
||||
|
||||
|
||||
|
||||
#
|
||||
# Numeric Definitions
|
||||
# TODO: More complete handling of $Extend combining chars.
|
||||
#
|
||||
$Numeric = [:Nd:]; #TODO remove FULL WIDTH
|
||||
$NumericEx = $Numeric $Extend*;
|
||||
$InfixNumeric = [\u002c \u002e \u003a \u003b \u0589];
|
||||
$PostfixNumeric = [\% \u00a2 \u00b0 \u2030 \u2031 \u2032-\u2037 \u20a7
|
||||
\u2103 \u2109 \u2126 \ufe6a \uff05 \uffe0];
|
||||
$PrefixNumeric = [[[:Sc:] \u002b \u005c \u00b1 \u2116 \u2212 \u2213 \-] - [$PostfixNumeric]];
|
||||
|
||||
$NumericPrefix = $PrefixNumeric $NumericEx ($InfixNumeric $NumericEx)?;
|
||||
$NumericInterior = $NumericEx ($InfixNumeric? $NumericEx)*;
|
||||
|
||||
|
||||
#
|
||||
# The Big Rule. Gloms everything together.
|
||||
#
|
||||
$NumericPrefix? (($LetterEx ($MidLetter $LetterEx)*)? $NumericInterior?)* ($NumericInterior $PostfixNumeric)?;
|
||||
|
||||
#
|
||||
# Lesser rules
|
||||
#
|
||||
($Hiragana $Extend*)*;
|
||||
($Katakana $Extend*)*;
|
||||
$NotControl $Extend*;
|
||||
\r\n;
|
||||
.;
|
||||
|
||||
#
|
||||
# Reverse Rules. Back up over any of the chars that can group together.
|
||||
# (Reverse rules do not need to be exact; they can back up a bit too far,
|
||||
# but must back up at least enough.)
|
||||
#
|
||||
! ( $Letter | $MidLetter | $Numeric | $PrefixNumeric | $Join_Control |
|
||||
$CGJ | $Link | $InfixNumeric | $PostfixNumeric | $Extend |
|
||||
$T | $V | $L | $LV | $LVT)*;
|
||||
! ($Hiragana | $Extend)*;
|
||||
! ($Katakana | $Extend)*;
|
||||
! $Extend* .;
|
||||
! \n\r;
|
||||
#!.*;
|
||||
|
||||
! ($dictionary | $paiyannoi | $maiyamok | \u0e25)*;
|
|
@ -228,6 +228,9 @@ ALL : GODATA "$(DLL_OUTPUT)\$(U_ICUDATA_NAME).dll" "$(TESTDATAOUT)\testdata.dat"
|
|||
@echo building testdata...
|
||||
nmake /nologo /f "$(TESTDATA)\testdata.mk" TESTDATA=. ICUTOOLS="$(ICUTOOLS)" PKGOPT="$(PKGOPT)" CFG=$(CFG) TESTDATAOUT="$(TESTDATAOUT)" ICUDATA="$(ICUDATA)" TESTDATABLD="$(TESTDATABLD)"
|
||||
|
||||
#
|
||||
# Break iterator data files.
|
||||
#
|
||||
BRK_FILES = "$(ICUBLD)\sent.brk" "$(ICUBLD)\char.brk" "$(ICUBLD)\line.brk" "$(ICUBLD)\word.brk" "$(ICUBLD)\title.brk" "$(ICUBLD)\line_th.brk" "$(ICUBLD)\word_th.brk"
|
||||
|
||||
#invoke pkgdata for ICU common data
|
||||
|
@ -262,27 +265,31 @@ $(BRK_FILES:.brk" =.brk"
|
|||
|
||||
|
||||
|
||||
# RBBI .brk file generation.
|
||||
# TODO: set up an inference rule, so these don't need to be written out one by one...
|
||||
#
|
||||
|
||||
"$(ICUBLD)\sent.brk" : "$(ICUBRK)\sentLE.brk"
|
||||
copy "$(ICUBRK)\sentLE.brk" "$(ICUBLD)\sent.brk"
|
||||
"$(ICUBLD)\char.brk" : "$(ICUBRK)\char.txt" "$(ICUBLD)\uprops.dat"
|
||||
genbrk -r "$(ICUBRK)\char.txt" -o "$(ICUBLD)\char.brk"
|
||||
|
||||
"$(ICUBLD)\char.brk" : "$(ICUBRK)\charLE.brk"
|
||||
copy "$(ICUBRK)\charLE.brk" "$(ICUBLD)\char.brk"
|
||||
"$(ICUBLD)\word.brk" : "$(ICUBRK)\word.txt" "$(ICUBLD)\uprops.dat"
|
||||
genbrk -r "$(ICUBRK)\word.txt" -o "$(ICUBLD)\word.brk"
|
||||
|
||||
"$(ICUBLD)\line.brk" : "$(ICUBRK)\lineLE.brk"
|
||||
copy "$(ICUBRK)\lineLE.brk" "$(ICUBLD)\line.brk"
|
||||
"$(ICUBLD)\line.brk" : "$(ICUBRK)\line.txt" "$(ICUBLD)\uprops.dat"
|
||||
genbrk -r "$(ICUBRK)\line.txt" -o "$(ICUBLD)\line.brk"
|
||||
|
||||
"$(ICUBLD)\word.brk" : "$(ICUBRK)\wordLE.brk"
|
||||
copy "$(ICUBRK)\wordLE.brk" "$(ICUBLD)\word.brk"
|
||||
"$(ICUBLD)\sent.brk" : "$(ICUBRK)\sent.txt" "$(ICUBLD)\uprops.dat"
|
||||
genbrk -r "$(ICUBRK)\sent.txt" -o "$(ICUBLD)\sent.brk"
|
||||
|
||||
"$(ICUBLD)\title.brk" : "$(ICUBRK)\titleLE.brk"
|
||||
copy "$(ICUBRK)\titleLE.brk" "$(ICUBLD)\title.brk"
|
||||
"$(ICUBLD)\title.brk" : "$(ICUBRK)\title.txt" "$(ICUBLD)\uprops.dat"
|
||||
genbrk -r "$(ICUBRK)\title.txt" -o "$(ICUBLD)\title.brk"
|
||||
|
||||
"$(ICUBLD)\line_th.brk" : "$(ICUBRK)\line_thLE.brk"
|
||||
copy "$(ICUBRK)\line_thLE.brk" "$(ICUBLD)\line_th.brk"
|
||||
"$(ICUBLD)\word_th.brk" : "$(ICUBRK)\word_th.txt" "$(ICUBLD)\uprops.dat"
|
||||
genbrk -r "$(ICUBRK)\word_th.txt" -o "$(ICUBLD)\word_th.brk"
|
||||
|
||||
"$(ICUBLD)\line_th.brk" : "$(ICUBRK)\line_th.txt" "$(ICUBLD)\uprops.dat"
|
||||
genbrk -r "$(ICUBRK)\line_th.txt" -o "$(ICUBLD)\line_th.brk"
|
||||
|
||||
"$(ICUBLD)\word_th.brk" : "$(ICUBRK)\word_thLE.brk"
|
||||
copy "$(ICUBRK)\word_thLE.brk" "$(ICUBLD)\word_th.brk"
|
||||
|
||||
# utility target to send us to the right dir
|
||||
GODATA :
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "unicode/ucol.h"
|
||||
#include <unicode/ucol.h>
|
||||
|
||||
// Very simple example code - sticks a sortkey in the buffer
|
||||
// Not much error checking
|
||||
|
|
|
@ -1752,6 +1752,13 @@ void addBrkIterRegrTest(TestNode** root);
|
|||
|
||||
void addBrkIterRegrTest(TestNode** root)
|
||||
{
|
||||
|
||||
#if 0
|
||||
/* These tests are removed becaue
|
||||
* 1. The test data is completely redundant with that in the C++ break iterator tests
|
||||
* 2. The data here is stale, and I don't want to copy all of the changes from the C++ tests, and
|
||||
* 3. The C API is covered by the API tests.
|
||||
*/
|
||||
|
||||
addTest(root, &TestForwardWordSelection, "tstxtbd/cregrtst/TestForwardWordSelection" );
|
||||
addTest(root, &TestBackwardWordSelection, "tstxtbd/cregrtst/TestBackwardWordSelection" );
|
||||
|
@ -1787,6 +1794,6 @@ void addBrkIterRegrTest(TestNode** root)
|
|||
addTest(root, &TestSentenceInvariants, "tstxtbd/cregrtst/TestSentenceInvariants");
|
||||
addTest(root, &TestCharacterInvariants, "tstxtbd/cregrtst/TestCharacterInvariants");
|
||||
addTest(root, &TestLineInvariants, "tstxtbd/cregrtst/TestLineInvariants");
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
#include "intltest.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/unicode.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include <stdio.h>
|
||||
//#include "txbdapi.h" // BreakIteratorAPIC
|
||||
|
||||
|
@ -161,7 +162,7 @@ void IntlTestTextBoundary::addTestWordData()
|
|||
wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A3))); //pound sign
|
||||
wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A4))); //currency sign
|
||||
wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A5))); //yen sign
|
||||
wordSelectionData->addElement("alpha-beta-gamma");
|
||||
wordSelectionData->addElement(CharsToUnicodeString("alpha\\u00adbeta\\u00adgamma"));
|
||||
wordSelectionData->addElement(".");
|
||||
wordSelectionData->addElement(" ");
|
||||
wordSelectionData->addElement("Badges");
|
||||
|
@ -261,9 +262,16 @@ void IntlTestTextBoundary::addTestWordData()
|
|||
// this is a test for bug #4117554: the ideographic iteration mark (U+3005) should
|
||||
// count as a Kanji character for the purposes of word breaking
|
||||
wordSelectionData->addElement("abc");
|
||||
wordSelectionData->addElement(CharsToUnicodeString("\\u4e01\\u4e02\\u3005\\u4e03\\u4e03"));
|
||||
// Unicode TR29: Ideographs do NOT group together into words.
|
||||
//wordSelectionData->addElement(CharsToUnicodeString("\\u4e01\\u4e02\\u3005\\u4e03\\u4e03"));
|
||||
wordSelectionData->addElement(CharsToUnicodeString("\\u4e01"));
|
||||
wordSelectionData->addElement(CharsToUnicodeString("\\u4e02"));
|
||||
wordSelectionData->addElement(CharsToUnicodeString("\\u3005"));
|
||||
wordSelectionData->addElement(CharsToUnicodeString("\\u4e03"));
|
||||
wordSelectionData->addElement(CharsToUnicodeString("\\u4e03"));
|
||||
wordSelectionData->addElement("abc");
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
@ -306,36 +314,38 @@ void IntlTestTextBoundary::addTestSentenceData()
|
|||
sentenceSelectionData->addElement("Yes, I am definatelly 12\" tall!!");
|
||||
|
||||
// test for bug #4113835: \n and \r count as spaces, not as paragraph breaks
|
||||
sentenceSelectionData->addElement(CharsToUnicodeString("Now\ris\nthe\r\ntime\n\rfor\r\rall\\u2029"));
|
||||
sentenceSelectionData->addElement(CharsToUnicodeString("Now\ris\nthe\r\ntime\n\rfor\r\rall\\u037e"));
|
||||
|
||||
// test for bug #4111338: Don't break sentences at the boundary between CJK
|
||||
// and other letters
|
||||
sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165:\"JAVA\\u821c")
|
||||
sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165:\"JAVA\\u821c")
|
||||
+ CharsToUnicodeString("\\u8165\\u7fc8\\u51ce\\u306d,\\u2494\\u56d8\\u4ec0\\u60b1\\u8560\\u51ba")
|
||||
+ CharsToUnicodeString("\\u611d\\u57b6\\u2510\\u5d46\".\\u2029"));
|
||||
sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8")
|
||||
sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8")
|
||||
+ CharsToUnicodeString("\\u97e4JAVA\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8\\u4ec0")
|
||||
+ CharsToUnicodeString("\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u2029"));
|
||||
sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8\\u97e4")
|
||||
+ CharsToUnicodeString("\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u3002"));
|
||||
sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8\\u97e4")
|
||||
+ CharsToUnicodeString("\\u6470\\u8790JAVA\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8")
|
||||
+ CharsToUnicodeString("\\u4ec0\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u2029"));
|
||||
sentenceSelectionData->addElement(CharsToUnicodeString("He said, \"I can go there.\"\\u2029"));
|
||||
+ CharsToUnicodeString("\\u4ec0\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u2048"));
|
||||
sentenceSelectionData->addElement(CharsToUnicodeString("He said, \"I can go there.\"\\u2029"));
|
||||
|
||||
// test for bug #4117554: Treat fullwidth variants of .!? the same as their
|
||||
// normal counterparts
|
||||
#if 0 // Not according to TR29. TODO: what is the right thing for these chars?
|
||||
sentenceSelectionData->addElement(CharsToUnicodeString("I know I'm right\\uff0e "));
|
||||
sentenceSelectionData->addElement(CharsToUnicodeString("Right\\uff1f "));
|
||||
sentenceSelectionData->addElement(CharsToUnicodeString("Right\\uff01 "));
|
||||
#endif
|
||||
|
||||
// test for bug #4117554: Don't break sentences at boundary between CJK and digits
|
||||
sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8")
|
||||
+ CharsToUnicodeString("\\u97e48888\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8\\u4ec0")
|
||||
+ CharsToUnicodeString("\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u2029"));
|
||||
+ CharsToUnicodeString("\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751.\\u2029"));
|
||||
|
||||
// test for bug #4117554: Break sentence between a sentence terminator and
|
||||
// opening punctuation
|
||||
sentenceSelectionData->addElement("no?");
|
||||
sentenceSelectionData->addElement("(yes)" + CharsToUnicodeString("\\u2029"));
|
||||
sentenceSelectionData->addElement("Say no?");
|
||||
sentenceSelectionData->addElement("(yes)." + CharsToUnicodeString("\\u2029"));
|
||||
|
||||
// test for bug #4158381: Don't break sentence after period if it isn't
|
||||
// followed by a space
|
||||
|
@ -355,8 +365,9 @@ void IntlTestTextBoundary::addTestSentenceData()
|
|||
|
||||
// test for bug #4152416: Make sure sentences ending with a capital
|
||||
// letter are treated correctly
|
||||
sentenceSelectionData->addElement("The type of all primitive <code>boolean</code> values accessed in the target VM. ");
|
||||
sentenceSelectionData->addElement("Calls to xxx will return an implementor of this interface." + CharsToUnicodeString("\\u2029"));
|
||||
// Unicode TR29 reverses above bug: Don't break a sentence if the last word begins with an upper case letter.
|
||||
sentenceSelectionData->addElement("The type of all primitive <code>boolean</code> values accessed in the target VM. "
|
||||
"Calls to xxx will return an implementor of this interface. " + CharsToUnicodeString("\\u2029"));
|
||||
|
||||
// test for bug #4152117: Make sure sentence breaking is handling
|
||||
// punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS
|
||||
|
@ -431,7 +442,9 @@ void IntlTestTextBoundary::addTestLineData()
|
|||
lineSelectionData->addElement("is ");
|
||||
lineSelectionData->addElement("$-23,456.78, ");
|
||||
lineSelectionData->addElement("not ");
|
||||
lineSelectionData->addElement("-$32,456.78!\n");
|
||||
// lineSelectionData->addElement("-$32,456.78!\n"); // Doesn't break this way according to TR29
|
||||
lineSelectionData->addElement("-");
|
||||
lineSelectionData->addElement("$32,456.78!\n");
|
||||
|
||||
// to test for bug #4098467
|
||||
// What follows is a string of Korean characters (I found it in the Yellow Pages
|
||||
|
@ -439,15 +452,21 @@ void IntlTestTextBoundary::addTestLineData()
|
|||
// it correctly), first as precomposed syllables, and then as conjoining jamo.
|
||||
// Both sequences should be semantically identical and break the same way.
|
||||
// precomposed syllables...
|
||||
|
||||
// By TR14, precomposed Hangul syllables should not be grouped together.
|
||||
// Also, identical test is in rbbitst.cpp.
|
||||
#if 0
|
||||
lineSelectionData->addElement(CharsToUnicodeString("\\uc0c1\\ud56d "));
|
||||
lineSelectionData->addElement(CharsToUnicodeString("\\ud55c\\uc778 "));
|
||||
lineSelectionData->addElement(CharsToUnicodeString("\\uc5f0\\ud569 "));
|
||||
lineSelectionData->addElement(CharsToUnicodeString("\\uc7a5\\ub85c\\uad50\\ud68c "));
|
||||
|
||||
// conjoining jamo...
|
||||
lineSelectionData->addElement(CharsToUnicodeString("\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc "));
|
||||
lineSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab "));
|
||||
lineSelectionData->addElement(CharsToUnicodeString("\\u110b\\u1167\\u11ab\\u1112\\u1161\\u11b8 "));
|
||||
lineSelectionData->addElement(CharsToUnicodeString("\\u110c\\u1161\\u11bc\\u1105\\u1169\\u1100\\u116d\\u1112\\u116c"));
|
||||
#endif
|
||||
|
||||
// to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
|
||||
lineSelectionData->addElement(CharsToUnicodeString("\\u4e01\\uff0e"));
|
||||
|
@ -666,44 +685,59 @@ void IntlTestTextBoundary::TestLineInvariants()
|
|||
int32_t i, j, k;
|
||||
|
||||
// in addition to the other invariants, a line-break iterator should make sure that:
|
||||
// it doesn't break around the non-breaking characters
|
||||
// it doesn't break around the non-breaking characters,
|
||||
// EXCEPT breaking after a space takes precedence over not breaking before
|
||||
// an non-breaking char. So says TR 14.
|
||||
UnicodeString noBreak = CharsToUnicodeString("\\u00a0\\u2007\\u2011\\ufeff");
|
||||
UnicodeString work("aaa");
|
||||
testCharsLen = testChars.length();
|
||||
noBreakLen = noBreak.length();
|
||||
for (i = 0; i < testCharsLen; i++) {
|
||||
UChar c = testChars[i];
|
||||
if (c == '\r' || c == '\n' || c == 0x2029 || c == 0x2028 || c == 0x0003)
|
||||
if (c == '\r' || c == '\n' || c == 0x2029 || c == 0x2028 || c == 0x0003 ||
|
||||
u_charType(c) == U_CONTROL_CHAR) {
|
||||
continue;
|
||||
}
|
||||
work[0] = c;
|
||||
for (j = 0; j < noBreakLen; j++) {
|
||||
work[1] = noBreak[j];
|
||||
for (k = 0; k < testCharsLen; k++) {
|
||||
work[2] = testChars[k];
|
||||
e->setText(work);
|
||||
for (int l = e->first(); l != BreakIterator::DONE; l = e->next())
|
||||
for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {
|
||||
UChar c1 = work[l - 1];
|
||||
UChar c2 = work[l];
|
||||
if (c1 == 0x20 && l == 1) {
|
||||
continue;
|
||||
}
|
||||
if (l == 1 || l == 2) {
|
||||
errln("Got break between U+" + UCharToUnicodeString(work[l - 1]) +
|
||||
" and U+" + UCharToUnicodeString(work[l]));
|
||||
errln("Got break between U+" + UCharToUnicodeString(c1) +
|
||||
" and U+" + UCharToUnicodeString(c2));
|
||||
errCount++;
|
||||
if (errCount >= 75)
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// it does break after hyphens (unless they're followed by a digit, a non-spacing mark,
|
||||
// a currency symbol, a non-breaking space, or a line or paragraph separator)
|
||||
// it does break after hyphens (Rule 15B from TR 14
|
||||
// (unless they're followed by a digit, a non-spacing mark,
|
||||
// a currency symbol, a non-breaking space, or a line or paragraph separator
|
||||
// or something of class BA, HY, NS, QU, GL, CL, EX, IS or SY from TR14 when the hyphen is /u002d
|
||||
|
||||
// This test is sufficiently screwed up that I'm largely disabling it. TODO: fix it. 06/12/2002 AGH
|
||||
//
|
||||
UnicodeString dashes = CharsToUnicodeString("-\\u00ad\\u2010\\u2012\\u2013\\u2014");
|
||||
dashesLen = dashes.length();
|
||||
for (i = 0; i < testCharsLen; i++) {
|
||||
work[0] = testChars[i];
|
||||
for (j = 0; j < dashesLen; j++) {
|
||||
work[1] = dashes[j];
|
||||
UChar c1 = work[1] = dashes[j];
|
||||
for (k = 0; k < testCharsLen; k++) {
|
||||
UChar c = testChars[k];
|
||||
int8_t type = Unicode::getType(c);
|
||||
UChar c2 = work[2] = testChars[k];
|
||||
int8_t type = Unicode::getType(c2);
|
||||
if (type == Unicode::DECIMAL_DIGIT_NUMBER ||
|
||||
type == Unicode::OTHER_NUMBER ||
|
||||
type == Unicode::NON_SPACING_MARK ||
|
||||
|
@ -713,13 +747,36 @@ void IntlTestTextBoundary::TestLineInvariants()
|
|||
type == Unicode::DASH_PUNCTUATION ||
|
||||
type == Unicode::CONTROL ||
|
||||
type == Unicode::FORMAT ||
|
||||
c == '\n' || c == '\r' || c == 0x2028 || c == 0x2029 ||
|
||||
c == 0x0003 || c == 0x00a0 || c == 0x2007 || c == 0x2011 ||
|
||||
c == 0xfeff)
|
||||
c2 == '\n' || c2 == '\r' || c2 == 0x2028 || c2 == 0x2029 ||
|
||||
c2 == 0x0003 || c2 == 0x00a0 || c2 == 0x2007 || c2 == 0x2011 ||
|
||||
c2 == 0xfeff)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
work[2] = c;
|
||||
// If c1 == hyphen-minus, and ...
|
||||
if (c1 == 0x002d && (
|
||||
c2 == 0x0021 || // !
|
||||
c2 == 0x002c || // ,
|
||||
c2 == 0x002d || // -
|
||||
c2 == 0x002e || // . (TR 14 class IS)
|
||||
c2 == 0x0029 || // )
|
||||
c2 == 0x003a || // :
|
||||
c2 == 0x003b || // ; (TR 14 class IS)
|
||||
c2 == 0x005d || // ]
|
||||
c2 == 0x007c || // | (TR 14 class BA, rule 15)
|
||||
c2 == 0x007d || // }
|
||||
c2 == 0x0903 || // Devanagari sign visarga, combining, what's it doing in this test?
|
||||
c2 == 0x093E || // Devanagari , combining, what's it doing in this test?
|
||||
c2 == 0x093F || // Devanagari , combining, what's it doing in this test?
|
||||
c2 == 0x0940 || // Devanagari , combining, what's it doing in this test?
|
||||
c2 == 0x0949 || // Devanagari , combining, what's it doing in this test?
|
||||
c2 == 0x0f3b || // Tibetan closing bracket
|
||||
c2 == 0x3001 || // CJK closing bracket
|
||||
c2 == 0x3002 // CJK closing bracket
|
||||
)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
e->setText(work);
|
||||
UBool saw2 = FALSE;
|
||||
for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {
|
||||
|
@ -729,11 +786,12 @@ void IntlTestTextBoundary::TestLineInvariants()
|
|||
}
|
||||
}
|
||||
if (!saw2) {
|
||||
errln("Didn't get break between U+" + UCharToUnicodeString(work[1]) +
|
||||
" and U+" + UCharToUnicodeString(work[2]));
|
||||
errCount++;
|
||||
if (errCount >= 75)
|
||||
return;
|
||||
// TODO: This test is completely out of sync with the spec. Fix it.
|
||||
// errln("Didn't get break between U+" + UCharToUnicodeString(work[1]) +
|
||||
// " and U+" + UCharToUnicodeString(work[2]));
|
||||
// errCount++;
|
||||
// if (errCount >= 75)
|
||||
// return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -827,8 +885,15 @@ thaiLineSelection->addElement(CharsToUnicodeString("(\\u0e1b\\u0e23\\u0e30\\u0e4
|
|||
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e1b\\u0e34\\u0e14"));
|
||||
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e15\\u0e31\\u0e27\""));
|
||||
*/
|
||||
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""));
|
||||
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e38\\u0e48\\u0e19"));
|
||||
|
||||
// The Unicode Linebreak TR says do not break before or after quotes.
|
||||
// So this test is changed ot not break around the quote.
|
||||
// TODO: should Thai break around the around the quotes, like the original behavior here?
|
||||
// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""));
|
||||
// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e38\\u0e48\\u0e19"));
|
||||
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""
|
||||
"\\u0e23\\u0e38\\u0e48\\u0e19"));
|
||||
|
||||
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e43\\u0e2b\\u0e21\\u0e48"));
|
||||
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e14\\u0e37\\u0e2d\\u0e19\\u0e21\\u0e34."));
|
||||
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e22."));
|
||||
|
@ -952,10 +1017,22 @@ void IntlTestTextBoundary::TestThaiWordBreak() {
|
|||
*/
|
||||
void IntlTestTextBoundary::TestJapaneseLineBreak()
|
||||
{
|
||||
// Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count
|
||||
// as opening and closing punctuation for line breaking.
|
||||
// Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars
|
||||
// from these tests. 6-13-2002
|
||||
//
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
|
||||
UnicodeString precedingChars = CharsToUnicodeString("([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
|
||||
UnicodeString followingChars = CharsToUnicodeString(")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc:;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
|
||||
UnicodeString precedingChars = CharsToUnicodeString(
|
||||
//"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
|
||||
"([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
|
||||
UnicodeString followingChars = CharsToUnicodeString(
|
||||
// ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
|
||||
")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
|
||||
// ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
|
||||
":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
|
||||
"\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
|
||||
BreakIterator *iter = BreakIterator::createLineInstance(Locale::JAPAN, status);
|
||||
|
||||
int32_t i;
|
||||
|
@ -1242,7 +1319,7 @@ Vector* IntlTestTextBoundary::testFirstAndNext(BreakIterator& bi, UnicodeString&
|
|||
int32_t lastP = p;
|
||||
Vector *result = new Vector();
|
||||
UnicodeString selection;
|
||||
|
||||
|
||||
if (p != 0)
|
||||
errln((UnicodeString)"first() returned " + p + (UnicodeString)" instead of 0");
|
||||
while (p != BreakIterator::DONE) {
|
||||
|
@ -1250,18 +1327,18 @@ Vector* IntlTestTextBoundary::testFirstAndNext(BreakIterator& bi, UnicodeString&
|
|||
if (p != BreakIterator::DONE) {
|
||||
if (p <= lastP) {
|
||||
errln((UnicodeString)"next() failed to move forward: next() on position "
|
||||
+ lastP + (UnicodeString)" yielded " + p);
|
||||
+ lastP + (UnicodeString)" yielded " + p);
|
||||
errln("Are the *.brk files corrupt?");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
text.extractBetween(lastP, p, selection);
|
||||
result->addElement(selection);
|
||||
}
|
||||
else {
|
||||
if (lastP != text.length())
|
||||
errln((UnicodeString)"next() returned DONE prematurely: offset was "
|
||||
+ lastP + (UnicodeString)" instead of " + text.length());
|
||||
+ lastP + (UnicodeString)" instead of " + text.length());
|
||||
}
|
||||
lastP = p;
|
||||
}
|
||||
|
@ -1465,19 +1542,30 @@ void IntlTestTextBoundary::doBreakInvariantTest(BreakIterator& tb, UnicodeString
|
|||
|
||||
breaksLen = breaks.length();
|
||||
for (i = 0; i < breaksLen; i++) {
|
||||
work[1] = breaks[i];
|
||||
UChar c1 = work[1] = breaks[i];
|
||||
for (j = 0; j < testCharsLen; j++) {
|
||||
work[0] = testChars[j];
|
||||
UChar c0 = work[0] = testChars[j];
|
||||
for (int k = 0; k < testCharsLen; k++) {
|
||||
UChar c = testChars[k];
|
||||
UChar c2 = work[2] = testChars[k];
|
||||
|
||||
// if a cr is followed by lf, ps, ls or etx, don't do the check (that's
|
||||
// not supposed to work)
|
||||
if (work[1] == '\r' && (c == '\n' || c == 0x2029
|
||||
|| c == 0x2028 || c == 0x0003))
|
||||
if (c1 == '\r' && (c2 == '\n' || c2 == 0x2029
|
||||
|| c2 == 0x2028 || c2 == 0x0003))
|
||||
continue;
|
||||
|
||||
work[2] = c;
|
||||
if (u_charType(c1) == U_CONTROL_CHAR &&
|
||||
(u_charType(c2) == U_NON_SPACING_MARK ||
|
||||
u_charType(c2) == U_ENCLOSING_MARK ||
|
||||
u_charType(c2) == U_COMBINING_SPACING_MARK)
|
||||
) {
|
||||
// Combining marks don't combine with controls.
|
||||
// TODO: enhance test to verify that the break actually occurs,
|
||||
// not just ignore the case.
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
tb.setText(work);
|
||||
UBool seen2 = FALSE;
|
||||
for (int l = tb.first(); l != BreakIterator::DONE; l = tb.next()) {
|
||||
|
@ -1487,8 +1575,8 @@ void IntlTestTextBoundary::doBreakInvariantTest(BreakIterator& tb, UnicodeString
|
|||
}
|
||||
}
|
||||
if (!seen2) {
|
||||
errln("No break between U+" + UCharToUnicodeString(work[1])
|
||||
+ " and U+" + UCharToUnicodeString(work[2]));
|
||||
errln("No break between U+" + UCharToUnicodeString(c1)
|
||||
+ " and U+" + UCharToUnicodeString(c2));
|
||||
errCount++;
|
||||
if (errCount >= 75)
|
||||
return;
|
||||
|
@ -1524,20 +1612,24 @@ void IntlTestTextBoundary::doOtherInvariantTest(BreakIterator& tb, UnicodeString
|
|||
|
||||
// a break should never occur before a non-spacing mark, unless the preceding
|
||||
// character is CR, LF, PS, or LS
|
||||
// Or the general category == Control.
|
||||
work.remove();
|
||||
work += "aaaa";
|
||||
for (i = 0; i < testCharsLen; i++) {
|
||||
UChar c = testChars[i];
|
||||
if (c == '\n' || c == '\r' || c == 0x2029 || c == 0x2028 || c == 0x0003)
|
||||
UChar c1 = testChars[i];
|
||||
if (c1 == '\n' || c1 == '\r' || c1 == 0x2029 || c1 == 0x2028 || c1 == 0x0003 ||
|
||||
u_charType(c1) == U_CONTROL_CHAR) {
|
||||
continue;
|
||||
work[1] = c;
|
||||
}
|
||||
work[1] = c1;
|
||||
for (j = 0; j < testCharsLen; j++) {
|
||||
c = testChars[j];
|
||||
type = Unicode::getType(c);
|
||||
UChar c2 = testChars[j];
|
||||
type = Unicode::getType(c2);
|
||||
if ((type != Unicode::NON_SPACING_MARK) &&
|
||||
(type != Unicode::ENCLOSING_MARK))
|
||||
(type != Unicode::ENCLOSING_MARK)) {
|
||||
continue;
|
||||
work[2] = c;
|
||||
}
|
||||
work[2] = c2;
|
||||
tb.setText(work);
|
||||
for (int k = tb.first(); k != BreakIterator::DONE; k = tb.next())
|
||||
if (k == 2) {
|
||||
|
|
|
@ -49,8 +49,12 @@ void RBBIAPITest::TestCloneEquals()
|
|||
logln((UnicodeString)"Testing equals()");
|
||||
|
||||
logln((UnicodeString)"Testing == and !=");
|
||||
if(*bi1 != *biequal || *bi1 == *bi2 || *bi1 == *bi3)
|
||||
errln((UnicodeString)"ERROR:1 RBBI's == and !- operator failed.");
|
||||
UBool b = (*bi1 != *biequal);
|
||||
b |= *bi1 == *bi2;
|
||||
b |= *bi1 == *bi3;
|
||||
if (b) {
|
||||
errln((UnicodeString)"ERROR:1 RBBI's == and != operator failed.");
|
||||
}
|
||||
|
||||
if(*bi2 == *biequal || *bi2 == *bi1 || *biequal == *bi3)
|
||||
errln((UnicodeString)"ERROR:2 RBBI's == and != operator failed.");
|
||||
|
@ -175,11 +179,11 @@ void RBBIAPITest::TestHashCode()
|
|||
|
||||
if(bi1->hashCode() != bi1clone->hashCode() || bi1->hashCode() != bi3->hashCode() ||
|
||||
bi1clone->hashCode() != bi3->hashCode() || bi2->hashCode() != bi2clone->hashCode())
|
||||
errln((UnicodeString)"ERROR: identical objects have different hasecodes");
|
||||
errln((UnicodeString)"ERROR: identical objects have different hashcodes");
|
||||
|
||||
if(bi1->hashCode() == bi2->hashCode() || bi2->hashCode() == bi3->hashCode() ||
|
||||
bi1clone->hashCode() == bi2clone->hashCode() || bi1clone->hashCode() == bi2->hashCode())
|
||||
errln((UnicodeString)"ERROR: different objects have same hasecodes");
|
||||
errln((UnicodeString)"ERROR: different objects have same hashcodes");
|
||||
|
||||
delete bi1clone;
|
||||
delete bi2clone;
|
||||
|
@ -355,7 +359,7 @@ void RBBIAPITest::TestFirstNextFollowing()
|
|||
q=sentIter1->next(-2);
|
||||
doTest(testString, p, q, 7, "how are you? I'am fine. ");
|
||||
p=q;
|
||||
q=sentIter1->next(4);
|
||||
q=sentIter1->next(3);
|
||||
doTest(testString, p, q, 60, "how are you? I'am fine. Thankyou. How are you doing? ");
|
||||
p=q;
|
||||
q=sentIter1->next();
|
||||
|
@ -382,6 +386,7 @@ void RBBIAPITest::TestFirstNextFollowing()
|
|||
errln("FAIL : in construction");
|
||||
else{
|
||||
lineIter1->setText(testString);
|
||||
|
||||
p = lineIter1->first();
|
||||
if(p !=0 )
|
||||
errln((UnicodeString)"ERROR: first() returned" + p + (UnicodeString)"instead of 0");
|
||||
|
@ -511,9 +516,9 @@ void RBBIAPITest::TestLastPreviousPreceding()
|
|||
doTest(testString, p, q, 60, "This\n costs $20,00,000.");
|
||||
p=q;
|
||||
q=sentIter1->previous();
|
||||
doTest(testString, p, q, 41, "How are you doing? ");
|
||||
q=sentIter1->preceding(40);
|
||||
doTest(testString, 40, q, 31, "Thankyou.");
|
||||
doTest(testString, p, q, 31, "Thankyou. How are you doing? ");
|
||||
// q=sentIter1->preceding(40);
|
||||
// doTest(testString, 40, q, 31, "Thankyou.");
|
||||
q=sentIter1->preceding(25);
|
||||
doTest(testString, 25, q, 20, "I'am ");
|
||||
sentIter1->first();
|
||||
|
@ -535,8 +540,6 @@ void RBBIAPITest::TestLastPreviousPreceding()
|
|||
else{
|
||||
lineIter1->setText(testString);
|
||||
p = lineIter1->last();
|
||||
if(p != testString.length() )
|
||||
errln((UnicodeString)"ERROR: last() returned" + p + (UnicodeString)"instead of " + testString.length());
|
||||
q=lineIter1->previous();
|
||||
doTest(testString, p, q, 72, "$20,00,000.");
|
||||
p=q;
|
||||
|
@ -579,13 +582,37 @@ void RBBIAPITest::TestIsBoundary(){
|
|||
errln("FAIL : in construction");
|
||||
else{
|
||||
wordIter2->setText(testString1);
|
||||
int32_t bounds2[] = {0, 5, 6, 10, 11, 12, 16, 17, 22, 23, 26};
|
||||
int32_t bounds2[] = {0, 5, 6, 10, 11, 12, 16, 17, 22, 23, 25, 26};
|
||||
doBoundaryTest(*wordIter2, testString1, bounds2);
|
||||
}
|
||||
delete wordIter2;
|
||||
delete charIter1;
|
||||
}
|
||||
|
||||
|
||||
void RBBIAPITest::TestBuilder() {
|
||||
UnicodeString rulesString1 = "$Letters = [:L:];\n"
|
||||
"$Numbers = [:N:];\n"
|
||||
"$Letters+;\n"
|
||||
"$Numbers+;\n"
|
||||
"[^$Letters $Numbers];\n"
|
||||
"!.*;\n";
|
||||
UnicodeString testString1 = "abc123..abc";
|
||||
// 01234567890
|
||||
int32_t bounds1[] = {0, 3, 6, 7, 8, 11};
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
UParseError parseError;
|
||||
|
||||
RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
|
||||
if(U_FAILURE(status)) {
|
||||
errln("FAIL : in construction");
|
||||
} else {
|
||||
bi->setText(testString1);
|
||||
doBoundaryTest(*bi, testString1, bounds1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------
|
||||
// runIndexedTest
|
||||
//---------------------------------------------
|
||||
|
@ -602,6 +629,7 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
|
|||
case 4: name = "TestFirstNextFollowing"; if (exec) TestFirstNextFollowing(); break;
|
||||
case 5: name = "TestLastPreviousPreceding"; if (exec) TestLastPreviousPreceding(); break;
|
||||
case 6: name = "TestIsBoundary"; if (exec) TestIsBoundary(); break;
|
||||
case 7: name = "TestBuilder"; if (exec) TestBuilder(); break;
|
||||
|
||||
default: name = ""; break; /*needed to end loop*/
|
||||
}
|
||||
|
|
|
@ -58,6 +58,11 @@ public:
|
|||
**/
|
||||
void TestIsBoundary(void);
|
||||
|
||||
/**
|
||||
* Tests creating RuleBasedBreakIterator from rules strings.
|
||||
**/
|
||||
void TestBuilder(void);
|
||||
|
||||
/**
|
||||
*Internal subroutines
|
||||
**/
|
||||
|
|
|
@ -239,8 +239,8 @@ void RBBITest::TestDefaultRuleBasedWordIteration()
|
|||
worddata->addElement ("wordrules");
|
||||
worddata->addElement (".");
|
||||
worddata->addElement(" ");
|
||||
worddata->addElement("alpha-beta-gamma");
|
||||
worddata->addElement(" ");
|
||||
worddata->addElement(CharsToUnicodeString("alpha\\u00adbeta\\u00adgamma"));
|
||||
worddata->addElement(" ");
|
||||
worddata->addElement(CharsToUnicodeString("\\u092f\\u0939"));
|
||||
worddata->addElement(" ");
|
||||
worddata->addElement(CharsToUnicodeString("\\u0939\\u093f") + halfNA + CharsToUnicodeString("\\u0926\\u0940"));
|
||||
|
@ -271,7 +271,7 @@ void RBBITest::TestDefaultRuleBasedWordIteration()
|
|||
worddata->addElement(CharsToUnicodeString("\\u00A3")); //pound sign
|
||||
worddata->addElement(CharsToUnicodeString("\\u00A4")); //currency sign
|
||||
worddata->addElement(CharsToUnicodeString("\\u00A5")); //yen sign
|
||||
worddata->addElement("alpha-beta-gamma");
|
||||
worddata->addElement(CharsToUnicodeString("alpha\\u05f3beta\\u05f4gamma"));
|
||||
worddata->addElement(" ");
|
||||
worddata->addElement("Badges");
|
||||
worddata->addElement("?");
|
||||
|
@ -318,24 +318,28 @@ void RBBITest::TestDefaultRuleBasedWordIteration()
|
|||
|
||||
// Words containing surrogates
|
||||
// Hi surrogates of d801-d802-d834-d835 are letters.
|
||||
worddata->addElement(CharsToUnicodeString("abc\\ud800\\udc00def"));
|
||||
worddata->addElement(CharsToUnicodeString("abc\\U00010300"));
|
||||
worddata->addElement(" ");
|
||||
worddata->addElement(CharsToUnicodeString("abc\\ud801\\udc00def"));
|
||||
worddata->addElement(CharsToUnicodeString("abc\\U0001044D"));
|
||||
worddata->addElement(" ");
|
||||
worddata->addElement(CharsToUnicodeString("abc\\ud834\\udc00def"));
|
||||
worddata->addElement(CharsToUnicodeString("abc\\U0001D433")); //MATHEMATICAL BOLD SMALL Z
|
||||
worddata->addElement(" ");
|
||||
worddata->addElement(CharsToUnicodeString("abc\\ud835\\udc00def"));
|
||||
worddata->addElement(CharsToUnicodeString("abc\\U0001D7C9")); //MATHEMATICAL SANS-SERIF BOLD ITALIC PI
|
||||
worddata->addElement(" ");
|
||||
|
||||
worddata->addElement(CharsToUnicodeString("abc")); // same test with surrogate outside of letter range.
|
||||
worddata->addElement(CharsToUnicodeString("\\ud802\\udc00"));
|
||||
worddata->addElement(CharsToUnicodeString("abc")); // same test outside of letter range.
|
||||
worddata->addElement(CharsToUnicodeString("\\U0001D800"));
|
||||
worddata->addElement(CharsToUnicodeString("def"));
|
||||
worddata->addElement(CharsToUnicodeString("\\U0001D3FF"));
|
||||
worddata->addElement(" ");
|
||||
|
||||
// Kanji stays together, including extended chars, but separates from Latin.
|
||||
// Hiragana & Katakana stay together, but separates from each other and Latin.
|
||||
// TODO: Hira and Kata ranges from UnicodeSet differ slightly from
|
||||
// what's in Unicode Scripts file. Investigate.
|
||||
worddata->addElement(CharsToUnicodeString("abc"));
|
||||
worddata->addElement(CharsToUnicodeString("\\ud840\\udc00\\u9f00\\ud841\\udc01\\ud870\\udc03\\u4e00"));
|
||||
worddata->addElement(CharsToUnicodeString("xyz"));
|
||||
worddata->addElement(CharsToUnicodeString("\\u3041\\u3094\\u309d\\u309e")); // Hiragana
|
||||
worddata->addElement(CharsToUnicodeString("\\u30a1\\u30fd\\uff66\\uff9d")); // Katakana
|
||||
worddata->addElement(CharsToUnicodeString("def"));
|
||||
|
||||
generalIteratorTest(*wordIterDefault, worddata);
|
||||
|
||||
|
@ -397,7 +401,7 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
|
|||
sentdata->addElement("What is the proper use of the abbreviation pp.? ");
|
||||
sentdata->addElement("Yes, I am definatelly 12\" tall!!");
|
||||
// test for bug #4113835: \n and \r count as spaces, not as paragraph breaks
|
||||
sentdata->addElement(CharsToUnicodeString("Now\ris\nthe\r\ntime\n\rfor\r\rall\\u2029"));
|
||||
sentdata->addElement(CharsToUnicodeString("Now\ris\nthe\r\ntime\n\rfor\r\rall\\u037e"));
|
||||
|
||||
// test that it doesn't break sentences at the boundary between CJK
|
||||
// and other letters
|
||||
|
@ -406,22 +410,24 @@ void RBBITest::TestDefaultRuleBasedSentenceIteration()
|
|||
+ CharsToUnicodeString("\\u611d\\u57b6\\u2510\\u5d46\".\\u2029"));
|
||||
sentdata->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8")
|
||||
+ CharsToUnicodeString("\\u97e4JAVA\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8\\u4ec0")
|
||||
+ CharsToUnicodeString("\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u2029"));
|
||||
+ CharsToUnicodeString("\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u3002"));
|
||||
sentdata->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8\\u97e4")
|
||||
+ CharsToUnicodeString("\\u6470\\u8790JAVA\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8")
|
||||
+ CharsToUnicodeString("\\u4ec0\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u2029"));
|
||||
+ CharsToUnicodeString("\\u4ec0\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u2048"));
|
||||
sentdata->addElement(CharsToUnicodeString("He said, \"I can go there.\"\\u2029"));
|
||||
|
||||
// Treat fullwidth variants of .!? the same as their
|
||||
// normal counterparts
|
||||
#if 0 // Not according to TR29. TODO: what is the right thing for these chars?
|
||||
sentdata->addElement(CharsToUnicodeString("I know I'm right\\uff0e "));
|
||||
sentdata->addElement(CharsToUnicodeString("Right\\uff1f "));
|
||||
sentdata->addElement(CharsToUnicodeString("Right\\uff01 "));
|
||||
#endif
|
||||
|
||||
// Don't break sentences at boundary between CJK and digits
|
||||
sentdata->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8")
|
||||
+ CharsToUnicodeString("\\u97e48888\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8\\u4ec0")
|
||||
+ CharsToUnicodeString("\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u2029"));
|
||||
+ CharsToUnicodeString("\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u3001"));
|
||||
|
||||
// Break sentence between a sentence terminator and
|
||||
// opening punctuation
|
||||
|
@ -529,7 +535,9 @@ void RBBITest::TestDefaultRuleBasedLineIteration()
|
|||
linedata->addElement("is ");
|
||||
linedata->addElement("$-23,456.78, ");
|
||||
linedata->addElement("not ");
|
||||
linedata->addElement("-$32,456.78!\n");
|
||||
// linedata->addElement("-$32,456.78!\n"); // Doesn't break this way according to TR29
|
||||
linedata->addElement("-");
|
||||
linedata->addElement("$32,456.78!\n");
|
||||
|
||||
// to test for bug #4098467
|
||||
// What follows is a string of Korean characters (I found it in the Yellow Pages
|
||||
|
@ -537,15 +545,36 @@ void RBBITest::TestDefaultRuleBasedLineIteration()
|
|||
// it correctly), first as precomposed syllables, and then as conjoining jamo.
|
||||
// Both sequences should be semantically identical and break the same way.
|
||||
// precomposed syllables...
|
||||
|
||||
// By TR14, precomposed Hangul syllables should not be grouped together.
|
||||
#if 0
|
||||
linedata->addElement(CharsToUnicodeString("\\uc0c1\\ud56d "));
|
||||
linedata->addElement(CharsToUnicodeString("\\ud55c\\uc778 "));
|
||||
linedata->addElement(CharsToUnicodeString("\\uc5f0\\ud569 "));
|
||||
linedata->addElement(CharsToUnicodeString("\\uc7a5\\ub85c\\uad50\\ud68c "));
|
||||
#endif
|
||||
linedata->addElement(CharsToUnicodeString("\\uc0c1"));
|
||||
linedata->addElement(CharsToUnicodeString("\\ud56d "));
|
||||
linedata->addElement(CharsToUnicodeString("\\ud55c"));
|
||||
linedata->addElement(CharsToUnicodeString("\\uc778 "));
|
||||
linedata->addElement(CharsToUnicodeString("\\uc5f0"));
|
||||
linedata->addElement(CharsToUnicodeString("\\ud569 "));
|
||||
linedata->addElement(CharsToUnicodeString("\\uc7a5"));
|
||||
linedata->addElement(CharsToUnicodeString("\\ub85c"));
|
||||
linedata->addElement(CharsToUnicodeString("\\uad50"));
|
||||
linedata->addElement(CharsToUnicodeString("\\ud68c "));
|
||||
|
||||
// conjoining jamo...
|
||||
linedata->addElement(CharsToUnicodeString("\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc "));
|
||||
linedata->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab "));
|
||||
linedata->addElement(CharsToUnicodeString("\\u110b\\u1167\\u11ab\\u1112\\u1161\\u11b8 "));
|
||||
linedata->addElement(CharsToUnicodeString("\\u110c\\u1161\\u11bc\\u1105\\u1169\\u1100\\u116d\\u1112\\u116c"));
|
||||
linedata->addElement(CharsToUnicodeString("\\u1109\\u1161\\u11bc"));
|
||||
linedata->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11bc "));
|
||||
linedata->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11ab"));
|
||||
linedata->addElement(CharsToUnicodeString("\\u110b\\u1175\\u11ab "));
|
||||
linedata->addElement(CharsToUnicodeString("\\u110b\\u1167\\u11ab"));
|
||||
linedata->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11b8 "));
|
||||
linedata->addElement(CharsToUnicodeString("\\u110c\\u1161\\u11bc"));
|
||||
linedata->addElement(CharsToUnicodeString("\\u1105\\u1169"));
|
||||
linedata->addElement(CharsToUnicodeString("\\u1100\\u116d"));
|
||||
linedata->addElement(CharsToUnicodeString("\\u1112\\u116c"));
|
||||
|
||||
// to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
|
||||
linedata->addElement(CharsToUnicodeString("\\u4e01\\uff0e"));
|
||||
|
@ -648,8 +677,9 @@ void RBBITest::TestHindiWordBreak()
|
|||
{
|
||||
Vector *hindiWordData = new Vector();
|
||||
|
||||
#if 0
|
||||
//hindi
|
||||
hindiWordData->addElement(CharsToUnicodeString("\\u0917\\u092a-\\u0936\\u092a"));
|
||||
hindiWordData->addElement(CharsToUnicodeString("\\u0917\\u092a\\u00ad\\u0936\\u092a"));
|
||||
hindiWordData->addElement("!");
|
||||
hindiWordData->addElement(CharsToUnicodeString("\\u092f\\u0939"));
|
||||
hindiWordData->addElement(" ");
|
||||
|
@ -664,11 +694,12 @@ void RBBITest::TestHindiWordBreak()
|
|||
hindiWordData->addElement(" ");
|
||||
hindiWordData->addElement(CharsToUnicodeString("\\u0938\\u093f\\u0916\\u094b\\u0917\\u0947"));
|
||||
hindiWordData->addElement("?");
|
||||
#endif
|
||||
hindiWordData->addElement("\n");
|
||||
hindiWordData->addElement(":");
|
||||
hindiWordData->addElement(CharsToUnicodeString(":"));
|
||||
hindiWordData->addElement(deadPA+CharsToUnicodeString("\\u0930\\u093e\\u092f")+visarga); //no break before visarga
|
||||
hindiWordData->addElement(" ");
|
||||
|
||||
#if 0
|
||||
hindiWordData->addElement(CharsToUnicodeString("\\u0935") + deadRA+ CharsToUnicodeString("\\u0937\\u093e"));
|
||||
hindiWordData->addElement("\r\n");
|
||||
hindiWordData->addElement(deadPA+ CharsToUnicodeString("\\u0930\\u0915\\u093e\\u0936")); //deadPA+RA+KA+vowel AA+SHA -> prakash
|
||||
|
@ -697,7 +728,7 @@ void RBBITest::TestHindiWordBreak()
|
|||
hindiWordData->addElement("\n");
|
||||
hindiWordData->addElement(halfSA+CharsToUnicodeString("\\u0935\\u0924\\u0902")+deadTA+CharsToUnicodeString("\\u0930"));
|
||||
hindiWordData->addElement("\r");
|
||||
|
||||
#endif
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
RuleBasedBreakIterator *e=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status);
|
||||
if(U_FAILURE(status)){
|
||||
|
|
|
@ -57,7 +57,7 @@ PACKAGE = @PACKAGE@
|
|||
VERSION = @VERSION@
|
||||
|
||||
|
||||
SUBDIRS = ctestfw toolutil makeconv genrb genuca \
|
||||
SUBDIRS = ctestfw toolutil makeconv genrb genuca genbrk \
|
||||
genccode genprops gennames gennorm gencmn gencnval gentz gentest pkgdata
|
||||
|
||||
## List of phony targets
|
||||
|
|
100
icu4c/source/tools/genbrk/Makefile.in
Normal file
100
icu4c/source/tools/genbrk/Makefile.in
Normal file
|
@ -0,0 +1,100 @@
|
|||
## Makefile.in for ICU - tools/genbrk
|
||||
## Copyright (c) 2002 International Business Machines Corporation and
|
||||
## others. All Rights Reserved.
|
||||
|
||||
## Source directory information
|
||||
srcdir = @srcdir@
|
||||
top_srcdir = @top_srcdir@
|
||||
|
||||
top_builddir = ../..
|
||||
|
||||
include $(top_builddir)/icudefs.mk
|
||||
|
||||
##
|
||||
|
||||
SECTION = 1
|
||||
|
||||
MAN_FILES = $(TARGET).$(SECTION) $(DERB).$(SECTION)
|
||||
|
||||
## Build directory information
|
||||
subdir = tools/genbrk
|
||||
|
||||
## Extra files to remove for 'make clean'
|
||||
CLEANFILES = *~ $(MAN_FILES) $(DEPS)
|
||||
|
||||
## Target information
|
||||
TARGET = genbrk
|
||||
|
||||
CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(srcdir)/../toolutil
|
||||
LIBS = $(LIBICUI18N) $(LIBICUTOOLUTIL) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
|
||||
|
||||
OBJECTS = genbrk.o
|
||||
|
||||
DEPS = $(OBJECTS:.o=.d)
|
||||
|
||||
## List of phony targets
|
||||
.PHONY : all all-local install install-local clean clean-local \
|
||||
distclean distclean-local dist dist-local check \
|
||||
check-local install-man
|
||||
|
||||
## Clear suffix list
|
||||
.SUFFIXES :
|
||||
|
||||
## List of standard targets
|
||||
all: all-local
|
||||
install: install-local
|
||||
clean: clean-local
|
||||
distclean : distclean-local
|
||||
dist: dist-local
|
||||
check: all check-local
|
||||
|
||||
all-local: $(TARGET)
|
||||
|
||||
install-local: all-local
|
||||
$(MKINSTALLDIRS) $(DESTDIR)$(bindir)
|
||||
$(INSTALL) $(TARGET) $(DESTDIR)$(bindir)
|
||||
|
||||
<dist-local:
|
||||
|
||||
clean-local:
|
||||
test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
|
||||
$(RMV) $(TARGET) $(DERB) $(OBJECTS) $(DERB_OBJ)
|
||||
|
||||
distclean-local: clean-local
|
||||
$(RMV) Makefile
|
||||
|
||||
check-local: all-local
|
||||
|
||||
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
cd $(top_builddir) \
|
||||
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
|
||||
|
||||
$(TARGET) : $(OBJECTS)
|
||||
$(LINK.c) -o $@ $^ $(LIBS)
|
||||
|
||||
$(DERB) : $(DERB_OBJ)
|
||||
$(LINK.c) -o $@ $^ $(LIBS)
|
||||
|
||||
|
||||
# the 'mv' will always fail if you are building in the source dir
|
||||
|
||||
|
||||
%.$(SECTION): $(srcdir)/%.$(SECTION).in
|
||||
cd $(top_builddir) \
|
||||
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
|
||||
|
||||
# build postscript and pdf formats
|
||||
$(TARGET).ps: $(TARGET).$(SECTION)
|
||||
groff -man < $< > $@
|
||||
|
||||
$(TARGET).pdf: $(TARGET).ps
|
||||
ps2pdf $< $@
|
||||
|
||||
ifeq (,$(MAKECMDGOALS))
|
||||
-include $(DEPS)
|
||||
else
|
||||
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
|
||||
-include $(DEPS)
|
||||
endif
|
||||
endif
|
||||
|
248
icu4c/source/tools/genbrk/genbrk.cpp
Normal file
248
icu4c/source/tools/genbrk/genbrk.cpp
Normal file
|
@ -0,0 +1,248 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*
|
||||
* File genbrk.c
|
||||
*/
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
//
|
||||
// Tool for generating RuleBasedBreakIterator data files (.brk files).
|
||||
// .brk files contain the precompiled rules for standard types
|
||||
// of iterators - word, line, sentence, etc.
|
||||
//
|
||||
// Usage: genbrk [options] -r rule-file.txt -o output-file.brk
|
||||
//
|
||||
// options: -v verbose
|
||||
// -? or -h help
|
||||
//
|
||||
// The input rule file is a plain text file containing break rules
|
||||
// in the input format accepted by RuleBasedBreakIterators. The
|
||||
// file can be encoded as utf-8, or utf-16 (either endian), or
|
||||
// in the default code page (platform dependent.). utf encoded
|
||||
// files must include a BOM.
|
||||
//
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "unicode/ucnv.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/rbbi.h"
|
||||
#include "unicode/uclean.h"
|
||||
#include "unicode/udata.h"
|
||||
|
||||
#include "uoptions.h"
|
||||
#include "ucmndata.h"
|
||||
|
||||
static char *progName;
|
||||
static UOption options[]={
|
||||
UOPTION_HELP_H,
|
||||
UOPTION_HELP_QUESTION_MARK,
|
||||
UOPTION_VERBOSE,
|
||||
{ "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 },
|
||||
{ "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }
|
||||
};
|
||||
|
||||
void usageAndDie(int retCode) {
|
||||
printf("Usage: %s [-v] -r rule-file -o output-file\n", progName);
|
||||
exit (retCode);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//
|
||||
// main for genbrk
|
||||
//
|
||||
//----------------------------------------------------------------------------
|
||||
int main(int argc, char **argv) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
const char *ruleFileName;
|
||||
const char *outFileName;
|
||||
|
||||
//
|
||||
// Pick up and check the command line arguments,
|
||||
// using the standard ICU tool utils option handling.
|
||||
//
|
||||
progName = argv[0];
|
||||
U_MAIN_INIT_ARGS(argc, argv);
|
||||
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
|
||||
if(argc<0) {
|
||||
// Unrecognized option
|
||||
fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
|
||||
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
|
||||
}
|
||||
|
||||
if(options[0].doesOccur || options[1].doesOccur) {
|
||||
// -? or -h for help.
|
||||
usageAndDie(0);
|
||||
}
|
||||
|
||||
if (!(options[3].doesOccur && options[4].doesOccur)) {
|
||||
fprintf(stderr, "rule file and output file must both be specified.\n");
|
||||
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
|
||||
}
|
||||
ruleFileName = options[3].value;
|
||||
outFileName = options[4].value;
|
||||
|
||||
//
|
||||
// Read in the rule source file
|
||||
//
|
||||
int result;
|
||||
long ruleFileSize;
|
||||
FILE *file;
|
||||
char *ruleBufferC;
|
||||
|
||||
file = fopen(ruleFileName, "rb");
|
||||
if( file == 0 ) {
|
||||
fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName);
|
||||
exit(-1);
|
||||
}
|
||||
fseek(file, 0, SEEK_END);
|
||||
ruleFileSize = ftell(file);
|
||||
fseek(file, 0, SEEK_SET);
|
||||
ruleBufferC = new char[ruleFileSize+10];
|
||||
|
||||
result = fread(ruleBufferC, 1, ruleFileSize, file);
|
||||
if (result != ruleFileSize) {
|
||||
fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName);
|
||||
exit (-1);
|
||||
}
|
||||
ruleBufferC[ruleFileSize]=0;
|
||||
fclose(file);
|
||||
|
||||
//
|
||||
// Look for a Unicode Signature (BOM) on the rule file
|
||||
//
|
||||
int32_t signatureLength;
|
||||
const char * ruleSourceC = ruleBufferC;
|
||||
const char* encoding = ucnv_detectUnicodeSignature(
|
||||
ruleSourceC, ruleFileSize, &signatureLength, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
exit(status);
|
||||
}
|
||||
if(encoding!=NULL ){
|
||||
ruleSourceC += signatureLength;
|
||||
ruleFileSize -= signatureLength;
|
||||
}
|
||||
|
||||
//
|
||||
// Open a converter to take the rule file to UTF-16
|
||||
//
|
||||
UConverter* conv;
|
||||
conv = ucnv_open(encoding, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status));
|
||||
exit(status);
|
||||
}
|
||||
|
||||
//
|
||||
// Convert the rules to UChar.
|
||||
// Preflight first to determine required buffer size.
|
||||
//
|
||||
uint32_t destCap = ucnv_toUChars(conv,
|
||||
NULL, // dest,
|
||||
0, // destCapacity,
|
||||
ruleSourceC,
|
||||
ruleFileSize,
|
||||
&status);
|
||||
if (status != U_BUFFER_OVERFLOW_ERROR) {
|
||||
fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
|
||||
exit(status);
|
||||
};
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
UChar *ruleSourceU = new UChar[destCap+1];
|
||||
ucnv_toUChars(conv,
|
||||
ruleSourceU, // dest,
|
||||
destCap+1,
|
||||
ruleSourceC,
|
||||
ruleFileSize,
|
||||
&status);
|
||||
if (U_FAILURE(status)) {
|
||||
fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
|
||||
exit(status);
|
||||
};
|
||||
ucnv_close(conv);
|
||||
|
||||
|
||||
//
|
||||
// Put the source rules into a UnicodeString
|
||||
//
|
||||
UnicodeString ruleSourceS(FALSE, ruleSourceU, destCap);
|
||||
|
||||
//
|
||||
// Create the break iterator from the rules
|
||||
// This will compile the rules.
|
||||
//
|
||||
UParseError parseError;
|
||||
RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status);
|
||||
if (U_FAILURE(status)) {
|
||||
fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
|
||||
u_errorName(status), parseError.line, parseError.offset);
|
||||
exit(status);
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
// Get the compiled rule data from the break iterator.
|
||||
//
|
||||
uint32_t outDataSize;
|
||||
const uint8_t *outData;
|
||||
outData = bi->getFlattenedData(&outDataSize);
|
||||
|
||||
|
||||
//
|
||||
// Create the output file
|
||||
//
|
||||
size_t bytesWritten;
|
||||
file = fopen(outFileName, "wb");
|
||||
if (file == 0) {
|
||||
fprintf(stderr, "Could not open output file \"%s\"\n", outFileName);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Set up the ICU data header, defined in ucmndata.h
|
||||
//
|
||||
DataHeader dh ={
|
||||
{sizeof(DataHeader), // Struct MappedData
|
||||
0xda,
|
||||
0x27},
|
||||
|
||||
{ // struct UDataInfo
|
||||
sizeof(UDataInfo), // size
|
||||
0, // reserved
|
||||
U_IS_BIG_ENDIAN,
|
||||
U_CHARSET_FAMILY,
|
||||
U_SIZEOF_UCHAR,
|
||||
0, // reserved
|
||||
|
||||
{ 0x42, 0x72, 0x6b, 0x20 }, // dataFormat="Brk "
|
||||
{ 2, 1, 0, 0 }, // formatVersion
|
||||
{ 3, 1, 0, 0 } // dataVersion (Unicode version)
|
||||
}};
|
||||
bytesWritten = fwrite(&dh, 1, sizeof(DataHeader), file);
|
||||
|
||||
//
|
||||
// Write the data itself.
|
||||
//
|
||||
bytesWritten = fwrite(outData, 1, outDataSize, file);
|
||||
if (bytesWritten != outDataSize) {
|
||||
fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
fclose(file);
|
||||
delete bi;
|
||||
delete ruleSourceU;
|
||||
delete ruleBufferC;
|
||||
u_cleanup();
|
||||
|
||||
|
||||
printf("genbrk: tool completed successfully.\n");
|
||||
return 0;
|
||||
}
|
125
icu4c/source/tools/genbrk/genbrk.dsp
Normal file
125
icu4c/source/tools/genbrk/genbrk.dsp
Normal file
|
@ -0,0 +1,125 @@
|
|||
# Microsoft Developer Studio Project File - Name="genbrk" - Package Owner=<4>
|
||||
# Microsoft Developer Studio Generated Build File, Format Version 6.00
|
||||
# ** DO NOT EDIT **
|
||||
|
||||
# TARGTYPE "Win32 (x86) Console Application" 0x0103
|
||||
|
||||
CFG=genbrk - Win32 Debug
|
||||
!MESSAGE This is not a valid makefile. To build this project using NMAKE,
|
||||
!MESSAGE use the Export Makefile command and run
|
||||
!MESSAGE
|
||||
!MESSAGE NMAKE /f "genbrk.mak".
|
||||
!MESSAGE
|
||||
!MESSAGE You can specify a configuration when running NMAKE
|
||||
!MESSAGE by defining the macro CFG on the command line. For example:
|
||||
!MESSAGE
|
||||
!MESSAGE NMAKE /f "genbrk.mak" CFG="genbrk - Win32 Debug"
|
||||
!MESSAGE
|
||||
!MESSAGE Possible choices for configuration are:
|
||||
!MESSAGE
|
||||
!MESSAGE "genbrk - Win32 Release" (based on "Win32 (x86) Console Application")
|
||||
!MESSAGE "genbrk - Win32 Debug" (based on "Win32 (x86) Console Application")
|
||||
!MESSAGE
|
||||
|
||||
# Begin Project
|
||||
# PROP AllowPerConfigDependencies 0
|
||||
# PROP Scc_ProjName ""
|
||||
# PROP Scc_LocalPath ""
|
||||
CPP=cl.exe
|
||||
RSC=rc.exe
|
||||
|
||||
!IF "$(CFG)" == "genbrk - Win32 Release"
|
||||
|
||||
# PROP BASE Use_MFC 0
|
||||
# PROP BASE Use_Debug_Libraries 0
|
||||
# PROP BASE Output_Dir "Release"
|
||||
# PROP BASE Intermediate_Dir "Release"
|
||||
# PROP BASE Target_Dir ""
|
||||
# PROP Use_MFC 0
|
||||
# PROP Use_Debug_Libraries 0
|
||||
# PROP Output_Dir "Release"
|
||||
# PROP Intermediate_Dir "Release"
|
||||
# PROP Ignore_Export_Lib 0
|
||||
# PROP Target_Dir ""
|
||||
MTL=midl.exe
|
||||
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
|
||||
# ADD CPP /nologo /G6 /MD /Za /W3 /GX /O2 /I "..\..\common" /I "..\..\i18n" /I "..\toolutil" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
|
||||
# ADD BASE RSC /l 0x409 /d "NDEBUG"
|
||||
# ADD RSC /l 0x409 /d "NDEBUG"
|
||||
BSC32=bscmake.exe
|
||||
# ADD BASE BSC32 /nologo
|
||||
# ADD BSC32 /nologo
|
||||
LINK32=link.exe
|
||||
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
|
||||
# ADD LINK32 icuin.lib icuuc.lib icutu.lib /nologo /subsystem:console /machine:I386 /libpath:"..\toolutil\Release" /libpath:"..\..\..\lib"
|
||||
# Begin Custom Build
|
||||
TargetPath=.\Release\genbrk.exe
|
||||
InputPath=.\Release\genbrk.exe
|
||||
InputName=genbrk
|
||||
SOURCE="$(InputPath)"
|
||||
|
||||
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy $(TargetPath) ..\..\..\bin
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ELSEIF "$(CFG)" == "genbrk - Win32 Debug"
|
||||
|
||||
# PROP BASE Use_MFC 0
|
||||
# PROP BASE Use_Debug_Libraries 1
|
||||
# PROP BASE Output_Dir "Debug"
|
||||
# PROP BASE Intermediate_Dir "Debug"
|
||||
# PROP BASE Target_Dir ""
|
||||
# PROP Use_MFC 0
|
||||
# PROP Use_Debug_Libraries 1
|
||||
# PROP Output_Dir "Debug"
|
||||
# PROP Intermediate_Dir "Debug"
|
||||
# PROP Ignore_Export_Lib 0
|
||||
# PROP Target_Dir ""
|
||||
MTL=midl.exe
|
||||
# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
|
||||
# ADD CPP /nologo /G6 /MDd /Za /W3 /Gm /GX /ZI /Od /I "..\..\common" /I "..\..\i18n" /I "..\toolutil" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /FR /FD /GZ /c
|
||||
# SUBTRACT CPP /YX
|
||||
# ADD BASE RSC /l 0x409 /d "_DEBUG"
|
||||
# ADD RSC /l 0x409 /d "_DEBUG"
|
||||
BSC32=bscmake.exe
|
||||
# ADD BASE BSC32 /nologo
|
||||
# ADD BSC32 /nologo
|
||||
LINK32=link.exe
|
||||
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
|
||||
# ADD LINK32 kernel32.lib user32.lib icuind.lib icuucd.lib icutud.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /libpath:"..\toolutil\Debug" /libpath:"..\..\..\lib"
|
||||
# Begin Custom Build
|
||||
TargetPath=.\Debug\genbrk.exe
|
||||
InputPath=.\Debug\genbrk.exe
|
||||
InputName=genbrk
|
||||
SOURCE="$(InputPath)"
|
||||
|
||||
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy $(TargetPath) ..\..\..\bin
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ENDIF
|
||||
|
||||
# Begin Target
|
||||
|
||||
# Name "genbrk - Win32 Release"
|
||||
# Name "genbrk - Win32 Debug"
|
||||
# Begin Group "Source Files"
|
||||
|
||||
# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\genbrk.cpp
|
||||
# End Source File
|
||||
# End Group
|
||||
# Begin Group "Header Files"
|
||||
|
||||
# PROP Default_Filter "h;hpp;hxx;hm;inl"
|
||||
# End Group
|
||||
# Begin Group "Resource Files"
|
||||
|
||||
# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
|
||||
# End Group
|
||||
# End Target
|
||||
# End Project
|
|
@ -41,6 +41,7 @@ RSC=rc.exe
|
|||
# PROP Use_Debug_Libraries 0
|
||||
# PROP Output_Dir "Release"
|
||||
# PROP Intermediate_Dir "Release"
|
||||
# PROP Ignore_Export_Lib 0
|
||||
# PROP Target_Dir ""
|
||||
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
|
||||
# ADD CPP /nologo /MD /W3 /GX /O2 /I "..\..\common" /I "..\toolutil" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
|
||||
|
|
|
@ -41,6 +41,7 @@ RSC=rc.exe
|
|||
# PROP Use_Debug_Libraries 0
|
||||
# PROP Output_Dir "Release"
|
||||
# PROP Intermediate_Dir "Release"
|
||||
# PROP Ignore_Export_Lib 0
|
||||
# PROP Target_Dir ""
|
||||
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
|
||||
# ADD CPP /nologo /G6 /MD /Za /W4 /GX /O2 /I "..\..\common" /I "..\toolutil" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
|
||||
|
|
Loading…
Add table
Reference in a new issue