mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-45 new builder for RBBI rules, remove obsolete RBBI files
X-SVN-Rev: 8941
This commit is contained in:
parent
631cd39ece
commit
70621f8923
20 changed files with 0 additions and 3034 deletions
14
.gitattributes
vendored
14
.gitattributes
vendored
|
@ -48,21 +48,7 @@ README text !eol
|
|||
*.spp -text
|
||||
*.tri2 -text
|
||||
|
||||
icu4c/source/data/brkitr/charBE.brk -text
|
||||
icu4c/source/data/brkitr/charLE.brk -text
|
||||
icu4c/source/data/brkitr/lineBE.brk -text
|
||||
icu4c/source/data/brkitr/lineLE.brk -text
|
||||
icu4c/source/data/brkitr/line_thBE.brk -text
|
||||
icu4c/source/data/brkitr/line_thLE.brk -text
|
||||
icu4c/source/data/brkitr/sentBE.brk -text
|
||||
icu4c/source/data/brkitr/sentLE.brk -text
|
||||
icu4c/source/data/brkitr/thaidict.brk -text
|
||||
icu4c/source/data/brkitr/titleBE.brk -text
|
||||
icu4c/source/data/brkitr/titleLE.brk -text
|
||||
icu4c/source/data/brkitr/wordBE.brk -text
|
||||
icu4c/source/data/brkitr/wordLE.brk -text
|
||||
icu4c/source/data/brkitr/word_thBE.brk -text
|
||||
icu4c/source/data/brkitr/word_thLE.brk -text
|
||||
icu4c/source/data/unidata/UCARules.txt -text
|
||||
icu4c/source/samples/ucnv/data02.bin -text
|
||||
icu4c/source/test/testdata/importtest.bin -text
|
||||
|
|
|
@ -1,246 +0,0 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999 IBM Corp. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/11/99 rgillam Complete port from Java.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "ucmp8.h"
|
||||
#include "cmemory.h"
|
||||
#include "rbbi_tbl.h"
|
||||
#include "unicode/unistr.h"
|
||||
#ifdef RBBI_DEBUG
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
//=======================================================================
|
||||
// constructor
|
||||
//=======================================================================
|
||||
|
||||
/**
 * Builds a tables object that aliases a pre-built break-iterator data image.
 *
 * The image obtained from udata_getMemory() begins with an index of int32_t
 * values. Entry 0 is the number of character categories; entries 1 through 7
 * are byte offsets, measured from the start of the image, of the rule
 * description, the two arrays backing the character category table, the
 * forward and backward state tables, and the end-state and lookahead-state
 * flag arrays. Nothing is copied: every table pointer ends up pointing into
 * the image, which is why ownTables is FALSE.
 *
 * Every member is given a null/zero value first, so that an object built
 * from a null or unusable memory is still safe to destroy.
 *
 * @param memory The data memory holding the image; may be 0. When its image
 *               is usable, the memory is kept in fMemory and closed by the
 *               destructor. When it has no image, it is closed here and
 *               fMemory stays 0 so that it is never closed a second time.
 */
RuleBasedBreakIteratorTables::RuleBasedBreakIteratorTables(UDataMemory* memory)
: refCount(0),
  ownTables(FALSE),
  charCategoryTable(0),
  stateTable(0),
  backwardsStateTable(0),
  endStates(0),
  lookaheadStates(0),
  numCategories(0),
  fMemory(0)
{
    if (memory == 0) {
        return;
    }

    const void* image = udata_getMemory(memory);
    if (image == 0) {
        // Nothing usable in this memory: release it now and do NOT remember
        // the pointer, otherwise the destructor would close it again.
        udata_close(memory);
        return;
    }

    fMemory = memory;

    const int32_t* im = (const int32_t*)(image);
    const int8_t* base = (const int8_t*)(image);

    // the memory image begins with an index that gives the offsets into the
    // image for each of the fields in the BreakIteratorTables object--
    // use those to initialize the tables object (it will end up pointing
    // into the memory image for everything)
    numCategories = (int32_t)im[0];
    description = UnicodeString(TRUE, (UChar*)((int32_t)im[1] + base), -1);
    charCategoryTable = ucmp8_openAlias((uint16_t*)((int32_t)im[2] + base),
                                        (int8_t*)((int32_t)im[3] + base), 0);
    stateTable = (int16_t*)((int32_t)im[4] + base);
    backwardsStateTable = (int16_t*)((int32_t)im[5] + base);
    endStates = (int8_t*)((int32_t)im[6] + base);
    lookaheadStates = (int8_t*)((int32_t)im[7] + base);
}
|
||||
|
||||
/**
 * Builds an empty tables object that owns its storage (ownTables is TRUE).
 *
 * All table pointers start out null and numCategories starts at zero; a
 * RuleBasedBreakIteratorBuilder is expected to fill them in afterwards.
 * Because the destructor delete[]s the tables when ownTables is TRUE, the
 * pointers must not be left uninitialized: if the builder stops before
 * assigning every table, the destructor would otherwise free garbage.
 */
RuleBasedBreakIteratorTables::RuleBasedBreakIteratorTables()
: refCount(0),
  ownTables(TRUE),
  charCategoryTable(0),
  stateTable(0),
  backwardsStateTable(0),
  endStates(0),
  lookaheadStates(0),
  numCategories(0),
  fMemory(0)
{
    // Every member is explicitly null/zero-initialized above. This
    // constructor depends on a RuleBasedBreakIteratorBuilder filling in
    // all the members.
}
|
||||
|
||||
//=======================================================================
|
||||
// boilerplate
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
RuleBasedBreakIteratorTables::~RuleBasedBreakIteratorTables() {
|
||||
if (ownTables) {
|
||||
delete [] stateTable;
|
||||
delete [] backwardsStateTable;
|
||||
delete [] endStates;
|
||||
delete [] lookaheadStates;
|
||||
ucmp8_close(charCategoryTable);
|
||||
}
|
||||
else {
|
||||
uprv_free(charCategoryTable);
|
||||
if(fMemory != 0) {
|
||||
udata_close(fMemory);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Equality operator. Returns TRUE if both tables objects are of the
|
||||
* same class, have the same behavior, and iterate over the same text.
|
||||
*/
|
||||
UBool
|
||||
RuleBasedBreakIteratorTables::operator==(const RuleBasedBreakIteratorTables& that) const {
|
||||
return this->description == that.description;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute a hash code for these tables
|
||||
* @return A hash code
|
||||
*/
|
||||
int32_t
|
||||
RuleBasedBreakIteratorTables::hashCode() const {
|
||||
return description.hashCode();
|
||||
}
|
||||
|
||||
//=======================================================================
|
||||
// implementation
|
||||
//=======================================================================
|
||||
/**
|
||||
* Looks up a character's category (i.e., its category for breaking purposes,
|
||||
* not its Unicode category)
|
||||
* The ignored parameter is used by derived implementations.
|
||||
*/
|
||||
int32_t
|
||||
RuleBasedBreakIteratorTables::lookupCategory(UChar c, BreakIterator* /*ignored*/) const {
|
||||
return ucmp8_get(charCategoryTable, c);
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a current state and a character category, looks up the
|
||||
* next state to transition to in the state table.
|
||||
*/
|
||||
int32_t
|
||||
RuleBasedBreakIteratorTables::lookupState(int32_t state, int32_t category) const {
|
||||
return stateTable[state * numCategories + category];
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a current state and a character category, looks up the
|
||||
* next state to transition to in the backwards state table.
|
||||
*/
|
||||
int32_t
|
||||
RuleBasedBreakIteratorTables::lookupBackwardState(int32_t state, int32_t category) const {
|
||||
return backwardsStateTable[state * numCategories + category];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the specified state is an accepting state.
|
||||
*/
|
||||
UBool
|
||||
RuleBasedBreakIteratorTables::isEndState(int32_t state) const {
|
||||
return endStates[state];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the specified state is a lookahead state.
|
||||
*/
|
||||
UBool
|
||||
RuleBasedBreakIteratorTables::isLookaheadState(int32_t state) const {
|
||||
return lookaheadStates[state];
|
||||
}
|
||||
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
//
|
||||
// debugDumpTables
|
||||
//
|
||||
void RuleBasedBreakIteratorTables::debugDumpTables() const {
|
||||
printf("Character Classes:\n");
|
||||
int currentCharClass = 257;
|
||||
int startCurrentRange = 0;
|
||||
int initialStringLength = 0;
|
||||
char buf[80];
|
||||
|
||||
UnicodeString *charClassRanges = new UnicodeString[numCategories];
|
||||
|
||||
for (int i = 0; i < 0xffff; i++) {
|
||||
if ( ucmp8_get(charCategoryTable, i) != currentCharClass) {
|
||||
if (currentCharClass != 257) {
|
||||
// Complete the output of the previous range.
|
||||
if (i != startCurrentRange+1) {
|
||||
sprintf(buf, "-%x", i-1);
|
||||
charClassRanges[currentCharClass].append(buf);
|
||||
}
|
||||
if (charClassRanges[currentCharClass].length() % 72 < initialStringLength % 72) {
|
||||
charClassRanges[currentCharClass].append("\n ");
|
||||
}
|
||||
}
|
||||
|
||||
// Output the start of the new range.
|
||||
currentCharClass = ucmp8_get(charCategoryTable, i);
|
||||
startCurrentRange = i;
|
||||
initialStringLength = charClassRanges[currentCharClass].length();
|
||||
if (charClassRanges[currentCharClass].length() > 0)
|
||||
charClassRanges[currentCharClass].append(", ");
|
||||
sprintf(buf, "%x", i);
|
||||
charClassRanges[currentCharClass].append(buf);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i=0; i<numCategories; i++) {
|
||||
printf("%d: ", i);
|
||||
// Write out the chars in the UnicodeStrings.
|
||||
// We know we didn't put anything into them except for plain ascii chars.
|
||||
for (int j=0; j<charClassRanges[i].length(); j++) {
|
||||
putchar(charClassRanges[i].charAt(j));
|
||||
}
|
||||
putchar('\n');
|
||||
}
|
||||
|
||||
delete [] charClassRanges;
|
||||
|
||||
|
||||
// State table length might be too big by one, because the only indication
|
||||
// we have is the pointer to the start of the next item in the memory
|
||||
// image, the backwardsStateTable, which is 4 byte aligned.
|
||||
//
|
||||
int stateTableLength = backwardsStateTable - stateTable;
|
||||
if ((stateTableLength % numCategories) == 1) {
|
||||
stateTableLength -= 1;
|
||||
}
|
||||
|
||||
printf("\n\nState Table. *: end state %%: look ahead state\n");
|
||||
printf("C:\t");
|
||||
for (int i = 0; i < numCategories; i++) {
|
||||
printf("%d\t", i);
|
||||
}
|
||||
printf("\n=================================================");
|
||||
|
||||
for (int i = 0; i < stateTableLength; i++) {
|
||||
if (i % numCategories == 0) {
|
||||
putchar('\n');
|
||||
if (endStates[i / numCategories])
|
||||
putchar('*');
|
||||
else
|
||||
putchar(' ');
|
||||
if (lookaheadStates[i / numCategories]) {
|
||||
putchar('%');
|
||||
}
|
||||
else
|
||||
putchar(' ');
|
||||
printf("%d:\t", i / numCategories);
|
||||
}
|
||||
if (stateTable[i] == 0) {
|
||||
printf(".\t");
|
||||
} else {
|
||||
printf("%d\t", stateTable[i]);
|
||||
}
|
||||
}
|
||||
printf("\n\n\n");
|
||||
}
|
||||
#endif // RBBI_DEBUG
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
@ -1,235 +0,0 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999 IBM Corp. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 11/11/99 rgillam Complete port from Java.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#ifndef RBBI_TBL_H
|
||||
#define RBBI_TBL_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "filestrm.h"
|
||||
|
||||
U_CDECL_BEGIN
|
||||
#ifndef UCMP8_H
|
||||
typedef struct _CompactByteArray CompactByteArray;
|
||||
#endif
|
||||
U_CDECL_END
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/* forward declarations */
|
||||
class RuleBasedBreakIterator;
|
||||
class DictionaryBasedBreakIterator;
|
||||
|
||||
/**
|
||||
* This class contains the internal static tables that are used by the
|
||||
* RuleBasedBreakIterator. Once created, these tables are immutable,
|
||||
* so they can be shared among all break iterators using a particular
|
||||
* set of rules. This class uses a reference-counting scheme to
|
||||
* manage the sharing.
|
||||
*
|
||||
* @author Richard Gillam
|
||||
*/
|
||||
class RuleBasedBreakIteratorTables {
|
||||
|
||||
private:
|
||||
/**
|
||||
* The number of RuleBasedBreakIterators using this object.
|
||||
*/
|
||||
int16_t refCount;
|
||||
|
||||
protected:
|
||||
/**
|
||||
* Whether or not we own the storage for the tables (the tables may be
|
||||
* stored in a memory-mapped file)
|
||||
*/
|
||||
UBool ownTables;
|
||||
|
||||
private:
|
||||
/**
|
||||
* The textual description that was used to create these tables
|
||||
*/
|
||||
UnicodeString description;
|
||||
|
||||
/**
|
||||
* A table that indexes from character values to character category numbers
|
||||
*/
|
||||
CompactByteArray* charCategoryTable;
|
||||
|
||||
/**
|
||||
* The table of state transitions used for forward iteration
|
||||
*/
|
||||
int16_t* stateTable;
|
||||
|
||||
/**
|
||||
* The table of state transitions used to sync up the iterator with the
|
||||
* text in backwards and random-access iteration
|
||||
*/
|
||||
int16_t* backwardsStateTable;
|
||||
|
||||
/**
|
||||
* A list of flags indicating which states in the state table are accepting
|
||||
* ("end") states
|
||||
*/
|
||||
int8_t* endStates;
|
||||
|
||||
/**
|
||||
* A list of flags indicating which states in the state table are
|
||||
* lookahead states (states which turn lookahead on and off)
|
||||
*/
|
||||
int8_t* lookaheadStates;
|
||||
|
||||
/**
|
||||
* The number of character categories (and, thus, the number of columns in
|
||||
* the state tables)
|
||||
*/
|
||||
int32_t numCategories;
|
||||
|
||||
//=======================================================================
|
||||
// constructor
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Creates a tables object, adopting all of the tables that are passed in.
|
||||
*/
|
||||
protected:
|
||||
RuleBasedBreakIteratorTables();
|
||||
|
||||
RuleBasedBreakIteratorTables(UDataMemory* memory);
|
||||
UDataMemory *fMemory;
|
||||
|
||||
private:
|
||||
/**
|
||||
* The copy constructor is declared private and is a no-op.
|
||||
* THIS CLASS MAY NOT BE COPIED.
|
||||
*/
|
||||
RuleBasedBreakIteratorTables(const RuleBasedBreakIteratorTables& that);
|
||||
|
||||
//=======================================================================
|
||||
// boilerplate
|
||||
//=======================================================================
|
||||
|
||||
protected:
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
virtual ~RuleBasedBreakIteratorTables();
|
||||
|
||||
private:
|
||||
/**
|
||||
* The assignment operator is declared private and is a no-op.
|
||||
* THIS CLASS MAY NOT BE COPIED.
|
||||
*/
|
||||
RuleBasedBreakIteratorTables& operator=(const RuleBasedBreakIteratorTables& that);
|
||||
|
||||
/**
|
||||
* Equality operator. Returns TRUE if both tables objects are of the
|
||||
* same class, have the same behavior, and iterate over the same text.
|
||||
*/
|
||||
virtual UBool operator==(const RuleBasedBreakIteratorTables& that) const;
|
||||
|
||||
/**
|
||||
* Not-equal operator. If operator== returns TRUE, this returns FALSE,
|
||||
* and vice versa.
|
||||
*/
|
||||
UBool operator!=(const RuleBasedBreakIteratorTables& that) const;
|
||||
|
||||
/**
|
||||
* Compute a hash code for these tables
|
||||
* @return A hash code
|
||||
*/
|
||||
virtual int32_t hashCode(void) const;
|
||||
|
||||
/**
|
||||
* Returns the description used to create these tables
|
||||
*/
|
||||
const UnicodeString& getRules(void) const;
|
||||
|
||||
//=======================================================================
|
||||
// reference counting
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* increments the reference count.
|
||||
*/
|
||||
void addReference(void);
|
||||
|
||||
/**
|
||||
* decrements the reference count and deletes the object if it reaches zero
|
||||
*/
|
||||
void removeReference(void);
|
||||
|
||||
protected:
|
||||
//=======================================================================
|
||||
// implementation
|
||||
//=======================================================================
|
||||
/**
|
||||
* Looks up a character's category (i.e., its category for breaking purposes,
|
||||
* not its Unicode category)
|
||||
*/
|
||||
virtual int32_t lookupCategory(UChar c, BreakIterator* bi) const;
|
||||
|
||||
/**
|
||||
* Given a current state and a character category, looks up the
|
||||
* next state to transition to in the state table.
|
||||
*/
|
||||
virtual int32_t lookupState(int32_t state, int32_t category) const;
|
||||
|
||||
/**
|
||||
* Given a current state and a character category, looks up the
|
||||
* next state to transition to in the backwards state table.
|
||||
*/
|
||||
virtual int32_t lookupBackwardState(int32_t state, int32_t category) const;
|
||||
|
||||
/**
|
||||
* Returns true if the specified state is an accepting state.
|
||||
*/
|
||||
virtual UBool isEndState(int32_t state) const;
|
||||
|
||||
/**
|
||||
* Returns true if the specified state is a lookahead state.
|
||||
*/
|
||||
virtual UBool isLookaheadState(int32_t state) const;
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
//
|
||||
// Print out state table and character classes.
|
||||
// For debugging only.
|
||||
//
|
||||
void debugDumpTables() const;
|
||||
#endif
|
||||
|
||||
friend class RuleBasedBreakIterator;
|
||||
friend class DictionaryBasedBreakIterator;
|
||||
};
|
||||
|
||||
/** Not-equal operator: the logical negation of the (virtual) operator==. */
inline UBool
RuleBasedBreakIteratorTables::operator!=(const RuleBasedBreakIteratorTables& that) const {
    return !(*this == that);
}
|
||||
|
||||
/** Returns a reference to the rule description held by this object. */
inline const UnicodeString&
RuleBasedBreakIteratorTables::getRules(void) const {
    return this->description;
}
|
||||
|
||||
inline void
|
||||
RuleBasedBreakIteratorTables::addReference(void) {
|
||||
++refCount;
|
||||
}
|
||||
|
||||
inline void
|
||||
RuleBasedBreakIteratorTables::removeReference(void) {
|
||||
if (--refCount <= 0)
|
||||
delete this;
|
||||
}
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load diff
|
@ -1,358 +0,0 @@
|
|||
/*
|
||||
* Copyright (C) {1999}, International Business Machines Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 12/15/99 rgillam Port from Java.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#ifndef RBBI_BLD_H
|
||||
#define RBBI_BLD_H
|
||||
|
||||
#include "rbbi.h"
|
||||
#include "rbbi_tbl.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "uvector.h"
|
||||
|
||||
class ExpressionList;
|
||||
|
||||
//=======================================================================
|
||||
// RuleBasedBreakIterator.Builder
|
||||
//=======================================================================
|
||||
/**
|
||||
* The Builder class has the job of constructing a RuleBasedBreakIterator from a
|
||||
* textual description. A Builder is constructed by RuleBasedBreakIterator's
|
||||
* constructor, which uses it to construct the iterator itself and then throws it
|
||||
* away.
|
||||
* <p>The construction logic is separated out into its own class for two primary
|
||||
* reasons:
|
||||
* <ul><li>The construction logic is quite complicated and large. Separating it
|
||||
* out into its own class means the code must only be loaded into memory while a
|
||||
* RuleBasedBreakIterator is being constructed, and can be purged after that.
|
||||
* <li>There is a fair amount of state that must be maintained throughout the
|
||||
* construction process that is not needed by the iterator after construction.
|
||||
* Separating this state out into another class prevents all of the functions that
|
||||
* construct the iterator from having to have really long parameter lists,
|
||||
* (hopefully) contributing to readability and maintainability.</ul>
|
||||
* <p>It'd be really nice if this could be an independent class rather than an
|
||||
* inner class, because that would shorten the source file considerably, but
|
||||
* making Builder an inner class of RuleBasedBreakIterator allows it direct access
|
||||
* to RuleBasedBreakIterator's private members, which saves us from having to
|
||||
* provide some kind of "back door" to the Builder class that could then also be
|
||||
* used by other classes.
|
||||
*/
|
||||
class RuleBasedBreakIteratorBuilder {
|
||||
|
||||
protected:
|
||||
/**
|
||||
* The iterator we're constructing.
|
||||
*/
|
||||
RuleBasedBreakIterator& iterator;
|
||||
|
||||
/**
|
||||
* The tables object for the iterator we're constructing.
|
||||
*/
|
||||
RuleBasedBreakIteratorTables* tables;
|
||||
|
||||
/**
|
||||
* A temporary place to hold the rules as they're being processed.
|
||||
*/
|
||||
UVector tempRuleList;
|
||||
|
||||
/**
|
||||
* A temporary holding place used for calculating the character categories.
|
||||
* This object contains UnicodeSet objects.
|
||||
*/
|
||||
UVector categories;
|
||||
|
||||
/**
|
||||
* The number of categories (and thus the number of columns in the finished state tables)
|
||||
*/
|
||||
int32_t numCategories;
|
||||
|
||||
/**
|
||||
* A table used to map parts of regexp text to lists of character categories,
|
||||
* rather than having to figure them out from scratch each time
|
||||
*/
|
||||
ExpressionList* expressions;
|
||||
|
||||
/**
|
||||
* A temporary holding place for the list of ignore characters
|
||||
*/
|
||||
UnicodeSet ignoreChars;
|
||||
|
||||
/**
|
||||
* A temporary holding place where the forward state table is built
|
||||
*/
|
||||
UVector tempStateTable;
|
||||
|
||||
/**
|
||||
* A list of all the states that have to be filled in with transitions to the
|
||||
* next state that is created. Used when building the state table from the
|
||||
* regular expressions.
|
||||
*/
|
||||
UVector decisionPointList;
|
||||
|
||||
/**
|
||||
* A UStack for holding decision point lists. This is used to handle nested
|
||||
* parentheses and braces in regexps.
|
||||
*/
|
||||
UStack decisionPointStack;
|
||||
|
||||
/**
|
||||
* A list of states that loop back on themselves. Used to handle .*?
|
||||
*/
|
||||
UVector loopingStates;
|
||||
|
||||
/**
|
||||
* Looping states actually have to be backfilled later in the process
|
||||
* than everything else. This is where a the list of states to backfill
|
||||
* is accumulated. This is also used to handle .*?
|
||||
*/
|
||||
UVector statesToBackfill;
|
||||
|
||||
/**
|
||||
* A list mapping pairs of state numbers for states that are to be combined
|
||||
* to the state number of the state representing their combination. Used
|
||||
* in the process of making the state table deterministic to prevent
|
||||
* infinite recursion.
|
||||
*/
|
||||
UVector mergeList;
|
||||
|
||||
/**
|
||||
* A flag that is used to indicate when the list of looping states can
|
||||
* be reset.
|
||||
*/
|
||||
UBool clearLoopingStates;
|
||||
|
||||
/**
|
||||
* A place where an error message can be stored if we get a parse error.
|
||||
* The error message is never displayed anywhere, so this is useful pretty
|
||||
* much only in conjunction with a debugger.
|
||||
*/
|
||||
UnicodeString errorMessage;
|
||||
|
||||
/**
|
||||
* A bit mask used to indicate a bit in the table's flags column that marks a
|
||||
* state as an accepting state.
|
||||
*/
|
||||
static const int32_t END_STATE_FLAG /*= 0x8000*/;
|
||||
|
||||
/**
|
||||
* A bit mask used to indicate a bit in the table's flags column that marks a
|
||||
* state as one the builder shouldn't loop to any looping states
|
||||
*/
|
||||
static const int32_t DONT_LOOP_FLAG /*= 0x4000*/;
|
||||
|
||||
/**
|
||||
* A bit mask used to indicate a bit in the table's flags column that marks a
|
||||
* state as a lookahead state.
|
||||
*/
|
||||
static const int32_t LOOKAHEAD_STATE_FLAG /*= 0x2000*/;
|
||||
|
||||
/**
|
||||
* A bit mask representing the union of the mask values listed above.
|
||||
* Used for clearing or masking off the flag bits.
|
||||
*/
|
||||
static const int32_t ALL_FLAGS /*= END_STATE_FLAG | LOOKAHEAD_STATE_FLAG
|
||||
| DONT_LOOP_FLAG*/;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* The Builder class contains a reference to the iterator it's supposed to build.
|
||||
*/
|
||||
RuleBasedBreakIteratorBuilder(RuleBasedBreakIterator& iteratorToBuild);
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
*/
|
||||
~RuleBasedBreakIteratorBuilder();
|
||||
|
||||
/**
|
||||
* This is the main function for setting up the BreakIterator's tables. It
|
||||
* just vectors different parts of the job off to other functions.
|
||||
*/
|
||||
virtual void buildBreakIterator(const UnicodeString& description,
|
||||
UErrorCode& err);
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* Thus function has three main purposes:
|
||||
* <ul><li>Perform general syntax checking on the description, so the rest of the
|
||||
* build code can assume that it's parsing a legal description.
|
||||
* <li>Split the description into separate rules
|
||||
* <li>Perform variable-name substitutions (so that no one else sees variable names)
|
||||
* </ul>
|
||||
*/
|
||||
virtual void buildRuleList(UnicodeString& description,
|
||||
UErrorCode& err);
|
||||
|
||||
protected:
|
||||
|
||||
/**
|
||||
* This function performs variable-name substitutions. First it does syntax
|
||||
* checking on the variable-name definition. If it's syntactically valid, it
|
||||
* then goes through the remainder of the description and does a simple
|
||||
* find-and-replace of the variable name with its text. (The variable text
|
||||
* must be enclosed in either [] or () for this to work.)
|
||||
*/
|
||||
virtual void processSubstitution(UnicodeString& description,
|
||||
int32_t ruleStart,
|
||||
int32_t ruleEnd,
|
||||
int32_t startPos,
|
||||
UErrorCode& err);
|
||||
|
||||
/**
|
||||
* This function defines a protocol for handling substitution names that
|
||||
* are "special," i.e., that have some property beyond just being
|
||||
* substitutions. At the RuleBasedBreakIterator level, we have one
|
||||
* special substitution name, "<ignore>". Subclasses can override this
|
||||
* function to add more. Any special processing that has to go on beyond
|
||||
* that which is done by the normal substitution-processing code is done
|
||||
* here.
|
||||
*/
|
||||
virtual void handleSpecialSubstitution(const UnicodeString& replace,
|
||||
const UnicodeString& replaceWith,
|
||||
int32_t startPos,
|
||||
const UnicodeString& description,
|
||||
UErrorCode& err);
|
||||
|
||||
/**
|
||||
* This function provides a hook for subclasses to mess with the character
|
||||
* category table.
|
||||
*/
|
||||
virtual void mungeExpressionList();
|
||||
|
||||
/**
|
||||
* This function builds the character category table. On entry,
|
||||
* tempRuleList is a UVector of break rules that has had variable names substituted.
|
||||
* On exit, the charCategoryTable data member has been initialized to hold the
|
||||
* character category table, and tempRuleList's rules have been munged to contain
|
||||
* character category numbers everywhere a literal character or a [] expression
|
||||
* originally occurred.
|
||||
*/
|
||||
virtual void buildCharCategories(UErrorCode& err);
|
||||
|
||||
private:
|
||||
|
||||
/**
|
||||
* This is the function that builds the forward state table. Most of the real
|
||||
* work is done in parseRule(), which is called once for each rule in the
|
||||
* description.
|
||||
*/
|
||||
virtual void buildStateTable(UErrorCode& err);
|
||||
|
||||
/**
|
||||
* This is where most of the work really happens. This routine parses a single
|
||||
* rule in the rule description, adding and modifying states in the state
|
||||
* table according to the new expression. The state table is kept deterministic
|
||||
* throughout the whole operation, although some ugly postprocessing is needed
|
||||
* to handle the *? token.
|
||||
*/
|
||||
virtual void parseRule(const UnicodeString& rule,
|
||||
UBool forward);
|
||||
|
||||
/**
|
||||
* Update entries in the state table, and merge states when necessary to keep
|
||||
* the table deterministic.
|
||||
* @param rows The list of rows that need updating (the decision point list)
|
||||
* @param pendingChars A character category list, encoded in a String. This is the
|
||||
* list of the columns that need updating.
|
||||
* @param newValue Update the cells specfied above to contain this value
|
||||
*/
|
||||
virtual void updateStateTable(const UVector& rows,
|
||||
const UnicodeString& pendingChars,
|
||||
int16_t newValue);
|
||||
|
||||
/**
|
||||
* The real work of making the state table deterministic happens here. This function
|
||||
* merges a state in the state table (specified by rowNum) with a state that is
|
||||
* passed in (newValues). The basic process is to copy the nonzero cells in newStates
|
||||
* into the state in the state table (we'll call that oldValues). If there's a
|
||||
* collision (i.e., if the same cell has a nonzero value in both states, and it's
|
||||
* not the SAME value), then we have to reconcile the collision. We do this by
|
||||
* creating a new state, adding it to the end of the state table, and using this
|
||||
* function recursively to merge the original two states into a single, combined
|
||||
* state. This process may happen recursively (i.e., each successive level may
|
||||
* involve collisions). To prevent infinite recursion, we keep a log of merge
|
||||
* operations. Any time we're merging two states we've merged before, we can just
|
||||
* supply the row number for the result of that merge operation rather than creating
|
||||
* a new state just like it.
|
||||
* @param rowNum The row number in the state table of the state to be updated
|
||||
* @param newValues The state to merge it with.
|
||||
* @param rowsBeingUpdated A copy of the list of rows passed to updateStateTable()
|
||||
* (itself a copy of the decision point list from parseRule()). Newly-created
|
||||
* states get added to the decision point list if their "parents" were on it.
|
||||
*/
|
||||
virtual void mergeStates(int32_t rowNum,
|
||||
int16_t* newValues,
|
||||
const UVector& rowsBeingUpdated);
|
||||
|
||||
/**
|
||||
* The merge list is a list of pairs of rows that have been merged somewhere in
|
||||
* the process of building this state table, along with the row number of the
|
||||
* row containing the merged state. This function looks up a pair of row numbers
|
||||
* and returns the row number of the row they combine into. (It returns 0 if
|
||||
* this pair of rows isn't in the merge list.)
|
||||
*/
|
||||
virtual int32_t searchMergeList(int32_t a, int32_t b);
|
||||
|
||||
/**
|
||||
* This function is used to update the list of current loooping states (i.e.,
|
||||
* states that are controlled by a *? construct). It backfills values from
|
||||
* the looping states into unpopulated cells of the states that are currently
|
||||
* marked for backfilling, and then updates the list of looping states to be
|
||||
* the new list
|
||||
* @param newLoopingStates The list of new looping states
|
||||
* @param endStates The list of states to treat as end states (states that
|
||||
* can exit the loop).
|
||||
*/
|
||||
virtual void setLoopingStates(const UVector* newLoopingStates,
|
||||
const UVector& endStates);
|
||||
|
||||
/**
|
||||
* This removes "ending states" and states reachable from them from the
|
||||
* list of states to backfill.
|
||||
* @param The row number of the state to remove from the backfill list
|
||||
*/
|
||||
virtual void eliminateBackfillStates(int32_t baseState);
|
||||
|
||||
/**
|
||||
* This function completes the backfilling process by actually doing the
|
||||
* backfilling on the states that are marked for it
|
||||
*/
|
||||
virtual void backfillLoopingStates(void);
|
||||
|
||||
/**
|
||||
* This function completes the state-table-building process by doing several
|
||||
* postprocessing steps and copying everything into its final resting place
|
||||
* in the iterator itself
|
||||
* @param forward True if we're working on the forward state table
|
||||
*/
|
||||
virtual void finishBuildingStateTable(UBool forward);
|
||||
|
||||
/**
|
||||
* This function builds the backward state table from the forward state
|
||||
* table and any additional rules (identified by the ! on the front)
|
||||
* supplied in the description
|
||||
*/
|
||||
virtual void buildBackwardsStateTable(UErrorCode& err);
|
||||
|
||||
protected:
|
||||
|
||||
/**
|
||||
* Throws an IllegalArgumentException representing a syntax error in the rule
|
||||
* description. The exception's message contains some debugging information.
|
||||
* @param message A message describing the problem
|
||||
* @param position The position in the description where the problem was
|
||||
* discovered
|
||||
* @param context The string containing the error
|
||||
*/
|
||||
virtual void setUpErrorMessage(const UnicodeString& message,
|
||||
int32_t position,
|
||||
const UnicodeString& context);
|
||||
};
|
||||
|
||||
#endif
|
|
@ -1,88 +0,0 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2000, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 03/14/00 aliu Creation.
|
||||
* 06/27/00 aliu Change from C++ class to C struct
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef PARSEERR_H
|
||||
#define PARSEERR_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
|
||||
/**
 * Number of UChar elements in each of the context buffers
 * (preContext and postContext) of a UParseError.
 * @draft ICU 2.0
 */
enum {
    U_PARSE_CONTEXT_LEN = 16
};
|
||||
|
||||
/**
|
||||
* A UParseError struct is used to returned detailed information about
|
||||
* parsing errors. It is used by ICU parsing engines that parse long
|
||||
* rules, patterns, or programs, where the text being parsed is long
|
||||
* enough that more information than a UErrorCode is needed to
|
||||
* localize the error.
|
||||
*
|
||||
* <p>The code field is an integer error code specific to each parsing
|
||||
* engine, but globally unique. See the engine header file for
|
||||
* possible values. The line, offset, and context fields are
|
||||
* optional; parsing engines may choose not to use to use them.
|
||||
*
|
||||
* <p>Examples of engines which use UParseError (or may use it in the
|
||||
* future) are RuleBasedTransliterator and RuleBasedBreakIterator.
|
||||
*
|
||||
* @draft ICU 2.0
|
||||
*/
|
||||
typedef struct _UParseError {
|
||||
|
||||
/**
|
||||
* An integer indicating the type of error. If no error was
|
||||
* encountered, the parse engine sets this to zero, and the
|
||||
* other fields' values should be ignored.
|
||||
*
|
||||
* <p>Each parse engine should use a range of codes from
|
||||
* 0xNNNN0001 to 0xNNNNFFFF, where NNNN is a 16-bit integer
|
||||
* between 0x0001 and 0xFFFF unique to each parse engine.
|
||||
* Parse engines should define the enum PARSE_ERROR_BASE
|
||||
* to be 0xNNNN0000.
|
||||
*/
|
||||
/*int32_t code; */
|
||||
|
||||
/**
|
||||
* The line on which the error occured. If the parse engine
|
||||
* is not using this field, it should set it to zero. Otherwise
|
||||
* it should be a positive integer. The default value of this field
|
||||
* is -1. It will be set to 0 if the code populating this struct is not
|
||||
* using line numbers.
|
||||
*/
|
||||
int32_t line;
|
||||
|
||||
/**
|
||||
* The character offset to the error. If the line field is
|
||||
* being used, then this offset is from the start of the line.
|
||||
* If the line field is not being used, then this offset is from
|
||||
* the start of the text.The default value of this field
|
||||
* is -1. It will be set to appropriate value by the code that
|
||||
* populating the struct.
|
||||
*/
|
||||
int32_t offset;
|
||||
|
||||
/**
|
||||
* Textual context before the error. Null-terminated.
|
||||
* May be the empty string if not implemented by parser.
|
||||
*/
|
||||
UChar preContext[U_PARSE_CONTEXT_LEN];
|
||||
|
||||
/**
|
||||
* Textual context after the error. Null-terminated.
|
||||
* May be the empty string if not implemented by parser.
|
||||
*/
|
||||
UChar postContext[U_PARSE_CONTEXT_LEN];
|
||||
|
||||
} UParseError;
|
||||
|
||||
#endif
|
Loading…
Add table
Reference in a new issue