ICU-1076 initial limited support for Kleene star and plus operators

X-SVN-Rev: 5359
This commit is contained in:
Alan Liu 2001-07-27 00:18:53 +00:00
parent 40bfe95d06
commit ef8c73fc7c
26 changed files with 663 additions and 136 deletions

View file

@ -72,7 +72,7 @@ unifltlg.o unirange.o uniset.o unitohex.o unum.o \
dbbi.o dbbi_tbl.o rbbi.o rbbi_tbl.o brkdict.o nultrans.o jamohang.o hangjamo.o \
remtrans.o utrans.o \
titletrn.o tolowtrn.o toupptrn.o xformtrn.o name2uni.o uni2name.o nortrans.o \
unifilt.o
unifilt.o quant.o strmatch.o
STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O))

View file

@ -198,6 +198,10 @@ SOURCE=.\numfmt.cpp
# End Source File
# Begin Source File
SOURCE=.\quant.cpp
# End Source File
# Begin Source File
SOURCE=.\rbbi.cpp
# End Source File
# Begin Source File
@ -242,6 +246,10 @@ SOURCE=.\sortkey.cpp
# End Source File
# Begin Source File
SOURCE=.\strmatch.cpp
# End Source File
# Begin Source File
SOURCE=.\tblcoll.cpp
# End Source File
# Begin Source File
@ -1029,6 +1037,10 @@ InputPath=.\unicode\parsepos.h
# End Source File
# Begin Source File
SOURCE=.\quant.h
# End Source File
# Begin Source File
SOURCE=.\unicode\rbbi.h
!IF "$(CFG)" == "i18n - Win32 Release"
@ -1188,6 +1200,10 @@ InputPath=.\unicode\sortkey.h
# End Source File
# Begin Source File
SOURCE=.\strmatch.h
# End Source File
# Begin Source File
SOURCE=.\unicode\tblcoll.h
!IF "$(CFG)" == "i18n - Win32 Release"

View file

@ -0,0 +1,80 @@
/*
* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 07/26/01 aliu Creation.
**********************************************************************
*/
#include "quant.h"
/**
 * Construct a quantifier around an existing matcher.
 * @param adopted  the matcher to repeat; this object takes ownership
 *                 and deletes it in its destructor.  Expected non-null.
 * @param minCount the fewest repetitions that count as a match
 * @param maxCount the most repetitions that will be attempted;
 *                 expected to be >= minCount
 */
Quantifier::Quantifier(UnicodeMatcher *adopted,
                       uint32_t minCount, uint32_t maxCount)
    : matcher(adopted),
      minCount(minCount),
      maxCount(maxCount)
{
    // Preconditions (not enforced at run time):
    // assert(adopted != 0);
    // assert(minCount <= maxCount);
}
/**
 * Copy constructor.  Makes a deep copy: the new object owns its own
 * clone of the other object's matcher, so the two quantifiers can be
 * destroyed independently.
 */
Quantifier::Quantifier(const Quantifier& o) :
    matcher(o.matcher->clone()),
    minCount(o.minCount),
    maxCount(o.maxCount) {
    // Do NOT delete matcher here: it was just cloned above and is owned
    // by this object until the destructor runs.  Deleting it would leave
    // a dangling pointer that matches() dereferences and that the
    // destructor would delete a second time.
}
/**
 * Destructor.  Releases the owned matcher.
 */
Quantifier::~Quantifier()
{
    delete matcher;
}
/**
 * Implement UnicodeMatcher.
 * @return a newly allocated copy of this quantifier, produced by the
 * copy constructor.  The caller receives the new object.
 */
UnicodeMatcher* Quantifier::clone() const {
    Quantifier* duplicate = new Quantifier(*this);
    return duplicate;
}
/**
 * Implement UnicodeMatcher.  Applies the owned matcher repeatedly,
 * starting at offset, until it stops returning U_MATCH or maxCount
 * repetitions have been consumed.
 *
 * @param text        the text being matched
 * @param offset      in/out position; passed by reference to the owned
 *                    matcher on every repetition.  It is reset to its
 *                    entry value only when U_MISMATCH is returned.
 * @param limit       match limit, forwarded to the owned matcher
 * @param incremental forwarded to the owned matcher; when TRUE a
 *                    partial match is reported as soon as it is seen,
 *                    or when the repetitions end exactly at limit
 * @return U_PARTIAL_MATCH, U_MATCH (at least minCount repetitions
 *         succeeded) or U_MISMATCH
 */
UMatchDegree Quantifier::matches(const Replaceable& text,
                                 int32_t& offset,
                                 int32_t limit,
                                 UBool incremental) const {
    const int32_t entryOffset = offset;
    uint32_t repetitions = 0;

    // The increment only runs after a successful repetition ("continue");
    // every other outcome leaves the loop without counting.
    for (; repetitions < maxCount; ++repetitions) {
        const UMatchDegree degree =
            matcher->matches(text, offset, limit, incremental);
        if (degree == U_MATCH) {
            continue;
        }
        if (incremental && degree == U_PARTIAL_MATCH) {
            return U_PARTIAL_MATCH;
        }
        break;
    }

    // More input could still extend the run of repetitions.
    if (incremental && offset == limit) {
        return U_PARTIAL_MATCH;
    }

    if (count_ok(repetitions, minCount)) {
        return U_MATCH;
    }

    offset = entryOffset;
    return U_MISMATCH;
}
/**
* Implement UnicodeMatcher.
*
* NOT YET IMPLEMENTED: this currently returns result untouched; it
* neither clears it nor appends any representation of the quantified
* matcher.  escapeUnprintable is ignored.
* @param result the string intended to receive the pattern
* @param escapeUnprintable currently unused
* @return result, unchanged
*/
UnicodeString& Quantifier::toPattern(UnicodeString& result,
UBool escapeUnprintable) const {
// TODO finish this: emit the owned matcher's pattern followed by the
// quantifier operator
return result;
}
/**
 * Implement UnicodeMatcher.
 * @param v the index value (low byte of a character) being tested
 * @return TRUE when minCount is zero -- the answer then does not
 * depend on the owned matcher at all -- otherwise whatever the owned
 * matcher reports for v.
 */
UBool Quantifier::matchesIndexValue(uint8_t v) const {
    if (minCount == 0) {
        return TRUE;
    }
    return matcher->matchesIndexValue(v) != 0;
}
//eof

57
icu4c/source/i18n/quant.h Normal file
View file

@ -0,0 +1,57 @@
/*
* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 07/26/01 aliu Creation.
**********************************************************************
*/
#ifndef QUANT_H
#define QUANT_H
#include "unicode/unimatch.h"
/**
 * A UnicodeMatcher that repeats another matcher between minCount and
 * maxCount times.  The repeated matcher is owned by this object.
 *
 * NOTE(review): a copy constructor and destructor are declared but no
 * assignment operator is; the compiler-generated one would copy the
 * owned pointer.  Avoid assigning Quantifier objects.
 */
class Quantifier : public UnicodeMatcher {

 public:

    /**
     * @param adopted  matcher to repeat; ownership passes to this object
     * @param minCount minimum number of repetitions required for a match
     * @param maxCount maximum number of repetitions attempted
     */
    Quantifier(UnicodeMatcher *adopted,
               uint32_t minCount, uint32_t maxCount);

    /**
     * Copy constructor; clones the other object's matcher.
     */
    Quantifier(const Quantifier& o);

    /**
     * Destructor; deletes the owned matcher.
     */
    virtual ~Quantifier();

    /**
     * UnicodeMatcher API: return a newly allocated copy of this object.
     */
    virtual UnicodeMatcher* clone() const;

    /**
     * UnicodeMatcher API: match the owned matcher repeatedly against
     * text starting at offset.
     */
    virtual UMatchDegree matches(const Replaceable& text,
                                 int32_t& offset,
                                 int32_t limit,
                                 UBool incremental) const;

    /**
     * UnicodeMatcher API: produce a pattern string for this matcher.
     */
    virtual UnicodeString& toPattern(UnicodeString& result,
                                     UBool escapeUnprintable = FALSE) const;

    /**
     * UnicodeMatcher API: indexing support for rule-based
     * transliterators.
     */
    virtual UBool matchesIndexValue(uint8_t v) const;

 private:

    UnicodeMatcher* matcher; // owned

    uint32_t minCount;

    uint32_t maxCount;
};
#endif

View file

@ -13,7 +13,7 @@
#include "unicode/uniset.h"
TransliterationRuleData::TransliterationRuleData(UErrorCode& status) :
variableNames(0), setVariables(0) {
variableNames(0), variables(0) {
if (U_FAILURE(status)) {
return;
}
@ -21,14 +21,14 @@ TransliterationRuleData::TransliterationRuleData(UErrorCode& status) :
if (U_SUCCESS(status)) {
variableNames->setValueDeleter(uhash_deleteUnicodeString);
}
setVariables = 0;
setVariablesLength = 0;
variables = 0;
variablesLength = 0;
}
TransliterationRuleData::TransliterationRuleData(const TransliterationRuleData& other) :
ruleSet(other.ruleSet),
setVariablesBase(other.setVariablesBase),
setVariablesLength(other.setVariablesLength),
variablesBase(other.variablesBase),
variablesLength(other.variablesLength),
segmentBase(other.segmentBase) {
UErrorCode status = U_ZERO_ERROR;
@ -44,29 +44,29 @@ TransliterationRuleData::TransliterationRuleData(const TransliterationRuleData&
}
}
setVariables = 0;
if (other.setVariables != 0) {
setVariables = new UnicodeSet*[setVariablesLength];
for (int32_t i=0; i<setVariablesLength; ++i) {
setVariables[i] = new UnicodeSet(*other.setVariables[i]);
variables = 0;
if (other.variables != 0) {
variables = new UnicodeMatcher*[variablesLength];
for (int32_t i=0; i<variablesLength; ++i) {
variables[i] = other.variables[i]->clone();
}
}
}
TransliterationRuleData::~TransliterationRuleData() {
delete variableNames;
if (setVariables != 0) {
for (int32_t i=0; i<setVariablesLength; ++i) {
delete setVariables[i];
if (variables != 0) {
for (int32_t i=0; i<variablesLength; ++i) {
delete variables[i];
}
delete[] setVariables;
delete[] variables;
}
}
const UnicodeSet*
TransliterationRuleData::lookupSet(UChar32 standIn) const {
int32_t i = standIn - setVariablesBase;
return (i >= 0 && i < setVariablesLength) ? setVariables[i] : 0;
const UnicodeMatcher*
TransliterationRuleData::lookup(UChar32 standIn) const {
int32_t i = standIn - variablesBase;
return (i >= 0 && i < variablesLength) ? variables[i] : 0;
}
int32_t

View file

@ -11,7 +11,7 @@
#include "rbt_set.h"
class UnicodeString;
class UnicodeSet;
class UnicodeMatcher;
class Hashtable;
/**
@ -46,35 +46,35 @@ public:
* Map variable name (String) to variable (UnicodeString). A variable name
* corresponds to zero or more characters, stored in a UnicodeString in
* this hash. One or more of these chars may also correspond to a
* UnicodeSet, in which case the character in the UnicodeString in this hash is
* UnicodeMatcher, in which case the character in the UnicodeString in this hash is
* a stand-in: it is an index for a secondary lookup in
* data.setVariables. The stand-in also represents the UnicodeSet in
* data.variables. The stand-in also represents the UnicodeMatcher in
* the stored rules.
*/
Hashtable* variableNames;
/**
* Map category variable (UChar) to set (UnicodeSet).
* Map category variable (UChar) to set (UnicodeMatcher).
* Variables that correspond to a set of characters are mapped
* from variable name to a stand-in character in data.variableNames.
* The stand-in then serves as a key in this hash to lookup the
* actual UnicodeSet object. In addition, the stand-in is
* actual UnicodeMatcher object. In addition, the stand-in is
* stored in the rule text to represent the set of characters.
* setVariables[i] represents character (setVariablesBase + i).
* variables[i] represents character (variablesBase + i).
*/
UnicodeSet** setVariables;
UnicodeMatcher** variables;
/**
* The character that represents setVariables[0]. Characters
* setVariablesBase through setVariablesBase +
* setVariables.length - 1 represent UnicodeSet objects.
* The character that represents variables[0]. Characters
* variablesBase through variablesBase +
* variablesLength - 1 represent UnicodeMatcher objects.
*/
UChar setVariablesBase;
UChar variablesBase;
/**
* The length of setVariables.
* The length of variables.
*/
int32_t setVariablesLength;
int32_t variablesLength;
/**
* The character that represents segment 1. Characters segmentBase
@ -90,7 +90,11 @@ public:
~TransliterationRuleData();
const UnicodeSet* lookupSet(UChar32 standIn) const;
/**
* Given a stand-in character, return the UnicodeMatcher that it
* represents, or NULL.
*/
const UnicodeMatcher* lookup(UChar32 standIn) const;
/**
* Return the zero-based index of the segment represented by the given

View file

@ -7,19 +7,21 @@
* 11/17/99 aliu Creation.
**********************************************************************
*/
#include "rbt_pars.h"
#include "unicode/rbt.h"
#include "rbt_rule.h"
#include "unirange.h"
#include "rbt_data.h"
#include "unicode/uniset.h"
#include "cstring.h"
#include "unicode/parsepos.h"
#include "symtable.h"
#include "unicode/parseerr.h"
#include "hash.h"
#include "unicode/unicode.h"
#include "quant.h"
#include "rbt_data.h"
#include "rbt_pars.h"
#include "rbt_rule.h"
#include "strmatch.h"
#include "symtable.h"
#include "unirange.h"
#include "unicode/parseerr.h"
#include "unicode/parsepos.h"
#include "unicode/putil.h"
#include "unicode/rbt.h"
#include "unicode/unicode.h"
#include "unicode/uniset.h"
// Operators
#define VARIABLE_DEF_OP ((UChar)0x003D) /*=*/
@ -43,6 +45,8 @@
#define CURSOR_POS ((UChar)0x007C) /*|*/
#define CURSOR_OFFSET ((UChar)0x0040) /*@*/
#define ANCHOR_START ((UChar)0x005E) /*^*/
#define KLEENE_STAR ((UChar)0x002A) /***/
#define ONE_OR_MORE ((UChar)0x002B) /*+*/
// By definition, the ANCHOR_END special character is a
// trailing SymbolTable.SYMBOL_REF character.
@ -61,17 +65,17 @@ static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':'
/**
* This class implements the SymbolTable interface. It is used
* during parsing to give UnicodeSet access to variables that
* have been defined so far. Note that it uses setVariablesVector,
* have been defined so far. Note that it uses variablesVector,
* _not_ data.variables.
*/
class ParseData : public SymbolTable {
public:
const TransliterationRuleData* data; // alias
const UVector* setVariablesVector; // alias
const UVector* variablesVector; // alias
ParseData(const TransliterationRuleData* data = 0,
const UVector* setVariablesVector = 0);
const UVector* variablesVector = 0);
virtual const UnicodeString* lookup(const UnicodeString& s) const;
@ -83,7 +87,7 @@ public:
ParseData::ParseData(const TransliterationRuleData* d,
const UVector* sets) :
data(d), setVariablesVector(sets) {}
data(d), variablesVector(sets) {}
/**
* Implement SymbolTable API.
@ -99,11 +103,11 @@ const UnicodeSet* ParseData::lookupSet(UChar32 ch) const {
// Note that we cannot use data.lookup() because the
// set array has not been constructed yet.
const UnicodeSet* set = NULL;
int32_t i = ch - data->setVariablesBase;
if (i >= 0 && i < setVariablesVector->size()) {
int32_t i = ch - data->setVariablesBase;
set = (i < setVariablesVector->size()) ?
(UnicodeSet*) setVariablesVector->elementAt(i) : 0;
int32_t i = ch - data->variablesBase;
if (i >= 0 && i < variablesVector->size()) {
int32_t i = ch - data->variablesBase;
set = (i < variablesVector->size()) ?
(UnicodeSet*) variablesVector->elementAt(i) : 0;
}
return set;
}
@ -276,7 +280,7 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
if (escaped == (UChar32) -1) {
return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rule, start);
}
buf.append((UChar) escaped);
buf.append(escaped);
continue;
}
// Handle quoted matter
@ -431,6 +435,40 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
}
}
break;
case KLEENE_STAR:
case ONE_OR_MORE:
// Very limited initial implementation. Note that this
// works strangely for quotes and variables --
// 'foo'* => fo o*
// $a = foo; $a * => fo o*
// We will fix this later so that
// 'foo'* => (foo) *
// $a = foo; $a * => (foo) *
// Implement with hidden segments, perhaps at # 10+.
{
int32_t start, limit;
if (segments != 0 &&
segments->size() >= 2 &&
segments->size() % 2 == 0 &&
_voidPtr_to_int32(segments->elementAt(segments->size()-1)) == buf.length()) {
// The * immediately follows a segment
int32_t len = segments->size();
start = _voidPtr_to_int32(segments->elementAt(len - 2));
limit = _voidPtr_to_int32(segments->elementAt(len - 1));
segments->setElementAt(_int32_to_voidPtr(start+1), len-1);
} else {
// The * follows an isolated character
// (or quote, or variable reference)
start = buf.length() - 1;
limit = start + 1;
}
UnicodeMatcher *m =
new StringMatcher(buf, start, limit, *parser.data);
m = new Quantifier(m, (c == ONE_OR_MORE)?1:0, 0x7FFFFFFF);
buf.truncate(start);
buf.append(parser.generateStandInFor(m));
}
break;
// case SET_CLOSE:
default:
// Disallow unquoted characters other than [0-9A-Za-z]
@ -551,7 +589,7 @@ TransliteratorParser::TransliteratorParser(
UTransDirection theDirection,
UParseError* theParseError) :
rules(theRules), direction(theDirection), data(0), parseError(theParseError) {
parseData = new ParseData(0, &setVariablesVector);
parseData = new ParseData(0, &variablesVector);
}
/**
@ -589,7 +627,7 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
}
parseData->data = data;
setVariablesVector.removeAllElements();
variablesVector.removeAllElements();
if (parseError != 0) {
parseError->code = 0;
}
@ -668,16 +706,16 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
}
// Convert the set vector to an array
data->setVariablesLength = setVariablesVector.size();
data->setVariables = data->setVariablesLength == 0 ? 0 : new UnicodeSet*[data->setVariablesLength];
data->variablesLength = variablesVector.size();
data->variables = data->variablesLength == 0 ? 0 : new UnicodeMatcher*[data->variablesLength];
// orphanElement removes the given element and shifts all other
// elements down. For performance (and code clarity) we work from
// the end back to index 0.
int32_t i;
for (i=data->setVariablesLength; i>0; ) {
for (i=data->variablesLength; i>0; ) {
--i;
data->setVariables[i] =
(UnicodeSet*) setVariablesVector.orphanElementAt(i);
// Elements are stored in variablesVector as UnicodeMatcher* (see
// generateStandInFor) and are not necessarily UnicodeSets -- they may
// be Quantifiers -- so cast back to the type that was stored.
data->variables[i] =
    (UnicodeMatcher*) variablesVector.orphanElementAt(i);
}
// Index the rules
@ -894,14 +932,23 @@ int32_t TransliteratorParser::syntaxError(int32_t parseErrorCode,
UChar TransliteratorParser::parseSet(const UnicodeString& rule,
ParsePosition& pos) {
UnicodeSet* set = new UnicodeSet(rule, pos, *parseData, status);
set->compact();
return generateStandInFor(set);
}
/**
* Generate and return a stand-in for a new UnicodeMatcher. Store
* the matcher (adopt it).
*/
UChar TransliteratorParser::generateStandInFor(UnicodeMatcher* adopted) {
// assert(adopted != 0);
if (variableNext >= variableLimit) {
// throw new RuntimeException("Private use variables exhausted");
delete set;
delete adopted;
status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
set->compact();
setVariablesVector.addElement(set);
variablesVector.addElement(adopted);
return variableNext++;
}
@ -949,12 +996,12 @@ void TransliteratorParser::determineVariableRange(void) {
UnicodeRange* r = privateUse.largestUnusedSubrange(rules);
data->setVariablesBase = variableNext = variableLimit = (UChar) 0;
data->variablesBase = variableNext = variableLimit = (UChar) 0;
if (r != 0) {
// Allocate 9 characters for segment references 1 through 9
data->segmentBase = r->start;
data->setVariablesBase = variableNext = (UChar) (data->segmentBase + 9);
data->variablesBase = variableNext = (UChar) (data->segmentBase + 9);
variableLimit = (UChar) (r->start + r->length);
delete r;
}

View file

@ -13,7 +13,7 @@
#include "unicode/parseerr.h"
class TransliterationRuleData;
class UnicodeSet;
class UnicodeMatcher;
class ParseData;
class RuleHalf;
class ParsePosition;
@ -48,11 +48,11 @@ class TransliteratorParser {
ParseData* parseData;
/**
* Temporary vector of set variables. When parsing is complete, this
* is copied into the array data.setVariables. As with data.setVariables,
* element 0 corresponds to character data.setVariablesBase.
* Temporary vector of matcher variables. When parsing is complete, this
* is copied into the array data.variables. As with data.variables,
* element 0 corresponds to character data.variablesBase.
*/
UVector setVariablesVector;
UVector variablesVector;
/**
* The next available stand-in for variables. This starts at some point in
@ -169,6 +169,12 @@ private:
UChar parseSet(const UnicodeString& rule,
ParsePosition& pos);
/**
* Generate and return a stand-in for a new UnicodeMatcher. Store
* the matcher (adopt it).
*/
UChar generateStandInFor(UnicodeMatcher* adopted);
/**
* Append the value of the given variable name to the given
* UnicodeString.

View file

@ -161,16 +161,16 @@ void TransliterationRule::init(const UnicodeString& input,
this->segments = adoptedSegs;
// Find the position of the first segment index that is after the
// anteContext (in the key). Note that this may be a start or a
// limit index.
// limit index. If all segments are in the ante context,
// firstKeySeg should point past the last segment -- that is, it
// should point at the end marker, which is -1. This allows the
// code to back up by one to obtain the last ante context segment.
firstKeySeg = -1;
if (segments != 0) {
do {
++firstKeySeg;
} while (segments[firstKeySeg] >= 0 &&
segments[firstKeySeg] < anteContextLength);
if (segments[firstKeySeg] < 0) {
firstKeySeg = -1;
}
}
pattern = input;
@ -221,7 +221,7 @@ int16_t TransliterationRule::getIndexValue() const {
return -1;
}
UChar32 c = pattern.char32At(anteContextLength);
return (int16_t)(data.lookupSet(c) == NULL ? (c & 0xFF) : -1);
return (int16_t)(data.lookup(c) == NULL ? (c & 0xFF) : -1);
}
/**
@ -241,8 +241,9 @@ UBool TransliterationRule::matchesIndexValue(uint8_t v) const {
return TRUE;
}
UChar32 c = pattern.char32At(anteContextLength);
const UnicodeSet* set = data.lookupSet(c);
return set == NULL ? (uint8_t(c) == v) : set->containsIndexValue(v);
const UnicodeMatcher* matcher = data.lookup(c);
return matcher == NULL ? (uint8_t(c) == v) :
matcher->matchesIndexValue(v);
}
/**
@ -367,17 +368,21 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
// A mismatch in the ante context, or with the start anchor,
// is an outright U_MISMATCH regardless of whether we are
// incremental or not.
int32_t cursor = pos.start - UTF_CHAR_LENGTH(text.char32At(pos.start-1));
int32_t cursor = pos.start;
int32_t newStart = 0;
int32_t i;
// Backup cursor by one
if (cursor > 0) {
cursor -= UTF_CHAR_LENGTH(text.char32At(cursor-1));
} else {
--cursor;
}
for (i=anteContextLength-1; i>=0; --i) {
// Record the cursor for every segment boundary located at pattern
// index i, stepping backwards through the segments array.
while (i == nextSegPos) {
    segPos[iSeg] = cursor;
    // Must be an assignment: a comparison here leaves nextSegPos
    // unchanged, so the loop condition would stay true forever.
    nextSegPos = (--iSeg >= 0) ? segments[iSeg] : -1;
}
UChar keyChar = pattern.charAt(i);
const UnicodeSet* set = data.lookupSet(keyChar);
if (set == 0) {
const UnicodeMatcher* matcher = data.lookup(keyChar);
if (matcher == 0) {
if (cursor >= pos.contextStart &&
keyChar == text.charAt(cursor)) {
--cursor;
@ -386,7 +391,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
}
} else {
// Subtract 1 from contextStart to make it a reverse limit
if (set->matches(text, cursor, pos.contextStart-1, FALSE)
if (matcher->matches(text, cursor, pos.contextStart-1, FALSE)
!= U_MATCH) {
return U_MISMATCH;
}
@ -395,6 +400,15 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
// Record the position of the cursor
newStart = cursor;
}
while (nextSegPos == i) {
segPos[iSeg] = cursor;
if (cursor >= 0) {
segPos[iSeg] += UTF_CHAR_LENGTH(text.char32At(cursor));
} else {
++segPos[iSeg];
}
nextSegPos = (--iSeg >= 0) ? segments[iSeg] : -1;
}
}
// ------------------------ Start Anchor ------------------------
@ -405,8 +419,15 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
// -------------------- Key and Post Context --------------------
// YUCKY OPTIMIZATION. To make things a minuscule amount faster,
// subtract anteContextLength from all segments[i] with i >=
// firstKeySeg. Then we don't have to do so here. I only mention
// this here in order to say DO NOT DO THIS. The gain is
// minuscule (how long does an integer subtraction take?) and the
// increase in confusion isn't worth it.
iSeg = firstKeySeg;
nextSegPos = (iSeg >= 0) ? segments[iSeg] : -1;
nextSegPos = (iSeg >= 0) ? (segments[iSeg] - anteContextLength) : -1;
i = 0;
cursor = pos.start;
@ -424,14 +445,14 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
}
while (i == nextSegPos) {
segPos[iSeg] = cursor;
nextSegPos = segments[++iSeg];
nextSegPos = segments[++iSeg] - anteContextLength;
}
if (i == keyLength) {
keyLimit = cursor;
}
UChar keyChar = pattern.charAt(anteContextLength + i++);
const UnicodeSet* set = data.lookupSet(keyChar);
if (set == 0) {
const UnicodeMatcher* matcher = data.lookup(keyChar);
if (matcher == 0) {
// Don't need the cursor < pos.contextLimit check if
// incremental is TRUE (because it's done above); do need
// it otherwise.
@ -443,7 +464,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
}
} else {
UMatchDegree m =
set->matches(text, cursor, pos.contextLimit, incremental);
matcher->matches(text, cursor, pos.contextLimit, incremental);
if (m != U_MATCH) {
return m;
}
@ -451,7 +472,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
}
while (i == nextSegPos) {
segPos[iSeg] = cursor;
nextSegPos = segments[++iSeg];
nextSegPos = segments[++iSeg] - anteContextLength;
}
if (i == keyLength) {
keyLimit = cursor;
@ -686,11 +707,11 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
}
UChar c = pattern.charAt(i);
const UnicodeSet *set = data.lookupSet(c);
if (set == 0) {
const UnicodeMatcher *matcher = data.lookup(c);
if (matcher == 0) {
_appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
} else {
_appendToRule(rule, set->toPattern(str, escapeUnprintable),
_appendToRule(rule, matcher->toPattern(str, escapeUnprintable),
TRUE, escapeUnprintable, quoteBuf);
}
}

View file

@ -0,0 +1,128 @@
/*
* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 07/23/01 aliu Creation.
**********************************************************************
*/
#include "strmatch.h"
#include "rbt_data.h"
/**
 * Construct a matcher for a substring of theString.
 * @param theString source of the pattern text
 * @param start     first argument passed to extractBetween()
 * @param limit     second argument passed to extractBetween()
 * @param theData   rule data used later to resolve stand-in characters;
 *                  held by reference, not copied
 */
StringMatcher::StringMatcher(const UnicodeString& theString,
                             int32_t start,
                             int32_t limit,
                             const TransliterationRuleData& theData)
    : data(theData)
{
    // pattern starts out default-constructed and is filled in here.
    theString.extractBetween(start, limit, pattern);
}
/**
 * Construct a matcher whose pattern is a copy of the whole of
 * theString.
 * @param theString the pattern text
 * @param theData   rule data used later to resolve stand-in characters;
 *                  held by reference, not copied
 */
StringMatcher::StringMatcher(const UnicodeString& theString,
                             const TransliterationRuleData& theData)
    : pattern(theString),
      data(theData)
{
}
/**
 * Copy constructor.  Copies the pattern text and binds to the same
 * rule data object as the original.
 */
StringMatcher::StringMatcher(const StringMatcher& o)
    : pattern(o.pattern),
      data(o.data)
{
}
/**
 * Destructor.  Nothing to release explicitly: the pattern is a value
 * member and the rule data is only referenced.
 */
StringMatcher::~StringMatcher()
{
}
/**
 * Implement UnicodeMatcher.
 * @return a newly allocated copy of this matcher, produced by the copy
 * constructor.
 */
UnicodeMatcher* StringMatcher::clone() const {
    StringMatcher* duplicate = new StringMatcher(*this);
    return duplicate;
}
/**
 * Implement UnicodeMatcher.  Matches the pattern against text, one
 * pattern code unit at a time.  A pattern character that data.lookup()
 * resolves to a matcher is delegated to that matcher; any other
 * character must equal the text character at the cursor.
 *
 * The direction is chosen from the arguments: if limit < offset the
 * pattern is matched right-to-left with the cursor moving down toward
 * limit; otherwise it is matched left-to-right with the cursor moving
 * up toward limit.  In both directions limit is exclusive: the
 * character at index limit is never examined.
 *
 * @param text        the text being matched
 * @param offset      in/out position.  Updated to the final cursor only
 *                    when this function itself returns U_MATCH; on a
 *                    literal mismatch or a partial match detected here
 *                    it is left untouched.
 * @param limit       exclusive limit in the direction of the match
 * @param incremental if TRUE, running into limit before the pattern is
 *                    exhausted (forward direction) yields
 *                    U_PARTIAL_MATCH instead of U_MISMATCH
 * @return U_MATCH, U_PARTIAL_MATCH or U_MISMATCH; a non-U_MATCH result
 *         from a nested matcher is returned as is
 */
UMatchDegree StringMatcher::matches(const Replaceable& text,
                                    int32_t& offset,
                                    int32_t limit,
                                    UBool incremental) const {
    int32_t i;
    int32_t cursor = offset;
    if (limit < cursor) {
        // Reverse direction
        for (i=pattern.length()-1; i>=0; --i) {
            UChar keyChar = pattern.charAt(i);
            const UnicodeMatcher* subm = data.lookup(keyChar);
            if (subm == 0) {
                // limit is a reverse limit (one less than the lowest
                // index that may be examined), so the cursor must be
                // strictly greater than it.  Testing >= would read a
                // character outside the permitted range, or index -1
                // when limit is -1.
                if (cursor > limit &&
                    keyChar == text.charAt(cursor)) {
                    --cursor;
                } else {
                    return U_MISMATCH;
                }
            } else {
                UMatchDegree m =
                    subm->matches(text, cursor, limit, incremental);
                if (m != U_MATCH) {
                    return m;
                }
            }
        }
    } else {
        // Forward direction
        for (i=0; i<pattern.length(); ++i) {
            if (incremental && cursor == limit) {
                // We've reached the context limit without a mismatch and
                // without completing our match.
                return U_PARTIAL_MATCH;
            }
            UChar keyChar = pattern.charAt(i);
            const UnicodeMatcher* subm = data.lookup(keyChar);
            if (subm == 0) {
                // Don't need the cursor < limit check if
                // incremental is TRUE (because it's done above); do need
                // it otherwise.
                if (cursor < limit &&
                    keyChar == text.charAt(cursor)) {
                    ++cursor;
                } else {
                    return U_MISMATCH;
                }
            } else {
                UMatchDegree m =
                    subm->matches(text, cursor, limit, incremental);
                if (m != U_MATCH) {
                    return m;
                }
            }
        }
    }

    // Whole pattern consumed: publish the new position.
    offset = cursor;
    return U_MATCH;
}
/**
* Implement UnicodeMatcher.
*
* NOT YET IMPLEMENTED: the loop below visits each pattern index but
* does nothing with it, so result is returned exactly as it was passed
* in.  escapeUnprintable is ignored.
* @param result the string intended to receive the pattern
* @param escapeUnprintable currently unused
* @return result, unchanged
*/
UnicodeString& StringMatcher::toPattern(UnicodeString& result,
UBool escapeUnprintable) const {
for (int32_t i=0; i<pattern.length(); ++i) {
// TODO finish this: append a representation of pattern[i] to result
}
return result;
}
/**
 * Implement UnicodeMatcher.  Only the first code point of the pattern
 * is consulted.
 * @param v the index value (low byte of a character) being tested
 * @return TRUE for an empty pattern; otherwise, if the first code
 * point is a stand-in known to data, that matcher's answer; otherwise
 * whether the low byte of the first code point equals v.
 */
UBool StringMatcher::matchesIndexValue(uint8_t v) const {
    if (pattern.length() != 0) {
        const UChar32 first = pattern.char32At(0);
        const UnicodeMatcher *nested = data.lookup(first);
        if (nested != 0) {
            return nested->matchesIndexValue(v);
        }
        return (first & 0xFF) == v;
    }
    return TRUE;
}
//eof

View file

@ -0,0 +1,69 @@
/*
* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 07/23/01 aliu Creation.
**********************************************************************
*/
#ifndef STRMATCH_H
#define STRMATCH_H
#include "unicode/unistr.h"
#include "unicode/unimatch.h"
class TransliterationRuleData;
/**
 * A UnicodeMatcher that matches a fixed pattern string.  Characters of
 * the pattern that the associated TransliterationRuleData resolves to
 * other matchers are delegated to those matchers.
 *
 * The rule data is held by reference; it is not copied or owned.
 */
class StringMatcher : public UnicodeMatcher {

 public:

    /**
     * Construct a matcher for the portion of string selected by start
     * and limit.
     */
    StringMatcher(const UnicodeString& string,
                  int32_t start,
                  int32_t limit,
                  const TransliterationRuleData& data);

    /**
     * Construct a matcher for the whole of string.
     */
    StringMatcher(const UnicodeString& string,
                  const TransliterationRuleData& data);

    /**
     * Copy constructor.
     */
    StringMatcher(const StringMatcher& o);

    /**
     * Destructor.
     */
    virtual ~StringMatcher();

    /**
     * UnicodeMatcher API: return a newly allocated copy of this object.
     */
    virtual UnicodeMatcher* clone() const;

    /**
     * UnicodeMatcher API: match the pattern against text at offset.
     */
    virtual UMatchDegree matches(const Replaceable& text,
                                 int32_t& offset,
                                 int32_t limit,
                                 UBool incremental) const;

    /**
     * UnicodeMatcher API: produce a pattern string for this matcher.
     */
    virtual UnicodeString& toPattern(UnicodeString& result,
                                     UBool escapeUnprintable = FALSE) const;

    /**
     * UnicodeMatcher API: indexing support for rule-based
     * transliterators.
     */
    virtual UBool matchesIndexValue(uint8_t v) const;

 private:

    // The text to match; may contain stand-in characters.
    UnicodeString pattern;

    // Used to resolve stand-in characters to matchers; not owned.
    const TransliterationRuleData& data;
};
#endif

View file

@ -149,7 +149,7 @@ Transliterator::Transliterator(const Transliterator& other) :
maximumContextLength(other.maximumContextLength) {
if (other.filter != 0) {
// We own the filter, so we must have our own copy
filter = other.filter->clone();
filter = (UnicodeFilter*) other.filter->clone();
}
}
@ -160,7 +160,7 @@ Transliterator& Transliterator::operator=(const Transliterator& other) {
ID = other.ID;
maximumContextLength = other.maximumContextLength;
// MUST go through adoptFilter in case latter is overridden
adoptFilter((other.filter == 0) ? 0 : other.filter->clone());
adoptFilter((other.filter == 0) ? 0 : (UnicodeFilter*) other.filter->clone());
return *this;
}
@ -361,6 +361,25 @@ void Transliterator::_transliterate(Replaceable& text,
filteredTransliterate(text, index, TRUE);
#if 0
// I CAN'T DO what I'm attempting below now that the Kleene star
// operator is supported. For example, in the rule
// ([:Lu:]+) { x } > $1;
// what is the maximum context length? getMaximumContextLength()
// will return 1, but this is just the length of the ante context
// part of the pattern string -- 1 character, which is a standin
// for a Quantifier, which contains a StringMatcher, which
// contains a UnicodeSet.
// There is a complicated way to make this work again, and that's
// to add a "maximum left context" protocol into the
// UnicodeMatcher hierarchy. At present I'm not convinced this is
// worth it.
// ---
// The purpose of the code below is to keep the context small
// while doing incremental transliteration. When part of the left
// context (between contextStart and start) is no longer needed,
@ -373,6 +392,7 @@ void Transliterator::_transliterate(Replaceable& text,
newCS -= UTF_CHAR_LENGTH(text.char32At(newCS)) - 1;
}
index.contextStart = uprv_max(newCS, originalStart);
#endif
}
/**

View file

@ -38,12 +38,15 @@ public:
virtual UBool contains(UChar32 c) const = 0;
/**
* Returns a copy of this object. All UnicodeFilter objects have
* to support cloning in order to allow classes using
* UnicodeFilters, such as Transliterator, to implement cloning.
* @draft
* UnicodeMatcher API. This class stubs this out.
*/
virtual UnicodeFilter* clone() const = 0;
UnicodeString& toPattern(UnicodeString& result,
UBool escapeUnprintable) const;
/**
* UnicodeMatcher API. This class stubs this out.
*/
UBool matchesIndexValue(uint8_t v) const;
/**
* Implement UnicodeMatcher API.

View file

@ -11,6 +11,7 @@
#include "unicode/utypes.h"
class Replaceable;
class UnicodeString;
/**
* Constants returned by <code>UnicodeMatcher::matches()</code>
@ -59,6 +60,13 @@ public:
*/
virtual ~UnicodeMatcher();
/**
* Returns a copy of this object. All UnicodeMatcher objects have
* to support cloning in order to allow classes using
* UnicodeMatchers to implement cloning.
*/
virtual UnicodeMatcher* clone() const = 0;
/**
* Return a UMatchDegree value indicating the degree of match for
* the given text at the given offset. Zero, one, or more
@ -106,6 +114,28 @@ public:
int32_t limit,
UBool incremental) const = 0;
/**
* Returns a string representation of this matcher. If the result of
* calling this function is passed to the appropriate parser, it
* will produce another matcher that is equal to this one.
* @param result the string to receive the pattern. Previous
* contents will be deleted.
* @param escapeUnprintable if TRUE then convert unprintable
* character to their hex escape representations, \uxxxx or
* \Uxxxxxxxx. Unprintable characters are those other than
* U+000A, U+0020..U+007E.
*/
virtual UnicodeString& toPattern(UnicodeString& result,
UBool escapeUnprintable = FALSE) const = 0;
/**
* Returns TRUE if this matcher will match a character c, where c
* & 0xFF == v, at offset, in the forward direction (with limit >
* offset). This is used by <tt>RuleBasedTransliterator</tt> for
* indexing.
*/
virtual UBool matchesIndexValue(uint8_t v) const = 0;
protected:
UnicodeMatcher();

View file

@ -365,12 +365,12 @@ public:
UBool operator!=(const UnicodeSet& o) const;
/**
* Returns a copy of this object. All UnicodeFilter objects have
* Returns a copy of this object. All UnicodeMatcher objects have
* to support cloning in order to allow classes using
* UnicodeFilters, such as Transliterator, to implement cloning.
* UnicodeMatchers, such as Transliterator, to implement cloning.
* @draft
*/
virtual UnicodeFilter* clone() const;
virtual UnicodeMatcher* clone() const;
/**
* Returns the hash code value for this set.
@ -691,7 +691,7 @@ private:
* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
* indexing.
*/
UBool containsIndexValue(uint8_t v) const;
virtual UBool matchesIndexValue(uint8_t v) const;
private:

View file

@ -40,3 +40,16 @@ UMatchDegree UnicodeFilter::matches(const Replaceable& text,
}
return U_MISMATCH;
}
// Stub this out for filters that do not implement a pattern
UnicodeString& UnicodeFilter::toPattern(UnicodeString& result,
UBool escapeUnprintable) const {
return result;
}
// Stub this out for filters that do not implement indexing
UBool UnicodeFilter::matchesIndexValue(uint8_t v) const {
return FALSE;
}
//eof

View file

@ -22,7 +22,7 @@ public:
NullFilter(const NullFilter& f) : UnicodeFilter(f) { result = f.result; }
virtual ~NullFilter() {}
virtual UBool contains(UChar32 /*c*/) const { return result; }
virtual UnicodeFilter* clone() const { return new NullFilter(*this); }
virtual UnicodeMatcher* clone() const { return new NullFilter(*this); }
};
class UnicodeNotFilter : public UnicodeFilter {
@ -32,15 +32,15 @@ public:
UnicodeNotFilter(const UnicodeNotFilter&);
virtual ~UnicodeNotFilter();
virtual UBool contains(UChar32 c) const;
virtual UnicodeFilter* clone() const;
virtual UnicodeMatcher* clone() const;
};
UnicodeNotFilter::UnicodeNotFilter(UnicodeFilter* adopted) : filt(adopted) {}
UnicodeNotFilter::UnicodeNotFilter(const UnicodeNotFilter& f)
: UnicodeFilter(f), filt(f.filt->clone()) {}
: UnicodeFilter(f), filt((UnicodeFilter*) f.filt->clone()) {}
UnicodeNotFilter::~UnicodeNotFilter() { delete filt; }
UBool UnicodeNotFilter::contains(UChar32 c) const { return !filt->contains(c); }
UnicodeFilter* UnicodeNotFilter::clone() const { return new UnicodeNotFilter(*this); }
UnicodeMatcher* UnicodeNotFilter::clone() const { return new UnicodeNotFilter(*this); }
/**
* Returns a <tt>UnicodeFilter</tt> that implements the inverse of
@ -50,7 +50,7 @@ UnicodeFilter* UnicodeFilterLogic::createNot(const UnicodeFilter* f) {
if (f == 0) {
return new NullFilter(FALSE);
} else {
return new UnicodeNotFilter(f->clone());
return new UnicodeNotFilter((UnicodeFilter*)f->clone());
}
}
@ -62,15 +62,15 @@ public:
UnicodeAndFilter(const UnicodeAndFilter&);
virtual ~UnicodeAndFilter();
virtual UBool contains(UChar32 c) const;
virtual UnicodeFilter* clone() const;
virtual UnicodeMatcher* clone() const;
};
/**
 * Constructs a conjunction of two filters.  Both pointers are stored
 * directly (in <code>filt1</code> and <code>filt2</code>); no copies
 * are made.
 */
UnicodeAndFilter::UnicodeAndFilter(UnicodeFilter* f1, UnicodeFilter* f2)
    : filt1(f1),
      filt2(f2)
{
}
UnicodeAndFilter::UnicodeAndFilter(const UnicodeAndFilter& f)
: UnicodeFilter(f), filt1(f.filt1->clone()), filt2(f.filt2->clone()) {}
: UnicodeFilter(f), filt1((UnicodeFilter*)f.filt1->clone()), filt2((UnicodeFilter*)f.filt2->clone()) {}
/**
 * Destructor.  Deletes both operand filters, first <code>filt1</code>
 * and then <code>filt2</code>.
 */
UnicodeAndFilter::~UnicodeAndFilter()
{
    delete filt1;
    delete filt2;
}
/**
 * Returns true only when both operand filters contain <code>c</code>.
 * Evaluation short-circuits: <code>filt2</code> is consulted only if
 * <code>filt1</code> reports a match.
 */
UBool UnicodeAndFilter::contains(UChar32 c) const
{
    const UBool both = filt1->contains(c) && filt2->contains(c);
    return both;
}
UnicodeFilter* UnicodeAndFilter::clone() const { return new UnicodeAndFilter(*this); }
UnicodeMatcher* UnicodeAndFilter::clone() const { return new UnicodeAndFilter(*this); }
/**
* Returns a <tt>UnicodeFilter</tt> that implements a short
@ -84,12 +84,12 @@ UnicodeFilter* UnicodeFilterLogic::createAnd(const UnicodeFilter* f,
if (g == 0) {
return NULL;
}
return g->clone();
return (UnicodeFilter*)g->clone();
}
if (g == 0) {
return f->clone();
return (UnicodeFilter*)f->clone();
}
return new UnicodeAndFilter(f->clone(), g->clone());
return new UnicodeAndFilter((UnicodeFilter*)f->clone(), (UnicodeFilter*)g->clone());
}
class UnicodeOrFilter : public UnicodeFilter {
@ -100,15 +100,15 @@ public:
UnicodeOrFilter(const UnicodeOrFilter&);
virtual ~UnicodeOrFilter();
virtual UBool contains(UChar32 c) const;
virtual UnicodeFilter* clone() const;
virtual UnicodeMatcher* clone() const;
};
/**
 * Constructs a disjunction of two filters.  Both pointers are stored
 * directly (in <code>filt1</code> and <code>filt2</code>); no copies
 * are made.
 */
UnicodeOrFilter::UnicodeOrFilter(UnicodeFilter* f1, UnicodeFilter* f2)
    : filt1(f1),
      filt2(f2)
{
}
UnicodeOrFilter::UnicodeOrFilter(const UnicodeOrFilter& f)
: UnicodeFilter(f), filt1(f.filt1->clone()), filt2(f.filt2->clone()) {}
: UnicodeFilter(f), filt1((UnicodeFilter*)f.filt1->clone()), filt2((UnicodeFilter*)f.filt2->clone()) {}
/**
 * Destructor.  Deletes both operand filters, first <code>filt1</code>
 * and then <code>filt2</code>.
 */
UnicodeOrFilter::~UnicodeOrFilter()
{
    delete filt1;
    delete filt2;
}
/**
 * Returns true when at least one operand filter contains <code>c</code>.
 * Evaluation short-circuits: <code>filt2</code> is consulted only if
 * <code>filt1</code> reports no match.
 */
UBool UnicodeOrFilter::contains(UChar32 c) const
{
    const UBool either = filt1->contains(c) || filt2->contains(c);
    return either;
}
UnicodeFilter* UnicodeOrFilter::clone() const { return new UnicodeOrFilter(*this); }
UnicodeMatcher* UnicodeOrFilter::clone() const { return new UnicodeOrFilter(*this); }
/**
* Returns a <tt>UnicodeFilter</tt> that implements a short
@ -122,10 +122,10 @@ UnicodeFilter* UnicodeFilterLogic::createOr(const UnicodeFilter* f,
if (g == 0) {
return NULL;
}
return g->clone();
return (UnicodeFilter*)g->clone();
}
if (g == 0) {
return f->clone();
return (UnicodeFilter*)f->clone();
}
return new UnicodeOrFilter(f->clone(), g->clone());
return new UnicodeOrFilter((UnicodeFilter*)f->clone(), (UnicodeFilter*)g->clone());
}

View file

@ -228,11 +228,11 @@ UBool UnicodeSet::operator==(const UnicodeSet& o) const {
}
/**
* Returns a copy of this object. All UnicodeFilter objects have
* Returns a copy of this object. All UnicodeMatcher objects have
* to support cloning in order to allow classes using
* UnicodeFilters, such as Transliterator, to implement cloning.
* UnicodeMatchers, such as Transliterator, to implement cloning.
*/
UnicodeFilter* UnicodeSet::clone() const {
UnicodeMatcher* UnicodeSet::clone() const {
return new UnicodeSet(*this);
}
@ -547,7 +547,7 @@ UBool UnicodeSet::contains(UChar32 c) const {
* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
* indexing.
*/
UBool UnicodeSet::containsIndexValue(uint8_t v) const {
UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
/* The index value v, in the range [0,255], is contained in this set if
* it is contained in any pair of this set. Pairs either have the high
* bytes equal, or unequal. If the high bytes are equal, then we have

View file

@ -72,7 +72,7 @@ static void pseudoHandleTransliterate(const Transliterator* t,
* Used by TestConstruction() and TestTransliterate.
*/
class TestHangulFilter : public UnicodeFilter {
virtual UnicodeFilter* clone() const {
virtual UnicodeMatcher* clone() const {
return new TestHangulFilter(*this);
}
virtual UBool contains(UChar32 c) const {

View file

@ -56,7 +56,7 @@ void HexToUniTransliteratorTest::runIndexedTest( int32_t index, UBool exec, cons
* Used by TestConstruction() and TestTransliterate.
*/
class TestHexFilter : public UnicodeFilter {
virtual UnicodeFilter* clone() const {
virtual UnicodeMatcher* clone() const {
return new TestHexFilter(*this);
}
virtual UBool contains(UChar32 c) const {

View file

@ -70,7 +70,7 @@ static void pseudoHandleTransliterate(const Transliterator* t,
* Used by TestConstruction() and TestTransliterate.
*/
class TestJamoFilter : public UnicodeFilter {
virtual UnicodeFilter* clone() const {
virtual UnicodeMatcher* clone() const {
return new TestJamoFilter(*this);
}
virtual UBool contains(UChar32 c) const {

View file

@ -615,7 +615,7 @@ void TransliteratorAPITest::TestRegisterUnregister(){
* Used by TestFiltering().
*/
class TestFilter1 : public UnicodeFilter {
virtual UnicodeFilter* clone() const {
virtual UnicodeMatcher* clone() const {
return new TestFilter1(*this);
}
virtual UBool contains(UChar32 c) const {
@ -626,7 +626,7 @@ class TestFilter1 : public UnicodeFilter {
}
};
class TestFilter2 : public UnicodeFilter {
virtual UnicodeFilter* clone() const {
virtual UnicodeMatcher* clone() const {
return new TestFilter2(*this);
}
virtual UBool contains(UChar32 c) const {
@ -637,7 +637,7 @@ class TestFilter2 : public UnicodeFilter {
}
};
class TestFilter3 : public UnicodeFilter {
virtual UnicodeFilter* clone() const {
virtual UnicodeMatcher* clone() const {
return new TestFilter3(*this);
}
virtual UBool contains(UChar32 c) const {

View file

@ -68,6 +68,7 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
TESTCASE(32,TestToRules);
TESTCASE(33,TestContext);
TESTCASE(34,TestSupplemental);
TESTCASE(35,TestQuantifier);
default: name = ""; break;
}
}
@ -477,7 +478,7 @@ void TransliteratorTest::TestCompoundHex(void) {
* Used by TestFiltering().
*/
class TestFilter : public UnicodeFilter {
virtual UnicodeFilter* clone() const {
virtual UnicodeMatcher* clone() const {
return new TestFilter(*this);
}
virtual UBool contains(UChar32 c) const {
@ -1501,6 +1502,36 @@ void TransliteratorTest::TestSupplemental() {
CharsToUnicodeString("ky\\U00010400y\\U00010400m"));
}
/**
 * Exercises the '+' and '*' quantifier operators in transliterator
 * rules.  Every table row is { rule, input text, expected output } and
 * the rows are passed to expect() in order.
 */
void TransliteratorTest::TestQuantifier() {
    static const char* const CASES[][3] = {
        { "(ab)+ {x} > '(' $1 ')';",
          "x abx ababxy",
          "x ab(ab) abab(abab)y" },

        { "b+ > x;",
          "ac abc abbc abbbc",
          "ac axc axc axc" },

        { "[abc]+ > x;",
          "qac abrc abbcs abtbbc",
          "qx xrx xs xtx" },

        { "q{(ab)+} > x;",
          "qa qab qaba qababc qaba",
          "qa qx qxa qxc qxa" },

        { "q(ab)* > x;",
          "qa qab qaba qababc",
          "xa x xa xc" },

        // Oddity -- with "(foo)* > $1" the segment $1 matches the whole
        // run of "foo"s.  In perl, it only matches the first occurrence,
        // so the output there is "()a (ab) (ab)a (ab)c".
        { "q(ab)* > '(' $1 ')';",
          "qa qab qaba qababc",
          "()a (ab) (ab)a (abab)c" }
    };

    const int32_t caseCount =
        static_cast<int32_t>(sizeof(CASES) / sizeof(CASES[0]));
    for (int32_t i = 0; i < caseCount; ++i) {
        expect(CASES[i][0], CASES[i][1], CASES[i][2]);
    }
}
//======================================================================
// Support methods
//======================================================================

View file

@ -172,6 +172,8 @@ class TransliteratorTest : public IntlTest {
void TestSupplemental(void);
void TestQuantifier(void);
//======================================================================
// Support methods
//======================================================================

View file

@ -36,7 +36,7 @@ void UnicodeFilterLogicTest::runIndexedTest( int32_t index, UBool exec, const ch
}
class Filter1: public UnicodeFilter{
virtual UnicodeFilter* clone() const{
virtual UnicodeMatcher* clone() const{
return new Filter1(*this);
}
virtual UBool contains(UChar32 c) const {
@ -47,7 +47,7 @@ class Filter1: public UnicodeFilter{
}
};
class Filter2: public UnicodeFilter{
virtual UnicodeFilter* clone() const{
virtual UnicodeMatcher* clone() const{
return new Filter2(*this);
}
virtual UBool contains(UChar32 c) const {

View file

@ -68,7 +68,7 @@ static void pseudoHandleTransliterate(const Transliterator* t,
* Used by TestConstruction() and TestTransliterate.
*/
class TestUniFilter : public UnicodeFilter {
virtual UnicodeFilter* clone() const {
virtual UnicodeMatcher* clone() const {
return new TestUniFilter(*this);
}
virtual UBool contains(UChar32 c) const {