mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-1076 initial limited support for Kleene star and plus operators
X-SVN-Rev: 5359
This commit is contained in:
parent
40bfe95d06
commit
ef8c73fc7c
26 changed files with 663 additions and 136 deletions
|
@ -72,7 +72,7 @@ unifltlg.o unirange.o uniset.o unitohex.o unum.o \
|
|||
dbbi.o dbbi_tbl.o rbbi.o rbbi_tbl.o brkdict.o nultrans.o jamohang.o hangjamo.o \
|
||||
remtrans.o utrans.o \
|
||||
titletrn.o tolowtrn.o toupptrn.o xformtrn.o name2uni.o uni2name.o nortrans.o \
|
||||
unifilt.o
|
||||
unifilt.o quant.o strmatch.o
|
||||
|
||||
STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O))
|
||||
|
||||
|
|
|
@ -198,6 +198,10 @@ SOURCE=.\numfmt.cpp
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\quant.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\rbbi.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
@ -242,6 +246,10 @@ SOURCE=.\sortkey.cpp
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\strmatch.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\tblcoll.cpp
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
@ -1029,6 +1037,10 @@ InputPath=.\unicode\parsepos.h
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\quant.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unicode\rbbi.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
|
@ -1188,6 +1200,10 @@ InputPath=.\unicode\sortkey.h
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\strmatch.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unicode\tblcoll.h
|
||||
|
||||
!IF "$(CFG)" == "i18n - Win32 Release"
|
||||
|
|
80
icu4c/source/i18n/quant.cpp
Normal file
80
icu4c/source/i18n/quant.cpp
Normal file
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 07/26/01 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "quant.h"
|
||||
|
||||
/**
 * Construct a quantifier that repeats the given matcher.
 *
 * @param adopted  the matcher to be repeated.  This object takes
 *                 ownership of it and deletes it in its destructor.
 *                 Expected to be non-null (see the commented-out assert).
 * @param minCount the minimum number of repetitions needed for a match
 * @param maxCount the maximum number of repetitions that will be tried;
 *                 expected to be >= minCount (see the commented-out assert)
 */
Quantifier::Quantifier(UnicodeMatcher *adopted,
                       uint32_t minCount, uint32_t maxCount) :
    matcher(adopted),
    minCount(minCount),
    maxCount(maxCount) {
    // assert(adopted != 0);
    // assert(minCount <= maxCount);
}
|
||||
|
||||
/**
 * Copy constructor.  Performs a deep copy: the new quantifier owns its
 * own clone of the other quantifier's matcher and uses the same
 * repetition bounds.
 *
 * @param o the quantifier to copy
 */
Quantifier::Quantifier(const Quantifier& o) :
    matcher(o.matcher->clone()),
    minCount(o.minCount),
    maxCount(o.maxCount) {
    // Intentionally empty.  The freshly cloned matcher is owned by this
    // object and is released in ~Quantifier(); it must NOT be deleted
    // here, or this object would hold a dangling pointer and the
    // destructor would delete it a second time.
}
|
||||
|
||||
/**
 * Destructor.  Deletes the matcher that this quantifier owns.
 */
Quantifier::~Quantifier() {
    delete matcher;
}
|
||||
|
||||
/**
|
||||
* Implement UnicodeMatcher
|
||||
*/
|
||||
UnicodeMatcher* Quantifier::clone() const {
|
||||
return new Quantifier(*this);
|
||||
}
|
||||
|
||||
/**
 * Implement UnicodeMatcher.  Applies the underlying matcher repeatedly,
 * beginning at offset, until it stops matching or maxCount repetitions
 * have been made.
 *
 * @param text        the text to match against
 * @param offset      on input, the index at which to start matching.
 *                    On U_MATCH it is left wherever the underlying
 *                    matcher left it after the last repetition; on
 *                    U_MISMATCH it is restored to its input value.
 * @param limit       the limit index, passed through to the underlying
 *                    matcher
 * @param incremental passed through to the underlying matcher; if TRUE,
 *                    U_PARTIAL_MATCH may be returned
 * @return U_PARTIAL_MATCH if incremental is TRUE and either the
 *         underlying matcher reports a partial match or matching stopped
 *         exactly at limit; otherwise U_MATCH if at least minCount
 *         repetitions matched; otherwise U_MISMATCH.
 */
UMatchDegree Quantifier::matches(const Replaceable& text,
                                 int32_t& offset,
                                 int32_t limit,
                                 UBool incremental) const {
    int32_t start = offset;
    uint32_t count = 0;
    while (count < maxCount) {
        int32_t before = offset;
        UMatchDegree m = matcher->matches(text, offset, limit, incremental);
        if (m == U_MATCH) {
            ++count;
            if (offset == before) {
                // Zero-width match: the underlying matcher succeeded
                // without consuming anything, so every further attempt
                // would be an identical call with an identical result.
                // Record the outcome the loop would eventually reach
                // (count == maxCount) instead of spinning up to maxCount
                // (possibly 0x7FFFFFFF) times.
                count = maxCount;
                break;
            }
        } else if (incremental && m == U_PARTIAL_MATCH) {
            return U_PARTIAL_MATCH;
        } else {
            break;
        }
    }
    if (incremental && offset == limit) {
        // We ran into the limit; more text might extend the match.
        return U_PARTIAL_MATCH;
    }
    if (count >= minCount) {
        return U_MATCH;
    }
    // Too few repetitions: undo any progress and report failure.
    offset = start;
    return U_MISMATCH;
}
|
||||
|
||||
/**
|
||||
* Implement UnicodeMatcher
|
||||
*/
|
||||
UnicodeString& Quantifier::toPattern(UnicodeString& result,
|
||||
UBool escapeUnprintable) const {
|
||||
// TODO finish this
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Implement UnicodeMatcher
|
||||
*/
|
||||
UBool Quantifier::matchesIndexValue(uint8_t v) const {
|
||||
return (minCount == 0) || matcher->matchesIndexValue(v);
|
||||
}
|
||||
|
||||
//eof
|
57
icu4c/source/i18n/quant.h
Normal file
57
icu4c/source/i18n/quant.h
Normal file
|
@ -0,0 +1,57 @@
|
|||
/*
|
||||
* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 07/26/01 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef QUANT_H
|
||||
#define QUANT_H
|
||||
|
||||
#include "unicode/unimatch.h"
|
||||
|
||||
class Quantifier : public UnicodeMatcher {
|
||||
|
||||
public:
|
||||
|
||||
Quantifier(UnicodeMatcher *adopted,
|
||||
uint32_t minCount, uint32_t maxCount);
|
||||
|
||||
Quantifier(const Quantifier& o);
|
||||
|
||||
virtual ~Quantifier();
|
||||
|
||||
/**
|
||||
* Implement UnicodeMatcher
|
||||
*/
|
||||
virtual UnicodeMatcher* clone() const;
|
||||
|
||||
/**
|
||||
* Implement UnicodeMatcher
|
||||
*/
|
||||
virtual UMatchDegree matches(const Replaceable& text,
|
||||
int32_t& offset,
|
||||
int32_t limit,
|
||||
UBool incremental) const;
|
||||
|
||||
/**
|
||||
* Implement UnicodeMatcher
|
||||
*/
|
||||
virtual UnicodeString& toPattern(UnicodeString& result,
|
||||
UBool escapeUnprintable = FALSE) const;
|
||||
|
||||
/**
|
||||
* Implement UnicodeMatcher
|
||||
*/
|
||||
virtual UBool matchesIndexValue(uint8_t v) const;
|
||||
|
||||
private:
|
||||
|
||||
UnicodeMatcher* matcher; // owned
|
||||
|
||||
uint32_t minCount;
|
||||
|
||||
uint32_t maxCount;
|
||||
};
|
||||
|
||||
#endif
|
|
@ -13,7 +13,7 @@
|
|||
#include "unicode/uniset.h"
|
||||
|
||||
TransliterationRuleData::TransliterationRuleData(UErrorCode& status) :
|
||||
variableNames(0), setVariables(0) {
|
||||
variableNames(0), variables(0) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
@ -21,14 +21,14 @@ TransliterationRuleData::TransliterationRuleData(UErrorCode& status) :
|
|||
if (U_SUCCESS(status)) {
|
||||
variableNames->setValueDeleter(uhash_deleteUnicodeString);
|
||||
}
|
||||
setVariables = 0;
|
||||
setVariablesLength = 0;
|
||||
variables = 0;
|
||||
variablesLength = 0;
|
||||
}
|
||||
|
||||
TransliterationRuleData::TransliterationRuleData(const TransliterationRuleData& other) :
|
||||
ruleSet(other.ruleSet),
|
||||
setVariablesBase(other.setVariablesBase),
|
||||
setVariablesLength(other.setVariablesLength),
|
||||
variablesBase(other.variablesBase),
|
||||
variablesLength(other.variablesLength),
|
||||
segmentBase(other.segmentBase) {
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
@ -44,29 +44,29 @@ TransliterationRuleData::TransliterationRuleData(const TransliterationRuleData&
|
|||
}
|
||||
}
|
||||
|
||||
setVariables = 0;
|
||||
if (other.setVariables != 0) {
|
||||
setVariables = new UnicodeSet*[setVariablesLength];
|
||||
for (int32_t i=0; i<setVariablesLength; ++i) {
|
||||
setVariables[i] = new UnicodeSet(*other.setVariables[i]);
|
||||
variables = 0;
|
||||
if (other.variables != 0) {
|
||||
variables = new UnicodeMatcher*[variablesLength];
|
||||
for (int32_t i=0; i<variablesLength; ++i) {
|
||||
variables[i] = other.variables[i]->clone();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TransliterationRuleData::~TransliterationRuleData() {
|
||||
delete variableNames;
|
||||
if (setVariables != 0) {
|
||||
for (int32_t i=0; i<setVariablesLength; ++i) {
|
||||
delete setVariables[i];
|
||||
if (variables != 0) {
|
||||
for (int32_t i=0; i<variablesLength; ++i) {
|
||||
delete variables[i];
|
||||
}
|
||||
delete[] setVariables;
|
||||
delete[] variables;
|
||||
}
|
||||
}
|
||||
|
||||
const UnicodeSet*
|
||||
TransliterationRuleData::lookupSet(UChar32 standIn) const {
|
||||
int32_t i = standIn - setVariablesBase;
|
||||
return (i >= 0 && i < setVariablesLength) ? setVariables[i] : 0;
|
||||
const UnicodeMatcher*
|
||||
TransliterationRuleData::lookup(UChar32 standIn) const {
|
||||
int32_t i = standIn - variablesBase;
|
||||
return (i >= 0 && i < variablesLength) ? variables[i] : 0;
|
||||
}
|
||||
|
||||
int32_t
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
#include "rbt_set.h"
|
||||
|
||||
class UnicodeString;
|
||||
class UnicodeSet;
|
||||
class UnicodeMatcher;
|
||||
class Hashtable;
|
||||
|
||||
/**
|
||||
|
@ -46,35 +46,35 @@ public:
|
|||
* Map variable name (String) to variable (UnicodeString). A variable name
|
||||
* corresponds to zero or more characters, stored in a UnicodeString in
|
||||
* this hash. One or more of these chars may also correspond to a
|
||||
* UnicodeSet, in which case the character in the UnicodeString in this hash is
|
||||
* UnicodeMatcher, in which case the character in the UnicodeString in this hash is
|
||||
* a stand-in: it is an index for a secondary lookup in
|
||||
* data.setVariables. The stand-in also represents the UnicodeSet in
|
||||
* data.variables. The stand-in also represents the UnicodeMatcher in
|
||||
* the stored rules.
|
||||
*/
|
||||
Hashtable* variableNames;
|
||||
|
||||
/**
|
||||
* Map category variable (UChar) to set (UnicodeSet).
|
||||
* Map category variable (UChar) to set (UnicodeMatcher).
|
||||
* Variables that correspond to a set of characters are mapped
|
||||
* from variable name to a stand-in character in data.variableNames.
|
||||
* The stand-in then serves as a key in this hash to lookup the
|
||||
* actual UnicodeSet object. In addition, the stand-in is
|
||||
* actual UnicodeMatcher object. In addition, the stand-in is
|
||||
* stored in the rule text to represent the set of characters.
|
||||
* setVariables[i] represents character (setVariablesBase + i).
|
||||
* variables[i] represents character (variablesBase + i).
|
||||
*/
|
||||
UnicodeSet** setVariables;
|
||||
UnicodeMatcher** variables;
|
||||
|
||||
/**
|
||||
* The character that represents setVariables[0]. Characters
|
||||
* setVariablesBase through setVariablesBase +
|
||||
* setVariables.length - 1 represent UnicodeSet objects.
|
||||
* The character that represents variables[0]. Characters
|
||||
* variablesBase through variablesBase +
|
||||
* variablesLength - 1 represent UnicodeMatcher objects.
|
||||
*/
|
||||
UChar setVariablesBase;
|
||||
UChar variablesBase;
|
||||
|
||||
/**
|
||||
* The length of setVariables.
|
||||
* The length of variables.
|
||||
*/
|
||||
int32_t setVariablesLength;
|
||||
int32_t variablesLength;
|
||||
|
||||
/**
|
||||
* The character that represents segment 1. Characters segmentBase
|
||||
|
@ -90,7 +90,11 @@ public:
|
|||
|
||||
~TransliterationRuleData();
|
||||
|
||||
const UnicodeSet* lookupSet(UChar32 standIn) const;
|
||||
/**
|
||||
* Given a stand-in character, return the UnicodeMatcher that it
|
||||
* represents, or NULL.
|
||||
*/
|
||||
const UnicodeMatcher* lookup(UChar32 standIn) const;
|
||||
|
||||
/**
|
||||
* Return the zero-based index of the segment represented by the given
|
||||
|
|
|
@ -7,19 +7,21 @@
|
|||
* 11/17/99 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#include "rbt_pars.h"
|
||||
#include "unicode/rbt.h"
|
||||
#include "rbt_rule.h"
|
||||
#include "unirange.h"
|
||||
#include "rbt_data.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "cstring.h"
|
||||
#include "unicode/parsepos.h"
|
||||
#include "symtable.h"
|
||||
#include "unicode/parseerr.h"
|
||||
#include "hash.h"
|
||||
#include "unicode/unicode.h"
|
||||
#include "quant.h"
|
||||
#include "rbt_data.h"
|
||||
#include "rbt_pars.h"
|
||||
#include "rbt_rule.h"
|
||||
#include "strmatch.h"
|
||||
#include "symtable.h"
|
||||
#include "unirange.h"
|
||||
#include "unicode/parseerr.h"
|
||||
#include "unicode/parsepos.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/rbt.h"
|
||||
#include "unicode/unicode.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
||||
// Operators
|
||||
#define VARIABLE_DEF_OP ((UChar)0x003D) /*=*/
|
||||
|
@ -43,6 +45,8 @@
|
|||
#define CURSOR_POS ((UChar)0x007C) /*|*/
|
||||
#define CURSOR_OFFSET ((UChar)0x0040) /*@*/
|
||||
#define ANCHOR_START ((UChar)0x005E) /*^*/
|
||||
#define KLEENE_STAR ((UChar)0x002A) /***/
|
||||
#define ONE_OR_MORE ((UChar)0x002B) /*+*/
|
||||
|
||||
// By definition, the ANCHOR_END special character is a
|
||||
// trailing SymbolTable.SYMBOL_REF character.
|
||||
|
@ -61,17 +65,17 @@ static const UChar ID_TOKEN[] = { 0x3A, 0x3A }; // ':', ':'
|
|||
/**
|
||||
* This class implements the SymbolTable interface. It is used
|
||||
* during parsing to give UnicodeSet access to variables that
|
||||
* have been defined so far. Note that it uses setVariablesVector,
|
||||
* have been defined so far. Note that it uses variablesVector,
|
||||
* _not_ data.variables.
|
||||
*/
|
||||
class ParseData : public SymbolTable {
|
||||
public:
|
||||
const TransliterationRuleData* data; // alias
|
||||
|
||||
const UVector* setVariablesVector; // alias
|
||||
const UVector* variablesVector; // alias
|
||||
|
||||
ParseData(const TransliterationRuleData* data = 0,
|
||||
const UVector* setVariablesVector = 0);
|
||||
const UVector* variablesVector = 0);
|
||||
|
||||
virtual const UnicodeString* lookup(const UnicodeString& s) const;
|
||||
|
||||
|
@ -83,7 +87,7 @@ public:
|
|||
|
||||
ParseData::ParseData(const TransliterationRuleData* d,
|
||||
const UVector* sets) :
|
||||
data(d), setVariablesVector(sets) {}
|
||||
data(d), variablesVector(sets) {}
|
||||
|
||||
/**
|
||||
* Implement SymbolTable API.
|
||||
|
@ -99,11 +103,11 @@ const UnicodeSet* ParseData::lookupSet(UChar32 ch) const {
|
|||
// Note that we cannot use data.lookupSet() because the
|
||||
// set array has not been constructed yet.
|
||||
const UnicodeSet* set = NULL;
|
||||
int32_t i = ch - data->setVariablesBase;
|
||||
if (i >= 0 && i < setVariablesVector->size()) {
|
||||
int32_t i = ch - data->setVariablesBase;
|
||||
set = (i < setVariablesVector->size()) ?
|
||||
(UnicodeSet*) setVariablesVector->elementAt(i) : 0;
|
||||
int32_t i = ch - data->variablesBase;
|
||||
if (i >= 0 && i < variablesVector->size()) {
|
||||
int32_t i = ch - data->variablesBase;
|
||||
set = (i < variablesVector->size()) ?
|
||||
(UnicodeSet*) variablesVector->elementAt(i) : 0;
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
@ -276,7 +280,7 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
|||
if (escaped == (UChar32) -1) {
|
||||
return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rule, start);
|
||||
}
|
||||
buf.append((UChar) escaped);
|
||||
buf.append(escaped);
|
||||
continue;
|
||||
}
|
||||
// Handle quoted matter
|
||||
|
@ -431,6 +435,40 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
|
|||
}
|
||||
}
|
||||
break;
|
||||
case KLEENE_STAR:
|
||||
case ONE_OR_MORE:
|
||||
// Very limited initial implementation. Note that this
|
||||
// works strangely for quotes and variables --
|
||||
// 'foo'* => fo o*
|
||||
// $a = foo; $a * => fo o*
|
||||
// We will fix this later so that
|
||||
// 'foo'* => (foo) *
|
||||
// $a = foo; $a * => (foo) *
|
||||
// Implement with hidden segments, perhaps at # 10+.
|
||||
{
|
||||
int32_t start, limit;
|
||||
if (segments != 0 &&
|
||||
segments->size() >= 2 &&
|
||||
segments->size() % 2 == 0 &&
|
||||
_voidPtr_to_int32(segments->elementAt(segments->size()-1)) == buf.length()) {
|
||||
// The * immediately follows a segment
|
||||
int32_t len = segments->size();
|
||||
start = _voidPtr_to_int32(segments->elementAt(len - 2));
|
||||
limit = _voidPtr_to_int32(segments->elementAt(len - 1));
|
||||
segments->setElementAt(_int32_to_voidPtr(start+1), len-1);
|
||||
} else {
|
||||
// The * follows an isolated character
|
||||
// (or quote, or variable reference)
|
||||
start = buf.length() - 1;
|
||||
limit = start + 1;
|
||||
}
|
||||
UnicodeMatcher *m =
|
||||
new StringMatcher(buf, start, limit, *parser.data);
|
||||
m = new Quantifier(m, (c == ONE_OR_MORE)?1:0, 0x7FFFFFFF);
|
||||
buf.truncate(start);
|
||||
buf.append(parser.generateStandInFor(m));
|
||||
}
|
||||
break;
|
||||
// case SET_CLOSE:
|
||||
default:
|
||||
// Disallow unquoted characters other than [0-9A-Za-z]
|
||||
|
@ -551,7 +589,7 @@ TransliteratorParser::TransliteratorParser(
|
|||
UTransDirection theDirection,
|
||||
UParseError* theParseError) :
|
||||
rules(theRules), direction(theDirection), data(0), parseError(theParseError) {
|
||||
parseData = new ParseData(0, &setVariablesVector);
|
||||
parseData = new ParseData(0, &variablesVector);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -589,7 +627,7 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
|
|||
}
|
||||
|
||||
parseData->data = data;
|
||||
setVariablesVector.removeAllElements();
|
||||
variablesVector.removeAllElements();
|
||||
if (parseError != 0) {
|
||||
parseError->code = 0;
|
||||
}
|
||||
|
@ -668,16 +706,16 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
|
|||
}
|
||||
|
||||
// Convert the set vector to an array
|
||||
data->setVariablesLength = setVariablesVector.size();
|
||||
data->setVariables = data->setVariablesLength == 0 ? 0 : new UnicodeSet*[data->setVariablesLength];
|
||||
data->variablesLength = variablesVector.size();
|
||||
data->variables = data->variablesLength == 0 ? 0 : new UnicodeMatcher*[data->variablesLength];
|
||||
// orphanElement removes the given element and shifts all other
|
||||
// elements down. For performance (and code clarity) we work from
|
||||
// the end back to index 0.
|
||||
int32_t i;
|
||||
for (i=data->setVariablesLength; i>0; ) {
|
||||
for (i=data->variablesLength; i>0; ) {
|
||||
--i;
|
||||
data->setVariables[i] =
|
||||
(UnicodeSet*) setVariablesVector.orphanElementAt(i);
|
||||
data->variables[i] =
|
||||
(UnicodeSet*) variablesVector.orphanElementAt(i);
|
||||
}
|
||||
|
||||
// Index the rules
|
||||
|
@ -894,14 +932,23 @@ int32_t TransliteratorParser::syntaxError(int32_t parseErrorCode,
|
|||
UChar TransliteratorParser::parseSet(const UnicodeString& rule,
|
||||
ParsePosition& pos) {
|
||||
UnicodeSet* set = new UnicodeSet(rule, pos, *parseData, status);
|
||||
set->compact();
|
||||
return generateStandInFor(set);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate and return a stand-in for a new UnicodeMatcher. Store
|
||||
* the matcher (adopt it).
|
||||
*/
|
||||
UChar TransliteratorParser::generateStandInFor(UnicodeMatcher* adopted) {
|
||||
// assert(adopted != 0);
|
||||
if (variableNext >= variableLimit) {
|
||||
// throw new RuntimeException("Private use variables exhausted");
|
||||
delete set;
|
||||
delete adopted;
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
set->compact();
|
||||
setVariablesVector.addElement(set);
|
||||
variablesVector.addElement(adopted);
|
||||
return variableNext++;
|
||||
}
|
||||
|
||||
|
@ -949,12 +996,12 @@ void TransliteratorParser::determineVariableRange(void) {
|
|||
|
||||
UnicodeRange* r = privateUse.largestUnusedSubrange(rules);
|
||||
|
||||
data->setVariablesBase = variableNext = variableLimit = (UChar) 0;
|
||||
data->variablesBase = variableNext = variableLimit = (UChar) 0;
|
||||
|
||||
if (r != 0) {
|
||||
// Allocate 9 characters for segment references 1 through 9
|
||||
data->segmentBase = r->start;
|
||||
data->setVariablesBase = variableNext = (UChar) (data->segmentBase + 9);
|
||||
data->variablesBase = variableNext = (UChar) (data->segmentBase + 9);
|
||||
variableLimit = (UChar) (r->start + r->length);
|
||||
delete r;
|
||||
}
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
#include "unicode/parseerr.h"
|
||||
|
||||
class TransliterationRuleData;
|
||||
class UnicodeSet;
|
||||
class UnicodeMatcher;
|
||||
class ParseData;
|
||||
class RuleHalf;
|
||||
class ParsePosition;
|
||||
|
@ -48,11 +48,11 @@ class TransliteratorParser {
|
|||
ParseData* parseData;
|
||||
|
||||
/**
|
||||
* Temporary vector of set variables. When parsing is complete, this
|
||||
* is copied into the array data.setVariables. As with data.setVariables,
|
||||
* element 0 corresponds to character data.setVariablesBase.
|
||||
* Temporary vector of matcher variables. When parsing is complete, this
|
||||
* is copied into the array data.variables. As with data.variables,
|
||||
* element 0 corresponds to character data.variablesBase.
|
||||
*/
|
||||
UVector setVariablesVector;
|
||||
UVector variablesVector;
|
||||
|
||||
/**
|
||||
* The next available stand-in for variables. This starts at some point in
|
||||
|
@ -169,6 +169,12 @@ private:
|
|||
UChar parseSet(const UnicodeString& rule,
|
||||
ParsePosition& pos);
|
||||
|
||||
/**
|
||||
* Generate and return a stand-in for a new UnicodeMatcher. Store
|
||||
* the matcher (adopt it).
|
||||
*/
|
||||
UChar generateStandInFor(UnicodeMatcher* adopted);
|
||||
|
||||
/**
|
||||
* Append the value of the given variable name to the given
|
||||
* UnicodeString.
|
||||
|
|
|
@ -161,16 +161,16 @@ void TransliterationRule::init(const UnicodeString& input,
|
|||
this->segments = adoptedSegs;
|
||||
// Find the position of the first segment index that is after the
|
||||
// anteContext (in the key). Note that this may be a start or a
|
||||
// limit index.
|
||||
// limit index. If all segments are in the ante context,
|
||||
// firstKeySeg should point past the last segment -- that is, it
|
||||
// should point at the end marker, which is -1. This allows the
|
||||
// code to back up by one to obtain the last ante context segment.
|
||||
firstKeySeg = -1;
|
||||
if (segments != 0) {
|
||||
do {
|
||||
++firstKeySeg;
|
||||
} while (segments[firstKeySeg] >= 0 &&
|
||||
segments[firstKeySeg] < anteContextLength);
|
||||
if (segments[firstKeySeg] < 0) {
|
||||
firstKeySeg = -1;
|
||||
}
|
||||
}
|
||||
|
||||
pattern = input;
|
||||
|
@ -221,7 +221,7 @@ int16_t TransliterationRule::getIndexValue() const {
|
|||
return -1;
|
||||
}
|
||||
UChar32 c = pattern.char32At(anteContextLength);
|
||||
return (int16_t)(data.lookupSet(c) == NULL ? (c & 0xFF) : -1);
|
||||
return (int16_t)(data.lookup(c) == NULL ? (c & 0xFF) : -1);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -241,8 +241,9 @@ UBool TransliterationRule::matchesIndexValue(uint8_t v) const {
|
|||
return TRUE;
|
||||
}
|
||||
UChar32 c = pattern.char32At(anteContextLength);
|
||||
const UnicodeSet* set = data.lookupSet(c);
|
||||
return set == NULL ? (uint8_t(c) == v) : set->containsIndexValue(v);
|
||||
const UnicodeMatcher* matcher = data.lookup(c);
|
||||
return matcher == NULL ? (uint8_t(c) == v) :
|
||||
matcher->matchesIndexValue(v);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -367,17 +368,21 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
|||
// A mismatch in the ante context, or with the start anchor,
|
||||
// is an outright U_MISMATCH regardless of whether we are
|
||||
// incremental or not.
|
||||
int32_t cursor = pos.start - UTF_CHAR_LENGTH(text.char32At(pos.start-1));
|
||||
int32_t cursor = pos.start;
|
||||
int32_t newStart = 0;
|
||||
int32_t i;
|
||||
|
||||
// Backup cursor by one
|
||||
if (cursor > 0) {
|
||||
cursor -= UTF_CHAR_LENGTH(text.char32At(cursor-1));
|
||||
} else {
|
||||
--cursor;
|
||||
}
|
||||
|
||||
for (i=anteContextLength-1; i>=0; --i) {
|
||||
while (i == nextSegPos) {
|
||||
segPos[iSeg] = cursor;
|
||||
nextSegPos == (--iSeg >= 0) ? segments[iSeg] : -1;
|
||||
}
|
||||
UChar keyChar = pattern.charAt(i);
|
||||
const UnicodeSet* set = data.lookupSet(keyChar);
|
||||
if (set == 0) {
|
||||
const UnicodeMatcher* matcher = data.lookup(keyChar);
|
||||
if (matcher == 0) {
|
||||
if (cursor >= pos.contextStart &&
|
||||
keyChar == text.charAt(cursor)) {
|
||||
--cursor;
|
||||
|
@ -386,7 +391,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
|||
}
|
||||
} else {
|
||||
// Subtract 1 from contextStart to make it a reverse limit
|
||||
if (set->matches(text, cursor, pos.contextStart-1, FALSE)
|
||||
if (matcher->matches(text, cursor, pos.contextStart-1, FALSE)
|
||||
!= U_MATCH) {
|
||||
return U_MISMATCH;
|
||||
}
|
||||
|
@ -395,6 +400,15 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
|||
// Record the position of the cursor
|
||||
newStart = cursor;
|
||||
}
|
||||
while (nextSegPos == i) {
|
||||
segPos[iSeg] = cursor;
|
||||
if (cursor >= 0) {
|
||||
segPos[iSeg] += UTF_CHAR_LENGTH(text.char32At(cursor));
|
||||
} else {
|
||||
++segPos[iSeg];
|
||||
}
|
||||
nextSegPos = (--iSeg >= 0) ? segments[iSeg] : -1;
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------ Start Anchor ------------------------
|
||||
|
@ -405,8 +419,15 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
|||
|
||||
// -------------------- Key and Post Context --------------------
|
||||
|
||||
// YUCKY OPTIMIZATION. To make things a miniscule amount faster,
|
||||
// subtract anteContextLength from all segments[i] with i >=
|
||||
// firstKeySeg. Then we don't have to do so here. I only mention
|
||||
// this here in order to say DO NOT DO THIS. The gain is
|
||||
// miniscule (how long does an integer subtraction take?) and the
|
||||
// increase in confusion isn't worth it.
|
||||
|
||||
iSeg = firstKeySeg;
|
||||
nextSegPos = (iSeg >= 0) ? segments[iSeg] : -1;
|
||||
nextSegPos = (iSeg >= 0) ? (segments[iSeg] - anteContextLength) : -1;
|
||||
|
||||
i = 0;
|
||||
cursor = pos.start;
|
||||
|
@ -424,14 +445,14 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
|||
}
|
||||
while (i == nextSegPos) {
|
||||
segPos[iSeg] = cursor;
|
||||
nextSegPos = segments[++iSeg];
|
||||
nextSegPos = segments[++iSeg] - anteContextLength;
|
||||
}
|
||||
if (i == keyLength) {
|
||||
keyLimit = cursor;
|
||||
}
|
||||
UChar keyChar = pattern.charAt(anteContextLength + i++);
|
||||
const UnicodeSet* set = data.lookupSet(keyChar);
|
||||
if (set == 0) {
|
||||
const UnicodeMatcher* matcher = data.lookup(keyChar);
|
||||
if (matcher == 0) {
|
||||
// Don't need the cursor < pos.contextLimit check if
|
||||
// incremental is TRUE (because it's done above); do need
|
||||
// it otherwise.
|
||||
|
@ -443,7 +464,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
|||
}
|
||||
} else {
|
||||
UMatchDegree m =
|
||||
set->matches(text, cursor, pos.contextLimit, incremental);
|
||||
matcher->matches(text, cursor, pos.contextLimit, incremental);
|
||||
if (m != U_MATCH) {
|
||||
return m;
|
||||
}
|
||||
|
@ -451,7 +472,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
|
|||
}
|
||||
while (i == nextSegPos) {
|
||||
segPos[iSeg] = cursor;
|
||||
nextSegPos = segments[++iSeg];
|
||||
nextSegPos = segments[++iSeg] - anteContextLength;
|
||||
}
|
||||
if (i == keyLength) {
|
||||
keyLimit = cursor;
|
||||
|
@ -686,11 +707,11 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
|
|||
}
|
||||
|
||||
UChar c = pattern.charAt(i);
|
||||
const UnicodeSet *set = data.lookupSet(c);
|
||||
if (set == 0) {
|
||||
const UnicodeMatcher *matcher = data.lookup(c);
|
||||
if (matcher == 0) {
|
||||
_appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
|
||||
} else {
|
||||
_appendToRule(rule, set->toPattern(str, escapeUnprintable),
|
||||
_appendToRule(rule, matcher->toPattern(str, escapeUnprintable),
|
||||
TRUE, escapeUnprintable, quoteBuf);
|
||||
}
|
||||
}
|
||||
|
|
128
icu4c/source/i18n/strmatch.cpp
Normal file
128
icu4c/source/i18n/strmatch.cpp
Normal file
|
@ -0,0 +1,128 @@
|
|||
/*
|
||||
* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 07/23/01 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "strmatch.h"
|
||||
#include "rbt_data.h"
|
||||
|
||||
/**
 * Construct a matcher whose pattern is a substring of theString.
 *
 * @param theString the string containing the pattern
 * @param start     start index of the pattern within theString
 * @param limit     limit index of the pattern within theString, as
 *                  interpreted by UnicodeString::extractBetween()
 * @param theData   the rule data used to resolve stand-in characters in
 *                  the pattern.  Held by reference, not copied.
 */
StringMatcher::StringMatcher(const UnicodeString& theString,
                             int32_t start,
                             int32_t limit,
                             const TransliterationRuleData& theData) :
    pattern(),
    data(theData) {
    theString.extractBetween(start, limit, pattern);
}
|
||||
|
||||
/**
 * Construct a matcher whose pattern is the whole of theString.
 *
 * @param theString the pattern; it is copied
 * @param theData   the rule data used to resolve stand-in characters in
 *                  the pattern.  Held by reference, not copied.
 */
StringMatcher::StringMatcher(const UnicodeString& theString,
                             const TransliterationRuleData& theData) :
    pattern(theString),
    data(theData)
{
}
|
||||
|
||||
/**
 * Copy constructor.  The pattern is copied; the copy refers to the same
 * TransliterationRuleData object as the original.
 *
 * @param o the matcher to copy
 */
StringMatcher::StringMatcher(const StringMatcher& o) :
    pattern(o.pattern),
    data(o.data)
{
}
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
StringMatcher::~StringMatcher() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Implement UnicodeMatcher
|
||||
*/
|
||||
UnicodeMatcher* StringMatcher::clone() const {
|
||||
return new StringMatcher(*this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Implement UnicodeMatcher
|
||||
*/
|
||||
UMatchDegree StringMatcher::matches(const Replaceable& text,
|
||||
int32_t& offset,
|
||||
int32_t limit,
|
||||
UBool incremental) const {
|
||||
int32_t i;
|
||||
int32_t cursor = offset;
|
||||
if (limit < cursor) {
|
||||
for (i=pattern.length()-1; i>=0; --i) {
|
||||
UChar keyChar = pattern.charAt(i);
|
||||
const UnicodeMatcher* subm = data.lookup(keyChar);
|
||||
if (subm == 0) {
|
||||
if (cursor >= limit &&
|
||||
keyChar == text.charAt(cursor)) {
|
||||
--cursor;
|
||||
} else {
|
||||
return U_MISMATCH;
|
||||
}
|
||||
} else {
|
||||
UMatchDegree m =
|
||||
subm->matches(text, cursor, limit, incremental);
|
||||
if (m != U_MATCH) {
|
||||
return m;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (i=0; i<pattern.length(); ++i) {
|
||||
if (incremental && cursor == limit) {
|
||||
// We've reached the context limit without a mismatch and
|
||||
// without completing our match.
|
||||
return U_PARTIAL_MATCH;
|
||||
}
|
||||
UChar keyChar = pattern.charAt(i);
|
||||
const UnicodeMatcher* subm = data.lookup(keyChar);
|
||||
if (subm == 0) {
|
||||
// Don't need the cursor < limit check if
|
||||
// incremental is TRUE (because it's done above); do need
|
||||
// it otherwise.
|
||||
if (cursor < limit &&
|
||||
keyChar == text.charAt(cursor)) {
|
||||
++cursor;
|
||||
} else {
|
||||
return U_MISMATCH;
|
||||
}
|
||||
} else {
|
||||
UMatchDegree m =
|
||||
subm->matches(text, cursor, limit, incremental);
|
||||
if (m != U_MATCH) {
|
||||
return m;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
offset = cursor;
|
||||
return U_MATCH;
|
||||
}
|
||||
|
||||
/**
|
||||
* Implement UnicodeMatcher
|
||||
*/
|
||||
UnicodeString& StringMatcher::toPattern(UnicodeString& result,
|
||||
UBool escapeUnprintable) const {
|
||||
for (int32_t i=0; i<pattern.length(); ++i) {
|
||||
// TODO finish this
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Implement UnicodeMatcher
|
||||
*/
|
||||
UBool StringMatcher::matchesIndexValue(uint8_t v) const {
|
||||
if (pattern.length() == 0) {
|
||||
return TRUE;
|
||||
}
|
||||
UChar32 c = pattern.char32At(0);
|
||||
const UnicodeMatcher *m = data.lookup(c);
|
||||
return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
|
||||
}
|
||||
|
||||
//eof
|
69
icu4c/source/i18n/strmatch.h
Normal file
69
icu4c/source/i18n/strmatch.h
Normal file
|
@ -0,0 +1,69 @@
|
|||
/*
|
||||
* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 07/23/01 aliu Creation.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef STRMATCH_H
|
||||
#define STRMATCH_H
|
||||
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/unimatch.h"
|
||||
|
||||
class TransliterationRuleData;
|
||||
|
||||
/**
|
||||
* An object that matches a string.
|
||||
*/
|
||||
class StringMatcher : public UnicodeMatcher {
|
||||
|
||||
public:
|
||||
|
||||
StringMatcher(const UnicodeString& string,
|
||||
int32_t start,
|
||||
int32_t limit,
|
||||
const TransliterationRuleData& data);
|
||||
|
||||
StringMatcher(const UnicodeString& string,
|
||||
const TransliterationRuleData& data);
|
||||
|
||||
StringMatcher(const StringMatcher& o);
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
virtual ~StringMatcher();
|
||||
|
||||
/**
|
||||
* Implement UnicodeMatcher
|
||||
*/
|
||||
virtual UnicodeMatcher* clone() const;
|
||||
|
||||
/**
|
||||
* Implement UnicodeMatcher
|
||||
*/
|
||||
virtual UMatchDegree matches(const Replaceable& text,
|
||||
int32_t& offset,
|
||||
int32_t limit,
|
||||
UBool incremental) const;
|
||||
|
||||
/**
|
||||
* Implement UnicodeMatcher
|
||||
*/
|
||||
virtual UnicodeString& toPattern(UnicodeString& result,
|
||||
UBool escapeUnprintable = FALSE) const;
|
||||
|
||||
/**
|
||||
* Implement UnicodeMatcher
|
||||
*/
|
||||
virtual UBool matchesIndexValue(uint8_t v) const;
|
||||
|
||||
private:
|
||||
|
||||
UnicodeString pattern;
|
||||
|
||||
const TransliterationRuleData& data;
|
||||
};
|
||||
|
||||
#endif
|
|
@ -149,7 +149,7 @@ Transliterator::Transliterator(const Transliterator& other) :
|
|||
maximumContextLength(other.maximumContextLength) {
|
||||
if (other.filter != 0) {
|
||||
// We own the filter, so we must have our own copy
|
||||
filter = other.filter->clone();
|
||||
filter = (UnicodeFilter*) other.filter->clone();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -160,7 +160,7 @@ Transliterator& Transliterator::operator=(const Transliterator& other) {
|
|||
ID = other.ID;
|
||||
maximumContextLength = other.maximumContextLength;
|
||||
// MUST go through adoptFilter in case latter is overridden
|
||||
adoptFilter((other.filter == 0) ? 0 : other.filter->clone());
|
||||
adoptFilter((other.filter == 0) ? 0 : (UnicodeFilter*) other.filter->clone());
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
@ -361,6 +361,25 @@ void Transliterator::_transliterate(Replaceable& text,
|
|||
|
||||
filteredTransliterate(text, index, TRUE);
|
||||
|
||||
#if 0
|
||||
// I CAN'T DO what I'm attempting below now that the Kleene star
|
||||
// operator is supported. For example, in the rule
|
||||
|
||||
// ([:Lu:]+) { x } > $1;
|
||||
|
||||
// what is the maximum context length? getMaximumContextLength()
|
||||
// will return 1, but this is just the length of the ante context
|
||||
// part of the pattern string -- 1 character, which is a standin
|
||||
// for a Quantifier, which contains a StringMatcher, which
|
||||
// contains a UnicodeSet.
|
||||
|
||||
// There is a complicated way to make this work again, and that's
|
||||
// to add a "maximum left context" protocol into the
|
||||
// UnicodeMatcher hierarchy. At present I'm not convinced this is
|
||||
// worth it.
|
||||
|
||||
// ---
|
||||
|
||||
// The purpose of the code below is to keep the context small
|
||||
// while doing incremental transliteration. When part of the left
|
||||
// context (between contextStart and start) is no longer needed,
|
||||
|
@ -373,6 +392,7 @@ void Transliterator::_transliterate(Replaceable& text,
|
|||
newCS -= UTF_CHAR_LENGTH(text.char32At(newCS)) - 1;
|
||||
}
|
||||
index.contextStart = uprv_max(newCS, originalStart);
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -38,12 +38,15 @@ public:
|
|||
virtual UBool contains(UChar32 c) const = 0;
|
||||
|
||||
/**
|
||||
* Returns a copy of this object. All UnicodeFilter objects have
|
||||
* to support cloning in order to allow classes using
|
||||
* UnicodeFilters, such as Transliterator, to implement cloning.
|
||||
* @draft
|
||||
* UnicodeMatcher API. This class stubs this out.
|
||||
*/
|
||||
virtual UnicodeFilter* clone() const = 0;
|
||||
UnicodeString& toPattern(UnicodeString& result,
|
||||
UBool escapeUnprintable) const;
|
||||
|
||||
/**
|
||||
* UnicodeMatcher API. This class stubs this out.
|
||||
*/
|
||||
UBool matchesIndexValue(uint8_t v) const;
|
||||
|
||||
/**
|
||||
* Implement UnicodeMatcher API.
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#include "unicode/utypes.h"
|
||||
|
||||
class Replaceable;
|
||||
class UnicodeString;
|
||||
|
||||
/**
|
||||
* Constants returned by <code>UnicodeMatcher::matches()</code>
|
||||
|
@ -59,6 +60,13 @@ public:
|
|||
*/
|
||||
virtual ~UnicodeMatcher();
|
||||
|
||||
/**
|
||||
* Returns a copy of this object. All UnicodeMatcher objects have
|
||||
* to support cloning in order to allow classes using
|
||||
* UnicodeMatchers to implement cloning.
|
||||
*/
|
||||
virtual UnicodeMatcher* clone() const = 0;
|
||||
|
||||
/**
|
||||
* Return a UMatchDegree value indicating the degree of match for
|
||||
* the given text at the given offset. Zero, one, or more
|
||||
|
@ -106,6 +114,28 @@ public:
|
|||
int32_t limit,
|
||||
UBool incremental) const = 0;
|
||||
|
||||
/**
|
||||
* Returns a string representation of this matcher. If the result of
|
||||
* calling this function is passed to the appropriate parser, it
|
||||
* will produce another matcher that is equal to this one.
|
||||
* @param result the string to receive the pattern. Previous
|
||||
* contents will be deleted.
|
||||
* @param escapeUnprintable if TRUE then convert unprintable
|
||||
* characters to their hex escape representations, \uxxxx or
|
||||
* \Uxxxxxxxx. Unprintable characters are those other than
|
||||
* U+000A, U+0020..U+007E.
|
||||
*/
|
||||
virtual UnicodeString& toPattern(UnicodeString& result,
|
||||
UBool escapeUnprintable = FALSE) const = 0;
|
||||
|
||||
/**
|
||||
* Returns TRUE if this matcher will match a character c, where c
|
||||
* & 0xFF == v, at offset, in the forward direction (with limit >
|
||||
* offset). This is used by <tt>RuleBasedTransliterator</tt> for
|
||||
* indexing.
|
||||
*/
|
||||
virtual UBool matchesIndexValue(uint8_t v) const = 0;
|
||||
|
||||
protected:
|
||||
|
||||
UnicodeMatcher();
|
||||
|
|
|
@ -365,12 +365,12 @@ public:
|
|||
UBool operator!=(const UnicodeSet& o) const;
|
||||
|
||||
/**
|
||||
* Returns a copy of this object. All UnicodeFilter objects have
|
||||
* Returns a copy of this object. All UnicodeMatcher objects have
|
||||
* to support cloning in order to allow classes using
|
||||
* UnicodeFilters, such as Transliterator, to implement cloning.
|
||||
* UnicodeMatchers, such as Transliterator, to implement cloning.
|
||||
* @draft
|
||||
*/
|
||||
virtual UnicodeFilter* clone() const;
|
||||
virtual UnicodeMatcher* clone() const;
|
||||
|
||||
/**
|
||||
* Returns the hash code value for this set.
|
||||
|
@ -691,7 +691,7 @@ private:
|
|||
* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
|
||||
* indexing.
|
||||
*/
|
||||
UBool containsIndexValue(uint8_t v) const;
|
||||
virtual UBool matchesIndexValue(uint8_t v) const;
|
||||
|
||||
private:
|
||||
|
||||
|
|
|
@ -40,3 +40,16 @@ UMatchDegree UnicodeFilter::matches(const Replaceable& text,
|
|||
}
|
||||
return U_MISMATCH;
|
||||
}
|
||||
|
||||
// Stub this out for filters that do not implement a pattern
|
||||
UnicodeString& UnicodeFilter::toPattern(UnicodeString& result,
|
||||
UBool escapeUnprintable) const {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Stub this out for filters that do not implement indexing
|
||||
UBool UnicodeFilter::matchesIndexValue(uint8_t v) const {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
//eof
|
||||
|
|
|
@ -22,7 +22,7 @@ public:
|
|||
NullFilter(const NullFilter& f) : UnicodeFilter(f) { result = f.result; }
|
||||
virtual ~NullFilter() {}
|
||||
virtual UBool contains(UChar32 /*c*/) const { return result; }
|
||||
virtual UnicodeFilter* clone() const { return new NullFilter(*this); }
|
||||
virtual UnicodeMatcher* clone() const { return new NullFilter(*this); }
|
||||
};
|
||||
|
||||
class UnicodeNotFilter : public UnicodeFilter {
|
||||
|
@ -32,15 +32,15 @@ public:
|
|||
UnicodeNotFilter(const UnicodeNotFilter&);
|
||||
virtual ~UnicodeNotFilter();
|
||||
virtual UBool contains(UChar32 c) const;
|
||||
virtual UnicodeFilter* clone() const;
|
||||
virtual UnicodeMatcher* clone() const;
|
||||
};
|
||||
|
||||
UnicodeNotFilter::UnicodeNotFilter(UnicodeFilter* adopted) : filt(adopted) {}
|
||||
UnicodeNotFilter::UnicodeNotFilter(const UnicodeNotFilter& f)
|
||||
: UnicodeFilter(f), filt(f.filt->clone()) {}
|
||||
: UnicodeFilter(f), filt((UnicodeFilter*) f.filt->clone()) {}
|
||||
UnicodeNotFilter::~UnicodeNotFilter() { delete filt; }
|
||||
UBool UnicodeNotFilter::contains(UChar32 c) const { return !filt->contains(c); }
|
||||
UnicodeFilter* UnicodeNotFilter::clone() const { return new UnicodeNotFilter(*this); }
|
||||
UnicodeMatcher* UnicodeNotFilter::clone() const { return new UnicodeNotFilter(*this); }
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements the inverse of
|
||||
|
@ -50,7 +50,7 @@ UnicodeFilter* UnicodeFilterLogic::createNot(const UnicodeFilter* f) {
|
|||
if (f == 0) {
|
||||
return new NullFilter(FALSE);
|
||||
} else {
|
||||
return new UnicodeNotFilter(f->clone());
|
||||
return new UnicodeNotFilter((UnicodeFilter*)f->clone());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -62,15 +62,15 @@ public:
|
|||
UnicodeAndFilter(const UnicodeAndFilter&);
|
||||
virtual ~UnicodeAndFilter();
|
||||
virtual UBool contains(UChar32 c) const;
|
||||
virtual UnicodeFilter* clone() const;
|
||||
virtual UnicodeMatcher* clone() const;
|
||||
};
|
||||
|
||||
UnicodeAndFilter::UnicodeAndFilter(UnicodeFilter* f1, UnicodeFilter* f2) : filt1(f1), filt2(f2) {}
|
||||
UnicodeAndFilter::UnicodeAndFilter(const UnicodeAndFilter& f)
|
||||
: UnicodeFilter(f), filt1(f.filt1->clone()), filt2(f.filt2->clone()) {}
|
||||
: UnicodeFilter(f), filt1((UnicodeFilter*)f.filt1->clone()), filt2((UnicodeFilter*)f.filt2->clone()) {}
|
||||
UnicodeAndFilter::~UnicodeAndFilter() { delete filt1; delete filt2; }
|
||||
UBool UnicodeAndFilter::contains(UChar32 c) const { return filt1->contains(c) && filt2->contains(c); }
|
||||
UnicodeFilter* UnicodeAndFilter::clone() const { return new UnicodeAndFilter(*this); }
|
||||
UnicodeMatcher* UnicodeAndFilter::clone() const { return new UnicodeAndFilter(*this); }
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
|
@ -84,12 +84,12 @@ UnicodeFilter* UnicodeFilterLogic::createAnd(const UnicodeFilter* f,
|
|||
if (g == 0) {
|
||||
return NULL;
|
||||
}
|
||||
return g->clone();
|
||||
return (UnicodeFilter*)g->clone();
|
||||
}
|
||||
if (g == 0) {
|
||||
return f->clone();
|
||||
return (UnicodeFilter*)f->clone();
|
||||
}
|
||||
return new UnicodeAndFilter(f->clone(), g->clone());
|
||||
return new UnicodeAndFilter((UnicodeFilter*)f->clone(), (UnicodeFilter*)g->clone());
|
||||
}
|
||||
|
||||
class UnicodeOrFilter : public UnicodeFilter {
|
||||
|
@ -100,15 +100,15 @@ public:
|
|||
UnicodeOrFilter(const UnicodeOrFilter&);
|
||||
virtual ~UnicodeOrFilter();
|
||||
virtual UBool contains(UChar32 c) const;
|
||||
virtual UnicodeFilter* clone() const;
|
||||
virtual UnicodeMatcher* clone() const;
|
||||
};
|
||||
|
||||
UnicodeOrFilter::UnicodeOrFilter(UnicodeFilter* f1, UnicodeFilter* f2) : filt1(f1), filt2(f2) {}
|
||||
UnicodeOrFilter::UnicodeOrFilter(const UnicodeOrFilter& f)
|
||||
: UnicodeFilter(f), filt1(f.filt1->clone()), filt2(f.filt2->clone()) {}
|
||||
: UnicodeFilter(f), filt1((UnicodeFilter*)f.filt1->clone()), filt2((UnicodeFilter*)f.filt2->clone()) {}
|
||||
UnicodeOrFilter::~UnicodeOrFilter() { delete filt1; delete filt2; }
|
||||
UBool UnicodeOrFilter::contains(UChar32 c) const { return filt1->contains(c) || filt2->contains(c); }
|
||||
UnicodeFilter* UnicodeOrFilter::clone() const { return new UnicodeOrFilter(*this); }
|
||||
UnicodeMatcher* UnicodeOrFilter::clone() const { return new UnicodeOrFilter(*this); }
|
||||
|
||||
/**
|
||||
* Returns a <tt>UnicodeFilter</tt> that implements a short
|
||||
|
@ -122,10 +122,10 @@ UnicodeFilter* UnicodeFilterLogic::createOr(const UnicodeFilter* f,
|
|||
if (g == 0) {
|
||||
return NULL;
|
||||
}
|
||||
return g->clone();
|
||||
return (UnicodeFilter*)g->clone();
|
||||
}
|
||||
if (g == 0) {
|
||||
return f->clone();
|
||||
return (UnicodeFilter*)f->clone();
|
||||
}
|
||||
return new UnicodeOrFilter(f->clone(), g->clone());
|
||||
return new UnicodeOrFilter((UnicodeFilter*)f->clone(), (UnicodeFilter*)g->clone());
|
||||
}
|
||||
|
|
|
@ -228,11 +228,11 @@ UBool UnicodeSet::operator==(const UnicodeSet& o) const {
|
|||
}
|
||||
|
||||
/**
|
||||
* Returns a copy of this object. All UnicodeFilter objects have
|
||||
* Returns a copy of this object. All UnicodeMatcher objects have
|
||||
* to support cloning in order to allow classes using
|
||||
* UnicodeFilters, such as Transliterator, to implement cloning.
|
||||
* UnicodeMatchers, such as Transliterator, to implement cloning.
|
||||
*/
|
||||
UnicodeFilter* UnicodeSet::clone() const {
|
||||
UnicodeMatcher* UnicodeSet::clone() const {
|
||||
return new UnicodeSet(*this);
|
||||
}
|
||||
|
||||
|
@ -547,7 +547,7 @@ UBool UnicodeSet::contains(UChar32 c) const {
|
|||
* is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
|
||||
* indexing.
|
||||
*/
|
||||
UBool UnicodeSet::containsIndexValue(uint8_t v) const {
|
||||
UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
|
||||
/* The index value v, in the range [0,255], is contained in this set if
|
||||
* it is contained in any pair of this set. Pairs either have the high
|
||||
* bytes equal, or unequal. If the high bytes are equal, then we have
|
||||
|
|
|
@ -72,7 +72,7 @@ static void pseudoHandleTransliterate(const Transliterator* t,
|
|||
* Used by TestConstruction() and TestTransliterate.
|
||||
*/
|
||||
class TestHangulFilter : public UnicodeFilter {
|
||||
virtual UnicodeFilter* clone() const {
|
||||
virtual UnicodeMatcher* clone() const {
|
||||
return new TestHangulFilter(*this);
|
||||
}
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
|
|
|
@ -56,7 +56,7 @@ void HexToUniTransliteratorTest::runIndexedTest( int32_t index, UBool exec, cons
|
|||
* Used by TestConstruction() and TestTransliterate.
|
||||
*/
|
||||
class TestHexFilter : public UnicodeFilter {
|
||||
virtual UnicodeFilter* clone() const {
|
||||
virtual UnicodeMatcher* clone() const {
|
||||
return new TestHexFilter(*this);
|
||||
}
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
|
|
|
@ -70,7 +70,7 @@ static void pseudoHandleTransliterate(const Transliterator* t,
|
|||
* Used by TestConstruction() and TestTransliterate.
|
||||
*/
|
||||
class TestJamoFilter : public UnicodeFilter {
|
||||
virtual UnicodeFilter* clone() const {
|
||||
virtual UnicodeMatcher* clone() const {
|
||||
return new TestJamoFilter(*this);
|
||||
}
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
|
|
|
@ -615,7 +615,7 @@ void TransliteratorAPITest::TestRegisterUnregister(){
|
|||
* Used by TestFiltering().
|
||||
*/
|
||||
class TestFilter1 : public UnicodeFilter {
|
||||
virtual UnicodeFilter* clone() const {
|
||||
virtual UnicodeMatcher* clone() const {
|
||||
return new TestFilter1(*this);
|
||||
}
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
|
@ -626,7 +626,7 @@ class TestFilter1 : public UnicodeFilter {
|
|||
}
|
||||
};
|
||||
class TestFilter2 : public UnicodeFilter {
|
||||
virtual UnicodeFilter* clone() const {
|
||||
virtual UnicodeMatcher* clone() const {
|
||||
return new TestFilter2(*this);
|
||||
}
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
|
@ -637,7 +637,7 @@ class TestFilter2 : public UnicodeFilter {
|
|||
}
|
||||
};
|
||||
class TestFilter3 : public UnicodeFilter {
|
||||
virtual UnicodeFilter* clone() const {
|
||||
virtual UnicodeMatcher* clone() const {
|
||||
return new TestFilter3(*this);
|
||||
}
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
|
|
|
@ -68,6 +68,7 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
|
|||
TESTCASE(32,TestToRules);
|
||||
TESTCASE(33,TestContext);
|
||||
TESTCASE(34,TestSupplemental);
|
||||
TESTCASE(35,TestQuantifier);
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
|
@ -477,7 +478,7 @@ void TransliteratorTest::TestCompoundHex(void) {
|
|||
* Used by TestFiltering().
|
||||
*/
|
||||
class TestFilter : public UnicodeFilter {
|
||||
virtual UnicodeFilter* clone() const {
|
||||
virtual UnicodeMatcher* clone() const {
|
||||
return new TestFilter(*this);
|
||||
}
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
|
@ -1501,6 +1502,36 @@ void TransliteratorTest::TestSupplemental() {
|
|||
CharsToUnicodeString("ky\\U00010400y\\U00010400m"));
|
||||
}
|
||||
|
||||
/**
 * Exercise the + and * quantifiers in transliterator rules.  Each table
 * row is { rules, input, expected output } and is checked with expect().
 */
void TransliteratorTest::TestQuantifier() {
    static const char* const CASES[][3] = {
        { "(ab)+ {x} > '(' $1 ')';",
          "x abx ababxy",
          "x ab(ab) abab(abab)y" },

        { "b+ > x;",
          "ac abc abbc abbbc",
          "ac axc axc axc" },

        { "[abc]+ > x;",
          "qac abrc abbcs abtbbc",
          "qx xrx xs xtx" },

        { "q{(ab)+} > x;",
          "qa qab qaba qababc qaba",
          "qa qx qxa qxc qxa" },

        { "q(ab)* > x;",
          "qa qab qaba qababc",
          "xa x xa xc" },

        // Oddity: with "(foo)* > $1", $1 holds the whole run of "foo"s.
        // Perl would capture a single "foo", giving the output
        // "()a (ab) (ab)a (ab)c" instead.
        { "q(ab)* > '(' $1 ')';",
          "qa qab qaba qababc",
          "()a (ab) (ab)a (abab)c" }
    };

    const int32_t count = (int32_t)(sizeof(CASES) / sizeof(CASES[0]));
    for (int32_t i = 0; i < count; ++i) {
        expect(CASES[i][0], CASES[i][1], CASES[i][2]);
    }
}
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
|
|
|
@ -172,6 +172,8 @@ class TransliteratorTest : public IntlTest {
|
|||
|
||||
void TestSupplemental(void);
|
||||
|
||||
void TestQuantifier(void);
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
|
|
|
@ -36,7 +36,7 @@ void UnicodeFilterLogicTest::runIndexedTest( int32_t index, UBool exec, const ch
|
|||
}
|
||||
|
||||
class Filter1: public UnicodeFilter{
|
||||
virtual UnicodeFilter* clone() const{
|
||||
virtual UnicodeMatcher* clone() const{
|
||||
return new Filter1(*this);
|
||||
}
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
|
@ -47,7 +47,7 @@ class Filter1: public UnicodeFilter{
|
|||
}
|
||||
};
|
||||
class Filter2: public UnicodeFilter{
|
||||
virtual UnicodeFilter* clone() const{
|
||||
virtual UnicodeMatcher* clone() const{
|
||||
return new Filter2(*this);
|
||||
}
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
|
|
|
@ -68,7 +68,7 @@ static void pseudoHandleTransliterate(const Transliterator* t,
|
|||
* Used by TestConstruction() and TestTransliterate.
|
||||
*/
|
||||
class TestUniFilter : public UnicodeFilter {
|
||||
virtual UnicodeFilter* clone() const {
|
||||
virtual UnicodeMatcher* clone() const {
|
||||
return new TestUniFilter(*this);
|
||||
}
|
||||
virtual UBool contains(UChar32 c) const {
|
||||
|
|
Loading…
Add table
Reference in a new issue