ICU-1076 initial limited support for Kleene star and plus operators

X-SVN-Rev: 5359
2025-04-13 08:53:20 +00:00 · 2001-07-27 00:18:53 +00:00 · 2001-07-27 00:18:53 +00:00 · ef8c73fc7c
commit ef8c73fc7c
parent 40bfe95d06
26 changed files with 663 additions and 136 deletions
--- a/icu4c/source/i18n/Makefile.in
+++ b/icu4c/source/i18n/Makefile.in
@ -72,7 +72,7 @@ unifltlg.o unirange.o uniset.o unitohex.o unum.o \
 dbbi.o dbbi_tbl.o rbbi.o rbbi_tbl.o brkdict.o nultrans.o jamohang.o hangjamo.o \
 remtrans.o utrans.o \
 titletrn.o tolowtrn.o toupptrn.o xformtrn.o name2uni.o uni2name.o nortrans.o \
-unifilt.o
+unifilt.o quant.o strmatch.o

 STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O))

--- a/icu4c/source/i18n/i18n.dsp
+++ b/icu4c/source/i18n/i18n.dsp
@ -198,6 +198,10 @@ SOURCE=.\numfmt.cpp
 # End Source File
 # Begin Source File

+SOURCE=.\quant.cpp
+# End Source File
+# Begin Source File
+
 SOURCE=.\rbbi.cpp
 # End Source File
 # Begin Source File
@ -242,6 +246,10 @@ SOURCE=.\sortkey.cpp
 # End Source File
 # Begin Source File

+SOURCE=.\strmatch.cpp
+# End Source File
+# Begin Source File
+
 SOURCE=.\tblcoll.cpp
 # End Source File
 # Begin Source File
@ -1029,6 +1037,10 @@ InputPath=.\unicode\parsepos.h
 # End Source File
 # Begin Source File

+SOURCE=.\quant.h
+# End Source File
+# Begin Source File
+
 SOURCE=.\unicode\rbbi.h

 !IF  "$(CFG)" == "i18n - Win32 Release"
@ -1188,6 +1200,10 @@ InputPath=.\unicode\sortkey.h
 # End Source File
 # Begin Source File

+SOURCE=.\strmatch.h
+# End Source File
+# Begin Source File
+
 SOURCE=.\unicode\tblcoll.h

 !IF  "$(CFG)" == "i18n - Win32 Release"
--- a/icu4c/source/i18n/quant.cpp
+++ b/icu4c/source/i18n/quant.cpp
@ -0,0 +1,80 @@
+/*
+* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
+**********************************************************************
+*   Date        Name        Description
+*   07/26/01    aliu        Creation.
+**********************************************************************
+*/
+
+#include "quant.h"
+
+/**
+ * Construct a quantifier that applies the given matcher between
+ * minCount and maxCount times.  The matcher is adopted: this object
+ * keeps the pointer and deletes it in its destructor.
+ */
+Quantifier::Quantifier(UnicodeMatcher *adopted,
+                       uint32_t minCount, uint32_t maxCount) :
+    matcher(adopted),
+    minCount(minCount),
+    maxCount(maxCount) {
+    // assert(adopted != 0);
+    // assert(minCount <= maxCount);
+}
+
+/**
+ * Copy constructor.  Makes a deep copy: the new object owns its own
+ * clone of o's matcher, so the two quantifiers can be used and
+ * destroyed independently.
+ */
+Quantifier::Quantifier(const Quantifier& o) :
+    matcher(o.matcher->clone()),
+    minCount(o.minCount),
+    maxCount(o.maxCount) {
+    // Do NOT delete matcher here: it is the freshly cloned object that
+    // matches() dereferences and that the destructor releases.
+}
+
+/**
+ * Destructor.  Deletes the owned matcher.
+ */
+Quantifier::~Quantifier() {
+    delete matcher;
+}
+
+/**
+ * Implement UnicodeMatcher.  Return a newly allocated copy of this
+ * object, made with the copy constructor.
+ */
+UnicodeMatcher* Quantifier::clone() const {
+    return new Quantifier(*this);
+}
+
+/**
+ * Implement UnicodeMatcher.  Apply the owned matcher repeatedly,
+ * starting at offset, until it stops matching or maxCount
+ * repetitions have been matched.
+ *
+ * Returns U_PARTIAL_MATCH when incremental is TRUE and either the
+ * owned matcher reports a partial match or offset has reached limit
+ * when repetition stops.  Otherwise returns U_MATCH if at least
+ * minCount repetitions matched; if fewer matched, restores offset to
+ * its value on entry and returns U_MISMATCH.
+ */
+UMatchDegree Quantifier::matches(const Replaceable& text,
+                                 int32_t& offset,
+                                 int32_t limit,
+                                 UBool incremental) const {
+    const int32_t origin = offset;
+    uint32_t reps = 0;
+    for (; reps < maxCount; ++reps) {
+        const UMatchDegree degree =
+            matcher->matches(text, offset, limit, incremental);
+        if (degree == U_MATCH) {
+            continue; // one more repetition matched; try again
+        }
+        if (incremental && degree == U_PARTIAL_MATCH) {
+            return U_PARTIAL_MATCH;
+        }
+        break;
+    }
+    if (incremental && offset == limit) {
+        return U_PARTIAL_MATCH;
+    }
+    if (reps < minCount) {
+        // Too few repetitions: undo any movement of offset
+        offset = origin;
+        return U_MISMATCH;
+    }
+    return U_MATCH;
+}
+
+/**
+ * Implement UnicodeMatcher.  Not yet implemented: nothing is
+ * appended and result is returned unchanged.
+ */
+UnicodeString& Quantifier::toPattern(UnicodeString& result,
+                                     UBool escapeUnprintable) const {
+    // TODO finish this
+    return result;
+}
+
+/**
+ * Implement UnicodeMatcher.  A quantifier that permits zero
+ * repetitions accepts every index value; otherwise the question is
+ * passed on to the owned matcher.
+ */
+UBool Quantifier::matchesIndexValue(uint8_t v) const {
+    if (minCount == 0) {
+        return TRUE;
+    }
+    return matcher->matchesIndexValue(v) ? TRUE : FALSE;
+}
+
+//eof
--- a/icu4c/source/i18n/quant.h
+++ b/icu4c/source/i18n/quant.h
@ -0,0 +1,57 @@
+/*
+* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
+**********************************************************************
+*   Date        Name        Description
+*   07/26/01    aliu        Creation.
+**********************************************************************
+*/
+#ifndef QUANT_H
+#define QUANT_H
+
+#include "unicode/unimatch.h"
+
+/**
+ * A UnicodeMatcher that wraps another matcher together with a
+ * minimum and a maximum repetition count.
+ *
+ * NOTE(review): the class owns a raw pointer and defines a copy
+ * constructor and destructor but declares no copy-assignment
+ * operator, so the compiler-generated one would copy the pointer
+ * only.  Avoid assigning Quantifier objects.
+ */
+class Quantifier : public UnicodeMatcher {
+
+ public:
+
+    /**
+     * Construct a quantifier over the given matcher, which is
+     * adopted by this object.
+     */
+    Quantifier(UnicodeMatcher *adopted,
+               uint32_t minCount, uint32_t maxCount);
+
+    /**
+     * Copy constructor.
+     */
+    Quantifier(const Quantifier& o);
+
+    /**
+     * Destructor.
+     */
+    virtual ~Quantifier();
+
+    /**
+     * Implement UnicodeMatcher
+     */
+    virtual UnicodeMatcher* clone() const;
+
+    /**
+     * Implement UnicodeMatcher
+     */
+    virtual UMatchDegree matches(const Replaceable& text,
+                                 int32_t& offset,
+                                 int32_t limit,
+                                 UBool incremental) const;
+
+    /**
+     * Implement UnicodeMatcher
+     */
+    virtual UnicodeString& toPattern(UnicodeString& result,
+                                     UBool escapeUnprintable = FALSE) const;
+
+    /**
+     * Implement UnicodeMatcher
+     */
+    virtual UBool matchesIndexValue(uint8_t v) const;
+
+ private:
+
+    // The matcher that is repeated.
+    UnicodeMatcher* matcher; // owned
+
+    // Minimum repetition count.
+    uint32_t minCount;
+
+    // Maximum repetition count.
+    uint32_t maxCount;
+};
+
+#endif
--- a/icu4c/source/i18n/rbt_data.cpp
+++ b/icu4c/source/i18n/rbt_data.cpp
@ -13,7 +13,7 @@
 #include "unicode/uniset.h"

 TransliterationRuleData::TransliterationRuleData(UErrorCode& status) :
-    variableNames(0), setVariables(0) {
+    variableNames(0), variables(0) {
    if (U_FAILURE(status)) {
        return;
    }
@ -21,14 +21,14 @@ TransliterationRuleData::TransliterationRuleData(UErrorCode& status) :
    if (U_SUCCESS(status)) {
        variableNames->setValueDeleter(uhash_deleteUnicodeString);
    }
-    setVariables = 0;
-    setVariablesLength = 0;
+    variables = 0;
+    variablesLength = 0;
 }

 TransliterationRuleData::TransliterationRuleData(const TransliterationRuleData& other) :
    ruleSet(other.ruleSet),
-    setVariablesBase(other.setVariablesBase),
-    setVariablesLength(other.setVariablesLength),
+    variablesBase(other.variablesBase),
+    variablesLength(other.variablesLength),
    segmentBase(other.segmentBase) {

    UErrorCode status = U_ZERO_ERROR;
@ -44,29 +44,29 @@ TransliterationRuleData::TransliterationRuleData(const TransliterationRuleData&
        }
    }

-    setVariables = 0;
-    if (other.setVariables != 0) {
-        setVariables = new UnicodeSet*[setVariablesLength];
-        for (int32_t i=0; i<setVariablesLength; ++i) {
-            setVariables[i] = new UnicodeSet(*other.setVariables[i]);
+    variables = 0;
+    if (other.variables != 0) {
+        variables = new UnicodeMatcher*[variablesLength];
+        for (int32_t i=0; i<variablesLength; ++i) {
+            variables[i] = other.variables[i]->clone();
        }
    }    
 }

 TransliterationRuleData::~TransliterationRuleData() {
    delete variableNames;
-    if (setVariables != 0) {
-        for (int32_t i=0; i<setVariablesLength; ++i) {
-            delete setVariables[i];
+    if (variables != 0) {
+        for (int32_t i=0; i<variablesLength; ++i) {
+            delete variables[i];
        }
-        delete[] setVariables;
+        delete[] variables;
    }
 }

-const UnicodeSet*
-TransliterationRuleData::lookupSet(UChar32 standIn) const {
-    int32_t i = standIn - setVariablesBase;
-    return (i >= 0 && i < setVariablesLength) ? setVariables[i] : 0;
+const UnicodeMatcher*
+TransliterationRuleData::lookup(UChar32 standIn) const {
+    int32_t i = standIn - variablesBase;
+    return (i >= 0 && i < variablesLength) ? variables[i] : 0;
 }

 int32_t
--- a/icu4c/source/i18n/rbt_data.h
+++ b/icu4c/source/i18n/rbt_data.h
@ -11,7 +11,7 @@
 #include "rbt_set.h"

 class UnicodeString;
-class UnicodeSet;
+class UnicodeMatcher;
 class Hashtable;

 /**
@ -46,35 +46,35 @@ public:
     * Map variable name (String) to variable (UnicodeString).  A variable name
     * corresponds to zero or more characters, stored in a UnicodeString in
     * this hash.  One or more of these chars may also correspond to a
-     * UnicodeSet, in which case the character in the UnicodeString in this hash is
+     * UnicodeMatcher, in which case the character in the UnicodeString in this hash is
     * a stand-in: it is an index for a secondary lookup in
-     * data.setVariables.  The stand-in also represents the UnicodeSet in
+     * data.variables.  The stand-in also represents the UnicodeMatcher in
     * the stored rules.
     */
    Hashtable* variableNames;

    /**
-     * Map category variable (UChar) to set (UnicodeSet).
+     * Map category variable (UChar) to set (UnicodeMatcher).
     * Variables that correspond to a set of characters are mapped
     * from variable name to a stand-in character in data.variableNames.
     * The stand-in then serves as a key in this hash to lookup the
-     * actual UnicodeSet object.  In addition, the stand-in is
+     * actual UnicodeMatcher object.  In addition, the stand-in is
     * stored in the rule text to represent the set of characters.
-     * setVariables[i] represents character (setVariablesBase + i).
+     * variables[i] represents character (variablesBase + i).
     */
-    UnicodeSet** setVariables;
+    UnicodeMatcher** variables;

    /**
-     * The character that represents setVariables[0].  Characters
-     * setVariablesBase through setVariablesBase +
-     * setVariables.length - 1 represent UnicodeSet objects.
+     * The character that represents variables[0].  Characters
+     * variablesBase through variablesBase +
+     * variablesLength - 1 represent UnicodeMatcher objects.
     */
-    UChar setVariablesBase;
+    UChar variablesBase;

    /**
-     * The length of setVariables.
+     * The length of variables.
     */
-    int32_t setVariablesLength;
+    int32_t variablesLength;

    /**
     * The character that represents segment 1.  Characters segmentBase
@ -90,7 +90,11 @@ public:

    ~TransliterationRuleData();

-    const UnicodeSet* lookupSet(UChar32 standIn) const;
+    /**
+     * Given a stand-in character, return the UnicodeMatcher that it
+     * represents, or NULL.
+     */
+    const UnicodeMatcher* lookup(UChar32 standIn) const;

    /**
     * Return the zero-based index of the segment represented by the given
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@ -7,19 +7,21 @@
 *   11/17/99    aliu        Creation.
 **********************************************************************
 */
-#include "rbt_pars.h"
-#include "unicode/rbt.h"
-#include "rbt_rule.h"
-#include "unirange.h"
-#include "rbt_data.h"
-#include "unicode/uniset.h"
 #include "cstring.h"
-#include "unicode/parsepos.h"
-#include "symtable.h"
-#include "unicode/parseerr.h"
 #include "hash.h"
-#include "unicode/unicode.h"
+#include "quant.h"
+#include "rbt_data.h"
+#include "rbt_pars.h"
+#include "rbt_rule.h"
+#include "strmatch.h"
+#include "symtable.h"
+#include "unirange.h"
+#include "unicode/parseerr.h"
+#include "unicode/parsepos.h"
 #include "unicode/putil.h"
+#include "unicode/rbt.h"
+#include "unicode/unicode.h"
+#include "unicode/uniset.h"

 // Operators
 #define VARIABLE_DEF_OP ((UChar)0x003D) /*=*/
@ -43,6 +45,8 @@
 #define CURSOR_POS         ((UChar)0x007C) /*|*/
 #define CURSOR_OFFSET      ((UChar)0x0040) /*@*/
 #define ANCHOR_START       ((UChar)0x005E) /*^*/
+#define KLEENE_STAR        ((UChar)0x002A) /***/
+#define ONE_OR_MORE        ((UChar)0x002B) /*+*/

 // By definition, the ANCHOR_END special character is a
 // trailing SymbolTable.SYMBOL_REF character.
@ -61,17 +65,17 @@ static const UChar   ID_TOKEN[]   = { 0x3A, 0x3A }; // ':', ':'
 /**
 * This class implements the SymbolTable interface.  It is used
 * during parsing to give UnicodeSet access to variables that
- * have been defined so far.  Note that it uses setVariablesVector,
+ * have been defined so far.  Note that it uses variablesVector,
 * _not_ data.setVariables.
 */
 class ParseData : public SymbolTable {
 public:
    const TransliterationRuleData* data; // alias

-    const UVector* setVariablesVector; // alias
+    const UVector* variablesVector; // alias

    ParseData(const TransliterationRuleData* data = 0,
-              const UVector* setVariablesVector = 0);
+              const UVector* variablesVector = 0);

    virtual const UnicodeString* lookup(const UnicodeString& s) const;

@ -83,7 +87,7 @@ public:

 ParseData::ParseData(const TransliterationRuleData* d,
                     const UVector* sets) :
-    data(d), setVariablesVector(sets) {}
+    data(d), variablesVector(sets) {}

 /**
 * Implement SymbolTable API.
@ -99,11 +103,11 @@ const UnicodeSet* ParseData::lookupSet(UChar32 ch) const {
    // Note that we cannot use data.lookupSet() because the
    // set array has not been constructed yet.
    const UnicodeSet* set = NULL;
-    int32_t i = ch - data->setVariablesBase;
-    if (i >= 0 && i < setVariablesVector->size()) {
-        int32_t i = ch - data->setVariablesBase;
-        set = (i < setVariablesVector->size()) ?
-            (UnicodeSet*) setVariablesVector->elementAt(i) : 0;
+    int32_t i = ch - data->variablesBase;
+    if (i >= 0 && i < variablesVector->size()) {
+        int32_t i = ch - data->variablesBase;
+        set = (i < variablesVector->size()) ?
+            (UnicodeSet*) variablesVector->elementAt(i) : 0;
    }
    return set;
 }
@ -276,7 +280,7 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
            if (escaped == (UChar32) -1) {
                return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rule, start);
            }
-            buf.append((UChar) escaped);
+            buf.append(escaped);
            continue;
        }
        // Handle quoted matter
@ -431,6 +435,40 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
                }
            }
            break;
+        case KLEENE_STAR:
+        case ONE_OR_MORE:
+            // Very limited initial implementation.  Note that this
+            // works strangely for quotes and variables --
+            //  'foo'* => fo o*
+            //  $a = foo; $a * => fo o*
+            // We will fix this later so that
+            //  'foo'* => (foo) *
+            //  $a = foo; $a * => (foo) *
+            // Implement with hidden segments, perhaps at # 10+.
+            {
+                int32_t start, limit;
+                if (segments != 0 &&
+                    segments->size() >= 2 &&
+                    segments->size() % 2 == 0 &&
+                    _voidPtr_to_int32(segments->elementAt(segments->size()-1)) == buf.length()) {
+                    // The * immediately follows a segment
+                    int32_t len = segments->size();
+                    start = _voidPtr_to_int32(segments->elementAt(len - 2));
+                    limit = _voidPtr_to_int32(segments->elementAt(len - 1));
+                    segments->setElementAt(_int32_to_voidPtr(start+1), len-1);
+                } else {
+                    // The * follows an isolated character
+                    // (or quote, or variable reference)
+                    start = buf.length() - 1;
+                    limit = start + 1;
+                }
+                UnicodeMatcher *m =
+                    new StringMatcher(buf, start, limit, *parser.data);
+                m = new Quantifier(m, (c == ONE_OR_MORE)?1:0, 0x7FFFFFFF);
+                buf.truncate(start);
+                buf.append(parser.generateStandInFor(m));
+            }
+            break;
        // case SET_CLOSE:
        default:
            // Disallow unquoted characters other than [0-9A-Za-z]
@ -551,7 +589,7 @@ TransliteratorParser::TransliteratorParser(
                                     UTransDirection theDirection,
                                     UParseError* theParseError) :
    rules(theRules), direction(theDirection), data(0), parseError(theParseError) {
-    parseData = new ParseData(0, &setVariablesVector);
+    parseData = new ParseData(0, &variablesVector);
 }

 /**
@ -589,7 +627,7 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
    }

    parseData->data = data;
-    setVariablesVector.removeAllElements();
+    variablesVector.removeAllElements();
    if (parseError != 0) {
        parseError->code = 0;
    }
@ -668,16 +706,16 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
    }
    
    // Convert the set vector to an array
-    data->setVariablesLength = setVariablesVector.size();
-    data->setVariables = data->setVariablesLength == 0 ? 0 : new UnicodeSet*[data->setVariablesLength];
+    data->variablesLength = variablesVector.size();
+    data->variables = data->variablesLength == 0 ? 0 : new UnicodeMatcher*[data->variablesLength];
    // orphanElement removes the given element and shifts all other
    // elements down.  For performance (and code clarity) we work from
    // the end back to index 0.
    int32_t i;
-    for (i=data->setVariablesLength; i>0; ) {
+    for (i=data->variablesLength; i>0; ) {
        --i;
-        data->setVariables[i] =
-            (UnicodeSet*) setVariablesVector.orphanElementAt(i);
+        data->variables[i] =
+            (UnicodeSet*) variablesVector.orphanElementAt(i);
    }

    // Index the rules
@ -894,14 +932,23 @@ int32_t TransliteratorParser::syntaxError(int32_t parseErrorCode,
 UChar TransliteratorParser::parseSet(const UnicodeString& rule,
                                          ParsePosition& pos) {
    UnicodeSet* set = new UnicodeSet(rule, pos, *parseData, status);
+    set->compact();
+    return generateStandInFor(set);
+}
+
+/**
+ * Generate and return a stand-in for a new UnicodeMatcher.  Store
+ * the matcher (adopt it).
+ */
+UChar TransliteratorParser::generateStandInFor(UnicodeMatcher* adopted) {
+    // assert(adopted != 0);
    if (variableNext >= variableLimit) {
        // throw new RuntimeException("Private use variables exhausted");
-        delete set;
+        delete adopted;
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
-    set->compact();
-    setVariablesVector.addElement(set);
+    variablesVector.addElement(adopted);
    return variableNext++;
 }

@ -949,12 +996,12 @@ void TransliteratorParser::determineVariableRange(void) {

    UnicodeRange* r = privateUse.largestUnusedSubrange(rules);

-    data->setVariablesBase = variableNext = variableLimit = (UChar) 0;
+    data->variablesBase = variableNext = variableLimit = (UChar) 0;
    
    if (r != 0) {
        // Allocate 9 characters for segment references 1 through 9
        data->segmentBase = r->start;
-        data->setVariablesBase = variableNext = (UChar) (data->segmentBase + 9);
+        data->variablesBase = variableNext = (UChar) (data->segmentBase + 9);
        variableLimit = (UChar) (r->start + r->length);
        delete r;
    }
--- a/icu4c/source/i18n/rbt_pars.h
+++ b/icu4c/source/i18n/rbt_pars.h
@ -13,7 +13,7 @@
 #include "unicode/parseerr.h"

 class TransliterationRuleData;
-class UnicodeSet;
+class UnicodeMatcher;
 class ParseData;
 class RuleHalf;
 class ParsePosition;
@ -48,11 +48,11 @@ class TransliteratorParser {
    ParseData* parseData;

    /**
-     * Temporary vector of set variables.  When parsing is complete, this
-     * is copied into the array data.setVariables.  As with data.setVariables,
-     * element 0 corresponds to character data.setVariablesBase.
+     * Temporary vector of matcher variables.  When parsing is complete, this
+     * is copied into the array data.variables.  As with data.variables,
+     * element 0 corresponds to character data.variablesBase.
     */
-    UVector setVariablesVector;
+    UVector variablesVector;

    /**
     * The next available stand-in for variables.  This starts at some point in
@ -169,6 +169,12 @@ private:
    UChar parseSet(const UnicodeString& rule,
                   ParsePosition& pos);

+    /**
+     * Generate and return a stand-in for a new UnicodeMatcher.  Store
+     * the matcher (adopt it).
+     */
+    UChar generateStandInFor(UnicodeMatcher* adopted);
+
    /**
     * Append the value of the given variable name to the given
     * UnicodeString.
--- a/icu4c/source/i18n/rbt_rule.cpp
+++ b/icu4c/source/i18n/rbt_rule.cpp
@ -161,16 +161,16 @@ void TransliterationRule::init(const UnicodeString& input,
    this->segments = adoptedSegs;
    // Find the position of the first segment index that is after the
    // anteContext (in the key).  Note that this may be a start or a
-    // limit index.
+    // limit index.  If all segments are in the ante context,
+    // firstKeySeg should point past the last segment -- that is, it
+    // should point at the end marker, which is -1.  This allows the
+    // code to back up by one to obtain the last ante context segment.
    firstKeySeg = -1;
    if (segments != 0) {
        do {
            ++firstKeySeg;
        } while (segments[firstKeySeg] >= 0 &&
                 segments[firstKeySeg] < anteContextLength);
-        if (segments[firstKeySeg] < 0) {
-            firstKeySeg = -1;
-        }
    }

    pattern = input;
@ -221,7 +221,7 @@ int16_t TransliterationRule::getIndexValue() const {
        return -1;
    }
    UChar32 c = pattern.char32At(anteContextLength);
-    return (int16_t)(data.lookupSet(c) == NULL ? (c & 0xFF) : -1);
+    return (int16_t)(data.lookup(c) == NULL ? (c & 0xFF) : -1);
 }

 /**
@ -241,8 +241,9 @@ UBool TransliterationRule::matchesIndexValue(uint8_t v) const {
        return TRUE;
    }
    UChar32 c = pattern.char32At(anteContextLength);
-    const UnicodeSet* set = data.lookupSet(c);
-    return set == NULL ? (uint8_t(c) == v) : set->containsIndexValue(v);
+    const UnicodeMatcher* matcher = data.lookup(c);
+    return matcher == NULL ? (uint8_t(c) == v) :
+        matcher->matchesIndexValue(v);
 }

 /**
@ -367,17 +368,21 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
    // A mismatch in the ante context, or with the start anchor,
    // is an outright U_MISMATCH regardless of whether we are
    // incremental or not.
-    int32_t cursor = pos.start - UTF_CHAR_LENGTH(text.char32At(pos.start-1));
+    int32_t cursor = pos.start;
    int32_t newStart = 0;
    int32_t i;
+
+    // Backup cursor by one
+    if (cursor > 0) {
+        cursor -= UTF_CHAR_LENGTH(text.char32At(cursor-1));
+    } else {
+        --cursor;
+    }
+
    for (i=anteContextLength-1; i>=0; --i) {
-        while (i == nextSegPos) {
-            segPos[iSeg] = cursor;
-            nextSegPos == (--iSeg >= 0) ? segments[iSeg] : -1;
-        }
        UChar keyChar = pattern.charAt(i);
-        const UnicodeSet* set = data.lookupSet(keyChar);
-        if (set == 0) {
+        const UnicodeMatcher* matcher = data.lookup(keyChar);
+        if (matcher == 0) {
            if (cursor >= pos.contextStart &&
                keyChar == text.charAt(cursor)) {
                --cursor;
@ -386,7 +391,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
            }
        } else {
            // Subtract 1 from contextStart to make it a reverse limit
-            if (set->matches(text, cursor, pos.contextStart-1, FALSE)
+            if (matcher->matches(text, cursor, pos.contextStart-1, FALSE)
                != U_MATCH) {
                return U_MISMATCH;
            }
@ -395,6 +400,15 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
            // Record the position of the cursor
            newStart = cursor;
        }
+        while (nextSegPos == i) {
+            segPos[iSeg] = cursor;
+            if (cursor >= 0) {
+                segPos[iSeg] += UTF_CHAR_LENGTH(text.char32At(cursor));
+            } else {
+                ++segPos[iSeg];
+            }
+            nextSegPos = (--iSeg >= 0) ? segments[iSeg] : -1;
+        }
    }

    // ------------------------ Start Anchor ------------------------
@ -405,8 +419,15 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,

    // -------------------- Key and Post Context --------------------

+    // YUCKY OPTIMIZATION.  To make things a minuscule amount faster,
+    // subtract anteContextLength from all segments[i] with i >=
+    // firstKeySeg.  Then we don't have to do so here.  I only mention
+    // this here in order to say DO NOT DO THIS.  The gain is
+    // minuscule (how long does an integer subtraction take?) and the
+    // increase in confusion isn't worth it.
+
    iSeg = firstKeySeg;
-    nextSegPos = (iSeg >= 0) ? segments[iSeg] : -1;
+    nextSegPos = (iSeg >= 0) ? (segments[iSeg] - anteContextLength) : -1;

    i = 0;
    cursor = pos.start;
@ -424,14 +445,14 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
        }
        while (i == nextSegPos) {
            segPos[iSeg] = cursor;
-            nextSegPos = segments[++iSeg];
+            nextSegPos = segments[++iSeg] - anteContextLength;
        }
        if (i == keyLength) {
            keyLimit = cursor;
        }
        UChar keyChar = pattern.charAt(anteContextLength + i++);
-        const UnicodeSet* set = data.lookupSet(keyChar);
-        if (set == 0) {
+        const UnicodeMatcher* matcher = data.lookup(keyChar);
+        if (matcher == 0) {
            // Don't need the cursor < pos.contextLimit check if
            // incremental is TRUE (because it's done above); do need
            // it otherwise.
@ -443,7 +464,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
            }
        } else {
            UMatchDegree m =
-                set->matches(text, cursor, pos.contextLimit, incremental);
+                matcher->matches(text, cursor, pos.contextLimit, incremental);
            if (m != U_MATCH) {
                return m;
            }
@ -451,7 +472,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
    }
    while (i == nextSegPos) {
        segPos[iSeg] = cursor;
-        nextSegPos = segments[++iSeg];
+        nextSegPos = segments[++iSeg] - anteContextLength;
    }
 	if (i == keyLength) {
 		keyLimit = cursor;
@ -686,11 +707,11 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
        }

        UChar c = pattern.charAt(i);
-        const UnicodeSet *set = data.lookupSet(c);
-        if (set == 0) {
+        const UnicodeMatcher *matcher = data.lookup(c);
+        if (matcher == 0) {
            _appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
        } else {
-            _appendToRule(rule, set->toPattern(str, escapeUnprintable),
+            _appendToRule(rule, matcher->toPattern(str, escapeUnprintable),
                          TRUE, escapeUnprintable, quoteBuf);
        }
    }
--- a/icu4c/source/i18n/strmatch.cpp
+++ b/icu4c/source/i18n/strmatch.cpp
@ -0,0 +1,128 @@
+/*
+* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
+**********************************************************************
+*   Date        Name        Description
+*   07/23/01    aliu        Creation.
+**********************************************************************
+*/
+
+#include "strmatch.h"
+#include "rbt_data.h"
+
+/**
+ * Construct a matcher whose pattern is the substring of theString
+ * between start and limit.  theData is stored by reference and is
+ * used to look up stand-in characters that occur in the pattern.
+ */
+StringMatcher::StringMatcher(const UnicodeString& theString,
+                             int32_t start,
+                             int32_t limit,
+                             const TransliterationRuleData& theData) :
+    data(theData) {
+    // Copy the requested range of theString into pattern
+    theString.extractBetween(start, limit, pattern);
+}
+
+/**
+ * Construct a matcher whose pattern is a copy of all of theString.
+ * theData is stored by reference.
+ */
+StringMatcher::StringMatcher(const UnicodeString& theString,
+                             const TransliterationRuleData& theData) :
+    pattern(theString),
+    data(theData) {
+}
+
+/**
+ * Copy constructor.  The pattern is copied; the copy refers to the
+ * same TransliterationRuleData object as o.
+ */
+StringMatcher::StringMatcher(const StringMatcher& o) :
+    pattern(o.pattern),
+    data(o.data) {
+}
+
+/**
+ * Destructor.  The body is empty; there is nothing to release
+ * explicitly.
+ */
+StringMatcher::~StringMatcher() {
+}
+
+/**
+ * Implement UnicodeMatcher.  Return a newly allocated copy of this
+ * object, made with the copy constructor.
+ */
+UnicodeMatcher* StringMatcher::clone() const {
+    return new StringMatcher(*this);
+}
+
+/**
+ * Implement UnicodeMatcher.  Match this object's pattern against
+ * text, starting at offset.  A pattern character for which
+ * data.lookup() returns a matcher is matched by delegating to that
+ * matcher; any other pattern character is compared literally.
+ *
+ * The direction is taken from the arguments.  If limit < offset the
+ * pattern is matched backward (last pattern character first, with
+ * the cursor moving toward lower indices); otherwise it is matched
+ * forward.  In both directions limit is exclusive: the character at
+ * index limit is never compared against a literal.
+ *
+ * On U_MATCH, offset is updated to the cursor position after the
+ * match.  On any other result this method leaves offset unchanged.
+ * In a forward match with incremental TRUE, reaching limit before
+ * the pattern is exhausted yields U_PARTIAL_MATCH.  A result other
+ * than U_MATCH from a delegated matcher is returned as is.
+ */
+UMatchDegree StringMatcher::matches(const Replaceable& text,
+                                    int32_t& offset,
+                                    int32_t limit,
+                                    UBool incremental) const {
+    int32_t i;
+    int32_t cursor = offset;
+    if (limit < cursor) {
+        // Backward match
+        for (i=pattern.length()-1; i>=0; --i) {
+            UChar keyChar = pattern.charAt(i);
+            const UnicodeMatcher* subm = data.lookup(keyChar);
+            if (subm == 0) {
+                // limit is a reverse limit, i.e., one less than the
+                // lowest index that may be examined, so the test
+                // must be strict.
+                if (cursor > limit &&
+                    keyChar == text.charAt(cursor)) {
+                    --cursor;
+                } else {
+                    return U_MISMATCH;
+                }
+            } else {
+                UMatchDegree m =
+                    subm->matches(text, cursor, limit, incremental);
+                if (m != U_MATCH) {
+                    return m;
+                }
+            }
+        }
+    } else {
+        // Forward match
+        for (i=0; i<pattern.length(); ++i) {
+            if (incremental && cursor == limit) {
+                // We've reached the context limit without a mismatch and
+                // without completing our match.
+                return U_PARTIAL_MATCH;
+            }
+            UChar keyChar = pattern.charAt(i);
+            const UnicodeMatcher* subm = data.lookup(keyChar);
+            if (subm == 0) {
+                // Don't need the cursor < limit check if
+                // incremental is TRUE (because it's done above); do need
+                // it otherwise.
+                if (cursor < limit &&
+                    keyChar == text.charAt(cursor)) {
+                    ++cursor;
+                } else {
+                    return U_MISMATCH;
+                }
+            } else {
+                UMatchDegree m =
+                    subm->matches(text, cursor, limit, incremental);
+                if (m != U_MATCH) {
+                    return m;
+                }
+            }
+        }
+    }
+
+    // Only a complete match moves the caller's offset
+    offset = cursor;
+    return U_MATCH;
+}
+
+/**
+ * Implement UnicodeMatcher.  Not yet implemented: the loop over the
+ * pattern has an empty body, so nothing is appended and result is
+ * returned unchanged.
+ */
+UnicodeString& StringMatcher::toPattern(UnicodeString& result,
+                                        UBool escapeUnprintable) const {
+    for (int32_t i=0; i<pattern.length(); ++i) {
+        // TODO finish this
+    }
+    return result;
+}
+
+/**
+ * Implement UnicodeMatcher.  An empty pattern accepts every index
+ * value.  Otherwise only the first code point of the pattern is
+ * consulted: if data.lookup() maps it to a matcher, that matcher
+ * decides; if not, its low byte is compared with v.
+ */
+UBool StringMatcher::matchesIndexValue(uint8_t v) const {
+    if (pattern.length() == 0) {
+        return TRUE;
+    }
+    UChar32 first = pattern.char32At(0);
+    const UnicodeMatcher *sub = data.lookup(first);
+    if (sub == 0) {
+        // Literal character: compare its low byte
+        return (first & 0xFF) == v;
+    }
+    return sub->matchesIndexValue(v);
+}
+
+//eof
--- a/icu4c/source/i18n/strmatch.h
+++ b/icu4c/source/i18n/strmatch.h
@ -0,0 +1,69 @@
+/*
+* Copyright (C) 2001, International Business Machines Corporation and others. All Rights Reserved.
+**********************************************************************
+*   Date        Name        Description
+*   07/23/01    aliu        Creation.
+**********************************************************************
+*/
+#ifndef STRMATCH_H
+#define STRMATCH_H
+
+#include "unicode/unistr.h"
+#include "unicode/unimatch.h"
+
+class TransliterationRuleData;
+
+/**
+ * An object that matches a string.  The string is a pattern whose
+ * characters are either literals or stand-ins that are resolved to
+ * other UnicodeMatcher objects through a TransliterationRuleData.
+ */
+class StringMatcher : public UnicodeMatcher {
+
+ public:
+
+    /**
+     * Construct a matcher for the substring of string between start
+     * and limit.
+     */
+    StringMatcher(const UnicodeString& string,
+                  int32_t start,
+                  int32_t limit,
+                  const TransliterationRuleData& data);
+
+    /**
+     * Construct a matcher for the whole of string.
+     */
+    StringMatcher(const UnicodeString& string,
+                  const TransliterationRuleData& data);
+
+    /**
+     * Copy constructor.
+     */
+    StringMatcher(const StringMatcher& o);
+        
+    /**
+     * Destructor
+     */
+    virtual ~StringMatcher();
+
+    /**
+     * Implement UnicodeMatcher
+     */
+    virtual UnicodeMatcher* clone() const;
+
+    /**
+     * Implement UnicodeMatcher
+     */
+    virtual UMatchDegree matches(const Replaceable& text,
+                                 int32_t& offset,
+                                 int32_t limit,
+                                 UBool incremental) const;
+
+    /**
+     * Implement UnicodeMatcher
+     */
+    virtual UnicodeString& toPattern(UnicodeString& result,
+                                     UBool escapeUnprintable = FALSE) const;
+
+    /**
+     * Implement UnicodeMatcher
+     */
+    virtual UBool matchesIndexValue(uint8_t v) const;
+
+ private:
+
+    // The string to match.
+    UnicodeString pattern;
+
+    // Held by reference, not copied.
+    // NOTE(review): the referenced object must remain valid for as
+    // long as this matcher is used.
+    const TransliterationRuleData& data;
+};
+
+#endif
--- a/icu4c/source/i18n/translit.cpp
+++ b/icu4c/source/i18n/translit.cpp
@ -149,7 +149,7 @@ Transliterator::Transliterator(const Transliterator& other) :
    maximumContextLength(other.maximumContextLength) {
    if (other.filter != 0) {
        // We own the filter, so we must have our own copy
-        filter = other.filter->clone();
+        filter = (UnicodeFilter*) other.filter->clone();
    }
 }

@ -160,7 +160,7 @@ Transliterator& Transliterator::operator=(const Transliterator& other) {
    ID = other.ID;
    maximumContextLength = other.maximumContextLength;
    // MUST go through adoptFilter in case latter is overridden
-    adoptFilter((other.filter == 0) ? 0 : other.filter->clone());
+    adoptFilter((other.filter == 0) ? 0 : (UnicodeFilter*) other.filter->clone());
    return *this;
 }

@ -361,6 +361,25 @@ void Transliterator::_transliterate(Replaceable& text,

    filteredTransliterate(text, index, TRUE);

+#if 0
+    // I CAN'T DO what I'm attempting below now that the Kleene star
+    // operator is supported.  For example, in the rule
+
+    //   ([:Lu:]+) { x } > $1;
+
+    // what is the maximum context length?  getMaximumContextLength()
+    // will return 1, but this is just the length of the ante context
+    // part of the pattern string -- 1 character, which is a standin
+    // for a Quantifier, which contains a StringMatcher, which
+    // contains a UnicodeSet.
+
+    // There is a complicated way to make this work again, and that's
+    // to add a "maximum left context" protocol into the
+    // UnicodeMatcher hierarchy.  At present I'm not convinced this is
+    // worth it.
+
+    // ---
+
    // The purpose of the code below is to keep the context small
    // while doing incremental transliteration.  When part of the left
    // context (between contextStart and start) is no longer needed,
@ -373,6 +392,7 @@ void Transliterator::_transliterate(Replaceable& text,
        newCS -= UTF_CHAR_LENGTH(text.char32At(newCS)) - 1;
    }
    index.contextStart = uprv_max(newCS, originalStart);
+#endif
 }

 /**
--- a/icu4c/source/i18n/unicode/unifilt.h
+++ b/icu4c/source/i18n/unicode/unifilt.h
@ -38,12 +38,15 @@ public:
    virtual UBool contains(UChar32 c) const = 0;

    /**
-     * Returns a copy of this object.  All UnicodeFilter objects have
-     * to support cloning in order to allow classes using
-     * UnicodeFilters, such as Transliterator, to implement cloning.
-     * @draft
+     * UnicodeMatcher API.  This class stubs this out: the stub returns result unmodified.
     */
-    virtual UnicodeFilter* clone() const = 0;
+    UnicodeString& toPattern(UnicodeString& result,
+                             UBool escapeUnprintable) const;
+
+    /**
+     * UnicodeMatcher API.  This class stubs this out: the stub in
+     * unifilt.cpp ignores v and always returns FALSE, for filters that
+     * do not implement indexing.
+     */
+    UBool matchesIndexValue(uint8_t v) const;

    /**
     * Implement UnicodeMatcher API.
--- a/icu4c/source/i18n/unicode/unimatch.h
+++ b/icu4c/source/i18n/unicode/unimatch.h
@ -11,6 +11,7 @@
 #include "unicode/utypes.h"

 class Replaceable;
+class UnicodeString;

 /**
 * Constants returned by <code>UnicodeMatcher::matches()</code>
@ -59,6 +60,13 @@ public:
     */
    virtual ~UnicodeMatcher();

+    /**
+     * Returns a copy of this object.  All UnicodeMatcher objects have
+     * to support cloning in order to allow classes using
+     * UnicodeMatchers to implement cloning.
+     *
+     * The result is typed as UnicodeMatcher*, so callers that need a
+     * more derived type (for example UnicodeFilter*) have to cast the
+     * returned pointer.  The implementations in this change allocate
+     * the copy with new; presumably the caller takes ownership of it.
+     */
+    virtual UnicodeMatcher* clone() const = 0;
+
    /**
     * Return a UMatchDegree value indicating the degree of match for
     * the given text at the given offset.  Zero, one, or more
@ -106,6 +114,28 @@ public:
                                 int32_t limit,
                                 UBool incremental) const = 0;

+    /**
+     * Returns a string representation of this matcher.  If the result of
+     * calling this function is passed to the appropriate parser, it
+     * will produce another matcher that is equal to this one.
+     * @param result the string to receive the pattern.  Previous
+     * contents will be deleted.
+     * @param escapeUnprintable if TRUE then convert unprintable
+     * characters to their hex escape representations, \uxxxx or
+     * \Uxxxxxxxx.  Unprintable characters are those other than
+     * U+000A, U+0020..U+007E.  Defaults to FALSE.
+     * @return a UnicodeString reference; presumably the result
+     * argument itself, as in the UnicodeFilter stub.
+     */
+    virtual UnicodeString& toPattern(UnicodeString& result,
+                                     UBool escapeUnprintable = FALSE) const = 0;
+
+    /**
+     * Returns TRUE if this matcher will match a character c, where
+     * (c & 0xFF) == v, at offset, in the forward direction (with limit >
+     * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
+     * indexing.
+     * @param v the index value to test, i.e. the low 8 bits of a
+     * candidate character.
+     */
+    virtual UBool matchesIndexValue(uint8_t v) const = 0;
+
 protected:

    UnicodeMatcher();
--- a/icu4c/source/i18n/unicode/uniset.h
+++ b/icu4c/source/i18n/unicode/uniset.h
@ -365,12 +365,12 @@ public:
    UBool operator!=(const UnicodeSet& o) const;

    /**
-     * Returns a copy of this object.  All UnicodeFilter objects have
+     * Returns a copy of this object.  All UnicodeMatcher objects have
     * to support cloning in order to allow classes using
-     * UnicodeFilters, such as Transliterator, to implement cloning.
+     * UnicodeMatchers, such as Transliterator, to implement cloning.
     * @draft
     */
-    virtual UnicodeFilter* clone() const;
+    virtual UnicodeMatcher* clone() const;

    /**
     * Returns the hash code value for this set.
@ -691,7 +691,7 @@ private:
     * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
     * indexing.
     */
-    UBool containsIndexValue(uint8_t v) const;
+    virtual UBool matchesIndexValue(uint8_t v) const;

 private:

--- a/icu4c/source/i18n/unifilt.cpp
+++ b/icu4c/source/i18n/unifilt.cpp
@ -40,3 +40,16 @@ UMatchDegree UnicodeFilter::matches(const Replaceable& text,
    }
    return U_MISMATCH;
 }
+
+// Stub implementation of the UnicodeMatcher toPattern() API, for filters that do not implement a pattern
+UnicodeString& UnicodeFilter::toPattern(UnicodeString& result,
+                                        UBool escapeUnprintable) const { // escapeUnprintable is not used by the stub
+    return result; // NOTE(review): result is returned as-is, not cleared, although unimatch.h says previous contents will be deleted -- confirm
+}
+
+// Stub implementation of the UnicodeMatcher matchesIndexValue() API, for filters that do not implement indexing
+UBool UnicodeFilter::matchesIndexValue(uint8_t v) const { // v is not used by the stub
+    return FALSE; // the stub never reports a match for any index value
+}
+
+//eof
--- a/icu4c/source/i18n/unifltlg.cpp
+++ b/icu4c/source/i18n/unifltlg.cpp
@ -22,7 +22,7 @@ public:
    NullFilter(const NullFilter& f) : UnicodeFilter(f) { result = f.result; }
    virtual ~NullFilter() {}
    virtual UBool contains(UChar32 /*c*/) const { return result; }
-    virtual UnicodeFilter* clone() const { return new NullFilter(*this); }
+    virtual UnicodeMatcher* clone() const { return new NullFilter(*this); }
 };

 class UnicodeNotFilter : public UnicodeFilter {
@ -32,15 +32,15 @@ public:
    UnicodeNotFilter(const UnicodeNotFilter&);
    virtual ~UnicodeNotFilter();
    virtual UBool contains(UChar32 c) const;
-    virtual UnicodeFilter* clone() const;
+    virtual UnicodeMatcher* clone() const;
 };

 UnicodeNotFilter::UnicodeNotFilter(UnicodeFilter* adopted) : filt(adopted) {}
 UnicodeNotFilter::UnicodeNotFilter(const UnicodeNotFilter& f)
- : UnicodeFilter(f), filt(f.filt->clone()) {}
+ : UnicodeFilter(f), filt((UnicodeFilter*) f.filt->clone()) {}
 UnicodeNotFilter::~UnicodeNotFilter() { delete filt; }
 UBool UnicodeNotFilter::contains(UChar32 c) const { return !filt->contains(c); }
-UnicodeFilter* UnicodeNotFilter::clone() const { return new UnicodeNotFilter(*this); }
+UnicodeMatcher* UnicodeNotFilter::clone() const { return new UnicodeNotFilter(*this); }

 /**
 * Returns a <tt>UnicodeFilter</tt> that implements the inverse of
@ -50,7 +50,7 @@ UnicodeFilter* UnicodeFilterLogic::createNot(const UnicodeFilter* f) {
    if (f == 0) {
        return new NullFilter(FALSE);
    } else {
-        return new UnicodeNotFilter(f->clone());
+        return new UnicodeNotFilter((UnicodeFilter*)f->clone());
    }
 }

@ -62,15 +62,15 @@ public:
    UnicodeAndFilter(const UnicodeAndFilter&);
    virtual ~UnicodeAndFilter();
    virtual UBool contains(UChar32 c) const;
-    virtual UnicodeFilter* clone() const;
+    virtual UnicodeMatcher* clone() const;
 };

 UnicodeAndFilter::UnicodeAndFilter(UnicodeFilter* f1, UnicodeFilter* f2) : filt1(f1), filt2(f2) {}
 UnicodeAndFilter::UnicodeAndFilter(const UnicodeAndFilter& f)
- : UnicodeFilter(f), filt1(f.filt1->clone()), filt2(f.filt2->clone()) {}
+ : UnicodeFilter(f), filt1((UnicodeFilter*)f.filt1->clone()), filt2((UnicodeFilter*)f.filt2->clone()) {}
 UnicodeAndFilter::~UnicodeAndFilter() { delete filt1; delete filt2; }
 UBool UnicodeAndFilter::contains(UChar32 c) const { return filt1->contains(c) && filt2->contains(c); }
-UnicodeFilter* UnicodeAndFilter::clone() const { return new UnicodeAndFilter(*this); }
+UnicodeMatcher* UnicodeAndFilter::clone() const { return new UnicodeAndFilter(*this); }

 /**
 * Returns a <tt>UnicodeFilter</tt> that implements a short
@ -84,12 +84,12 @@ UnicodeFilter* UnicodeFilterLogic::createAnd(const UnicodeFilter* f,
        if (g == 0) {
            return NULL;
        }
-        return g->clone();
+        return (UnicodeFilter*)g->clone();
    }
    if (g == 0) {
-        return f->clone();
+        return (UnicodeFilter*)f->clone();
    }
-    return new UnicodeAndFilter(f->clone(), g->clone());
+    return new UnicodeAndFilter((UnicodeFilter*)f->clone(), (UnicodeFilter*)g->clone());
 }

 class UnicodeOrFilter : public UnicodeFilter {
@ -100,15 +100,15 @@ public:
    UnicodeOrFilter(const UnicodeOrFilter&);
    virtual ~UnicodeOrFilter();
    virtual UBool contains(UChar32 c) const;
-    virtual UnicodeFilter* clone() const;
+    virtual UnicodeMatcher* clone() const;
 };

 UnicodeOrFilter::UnicodeOrFilter(UnicodeFilter* f1, UnicodeFilter* f2) : filt1(f1), filt2(f2) {}
 UnicodeOrFilter::UnicodeOrFilter(const UnicodeOrFilter& f)
- : UnicodeFilter(f), filt1(f.filt1->clone()), filt2(f.filt2->clone()) {}
+ : UnicodeFilter(f), filt1((UnicodeFilter*)f.filt1->clone()), filt2((UnicodeFilter*)f.filt2->clone()) {}
 UnicodeOrFilter::~UnicodeOrFilter() { delete filt1; delete filt2; }
 UBool UnicodeOrFilter::contains(UChar32 c) const { return filt1->contains(c) || filt2->contains(c); }
-UnicodeFilter* UnicodeOrFilter::clone() const { return new UnicodeOrFilter(*this); }
+UnicodeMatcher* UnicodeOrFilter::clone() const { return new UnicodeOrFilter(*this); }

 /**
 * Returns a <tt>UnicodeFilter</tt> that implements a short
@ -122,10 +122,10 @@ UnicodeFilter* UnicodeFilterLogic::createOr(const UnicodeFilter* f,
        if (g == 0) {
            return NULL;
        }
-        return g->clone();
+        return (UnicodeFilter*)g->clone();
    }
    if (g == 0) {
-        return f->clone();
+        return (UnicodeFilter*)f->clone();
    }
-    return new UnicodeOrFilter(f->clone(), g->clone());
+    return new UnicodeOrFilter((UnicodeFilter*)f->clone(), (UnicodeFilter*)g->clone());
 }
--- a/icu4c/source/i18n/uniset.cpp
+++ b/icu4c/source/i18n/uniset.cpp
@ -228,11 +228,11 @@ UBool UnicodeSet::operator==(const UnicodeSet& o) const {
 }

 /**
- * Returns a copy of this object.  All UnicodeFilter objects have
+ * Returns a copy of this object.  All UnicodeMatcher objects have
 * to support cloning in order to allow classes using
- * UnicodeFilters, such as Transliterator, to implement cloning.
+ * UnicodeMatchers, such as Transliterator, to implement cloning.
 */
-UnicodeFilter* UnicodeSet::clone() const {
+UnicodeMatcher* UnicodeSet::clone() const {
    return new UnicodeSet(*this);
 }

@ -547,7 +547,7 @@ UBool UnicodeSet::contains(UChar32 c) const {
 * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
 * indexing.
 */
-UBool UnicodeSet::containsIndexValue(uint8_t v) const {
+UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
    /* The index value v, in the range [0,255], is contained in this set if
     * it is contained in any pair of this set.  Pairs either have the high
     * bytes equal, or unequal.  If the high bytes are equal, then we have
--- a/icu4c/source/test/intltest/hajatrts.cpp
+++ b/icu4c/source/test/intltest/hajatrts.cpp
@ -72,7 +72,7 @@ static void pseudoHandleTransliterate(const Transliterator* t,
 * Used by TestConstruction() and TestTransliterate.
 */
 class TestHangulFilter : public UnicodeFilter {
-    virtual UnicodeFilter* clone() const {
+    virtual UnicodeMatcher* clone() const {
        return new TestHangulFilter(*this);
    }
    virtual UBool contains(UChar32 c) const {
--- a/icu4c/source/test/intltest/hxuntrts.cpp
+++ b/icu4c/source/test/intltest/hxuntrts.cpp
@ -56,7 +56,7 @@ void HexToUniTransliteratorTest::runIndexedTest( int32_t index, UBool exec, cons
 * Used by TestConstruction() and TestTransliterate.
 */
 class TestHexFilter : public UnicodeFilter {
-    virtual UnicodeFilter* clone() const {
+    virtual UnicodeMatcher* clone() const {
        return new TestHexFilter(*this);
    }
    virtual UBool contains(UChar32 c) const {
--- a/icu4c/source/test/intltest/jahatrts.cpp
+++ b/icu4c/source/test/intltest/jahatrts.cpp
@ -70,7 +70,7 @@ static void pseudoHandleTransliterate(const Transliterator* t,
 * Used by TestConstruction() and TestTransliterate.
 */
 class TestJamoFilter : public UnicodeFilter {
-    virtual UnicodeFilter* clone() const {
+    virtual UnicodeMatcher* clone() const {
        return new TestJamoFilter(*this);
    }
    virtual UBool contains(UChar32 c) const {
--- a/icu4c/source/test/intltest/transapi.cpp
+++ b/icu4c/source/test/intltest/transapi.cpp
@ -615,7 +615,7 @@ void TransliteratorAPITest::TestRegisterUnregister(){
 * Used by TestFiltering().
 */
 class TestFilter1 : public UnicodeFilter {
-    virtual UnicodeFilter* clone() const {
+    virtual UnicodeMatcher* clone() const {
        return new TestFilter1(*this);
    }
    virtual UBool contains(UChar32 c) const {
@ -626,7 +626,7 @@ class TestFilter1 : public UnicodeFilter {
    }
 };
 class TestFilter2 : public UnicodeFilter {
-    virtual UnicodeFilter* clone() const {
+    virtual UnicodeMatcher* clone() const {
        return new TestFilter2(*this);
    }
    virtual UBool contains(UChar32 c) const {
@ -637,7 +637,7 @@ class TestFilter2 : public UnicodeFilter {
    }
 };
 class TestFilter3 : public UnicodeFilter {
-    virtual UnicodeFilter* clone() const {
+    virtual UnicodeMatcher* clone() const {
        return new TestFilter3(*this);
    }
    virtual UBool contains(UChar32 c) const {
--- a/icu4c/source/test/intltest/transtst.cpp
+++ b/icu4c/source/test/intltest/transtst.cpp
@ -68,6 +68,7 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
        TESTCASE(32,TestToRules);
        TESTCASE(33,TestContext);
        TESTCASE(34,TestSupplemental);
+        TESTCASE(35,TestQuantifier);
        default: name = ""; break;
    }
 }
@ -477,7 +478,7 @@ void TransliteratorTest::TestCompoundHex(void) {
 * Used by TestFiltering().
 */
 class TestFilter : public UnicodeFilter {
-    virtual UnicodeFilter* clone() const {
+    virtual UnicodeMatcher* clone() const {
        return new TestFilter(*this);
    }
    virtual UBool contains(UChar32 c) const {
@ -1501,6 +1502,36 @@ void TransliteratorTest::TestSupplemental() {
           CharsToUnicodeString("ky\\U00010400y\\U00010400m"));
 }

+void TransliteratorTest::TestQuantifier() { // Exercises the plus and star quantifiers in rules; each expect() presumably takes (rules, input, expected output)
+
+    expect("(ab)+ {x} > '(' $1 ')';", // plus on a parenthesized group placed before the braces; $1 is echoed in the output
+           "x abx ababxy",
+           "x ab(ab) abab(abab)y");
+
+    expect("b+ > x;", // plus on a single character: each run of b becomes one x
+           "ac abc abbc abbbc",
+           "ac axc axc axc");
+
+    expect("[abc]+ > x;", // plus on a set: each run of set members becomes one x
+           "qac abrc abbcs abtbbc",
+           "qx xrx xs xtx");
+
+    expect("q{(ab)+} > x;", // plus on a group inside the braces, with q in front; at least one ab is required
+           "qa qab qaba qababc qaba",
+           "qa qx qxa qxc qxa");
+
+    expect("q(ab)* > x;", // star on a group: zero occurrences also match, so a bare q is replaced too
+           "qa qab qaba qababc",
+           "xa x xa xc");
+
+    // Oddity -- "(foo)* > $1" causes $1 to match the whole run of "foo"s.
+    // In perl, $1 holds only a single occurrence, so the output there
+    // would be "()a (ab) (ab)a (ab)c".
+    expect("q(ab)* > '(' $1 ')';",
+           "qa qab qaba qababc",
+           "()a (ab) (ab)a (abab)c");
+}
+
 //======================================================================
 // Support methods
 //======================================================================
--- a/icu4c/source/test/intltest/transtst.h
+++ b/icu4c/source/test/intltest/transtst.h
@ -172,6 +172,8 @@ class TransliteratorTest : public IntlTest {

    void TestSupplemental(void);

+    void TestQuantifier(void);
+
    //======================================================================
    // Support methods
    //======================================================================
--- a/icu4c/source/test/intltest/ufltlgts.cpp
+++ b/icu4c/source/test/intltest/ufltlgts.cpp
@ -36,7 +36,7 @@ void UnicodeFilterLogicTest::runIndexedTest( int32_t index, UBool exec, const ch
 }

 class Filter1: public UnicodeFilter{
-    virtual UnicodeFilter* clone() const{
+    virtual UnicodeMatcher* clone() const{
        return new Filter1(*this);
    }
    virtual UBool contains(UChar32 c) const {
@ -47,7 +47,7 @@ class Filter1: public UnicodeFilter{
    }
 };
 class Filter2: public UnicodeFilter{
-    virtual UnicodeFilter* clone() const{
+    virtual UnicodeMatcher* clone() const{
        return new Filter2(*this);
    }
    virtual UBool contains(UChar32 c) const {
--- a/icu4c/source/test/intltest/unhxtrts.cpp
+++ b/icu4c/source/test/intltest/unhxtrts.cpp
@ -68,7 +68,7 @@ static void pseudoHandleTransliterate(const Transliterator* t,
 * Used by TestConstruction() and TestTransliterate.
 */
 class TestUniFilter : public UnicodeFilter {
-    virtual UnicodeFilter* clone() const {
+    virtual UnicodeMatcher* clone() const {
        return new TestUniFilter(*this);
    }
    virtual UBool contains(UChar32 c) const {