ICU-1261 initial implementation of compound filters in IDs and ::ID blocks

X-SVN-Rev: 6154
2025-04-08 23:10:40 +00:00 · 2001-10-10 19:29:45 +00:00 · 2001-10-10 19:29:45 +00:00 · 267a914bc3
commit 267a914bc3
parent c968b1ea77
7 changed files with 413 additions and 302 deletions
--- a/icu4c/source/i18n/cpdtrans.cpp
+++ b/icu4c/source/i18n/cpdtrans.cpp
@ -10,6 +10,7 @@
 #include "unicode/cpdtrans.h"
 #include "unicode/unifilt.h"
 #include "unicode/unifltlg.h"
+#include "unicode/uniset.h"
 #include "uvector.h"

 // keep in sync with Transliterator
@ -129,13 +130,18 @@ void CompoundTransliterator::init(const UnicodeString& id,
    }

    UVector list(status);
+    UnicodeSet* compoundFilter = NULL;
    UnicodeString regenID;
    Transliterator::parseCompoundID(id, regenID, direction,
                                    idSplitPoint, adoptedSplitTrans,
-                                    list, compoundRBTIndex,
+                                    list, compoundRBTIndex, compoundFilter,
                                    parseError, status);

    init(list, direction, fixReverseID, status);
+
+    if (compoundFilter != NULL) {
+        adoptFilter(compoundFilter);
+    }
 }

 /**
--- a/icu4c/source/i18n/rbt.cpp
+++ b/icu4c/source/i18n/rbt.cpp
@ -23,12 +23,24 @@ void RuleBasedTransliterator::_construct(const UnicodeString& rules,
                                         UErrorCode& status) {
    data = 0;
    isDataOwned = TRUE;
-    if (U_SUCCESS(status)) {
-        data = TransliteratorParser::parse(rules, direction, parseError,status);
-        if (U_SUCCESS(status)) {
-            setMaximumContextLength(data->ruleSet.getMaximumContextLength());
-        }
+    // Nothing to do if the caller already has an error pending; data stays 0.
+    if (U_FAILURE(status)) {
+        return;
    }
+
+    // The parser owns everything it produces until we orphan it, so every
+    // early return below releases the parse results automatically.
+    TransliteratorParser parser;
+    parser.parse(rules, direction, parseError, status);
+    if (U_FAILURE(status)) {
+        return;
+    }
+
+    if (parser.idBlock.length() != 0 ||
+        parser.compoundFilter != NULL) {
+        status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
+        return;
+    }
+
+    // The parser discards its data object and leaves 'data' NULL when the
+    // source contained no rules at all.  Dereferencing it below would
+    // crash, so report the empty rule set as a syntax error instead.
+    if (parser.data == NULL) {
+        status = U_INVALID_RBT_SYNTAX;
+        return;
+    }
+
+    // Take ownership of the parsed rule data (isDataOwned is TRUE above).
+    data = parser.orphanData();
+    setMaximumContextLength(data->ruleSet.getMaximumContextLength());
 }

 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@ -16,6 +16,7 @@
 #include "strmatch.h"
 #include "symtable.h"
 #include "unirange.h"
+#include "uvector.h"
 #include "unicode/parseerr.h"
 #include "unicode/parsepos.h"
 #include "unicode/putil.h"
@ -795,89 +796,63 @@ int32_t* RuleHalf::createSegments(UErrorCode& status) const {
 }

 //----------------------------------------------------------------------
-// END RuleHalf
+// PUBLIC API
 //----------------------------------------------------------------------

-TransliterationRuleData*
-TransliteratorParser::parse(const UnicodeString& rules,
-                            UTransDirection direction,
-                            UParseError& parseError,
-                            UErrorCode& ec) {
-    TransliteratorParser parser(rules, direction, parseError);
-    UnicodeString idBlock;
-    int32_t idSplitPoint, count;
-    parser.parseRules(idBlock, idSplitPoint, count);
-    if (U_FAILURE(parser.status) || idBlock.length() != 0) {
-        delete parser.data;
-        parser.data = 0;
-        ec = U_FAILURE(parser.status) ? parser.status : U_ILLEGAL_ARGUMENT_ERROR;
-    }
-    return parser.data;
-}
-
 /**
- * Parse a given set of rules.  Return up to three pieces of
- * parsed data.  These are the header ::id block, the rule block,
- * and the footer ::id block.  Any or all of these may be empty.
- * If the ::id blocks are empty, their corresponding parameters
- * are returned as the empty string.  If there are no rules, the
- * TransliterationRuleData result is 0.
- * @param ruleDataResult caller owns the pointer stored here.
- * May be NULL.
- * @param headerRule string including semicolons for the header
- * ::id block.  May be empty.
- * @param footerRule string including semicolons for the footer
- * ::id block.  May be empty.
+ * Constructor.
 */
-void TransliteratorParser::parse(const UnicodeString& rules,
-                                 UTransDirection direction,
-                                 TransliterationRuleData*& ruleDataResult,
-                                 UnicodeString& idBlockResult,
-                                 int32_t& idSplitPointResult,
-                                 UParseError& parseError,
-                                 UErrorCode& ec) {
-    if (U_FAILURE(ec)) {
-        ruleDataResult = 0;
-        return;
-    }
-    TransliteratorParser parser(rules, direction, parseError);
-    int32_t count;
-    parser.parseRules(idBlockResult, idSplitPointResult, count);
-    if (U_FAILURE(parser.status) || count == 0) {
-        delete parser.data;
-        parser.data = 0;
-    }
-    ruleDataResult = parser.data;
-    ec = parser.status;
-}
-
-/**
- * @param rules list of rules, separated by newline characters
- * @exception IllegalArgumentException if there is a syntax error in the
- * rules
- */
-
-/* Ram: Reordered member initializers to match declaration order and make GCC happy */
-TransliteratorParser::TransliteratorParser(
-                                     const UnicodeString& theRules,
-                                     UTransDirection theDirection,
-                                     UParseError& theParseError)
- :  
-    rules(theRules), direction(theDirection),data(0),parseError(theParseError), variablesVector(status) 
-{
-    parseData = new ParseData(0, &variablesVector);
-    if (parseData == NULL) {
-        status = U_MEMORY_ALLOCATION_ERROR;
-    }
+TransliteratorParser::TransliteratorParser() {
+    // Owned pointers start out NULL so that the destructor is safe even
+    // if parse() is never called.
+    data = NULL;
+    compoundFilter = NULL;
+    parseData = NULL;
+    variablesVector = NULL;
+
+    // Give the remaining parse state defined values too, so that the
+    // public idSplitPoint (and the internal status/error record) are
+    // never read uninitialized before the first parse().  These match
+    // the values parseRules() assigns at the start of a parse.
+    idSplitPoint = -1;
+    ruleCount = 0;
+    direction = UTRANS_FORWARD;
+    status = U_ZERO_ERROR;
+    parseError.line = parseError.offset = 0;
+    parseError.preContext[0] = parseError.postContext[0] = (UChar)0;
 }

 /**
 * Destructor.
 */
 TransliteratorParser::~TransliteratorParser() {
+    delete data;            // NULL once the caller has taken it via orphanData()
+    delete compoundFilter;  // NULL once the caller has taken it via orphanCompoundFilter()
    delete parseData;
+    delete variablesVector;
 }

+/**
+ * Parse 'rules' in the given direction and report the outcome.
+ * Does nothing if 'ec' already indicates failure on entry.  Otherwise
+ * the parser's own parse-error record and status code are copied to
+ * 'pe' and 'ec'; the results are then available in the data members.
+ */
+void
+TransliteratorParser::parse(const UnicodeString& rules,
+                            UTransDirection direction,
+                            UParseError& pe,
+                            UErrorCode& ec) {
+    if (U_FAILURE(ec)) {
+        return; // incoming error: leave pe and ec untouched
+    }
+    parseRules(rules, direction);
+    pe = parseError;
+    ec = status;
+}
+
+/**
+ * Hand the compound filter held by this parser over to the caller.
+ * The parser forgets the pointer, so its destructor will not delete
+ * it.  Returns NULL if the parser holds no filter.
+ */
+UnicodeSet* TransliteratorParser::orphanCompoundFilter() {
+    UnicodeSet* released = compoundFilter;
+    compoundFilter = NULL;
+    return released;
+}
+
+/**
+ * Hand the rule data object held by this parser over to the caller.
+ * The parser forgets the pointer, so its destructor will not delete
+ * it.  Returns NULL if the parser holds no data object.
+ */
+TransliterationRuleData* TransliteratorParser::orphanData() {
+    TransliterationRuleData* released = data;
+    data = NULL;
+    return released;
+}
+
+//----------------------------------------------------------------------
+// Private implementation
+//----------------------------------------------------------------------
+
 /**
 * Parse the given string as a sequence of rules, separated by newline
 * characters ('\n'), and cause this object to implement those rules.  Any
@ -886,18 +861,12 @@ TransliteratorParser::~TransliteratorParser() {
 * @exception IllegalArgumentException if there is a syntax error in the
 * rules
 */
-void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
-                                      int32_t& idSplitPointResult,
-                                      int32_t& ruleCount) {
-    status = U_ZERO_ERROR;
-    ruleCount = 0;
-
+void TransliteratorParser::parseRules(const UnicodeString& rules,
+                                      UTransDirection theDirection) {
    // Clear error struct
-    //if (parseError != 0) {
-        //parseError->code = parseError->line = 0;
-        parseError.offset = 0;
-        parseError.preContext[0] = parseError.postContext[0] = (UChar)0;
-    //}
+    parseError.line = parseError.offset = 0;
+    parseError.preContext[0] = parseError.postContext[0] = (UChar)0;
+    status = U_ZERO_ERROR;

    delete data;
    data = new TransliterationRuleData(status);
@ -905,17 +874,28 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
        return;
    }

-    parseData->data = data;
-    variablesVector.removeAllElements();
-/*    if (parseError != 0) {
-        parseError->code = 0;
+    direction = theDirection;
+    ruleCount = 0;
+
+    delete compoundFilter;
+    compoundFilter = NULL;
+
+    if (variablesVector == NULL) {
+        variablesVector = new UVector(status);
+    } else {
+        variablesVector->removeAllElements();
    }
-*/
-    determineVariableRange();
+    parseData = new ParseData(0, variablesVector);
+    if (parseData == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return;
+    }
+    parseData->data = data;
+    determineVariableRange(rules);

    UnicodeString str; // scratch
-    idBlockResult.truncate(0);
-    idSplitPointResult = -1;
+    idBlock.truncate(0);
+    idSplitPoint = -1;
    int32_t pos = 0;
    int32_t limit = rules.length();
    // The mode marks whether we are in the header ::id block, the
@ -924,6 +904,15 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
    // mode == 1: in rules: rule->1, ::id->2
    // mode == 2: in footer rule block: rule->ERROR, ::id->2
    int32_t mode = 0;
+
+    // The compound filter offset is an index into idBlock.
+    // If it is 0, then the compound filter occurred at the start,
+    // and it is the offset to the _start_ of the compound filter
+    // pattern.  Otherwise it is the offset to the _limit_ of the
+    // compound filter pattern within idBlock.
+    compoundFilter = NULL;
+    int32_t compoundFilterOffset = -1;
+
    while (pos < limit && U_SUCCESS(status)) {
        UChar c = rules.charAt(pos++);
        if (u_isWhitespace(c)) {
@ -954,25 +943,39 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
            int32_t p = pos;
            UBool sawDelim;
            UnicodeString regenID;
-            Transliterator::parseID(rules, regenID, p, sawDelim, direction,parseError, FALSE,status);
+            UnicodeSet* cpdFilter = NULL;
+            Transliterator::parseID(rules, regenID, p, sawDelim, cpdFilter, direction,parseError, FALSE,status);
            if (p == pos || !sawDelim) {
                // Invalid ::id
+                delete cpdFilter;
                syntaxError(U_ILLEGAL_ARGUMENT_ERROR, rules, pos);
            } else {
                if (mode == 1) {
                    mode = 2;
-                    idSplitPointResult = idBlockResult.length();
+                    idSplitPoint = idBlock.length();
+                }
+                if (cpdFilter != NULL) {
+                    if (compoundFilter != NULL) {
+                        syntaxError(U_MULTIPLE_COMPOUND_FILTERS, rules, pos);
+                    }
+                    compoundFilter = cpdFilter;
+                    if (idBlock.length() == 0) {
+                        compoundFilterOffset = 0;
+                    }
                }
                rules.extractBetween(pos, p, str);
-                idBlockResult.append(str);
+                idBlock.append(str);
                if (!sawDelim) {
-                    idBlockResult.append((UChar)0x003B /*;*/);
+                    idBlock.append((UChar)0x003B /*;*/);
+                }
+                if (cpdFilter != NULL && compoundFilterOffset < 0) {
+                    compoundFilterOffset = idBlock.length();
                }
                pos = p;
            }
        } else {
            // Parse a rule
-            pos = parseRule(pos, limit);
+            pos = parseRule(rules, pos, limit);
            if (U_SUCCESS(status)) {
                ++ruleCount;
                if (mode == 2) {
@ -988,7 +991,7 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
    }
    
    // Convert the set vector to an array
-    data->variablesLength = variablesVector.size();
+    data->variablesLength = variablesVector->size();
    data->variables = data->variablesLength == 0 ? 0 : new UnicodeMatcher*[data->variablesLength];
    // orphanElement removes the given element and shifts all other
    // elements down.  For performance (and code clarity) we work from
@ -997,14 +1000,29 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
    for (i=data->variablesLength; i>0; ) {
        --i;
        data->variables[i] =
-            (UnicodeSet*) variablesVector.orphanElementAt(i);
+            (UnicodeSet*) variablesVector->orphanElementAt(i);
    }

    // Index the rules
    if (U_SUCCESS(status)) {
+        if (compoundFilter != NULL) {
+            if ((direction == UTRANS_FORWARD &&
+                 compoundFilterOffset != 0) ||
+                (direction == UTRANS_REVERSE &&
+                 compoundFilterOffset != idBlock.length())) {
+                status = U_MISPLACED_COMPOUND_FILTER;
+            }
+        }        
+
        data->ruleSet.freeze(parseError,status);
-        if (idSplitPointResult < 0) {
-            idSplitPointResult = idBlockResult.length();
+
+        if (idSplitPoint < 0) {
+            idSplitPoint = idBlock.length();
+        }
+
+        if (ruleCount == 0) {
+            delete data;
+            data = NULL;
        }
    }
 }
@ -1022,11 +1040,10 @@ void TransliteratorParser::parseRules(UnicodeString& idBlockResult,
 * indicators.  Once it does a lexical breakdown of the rule at pos, it
 * creates a rule object and adds it to our rule list.
 */
-int32_t TransliteratorParser::parseRule(int32_t pos, int32_t limit) {
+int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos, int32_t limit) {
    // Locate the left side, operator, and right side
    int32_t start = pos;
    UChar op = 0;
-    const UnicodeString& rule = rules; // TEMPORARY: FIX LATER

    // Use pointers to automatics to make swapping possible.
    RuleHalf _left(*this), _right(*this);
@ -1188,41 +1205,26 @@ int32_t TransliteratorParser::parseRule(int32_t pos, int32_t limit) {
 int32_t TransliteratorParser::syntaxError(UErrorCode parseErrorCode,
                                               const UnicodeString& rule,
                                               int32_t pos) {
-   // if (parseError != 0) {
-   /*     parseError->line = 0; // We don't return a line #
-        parseError->offset = start; // Character offset from rule start
-        int32_t end = quotedIndexOf(rule, start, rule.length(), END_OF_RULE);
-        if (end < 0) {
-            end = rule.length();
-        }
-        int32_t len = uprv_min(end - start, U_PARSE_CONTEXT_LEN-1);
-        // Extract everything into the preContext and leave the postContext
-        // blank, since we don't have precise error position.
-        // TODO: Fix this.
-        rule.extract(start, len, parseError->preContext); // Current rule
-        parseError->preContext[len] = 0;
-        parseError->postContext[0] = 0;
-   */
-        parseError.offset = pos;
-        parseError.line = 0 ; /* we are not using line numbers */
+    parseError.offset = pos;
+    parseError.line = 0 ; /* we are not using line numbers */
    
-        // for pre-context
-        int32_t start = (pos <=U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
-        int32_t stop  = pos;
+    // for pre-context
+    int32_t start = (pos <=U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
+    int32_t stop  = pos;
    
-        rule.extract(start,stop-start,parseError.preContext);
-        //null terminate the buffer
-        parseError.preContext[stop-start] = 0;
+    rule.extract(start,stop-start,parseError.preContext);
+    //null terminate the buffer
+    parseError.preContext[stop-start] = 0;
    
-        //for post-context
-        start = pos+1;
-        stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rule.length() )? (pos+(U_PARSE_CONTEXT_LEN-1)) : 
-                                                                rule.length();
+    //for post-context
+    start = pos+1;
+    stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rule.length() )? (pos+(U_PARSE_CONTEXT_LEN-1)) : 
+        rule.length();
+    
+    rule.extract(start,stop-start,parseError.postContext);
+    //null terminate the buffer
+    parseError.postContext[stop-start]= 0;

-        rule.extract(start,stop-start,parseError.postContext);
-        //null terminate the buffer
-        parseError.postContext[stop-start]= 0;
-   // }
    status = (UErrorCode)parseErrorCode;
    return pos;

@ -1251,7 +1253,7 @@ UChar TransliteratorParser::generateStandInFor(UnicodeMatcher* adopted) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
-    variablesVector.addElement(adopted, status);
+    variablesVector->addElement(adopted, status);
    return variableNext++;
 }

@ -1306,7 +1308,7 @@ UChar TransliteratorParser::getSegmentStandin(int32_t r) {
 * When done, everything not in the hash is available for use.  In practice,
 * this method may employ some other algorithm for improved speed.
 */
-void TransliteratorParser::determineVariableRange(void) {
+void TransliteratorParser::determineVariableRange(const UnicodeString& rules) {
    UnicodeRange privateUse(0xE000, 0x1900); // Private use area

    UnicodeRange* r = privateUse.largestUnusedSubrange(rules, status);
--- a/icu4c/source/i18n/rbt_pars.h
+++ b/icu4c/source/i18n/rbt_pars.h
@ -9,7 +9,6 @@
 #define RBT_PARS_H

 #include "unicode/rbt.h"
-#include "uvector.h"
 #include "unicode/parseerr.h"

 U_NAMESPACE_BEGIN
@ -19,19 +18,49 @@ class UnicodeMatcher;
 class ParseData;
 class RuleHalf;
 class ParsePosition;
+class UVector;

 class TransliteratorParser {

+ public:
+
    /**
-     * This is a reference to external data we don't own.  This works because
-     * we only hold this for the duration of the call to parse().
+     * PUBLIC data member containing the parsed data object, or null if
+     * there were no rules.
     */
-    const UnicodeString& rules;
+    TransliterationRuleData* data;
+
+    /**
+     * PUBLIC data member.
+     * The block of ::IDs, both at the top and at the bottom.
+     * Inserted into these may be additional rules at the
+     * idSplitPoint.
+     */
+    UnicodeString idBlock;
+
+    /**
+     * PUBLIC data member.
+     * In a compound RBT, the index at which the RBT rules are
+     * inserted into the ID block.  Index 0 means before any IDs
+     * in the block.  Index idBlock.length() means after all IDs
+     * in the block.  Index is a string index.
+     */
+    int32_t idSplitPoint;
+
+    /**
+     * PUBLIC data member containing the parsed compound filter, if any.
+     */
+    UnicodeSet* compoundFilter;
+
+ private:
+
+    // The number of rules parsed.  This tells us if there were
+    // any actual transliterator rules, or if there were just ::ID
+    // block IDs.
+    int32_t ruleCount;

    UTransDirection direction;

-    TransliterationRuleData* data;
-
    /**
     * We use a single error code during parsing.  Rather than pass it
     * through each API, we keep it here.
@ -39,10 +68,9 @@ class TransliteratorParser {
    UErrorCode status;

    /**
-     * Pointer to user structure in which to return parse error information.
-     * May be NULL.
+     * Parse error information.
     */
-    UParseError& parseError;
+    UParseError parseError;

    /**
     * Temporary symbol table used during parsing.
@ -54,7 +82,7 @@ class TransliteratorParser {
     * is copied into the array data.variables.  As with data.variables,
     * element 0 corresponds to character data.variablesBase.
     */
-    UVector variablesVector;
+    UVector* variablesVector;

    /**
     * The next available stand-in for variables.  This starts at some point in
@ -82,44 +110,10 @@ class TransliteratorParser {

 public:

-    static TransliterationRuleData*
-        parse(const UnicodeString& rules,
-              UTransDirection direction,
-              UParseError& parseError,
-              UErrorCode& ec);
-
    /**
-     * Parse a given set of rules.  Return up to three pieces of
-     * parsed data.  These are the header ::id block, the rule block,
-     * and the footer ::id block.  Any or all of these may be empty.
-     * If the ::id blocks are empty, their corresponding parameters
-     * are returned as the empty string.  If there are no rules, the
-     * TransliterationRuleData result is 0.
-     * @param ruleDataResult caller owns the pointer stored here.
-     * May be NULL.
-     * @param headerRule string including semicolons for the header
-     * ::id block.  May be empty.
-     * @param footerRule string including semicolons for the footer
-     * ::id block.  May be empty.
+     * Constructor.
     */
-    static void parse(const UnicodeString& rules,
-                      UTransDirection direction,
-                      TransliterationRuleData*& ruleDataResult,
-                      UnicodeString& idBlockResult,
-                      int32_t& idSplitPointResult,
-                      UParseError& parseError,
-                      UErrorCode& ec);
-
-private:
-
-    /**
-     * @param rules list of rules, separated by newline characters
-     * @exception IllegalArgumentException if there is a syntax error in the
-     * rules
-     */
-    TransliteratorParser(const UnicodeString& rules,
-                              UTransDirection direction,
-                              UParseError& parseError);
+    TransliteratorParser();

    /**
     * Destructor.
@ -130,12 +124,32 @@ private:
     * Parse the given string as a sequence of rules, separated by newline
     * characters ('\n'), and cause this object to implement those rules.  Any
     * previous rules are discarded.  Typically this method is called exactly
-     * once, during construction.
-     * @exception IllegalArgumentException if there is a syntax error in the
-     * rules
+     * once after construction.
+     *
+     * Parse the given rules, in the given direction.  After this call
+     * returns, query the public data members for results.  The parser
+     * keeps ownership of 'data' and 'compoundFilter' until the caller
+     * takes them with orphanData() or orphanCompoundFilter(); whatever
+     * is not orphaned is deleted by the destructor.
     */
-    void parseRules(UnicodeString& idBlockResult, int32_t& idSplitPointResult,
-                    int32_t& ruleCount);
+    void parse(const UnicodeString& rules,
+               UTransDirection direction,
+               UParseError& pe,
+               UErrorCode& ec);
+
+    /**
+     * Return the compound filter parsed by parse().  Caller owns result.
+     */ 
+    UnicodeSet* orphanCompoundFilter();
+
+    /**
+     * Return the data object parsed by parse().  Caller owns result.
+     */
+    TransliterationRuleData* orphanData();
+
+private:
+
+    void parseRules(const UnicodeString& rules,
+                    UTransDirection direction);

    /**
     * MAIN PARSER.  Parse the next rule in the given rule string, starting
@ -150,7 +164,7 @@ private:
     * indicators.  Once it does a lexical breakdown of the rule at pos, it
     * creates a rule object and adds it to our rule list.
     */
-    int32_t parseRule(int32_t pos, int32_t limit);
+    int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit);

    /**
     * Called by main parser upon syntax error.  Search the rule string
@ -198,7 +212,7 @@ private:
     * When done, everything not in the hash is available for use.  In practice,
     * this method may employ some other algorithm for improved speed.
     */
-    void determineVariableRange(void);
+    void determineVariableRange(const UnicodeString&);

    /**
     * Returns the index of a character, ignoring quoted text.
--- a/icu4c/source/i18n/translit.cpp
+++ b/icu4c/source/i18n/translit.cpp
@ -43,6 +43,7 @@ static const UChar ID_DELIM    = 0x003B; /*;*/
 static const UChar VARIANT_SEP = 0x002F; // '/'
 static const UChar OPEN_PAREN  = 40;
 static const UChar CLOSE_PAREN = 41;
+
 /**
 * Prefix for resource bundle key for the display name for a
 * transliterator.  The ID is appended to this to form the key.
@ -688,8 +689,9 @@ Transliterator* Transliterator::createInstance(const UnicodeString& ID,
    UVector list(status);
    int32_t ignored;
    UnicodeString regenID;
+    UnicodeSet* compoundFilter = 0;
    parseCompoundID(ID, regenID, dir, idSplitPoint, adoptedSplitTrans,
-                    list, ignored, parseError, status);
+                    list, ignored, compoundFilter, parseError, status);

    if (U_FAILURE(status)) {
        return 0;
@ -708,6 +710,9 @@ Transliterator* Transliterator::createInstance(const UnicodeString& ID,
        break;
    }
    t->setID(regenID);
+    if (compoundFilter != NULL) {
+        t->adoptFilter(compoundFilter);
+    }
    return t;
 }

@ -724,52 +729,52 @@ Transliterator* Transliterator::createFromRules(const UnicodeString& ID,
                                                UTransDirection dir,
                                                UParseError& parseError,
                                                UErrorCode& status) {
-    UnicodeString idBlock;
-    int32_t idSplitPoint = -1;
-    TransliterationRuleData *data = 0;
+    Transliterator* t = NULL;

-    TransliteratorParser::parse(rules, dir, data,
-                                idBlock, idSplitPoint,
-                                parseError, status);
+    TransliteratorParser parser;
+    parser.parse(rules, dir, parseError, status);

    if (U_FAILURE(status)) {
-        delete data;
        return 0;
    }

    // NOTE: The logic here matches that in TransliteratorRegistry.
-    if (idBlock.length() == 0) {
-        if (data == 0) {
+    if (parser.idBlock.length() == 0) {
+        if (parser.data == NULL) {
            // No idBlock, no data -- this is just an
            // alias for Null
-            return new NullTransliterator();
+            t = new NullTransliterator();
        } else {
            // No idBlock, data != 0 -- this is an
            // ordinary RBT_DATA.
-            return new RuleBasedTransliterator(ID, data, TRUE); // TRUE == adopt data object
+            t = new RuleBasedTransliterator(ID, parser.orphanData(), TRUE); // TRUE == adopt data object
        }
    } else {
-        if (data == 0) {
+        if (parser.data == NULL) {
            // idBlock, no data -- this is an alias
-            Transliterator *t = createInstance(idBlock, dir, parseError,status);
-            if (t != 0) {
+            t = createInstance(parser.idBlock, dir, parseError, status);
+            if (t != NULL) {
                t->setID(ID);
            }
-            return t;
        } else {
            // idBlock and data -- this is a compound
            // RBT
            UnicodeString id("_", "");
-            Transliterator *t = new RuleBasedTransliterator(id, data, TRUE); // TRUE == adopt data object
-            t = new CompoundTransliterator(ID, idBlock, idSplitPoint,
-                                           t,parseError,status);
+            t = new RuleBasedTransliterator(id, parser.orphanData(), TRUE); // TRUE == adopt data object
+            t = new CompoundTransliterator(ID, parser.idBlock, parser.idSplitPoint,
+                                           t, parseError, status);
            if (U_FAILURE(status)) {
                delete t;
                t = 0;
            }
+            if (parser.compoundFilter != NULL) {
+                t->adoptFilter(parser.orphanCompoundFilter());
+            }
            return t;
        }
    }
+
+    return t;
 }

 UnicodeString& Transliterator::toRules(UnicodeString& rulesSource,
@ -806,6 +811,7 @@ void Transliterator::parseCompoundID(const UnicodeString& id,
                                     Transliterator *adoptedSplitTrans,
                                     UVector& result,
                                     int32_t& splitTransIndex,
+                                     UnicodeSet*& compoundFilter,
                                     UParseError& parseError,
                                     UErrorCode& status) {
    if (U_FAILURE(status)) {
@ -816,6 +822,15 @@ void Transliterator::parseCompoundID(const UnicodeString& id,
    splitTransIndex = -1;
    int32_t pos = 0;
    int32_t i;
+
+    // A compound filter is a filter on an entire compound
+    // transliterator.  It is indicated by the syntax [abc]; A-B;
+    // B-C or in the reverse direction A-B; B-C; ([abc]).  We
+    // record the filter and its index (in terms of the result
+    // vector).
+    compoundFilter = NULL;
+    int32_t compoundFilterIndex = -1;
+    
    while (pos < id.length()) {
        // We compare (pos >= split), not (pos == split), so we can
        // skip over whitespace (see below).
@ -826,13 +841,25 @@ void Transliterator::parseCompoundID(const UnicodeString& id,
        }
        int32_t p = pos;
        UBool sawDelimiter; // We ignore this
+        UnicodeSet* cpdFilter = NULL;
        Transliterator *t =
-            parseID(id, regenID, p, sawDelimiter, dir, parseError, TRUE,status);
+            parseID(id, regenID, p, sawDelimiter, cpdFilter, dir, parseError, TRUE,status);
        
        if(U_FAILURE(status)){
            delete t;
+            delete cpdFilter;
            break;
        }
+        if (cpdFilter != NULL) {
+            if (compoundFilter != NULL) {
+                status = U_MULTIPLE_COMPOUND_FILTERS;
+                delete t;
+                delete cpdFilter;
+                break;
+            }
+            compoundFilter = cpdFilter;
+            compoundFilterIndex = result.size();
+        }

        if (p == pos || (p < id.length() && !sawDelimiter)) {
            delete t;
@ -848,18 +875,28 @@ void Transliterator::parseCompoundID(const UnicodeString& id,
    }

    // Handle case of idSplitPoint == id.length()
-    if (pos >= idSplitPoint && adoptedSplitTrans != 0) {
+    if (U_SUCCESS(status) && pos >= idSplitPoint && adoptedSplitTrans != 0) {
        splitTransIndex = result.size();
        result.addElement(adoptedSplitTrans, status);
        adoptedSplitTrans = 0;
    }

+    // Check validity of compound filter position
+    if (compoundFilter != NULL) {
+        if ((dir == UTRANS_FORWARD && compoundFilterIndex != 0) ||
+            (dir == UTRANS_REVERSE && compoundFilterIndex != result.size())) {
+            status = U_MISPLACED_COMPOUND_FILTER;
+        }
+    }
+
    if (U_FAILURE(status)) {
        for (i=0; i<result.size(); ++i) {
            delete (Transliterator*)result.elementAt(i);
        }
        result.removeAllElements();
        delete adoptedSplitTrans;
+        delete compoundFilter;
+        compoundFilter = NULL;
    }
 }

@ -885,6 +922,9 @@ void Transliterator::parseCompoundID(const UnicodeString& id,
 * first character to parse.  On output, the position after the last
 * character parsed.  This will be a semicolon or ID.length().  In the
 * case of an error this value will be unchanged.
+ * @param compoundFilter OUTPUT parameter to receive a compound
+ * filter, if one is parsed.  When a non-null compound filter is
+ * returned then a null Transliterator pointer is returned.
 * @param create if TRUE, create and return the result.  If FALSE,
 * only scan the ID, and return NULL.
 * @return a newly created transliterator, or NULL.  NULL is returned
@ -898,6 +938,7 @@ Transliterator* Transliterator::parseID(const UnicodeString& ID,
                                        UnicodeString& regenID,
                                        int32_t& pos,
                                        UBool& sawDelimiter,
+                                        UnicodeSet*& compoundFilter,
                                        UTransDirection dir,
                                        UParseError& parseError,
                                        UBool create,
@ -907,19 +948,22 @@ Transliterator* Transliterator::parseID(const UnicodeString& ID,
        idStart, idLimit,
        setStart, setLimit;

+    UnicodeSet* fwdFilter = NULL;
+    UnicodeSet* revFilter = NULL;
    UnicodeSet* filter = 0;

    if (!parseIDBounds(ID, pos, FALSE, limit,
-                       setStart, setLimit, revStart, filter)) {
-        delete filter;
+                       setStart, setLimit, revStart, fwdFilter)) {
+        delete fwdFilter;
        return 0;
    }
+    filter = fwdFilter;
+
    idStart = pos;
    idLimit = limit;

    if (revStart >= 0 && revStart < limit) {
        int32_t revSetStart, revSetLimit, dummy;
-        UnicodeSet* revFilter = 0;
        if (!parseIDBounds(ID, revStart+1, TRUE, revLimit,
                           revSetStart, revSetLimit, dummy, revFilter)) {
            delete filter;
@ -981,83 +1025,103 @@ Transliterator* Transliterator::parseID(const UnicodeString& ID,
        }
    }

-    // Fix the id, if necessary, by reversing it (A-B => B-A).  This
-    // is only done if the id is NOT of the form Foo(Bar).  Record the
-    // position of the separator.
-    // 
-    // For both A-B and Foo(Bar) ids, detect the special case of Null,
-    // whose inverse is itself.  Given an ID with no separator "Foo",
-    // an abbreviation for "Any-Foo", consider the inverse to be
-    // "Foo-Any".
-    int32_t sep = id.indexOf(ID_SEP);
-    if (sep < 0 && id.caseCompare(NullTransliterator::SHORT_ID,
-                                  U_FOLD_CASE_DEFAULT) == 0) {
-        // Handle "Null"
-        sep = id.length();
-    } else if (dir == UTRANS_REVERSE &&
-               id.caseCompare(NullTransliterator::ID,
-                              U_FOLD_CASE_DEFAULT) == 0) {
-        // Reverse of "Any-Null" => "Null"
-        id.removeBetween(0, sep+1);
-        sep = id.length();
-    } else if (dir == UTRANS_REVERSE && revStart < 0) {
-        if (sep >= 0) {
-            id.extractBetween(0, sep, str);
-            id.removeBetween(0, sep+1);
+    Transliterator* t = NULL;
+    int32_t sep = 0; // index of the separator ('-') in id
+
+    // If id is empty, then we have either an empty specifier,
+    // which is illegal, or a compound filter, which is legal
+    // as long as it's in the right place -- we let the caller
+    // decide that.
+    UBool isCompoundFilter = (id.length() == 0 && filter != NULL);
+    if (isCompoundFilter) {
+        if (dir == UTRANS_FORWARD) {
+            compoundFilter = fwdFilter;
+            delete revFilter;
+            revFilter = NULL;
        } else {
-            str = UnicodeString("Any", "");
+            compoundFilter = revFilter;
+            delete fwdFilter;
+            fwdFilter = NULL;
        }
-        sep = id.length();
-        id.append(ID_SEP).append(str);
-    } else if (sep < 0 && id.length() > 0) {
-        // Don't do anything for empty IDs -- we handle these specially below
-        str = UnicodeString("Any-", "");
-        sep = str.length() - 1;
-        id.insert(0, str);
    }
-
-    Transliterator *t = 0;
-
-    // If we have a reverse part of the ID, e.g., Foo(Bar), then we
-    // need to check for an empty part, which represents a Null
-    // transliterator.  We return 0 (not a NullTransliterator).  If we
-    // are not of the form Foo(Bar) then an empty string is illegal.
-    if (revStart >= 0 && id.length() == 0) {
-        // Ignore any filters; filters on Null are meaningless (and we
-        // can't attach them to 0 anyway)
-        delete filter;
-    }
-
+    
    else {
-        // Create the actual transliterator from the registry
-        if (registry == 0) {
-            initializeRegistry();
-        }
-        parseError.line = parseError.offset = 0;
-        parseError.preContext[0] = parseError.postContext[0] = 0;
-        TransliteratorAlias* alias = 0;
-        {
-            Mutex lock(&registryMutex);
-            t = registry->get(id, alias, parseError,status);
-            // Need to enclose this in a block to prevent deadlock when
-            // instantiating aliases (below).
-        }
-        
-        if (alias != 0) {
-            // assert(t==0);
-            // Instantiate an alias
-            t = alias->create(parseError, status);
-            delete alias;
+        // Fix the id, if necessary, by reversing it (A-B => B-A).  This
+        // is only done if the id is NOT of the form Foo(Bar).  Record the
+        // position of the separator.
+        // 
+        // For both A-B and Foo(Bar) ids, detect the special case of Null,
+        // whose inverse is itself.  Given an ID with no separator "Foo",
+        // an abbreviation for "Any-Foo", consider the inverse to be
+        // "Foo-Any".
+        sep = id.indexOf(ID_SEP);
+        if (sep < 0 && id.caseCompare(NullTransliterator::SHORT_ID,
+                                      U_FOLD_CASE_DEFAULT) == 0) {
+            // Handle "Null"
+            sep = id.length();
+        } else if (dir == UTRANS_REVERSE &&
+                   id.caseCompare(NullTransliterator::ID,
+                                  U_FOLD_CASE_DEFAULT) == 0) {
+            // Reverse of "Any-Null" => "Null"
+            id.removeBetween(0, sep+1);
+            sep = id.length();
+        } else if (dir == UTRANS_REVERSE && revStart < 0) {
+            if (sep >= 0) {
+                id.extractBetween(0, sep, str);
+                id.removeBetween(0, sep+1);
+            } else {
+                str = UnicodeString("Any", "");
+            }
+            sep = id.length();
+            id.append(ID_SEP).append(str);
+        } else if (sep < 0 && id.length() > 0) {
+            // Don't do anything for empty IDs -- we handle these specially below
+            str = UnicodeString("Any-", "");
+            sep = str.length() - 1;
+            id.insert(0, str);
        }

-        if (t == 0) {
-            // Creation failed; the ID is invalid
+        // If we have a reverse part of the ID, e.g., Foo(Bar), then we
+        // need to check for an empty part, which represents a Null
+        // transliterator.  We return 0 (not a NullTransliterator).  If we
+        // are not of the form Foo(Bar) then an empty string is illegal.
+        if (revStart >= 0 && id.length() == 0) {
+            // Ignore any filters; filters on Null are meaningless (and we
+            // can't attach them to 0 anyway)
            delete filter;
-            return 0;
        }

-        // Set the filter, if any
-        t->adoptFilter(filter);
+        else {
+            // Create the actual transliterator from the registry
+            if (registry == 0) {
+                initializeRegistry();
+            }
+            parseError.line = parseError.offset = 0;
+            parseError.preContext[0] = parseError.postContext[0] = 0;
+            TransliteratorAlias* alias = 0;
+            {
+                Mutex lock(&registryMutex);
+                t = registry->get(id, alias, parseError,status);
+                // Need to enclose this in a block to prevent deadlock when
+                // instantiating aliases (below).
+            }
+
+            if (alias != 0) {
+                // assert(t==0);
+                // Instantiate an alias
+                t = alias->create(parseError, status);
+                delete alias;
+            }
+
+            if (t == 0) {
+                // Creation failed; the ID is invalid
+                delete filter;
+                return 0;
+            }
+
+            // Set the filter, if any
+            t->adoptFilter(filter);
+        }
    }
    
    // Set the ID.  This is normally just a substring of the input
--- a/icu4c/source/i18n/transreg.cpp
+++ b/icu4c/source/i18n/transreg.cpp
@ -16,6 +16,7 @@
 #include "unicode/rbt.h"
 #include "unicode/resbund.h"
 #include "unicode/translit.h"
+#include "unicode/uniset.h"
 #include "unicode/uscript.h"

 // UChar constants
@ -77,11 +78,13 @@ TransliteratorAlias::TransliteratorAlias(const UnicodeString& theAliasID) :
 TransliteratorAlias::TransliteratorAlias(const UnicodeString& theID,
                                         const UnicodeString& idBlock,
                                         Transliterator* adopted,
-                                         int32_t theIDSplitPoint) :
+                                         int32_t theIDSplitPoint,
+                                         const UnicodeSet* cpdFilter) :
    ID(theID),
    aliasID(idBlock),
    trans(adopted),
-    idSplitPoint(theIDSplitPoint) {
+    idSplitPoint(theIDSplitPoint),
+    compoundFilter(cpdFilter) {
 }

 TransliteratorAlias::~TransliteratorAlias() {
@ -90,16 +93,19 @@ TransliteratorAlias::~TransliteratorAlias() {


 Transliterator* TransliteratorAlias::create(UParseError& pe,
-                                                      UErrorCode& ec) {
+                                            UErrorCode& ec) {
+    Transliterator *t;
    if (trans == 0) {
-        return Transliterator::createInstance(aliasID, UTRANS_FORWARD, pe, ec);
+        t = Transliterator::createInstance(aliasID, UTRANS_FORWARD, pe, ec);
    } else {
-        Transliterator *t = trans;
+        t = new CompoundTransliterator(ID, aliasID, idSplitPoint,
+                                       trans, pe, ec);
        trans = 0; // so we don't delete it later
-        return new CompoundTransliterator(ID, aliasID, idSplitPoint,
-                                          t, pe, ec);
-
+        if (compoundFilter) {
+            t->adoptFilter((UnicodeSet*) compoundFilter->clone());
+        }
    }
+    return t;
 }

 //----------------------------------------------------------------------
@ -277,6 +283,7 @@ public:
    // it has a copy constructor
    UnicodeString stringArg; // For RULES_*, ALIAS, COMPOUND_RBT
    int32_t intArg; // For COMPOUND_RBT
+    UnicodeSet* compoundFilter; // For COMPOUND_RBT
    union {
        Transliterator* prototype; // For PROTOTYPE
        TransliterationRuleData* data; // For RBT_DATA, COMPOUND_RBT
@ -290,6 +297,7 @@ public:

 Entry::Entry() {
    u.prototype = 0;
+    compoundFilter = NULL;
    entryType = NONE;
 }

@ -303,6 +311,7 @@ Entry::~Entry() {
        // invalidates any RBTs that the user has instantiated.
        delete u.data;
    }
+    delete compoundFilter;
 }

 void Entry::adoptPrototype(Transliterator* adopted) {
@ -906,7 +915,7 @@ Transliterator* TransliteratorRegistry::instantiateEntry(const UnicodeString& ID
        } else if (entry->entryType == Entry::COMPOUND_RBT) {
            UnicodeString id("_", "");
            Transliterator *t = new RuleBasedTransliterator(id, entry->u.data);
-            aliasReturn = new TransliteratorAlias(ID, entry->stringArg, t, entry->intArg);
+            aliasReturn = new TransliteratorAlias(ID, entry->stringArg, t, entry->intArg, entry->compoundFilter);
            return 0;
        }

@ -935,13 +944,9 @@ Transliterator* TransliteratorRegistry::instantiateEntry(const UnicodeString& ID
        // transliterators; if it lists something that's not
        // installed, we'll get an error from ResourceBundle.

-        TransliteratorParser::parse(rules, isReverse ?
-                                    UTRANS_REVERSE : UTRANS_FORWARD,
-                                    entry->u.data,
-                                    entry->stringArg,
-                                    entry->intArg,
-                                    parseError,
-                                    status);
+        TransliteratorParser parser;
+        parser.parse(rules, isReverse ? UTRANS_REVERSE : UTRANS_FORWARD,
+                     parseError, status);

        if (U_FAILURE(status)) {
            // We have a failure of some kind.  Remove the ID from the
@ -954,6 +959,11 @@ Transliterator* TransliteratorRegistry::instantiateEntry(const UnicodeString& ID
            break;
        }

+        entry->u.data = parser.orphanData();
+        entry->stringArg = parser.idBlock;
+        entry->intArg = parser.idSplitPoint;
+        entry->compoundFilter = parser.orphanCompoundFilter();
+
        // Reset entry->entryType to something that we process at the
        // top of the loop, then loop back to the top.  As long as we
        // do this, we only loop through twice at most.
--- a/icu4c/source/i18n/transreg.h
+++ b/icu4c/source/i18n/transreg.h
@ -45,7 +45,8 @@ class TransliteratorAlias {
     * Construct a compound RBT alias.
     */
    TransliteratorAlias(const UnicodeString& ID, const UnicodeString& idBlock,
-                        Transliterator* adopted, int32_t idSplitPoint);
+                        Transliterator* adopted, int32_t idSplitPoint,
+                        const UnicodeSet* compoundFilter);

    ~TransliteratorAlias();
    
@ -64,10 +65,12 @@ class TransliteratorAlias {
    // 2. CompoundRBT
    //    Here ID is the ID, aliasID is the idBlock, trans is the
    //    contained RBT, and idSplitPoint is the offset in aliasID
-    //    where the contained RBT goes.
+    //    where the contained RBT goes.  compoundFilter is the
+    //    compound filter, and it is _not_ owned.
    UnicodeString ID;
    UnicodeString aliasID;
    Transliterator* trans; // owned
+    const UnicodeSet* compoundFilter; // alias
    int32_t idSplitPoint;
 };