ICU-1076 implement ? operator, remove 9 segment limit, fix toPattern

X-SVN-Rev: 5381
2025-04-08 23:10:40 +00:00 · 2001-07-30 23:23:51 +00:00 · 2001-07-30 23:23:51 +00:00 · 83e058fbe4
commit 83e058fbe4
parent f42a1b08d0
4 changed files with 439 additions and 138 deletions
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@ -47,6 +47,7 @@
 #define ANCHOR_START       ((UChar)0x005E) /*^*/
 #define KLEENE_STAR        ((UChar)0x002A) /***/
 #define ONE_OR_MORE        ((UChar)0x002B) /*+*/
+#define ZERO_OR_ONE        ((UChar)0x003F) /*?*/

 // By definition, the ANCHOR_END special character is a
 // trailing SymbolTable.SYMBOL_REF character.
@ -137,6 +138,204 @@ UnicodeString ParseData::parseReference(const UnicodeString& text,
    return result;
 }

+//----------------------------------------------------------------------
+// Segments
+//----------------------------------------------------------------------
+
+// Bookkeeping for segment parentheses.  Each addParenthesisAt() call
+// appends one entry to two parallel vectors: the offset that was
+// passed in, and a flag saying whether it was an open or a close
+// parenthesis.  Entry i of both vectors describes the same paren.
+class Segments {
+    // Offsets of the parentheses, in the order they were added.
+    // Each int32_t is stored packed inside a void* element.
+    UVector offsets;
+    // Parallel to offsets: 1 for an open paren, 0 for a close paren.
+    UVector isOpenParen;
+public:
+    Segments();
+    ~Segments();
+    // Append one parenthesis with the given offset and kind.
+    void addParenthesisAt(int32_t offset, UBool isOpenParen);
+    // Return the offset of the most recently added parenthesis and
+    // store its kind in isOpenParen.  Return -1, leaving isOpenParen
+    // untouched, if no parenthesis has been added.
+    int32_t getLastParenOffset(UBool& isOpenParen) const;
+    // Locate the open paren matching the last (close) paren, report
+    // the pair's offsets in start and limit, and rewrite the stored
+    // offsets from that open paren onward.  Return FALSE on failure.
+    UBool extractLastParenSubstring(int32_t& start, int32_t& limit);
+    // Return a new[]-allocated array encoding the segments.  Call only
+    // after validate() has returned TRUE.  NOTE(review): presumably
+    // the caller takes ownership of the array -- confirm.
+    int32_t* createArray() const;
+    // TRUE if there are at least two parens, an even number of them,
+    // the first one is an open paren, and opens and closes balance
+    // without the nesting depth ever going negative.
+    UBool validate() const;
+    int32_t count() const; // number of segments
+private:
+    // Offset recorded for the i-th parenthesis.
+    int32_t offset(int32_t i) const;
+    // TRUE if the i-th parenthesis is an open paren.
+    UBool isOpen(int32_t i) const;
+    int32_t size() const; // size of the UVectors
+};
+
+// Pack an int32_t into a void* (and unpack it again) so that plain
+// integers can be kept in a UVector.  A void* may be wider than 32
+// bits; it is only assumed to be at least 32 bits wide.  The integer
+// is written to, and later read back from, the leading bytes of the
+// pointer's own storage.
+inline void* _int32_to_voidPtr(int32_t x) {
+    void* holder = 0; // zero-filled first; may be wider than 32 bits
+    int32_t* slot = (int32_t*)&holder;
+    *slot = x;
+    return holder;
+}
+inline int32_t _voidPtr_to_int32(void* x) {
+    void* holder = x; // work on a local copy (portability)
+    const int32_t* slot = (const int32_t*)&holder;
+    return *slot;
+}
+
+// Offset recorded for the i-th parenthesis.
+int32_t Segments::offset(int32_t i) const {
+    void* packed = offsets.elementAt(i);
+    return _voidPtr_to_int32(packed);
+}
+
+// TRUE if the i-th parenthesis was recorded as an open paren.
+UBool Segments::isOpen(int32_t i) const {
+    int32_t flag = _voidPtr_to_int32(isOpenParen.elementAt(i));
+    return flag != 0;
+}
+
+// Number of parentheses recorded so far (not the number of segments).
+int32_t Segments::size() const {
+    // assert(offsets.size() == isOpenParen.size());
+    int32_t parenCount = offsets.size();
+    return parenCount;
+}
+
+// Both vectors are direct members, so there is nothing to set up or
+// tear down by hand.
+Segments::Segments() {
+}
+
+Segments::~Segments() {
+}
+
+// Append one parenthesis: its offset goes into offsets and a 1 (open)
+// or 0 (close) goes into the parallel isOpenParen vector.
+void Segments::addParenthesisAt(int32_t offset, UBool isOpen) {
+    int32_t openFlag = isOpen ? 1 : 0;
+    offsets.addElement(_int32_to_voidPtr(offset));
+    isOpenParen.addElement(_int32_to_voidPtr(openFlag));
+}
+
+// Return the offset of the most recently added parenthesis and report
+// its kind through isOpenParen.  If nothing has been added, return -1
+// and leave isOpenParen untouched.
+int32_t Segments::getLastParenOffset(UBool& isOpenParen) const {
+    int32_t last = size() - 1;
+    if (last < 0) {
+        return -1;
+    }
+    isOpenParen = isOpen(last);
+    return offset(last);
+}
+
+// Remove the last (rightmost) segment.  Store its offsets in start
+// and limit, and then convert all offsets at or after start to be
+// equal to start (open parens) or start+1 (close parens).  Upon
+// failure, return FALSE.  The caller is expected to have called
+// getLastParenOffset() and checked that there is at least one
+// parenthesis and that the last one is a close paren; if that does
+// not hold, FALSE is returned.
+//
+// @param start receives the offset of the matching open paren
+// @param limit receives the offset of the last close paren
+// @return TRUE if a matching open paren was found
+UBool Segments::extractLastParenSubstring(int32_t& start, int32_t& limit) {
+    int32_t i = size() - 1;
+    if (i < 0 || isOpen(i)) {
+        // Nothing recorded, or the last paren is not a close paren.
+        return FALSE;
+    }
+    int32_t n = 1; // count of close parens we need to match
+    // Record position of the last close paren
+    limit = offset(i);
+    --i; // back up to the one before the last one
+    // Walk backwards until the nesting count returns to zero.  The
+    // index must move on every iteration; stop ON the matching open
+    // paren so that i still refers to it afterwards.
+    while (i >= 0) {
+        n += isOpen(i) ? -1 : 1;
+        if (n == 0) {
+            break;
+        }
+        --i;
+    }
+    if (n != 0) {
+        return FALSE;
+    }
+    // assert(i>=0);
+    start = offset(i);
+    // Reset all segment pairs from i to size() - 1 to [start, start+1).
+    for (; i<size(); ++i) {
+        int32_t o = isOpen(i) ? start : (start+1);
+        offsets.setElementAt(_int32_to_voidPtr(o), i);
+    }
+    return TRUE;
+}
+
+// Assume caller has already gotten a TRUE validate().
+//
+// Build and return the flat segments array described in the comment
+// below.  The array is allocated with new[] and is not freed here.
+int32_t* Segments::createArray() const {
+    /**
+     * >>> Duplicated in rbt_pars.cpp and rbt_rule.h <<<
+     *
+     * The segments array encodes information about parentheses-
+     * enclosed regions of the input string.  These are referenced in
+     * the output string using the notation $1, $2, etc.  Numbering is
+     * in order of appearance of the left parenthesis.  Number is
+     * one-based.  Segments are defined as start, limit pairs.
+     * Segments may nest.
+     *
+     * In order to avoid allocating two subobjects, the segments
+     * array actually comprises two arrays.  The first gives the
+     * index values of the open and close parentheses in the order
+     * they appear.  The second maps segment numbers to the indices of
+     * the first array.  The two arrays have the same length.
+     *
+     * Example:  (a b(c d)e f)
+     *            0 1 2 3 4 5 6
+     *
+     * First array: Indices are 0, 2, 4, and 6.
+     *
+     * Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
+     * second array is 0, 3, 1, 2 -- these give the indices in the
+     * first array at which $1:open, $1:close, $2:open, and $2:close
+     * occur.
+     *
+     * The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
+     *
+     * Each subarray is terminated with a -1, and two leading entries
+     * give the number of segments and the offset to the first entry
+     * of the second array.  In addition, the second array values are
+     * all offset by 2 so they index directly into the final array.
+     * The total array size is 4*segments[0] + 4.  The second index is
+     * 2*segments[0] + 3.
+     *
+     * In the output string, a segment reference is indicated by a
+     * character in a special range, as defined by
+     * RuleBasedTransliterator.Data.
+     *
+     * Most rules have no segments, in which case segments is null, and the
+     * output string need not be checked for segment reference characters.
+     */
+    int32_t c = count(); // number of segments
+    int32_t arrayLen = 4*c + 4;
+    int32_t *array = new int32_t[arrayLen];
+    int32_t a2offset = 2*c + 3; // offset to array 2
+    array[0] = c;
+    array[1] = a2offset;
+    // Array 1: the paren offsets, in order of appearance.
+    int32_t i;
+    for (i=0; i<2*c; ++i) {
+        array[2+i] = offset(i);
+    }
+    // Terminators for array 1 and array 2.
+    array[a2offset-1] = -1;
+    array[arrayLen-1] = -1;
+    // Now walk through and match up segment numbers with parentheses.
+    // Number segments from 0.  We're going to offset all entries by 2
+    // to skip the first two elements, array[0] and array[1].
+    UStack stack; // segment numbers of the currently unclosed opens
+    int32_t nextOpen = 0; // seg # of next open, 0-based
+    for (i=0; i<2*c; ++i) {
+        UBool open = isOpen(i);
+        // Let seg be the zero-based segment number.
+        // Open parens are at 2*seg in array 2.
+        // Close parens are at 2*seg+1 in array 2.
+        if (open) {
+            array[a2offset + 2*nextOpen] = 2+i;
+            stack.push(_int32_to_voidPtr(nextOpen));
+            ++nextOpen;
+        } else {
+            // A close paren ends the most recently opened segment.
+            int32_t nextClose = _voidPtr_to_int32(stack.pop());
+            array[a2offset + 2*nextClose+1] = 2+i;
+        }
+    }
+    // assert(stack.empty());
+    return array;
+}
+
+// Check that the recorded parentheses form well-nested pairs:
+//  - there are at least two of them,
+//  - there is an even number of them,
+//  - the first one is an open paren,
+//  - the nesting depth never drops below zero and ends at zero.
+UBool Segments::validate() const {
+    int32_t parenCount = size();
+    if (parenCount < 2) {
+        return FALSE;
+    }
+    if (parenCount % 2 != 0) {
+        return FALSE;
+    }
+    if (!isOpen(0)) {
+        return FALSE;
+    }
+    int32_t depth = 0;
+    for (int32_t k=0; k<parenCount; ++k) {
+        if (isOpen(k)) {
+            ++depth;
+        } else if (--depth < 0) {
+            // A close paren with no open paren left to match
+            return FALSE;
+        }
+    }
+    return depth == 0;
+}
+
+// Number of segments, i.e. half the number of recorded parentheses.
+// Assume caller has already gotten a TRUE validate().
+int32_t Segments::count() const {
+    // assert(validate());
+    int32_t parenCount = size();
+    return parenCount / 2;
+}
+
 //----------------------------------------------------------------------
 // BEGIN RuleHalf
 //----------------------------------------------------------------------
@ -159,8 +358,8 @@ public:
    // Record the position of the segment substrings and references.  A
    // given side should have segments or segment references, but not
    // both.
-    UVector* segments; // ref substring start,limits
-    int32_t maxRef;       // index of largest ref (1..9)
+    Segments* segments;
+    int32_t maxRef;       // index of largest ref ($n) on the right

    // Record the offset to the cursor either to the left or to the
    // right of the key.  This is indicated by characters on the output
@ -214,18 +413,6 @@ private:
    RuleHalf& operator=(const RuleHalf&);
 };

-// Store int32_t as a void* in a UVector.  DO NOT ASSUME sizeof(void*)
-// is 32.  Assume sizeof(void*) >= 32.
-inline void* _int32_to_voidPtr(int32_t x) {
-    void* a = 0; // May be > 32 bits
-    *(int32_t*)&a = x; // Careful here...
-    return a;
-}
-inline int32_t _voidPtr_to_int32(void* x) {
-    void* a = x; // Copy to stack (portability)
-    return *(int32_t*)&a; // Careful here...
-}
-
 const UnicodeString RuleHalf::gOperators = OPERATORS;

 RuleHalf::RuleHalf(TransliteratorParser& p) : parser(p) {
@ -335,14 +522,9 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
            // Handle segment definitions "(" and ")"
            // Parse "(", ")"
            if (segments == NULL) {
-                segments = new UVector();
+                segments = new Segments();
            }
-            if ((c == SEGMENT_OPEN) !=
-                (segments->size() % 2 == 0)) {
-                return syntaxError(RuleBasedTransliterator::MISMATCHED_SEGMENT_DELIMITERS,
-                                   rule, start);
-            }
-            segments->addElement(_int32_to_voidPtr(buf.length()));
+            segments->addParenthesisAt(buf.length(), c == SEGMENT_OPEN);
            break;
        case END_OF_RULE:
            --pos; // Backup to point to END_OF_RULE
@ -361,15 +543,28 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
                    anchorEnd = TRUE;
                    break;
                }
-                // Parse "$1" "$2" .. "$9"
+                // Parse "$1" "$2" .. "$9" .. (no upper limit)
                c = rule.charAt(pos);
-                int32_t r = Unicode::digit(c, 10);
+                int32_t r = u_charDigitValue(c);
                if (r >= 1 && r <= 9) {
+                    ++pos;
+                    for (;;) {
+                        c = rule.charAt(pos);
+                        int32_t d = u_charDigitValue(c);
+                        if (d < 0) {
+                            break;
+                        }
+                        if (r > 214748364 ||
+                            (r == 214748364 && d > 7)) {
+                            return syntaxError(RuleBasedTransliterator::UNDEFINED_SEGMENT_REFERENCE,
+                                               rule, start);
+                        }
+                        r = 10*r + d;
+                    }
                    if (r > maxRef) {
                        maxRef = r;
                    }
-                    buf.append(parser.data->getSegmentStandin(r));
-                    ++pos;
+                    buf.append(parser.getSegmentStandin(r));
                } else {
                    pp.setIndex(pos);
                    UnicodeString name = parser.parseData->
@ -444,6 +639,7 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
            break;
        case KLEENE_STAR:
        case ONE_OR_MORE:
+        case ZERO_OR_ONE:
            // Quantifiers.  We handle single characters, quoted strings,
            // variable references, and segments.
            //  a+      matches  aaa
@ -452,15 +648,18 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
            //  (seg)+  matches  segsegseg
            {
                int32_t start, limit;
+                UBool isOpenParen;
+                UBool isSegment = FALSE;
                if (segments != 0 &&
-                    segments->size() >= 2 &&
-                    segments->size() % 2 == 0 &&
-                    _voidPtr_to_int32(segments->elementAt(segments->size()-1)) == buf.length()) {
+                    segments->getLastParenOffset(isOpenParen) == buf.length()) {
                    // The */+ immediately follows a segment
-                    int32_t len = segments->size();
-                    start = _voidPtr_to_int32(segments->elementAt(len - 2));
-                    limit = _voidPtr_to_int32(segments->elementAt(len - 1));
-                    segments->setElementAt(_int32_to_voidPtr(start+1), len-1);
+                    if (isOpenParen) {
+                        return syntaxError(RuleBasedTransliterator::MISPLACED_QUANTIFIER, rule, start);
+                    }
+                    if (!segments->extractLastParenSubstring(start, limit)) {
+                        return syntaxError(RuleBasedTransliterator::MISMATCHED_SEGMENT_DELIMITERS, rule, start);
+                    }
+                    isSegment = TRUE;
                } else {
                    // The */+ follows an isolated character or quote
                    // or variable reference
@ -479,8 +678,21 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
                    }
                }
                UnicodeMatcher *m =
-                    new StringMatcher(buf, start, limit, *parser.data);
-                m = new Quantifier(m, (c == ONE_OR_MORE)?1:0, 0x7FFFFFFF);
+                    new StringMatcher(buf, start, limit, isSegment, *parser.data);
+                int32_t min = 0;
+                int32_t max = Quantifier::MAX;
+                switch (c) {
+                case ONE_OR_MORE:
+                    min = 1;
+                    break;
+                case ZERO_OR_ONE:
+                    min = 0;
+                    max = 1;
+                    break;
+                // case KLEENE_STAR:
+                //    do nothing -- min, max already set
+                }
+                m = new Quantifier(m, min, max);
                buf.truncate(start);
                buf.append(parser.generateStandInFor(m));
            }
@ -528,16 +740,7 @@ void RuleHalf::removeContext() {
 * Create and return an int32_t[] array of segments.
 */
 int32_t* RuleHalf::createSegments() const {
-    if (segments == NULL) {
-        return NULL;
-    }
-    int32_t len = segments->size();
-    int32_t* result = new int32_t[len + 1];
-    for (int32_t i=0; i<len; ++i) {
-        result[i] = _voidPtr_to_int32(segments->elementAt(i));
-    }
-    result[len] = -1; // end marker
-    return result;
+    return (segments == 0) ? 0 : segments->createArray();
 }

 //----------------------------------------------------------------------
@ -888,11 +1091,10 @@ int32_t TransliteratorParser::parseRule(int32_t pos, int32_t limit) {
    // segment's start must have a corresponding limit, and the
    // references must not refer to segments that do not exist.
    if (left->segments != NULL) {
-        int n = left->segments->size();
-        if (n % 2 != 0) {
+        if (!left->segments->validate()) {
            return syntaxError(RuleBasedTransliterator::MISSING_SEGMENT_CLOSE, rule, start);
        }
-        n /= 2;
+        int32_t n = left->segments->count();
        if (right->maxRef > n) {
            return syntaxError(RuleBasedTransliterator::UNDEFINED_SEGMENT_REFERENCE, rule, start);
        }
@ -999,6 +1201,18 @@ void TransliteratorParser::appendVariableDef(const UnicodeString& name,
    }
 }

+// Return the stand-in for segment reference r, as produced by
+// data->getSegmentStandin(r).  When r is larger than any reference
+// number seen so far, first record it in data->segmentCount and move
+// variableLimit to data->segmentBase - r + 1; if variableNext has
+// already reached that limit, set status to U_ILLEGAL_ARGUMENT_ERROR.
+UChar TransliteratorParser::getSegmentStandin(int32_t r) {
+    // assert(r>=1);
+    if (r <= data->segmentCount) {
+        // Not a new maximum; no bookkeeping needed
+        return data->getSegmentStandin(r);
+    }
+    data->segmentCount = r;
+    variableLimit = data->segmentBase - r + 1;
+    if (variableNext >= variableLimit) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+    }
+    return data->getSegmentStandin(r);
+}
+
 /**
 * Determines what part of the private use region of Unicode we can use for
 * variable stand-ins.  The correct way to do this is as follows: Parse each
@ -1015,9 +1229,11 @@ void TransliteratorParser::determineVariableRange(void) {
    data->variablesBase = variableNext = variableLimit = (UChar) 0;
    
    if (r != 0) {
-        // Allocate 9 characters for segment references 1 through 9
-        data->segmentBase = r->start;
-        data->variablesBase = variableNext = (UChar) (data->segmentBase + 9);
+        // Segment references work down; variables work up.  We don't
+        // know how many of each we will need.
+        data->segmentBase = (UChar) (r->start + r->length - 1);
+        data->segmentCount = 0;
+        data->variablesBase = variableNext = (UChar) r->start;
        variableLimit = (UChar) (r->start + r->length);
        delete r;
    }
--- a/icu4c/source/i18n/rbt_pars.h
+++ b/icu4c/source/i18n/rbt_pars.h
@ -182,6 +182,13 @@ private:
    void appendVariableDef(const UnicodeString& name,
                           UnicodeString& buf);

+    /**
+     * Return a stand-in character that refers to the given segments.
+     * @param r a reference number >= 1
+     * @return a stand-in for the given segment reference
+     */
+    UChar getSegmentStandin(int32_t r);
+
    /**
     * Determines what part of the private use region of Unicode we can use for
     * variable stand-ins.  The correct way to do this is as follows: Parse each
--- a/icu4c/source/i18n/rbt_rule.cpp
+++ b/icu4c/source/i18n/rbt_rule.cpp
@ -20,6 +20,17 @@ const UChar TransliterationRule::ETHER = 0xFFFF;
 static const UChar APOSTROPHE = 0x0027; // '
 static const UChar BACKSLASH  = 0x005C; // \

+// To process segments we need to allocate arrays of integers.  We use
+// stack storage as long as the segment count is <= MAX_STATIC_SEGS.
+// Otherwise, we allocate heap space.
+#define MAX_STATIC_SEGS 20
+
+// Accessors for the flat segments array.  They expand to expressions
+// on whatever variable named `segments` is in scope at the point of
+// use.  Layout: segments[0] is the segment count, segments[1] is the
+// index at which the second sub-array starts, the paren positions
+// start at index FIRST_SEG_POS_INDEX, and the whole array holds
+// 4*count + 4 entries (two sub-arrays of 2*count entries, two -1
+// terminators and the two leading entries).
+#define FIRST_SEG_POS_INDEX 2
+#define SEGMENTS_COUNT (segments[0])
+#define SEGMENTS_LEN (SEGMENTS_COUNT*4+4)
+#define SEGMENTS_POS(i) (segments[FIRST_SEG_POS_INDEX+(i)])
+#define SEGMENTS_NUM(i) (segments[segments[1]+(i)])
+
 /**
 * Construct a new rule with the given input, output text, and other
 * attributes.  A cursor position may be specified for the output text.
@ -102,12 +113,7 @@ TransliterationRule::TransliterationRule(TransliterationRule& other) :

    segments = 0;
    if (other.segments != 0) {
-        // Find the end marker, which is a -1.
-        int32_t len = 0;
-        while (other.segments[len] >= 0) {
-            ++len;
-        }
-        ++len;
+        int32_t len = SEGMENTS_LEN;
        segments = new int32_t[len];
        uprv_memcpy(segments, other.segments, len*sizeof(segments[0]));
    }
@ -167,10 +173,11 @@ void TransliterationRule::init(const UnicodeString& input,
    // code to back up by one to obtain the last ante context segment.
    firstKeySeg = -1;
    if (segments != 0) {
-        do {
+        firstKeySeg = FIRST_SEG_POS_INDEX;
+        while (segments[firstKeySeg] >= 0 &&
+               segments[firstKeySeg] < anteContextLength) {
            ++firstKeySeg;
-        } while (segments[firstKeySeg] >= 0 &&
-                 segments[firstKeySeg] < anteContextLength);
+        }
    }

    pattern = input;
@ -359,10 +366,17 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
    // In the future, the first two constraints may be lifted,
    // in which case this method will have to be modified.

-    int32_t segPos[18];
+    int32_t _segPos[2*MAX_STATIC_SEGS];
+    int32_t *segPos = _segPos;
+    if (segments != 0 && SEGMENTS_COUNT > MAX_STATIC_SEGS) {
+        segPos = new int32_t[2*SEGMENTS_COUNT];
+    }
    int32_t iSeg = firstKeySeg - 1;
    int32_t nextSegPos = (iSeg >= 0) ? segments[iSeg] : -1;

+    UMatchDegree m;
+    int32_t lenDelta, keyLimit;
+
    // ------------------------ Ante Context ------------------------

    // A mismatch in the ante context, or with the start anchor,
@ -387,13 +401,15 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
                keyChar == text.charAt(cursor)) {
                --cursor;
            } else {
-                return U_MISMATCH;
+                m = U_MISMATCH;
+                goto exit;
            }
        } else {
            // Subtract 1 from contextStart to make it a reverse limit
            if (matcher->matches(text, cursor, pos.contextStart-1, FALSE)
                != U_MATCH) {
-                return U_MISMATCH;
+                m = U_MISMATCH;
+                goto exit;
            }
        }
        if (cursorPos == (i - anteContextLength)) {
@ -407,41 +423,37 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
            } else {
                ++segPos[iSeg];
            }
-            nextSegPos = (--iSeg >= 0) ? segments[iSeg] : -1;
+            nextSegPos = (--iSeg >= FIRST_SEG_POS_INDEX) ? segments[iSeg] : -1;
        }
    }

    // ------------------------ Start Anchor ------------------------

    if ((flags & ANCHOR_START) && cursor != (pos.contextStart-1)) {
-        return U_MISMATCH;
+        m = U_MISMATCH;
+        goto exit;
    }

    // -------------------- Key and Post Context --------------------

-    // YUCKY OPTIMIZATION.  To make things a miniscule amount faster,
-    // subtract anteContextLength from all segments[i] with i >=
-    // firstKeySeg.  Then we don't have to do so here.  I only mention
-    // this here in order to say DO NOT DO THIS.  The gain is
-    // miniscule (how long does an integer subtraction take?) and the
-    // increase in confusion isn't worth it.
-
    iSeg = firstKeySeg;
-    nextSegPos = (iSeg >= 0) ? (segments[iSeg] - anteContextLength) : -1;
+    nextSegPos = (iSeg >= FIRST_SEG_POS_INDEX) ? (segments[iSeg] - anteContextLength) : -1;

    i = 0;
    cursor = pos.start;
-    int32_t keyLimit = 0;
+    keyLimit = 0;
    while (i < (pattern.length() - anteContextLength)) {
        if (incremental && cursor == pos.contextLimit) {
            // We've reached the context limit without a mismatch and
            // without completing our match.
-            return U_PARTIAL_MATCH;
+            m = U_PARTIAL_MATCH;
+            goto exit;
        }
        if (cursor == pos.limit && i < keyLength) {
            // We're still in the pattern key but we're entering the
            // post context.
-            return U_MISMATCH;
+            m = U_MISMATCH;
+            goto exit;
        }
        while (i == nextSegPos) {
            segPos[iSeg] = cursor;
@ -460,13 +472,13 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
                keyChar == text.charAt(cursor)) {
                ++cursor;
            } else {
-                return U_MISMATCH;
+                m = U_MISMATCH;
+                goto exit;
            }
        } else {
-            UMatchDegree m =
-                matcher->matches(text, cursor, pos.contextLimit, incremental);
+            m = matcher->matches(text, cursor, pos.contextLimit, incremental);
            if (m != U_MATCH) {
-                return m;
+                goto exit;
            }
        }
    }
@ -495,8 +507,6 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
    // keyLimit.  Segment indices have been recorded in segPos[].
    // Perform a replacement.

-    int32_t lenDelta = 0;
-
    if (segments == NULL) {
        text.handleReplaceBetween(pos.start, keyLimit, output);
        lenDelta = output.length() - (keyLimit - pos.start);
@ -534,8 +544,10 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
                }
                // Copy segment with out-of-band data
                b *= 2;
-                text.copy(segPos[b], segPos[b+1], dest);
-                dest += segPos[b+1] - segPos[b];
+                int32_t start = segPos[SEGMENTS_NUM(b)];
+                int32_t limit = segPos[SEGMENTS_NUM(b+1)];
+                text.copy(start, limit, dest);
+                dest += limit - start;
            }
            i += UTF_CHAR_LENGTH(c);
        }
@ -557,14 +569,21 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
    pos.limit += lenDelta;
    pos.contextLimit += lenDelta;
    pos.start = newStart;
+    m = U_MATCH;
    
-    return U_MATCH;
+  exit:
+    if (segPos != _segPos) {
+        delete[] segPos;
+    }
+    return m;
 }

 /**
- * Append a character to a rule that is being built up.
+ * Append a character to a rule that is being built up.  To flush
+ * the quoteBuf to rule, make one final call with isLiteral == TRUE.
+ * If there is no final character, pass in (UChar32)-1 as c.
 * @param rule the string to append the character to
- * @param c the character to append
+ * @param c the character to append, or (UChar32)-1 if none.
 * @param isLiteral if true, then the given character should not be
 * quoted or escaped.  Usually this means it is a syntactic element
 * such as > or $
@ -577,7 +596,7 @@ UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
 * cleared out by, at the end, calling this method with a literal
 * character.
 */
-void TransliterationRule::_appendToRule(UnicodeString& rule,
+void TransliterationRule::appendToRule(UnicodeString& rule,
                                        UChar32 c,
                                        UBool isLiteral,
                                        UBool escapeUnprintable,
@ -620,8 +639,10 @@ void TransliterationRule::_appendToRule(UnicodeString& rule,
                rule.append(BACKSLASH).append(APOSTROPHE);
            }
        }
-        if (!escapeUnprintable || !UnicodeSet::_escapeUnprintable(rule, c)) {
-            rule.append(c);
+        if (c != (UChar32)-1) {
+            if (!escapeUnprintable || !UnicodeSet::_escapeUnprintable(rule, c)) {
+                rule.append(c);
+            }
        }
    }

@ -635,12 +656,12 @@ void TransliterationRule::_appendToRule(UnicodeString& rule,
    // Specials (printable ascii that isn't [0-9a-zA-Z]) and
    // whitespace need quoting.  Also append stuff to quotes if we are
    // building up a quoted substring already.
-    else if ((c >= 0x0021 && c <= 0x007E &&
+    else if (quoteBuf.length() > 0 ||
+             (c >= 0x0021 && c <= 0x007E &&
              !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
                (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
                (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
-             Unicode::isWhitespace(c) ||
-             quoteBuf.length() > 0) {
+             Unicode::isWhitespace(c)) {
        quoteBuf.append(c);
        // Double ' within a quote
        if (c == APOSTROPHE) {
@ -654,16 +675,19 @@ void TransliterationRule::_appendToRule(UnicodeString& rule,
    }
 }

-void TransliterationRule::_appendToRule(UnicodeString& rule,
+void TransliterationRule::appendToRule(UnicodeString& rule,
                                        const UnicodeString& text,
                                        UBool isLiteral,
                                        UBool escapeUnprintable,
                                        UnicodeString& quoteBuf) {
    for (int32_t i=0; i<text.length(); ++i) {
-        _appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf);
+        appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf);
    }
 }

+// Powers of ten: POW10[p] is 10 to the power p, for p = 0..9.
+static const int32_t POW10[] = {
+    1,
+    10,
+    100,
+    1000,
+    10000,
+    100000,
+    1000000,
+    10000000,
+    100000000,
+    1000000000
+};
+
 /**
 * Create a source string that represents this rule.  Append it to the
 * given string.
@ -672,10 +696,20 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
                                           UBool escapeUnprintable) const {
    int32_t i;

-    int32_t iseg = 0;
+    int32_t iseg = FIRST_SEG_POS_INDEX-1;
    int32_t nextSeg = -1;
+    // Build an array of booleans specifying open vs. close paren
+    UBool _isOpen[2*MAX_STATIC_SEGS];
+    UBool *isOpen = _isOpen;
    if (segments != 0) {
-        nextSeg = segments[iseg++];
+        if (SEGMENTS_COUNT > MAX_STATIC_SEGS) {
+            isOpen = new UBool[2*SEGMENTS_COUNT];
+        }
+        for (i=0; i<2*SEGMENTS_COUNT; i+=2) {
+            isOpen[SEGMENTS_NUM(i)] = TRUE;
+            isOpen[SEGMENTS_NUM(i+1)] = FALSE;
+        }
+        nextSeg = segments[++iseg];
    }

    // Accumulate special characters (and non-specials following them)
@ -691,41 +725,41 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
    // Emit the input pattern
    for (i=0; i<pattern.length(); ++i) {
        if (emitBraces && i == anteContextLength) {
-            _appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf);
+            appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf);
        }

        // Append either '(' or ')' if we are at a segment index
        if (i == nextSeg) {
-            _appendToRule(rule, ((iseg % 2) == 0) ?
-                             (UChar)0x0029 : (UChar)0x0028,
+            appendToRule(rule, isOpen[iseg-FIRST_SEG_POS_INDEX] ?
+                             (UChar)0x0028 : (UChar)0x0029,
                             TRUE, escapeUnprintable, quoteBuf);
-            nextSeg = segments[iseg++];
+            nextSeg = segments[++iseg];
        }

        if (emitBraces && i == (anteContextLength + keyLength)) {
-            _appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
+            appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
        }

        UChar c = pattern.charAt(i);
        const UnicodeMatcher *matcher = data.lookup(c);
        if (matcher == 0) {
-            _appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
+            appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
        } else {
-            _appendToRule(rule, matcher->toPattern(str, escapeUnprintable),
+            appendToRule(rule, matcher->toPattern(str, escapeUnprintable),
                          TRUE, escapeUnprintable, quoteBuf);
        }
    }

    if (i == nextSeg) {
-        // assert((iseg % 2) == 0);
-        _appendToRule(rule, (UChar)0x0029 /*)*/, TRUE, escapeUnprintable, quoteBuf);
+        // assert(!isOpen[iSeg-FIRST_SEG_POS_INDEX]);
+        appendToRule(rule, (UChar)0x0029 /*)*/, TRUE, escapeUnprintable, quoteBuf);
    }

    if (emitBraces && i == (anteContextLength + keyLength)) {
-        _appendToRule(rule, (UChar)0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
+        appendToRule(rule, (UChar)0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
    }

-    _appendToRule(rule, UnicodeString(" > ", ""), TRUE, escapeUnprintable, quoteBuf);
+    appendToRule(rule, UnicodeString(" > ", ""), TRUE, escapeUnprintable, quoteBuf);

    // Emit the output pattern

@ -733,27 +767,35 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
    int32_t cursor = cursorPos;
    if (cursor < 0) {
        while (cursor++ < 0) {
-            _appendToRule(rule, (UChar) 0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
+            appendToRule(rule, (UChar) 0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
        }
        // Fall through and append '|' below
    }

    for (i=0; i<output.length(); ++i) {
        if (i == cursor) {
-            _appendToRule(rule, (UChar) 0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
+            appendToRule(rule, (UChar) 0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
        }
        UChar c = output.charAt(i);
        int32_t seg = data.lookupSegmentReference(c);
        if (seg < 0) {
-            _appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
+            appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
        } else {
-            UChar segRef[4] = {
-                0x0020 /* */,
-                0x0024 /*$*/,
-                (UChar) (0x0031 + seg) /*0..9*/,
-                0x0020 /* */
-            };
-            _appendToRule(rule, UnicodeString(FALSE, segRef, 4), TRUE, escapeUnprintable, quoteBuf);
+            ++seg; // make 1-based
+            appendToRule(rule, (UChar)0x20, TRUE, escapeUnprintable, quoteBuf);
+            rule.append((UChar)0x24 /*$*/);
+            UBool show = FALSE; // TRUE if we should display digits
+            for (int32_t p=9; p>=0; --p) {
+                int32_t d = seg / POW10[p];
+                seg -= d * POW10[p];
+                if (d != 0 || p == 0) {
+                    show = TRUE;
+                }
+                if (show) {
+                    rule.append((UChar)(48+d));
+                }
+            }            
+            rule.append((UChar)0x20);
        }
    }

@ -763,13 +805,16 @@ UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
    if (cursor > output.length()) {
        cursor -= output.length();
        while (cursor-- > 0) {
-            _appendToRule(rule, (UChar) 0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
+            appendToRule(rule, (UChar) 0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
        }
-        _appendToRule(rule, (UChar) 0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
+        appendToRule(rule, (UChar) 0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
    }

-    _appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf);
+    appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf);

+    if (isOpen != _isOpen) {
+        delete[] isOpen;
+    }
    return rule;
 }

--- a/icu4c/source/i18n/rbt_rule.h
+++ b/icu4c/source/i18n/rbt_rule.h
@ -63,14 +63,43 @@ private:
    UnicodeString output;

    /**
-     * Array of segments.  These are segments of the input string that may be
-     * referenced and appear in the output string.  Each segment is stored as an
-     * offset, limit pair.  Segments are referenced by a 1-based index;
-     * reference i thus includes characters at offset segments[2*i-2] to
-     * segments[2*i-1]-1 in the pattern string.
+     * >>> Duplicated in rbt_pars.cpp and rbt_rule.h <<<
     *
-     * In the output string, a segment reference is indicated by a character in
-     * a special range, as defined by RuleBasedTransliterator.Data.
+     * The segments array encodes information about parentheses-
+     * enclosed regions of the input string.  These are referenced in
+     * the output string using the notation $1, $2, etc.  Numbering is
+     * in order of appearance of the left parenthesis.  Numbering is
+     * one-based.  Segments are defined as start, limit pairs.
+     * Segments may nest.
+     * 
+     * In order to avoid allocating two subobjects, the segments
+     * array actually comprises two arrays.  The first gives the
+     * index values of the open and close parentheses in the order
+     * they appear.  The second maps segment numbers to the indices of
+     * the first array.  The two arrays have the same length.
+     *
+     * Example:  (a b(c d)e f)
+     *            0 1 2 3 4 5 6
+     *
+     * First array: Indices are 0, 2, 4, and 6.
+     *
+     * Second array: $1 is at 0 and 6, and $2 is at 2 and 4, so the
+     * second array is 0, 3, 1, 2 -- these give the indices in the
+     * first array at which $1:open, $1:close, $2:open, and $2:close
+     * occur.
+     *
+     * The final array is: 2, 7, 0, 2, 4, 6, -1, 2, 5, 3, 4, -1
+     *
+     * Each subarray is terminated with a -1, and two leading entries
+     * give the number of segments and the offset to the first entry
+     * of the second array.  In addition, the second array values are
+     * all offset by 2 so they index directly into the final array.
+     * The total array size is 4*segments[0] + 4.  The second leading
+     * entry, segments[1], is 2*segments[0] + 3.
+     *
+     * In the output string, a segment reference is indicated by a
+     * character in a special range, as defined by
+     * RuleBasedTransliterator.Data.
     *
     * Most rules have no segments, in which case segments is null, and the
     * output string need not be checked for segment reference characters.
@ -277,7 +306,7 @@ public:
     */
    virtual UnicodeString& toRule(UnicodeString& pat,
                                  UBool escapeUnprintable) const;
-private:
+ private:

    void init(const UnicodeString& input,
              int32_t anteContextPos, int32_t postContextPos,
@ -287,17 +316,21 @@ private:
              UBool anchorStart, UBool anchorEnd,
              UErrorCode& status);

-    static void _appendToRule(UnicodeString& rule,
-                              UChar32 c,
-                              UBool isLiteral,
-                              UBool escapeUnprintable,
-                              UnicodeString& quoteBuf);
+ private:

-    static void _appendToRule(UnicodeString& rule,
-                              const UnicodeString& text,
-                              UBool isLiteral,
-                              UBool escapeUnprintable,
-                              UnicodeString& quoteBuf);
+    friend class StringMatcher;
+
+    static void appendToRule(UnicodeString& rule,
+                             UChar32 c,
+                             UBool isLiteral,
+                             UBool escapeUnprintable,
+                             UnicodeString& quoteBuf);
+    
+    static void appendToRule(UnicodeString& rule,
+                             const UnicodeString& text,
+                             UBool isLiteral,
+                             UBool escapeUnprintable,
+                             UnicodeString& quoteBuf);
 };

 #endif