ICU-2422 Regexp general cleanup

X-SVN-Rev: 11387
2025-04-13 08:53:20 +00:00 · 2003-03-24 05:23:07 +00:00 · 2003-03-24 05:23:07 +00:00 · 4575efb175
commit 4575efb175
parent 86f44db9be
6 changed files with 315 additions and 132 deletions
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -520,7 +520,6 @@ void    RegexCompile::compile(
    //   are too short.
    //
    fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
-    fRXPat->fMaxMatchLen = maxMatchLength(3, fRXPat->fCompiledPat->size()-1);

    //
    // Optimization pass:  Categorize how a match can start, for use by find()
@ -1114,7 +1113,36 @@ UBool RegexCompile::doParseActions(EParseAction action)

    case doPossesiveInterval:
        // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
-        compileInterval(URX_CTR_INIT_P, URX_CTR_LOOP_P);
+        {
+            // Remember the loc for the top of the block being looped over.
+            //   (Can not reserve a slot in the compiled pattern at this time, because 
+            //    compileInterval needs to reserve also, and blockTopLoc can only reserve 
+            //    once per block.)
+            int32_t topLoc = blockTopLoc(FALSE);
+
+            // Produce normal looping code.
+            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
+
+            // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
+            //  just as if the loop was enclosed in atomic parentheses.
+
+            // First the STO_SP before the start of the loop
+            insertOp(topLoc);
+            int32_t  varLoc    = fRXPat->fDataSize;    // Reserve a data location for saving the
+            fRXPat->fDataSize += 1;                    //  state stack ptr.
+            int32_t  op        = URX_BUILD(URX_STO_SP, varLoc);
+            fRXPat->fCompiledPat->setElementAt(op, topLoc);
+
+            int32_t loopOp = fRXPat->fCompiledPat->popi();
+            U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
+            loopOp++;     // point LoopOp after the just-inserted STO_SP
+            fRXPat->fCompiledPat->push(loopOp, *fStatus);
+
+            // Then the LD_SP after the end of the loop
+            op = URX_BUILD(URX_LD_SP, varLoc);
+            fRXPat->fCompiledPat->addElement(op, *fStatus);
+        }
+
        break;

    case doNGInterval:
@ -1685,7 +1713,6 @@ void   RegexCompile::insertOp(int32_t where) {
            opType == URX_STATE_SAVE   ||
            opType == URX_CTR_LOOP     ||
            opType == URX_CTR_LOOP_NG  ||
-            opType == URX_CTR_LOOP_P   ||
            opType == URX_JMP_SAV      ||
            opType == URX_RELOC_OPRND)    && opValue > where) {
            // Target location for this opcode is after the insertion point and
@ -1705,6 +1732,13 @@ void   RegexCompile::insertOp(int32_t where) {
            fParenStack.setElementAt(x, loc);
        }
    }
+
+    if (fMatchCloseParen > where) {
+        fMatchCloseParen++;
+    }
+    if (fMatchOpenParen > where) {
+        fMatchOpenParen++;
+    }
 }


@ -2011,6 +2045,15 @@ void        RegexCompile::compileSet(UnicodeSet *theSet)
 //                      for all three types (greedy, non-greedy, possessive) of
 //                      intervals.  The opcodes are supplied as parameters.
 //
+//                      The code for interval loops has this form:
+//                         0  CTR_INIT   counter loc (in stack frame)
+//                         1             5  patt address of CTR_LOOP at bottom of block
+//                         2             min count
+//                         3             max count   (-1 for unbounded)
+//                         4  ...        block to be iterated over
+//                         5  CTR_LOOP   
+//    
+//                       In CTR_LOOP, the operand is the loc of the corresponding CTR_INIT.
 //----------------------------------------------------------------------------------------
 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
 {
@ -2050,6 +2093,8 @@ void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
        error(U_REGEX_MAX_LT_MIN);
    }

+
+
 }


@ -2348,7 +2393,6 @@ void   RegexCompile::matchStartType() {

        case URX_CTR_INIT:
        case URX_CTR_INIT_NG:
-        case URX_CTR_INIT_P:
            {
                // Loop Init Ops.  These don't change the min length, but they are 4 word ops
                //   so location must be updated accordingly.
@ -2372,7 +2416,6 @@ void   RegexCompile::matchStartType() {

        case URX_CTR_LOOP:
        case URX_CTR_LOOP_NG:
-        case URX_CTR_LOOP_P:
            // Loop ops. 
            //  The jump is conditional, backwards only.
            atStart = FALSE;
@ -2631,7 +2674,6 @@ int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {

        case URX_CTR_INIT:
        case URX_CTR_INIT_NG:
-        case URX_CTR_INIT_P:
            {
                // Loop Init Ops.  
                //   If the min loop count == 0
@ -2652,7 +2694,6 @@ int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {

        case URX_CTR_LOOP:
        case URX_CTR_LOOP_NG:
-        case URX_CTR_LOOP_P:
            // Loop ops. 
            //  The jump is conditional, backwards only.
            break;
@ -2882,10 +2923,8 @@ int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {

        case URX_CTR_INIT:
        case URX_CTR_INIT_NG:
-        case URX_CTR_INIT_P:
        case URX_CTR_LOOP:
        case URX_CTR_LOOP_NG:
-        case URX_CTR_LOOP_P:
            // For anything to do with loops, make the match length unbounded.
            //  TODO, possibly later, special case short loops like {0,1}.
            //   Note:  INIT instructions are multi-word.  Can ignore because
@ -2992,7 +3031,6 @@ void RegexCompile::stripNOPs() {
        case URX_JMP:
        case URX_CTR_LOOP:
        case URX_CTR_LOOP_NG:
-        case URX_CTR_LOOP_P:
        case URX_RELOC_OPRND:
        case URX_JMPX:
        case URX_JMP_SAV:
@ -3007,14 +3045,59 @@ void RegexCompile::stripNOPs() {
                break;
            }

-        default:
-            // The remaining instructions are unaltered by the relocation.
+        case URX_RESERVED_OP:
+        case URX_RESERVED_OP_N:
+        case URX_BACKTRACK:
+        case URX_END:
+        case URX_ONECHAR:
+        case URX_STRING:
+        case URX_STRING_LEN:
+        case URX_START_CAPTURE:
+        case URX_END_CAPTURE:
+        case URX_STATIC_SETREF:
+        case URX_SETREF:
+        case URX_DOTANY:
+        case URX_FAIL:
+        case URX_BACKSLASH_B:
+        case URX_BACKSLASH_G:
+        case URX_UNUSED_1:
+        case URX_BACKSLASH_X:
+        case URX_BACKSLASH_Z:
+        case URX_DOTANY_ALL:
+        case URX_BACKSLASH_D:
+        case URX_CARET:
+        case URX_DOLLAR:
+        case URX_CTR_INIT:
+        case URX_CTR_INIT_NG:
+        case URX_STO_SP:
+        case URX_LD_SP:
+        case URX_BACKREF:
+        case URX_STO_INP_LOC:
+        case URX_LA_START:
+        case URX_LA_END:
+        case URX_ONECHAR_I:
+        case URX_STRING_I:
+        case URX_BACKREF_I:
+        case URX_DOLLAR_M:
+        case URX_CARET_M:
+        case URX_LB_START:
+        case URX_LB_CONT:
+        case URX_LB_END:
+        case URX_LBN_CONT:
+        case URX_LBN_END:
+            // These instructions are unaltered by the relocation.
            fRXPat->fCompiledPat->setElementAt(op, dst);
            dst++;
            break;
+
+        default:
+            // Some op is unaccounted for.
+            U_ASSERT(FALSE);
+            error(U_REGEX_INTERNAL_ERROR);
        }
    }

+    fRXPat->fCompiledPat->setSize(dst);
 }


--- a/icu4c/source/i18n/regeximp.h
+++ b/icu4c/source/i18n/regeximp.h
@ -54,6 +54,7 @@ U_NAMESPACE_BEGIN
 //
 enum {
     URX_RESERVED_OP   = 0,    // For multi-operand ops, most non-first words.
+     URX_RESERVED_OP_N = 255,  // For multi-operand ops, negative operand values.
     URX_BACKTRACK     = 1,
     URX_END           = 2,
     URX_ONECHAR       = 3,    // Value field is the 21 bit unicode char to match
@ -84,7 +85,7 @@ enum {

     URX_CTR_INIT      = 25,   // Counter Inits for {Interval} loops.
     URX_CTR_INIT_NG   = 26,   //   3 kinds, normal, non-greedy, and possesive.
-     URX_CTR_INIT_P    = 27,   //   These are 4 word opcodes.  See description.
+     URX_UNUSED_2      = 27,   //   (Was CTR_INIT_P.)  CTR_INIT ops are 4 word opcodes.  See description.
                               //    First Operand:  Data loc of counter variable
                               //    2nd   Operand:  Pat loc of the URX_CTR_LOOPx 
                               //                    at the end of the loop.
@ -92,7 +93,7 @@ enum {
                               //    4th   Operand:  Max count, -1 for unbounded.
     URX_CTR_LOOP      = 28,   // Loop Ops for {interval} loops.
     URX_CTR_LOOP_NG   = 29,   //   Also in three flavors.
-     URX_CTR_LOOP_P    = 30,   //   Operand is loc of corresponding CTR_INIT.
+     URX_UNUSED_3      = 30,   //   (Was CTR_LOOP_P.)  CTR_LOOP operand is loc of corresponding CTR_INIT.

     URX_RELOC_OPRND   = 31,   // Operand value in multi-operand ops that refers
                               //   back into compiled pattern code, and thus must
@ -180,10 +181,10 @@ enum {
        "URX_DOLLAR",          \
        "CTR_INIT",            \
        "CTR_INIT_NG",         \
-        "CTR_INIT_P",          \
+        "CTR_UNUSED_2",        \
        "CTR_LOOP",            \
        "CTR_LOOP_NG",         \
-        "CTR_LOOP_P",          \
+        "CTR_UNUSED_3",        \
        "RELOC_OPRND",         \
        "STO_SP",              \
        "LD_SP",               \
@ -207,7 +208,7 @@ enum {
 //  Convenience macros for assembling and disassembling a compiled operation.
 //
 #define URX_BUILD(type, val) (int32_t)((type << 24) | (val))
-#define URX_TYPE(x)          ((x) >> 24) 
+#define URX_TYPE(x)          ((uint32_t)(x) >> 24) 
 #define URX_VAL(x)           ((x) & 0xffffff)

                
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@ -171,7 +171,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
                    dest.append(escapedChar);
                    replIdx += (c==0x55? 9: 5); 
                    // TODO:  Report errors for mal-formed \u escapes?
-                    //        As this is, the original sequence is output.
+                    //        As this is, the original sequence is output, which may be OK.
                    continue;
                }
            }
@ -655,6 +655,94 @@ void RegexMatcher::setTrace(UBool state) {



+//---------------------------------------------------------------------
+//
+//   split
+//
+//---------------------------------------------------------------------
+int32_t  RegexMatcher::split(const UnicodeString &input,
+        UnicodeString    dest[],
+        int32_t          destCapacity,
+        UErrorCode       &status)
+{
+    //
+    // Check arguments for validity
+    //
+    if (U_FAILURE(status)) {
+        return 0;
+    };
+
+    if (destCapacity < 1) {
+        status = U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+
+    //
+    // Reset for the input text
+    //
+    reset(input);
+    int32_t   inputLen = input.length();
+    int32_t   nextOutputStringStart = 0;
+    if (inputLen == 0) {
+        return 0;
+    }
+
+
+    //
+    // Loop through the input text, searching for the delimiter pattern
+    //
+    int i;
+    int32_t numCaptureGroups = fPattern->fGroupMap->size();
+    for (i=0; ; i++) {
+        if (i==destCapacity-1) {
+            // There is only one output string left.
+            // Fill it with whatever is left from the input, then exit the loop.
+            dest[i].setTo(input, nextOutputStringStart, inputLen-nextOutputStringStart);
+            break;
+        }
+        if (find()) {
+            // We found another delimiter.  Move everything from where we started looking
+            //  up until the start of the delimiter into the next output string.
+            int32_t fieldLen = fMatchStart - nextOutputStringStart;
+            dest[i].setTo(input, nextOutputStringStart, fieldLen);
+            nextOutputStringStart = fMatchEnd;
+
+            // If the delimiter pattern has capturing parentheses, the captured
+            //  text goes out into the next n destination strings.
+            int32_t groupNum;
+            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
+                if (i==destCapacity-1) {
+                    break;
+                }
+                i++;
+                dest[i] = group(groupNum, status);
+            }
+
+            if (nextOutputStringStart == inputLen) {
+                // The delimiter was at the end of the string.  We're done.
+                break;
+            }
+
+            if (i==destCapacity-1) {
+                // We've filled up the last output string with capture group data.
+                //  Give back the last string, to be used for the remainder of the input.
+                i--;
+            }
+        }
+        else
+        {
+            // We ran off the end of the input while looking for the next delimiter.
+            // All the remaining text goes into the current output string.
+            dest[i].setTo(input, nextOutputStringStart, inputLen-nextOutputStringStart);
+            break;
+        }
+    }
+    return i+1;
+}
+
+
+
 //--------------------------------------------------------------------------------
 //
 //     start
--- a/icu4c/source/i18n/repattrn.cpp
+++ b/icu4c/source/i18n/repattrn.cpp
@ -67,7 +67,6 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
    fLiteralText      = other.fLiteralText;
    fDeferredStatus   = other.fDeferredStatus;
    fMinMatchLen      = other.fMinMatchLen;
-    fMaxMatchLen      = other.fMaxMatchLen;
    fMaxCaptureDigits = other.fMaxCaptureDigits;
    fStaticSets       = other.fStaticSets; 
    
@ -81,10 +80,6 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
    fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
    fGroupMap->assign(*other.fGroupMap, fDeferredStatus);

-    // Note:  do not copy fMatcher.  It'll be created on first use if the
-    //        destination needs one. 
-    //    TODO:  thread safety
-
    //  Copy the Unicode Sets.  
    //    Could be made more efficient if the sets were reference counted and shared,
    //    but I doubt that pattern copying will be particularly common. 
@ -116,10 +111,8 @@ void RegexPattern::init() {
    fFlags            = 0;
    fDeferredStatus   = U_ZERO_ERROR;
    fMinMatchLen      = 0;
-    fMaxMatchLen      = -1;
    fMaxCaptureDigits = 1;  
    fStaticSets       = NULL;
-    fMatcher          = NULL;
    fFrameSize        = 0;
    fDataSize         = 0;
    fStartType        = START_NO_INFO;
@ -151,8 +144,6 @@ void RegexPattern::init() {
 //
 //--------------------------------------------------------------------------
 void RegexPattern::zap() {
-    delete fMatcher;
-    fMatcher = NULL;
    delete fCompiledPat;
    fCompiledPat = NULL;
    int i;
@ -263,6 +254,19 @@ RegexPattern *RegexPattern::compile( const UnicodeString &regex,



+//
+//   compile with no UParseError parameter.
+//
+RegexPattern *RegexPattern::compile( const UnicodeString &regex,
+        uint32_t             flags,
+        UErrorCode           &err) 
+{
+    UParseError pe;
+    return compile(regex, flags, pe, err); 
+}
+
+
+
 //---------------------------------------------------------------------
 //
 //   flags
@ -280,6 +284,21 @@ uint32_t RegexPattern::flags() const {
 //---------------------------------------------------------------------
 RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
                                    UErrorCode          &status)  const {
+    RegexMatcher    *retMatcher = matcher(status);
+    if (retMatcher != NULL) {
+        retMatcher->reset(input);
+    }
+    return retMatcher;
+};
+
+
+
+//---------------------------------------------------------------------
+//
+//   matcher(status)
+//
+//---------------------------------------------------------------------
+RegexMatcher *RegexPattern::matcher(UErrorCode &status)  const {
    RegexMatcher    *retMatcher = NULL;

    if (U_FAILURE(status)) {
@ -295,7 +314,6 @@ RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
-    retMatcher->reset(input);
    return retMatcher;
 };

@ -352,99 +370,17 @@ int32_t  RegexPattern::split(const UnicodeString &input,
        int32_t          destCapacity,
        UErrorCode       &status) const
 {
-    //
-    // Check arguements for validity
-    //
    if (U_FAILURE(status)) {
        return 0;
    };

-    if (destCapacity < 1) {
-        status = U_ILLEGAL_ARGUMENT_ERROR;
-        return 0;
-    }
-
-    //
-    // If we don't already have a cached matcher object from a previous call
-    //   to split(), create one now.
-    //  TODO:  NOT THREAD SAFE.   FIX.
-    //
-    if (fMatcher == NULL) {
-        RegexMatcher *m = matcher(input, status);
-        if (U_FAILURE(status)) {
-            return 0;
-        }
-        // Need to cast off const to cache the matcher
-        RegexPattern *nonConstThis = (RegexPattern *)this;
-        nonConstThis->fMatcher = m;
-    }
-
-    //
-    // Set our input text into the matcher
-    //
-    fMatcher->reset(input);
-    int32_t   inputLen = input.length();
-    int32_t   nextOutputStringStart = 0;
-    if (inputLen == 0) {
-        return 0;
-    }
-
-
-    //
-    // Loop through the input text, searching for the delimiter pattern
-    //
-    int i;
-    int32_t numCaptureGroups = fGroupMap->size();
-    for (i=0; ; i++) {
-        if (i==destCapacity-1) {
-            // There is only one output string left.
-            // Fill it with whatever is left from the input, then exit the loop.
-            dest[i].setTo(input, nextOutputStringStart, inputLen-nextOutputStringStart);
-            break;
-        }
-        if (fMatcher->find()) {
-            // We found another delimiter.  Move everything from where we started looking
-            //  up until the start of the delimiter into the next output string.
-            int32_t fieldLen = fMatcher->fMatchStart - nextOutputStringStart;
-            dest[i].setTo(input, nextOutputStringStart, fieldLen);
-            nextOutputStringStart = fMatcher->fMatchEnd;
-
-            // If the delimiter pattern has capturing parentheses, the captured
-            //  text goes out into the next n destination strings.
-            int32_t groupNum;
-            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
-                if (i==destCapacity-1) {
-                    break;
-                }
-                i++;
-                dest[i] = fMatcher->group(groupNum, status);
-            }
-
-            if (nextOutputStringStart == inputLen) {
-                // The delimiter was at the end of the string.  We're done.
-                break;
-            }
-
-            if (i==destCapacity-1) {
-                // We've filled up the last output string with capture group data.
-                //  Give back the last string, to be used for the remainder of the input.
-                i--;
-            }
-        }
-        else
-        {
-            // We ran off the end of the input while looking for the next delimiter.
-            // All the remaining text goes into the current output string.
-            dest[i].setTo(input, nextOutputStringStart, inputLen-nextOutputStringStart);
-            break;
-        }
-    }
-    return i+1;
+    RegexMatcher  m(this);
+    int32_t r = m.split(input, dest, destCapacity, status);
+    return r;
 }



-
 //---------------------------------------------------------------------
 //
 //   dump    Output the compiled form of the pattern.
@ -489,10 +425,8 @@ void   RegexPattern::dumpOp(int32_t index) const {
    case URX_STRING_LEN:
    case URX_CTR_INIT:
    case URX_CTR_INIT_NG:
-    case URX_CTR_INIT_P:
    case URX_CTR_LOOP:
    case URX_CTR_LOOP_NG:
-    case URX_CTR_LOOP_P:
    case URX_RELOC_OPRND:
    case URX_STO_SP:
    case URX_LD_SP:
@ -567,9 +501,6 @@ void   RegexPattern::dumpOp(int32_t index) const {



-// TODO:  get rid of max match length
-
-
 void   RegexPattern::dump() const {
    int      index;
    int      i;
@ -580,7 +511,6 @@ void   RegexPattern::dump() const {
    }
    REGEX_DUMP_DEBUG_PRINTF("\n");
    REGEX_DUMP_DEBUG_PRINTF("   Min Match Length:  %d\n", fMinMatchLen);
-    REGEX_DUMP_DEBUG_PRINTF("   Max Match Length:  %d\n", fMaxMatchLen);
    REGEX_DUMP_DEBUG_PRINTF("   Match Start Type:  %s\n", START_OF_MATCH_STR(fStartType));   
    if (fStartType == START_STRING) {
        REGEX_DUMP_DEBUG_PRINTF("    Initial match sting: \"");
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@ -58,7 +58,6 @@ struct REStackFrame;

 /**
 * Constants for Regular Expression Match Modes.
- * <p>Note that non-default match modes will not be supported until ICU 2.6</p>
 * @draft ICU 2.4
 */
 enum {
@ -208,6 +207,29 @@ public:
        UErrorCode           &status);


+   /**
+    *     Compiles the regular expression in string form into a RegexPattern
+    *     object using the specified match mode flags.  These compile methods,
+    *     rather than the constructors, are the usual way that RegexPattern objects
+    *     are created.
+    *
+    *     <p>Note that RegexPattern objects must not be deleted while RegexMatcher
+    *     objects created from the pattern are active.  RegexMatchers keep a pointer
+    *     back to their pattern, so premature deletion of the pattern is a
+    *     catastrophic error.</p>
+    *
+    *    @param regex The regular expression to be compiled.
+    *    @param flags The match mode flags to be used.
+    *    @param status   A reference to a UErrorCode to receive any errors.
+    *    @return      A RegexPattern object for the compiled pattern.
+    *
+    *    @draft ICU 2.6
+    */
+    static RegexPattern *compile( const UnicodeString &regex,
+        uint32_t             flags,
+        UErrorCode           &status);
+
+
   /**
    *     Get the match mode flags that were used when compiling this pattern.
    *     @return  the match mode flags
@ -231,6 +253,20 @@ public:
        UErrorCode          &status) const;


+   /**
+    *  Creates a RegexMatcher that will match against this pattern.  The
+    *   RegexMatcher can be used to perform match, find or replace operations.
+    *   Note that a RegexPattern object must not be deleted while
+    *   RegexMatchers created from it still exist and might possibly be used again.
+    *
+    *   @param status   A reference to a UErrorCode to receive any errors.
+    *   @return      A RegexMatcher object for this pattern.
+    *
+    *   @draft ICU 2.6
+    */
+    virtual RegexMatcher *matcher(UErrorCode  &status) const;
+
+
   /**
    *  Test whether a string matches a regular expression.  This convenience function
    *   both compiles the reguluar expression and applies it in a single operation.
@ -259,7 +295,14 @@ public:


    /**
-     * Split a string around matches of the pattern.  Somewhat like split() from Perl.
+     * Split a string into fields.  Somewhat like split() from Perl.
+     * The pattern matches identify delimiters that separate the input
+     *  into fields.  The input data between the matches becomes the
+     *  fields themselves.
+     * <p>
+     *  For the best performance on split() operations,
+     *  <code>RegexMatcher::split</code> is preferable to this function.
+     * 
     * @param input   The string to be split into fields.  The field delimiters
     *                match the pattern (in the "this" object)
     * @param dest    An array of UnicodeStrings to receive the results of the split.
@ -324,17 +367,6 @@ private:
                                   //   value may be less than the true shortest
                                   //   possible match.

-    int32_t         fMaxMatchLen;  // Maximum Match Length.  All matches will have length
-                                   //   <= this value.  For some patterns, this calculated
-                                   //   value may be greater than the true longest
-                                   //   possible match.  For patterns with unbounded
-                                   //   match length, value = -1.
-
-    RegexMatcher    *fMatcher;     // A cached matcher for this pattern, used for
-                                   //  split(), to avoid having to
-                                   //  make new ones on each call.
-                                   //  TODO:  fix thread safety problems.
-
    int32_t         fFrameSize;    // Size of a state stack frame in the
                                   //   execution engine.

@ -402,6 +434,12 @@ public:
      * created for the same expression, it will be more efficient to
      * separately create and cache a RegexPattern object, and use
      * its matcher() method to create the RegexMatcher objects.
+      *
+      *  @param regexp The Regular Expression to be compiled.
+      *  @param flags  Regular expression options, such as case insensitive matching.
+      *                @see UREGEX_CASE_INSENSITIVE
+      *  @param status Any errors are reported by setting this UErrorCode variable.
+      *  @draft ICU 2.6
      */
    RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);

@ -412,6 +450,12 @@ public:
      * created for the same expression, it will be more efficient to
      * separately create and cache a RegexPattern object, and use
      * its matcher() method to create the RegexMatcher objects.
+      *
+      *  @param regexp The Regular Expression to be compiled.
+      *  @param flags  Regular expression options, such as case insensitive matching.
+      *                @see UREGEX_CASE_INSENSITIVE
+      *  @param status Any errors are reported by setting this UErrorCode variable.
+      *  @draft ICU 2.6
      */
    RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
        uint32_t flags, UErrorCode &status);
@ -690,6 +734,37 @@ public:



+    /**
+     * Split a string into fields.  Somewhat like split() from Perl.
+     * The pattern matches identify delimiters that separate the input
+     *  into fields.  The input data between the matches becomes the
+     *  fields themselves.
+     * <p>
+     * 
+     * @param input   The string to be split into fields.  The field delimiters
+     *                match the pattern (in the "this" object).  This matcher
+     *                will be reset to this input string.
+     * @param dest    An array of UnicodeStrings to receive the results of the split.
+     *                This is an array of actual UnicodeString objects, not an
+     *                array of pointers to strings.  Local (stack based) arrays can
+     *                work well here.
+     * @param destCapacity  The number of elements in the destination array.
+     *                If the number of fields found is less than destCapacity, the
+     *                extra strings in the destination array are not altered.
+     *                If the number of destination strings is less than the number
+     *                of fields, the trailing part of the input string, including any
+     *                field delimiters, is placed in the last destination string.
+     * @param status  A reference to a UErrorCode to receive any errors.
+     * @return        The number of fields into which the input string was split.
+     * @draft ICU 2.6
+     */
+    virtual int32_t  split(const UnicodeString &input,
+        UnicodeString    dest[],
+        int32_t          destCapacity,
+        UErrorCode       &status);
+
+
+
   /**
     *   setTrace   Debug function, enable/disable tracing of the matching engine.
     *              For internal ICU development use only.  DO NO USE!!!!
--- a/icu4c/source/test/testdata/regextst.txt
+++ b/icu4c/source/test/testdata/regextst.txt
@ -188,6 +188,12 @@
 "(ABC){2,3}?AX"                "<0>ABCABC<1>ABC</1>AX</0>"
 "(ABC){2,3}?AX"                "ABC<0>ABCABC<1>ABC</1>AX</0>"

+# Possessive {min,max}+ intervals
+"(ABC){2,3}+ABC"               "ABCABCABC"
+"(ABC){1,2}+ABC"               "<0>ABC<1>ABC</1>ABC</0>"
+"(?:(.)\1){2,5}+."              "<0>aabbcc<1>d</1>de</0>x"
+
+
 # Atomic Grouping
 "(?>.*)abc"                    "abcabcabc"  # no match.  .* consumed entire string.
 "(?>(abc{2,4}?))(c*)"          "<0><1>abcc</1><2>ccc</2></0>ddd"