ICU-105 Regular Expressions, ongoing development

X-SVN-Rev: 10076
2025-04-09 15:27:38 +00:00 · 2002-10-29 01:20:15 +00:00 · 2002-10-29 01:20:15 +00:00 · 5494469d5b
commit 5494469d5b
parent 0a03fdb072
6 changed files with 308 additions and 204 deletions
--- a/icu4c/source/common/unicode/utypes.h
+++ b/icu4c/source/common/unicode/utypes.h
@ -504,6 +504,7 @@ typedef enum UErrorCode {
     U_REGEX_INTERNAL_ERROR,
     U_REGEX_INVALID_STATE,
     U_REGEX_BAD_ESCAPE_SEQUENCE,
+     U_REGEX_PROPERTY_SYNTAX,
     U_REGEX_ERROR_LIMIT,

    U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@ -463,17 +463,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
        //     3.   jmp 1
        //     4.   ...
        {
-            int32_t   topLoc;        // location of item #1, the start of the stuff to repeat
-
-            if (fRXPat->fCompiledPat->size() == fMatchCloseParen)    
-            {
-                // The thing being repeated (item 1) is a parenthesized block.
-                //   Pick up the location of the top of the block.
-                topLoc = fMatchOpenParen+1;   
-            } else {
-                // Repeating just a single item, the last thing in the compiled patternn so far.
-                topLoc = fRXPat->fCompiledPat->size()-1;
-            }
+            int32_t   topLoc = blockTopLoc(FALSE);        // location of item #1

            // Locate the position in the compiled pattern where the match will continue
            //   after completing the +   (4 in the comment above)
@ -506,7 +496,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
        // Normal (greedy) ? quantifier.
        //  Compiles to
        //     1. state save 3
-        //     2.    body of optional stuff
+        //     2.    body of optional block
        //     3. ...
        // Insert the state save into the compiled pattern, and we're done.
        {
@ -520,11 +510,26 @@ UBool RegexCompile::doParseActions(EParseAction action)
        // Non-greedy ?? quantifier
        //   compiles to
        //    1.  jmp   4
-        //    2.     body of optional stuff
+        //    2.     body of optional block
        //    3   jmp   5
        //    4.  state save 2
        //    5    ...
+        //  This code is less than ideal, with two jmps instead of one, because we can only
+        //  insert one instruction at the top of the block being iterated.
+        {
+            int32_t  jmp1_loc = blockTopLoc(TRUE);
+            int32_t  jmp2_loc = fRXPat->fCompiledPat->size();

+            int32_t  jmp1_op  = URX_BUILD(URX_JMP, jmp2_loc+1);
+            fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
+
+            int32_t  jmp2_op  = URX_BUILD(URX_JMP, jmp2_loc+2);
+            fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus);
+
+            int32_t  save_op  = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1);
+            fRXPat->fCompiledPat->addElement(save_op, *fStatus);
+        }
+        break;


    case doStar:
@ -708,37 +713,21 @@ UBool RegexCompile::doParseActions(EParseAction action)
        returnVal = FALSE;
        break;

-    case doScanUnicodeSet:
+    case doProperty:
        {
-            UnicodeSet *theSet = scanSet();
-            if (theSet == NULL) {
-                break;
-            }
-            if (theSet->size() > 1) {
-                //  The set contains two or more chars.
-                //  Put it into the compiled pattern as a set.
-                int32_t setNumber = fRXPat->fSets->size();
-                fRXPat->fSets->addElement(theSet, *fStatus);
-                int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
-                fRXPat->fCompiledPat->addElement(setOp, *fStatus);
-            }
-            else
-            {
-                // The set contains only a single code point.  Put it into
-                //   the compiled pattern as a single char operation rather
-                //   than a set, and discard the set itself.
-                UChar32  c = theSet->charAt(0);
-                if (c == -1) {
-                    // Set contained no chars.  Stuff an invalid char that can't match.
-                    c = 0x1fffff;
-                }
-                int32_t  charToken = URX_BUILD(URX_ONECHAR, c);
-                fRXPat->fCompiledPat->addElement(charToken, *fStatus);
-                delete theSet;
-            }
+            UnicodeSet *theSet = scanProp();
+            compileSet(theSet);
        }
        break;

+
+    case doScanUnicodeSet:
+        {
+            UnicodeSet *theSet = scanSet();
+            compileSet(theSet);
+        }
+        break;
+            
    default:
        error(U_BRK_INTERNAL_ERROR);
        returnVal = FALSE;
@ -860,6 +849,43 @@ void  RegexCompile::handleCloseParen() {
 }


+
+//----------------------------------------------------------------------------------------
+//
+//   compileSet       Compile the pattern operations for a reference to a
+//                    UnicodeSet.
+//
+//----------------------------------------------------------------------------------------
+void        RegexCompile::compileSet(UnicodeSet *theSet)
+{
+    if (theSet == NULL) {
+        return;
+    }
+    if (theSet->size() > 1) {
+        //  The set contains two or more chars.
+        //  Put it into the compiled pattern as a set.
+        int32_t setNumber = fRXPat->fSets->size();
+        fRXPat->fSets->addElement(theSet, *fStatus);
+        int32_t setOp = URX_BUILD(URX_SETREF, setNumber);
+        fRXPat->fCompiledPat->addElement(setOp, *fStatus);
+    }
+    else
+    {
+        // The set contains only a single code point.  Put it into
+        //   the compiled pattern as a single char operation rather
+        //   than a set, and discard the set itself.
+        UChar32  c = theSet->charAt(0);
+        if (c == -1) {
+            // Set contained no chars.  Stuff an invalid char that can't match.
+            c = 0x1fffff;
+        }
+        int32_t  charToken = URX_BUILD(URX_ONECHAR, c);
+        fRXPat->fCompiledPat->addElement(charToken, *fStatus);
+        delete theSet;
+    }
+}
+
+
 //----------------------------------------------------------------------------------------
 //
 //  Error         Report a rule parse error.
@ -898,6 +924,11 @@ static const UChar      chPound     = 0x23;      // '#', introduces a comment.
 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
 static const UChar      chLParen    = 0x28;
 static const UChar      chRParen    = 0x29;
+static const UChar      chLBracket  = 0x5b;
+static const UChar      chRBracket  = 0x5d;
+static const UChar      chRBrace    = 0x7d;
+static const UChar      chLowerP    = 0x70;
+static const UChar      chUpperP    = 0x50;


 //----------------------------------------------------------------------------------------
@ -1077,5 +1108,56 @@ UnicodeSet *RegexCompile::scanSet() {
 };


+//---------------------------------------------------------------------------------
+//
+//  scanProp   Construct a UnicodeSet from the text at the current scan
+//             position, which will be of the form \p{whatever} or \P{whatever}
+//
+//             The scan position will be at the 'p' or 'P'.  On return
+//             the scan position should be just after the '}'
+//
+//             Return a UnicodeSet constructed from the \p{...} or \P{...} text,
+//             or NULL if the pattern is invalid or fStatus already shows a failure.
+//
+//---------------------------------------------------------------------------------
+UnicodeSet *RegexCompile::scanProp() {
+    UnicodeSet    *uset = NULL;
+
+    if (U_FAILURE(*fStatus)) {
+        return NULL;
+    }
+
+    U_ASSERT(fC.fChar == chLowerP || fC.fChar == chUpperP);
+
+    // enclose the \p{property} from the regex pattern source in  [brackets]
+    UnicodeString setPattern;
+    setPattern.append(chLBracket);
+    setPattern.append(chBackSlash);
+    for (;;) {
+        setPattern.append(fC.fChar);
+        if (fC.fChar == chRBrace) {
+            break;
+        }
+        nextChar(fC);
+        if (fC.fChar == -1) {
+            // Hit the end of the input string without finding the closing '}'
+            *fStatus = U_REGEX_PROPERTY_SYNTAX;
+            return NULL;
+        }
+    }
+    setPattern.append(chRBracket);
+
+    // Build the UnicodeSet from the set pattern we just built up in a string.
+    uset = new UnicodeSet(setPattern, *fStatus);
+    if (U_FAILURE(*fStatus)) {
+        delete uset;
+        uset =  NULL;
+    }
+
+    nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
+    return uset;
+};
+
+
 U_NAMESPACE_END

--- a/icu4c/source/i18n/regexcmp.h
+++ b/icu4c/source/i18n/regexcmp.h
@ -88,11 +88,14 @@ private:
    UChar32     nextCharLL();
    UChar32     peekCharLL();
    UnicodeSet  *scanSet();
+    UnicodeSet  *scanProp();
    void        handleCloseParen();
    int32_t     blockTopLoc(UBool reserve);          // Locate a position in the compiled pattern
                                                     //  at the top of the just completed block
                                                     //  or operation, and optionally ensure that
                                                     //  there is space to add an opcode there.
+    void        compileSet(UnicodeSet *theSet);      // Generate the compiled pattern for
+                                                     //   a reference to a UnicodeSet.


    UErrorCode                    *fStatus;
--- a/icu4c/source/i18n/regexcst.h
+++ b/icu4c/source/i18n/regexcst.h
@ -24,6 +24,7 @@ U_NAMESPACE_BEGIN
 enum Regex_PatternParseAction {
    doExprOrOperator,
    doCloseParen,
+    doProperty,
    doTagValue,
    doOrOperator,
    doOpenCaptureParen,
@ -95,7 +96,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doDotAny, 46 /* . */, 18,0,  TRUE}     //  7 
    , {doNOP, 92 /* \ */, 59,0,  TRUE}     //  8 
    , {doNOP, 253, 2,0,  FALSE}     //  9 
-    , {doRuleError, 255, 69,0,  FALSE}     //  10 
+    , {doRuleError, 255, 71,0,  FALSE}     //  10 
    , {doStringChar, 254, 11,0,  TRUE}     //  11      string
    , {doStringChar, 130, 11,0,  TRUE}     //  12 
    , {doSplitString, 63 /* ? */, 18,0,  FALSE}     //  13 
@ -117,10 +118,10 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doOpenLookAhead, 61 /* = */, 3, 22, TRUE}     //  29 
    , {doOpenLookAheadNeg, 33 /* ! */, 3, 22, TRUE}     //  30 
    , {doNOP, 60 /* < */, 33,0,  TRUE}     //  31 
-    , {doBadOpenParenType, 255, 69,0,  FALSE}     //  32 
+    , {doBadOpenParenType, 255, 71,0,  FALSE}     //  32 
    , {doOpenLookBehind, 61 /* = */, 3, 22, TRUE}     //  33      open-paren-lookbehind
    , {doOpenLookBehindNeg, 33 /* ! */, 3, 22, TRUE}     //  34 
-    , {doBadOpenParenType, 255, 69,0,  FALSE}     //  35 
+    , {doBadOpenParenType, 255, 71,0,  FALSE}     //  35 
    , {doNGStar, 63 /* ? */, 22,0,  TRUE}     //  36      quant-star
    , {doPossesiveStar, 43 /* + */, 22,0,  TRUE}     //  37 
    , {doStar, 255, 22,0,  FALSE}     //  38 
@ -132,14 +133,14 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doOpt, 255, 22,0,  FALSE}     //  44 
    , {doNOP, 129, 45,0,  TRUE}     //  45      interval-open
    , {doIntervalMinValue, 128, 48,0,  FALSE}     //  46 
-    , {doNumberExpectedError, 255, 69,0,  FALSE}     //  47 
+    , {doNumberExpectedError, 255, 71,0,  FALSE}     //  47 
    , {doNOP, 129, 52,0,  TRUE}     //  48      interval-value
    , {doNOP, 125 /* } */, 52,0,  FALSE}     //  49 
    , {doIntervalDigit, 128, 48,0,  TRUE}     //  50 
-    , {doNumberExpectedError, 255, 69,0,  FALSE}     //  51 
+    , {doNumberExpectedError, 255, 71,0,  FALSE}     //  51 
    , {doNOP, 129, 52,0,  TRUE}     //  52      interval-close
    , {doTagValue, 125 /* } */, 55,0,  TRUE}     //  53 
-    , {doNumberExpectedError, 255, 69,0,  FALSE}     //  54 
+    , {doNumberExpectedError, 255, 71,0,  FALSE}     //  54 
    , {doNOP, 254, 3,0,  FALSE}     //  55      expr-cont-no-interval
    , {doExprOrOperator, 124 /* | */, 3,0,  TRUE}     //  56 
    , {doExprRParen, 41 /* ) */, 255,0,  TRUE}     //  57 
@ -148,13 +149,15 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doBackslashB, 66 /* B */, 3,0,  TRUE}     //  60 
    , {doBackslashb, 98 /* b */, 3,0,  TRUE}     //  61 
    , {doBackslashG, 71 /* G */, 3,0,  TRUE}     //  62 
-    , {doBackslashW, 87 /* W */, 3,0,  TRUE}     //  63 
-    , {doBackslashw, 119 /* w */, 3,0,  TRUE}     //  64 
-    , {doBackslashX, 88 /* X */, 3,0,  TRUE}     //  65 
-    , {doBackslashZ, 90 /* Z */, 3,0,  TRUE}     //  66 
-    , {doBackslashz, 122 /* z */, 3,0,  TRUE}     //  67 
-    , {doStartString, 255, 11,0,  TRUE}     //  68 
-    , {doExit, 255, 69,0,  TRUE}     //  69      errorDeath
+    , {doProperty, 112 /* p */, 18,0,  FALSE}     //  63 
+    , {doProperty, 80 /* P */, 18,0,  FALSE}     //  64 
+    , {doBackslashW, 87 /* W */, 3,0,  TRUE}     //  65 
+    , {doBackslashw, 119 /* w */, 3,0,  TRUE}     //  66 
+    , {doBackslashX, 88 /* X */, 3,0,  TRUE}     //  67 
+    , {doBackslashZ, 90 /* Z */, 3,0,  TRUE}     //  68 
+    , {doBackslashz, 122 /* z */, 3,0,  TRUE}     //  69 
+    , {doStartString, 255, 11,0,  TRUE}     //  70 
+    , {doExit, 255, 71,0,  TRUE}     //  71      errorDeath
 };
 static const char *RegexStateNames[] = {    0,
     "start",
@ -224,6 +227,8 @@ static const char *RegexStateNames[] = {    0,
    0,
    0,
    0,
+    0,
+    0,
    0,
     "errorDeath",
    0};
--- a/icu4c/source/i18n/regexcst.txt
+++ b/icu4c/source/i18n/regexcst.txt
@ -166,9 +166,9 @@ quant-plus:
 #                  between plain '?', '??', '?+'
 #
 quant-opt:
-     '?'                 n  expr-cont                               doNGOpt                 #  *?
-     '+'                 n  expr-cont                               doPossesiveOpt          #  *+
-     default                expr-cont                               doOpt
+     '?'                 n  expr-cont                               doNGOpt                 #  ??
+     '+'                 n  expr-cont                               doPossesiveOpt          #  ?+
+     default                expr-cont                               doOpt                   #  ?


 #
@ -215,6 +215,8 @@ backslash:
   'B'                   n  term                                    doBackslashB
   'b'                   n  term                                    doBackslashb
   'G'                   n  term                                    doBackslashG
+   'p'			    expr-quant                              doProperty       #   \p{Lu}  style property
+   'P'			    expr-quant                              doProperty
   'W'                   n  term                                    doBackslashW
   'w'                   n  term                                    doBackslashw
   'X'                   n  term                                    doBackslashX
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -15,6 +15,7 @@
 #include "intltest.h"
 #include "regextst.h"
 #include "uvector.h"
+#include "stdlib.h"


 //---------------------------------------------------------------------------
@ -220,6 +221,7 @@ void RegexTest::regex_find(char *pat, char *input, UErrorCode expectedStatus, in
        errln("Line %d: error %x compiling pattern.", line, status);
        goto cleanupAndReturn;
    }
+    // callerPattern->dump();

    //
    //  Find the tags in the input data, remove them, and record the group boundary
@ -298,6 +300,154 @@ cleanupAndReturn:
 }
 

+//---------------------------------------------------------------------------
+//
+//      Basic      Check for basic functionality of regex pattern matching.
+//                 Avoid the use of REGEX_FIND test macro, which has
+//                 substantial dependencies on basic Regex functionality.
+//
+//---------------------------------------------------------------------------
+void RegexTest::Basic() {
+
+
+//
+// Debug - slide failing test cases early
+//
+#if 0
+    {
+    REGEX_FIND( "\\p{Lu}+", "here we go ... <0>ABC</0> and no more.")
+    }
+    exit(1);
+#endif
+
+
+    //
+    // Pattern with parentheses
+    //
+    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
+    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
+    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
+
+    //
+    // Patterns with *
+    //
+    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
+    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
+    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
+    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
+    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
+
+    REGEX_TESTLM("a*", "",  TRUE, TRUE);
+    REGEX_TESTLM("a*", "b", TRUE, FALSE);
+
+
+    //
+    //  Patterns with "."
+    //
+    REGEX_TESTLM(".", "abc", TRUE, FALSE);
+    REGEX_TESTLM("...", "abc", TRUE, TRUE);
+    REGEX_TESTLM("....", "abc", FALSE, FALSE);
+    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
+    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
+    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
+    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
+    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
+
+    //
+    //  Patterns with * applied to chars at end of literal string
+    //
+    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
+    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
+
+    //
+    //  Supplemental chars match as single chars, not a pair of surrogates.
+    //
+    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
+    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
+    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
+
+
+    //
+    //  UnicodeSets in the pattern
+    //
+    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
+    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
+    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
+    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
+    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
+    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
+
+    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
+    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
+    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
+    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
+    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
+
+    //
+    //   OR operator in patterns
+    //
+    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
+    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
+    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
+    REGEX_TESTLM("a|b", "b", TRUE, TRUE);
+
+    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
+    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
+    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
+    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
+    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
+    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
+
+    //
+    //  +
+    //
+    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
+    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
+    REGEX_TESTLM("b+", "", FALSE, FALSE);
+    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
+    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
+    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
+
+    //
+    //   ?
+    //
+    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
+    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
+    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
+    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
+    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
+    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
+    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
+    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
+    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
+
+    //
+    //  Escape sequences that become single literal chars, handled internally
+    //   by ICU's Unescape.
+    //
+    
+    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
+    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
+    REGEX_TESTLM("\\b", "\\u0008", TRUE, TRUE);        // BS
+    // REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L (or whatever) TODO: bug in Unescape
+    // REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape  TODO: bug in Unescape
+    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
+    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
+    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
+    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
+    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);       
+    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);       
+
+    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
+    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
+
+    // Escape of special chars in patterns
+    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);       
+
+
+};
+
+
 //---------------------------------------------------------------------------
 //
 //      API_Match   Test that the API for class RegexMatcher 
@ -576,154 +726,6 @@ void RegexTest::API_Match() {



-//---------------------------------------------------------------------------
-//
-//      Basic      Check for basic functionality of regex pattern matching.
-//                 Avoid the use of REGEX_FIND test macro, which has
-//                 substantial dependencies on basic Regex functionality.
-//
-//---------------------------------------------------------------------------
-void RegexTest::Basic() {
-
-
-//
-// Debug - slide failing test cases early
-//
-#if 0
-    {
-            REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
-    }
-    return;
-#endif
-
-
-    //
-    // Pattern with parentheses
-    //
-    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
-    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
-    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
-
-    //
-    // Patterns with *
-    //
-    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
-    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
-    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
-    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
-    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
-
-    REGEX_TESTLM("a*", "",  TRUE, TRUE);
-    REGEX_TESTLM("a*", "b", TRUE, FALSE);
-
-
-    //
-    //  Patterns with "."
-    //
-    REGEX_TESTLM(".", "abc", TRUE, FALSE);
-    REGEX_TESTLM("...", "abc", TRUE, TRUE);
-    REGEX_TESTLM("....", "abc", FALSE, FALSE);
-    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
-    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
-    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
-    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
-    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
-
-    //
-    //  Patterns with * applied to chars at end of literal string
-    //
-    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
-    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
-
-    //
-    //  Supplemental chars match as single chars, not a pair of surrogates.
-    //
-    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
-    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
-    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
-
-
-    //
-    //  UnicodeSets in the pattern
-    //
-    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
-    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
-    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
-    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
-    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
-    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
-
-    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
-    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
-    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
-    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
-    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
-
-    //
-    //   OR operator in patterns
-    //
-    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
-    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
-    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
-    REGEX_TESTLM("a|b", "b", TRUE, TRUE);
-
-    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
-    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
-    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
-    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
-    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
-    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
-
-    //
-    //  +
-    //
-    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
-    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
-    REGEX_TESTLM("b+", "", FALSE, FALSE);
-    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
-    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
-    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
-
-    //
-    //   ?
-    //
-    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
-    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
-    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
-    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
-    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
-    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
-    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
-    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
-    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
-
-    //
-    //  Escape sequences that become single literal chars, handled internally
-    //   by ICU's Unescape.
-    //
-    
-    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
-    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
-    REGEX_TESTLM("\\b", "\\u0008", TRUE, TRUE);        // BS
-    // REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L (or whatever) TODO: bug in Unescape
-    // REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape  TODO: bug in Unescape
-    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
-    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
-    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
-    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
-    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);       
-    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);       
-
-    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
-    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
-
-    // Escape of special chars in patterns
-    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);       
-
-
-};
-
-

 //---------------------------------------------------------------------------
 //
@ -1001,6 +1003,15 @@ void RegexTest::Extended() {

    REGEX_FIND( "((ab)+?)((ab)*)", "<0><1><2>ab</2></1><3>ababababab<4>ab</4></3></0>");
    REGEX_FIND( "((ab)+)((ab)*)", "<0><1>abababababab<2>ab</2></1><3></3></0>");
+
+    // Non-greedy ?? quantifier
+    REGEX_FIND( "(ab)(ab)\?\?(ab)\?\?(ab)\?\?(ab)\?\?c", 
+                "<0><1>ab</1><4>ab</4><5>ab</5>c</0>");
+
+    // Unicode Properties as naked elements in a pattern
+    REGEX_FIND( "\\p{Lu}+", "here we go ... <0>ABC</0> and no more.");
+    REGEX_FIND( "(\\p{L}+)(\\P{L}*?) (\\p{Zs}*)",  "7999<0><1>letters</1><2>4949%^&*(</2> <3>   </3></0>");
+
 }