ICU-5386 Regular Expressions update, improved Java 1.5 compatibility. svn merge -r 22747:23061 from branches/andy/regex

X-SVN-Rev: 23063
2025-04-08 06:53:45 +00:00 · 2007-12-11 21:30:10 +00:00 · 2007-12-11 21:30:10 +00:00 · 67e296e813
commit 67e296e813
parent 0d216c877d
21 changed files with 3974 additions and 1300 deletions
--- a/icu4c/source/common/unicode/uniset.h
+++ b/icu4c/source/common/unicode/uniset.h
@ -16,10 +16,10 @@
 #include "unicode/uset.h"

 /**
- * \file 
+ * \file
 * \brief C++ API: Unicode Set
 */
- 
+
 U_NAMESPACE_BEGIN

 class BMPSet;
@ -1213,6 +1213,14 @@ public:
     */
    UnicodeSet& closeOver(int32_t attribute);

+    /**
+     * Remove all strings from this set.
+     *
+     * @return a reference to this set.
+     * @internal
+     */
+    virtual UnicodeSet &removeAllStrings();
+
    /**
     * Iteration method that returns the number of ranges contained in
     * this set.
--- a/icu4c/source/common/unicode/utypes.h
+++ b/icu4c/source/common/unicode/utypes.h
@ -718,6 +718,9 @@ typedef enum UErrorCode {
    U_REGEX_INVALID_FLAG,                 /**< Invalid value for match mode flags.                */
    U_REGEX_LOOK_BEHIND_LIMIT,            /**< Look-Behind pattern matches must have a bounded maximum length.    */
    U_REGEX_SET_CONTAINS_STRING,          /**< Regexps cannot have UnicodeSets containing strings.*/
+    U_REGEX_OCTAL_TOO_BIG,                /**< Octal character constants must be <= 0377.         */
+    U_REGEX_MISSING_CLOSE_BRACKET,        /**< Missing closing bracket on a bracket expression.   */
+    U_REGEX_INVALID_RANGE,                /**< In a character range [x-y], x is greater than y.   */
    U_REGEX_ERROR_LIMIT,                  /**< This must always be the last value to indicate the limit for regexp errors */

    /*
--- a/icu4c/source/common/uniset.cpp
+++ b/icu4c/source/common/uniset.cpp
@ -1037,6 +1037,12 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
    return *this;
 }

+// Remove every string from this set by clearing the 'strings' vector; a no-op when that vector is absent.
+UnicodeSet& UnicodeSet::removeAllStrings() {
+    if (strings != NULL) { strings->removeAllElements(); }   // guard: never dereference a missing string list
+    return *this;
+}
+
 /**
 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
--- a/icu4c/source/common/utypes.c
+++ b/icu4c/source/common/utypes.c
@ -152,7 +152,10 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
    "U_REGEX_INVALID_BACK_REF",
    "U_REGEX_INVALID_FLAG",
    "U_REGEX_LOOK_BEHIND_LIMIT",
-    "U_REGEX_SET_CONTAINS_STRING"
+    "U_REGEX_SET_CONTAINS_STRING",
+    "U_REGEX_OCTAL_TOO_BIG",
+    "U_REGEX_MISSING_CLOSE_BRACKET",
+    "U_REGEX_INVALID_RANGE"
 };

 static const char * const
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
--- a/icu4c/source/i18n/regexcmp.h
+++ b/icu4c/source/i18n/regexcmp.h
@ -51,7 +51,7 @@ public:
    };

    RegexCompile(RegexPattern *rp, UErrorCode &e);
-    
+
    void       compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);


@ -68,7 +68,7 @@ public:
    //   determines the code to be generated when the matching close ) is encountered.
    enum EParenClass {
        plain        = -1,               // No special handling
-        capturing    = -2, 
+        capturing    = -2,
        atomic       = -3,
        lookAhead    = -4,
        negLookAhead = -5,
@ -85,8 +85,8 @@ private:

    UChar32     nextCharLL();
    UChar32     peekCharLL();
-    UnicodeSet  *scanSet();
    UnicodeSet  *scanProp();
+    UnicodeSet  *scanPosixProp();
    void        handleCloseParen();
    int32_t     blockTopLoc(UBool reserve);          // Locate a position in the compiled pattern
                                                     //  at the top of the just completed block
@ -109,7 +109,11 @@ private:
                               int32_t end);
    void        matchStartType();
    void        stripNOPs();
-    void        OptDotStar();
+
+    void        setEval(int32_t op);
+    void        setPushOp(int32_t op);
+    UChar32     scanNamedChar();
+    UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated);


    UErrorCode                    *fStatus;
@ -125,7 +129,7 @@ private:
                                                     //   is the first character not yet scanned.
    UBool                         fQuoteMode;        // Scan is in a \Q...\E quoted region
    UBool                         fInBackslashQuote; // Scan is between a '\' and the following char.
-    UBool                         fEOLComments;      // When scan is just after '(?',  inhibit #... to 
+    UBool                         fEOLComments;      // When scan is just after '(?',  inhibit #... to
                                                     //   end of line comments, in favor of (?#...) comments.
    int32_t                       fLineNum;          // Line number in input file.
    int32_t                       fCharNum;          // Char position within the line.
@ -167,7 +171,7 @@ private:

    UVector32                     fParenStack;       // parentheses stack.  Each frame consists of
                                                     //   the positions of compiled pattern operations
-                                                     //   needing fixup, followed by negative value.  The  
+                                                     //   needing fixup, followed by negative value.  The
                                                     //   first entry in each frame is the position of the
                                                     //   spot reserved for use when a quantifier
                                                     //   needs to add a SAVE at the start of a (block)
@ -194,8 +198,33 @@ private:
    int32_t                       fNameStartPos;     // Starting position of a \N{NAME} name in a
                                                     //   pattern, valid while remainder of name is
                                                     //   scanned.
+
+    UStack                        fSetStack;         // Stack of UnicodeSets, used while evaluating
+                                                     //   (at compile time) set expressions within
+                                                     //   the pattern.
+    UStack                        fSetOpStack;       // Stack of pending set operators (&&, --, union)
+
+    UChar32                       fLastSetLiteral;   // The last single code point added to a set.
+                                                     //   needed when "-y" is scanned, and we need
+                                                     //   to turn "x-y" into a range.
+
 };

+// Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions]
+//   The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself.
+
+enum SetOperations {
+    setStart         = 0 << 16 | 1,    // precedence 0, operation code 1
+    setEnd           = 1 << 16 | 2,    // precedence 1, operation code 2
+    setNegation      = 2 << 16 | 3,    // precedence 2, operation code 3
+    setCaseClose     = 2 << 16 | 9,    // precedence 2, operation code 9 (same precedence as setNegation)
+    setDifference2   = 3 << 16 | 4,    // '--' set difference operator
+    setIntersection2 = 3 << 16 | 5,    // '&&' set intersection operator
+    setUnion         = 4 << 16 | 6,    // implicit union of adjacent items
+    setDifference1   = 4 << 16 | 7,    // '-', single dash difference op, for compatibility with old UnicodeSet.
+    setIntersection1 = 4 << 16 | 8     // '&', single amp intersection op, for compatibility with old UnicodeSet.
+    };
+
 U_NAMESPACE_END
 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
 #endif   // RBBISCAN_H
--- a/icu4c/source/i18n/regexcst.h
+++ b/icu4c/source/i18n/regexcst.h
@ -5,7 +5,7 @@
 //    It is generated by the Perl script "regexcst.pl" from
 //    the rule parser state definitions file "regexcst.txt".
 //
-//   Copyright (C) 2002-2003 International Business Machines Corporation 
+//   Copyright (C) 2002-2007 International Business Machines Corporation 
 //   and others. All rights reserved.  
 //
 //---------------------------------------------------------------------------------
@ -17,74 +17,100 @@ U_NAMESPACE_BEGIN
 // Character classes for regex pattern scanning.
 //
    static const uint8_t kRuleSet_digit_char = 128;
-    static const uint8_t kRuleSet_white_space = 129;
-    static const uint8_t kRuleSet_rule_char = 130;
+    static const uint8_t kRuleSet_rule_char = 129;


 enum Regex_PatternParseAction {
-    doPossessivePlus,
-    doCloseParen,
-    doProperty,
-    doBeginMatchMode,
-    doOrOperator,
-    doOpenCaptureParen,
-    doBadOpenParenType,
-    doRuleError,
-    doIntevalLowerDigit,
-    doBackslashs,
-    doNGOpt,
-    doBackslashw,
-    doMismatchedParenErr,
-    doOpenLookBehind,
-    doBackslashz,
-    doIntervalError,
-    doStar,
-    doCaret,
-    doEnterQuoteMode,
-    doNGStar,
-    doMatchMode,
-    doIntervalUpperDigit,
-    doOpenLookAheadNeg,
-    doPlus,
-    doOpenNonCaptureParen,
-    doBackslashA,
-    doBackslashB,
-    doNGPlus,
-    doSetMatchMode,
-    doPatFinish,
-    doBackslashD,
-    doPossessiveInterval,
-    doEscapeError,
-    doBackslashG,
-    doSuppressComments,
-    doMatchModeParen,
-    doOpt,
-    doInterval,
    doLiteralChar,
-    doIntervalInit,
-    doOpenAtomicParen,
-    doBackslashS,
-    doOpenLookAhead,
-    doBackRef,
-    doDollar,
-    doDotAny,
-    doBackslashW,
-    doBackslashX,
-    doScanUnicodeSet,
-    doBackslashZ,
-    doPerlInline,
-    doPossessiveOpt,
+    doSetEnd,
+    doBackslashA,
+    doSetBeginUnion,
    doNOP,
-    doConditionalExpr,
-    doExit,
-    doNGInterval,
-    doPatStart,
-    doBadModeFlag,
-    doBackslashb,
-    doPossessiveStar,
-    doBackslashd,
-    doIntervalSame,
+    doSetBackslash_w,
+    doSetRange,
+    doBackslashG,
+    doPerlInline,
+    doSetAddDash,
+    doIntevalLowerDigit,
+    doProperty,
+    doBackslashX,
+    doOpenAtomicParen,
+    doSetLiteralEscaped,
+    doPatFinish,
+    doSetBackslash_D,
+    doSetDifference2,
+    doNamedChar,
+    doNGPlus,
    doOpenLookBehindNeg,
+    doIntervalError,
+    doIntervalSame,
+    doBackRef,
+    doPlus,
+    doOpenCaptureParen,
+    doMismatchedParenErr,
+    doBeginMatchMode,
+    doEscapeError,
+    doOpenNonCaptureParen,
+    doDollar,
+    doSetProp,
+    doIntervalUpperDigit,
+    doSetBegin,
+    doBackslashs,
+    doOpenLookBehind,
+    doSetMatchMode,
+    doOrOperator,
+    doCaret,
+    doMatchModeParen,
+    doStar,
+    doOpt,
+    doMatchMode,
+    doSuppressComments,
+    doPossessiveInterval,
+    doOpenLookAheadNeg,
+    doBackslashW,
+    doCloseParen,
+    doSetOpError,
+    doIntervalInit,
+    doSetFinish,
+    doSetIntersection2,
+    doNGStar,
+    doEnterQuoteMode,
+    doSetAddAmp,
+    doBackslashB,
+    doBackslashw,
+    doPossessiveOpt,
+    doSetNegate,
+    doRuleError,
+    doBackslashb,
+    doConditionalExpr,
+    doPossessivePlus,
+    doBadOpenParenType,
+    doNGInterval,
+    doSetLiteral,
+    doSetNamedChar,
+    doBackslashd,
+    doSetBeginDifference1,
+    doBackslashD,
+    doExit,
+    doSetBackslash_S,
+    doInterval,
+    doSetNoCloseError,
+    doNGOpt,
+    doSetPosixProp,
+    doBackslashS,
+    doBackslashZ,
+    doSetBeginIntersection1,
+    doSetBackslash_W,
+    doSetBackslash_d,
+    doOpenLookAhead,
+    doBadModeFlag,
+    doPatStart,
+    doSetNamedRange,
+    doPossessiveStar,
+    doEscapedLiteralChar,
+    doSetBackslash_s,
+    doBackslashz,
+    doDotAny,
    rbbiLastAction};

 //-------------------------------------------------------------------------------
@ -106,21 +132,21 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    {doNOP, 0, 0, 0, TRUE}
    , {doPatStart, 255, 2,0,  FALSE}     //  1      start
    , {doLiteralChar, 254, 14,0,  TRUE}     //  2      term
-    , {doLiteralChar, 130, 14,0,  TRUE}     //  3 
-    , {doScanUnicodeSet, 91 /* [ */, 14,0,  TRUE}     //  4 
+    , {doLiteralChar, 129, 14,0,  TRUE}     //  3 
+    , {doSetBegin, 91 /* [ */, 104, 182, TRUE}     //  4 
    , {doNOP, 40 /* ( */, 27,0,  TRUE}     //  5 
    , {doDotAny, 46 /* . */, 14,0,  TRUE}     //  6 
-    , {doCaret, 94 /* ^ */, 2,0,  TRUE}     //  7 
-    , {doDollar, 36 /* $ */, 2,0,  TRUE}     //  8 
-    , {doNOP, 92 /* \ */, 81,0,  TRUE}     //  9 
+    , {doCaret, 94 /* ^ */, 14,0,  TRUE}     //  7 
+    , {doDollar, 36 /* $ */, 14,0,  TRUE}     //  8 
+    , {doNOP, 92 /* \ */, 84,0,  TRUE}     //  9 
    , {doOrOperator, 124 /* | */, 2,0,  TRUE}     //  10 
    , {doCloseParen, 41 /* ) */, 255,0,  TRUE}     //  11 
    , {doPatFinish, 253, 2,0,  FALSE}     //  12 
-    , {doRuleError, 255, 101,0,  FALSE}     //  13 
-    , {doNOP, 42 /* * */, 59,0,  TRUE}     //  14      expr-quant
-    , {doNOP, 43 /* + */, 62,0,  TRUE}     //  15 
-    , {doNOP, 63 /* ? */, 65,0,  TRUE}     //  16 
-    , {doIntervalInit, 123 /* { */, 68,0,  TRUE}     //  17 
+    , {doRuleError, 255, 183,0,  FALSE}     //  13 
+    , {doNOP, 42 /* * */, 63,0,  TRUE}     //  14      expr-quant
+    , {doNOP, 43 /* + */, 66,0,  TRUE}     //  15 
+    , {doNOP, 63 /* ? */, 69,0,  TRUE}     //  16 
+    , {doIntervalInit, 123 /* { */, 72,0,  TRUE}     //  17 
    , {doNOP, 40 /* ( */, 23,0,  TRUE}     //  18 
    , {doNOP, 255, 20,0,  FALSE}     //  19 
    , {doOrOperator, 124 /* | */, 2,0,  TRUE}     //  20      expr-cont
@ -128,7 +154,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doNOP, 255, 2,0,  FALSE}     //  22 
    , {doSuppressComments, 63 /* ? */, 25,0,  TRUE}     //  23      open-paren-quant
    , {doNOP, 255, 27,0,  FALSE}     //  24 
-    , {doNOP, 35 /* # */, 47, 14, TRUE}     //  25      open-paren-quant2
+    , {doNOP, 35 /* # */, 49, 14, TRUE}     //  25      open-paren-quant2
    , {doNOP, 255, 29,0,  FALSE}     //  26 
    , {doSuppressComments, 63 /* ? */, 29,0,  TRUE}     //  27      open-paren
    , {doOpenCaptureParen, 255, 2, 14, FALSE}     //  28 
@ -136,75 +162,157 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
    , {doOpenAtomicParen, 62 /* > */, 2, 14, TRUE}     //  30 
    , {doOpenLookAhead, 61 /* = */, 2, 20, TRUE}     //  31 
    , {doOpenLookAheadNeg, 33 /* ! */, 2, 20, TRUE}     //  32 
-    , {doNOP, 60 /* < */, 44,0,  TRUE}     //  33 
-    , {doNOP, 35 /* # */, 47, 2, TRUE}     //  34 
-    , {doBeginMatchMode, 105 /* i */, 50,0,  FALSE}     //  35 
-    , {doBeginMatchMode, 109 /* m */, 50,0,  FALSE}     //  36 
-    , {doBeginMatchMode, 115 /* s */, 50,0,  FALSE}     //  37 
-    , {doBeginMatchMode, 119 /* w */, 50,0,  FALSE}     //  38 
-    , {doBeginMatchMode, 120 /* x */, 50,0,  FALSE}     //  39 
-    , {doBeginMatchMode, 45 /* - */, 50,0,  FALSE}     //  40 
-    , {doConditionalExpr, 40 /* ( */, 101,0,  TRUE}     //  41 
-    , {doPerlInline, 123 /* { */, 101,0,  TRUE}     //  42 
-    , {doBadOpenParenType, 255, 101,0,  FALSE}     //  43 
-    , {doOpenLookBehind, 61 /* = */, 2, 20, TRUE}     //  44      open-paren-lookbehind
-    , {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE}     //  45 
-    , {doBadOpenParenType, 255, 101,0,  FALSE}     //  46 
-    , {doNOP, 41 /* ) */, 255,0,  TRUE}     //  47      paren-comment
-    , {doMismatchedParenErr, 253, 101,0,  FALSE}     //  48 
-    , {doNOP, 255, 47,0,  TRUE}     //  49 
-    , {doMatchMode, 105 /* i */, 50,0,  TRUE}     //  50      paren-flag
-    , {doMatchMode, 109 /* m */, 50,0,  TRUE}     //  51 
-    , {doMatchMode, 115 /* s */, 50,0,  TRUE}     //  52 
-    , {doMatchMode, 119 /* w */, 50,0,  TRUE}     //  53 
-    , {doMatchMode, 120 /* x */, 50,0,  TRUE}     //  54 
-    , {doMatchMode, 45 /* - */, 50,0,  TRUE}     //  55 
-    , {doSetMatchMode, 41 /* ) */, 2,0,  TRUE}     //  56 
-    , {doMatchModeParen, 58 /* : */, 2, 14, TRUE}     //  57 
-    , {doBadModeFlag, 255, 101,0,  FALSE}     //  58 
-    , {doNGStar, 63 /* ? */, 20,0,  TRUE}     //  59      quant-star
-    , {doPossessiveStar, 43 /* + */, 20,0,  TRUE}     //  60 
-    , {doStar, 255, 20,0,  FALSE}     //  61 
-    , {doNGPlus, 63 /* ? */, 20,0,  TRUE}     //  62      quant-plus
-    , {doPossessivePlus, 43 /* + */, 20,0,  TRUE}     //  63 
-    , {doPlus, 255, 20,0,  FALSE}     //  64 
-    , {doNGOpt, 63 /* ? */, 20,0,  TRUE}     //  65      quant-opt
-    , {doPossessiveOpt, 43 /* + */, 20,0,  TRUE}     //  66 
-    , {doOpt, 255, 20,0,  FALSE}     //  67 
-    , {doNOP, 129, 68,0,  TRUE}     //  68      interval-open
-    , {doNOP, 128, 71,0,  FALSE}     //  69 
-    , {doIntervalError, 255, 101,0,  FALSE}     //  70 
-    , {doIntevalLowerDigit, 128, 71,0,  TRUE}     //  71      interval-lower
-    , {doNOP, 44 /* , */, 75,0,  TRUE}     //  72 
-    , {doIntervalSame, 125 /* } */, 78,0,  TRUE}     //  73 
-    , {doIntervalError, 255, 101,0,  FALSE}     //  74 
-    , {doIntervalUpperDigit, 128, 75,0,  TRUE}     //  75      interval-upper
-    , {doNOP, 125 /* } */, 78,0,  TRUE}     //  76 
-    , {doIntervalError, 255, 101,0,  FALSE}     //  77 
-    , {doNGInterval, 63 /* ? */, 20,0,  TRUE}     //  78      interval-type
-    , {doPossessiveInterval, 43 /* + */, 20,0,  TRUE}     //  79 
-    , {doInterval, 255, 20,0,  FALSE}     //  80 
-    , {doBackslashA, 65 /* A */, 2,0,  TRUE}     //  81      backslash
-    , {doBackslashB, 66 /* B */, 2,0,  TRUE}     //  82 
-    , {doBackslashb, 98 /* b */, 2,0,  TRUE}     //  83 
-    , {doBackslashd, 100 /* d */, 14,0,  TRUE}     //  84 
-    , {doBackslashD, 68 /* D */, 14,0,  TRUE}     //  85 
-    , {doBackslashG, 71 /* G */, 2,0,  TRUE}     //  86 
-    , {doProperty, 78 /* N */, 14,0,  FALSE}     //  87 
-    , {doProperty, 112 /* p */, 14,0,  FALSE}     //  88 
-    , {doProperty, 80 /* P */, 14,0,  FALSE}     //  89 
-    , {doEnterQuoteMode, 81 /* Q */, 2,0,  TRUE}     //  90 
-    , {doBackslashS, 83 /* S */, 14,0,  TRUE}     //  91 
-    , {doBackslashs, 115 /* s */, 14,0,  TRUE}     //  92 
-    , {doBackslashW, 87 /* W */, 14,0,  TRUE}     //  93 
-    , {doBackslashw, 119 /* w */, 14,0,  TRUE}     //  94 
-    , {doBackslashX, 88 /* X */, 14,0,  TRUE}     //  95 
-    , {doBackslashZ, 90 /* Z */, 2,0,  TRUE}     //  96 
-    , {doBackslashz, 122 /* z */, 2,0,  TRUE}     //  97 
-    , {doBackRef, 128, 14,0,  TRUE}     //  98 
-    , {doEscapeError, 253, 101,0,  FALSE}     //  99 
-    , {doLiteralChar, 255, 14,0,  TRUE}     //  100 
-    , {doExit, 255, 101,0,  TRUE}     //  101      errorDeath
+    , {doNOP, 60 /* < */, 46,0,  TRUE}     //  33 
+    , {doNOP, 35 /* # */, 49, 2, TRUE}     //  34 
+    , {doBeginMatchMode, 105 /* i */, 52,0,  FALSE}     //  35 
+    , {doBeginMatchMode, 100 /* d */, 52,0,  FALSE}     //  36 
+    , {doBeginMatchMode, 109 /* m */, 52,0,  FALSE}     //  37 
+    , {doBeginMatchMode, 115 /* s */, 52,0,  FALSE}     //  38 
+    , {doBeginMatchMode, 117 /* u */, 52,0,  FALSE}     //  39 
+    , {doBeginMatchMode, 119 /* w */, 52,0,  FALSE}     //  40 
+    , {doBeginMatchMode, 120 /* x */, 52,0,  FALSE}     //  41 
+    , {doBeginMatchMode, 45 /* - */, 52,0,  FALSE}     //  42 
+    , {doConditionalExpr, 40 /* ( */, 183,0,  TRUE}     //  43 
+    , {doPerlInline, 123 /* { */, 183,0,  TRUE}     //  44 
+    , {doBadOpenParenType, 255, 183,0,  FALSE}     //  45 
+    , {doOpenLookBehind, 61 /* = */, 2, 20, TRUE}     //  46      open-paren-lookbehind
+    , {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE}     //  47 
+    , {doBadOpenParenType, 255, 183,0,  FALSE}     //  48 
+    , {doNOP, 41 /* ) */, 255,0,  TRUE}     //  49      paren-comment
+    , {doMismatchedParenErr, 253, 183,0,  FALSE}     //  50 
+    , {doNOP, 255, 49,0,  TRUE}     //  51 
+    , {doMatchMode, 105 /* i */, 52,0,  TRUE}     //  52      paren-flag
+    , {doMatchMode, 100 /* d */, 52,0,  TRUE}     //  53 
+    , {doMatchMode, 109 /* m */, 52,0,  TRUE}     //  54 
+    , {doMatchMode, 115 /* s */, 52,0,  TRUE}     //  55 
+    , {doMatchMode, 117 /* u */, 52,0,  TRUE}     //  56 
+    , {doMatchMode, 119 /* w */, 52,0,  TRUE}     //  57 
+    , {doMatchMode, 120 /* x */, 52,0,  TRUE}     //  58 
+    , {doMatchMode, 45 /* - */, 52,0,  TRUE}     //  59 
+    , {doSetMatchMode, 41 /* ) */, 2,0,  TRUE}     //  60 
+    , {doMatchModeParen, 58 /* : */, 2, 14, TRUE}     //  61 
+    , {doBadModeFlag, 255, 183,0,  FALSE}     //  62 
+    , {doNGStar, 63 /* ? */, 20,0,  TRUE}     //  63      quant-star
+    , {doPossessiveStar, 43 /* + */, 20,0,  TRUE}     //  64 
+    , {doStar, 255, 20,0,  FALSE}     //  65 
+    , {doNGPlus, 63 /* ? */, 20,0,  TRUE}     //  66      quant-plus
+    , {doPossessivePlus, 43 /* + */, 20,0,  TRUE}     //  67 
+    , {doPlus, 255, 20,0,  FALSE}     //  68 
+    , {doNGOpt, 63 /* ? */, 20,0,  TRUE}     //  69      quant-opt
+    , {doPossessiveOpt, 43 /* + */, 20,0,  TRUE}     //  70 
+    , {doOpt, 255, 20,0,  FALSE}     //  71 
+    , {doNOP, 128, 74,0,  FALSE}     //  72      interval-open
+    , {doIntervalError, 255, 183,0,  FALSE}     //  73 
+    , {doIntevalLowerDigit, 128, 74,0,  TRUE}     //  74      interval-lower
+    , {doNOP, 44 /* , */, 78,0,  TRUE}     //  75 
+    , {doIntervalSame, 125 /* } */, 81,0,  TRUE}     //  76 
+    , {doIntervalError, 255, 183,0,  FALSE}     //  77 
+    , {doIntervalUpperDigit, 128, 78,0,  TRUE}     //  78      interval-upper
+    , {doNOP, 125 /* } */, 81,0,  TRUE}     //  79 
+    , {doIntervalError, 255, 183,0,  FALSE}     //  80 
+    , {doNGInterval, 63 /* ? */, 20,0,  TRUE}     //  81      interval-type
+    , {doPossessiveInterval, 43 /* + */, 20,0,  TRUE}     //  82 
+    , {doInterval, 255, 20,0,  FALSE}     //  83 
+    , {doBackslashA, 65 /* A */, 2,0,  TRUE}     //  84      backslash
+    , {doBackslashB, 66 /* B */, 2,0,  TRUE}     //  85 
+    , {doBackslashb, 98 /* b */, 2,0,  TRUE}     //  86 
+    , {doBackslashd, 100 /* d */, 14,0,  TRUE}     //  87 
+    , {doBackslashD, 68 /* D */, 14,0,  TRUE}     //  88 
+    , {doBackslashG, 71 /* G */, 2,0,  TRUE}     //  89 
+    , {doNamedChar, 78 /* N */, 14,0,  FALSE}     //  90 
+    , {doProperty, 112 /* p */, 14,0,  FALSE}     //  91 
+    , {doProperty, 80 /* P */, 14,0,  FALSE}     //  92 
+    , {doEnterQuoteMode, 81 /* Q */, 2,0,  TRUE}     //  93 
+    , {doBackslashS, 83 /* S */, 14,0,  TRUE}     //  94 
+    , {doBackslashs, 115 /* s */, 14,0,  TRUE}     //  95 
+    , {doBackslashW, 87 /* W */, 14,0,  TRUE}     //  96 
+    , {doBackslashw, 119 /* w */, 14,0,  TRUE}     //  97 
+    , {doBackslashX, 88 /* X */, 14,0,  TRUE}     //  98 
+    , {doBackslashZ, 90 /* Z */, 2,0,  TRUE}     //  99 
+    , {doBackslashz, 122 /* z */, 2,0,  TRUE}     //  100 
+    , {doBackRef, 128, 14,0,  TRUE}     //  101 
+    , {doEscapeError, 253, 183,0,  FALSE}     //  102 
+    , {doEscapedLiteralChar, 255, 14,0,  TRUE}     //  103 
+    , {doSetNegate, 94 /* ^ */, 107,0,  TRUE}     //  104      set-open
+    , {doSetPosixProp, 58 /* : */, 109,0,  FALSE}     //  105 
+    , {doNOP, 255, 107,0,  FALSE}     //  106 
+    , {doSetLiteral, 93 /* ] */, 122,0,  TRUE}     //  107      set-open2
+    , {doNOP, 255, 112,0,  FALSE}     //  108 
+    , {doSetEnd, 93 /* ] */, 255,0,  TRUE}     //  109      set-posix
+    , {doNOP, 58 /* : */, 112,0,  FALSE}     //  110 
+    , {doRuleError, 255, 183,0,  FALSE}     //  111 
+    , {doSetEnd, 93 /* ] */, 255,0,  TRUE}     //  112      set-start
+    , {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE}     //  113 
+    , {doNOP, 92 /* \ */, 172,0,  TRUE}     //  114 
+    , {doNOP, 45 /* - */, 118,0,  TRUE}     //  115 
+    , {doNOP, 38 /* & */, 120,0,  TRUE}     //  116 
+    , {doSetLiteral, 255, 122,0,  TRUE}     //  117 
+    , {doRuleError, 45 /* - */, 183,0,  FALSE}     //  118      set-start-dash
+    , {doSetAddDash, 255, 122,0,  FALSE}     //  119 
+    , {doRuleError, 38 /* & */, 183,0,  FALSE}     //  120      set-start-amp
+    , {doSetAddAmp, 255, 122,0,  FALSE}     //  121 
+    , {doSetEnd, 93 /* ] */, 255,0,  TRUE}     //  122      set-after-lit
+    , {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE}     //  123 
+    , {doNOP, 45 /* - */, 159,0,  TRUE}     //  124 
+    , {doNOP, 38 /* & */, 150,0,  TRUE}     //  125 
+    , {doNOP, 92 /* \ */, 172,0,  TRUE}     //  126 
+    , {doSetNoCloseError, 253, 183,0,  FALSE}     //  127 
+    , {doSetLiteral, 255, 122,0,  TRUE}     //  128 
+    , {doSetEnd, 93 /* ] */, 255,0,  TRUE}     //  129      set-after-set
+    , {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE}     //  130 
+    , {doNOP, 45 /* - */, 152,0,  TRUE}     //  131 
+    , {doNOP, 38 /* & */, 147,0,  TRUE}     //  132 
+    , {doNOP, 92 /* \ */, 172,0,  TRUE}     //  133 
+    , {doSetNoCloseError, 253, 183,0,  FALSE}     //  134 
+    , {doSetLiteral, 255, 122,0,  TRUE}     //  135 
+    , {doSetEnd, 93 /* ] */, 255,0,  TRUE}     //  136      set-after-range
+    , {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE}     //  137 
+    , {doNOP, 45 /* - */, 155,0,  TRUE}     //  138 
+    , {doNOP, 38 /* & */, 157,0,  TRUE}     //  139 
+    , {doNOP, 92 /* \ */, 172,0,  TRUE}     //  140 
+    , {doSetNoCloseError, 253, 183,0,  FALSE}     //  141 
+    , {doSetLiteral, 255, 122,0,  TRUE}     //  142 
+    , {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE}     //  143      set-after-op
+    , {doSetOpError, 93 /* ] */, 183,0,  FALSE}     //  144 
+    , {doNOP, 92 /* \ */, 172,0,  TRUE}     //  145 
+    , {doSetLiteral, 255, 122,0,  TRUE}     //  146 
+    , {doSetBeginIntersection1, 91 /* [ */, 104, 129, TRUE}     //  147      set-set-amp
+    , {doSetIntersection2, 38 /* & */, 143,0,  TRUE}     //  148 
+    , {doSetAddAmp, 255, 122,0,  FALSE}     //  149 
+    , {doSetIntersection2, 38 /* & */, 143,0,  TRUE}     //  150      set-lit-amp
+    , {doSetAddAmp, 255, 122,0,  FALSE}     //  151 
+    , {doSetBeginDifference1, 91 /* [ */, 104, 129, TRUE}     //  152      set-set-dash
+    , {doSetDifference2, 45 /* - */, 143,0,  TRUE}     //  153 
+    , {doSetAddDash, 255, 122,0,  FALSE}     //  154 
+    , {doSetDifference2, 45 /* - */, 143,0,  TRUE}     //  155      set-range-dash
+    , {doSetAddDash, 255, 122,0,  FALSE}     //  156 
+    , {doSetIntersection2, 38 /* & */, 143,0,  TRUE}     //  157      set-range-amp
+    , {doSetAddAmp, 255, 122,0,  FALSE}     //  158 
+    , {doSetDifference2, 45 /* - */, 143,0,  TRUE}     //  159      set-lit-dash
+    , {doSetAddDash, 91 /* [ */, 122,0,  FALSE}     //  160 
+    , {doSetAddDash, 93 /* ] */, 122,0,  FALSE}     //  161 
+    , {doNOP, 92 /* \ */, 164,0,  TRUE}     //  162 
+    , {doSetRange, 255, 136,0,  TRUE}     //  163 
+    , {doSetOpError, 115 /* s */, 183,0,  FALSE}     //  164      set-lit-dash-escape
+    , {doSetOpError, 83 /* S */, 183,0,  FALSE}     //  165 
+    , {doSetOpError, 119 /* w */, 183,0,  FALSE}     //  166 
+    , {doSetOpError, 87 /* W */, 183,0,  FALSE}     //  167 
+    , {doSetOpError, 100 /* d */, 183,0,  FALSE}     //  168 
+    , {doSetOpError, 68 /* D */, 183,0,  FALSE}     //  169 
+    , {doSetNamedRange, 78 /* N */, 136,0,  FALSE}     //  170 
+    , {doSetRange, 255, 136,0,  TRUE}     //  171 
+    , {doSetProp, 112 /* p */, 129,0,  FALSE}     //  172      set-escape
+    , {doSetProp, 80 /* P */, 129,0,  FALSE}     //  173 
+    , {doSetNamedChar, 78 /* N */, 122,0,  FALSE}     //  174 
+    , {doSetBackslash_s, 115 /* s */, 136,0,  TRUE}     //  175 
+    , {doSetBackslash_S, 83 /* S */, 136,0,  TRUE}     //  176 
+    , {doSetBackslash_w, 119 /* w */, 136,0,  TRUE}     //  177 
+    , {doSetBackslash_W, 87 /* W */, 136,0,  TRUE}     //  178 
+    , {doSetBackslash_d, 100 /* d */, 136,0,  TRUE}     //  179 
+    , {doSetBackslash_D, 68 /* D */, 136,0,  TRUE}     //  180 
+    , {doSetLiteralEscaped, 255, 122,0,  TRUE}     //  181 
+    , {doSetFinish, 255, 14,0,  FALSE}     //  182      set-finish
+    , {doExit, 255, 183,0,  TRUE}     //  183      errorDeath
 };
 static const char * const RegexStateNames[] = {    0,
     "start",
@ -249,6 +357,8 @@ static const char * const RegexStateNames[] = {    0,
    0,
    0,
    0,
+    0,
+    0,
    0,
     "open-paren-lookbehind",
    0,
@ -264,6 +374,8 @@ static const char * const RegexStateNames[] = {    0,
    0,
    0,
    0,
+    0,
+    0,
    0,
     "quant-star",
    0,
@ -275,7 +387,6 @@ static const char * const RegexStateNames[] = {    0,
    0,
    0,
     "interval-open",
-    0,
    0,
     "interval-lower",
    0,
@ -307,6 +418,85 @@ static const char * const RegexStateNames[] = {    0,
    0,
    0,
    0,
+     "set-open",
+    0,
+    0,
+     "set-open2",
+    0,
+     "set-posix",
+    0,
+    0,
+     "set-start",
+    0,
+    0,
+    0,
+    0,
+    0,
+     "set-start-dash",
+    0,
+     "set-start-amp",
+    0,
+     "set-after-lit",
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+     "set-after-set",
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+     "set-after-range",
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+     "set-after-op",
+    0,
+    0,
+    0,
+     "set-set-amp",
+    0,
+    0,
+     "set-lit-amp",
+    0,
+     "set-set-dash",
+    0,
+    0,
+     "set-range-dash",
+    0,
+     "set-range-amp",
+    0,
+     "set-lit-dash",
+    0,
+    0,
+    0,
+    0,
+     "set-lit-dash-escape",
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+     "set-escape",
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+     "set-finish",
     "errorDeath",
    0};

--- a/icu4c/source/i18n/regexcst.pl
+++ b/icu4c/source/i18n/regexcst.pl
@ -1,7 +1,7 @@
 #!/usr/bin/perl
 #  ********************************************************************
 #  * COPYRIGHT:
-#  * Copyright (c) 2002-2003, International Business Machines Corporation and
+#  * Copyright (c) 2002-2007, International Business Machines Corporation and
 #  * others. All Rights Reserved.
 #  ********************************************************************
 #
@ -22,10 +22,6 @@
 #             for the Rule Based Break Iterator Rule Parser.  Perhaps they could be
 #             merged?
 #
-#*********************************************************************
-#   Copyright (C) 2002 International Business Machines Corporation   *
-#   and others. All rights reserved.                                 *
-#*********************************************************************


 $num_states = 1;         # Always the state number for the line being compiled.
@ -210,7 +206,7 @@ print "//    This file contains the state table for the ICU Regular Expression P
 print "//    It is generated by the Perl script \"regexcst.pl\" from\n";
 print "//    the rule parser state definitions file \"regexcst.txt\".\n";
 print "//\n";
-print "//   Copyright (C) 2002-2003 International Business Machines Corporation \n";
+print "//   Copyright (C) 2002-2007 International Business Machines Corporation \n";
 print "//   and others. All rights reserved.  \n";
 print "//\n";
 print "//---------------------------------------------------------------------------------\n";
--- a/icu4c/source/i18n/regexcst.txt
+++ b/icu4c/source/i18n/regexcst.txt
@ -1,7 +1,7 @@

 #*****************************************************************************
 #
-#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
+#   Copyright (C) 2002-2007, International Business Machines Corporation and others.
 #   All Rights Reserved.
 #
 #*****************************************************************************
@ -25,8 +25,8 @@
 #
 #
 #StateName:
-#   input-char           n next-state           ^push-state     action    
-#   input-char           n next-state           ^push-state     action    
+#   input-char           n next-state           ^push-state     action
+#   input-char           n next-state           ^push-state     action
 #       |                |   |                      |             |
 #       |                |   |                      |             |--- action to be performed by state machine
 #       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
@ -46,7 +46,7 @@
 #            matches, peform the actions and go to the state specified on this line.
 #            The input character is tested sequentally, in the order written.  The characters and
 #            character classes tested for do not need to be mutually exclusive.  The first match wins.
-#            
+#



@ -56,27 +56,27 @@
 #
 start:
   default                 term                                     doPatStart
-    

-    
-    
+
+
+
 #
 #  term.  At a position where we can accept the start most items in a pattern.
 #
 term:
    quoted               n expr-quant                               doLiteralChar
    rule_char            n expr-quant                               doLiteralChar
-    '['                  n expr-quant                               doScanUnicodeSet
-    '('                  n open-paren                     
+    '['                  n set-open       ^set-finish               doSetBegin
+    '('                  n open-paren
    '.'                  n expr-quant                               doDotAny
-    '^'                  n term                                     doCaret
-    '$'                  n term                                     doDollar
+    '^'                  n expr-quant                               doCaret
+    '$'                  n expr-quant                               doDollar
    '\'                  n backslash
    '|'                  n  term                                    doOrOperator
    ')'                  n  pop                                     doCloseParen
    eof	                   term                                     doPatFinish
    default                errorDeath                               doRuleError
-    
+


 #
@ -84,14 +84,14 @@ term:
 #                 trailing quantifier - *, +, ?, *?,  etc.
 #
 expr-quant:
-    '*'                  n  quant-star                       
-    '+'                  n  quant-plus                              
-    '?'                  n  quant-opt     
+    '*'                  n  quant-star
+    '+'                  n  quant-plus
+    '?'                  n  quant-opt
    '{'                  n  interval-open                          doIntervalInit
    '('                  n  open-paren-quant
-    default                 expr-cont 
-    
-    
+    default                 expr-cont
+
+
 #
 #  expr-cont      Expression, continuation.  At a point where additional terms are
 #                                            allowed, but not required.  No Quantifiers
@ -99,8 +99,8 @@ expr-quant:
 expr-cont:
    '|'                  n  term                                    doOrOperator
    ')'                  n  pop                                     doCloseParen
-    default                 term                                    
-    
+    default                 term
+

 #
 #   open-paren-quant   Special case handling for comments appearing before a quantifier,
@ -111,12 +111,12 @@ expr-cont:
 open-paren-quant:
    '?'                  n  open-paren-quant2                      doSuppressComments
    default                 open-paren
-    
+
 open-paren-quant2:
    '#'                  n  paren-comment   ^expr-quant
    default                 open-paren-extended
-    
- 
+
+
 #
 #   open-paren    We've got an open paren.  We need to scan further to
 #                 determine what kind of quantifier it is - plain (, (?:, (?>, or whatever.
@ -124,7 +124,7 @@ open-paren-quant2:
 open-paren:
    '?'                  n  open-paren-extended                     doSuppressComments
    default                 term            ^expr-quant             doOpenCaptureParen
-    
+
 open-paren-extended:
    ':'                  n  term            ^expr-quant             doOpenNonCaptureParen  #  (?:
    '>'                  n  term            ^expr-quant             doOpenAtomicParen      #  (?>
@ -133,24 +133,25 @@ open-paren-extended:
    '<'                  n  open-paren-lookbehind
    '#'                  n  paren-comment   ^term
    'i'                     paren-flag                              doBeginMatchMode
+    'd'                     paren-flag                              doBeginMatchMode
    'm'                     paren-flag                              doBeginMatchMode
    's'                     paren-flag                              doBeginMatchMode
+    'u'                     paren-flag                              doBeginMatchMode
    'w'                     paren-flag                              doBeginMatchMode
    'x'                     paren-flag                              doBeginMatchMode
    '-'                     paren-flag                              doBeginMatchMode
    '('                  n  errorDeath                              doConditionalExpr
    '{'                  n  errorDeath                              doPerlInline
    default                 errorDeath                              doBadOpenParenType
-    
+
 open-paren-lookbehind:
    '='                  n  term            ^expr-cont              doOpenLookBehind       #  (?<=
    '!'                  n  term            ^expr-cont              doOpenLookBehindNeg    #  (?<!
    default                 errorDeath                              doBadOpenParenType
-    
+

 #
 #   paren-comment    We've got a (?# ... )  style comment.  Eat pattern text till we get to the ')'
-#                    TODO:  should parens nest here?  Check what perl does.
 #
 paren-comment:
    ')'                  n  pop
@ -158,20 +159,22 @@ paren-comment:
    default              n  paren-comment

 #
-#  paren-flag    Scanned a (?ismx-ismx  flag setting 
-#                 
+#  paren-flag    Scanned a (?ismx-ismx  flag setting
+#
 paren-flag:
    'i'                  n  paren-flag                              doMatchMode
+    'd'                  n  paren-flag                              doMatchMode
    'm'                  n  paren-flag                              doMatchMode
    's'                  n  paren-flag                              doMatchMode
+    'u'                  n  paren-flag                              doMatchMode
    'w'                  n  paren-flag                              doMatchMode
    'x'                  n  paren-flag                              doMatchMode
    '-'                  n  paren-flag                              doMatchMode
    ')'                  n  term                                    doSetMatchMode
    ':'                  n  term              ^expr-quant           doMatchModeParen
    default                 errorDeath                              doBadModeFlag
-    
-    
+
+
 #
 #  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
 #                 between plain '*', '*?', '*+'
@ -204,13 +207,12 @@ quant-opt:

 #
 #   Interval         scanning a '{', the opening delimiter for an interval specification
-#                                   {number} or {min, max} or {min, }
+#                                   {number} or {min, max} or {min,}
 #
 interval-open:
-    white_space          n  interval-open                                  # TODO:  is white space allowed here in non-free mode?
-    digit_char              interval-lower                          
+    digit_char              interval-lower
    default                 errorDeath                              doIntervalError
-    
+
 interval-lower:
    digit_char           n  interval-lower                          doIntevalLowerDigit
    ','			         n  interval-upper
@ -221,13 +223,13 @@ interval-upper:
    digit_char           n  interval-upper                          doIntervalUpperDigit
    '}'                  n  interval-type
    default                 errorDeath                              doIntervalError
-    
+
 interval-type:
    '?'                  n  expr-cont                               doNGInterval                # {n,m}?
    '+'                  n  expr-cont                               doPossessiveInterval        # {n,m}+
    default                 expr-cont                               doInterval                  # {m,n}
-    
-    
+
+
 #
 #  backslash        #  Backslash.  Figure out which of the \thingies we have encountered.
 #                                  The low level next-char function will have preprocessed
@ -239,7 +241,7 @@ backslash:
   'd'                   n  expr-quant                              doBackslashd
   'D'                   n  expr-quant                              doBackslashD
   'G'                   n  term                                    doBackslashG
-   'N'                      expr-quant                              doProperty       #   \N{NAME}  named char
+   'N'                      expr-quant                              doNamedChar      #   \N{NAME}  named char
   'p'                      expr-quant                              doProperty       #   \p{Lu}  style property
   'P'                      expr-quant                              doProperty
   'Q'                   n  term                                    doEnterQuoteMode
@ -250,11 +252,210 @@ backslash:
   'X'                   n  expr-quant                              doBackslashX
   'Z'                   n  term                                    doBackslashZ
   'z'                   n  term                                    doBackslashz
-   digit_char	         n  expr-quant                              doBackRef         #  Will scan multiple digits
+   digit_char            n  expr-quant                              doBackRef         #  Will scan multiple digits
   eof                      errorDeath                              doEscapeError
-   default               n  expr-quant		                    doLiteralChar     #  Escaped literal char.		       
+   default               n  expr-quant                              doEscapedLiteralChar

+
+
+#
+# [set expression] parsing,
+#    All states involved in parsing set expressions have names beginning with "set-"
+#
+
+set-open:
+   '^'                   n  set-open2                               doSetNegate
+   ':'                      set-posix                               doSetPosixProp
+   default                  set-open2
+
+set-open2:
+   ']'                   n  set-after-lit                           doSetLiteral
+   default                  set-start
+
+#  set-posix:
+#                  scanned a '[:'  If it really is a [:property:], doSetPosixProp will have
+#                  moved the scan to the closing ']'.  If it wasn't a property
+#                  expression, the scan will still be at the opening ':', which should
+#                  be interpreted as a normal set expression.
+set-posix:
+    ']'                  n   pop                                    doSetEnd
+    ':'                      set-start
+    default                  errorDeath                             doRuleError  # should not be possible.
+
+#
+#   set-start   after the [ and special case leading characters (^ and/or ]) but before
+#               everything else.   A '-' is literal at this point.
+#
+set-start:
+    ']'                  n  pop                                     doSetEnd
+    '['                  n  set-open      ^set-after-set            doSetBeginUnion
+    '\'                  n  set-escape
+    '-'                  n  set-start-dash
+    '&'                  n  set-start-amp
+    default              n  set-after-lit                           doSetLiteral
+
+#    set-start-dash    Turn "[--" into a syntax error.
+#                           "[-x" is good, - and x are literals.
+#
+set-start-dash:
+    '-'                     errorDeath                              doRuleError
+    default                 set-after-lit                           doSetAddDash
+
+#    set-start-amp     Turn "[&&" into a syntax error.
+#                           "[&x" is good, & and x are literals.
+#
+set-start-amp:
+    '&'                     errorDeath                              doRuleError
+    default                 set-after-lit                           doSetAddAmp
+
+#
+#   set-after-lit    The last thing scanned was a literal character within a set.
+#                    Can be followed by anything.  Single '-' or '&' are
+#                    literals in this context, not operators.
+set-after-lit:
+    ']'                  n  pop                                     doSetEnd
+    '['                  n  set-open      ^set-after-set            doSetBeginUnion
+    '-'                  n  set-lit-dash
+    '&'                  n  set-lit-amp
+    '\'                  n  set-escape
+    eof                     errorDeath                              doSetNoCloseError
+    default              n  set-after-lit                           doSetLiteral
+
+set-after-set:
+    ']'                  n  pop                                     doSetEnd
+    '['                  n  set-open      ^set-after-set            doSetBeginUnion
+    '-'                  n  set-set-dash
+    '&'                  n  set-set-amp
+    '\'                  n  set-escape
+    eof                     errorDeath                              doSetNoCloseError
+    default              n  set-after-lit                           doSetLiteral
+
+set-after-range:
+    ']'                  n  pop                                     doSetEnd
+    '['                  n  set-open      ^set-after-set            doSetBeginUnion
+    '-'                  n  set-range-dash
+    '&'                  n  set-range-amp
+    '\'                  n  set-escape
+    eof                     errorDeath                              doSetNoCloseError
+    default              n  set-after-lit                           doSetLiteral
    
+
+# set-after-op
+#     After a --  or &&
+#     It is an error to close a set at this point.
+#
+set-after-op:
+    '['                  n  set-open         ^set-after-set         doSetBeginUnion
+    ']'                     errorDeath                              doSetOpError
+    '\'                  n  set-escape
+    default              n  set-after-lit                           doSetLiteral
+
+#
+#   set-set-amp
+#      Have scanned [[set]&
+#      Could be a '&' intersection operator, if a set follows.
+#      Could be the start of a '&&' operator.
+#      Otherwise it is a literal.
+set-set-amp:
+    '['                  n  set-open      ^set-after-set           doSetBeginIntersection1
+    '&'                  n  set-after-op                           doSetIntersection2
+    default                 set-after-lit                          doSetAddAmp
+
+
+# set-lit-amp   Have scanned "[literals&"
+#               Could be a start of "&&" operator or a literal
+#               In [abc&[def]],   the '&' is a literal
+#
+set-lit-amp:
+    '&'                  n  set-after-op                            doSetIntersection2
+    default                 set-after-lit                           doSetAddAmp
+
+
+#
+#  set-set-dash
+#      Have scanned [set]-
+#      Could be a '-' difference operator, if a [set] follows.
+#      Could be the start of a '--' operator.
+#      Otherwise it is a literal.
+set-set-dash:
+    '['                  n  set-open      ^set-after-set           doSetBeginDifference1
+    '-'                  n  set-after-op                           doSetDifference2
+    default                 set-after-lit                          doSetAddDash
+
+
+#
+#  set-range-dash
+#      scanned  a-b-  or \w-
+#         any set or range like item where the trailing single '-' should
+#         be literal, not a set difference operation.
+#         A trailing "--" is still a difference operator.
+set-range-dash:
+    '-'                  n  set-after-op                           doSetDifference2
+    default                 set-after-lit                          doSetAddDash
+
+
+set-range-amp:
+    '&'                  n  set-after-op                           doSetIntersection2
+    default                 set-after-lit                          doSetAddAmp
+
+
+#  set-lit-dash
+#     Have scanned "[literals-" Could be a range or a -- operator or a literal
+#     In [abc-[def]], the '-' is a literal (confirmed with a Java test)
+#        [abc-\p{xx}] the '-' is an error
+#        [abc-]       the '-' is a literal
+#        [ab-xy]      the '-' is a range
+#
+set-lit-dash:
+    '-'                  n  set-after-op                            doSetDifference2
+    '['                     set-after-lit                           doSetAddDash
+    ']'                     set-after-lit                           doSetAddDash
+    '\'                  n  set-lit-dash-escape
+    default              n  set-after-range                         doSetRange
+
+# set-lit-dash-escape
+#
+#    scanned "[literal-\"
+#    Could be a range, if the \ introduces an escaped literal char or a named char.
+#    Otherwise it is an error.
+#
+set-lit-dash-escape:
+   's'                      errorDeath                             doSetOpError
+   'S'                      errorDeath                             doSetOpError
+   'w'                      errorDeath                             doSetOpError
+   'W'                      errorDeath                             doSetOpError
+   'd'                      errorDeath                             doSetOpError
+   'D'                      errorDeath                             doSetOpError
+   'N'                      set-after-range                        doSetNamedRange
+   default               n  set-after-range                        doSetRange
+
+   
+#
+#  set-escape
+#       Common back-slash escape processing within set expressions
+#
+set-escape:
+   'p'                      set-after-set                           doSetProp
+   'P'                      set-after-set                           doSetProp
+   'N'                      set-after-lit                           doSetNamedChar
+   's'                   n  set-after-range                         doSetBackslash_s
+   'S'                   n  set-after-range                         doSetBackslash_S
+   'w'                   n  set-after-range                         doSetBackslash_w
+   'W'                   n  set-after-range                         doSetBackslash_W
+   'd'                   n  set-after-range                         doSetBackslash_d
+   'D'                   n  set-after-range                         doSetBackslash_D
+   default               n  set-after-lit                           doSetLiteralEscaped 
+
+#
+# set-finish
+#     Have just encountered the final ']' that completes a [set], and
+#     arrived here via a pop.  From here, we exit the set parsing world, and go
+#     back to generic regular expression parsing.
+#
+set-finish:
+    default                 expr-quant                              doSetFinish
+
+
 #
 # errorDeath.   This state is specified as the next state whenever a syntax error
 #               in the source rules is detected.  Barring bugs, the state machine will never
--- a/icu4c/source/i18n/regeximp.h
+++ b/icu4c/source/i18n/regeximp.h
@ -1,6 +1,6 @@
-// 
-//   Copyright (C) 2002-2005 International Business Machines Corporation 
-//   and others. All rights reserved.  
+//
+//   Copyright (C) 2002-2007 International Business Machines Corporation
+//   and others. All rights reserved.
 //
 //   file:  regeximp.h
 //
@ -57,7 +57,7 @@ U_NAMESPACE_BEGIN
 enum {
     URX_RESERVED_OP   = 0,    // For multi-operand ops, most non-first words.
     URX_RESERVED_OP_N = 255,  // For multi-operand ops, negative operand values.
-     URX_BACKTRACK     = 1,
+     URX_BACKTRACK     = 1,    // Force a backtrack, as if a match test had failed.
     URX_END           = 2,
     URX_ONECHAR       = 3,    // Value field is the 21 bit unicode char to match
     URX_STRING        = 4,    // Value field is index of string start
@ -66,16 +66,16 @@ enum {
     URX_NOP           = 7,
     URX_START_CAPTURE = 8,    // Value field is capture group number.
     URX_END_CAPTURE   = 9,    // Value field is capture group number
-     URX_STATIC_SETREF = 10,   // Value field is index of set in array of sets.   
+     URX_STATIC_SETREF = 10,   // Value field is index of set in array of sets.
     URX_SETREF        = 11,   // Value field is index of set in array of sets.
-     URX_DOTANY        = 12, 
+     URX_DOTANY        = 12,
     URX_JMP           = 13,   // Value field is destination position in
                                                    //   the pattern.
     URX_FAIL          = 14,   // Stop match operation,  No match.

     URX_JMP_SAV       = 15,   // Operand:  JMP destination location
     URX_BACKSLASH_B   = 16,   // Value field:  0:  \b    1:  \B
-     URX_BACKSLASH_G   = 17, 
+     URX_BACKSLASH_G   = 17,
     URX_JMP_SAV_X     = 18,   // Conditional JMP_SAV,
                               //    Used in (x)+, breaks loop on zero length match.
                               //    Operand:  Jmp destination.
@ -88,21 +88,22 @@ enum {
     URX_DOLLAR        = 24,  // Also for \Z

     URX_CTR_INIT      = 25,   // Counter Inits for {Interval} loops.
-     URX_CTR_INIT_NG   = 26,   //   3 kinds, normal, non-greedy, and possessive.
+     URX_CTR_INIT_NG   = 26,   //   2 kinds, normal and non-greedy.
                               //   These are 4 word opcodes.  See description.
                               //    First Operand:  Data loc of counter variable
-                               //    2nd   Operand:  Pat loc of the URX_CTR_LOOPx 
+                               //    2nd   Operand:  Pat loc of the URX_CTR_LOOPx
                               //                    at the end of the loop.
                               //    3rd   Operand:  Minimum count.
                               //    4th   Operand:  Max count, -1 for unbounded.

-     URX_DOTANY_PL     = 27,   // .+, match rest of the line.  Fail already at end.
+     URX_DOTANY_UNIX   = 27,   // '.' operator in UNIX_LINES mode, only \n marks end of line.

     URX_CTR_LOOP      = 28,   // Loop Ops for {interval} loops.
     URX_CTR_LOOP_NG   = 29,   //   Also in three flavors.
                               //   Operand is loc of corresponding CTR_INIT.

-     URX_DOTANY_ALL_PL = 30,   // .+, match rest of the Input.  Fail if already at end
+     URX_CARET_M_UNIX  = 30,   // '^' operator, test for start of line in multi-line
+                               //      plus UNIX_LINES mode.

     URX_RELOC_OPRND   = 31,   // Operand value in multi-operand ops that refers
                               //   back into compiled pattern code, and thus must
@ -118,7 +119,7 @@ enum {
                               //   within the matcher stack frame.
     URX_JMPX          = 36,  // Conditional JMP.
                               //   First Operand:  JMP target location.
-                               //   Second Operand:  Data location containing an 
+                               //   Second Operand:  Data location containing an
                               //     input position.  If current input position ==
                               //     saved input position, FAIL rather than taking
                               //     the JMP
@ -157,7 +158,7 @@ enum {
     URX_LBN_END       = 48,   // Negative LookBehind end
                               //   Parameter is the data location.
                               //   Check that the match ended at the right spot.
-     URX_STAT_SETREF_N = 49,   // Reference to a prebuilt set (e.g. \w), negated  
+     URX_STAT_SETREF_N = 49,   // Reference to a prebuilt set (e.g. \w), negated
                               //   Operand is index of set in array of sets.
     URX_LOOP_SR_I     = 50,   // Init a [set]* loop.
                               //   Operand is the sets index in array of user sets.
@ -166,12 +167,18 @@ enum {
                               //   Must always immediately follow  LOOP_x_I instruction.
     URX_LOOP_DOT_I    = 52,   // .*, initialization of the optimized loop.
                               //   Operand value:
-                               //      0:  Normal (. doesn't match new-line) mode.
-                               //      1:  . matches new-line mode.
-     URX_BACKSLASH_BU  = 53    // \b or \B in UREGEX_UWORD mode, using Unicode style
+                               //      bit 0:
+                               //         0:  Normal (. doesn't match new-line) mode.
+                               //         1:  . matches new-line mode.
+                               //      bit 1:  controls what new-lines are recognized by this operation.
+                               //         0:  All Unicode New-lines
+                               //         1:  UNIX_LINES, \u000a only.
+     URX_BACKSLASH_BU  = 53,   // \b or \B in UREGEX_UWORD mode, using Unicode style
                               //   word boundaries.
+     URX_DOLLAR_D      = 54,   // $ end of input test, in UNIX_LINES mode.
+     URX_DOLLAR_MD     = 55    // $ end of input test, in MULTI_LINE and UNIX_LINES mode.

-};           
+};

 // Keep this list of opcode names in sync with the above enum
 //   Used for debug printing only.
@ -203,10 +210,10 @@ enum {
        "DOLLAR",              \
        "CTR_INIT",            \
        "CTR_INIT_NG",         \
-        "DOTANY_PL",           \
+        "DOTANY_UNIX",         \
        "CTR_LOOP",            \
        "CTR_LOOP_NG",         \
-        "DOTANY_ALL_PL",       \
+        "URX_CARET_M_UNIX",    \
        "RELOC_OPRND",         \
        "STO_SP",              \
        "LD_SP",               \
@ -229,21 +236,23 @@ enum {
        "LOOP_SR_I",           \
        "LOOP_C",              \
        "LOOP_DOT_I",          \
-        "BACKSLASH_BU"
+        "BACKSLASH_BU",        \
+        "DOLLAR_D",            \
+        "DOLLAR_MD"


 //
 //  Convenience macros for assembling and disassembling a compiled operation.
 //
 #define URX_BUILD(type, val) (int32_t)((type << 24) | (val))
-#define URX_TYPE(x)          ((uint32_t)(x) >> 24) 
+#define URX_TYPE(x)          ((uint32_t)(x) >> 24)
 #define URX_VAL(x)           ((x) & 0xffffff)

-                
+
 //
 //  Access to Unicode Sets composite character properties
 //     The sets are accessed by the match engine for things like \w (word boundary)
-//     
+//
 enum {
     URX_ISWORD_SET  = 1,
     URX_ISALNUM_SET = 2,
@ -297,7 +306,7 @@ enum StartOfMatch {
                               (v)==START_LINE?    "START_LINE"    : \
                               (v)==START_STRING?  "START_STRING"  : \
                                                   "ILLEGAL")
-    
+

 //
 //  8 bit set, to fast-path latin-1 set membership tests.
--- a/icu4c/source/i18n/regexst.cpp
+++ b/icu4c/source/i18n/regexst.cpp
@ -59,9 +59,6 @@ static const UChar gRuleSet_rule_char_pattern[]       = {
 static const UChar gRuleSet_digit_char_pattern[] = {
 //    [    0      -    9     ]
    0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
-//static const UnicodeSet *gRuleDigits = NULL;
-
-

 //
 //   Here are the backslash escape characters that ICU's unescape() function
@ -72,16 +69,6 @@ static const UChar gUnescapeCharPattern[] = {
    0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x78, 0x5d, 0};


-//
-//  White space characters that may appear within a pattern in free-form mode
-//
-static const UChar gRuleWhiteSpacePattern[] = {
-    /* "[[:Cf:][:WSpace:]]" */
-    91, 91, 58, 67, 102, 58, 93, 91, 58, 87,
-        83, 112, 97, 99, 101, 58, 93, 93, 0 };
-
-
-
 //
 //  Unicode Set Definitions for Regular Expression  \w
 //
@ -89,7 +76,7 @@ static const UChar gIsWordPattern[] = {
 //    [     \     p     {    A     l     p     h     a     b     e     t     i      c    }
    0x5b, 0x5c, 0x70, 0x7b, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x62, 0x65, 0x74, 0x69, 0x63, 0x7d,
 //          \     p     {    M     }                               Mark
-          0x5c, 0x70, 0x7b, 0x4d, 0x7d, 
+          0x5c, 0x70, 0x7b, 0x4d, 0x7d,
 //          \     p     {    N     d     }                         Digit_Numeric
          0x5c, 0x70, 0x7b, 0x4e, 0x64, 0x7d,
 //          \     p     {    P     c     }      ]                  Connector_Punctuation
@ -108,8 +95,8 @@ static const UChar gIsSpacePattern[] = {
 //  UnicodeSets used in implementation of Grapheme Cluster detection, \X
 //
 static const UChar gGC_ControlPattern[] = {
-//    [     [     :     Z     l     :     ]     [     :     Z     p     :     ]    
-    0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d, 
+//    [     [     :     Z     l     :     ]     [     :     Z     p     :     ]
+    0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d,
 //    [     :     C     c     :     ]     [     :     C     f     :     ]     -
    0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x2d,
 //    [     :     G     r     a     p     h     e     m     e     _
@ -124,34 +111,35 @@ static const UChar gGC_ExtendPattern[] = {
    0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x5d, 0};

 static const UChar gGC_LPattern[] = {
-//    [     \     p     {     H     a     n     g     u     l     _     S     y     l    
+//    [     \     p     {     H     a     n     g     u     l     _     S     y     l
    0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
 //    l     a     b     l     e     _     T     y     p     e     =     L     }     ]
-    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d,  0x5d, 0}; 
+    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d,  0x5d, 0};

 static const UChar gGC_VPattern[] = {
-//    [     \     p     {     H     a     n     g     u     l     _     S     y     l    
+//    [     \     p     {     H     a     n     g     u     l     _     S     y     l
    0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
 //    l     a     b     l     e     _     T     y     p     e     =     V     }     ]
-    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d,  0x5d, 0}; 
+    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d,  0x5d, 0};

 static const UChar gGC_TPattern[] = {
-//    [     \     p     {     H     a     n     g     u     l     _     S     y     l    
+//    [     \     p     {     H     a     n     g     u     l     _     S     y     l
    0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
 //    l     a     b     l     e     _     T     y     p     e     =     T     }    ]
-    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0}; 
+    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0};

 static const UChar gGC_LVPattern[] = {
-//    [     \     p     {     H     a     n     g     u     l     _     S     y     l    
+//    [     \     p     {     H     a     n     g     u     l     _     S     y     l
    0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
 //    l     a     b     l     e     _     T     y     p     e     =     L     V     }     ]
-    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0}; 
+    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0};

 static const UChar gGC_LVTPattern[] = {
-//    [     \     p     {     H     a     n     g     u     l     _     S     y     l    
+//    [     \     p     {     H     a     n     g     u     l     _     S     y     l
    0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
 //    l     a     b     l     e     _     T     y     p     e     =     L     V     T     }     ]
-    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x54, 0x7d, 0x5d, 0}; 
+    0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x54, 0x7d, 0x5d, 0};
+

 RegexStaticSets *RegexStaticSets::gStaticSets = NULL;

@ -160,7 +148,7 @@ RegexStaticSets::RegexStaticSets(UErrorCode *status)
 fUnescapeCharSet(UnicodeString(TRUE, gUnescapeCharPattern, -1), *status),
 fRuleDigitsAlias(NULL)
 {
-    // First zero out everything  
+    // First zero out everything
    int i;
    for (i=0; i<URX_LAST_SET; i++) {
        fPropSets[i] = NULL;
@ -171,7 +159,7 @@ fRuleDigitsAlias(NULL)

    // Then init the sets to their correct values.
    fPropSets[URX_ISWORD_SET]  = new UnicodeSet(UnicodeString(TRUE, gIsWordPattern, -1),     *status);
-    fPropSets[URX_ISSPACE_SET] = new UnicodeSet(UnicodeString(TRUE, gIsSpacePattern, -1),    *status);    
+    fPropSets[URX_ISSPACE_SET] = new UnicodeSet(UnicodeString(TRUE, gIsSpacePattern, -1),    *status);
    fPropSets[URX_GC_EXTEND]   = new UnicodeSet(UnicodeString(TRUE, gGC_ExtendPattern, -1),  *status);
    fPropSets[URX_GC_CONTROL]  = new UnicodeSet(UnicodeString(TRUE, gGC_ControlPattern, -1), *status);
    fPropSets[URX_GC_L]        = new UnicodeSet(UnicodeString(TRUE, gGC_LPattern, -1),       *status);
@ -184,14 +172,14 @@ fRuleDigitsAlias(NULL)
        // The rest of the initialization needs them, so we cannot proceed.
        return;
    }
-    
-    
+
+
    //
    // The following sets  are dynamically constructed, because their
    //   initialization strings would be unreasonable.
    //
-    
-    
+
+
    //
    //  "Normal" is the set of characters that don't need special handling
    //            when finding grapheme cluster boundaries.
@ -202,7 +190,7 @@ fRuleDigitsAlias(NULL)
    fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_L]);
    fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_V]);
    fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_T]);
-    
+
    // Initialize the 8-bit fast bit sets from the parallel full
    //   UnicodeSets.
    for (i=0; i<URX_LAST_SET; i++) {
@ -213,9 +201,8 @@ fRuleDigitsAlias(NULL)
    }

    // Sets used while parsing rules, but not referenced from the parse state table
-    fRuleSets[kRuleSet_rule_char-128]   = new UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1),  *status);
-    fRuleSets[kRuleSet_white_space-128] = new UnicodeSet(UnicodeString(TRUE, gRuleWhiteSpacePattern, -1),      *status);
-    fRuleSets[kRuleSet_digit_char-128]  = new UnicodeSet(UnicodeString(TRUE, gRuleSet_digit_char_pattern, -1), *status);
+    fRuleSets[kRuleSet_rule_char-128]   = new UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1),   *status);
+    fRuleSets[kRuleSet_digit_char-128]  = new UnicodeSet(UnicodeString(TRUE, gRuleSet_digit_char_pattern, -1),  *status);
    fRuleDigitsAlias = fRuleSets[kRuleSet_digit_char-128];
    for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) {
        if (fRuleSets[i]) {
@ -281,7 +268,7 @@ void RegexStaticSets::initGlobals(UErrorCode *status) {
        ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup);
    }
 }
-    
+

 U_NAMESPACE_END
 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
--- a/icu4c/source/i18n/repattrn.cpp
+++ b/icu4c/source/i18n/repattrn.cpp
@ -1,9 +1,9 @@
 //
-//  file:  repattrn.cpp    
+//  file:  repattrn.cpp
 //
 /*
 ***************************************************************************
-*   Copyright (C) 2002-2006 International Business Machines Corporation   *
+*   Copyright (C) 2002-2007 International Business Machines Corporation   *
 *   and others. All rights reserved.                                      *
 ***************************************************************************
 */
@ -46,7 +46,7 @@ RegexPattern::RegexPattern() {
 //
 //--------------------------------------------------------------------------
 RegexPattern::RegexPattern(const RegexPattern &other) :  UObject(other) {
-    init(); 
+    init();
    *this = other;
 }

@ -78,9 +78,9 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
    fFrameSize        = other.fFrameSize;
    fDataSize         = other.fDataSize;
    fMaxCaptureDigits = other.fMaxCaptureDigits;
-    fStaticSets       = other.fStaticSets; 
+    fStaticSets       = other.fStaticSets;
    fStaticSets8      = other.fStaticSets8;
-    
+
    fStartType        = other.fStartType;
    fInitialStringIdx = other.fInitialStringIdx;
    fInitialStringLen = other.fInitialStringLen;
@ -92,9 +92,9 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
    fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
    fGroupMap->assign(*other.fGroupMap, fDeferredStatus);

-    //  Copy the Unicode Sets.  
+    //  Copy the Unicode Sets.
    //    Could be made more efficient if the sets were reference counted and shared,
-    //    but I doubt that pattern copying will be particularly common. 
+    //    but I doubt that pattern copying will be particularly common.
    //    Note:  init() already added an empty element zero to fSets
    int32_t i;
    int32_t  numSets = other.fSets->size();
@ -135,7 +135,7 @@ void RegexPattern::init() {
    fFrameSize        = 0;
    fDataSize         = 0;
    fGroupMap         = NULL;
-    fMaxCaptureDigits = 1;  
+    fMaxCaptureDigits = 1;
    fStaticSets       = NULL;
    fStaticSets8      = NULL;
    fStartType        = START_NO_INFO;
@ -144,7 +144,7 @@ void RegexPattern::init() {
    fInitialChars     = NULL;
    fInitialChar      = 0;
    fInitialChars8    = NULL;
-    
+
    fCompiledPat      = new UVector32(fDeferredStatus);
    fGroupMap         = new UVector32(fDeferredStatus);
    fSets             = new UVector(fDeferredStatus);
@ -166,7 +166,7 @@ void RegexPattern::init() {

 //--------------------------------------------------------------------------
 //
-//   zap            Delete everything owned by this RegexPattern. 
+//   zap            Delete everything owned by this RegexPattern.
 //
 //--------------------------------------------------------------------------
 void RegexPattern::zap() {
@ -208,7 +208,7 @@ RegexPattern::~RegexPattern() {
 //   Clone
 //
 //--------------------------------------------------------------------------
-RegexPattern  *RegexPattern::clone() const { 
+RegexPattern  *RegexPattern::clone() const {
    RegexPattern  *copy = new RegexPattern(*this);
    return copy;
 }
@ -229,7 +229,7 @@ UBool   RegexPattern::operator ==(const RegexPattern &other) const {

 //---------------------------------------------------------------------
 //
-//   compile        
+//   compile
 //
 //---------------------------------------------------------------------
 RegexPattern * U_EXPORT2
@ -244,7 +244,8 @@ RegexPattern::compile(const UnicodeString &regex,
    }

    const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
-                              UREGEX_DOTALL   | UREGEX_MULTILINE | UREGEX_UWORD;
+                              UREGEX_DOTALL   | UREGEX_MULTILINE        | UREGEX_UWORD |
+                              UREGEX_ERROR_ON_UNKNOWN_ESCAPES           | UREGEX_UNIX_LINES;

    if ((flags & ~allFlags) != 0) {
        status = U_REGEX_INVALID_FLAG;
@ -269,19 +270,24 @@ RegexPattern::compile(const UnicodeString &regex,

    RegexCompile     compiler(This, status);
    compiler.compile(regex, pe, status);
+    
+    if (U_FAILURE(status)) {
+        delete This;
+        This = NULL;
+    }

    return This;
 }
-    
+
 //
 //   compile with default flags.
 //
 RegexPattern * U_EXPORT2
 RegexPattern::compile(const UnicodeString &regex,
                      UParseError         &pe,
-                      UErrorCode          &err) 
+                      UErrorCode          &err)
 {
-    return compile(regex, 0, pe, err); 
+    return compile(regex, 0, pe, err);
 }


@ -292,10 +298,10 @@ RegexPattern::compile(const UnicodeString &regex,
 RegexPattern * U_EXPORT2
 RegexPattern::compile( const UnicodeString &regex,
        uint32_t             flags,
-        UErrorCode           &err) 
+        UErrorCode           &err)
 {
    UParseError pe;
-    return compile(regex, flags, pe, err); 
+    return compile(regex, flags, pe, err);
 }


@ -326,7 +332,7 @@ RegexMatcher *RegexPattern::matcher(const UnicodeString &input,

 #if 0
 RegexMatcher *RegexPattern::matcher(const UChar * /*input*/,
-                                    UErrorCode          &status)  const 
+                                    UErrorCode          &status)  const
 {
    /* This should never get called. The API with UnicodeString should be called instead. */
    if (U_SUCCESS(status)) {
@ -352,7 +358,7 @@ RegexMatcher *RegexPattern::matcher(UErrorCode &status)  const {
        return NULL;
    }

-    retMatcher = new RegexMatcher(this); 
+    retMatcher = new RegexMatcher(this);
    if (retMatcher == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
@ -437,17 +443,15 @@ void   RegexPattern::dumpOp(int32_t index) const {
    int32_t val         = URX_VAL(op);
    int32_t type        = URX_TYPE(op);
    int32_t pinnedType  = type;
-    if (pinnedType >= sizeof(opNames)/sizeof(char *)) {
+    if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) {
        pinnedType = 0;
    }
-    
+
    REGEX_DUMP_DEBUG_PRINTF(("%4d   %08x    %-15s  ", index, op, opNames[pinnedType]));
    switch (type) {
    case URX_NOP:
    case URX_DOTANY:
    case URX_DOTANY_ALL:
-    case URX_DOTANY_PL:
-    case URX_DOTANY_ALL_PL:
    case URX_FAIL:
    case URX_CARET:
    case URX_DOLLAR:
@ -458,7 +462,7 @@ void   RegexPattern::dumpOp(int32_t index) const {
    case URX_CARET_M:
        // Types with no operand field of interest.
        break;
-        
+
    case URX_RESERVED_OP:
    case URX_START_CAPTURE:
    case URX_END_CAPTURE:
@ -494,12 +498,12 @@ void   RegexPattern::dumpOp(int32_t index) const {
        // types with an integer operand field.
        REGEX_DUMP_DEBUG_PRINTF(("%d", val));
        break;
-        
+
    case URX_ONECHAR:
    case URX_ONECHAR_I:
        REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?'));
        break;
-        
+
    case URX_STRING:
    case URX_STRING_I:
        {
@ -543,7 +547,7 @@ void   RegexPattern::dumpOp(int32_t index) const {
        }
        break;

-        
+
    default:
        REGEX_DUMP_DEBUG_PRINTF(("??????"));
        break;
@ -554,7 +558,7 @@ void   RegexPattern::dumpOp(int32_t index) const {


 #if defined(REGEX_DEBUG)
-U_CAPI void  U_EXPORT2 
+U_CAPI void  U_EXPORT2
 RegexPatternDump(const RegexPattern *This) {
    int      index;
    int      i;
@ -565,7 +569,7 @@ RegexPatternDump(const RegexPattern *This) {
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n"));
    REGEX_DUMP_DEBUG_PRINTF(("   Min Match Length:  %d\n", This->fMinMatchLen));
-    REGEX_DUMP_DEBUG_PRINTF(("   Match Start Type:  %s\n", START_OF_MATCH_STR(This->fStartType)));   
+    REGEX_DUMP_DEBUG_PRINTF(("   Match Start Type:  %s\n", START_OF_MATCH_STR(This->fStartType)));
    if (This->fStartType == START_STRING) {
        REGEX_DUMP_DEBUG_PRINTF(("    Initial match sting: \""));
        for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) {
@ -580,7 +584,7 @@ RegexPatternDump(const RegexPattern *This) {
        REGEX_DUMP_DEBUG_PRINTF(("     Match First Chars : "));
        for (i=0; i<numSetChars; i++) {
            UChar32 c = This->fInitialChars->charAt(i);
-            if (0x20<c && c <0x7e) { 
+            if (0x20<c && c <0x7e) {
                REGEX_DUMP_DEBUG_PRINTF(("%c ", c));
            } else {
                REGEX_DUMP_DEBUG_PRINTF(("%#x ", c));
@ -606,7 +610,7 @@ RegexPatternDump(const RegexPattern *This) {
        This->dumpOp(index);
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
-};
+}
 #endif


--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@ -16,7 +16,7 @@
 #ifndef REGEX_H
 #define REGEX_H

-//#define REGEX_DEBUG
+#define REGEX_DEBUG

 /**
 * \file
@ -36,7 +36,7 @@
 *  operations, for search and replace operations, and for obtaining detailed
 *  information about bounds of a match. </p>
 *
- * <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular 
+ * <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular
 * expression pattern strings application code can be simplified and the explicit
 * need for <code>RegexPattern</code> objects can usually be eliminated.
 * </p>
@ -480,7 +480,7 @@ public:
      * critical that the string not be altered or deleted before use by the regular
      * expression operations is complete.
      *
-      *  @param regexp The Regular Expression to be compiled.  
+      *  @param regexp The Regular Expression to be compiled.
      *  @param input  The string to match.  The matcher retains a reference to the
      *                caller's string; mo copy is made.
      *  @param flags  Regular expression options, such as case insensitive matching.
@ -517,7 +517,7 @@ public:


   /**
-    *   Attempts to match the entire input string against the pattern.
+    *   Attempts to match the entire input region against the pattern.
    *    @param   status     A reference to a UErrorCode to receive any errors.
    *    @return TRUE if there is a match
    *    @stable ICU 2.4
@ -525,8 +525,10 @@ public:
    virtual UBool matches(UErrorCode &status);

   /**
-    *   Attempts to match the input string, beginning at startIndex, against the pattern.
-    *   The match must extend to the end of the input string.
+    *   Resets the matcher, then attempts to match the input beginning 
+    *   at the specified startIndex, and extending to the end of the input.
+    *   The input region is reset to include the entire input string.
+    *   A successful match must extend to the end of the input.
    *    @param   startIndex The input string index at which to begin matching.
    *    @param   status     A reference to a UErrorCode to receive any errors.
    *    @return TRUE if there is a match
@ -538,9 +540,10 @@ public:


   /**
-    *   Attempts to match the input string, starting from the beginning, against the pattern.
-    *   Like the matches() method, this function always starts at the beginning of the input string;
-    *   unlike that function, it does not require that the entire input string be matched.
+    *   Attempts to match the input string, starting from the beginning of the region,
+    *   against the pattern.  Like the matches() method, this function 
+    *   always starts at the beginning of the input region;
+    *   unlike that function, it does not require that the entire region be matched.
    *
    *   <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
    *     <code>end()</code>, and <code>group()</code> functions.</p>
@ -699,6 +702,12 @@ public:
    *   The effect is to remove any memory of previous matches,
    *       and to cause subsequent find() operations to begin at
    *       the specified position in the input string.
+    * <p>
+    *   The matcher's region is reset to its default, which is the entire
+    *   input string.
+    * <p>
+    *   An alternative to this function is to set a match region
+    *   beginning at the desired index.
    *
    *   @return this RegexMatcher.
    *   @stable ICU 2.8
@ -709,13 +718,13 @@ public:
   /**
    *   Resets this matcher with a new input string.  This allows instances of RegexMatcher
    *     to be reused, which is more efficient than creating a new RegexMatcher for
-    *     each input string to be processed.  
+    *     each input string to be processed.
    *   @param input The new string on which subsequent pattern matches will operate.
    *                The matcher retains a reference to the callers string, and operates
    *                directly on that.  Ownership of the string remains with the caller.
    *                Because no copy of the string is made, it is essential that the
    *                caller not delete the string until after regexp operations on it
-    *                are done.  
+    *                are done.
    *   @return this RegexMatcher.
    *   @stable ICU 2.4
    */
@ -743,6 +752,132 @@ public:
    *   @stable ICU 2.4
    */
    virtual const UnicodeString &input() const;
+    
+    
+
+   /** Sets the limits of this matcher's region.
+     * The region is the part of the input string that will be searched to find a match.
+     * Invoking this method resets the matcher, and then sets the region to start
+     * at the index specified by the start parameter and end at the index specified
+     * by the end parameter.
+     *
+     * Depending on the transparency and anchoring being used (see useTransparentBounds
+     * and useAnchoringBounds), certain constructs such as anchors may behave differently
+     * at or around the boundaries of the region.
+     *
+     * The function will fail if start is greater than limit, or if either index
+     *  is less than zero or greater than the length of the string being matched.
+     *
+     * @param start  The index to begin searches at.
+     * @param limit  The index to end searches at (exclusive).
+     * @param status A reference to a UErrorCode to receive any errors.
+     * @draft ICU 4.0
+     */
+     virtual RegexMatcher &region(int32_t start, int32_t limit, UErrorCode &status);
+
+
+   /**
+     * Reports the start index of this matcher's region. The searches this matcher
+     * conducts are limited to finding matches within regionStart (inclusive) and
+     * regionEnd (exclusive).
+     *
+     * @return The starting index of this matcher's region.
+     * @draft ICU 4.0
+     */
+     virtual int32_t regionStart() const;
+
+
+    /**
+      * Reports the end (limit) index (exclusive) of this matcher's region. The searches
+      * this matcher conducts are limited to finding matches within regionStart
+      * (inclusive) and regionEnd (exclusive).
+      *
+      * @return The ending point of this matcher's region.
+      * @draft ICU 4.0
+      */
+      virtual int32_t regionEnd() const;
+
+    /**
+      * Queries the transparency of region bounds for this matcher.
+      * See useTransparentBounds for a description of transparent and opaque bounds.
+      * By default, a matcher uses opaque region boundaries.
+      *
+      * @return TRUE if this matcher is using transparent bounds, false if it is not.
+      * @draft ICU 4.0
+      */
+      virtual UBool hasTransparentBounds() const;
+
+    /**
+      * Sets the transparency of region bounds for this matcher.
+      * Invoking this function with an argument of true will set this matcher to use transparent bounds.
+      * If the boolean argument is false, then opaque bounds will be used.
+      *
+      * Using transparent bounds, the boundaries of this matcher's region are transparent
+      * to lookahead, lookbehind, and boundary matching constructs. Those constructs can
+      * see text beyond the boundaries of the region while checking for a match.
+      *
+      * With opaque bounds, no text outside of the matcher's region is visible to lookahead,
+      * lookbehind, and boundary matching constructs.
+      *
+      * By default, a matcher uses opaque bounds.
+      *
+      * @param   b TRUE for transparent bounds; FALSE for opaque bounds
+      * @return  This Matcher;
+      * @draft   ICU 4.0
+      **/
+      virtual RegexMatcher &useTransparentBounds(UBool b);
+
+     
+    /**
+      * Return true if this matcher is using anchoring bounds.
+      * By default, matchers use anchoring region bounds.
+      *
+      * @return TRUE if this matcher is using anchoring bounds.
+      * @draft  ICU 4.0
+      */    
+      virtual UBool hasAnchoringBounds() const;
+
+    /**
+      * Set whether this matcher is using Anchoring Bounds for its region.
+      * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
+      * and end of the region.  Without Anchoring Bounds, anchors will only match at
+      * the positions they would in the complete text.
+      *
+      * Anchoring Bounds are the default for regions.
+      *
+      * @param b TRUE if to enable anchoring bounds; FALSE to disable them.
+      * @return  This Matcher
+      * @draft   ICU 4.0
+      */
+      virtual RegexMatcher &useAnchoringBounds(UBool b);
+
+    /**
+      * Return TRUE if the most recent matching operation touched the
+      *  end of the text being processed.  In this case, additional input text could
+      *  change the results of that match.
+      *
+      *  hitEnd() is defined for both successful and unsuccessful matches.
+      *  In either case hitEnd() will return TRUE if the end of the text was
+      *  reached at any point during the matching process.
+      *
+      *  @return  TRUE if the most recent match hit the end of input
+      *  @draft   ICU 4.0
+      */
+      virtual UBool hitEnd() const;
+
+    /**
+      * Return TRUE if the most recent match succeeded and additional input could cause
+      * it to fail. If this method returns false and a match was found, then more input
+      * might change the match but the match won't be lost. If a match was not found,
+      * then requireEnd has no meaning.
+      *
+      * @return TRUE if more input could cause the most recent match to no longer match.
+      * @draft  ICU 4.0
+      */
+      virtual UBool requireEnd() const;
+
+
+


   /**
@ -901,12 +1036,16 @@ private:
    RegexMatcher &operator =(const RegexMatcher &rhs);
    friend class RegexPattern;
    friend class RegexCImpl;
+public:
+    /** @internal  */
+    void resetPreserveRegion();  // Reset matcher state, but preserve any region.
+private:

    //
    //  MatchAt   This is the internal interface to the match engine itself.
    //            Match status comes back in matcher member variables.
    //
-    void                 MatchAt(int32_t startIdx, UErrorCode &status);
+    void                 MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
    inline void          backTrack(int32_t &inputIdx, int32_t &patIdx);
    UBool                isWordBoundary(int32_t pos);         // perform Perl-like  \b test
    UBool                isUWordBoundary(int32_t pos);        // perform RBBI based \b test
@ -918,18 +1057,45 @@ private:
    const RegexPattern  *fPattern;
    RegexPattern        *fPatternOwned;    // Non-NULL if this matcher owns the pattern, and
                                           //   should delete it when through.
-    const UnicodeString *fInput;

-    UBool                fMatch;           // True if the last match was successful.
+    const UnicodeString *fInput;           // The text being matched. Is never NULL.
+    
+    int32_t              fRegionStart;     // Start of the input region, default = 0.
+    int32_t              fRegionLimit;     // End of input region, default to input.length.
+    
+    int32_t              fAnchorStart;     // Region bounds for anchoring operations (^ or $).
+    int32_t              fAnchorLimit;     //   See useAnchoringBounds
+    
+    int32_t              fLookStart;       // Region bounds for look-ahead/behind and
+    int32_t              fLookLimit;       //   and other boundary tests.  See
+                                           //   useTransparentBounds
+
+    int32_t              fActiveStart;     // Currently active bounds for matching.
+    int32_t              fActiveLimit;     //   Usually is the same as region, but
+                                           //   is changed to fLookStart/Limit when
+                                           //   entering look around regions.
+
+    UBool                fTransparentBounds;  // True if using transparent bounds.
+    UBool                fAnchoringBounds; // True if using anchoring bounds.
+
+    UBool                fMatch;           // True if the last attempted match was successful.
    int32_t              fMatchStart;      // Position of the start of the most recent match
    int32_t              fMatchEnd;        // First position after the end of the most recent match
+                                           //   Zero if no previous match, even when a region
+                                           //   is active.
    int32_t              fLastMatchEnd;    // First position after the end of the previous match,
                                           //   or -1 if there was no previous match.
-    int32_t              fLastReplaceEnd;  // First position after the end of the previous appendReplacement();
+    int32_t              fAppendPosition;  // First position after the end of the previous
+                                           //   appendReplacement().  As described by the
+                                           //   JavaDoc for Java Matcher, where it is called 
+                                           //   "append position"
+    UBool                fHitEnd;          // True if the last match touched the end of input.
+    UBool                fRequireEnd;      // True if the last match required end-of-input
+                                           //    (matched $ or Z)

    UVector32           *fStack;
-    REStackFrame        *fFrame;           // After finding a match, the last active stack
-                                           //   frame, which will contain the capture group results.
+    REStackFrame        *fFrame;           // After finding a match, the last active stack frame,
+                                           //   which will contain the capture group results.
                                           //   NOT valid while match engine is running.

    int32_t             *fData;            // Data area for use by the compiled pattern.
--- a/icu4c/source/i18n/unicode/uregex.h
+++ b/icu4c/source/i18n/unicode/uregex.h
@ -1,6 +1,6 @@
 /*
 **********************************************************************
-*   Copyright (C) 2004-2006, International Business Machines
+*   Copyright (C) 2004-2007, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *   file name:  regex.h
@ -59,12 +59,32 @@ typedef enum URegexpFlag{
    /**  If set, '.' matches line terminators,  otherwise '.' matching stops at line end.
      *  @stable ICU 2.4 */
    UREGEX_DOTALL           = 32,
+    
+    /**  If set, treat the entire pattern as a literal string.  
+      *  Metacharacters or escape sequences in the input sequence will be given 
+      *  no special meaning.
+      *
+      *  The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact
+      *  on matching when used in conjunction with this flag.
+      *  The other flags become superfluous.
+      *  TODO:  say which escapes are still handled; anything Java does
+      *         early (\u) we should still do.
+      * @draft ICU 4.0
+      */
+    UREGEX_LITERAL = 16,

    /**   Control behavior of "$" and "^"
      *    If set, recognize line terminators within string,
      *    otherwise, match only at start and end of input string.
      *   @stable ICU 2.4 */
    UREGEX_MULTILINE        = 8,
+    
+    /**   Unix-only line endings.
+      *   When this mode is enabled, only \u000a is recognized as a line ending
+      *    in the behavior of ., ^, and $.
+      *   @draft ICU 4.0
+      */
+    UREGEX_UNIX_LINES = 1,

    /**  Unicode word boundaries.
      *     If set, \b uses the Unicode TR 29 definition of word boundaries.
@ -73,7 +93,17 @@ typedef enum URegexpFlag{
      *     http://unicode.org/reports/tr29/#Word_Boundaries
      *     @stable ICU 2.8
      */
-    UREGEX_UWORD            = 256
+    UREGEX_UWORD            = 256,
+
+     /**  Error on Unrecognized backslash escapes.
+       *     If set, fail with an error on patterns that contain
+       *     backslash-escaped ASCII letters without a known special
+       *     meaning.  If this flag is not set, these
+       *     escaped letters represent themselves.
+       *     @draft ICU 4.0
+       */
+     UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512
+
 }  URegexpFlag;

 /**
@ -251,11 +281,21 @@ uregex_getText(URegularExpression *regexp,
               UErrorCode         *status);

 /**
-  *   Attempts to match the input string, beginning at startIndex, against the pattern.
-  *   To succeed, the match must extend to the end of the input string.
+  *   Attempts to match the input string against the pattern.
+  *   To succeed, the match must extend to the end of the string,
+  *   or cover the complete match region.
+  *
+  *   If startIndex >= zero the match operation starts at the specified
+  *   index and must extend to the end of the input string.  Any region
+  *   that has been specified is reset.
+  *
+  *   If startIndex == -1 the match must cover the input region, or the entire
+  *   input string if no region has been set.  This directly corresponds to
+  *   Matcher.matches() in Java
  *
  *    @param  regexp      The compiled regular expression.
-  *    @param  startIndex  The input string index at which to begin matching.
+  *    @param  startIndex  The input string index at which to begin matching, or -1
+  *                        to match the input Region.
  *    @param  status      Receives errors detected by this function.
  *    @return             TRUE if there is a match
  *    @stable ICU 3.0
@ -270,12 +310,20 @@ uregex_matches(URegularExpression *regexp,
  *   The match may be of any length, and is not required to extend to the end
  *   of the input string.  Contrast with uregex_matches().
  *
+  *   <p>If startIndex is >= 0 any input region that was set for this
+  *   URegularExpression is reset before the operation begins.
+  *
+  *   <p>If the specified starting index == -1 the match begins at the start of the input 
+  *   region, or at the start of the full string if no region has been specified.
+  *   This corresponds directly with Matcher.lookingAt() in Java.
+  *
  *   <p>If the match succeeds then more information can be obtained via the
  *    <code>uregexp_start()</code>, <code>uregexp_end()</code>,
  *    and <code>uregexp_group()</code> functions.</p>
  *
  *    @param   regexp      The compiled regular expression.
-  *    @param   startIndex  The input string index at which to begin matching.
+  *    @param   startIndex  The input string index at which to begin matching, or
+  *                         -1 to match the Input Region
  *    @param   status      A reference to a UErrorCode to receive any errors.
  *    @return  TRUE if there is a match.
  *    @stable ICU 3.0
@ -287,12 +335,19 @@ uregex_lookingAt(URegularExpression *regexp,

 /**
  *   Find the first matching substring of the input string that matches the pattern.
-  *   The search for a match begins at the specified index.
+  *   If startIndex is >= zero the search for a match begins at the specified index,
+  *          and any match region is reset.  This corresponds directly with
+  *          Matcher.find(startIndex) in Java.
+  *
+  *   If startIndex == -1 the search begins at the start of the input region,
+  *           or at the start of the full string if no region has been specified.
+  *
  *   If a match is found, <code>uregex_start(), uregex_end()</code>, and
  *   <code>uregex_group()</code> will provide more information regarding the match.
  *
  *   @param   regexp      The compiled regular expression.
-  *   @param   startIndex  The position in the input string to begin the search
+  *   @param   startIndex  The position in the input string to begin the search, or
+  *                        -1 to search within the Input Region.
  *   @param   status      A reference to a UErrorCode to receive any errors.
  *   @return              TRUE if a match is found.
  *   @stable ICU 3.0
@ -303,10 +358,10 @@ uregex_find(URegularExpression *regexp,
            UErrorCode         *status);

 /**
-  *  Find the next pattern match in the input string.
-  *  Begin searching the input at the location following the end of
-  *  the previous match, or at the start of the string if there is no previous match.
-  *  If a match is found, <code>uregex_start(), uregex_end()</code>, and
+  *  Find the next pattern match in the input string.  Begin searching 
+  *  the input at the location following the end of the previous match, 
+  *  or at the start of the string (or region) if there is no 
+  *  previous match.  If a match is found, <code>uregex_start(), uregex_end()</code>, and
  *  <code>uregex_group()</code> will provide more information regarding the match.
  *
  *  @param   regexp      The compiled regular expression.
@ -395,7 +450,8 @@ uregex_end(URegularExpression   *regexp,
  *  Reset any saved state from the previous match.  Has the effect of
  *  causing uregex_findNext to begin at the specified index, and causing
  *  uregex_start(), uregex_end() and uregex_group() to return an error 
-  *  indicating that there is no match information available.
+  *  indicating that there is no match information available.  Clears any
+  *  match region that may have been set.
  *
  *    @param   regexp      The compiled regular expression.
  *    @param   index       The position in the text at which a
@ -407,6 +463,166 @@ U_STABLE void U_EXPORT2
 uregex_reset(URegularExpression    *regexp,
             int32_t               index,
             UErrorCode            *status);
+             
+             
+/** Sets the limits of the matching region for this URegularExpression.
+  * The region is the part of the input string that will be considered when matching.
+  * Invoking this method resets any saved state from the previous match, 
+  * then sets the region to start at the index specified by the regionStart parameter
+  * and end at the index specified by the regionLimit parameter.
+  *
+  * Depending on the transparency and anchoring being used (see uregex_useTransparentBounds
+  * and uregex_useAnchoringBounds), certain constructs such as anchors may behave differently
+  * at or around the boundaries of the region.
+  *
+  * The function will fail if regionStart is greater than regionLimit, or if either index
+  *  is less than zero or greater than the length of the string being matched.
+  *
+  * @param regexp      The compiled regular expression.
+  * @param regionStart The index to begin searches at.
+  * @param regionLimit The index to end searches at (exclusive).
+  * @param status A pointer to a UErrorCode to receive any errors.
+  * @draft ICU 4.0
+  */
+U_DRAFT void U_EXPORT2
+uregex_setRegion(URegularExpression   *regexp,
+                 int32_t               regionStart,
+                 int32_t               regionLimit,
+                 UErrorCode           *status);
+
+/**
+  * Reports the start index of the matching region. Any matches found are limited
+  * to the region bounded by regionStart (inclusive) and regionEnd (exclusive).
+  *
+  * @param regexp The compiled regular expression.
+  * @param status A pointer to a UErrorCode to receive any errors.
+  * @return The starting index of this matcher's region.
+  * @draft ICU 4.0
+  */
+U_DRAFT int32_t U_EXPORT2
+uregex_regionStart(const  URegularExpression   *regexp,
+                          UErrorCode           *status);
+
+
+
+/**
+  * Reports the end index (exclusive) of the matching region for this URegularExpression.
+  * Any matches found are limited to the region bounded by regionStart (inclusive)
+  * and regionEnd (exclusive).
+  *
+  * @param regexp The compiled regular expression.
+  * @param status A pointer to a UErrorCode to receive any errors.
+  * @return The ending point of this matcher's region.
+  * @draft ICU 4.0
+  */
+U_DRAFT int32_t U_EXPORT2
+uregex_regionEnd(const  URegularExpression   *regexp,
+                        UErrorCode           *status);
+
+/**
+  * Queries the transparency of region bounds for this URegularExpression.
+  * See uregex_useTransparentBounds for a description of transparent and opaque bounds.
+  * By default, matching boundaries are opaque.
+  *
+  * @param regexp The compiled regular expression.
+  * @param status A pointer to a UErrorCode to receive any errors.
+  * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds.
+  * @draft ICU 4.0
+  */
+U_DRAFT UBool U_EXPORT2
+uregex_hasTransparentBounds(const  URegularExpression   *regexp,
+                                   UErrorCode           *status);
+
+
+/**
+  * Sets the transparency of region bounds for this URegularExpression.
+  * Invoking this function with an argument of TRUE will set matches to use transparent bounds.
+  * If the boolean argument is FALSE, then opaque bounds will be used.
+  *
+  * Using transparent bounds, the boundaries of the matching region are transparent
+  * to lookahead, lookbehind, and boundary matching constructs. Those constructs can
+  * see text beyond the boundaries of the region while checking for a match.
+  *
+  * With opaque bounds, no text outside of the matching region is visible to lookahead,
+  * lookbehind, and boundary matching constructs.
+  *
+  * By default, opaque bounds are used.
+  *
+  * @param   regexp The compiled regular expression.
+  * @param   b      TRUE for transparent bounds; FALSE for opaque bounds
+  * @param   status A pointer to a UErrorCode to receive any errors.
+  * @draft   ICU 4.0
+  **/
+U_DRAFT void U_EXPORT2  
+uregex_useTransparentBounds(URegularExpression   *regexp, 
+                            UBool                b,
+                            UErrorCode           *status);
+
+
+/**
+  * Return true if this URegularExpression is using anchoring bounds.
+  * By default, anchoring region bounds are used.
+  *
+  * @param  regexp The compiled regular expression.
+  * @param  status A pointer to a UErrorCode to receive any errors.
+  * @return TRUE if this matcher is using anchoring bounds.
+  * @draft  ICU 4.0
+  */
+U_DRAFT UBool U_EXPORT2
+uregex_hasAnchoringBounds(const  URegularExpression   *regexp,
+                                 UErrorCode           *status);
+
+
+/**
+  * Set whether this URegularExpression is using Anchoring Bounds for its region.
+  * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
+  * and end of the region.  Without Anchoring Bounds, anchors will only match at
+  * the positions they would in the complete text.
+  *
+  * Anchoring Bounds are the default for regions.
+  *
+  * @param regexp The compiled regular expression.
+  * @param b      TRUE to enable anchoring bounds; FALSE to disable them.
+  * @param status A pointer to a UErrorCode to receive any errors.
+  * @draft   ICU 4.0
+  */
+U_DRAFT void U_EXPORT2
+uregex_useAnchoringBounds(URegularExpression   *regexp,
+                          UBool                 b,
+                          UErrorCode           *status);
+
+/**
+  * Return TRUE if the most recent matching operation touched the
+  *  end of the text being processed.  In this case, additional input text could
+  *  change the results of that match.
+  *
+  *  @param regexp The compiled regular expression.
+  *  @param status A pointer to a UErrorCode to receive any errors.
+  *  @return  TRUE if the most recent match hit the end of input
+  *  @draft   ICU 4.0
+  */
+U_DRAFT UBool U_EXPORT2
+uregex_hitEnd(const  URegularExpression   *regexp,
+                     UErrorCode           *status);
+
+/**
+  * Return TRUE if the most recent match succeeded and additional input could cause
+  * it to fail. If this function returns false and a match was found, then more input
+  * might change the match but the match won't be lost. If a match was not found,
+  * then requireEnd has no meaning.
+  *
+  * @param regexp The compiled regular expression.
+  * @param status A pointer to a UErrorCode to receive any errors.
+  * @return TRUE  if more input could cause the most recent match to no longer match.
+  * @draft  ICU 4.0
+  */
+U_DRAFT UBool U_EXPORT2   
+uregex_requireEnd(const  URegularExpression   *regexp,
+                         UErrorCode           *status);
+
+
+
+

 /**
  *    Replaces every substring of the input that matches the pattern
--- a/icu4c/source/i18n/uregex.cpp
+++ b/icu4c/source/i18n/uregex.cpp
@ -1,6 +1,6 @@
 /*
 *******************************************************************************
-*   Copyright (C) 2004-2006, International Business Machines
+*   Copyright (C) 2004-2007, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *******************************************************************************
 *   file name:  regex.cpp
@ -304,10 +304,15 @@ U_CAPI UBool U_EXPORT2
 uregex_matches(URegularExpression *regexp,
                int32_t            startIndex,
                UErrorCode        *status)  {
+    UBool result = FALSE;
    if (validateRE(regexp, status) == FALSE) {
-        return FALSE;
+        return result;
+    }
+    if (startIndex == -1) {
+        result = regexp->fMatcher->matches(*status);
+    } else {
+        result = regexp->fMatcher->matches(startIndex, *status);
    }
-    UBool result = regexp->fMatcher->matches(startIndex, *status);
    return result;
 }

@ -322,10 +327,15 @@ U_CAPI UBool U_EXPORT2
 uregex_lookingAt(URegularExpression *regexp,
                 int32_t             startIndex,
                 UErrorCode         *status)  {
+    UBool result = FALSE;
    if (validateRE(regexp, status) == FALSE) {
-        return FALSE;
+        return result;
+    }
+    if (startIndex == -1) {
+        result = regexp->fMatcher->lookingAt(*status);
+    } else {
+        result = regexp->fMatcher->lookingAt(startIndex, *status);
    }
-    UBool result = regexp->fMatcher->lookingAt(startIndex, *status);
    return result;
 }

@ -340,10 +350,16 @@ U_CAPI UBool U_EXPORT2
 uregex_find(URegularExpression *regexp,
            int32_t             startIndex, 
            UErrorCode         *status)  {
+    UBool result = FALSE;
    if (validateRE(regexp, status) == FALSE) {
-        return FALSE;
+        return result;
+    }
+    if (startIndex == -1) {
+        regexp->fMatcher->resetPreserveRegion();
+        result = regexp->fMatcher->find();
+    } else {
+        result = regexp->fMatcher->find(startIndex, *status);
    }
-    UBool result = regexp->fMatcher->find(startIndex, *status);
    return result;
 }

@ -479,6 +495,145 @@ uregex_reset(URegularExpression    *regexp,
 }


+//------------------------------------------------------------------------------
+//
+//    uregex_setRegion
+//        C API wrapper: forwards the region bounds to fMatcher->region().
+//------------------------------------------------------------------------------
+U_CAPI void U_EXPORT2 
+uregex_setRegion(URegularExpression   *regexp,
+                 int32_t               regionStart,
+                 int32_t               regionLimit,
+                 UErrorCode           *status)  {
+    if (validateRE(regexp, status) == FALSE) {
+        return;    // validation failed; leave the matcher untouched
+    }
+    regexp->fMatcher->region(regionStart, regionLimit, *status);
+}
+
+
+//------------------------------------------------------------------------------
+//
+//    uregex_regionStart
+//        Returns fMatcher->regionStart(), or 0 when validateRE() fails.
+//------------------------------------------------------------------------------
+U_CAPI int32_t U_EXPORT2 
+uregex_regionStart(const  URegularExpression   *regexp,
+                          UErrorCode           *status)  {
+    if (validateRE(regexp, status) == FALSE) {
+        return 0;
+    }
+    return regexp->fMatcher->regionStart();
+}
+
+
+//------------------------------------------------------------------------------
+//
+//    uregex_regionEnd
+//        Returns fMatcher->regionEnd(), or 0 when validateRE() fails.
+//------------------------------------------------------------------------------
+U_CAPI int32_t U_EXPORT2 
+uregex_regionEnd(const  URegularExpression   *regexp,
+                        UErrorCode           *status)  {
+    if (validateRE(regexp, status) == FALSE) {
+        return 0;
+    }
+    return regexp->fMatcher->regionEnd();
+}
+
+
+//------------------------------------------------------------------------------
+//
+//    uregex_hasTransparentBounds
+//        Returns fMatcher->hasTransparentBounds(), or FALSE when validateRE() fails.
+//------------------------------------------------------------------------------
+U_CAPI UBool U_EXPORT2 
+uregex_hasTransparentBounds(const  URegularExpression   *regexp,
+                                   UErrorCode           *status)  {
+    if (validateRE(regexp, status) == FALSE) {
+        return FALSE;
+    }
+    return regexp->fMatcher->hasTransparentBounds();
+}
+
+
+//------------------------------------------------------------------------------
+//
+//    uregex_useTransparentBounds
+//        C API wrapper: forwards the flag to fMatcher->useTransparentBounds().
+//------------------------------------------------------------------------------
+U_CAPI void U_EXPORT2 
+uregex_useTransparentBounds(URegularExpression    *regexp,
+             UBool                 b,
+             UErrorCode            *status)  {
+    if (validateRE(regexp, status) == FALSE) {
+        return;    // validation failed; leave the matcher untouched
+    }
+    regexp->fMatcher->useTransparentBounds(b);
+}
+
+
+//------------------------------------------------------------------------------
+//
+//    uregex_hasAnchoringBounds
+//        Returns fMatcher->hasAnchoringBounds(), or FALSE when validateRE() fails.
+//------------------------------------------------------------------------------
+U_CAPI UBool U_EXPORT2 
+uregex_hasAnchoringBounds(const  URegularExpression   *regexp,
+                                   UErrorCode           *status)  {
+    if (validateRE(regexp, status) == FALSE) {
+        return FALSE;
+    }
+    return regexp->fMatcher->hasAnchoringBounds();
+}
+
+
+//------------------------------------------------------------------------------
+//
+//    uregex_useAnchoringBounds
+//        C API wrapper: forwards the flag to fMatcher->useAnchoringBounds().
+//------------------------------------------------------------------------------
+U_CAPI void U_EXPORT2 
+uregex_useAnchoringBounds(URegularExpression    *regexp,
+             UBool                 b,
+             UErrorCode            *status)  {
+    if (validateRE(regexp, status) == FALSE) {
+        return;    // validation failed; leave the matcher untouched
+    }
+    regexp->fMatcher->useAnchoringBounds(b);
+}
+
+
+//------------------------------------------------------------------------------
+//
+//    uregex_hitEnd
+//        Returns fMatcher->hitEnd(), or FALSE when validateRE() fails.
+//------------------------------------------------------------------------------
+U_CAPI UBool U_EXPORT2 
+uregex_hitEnd(const  URegularExpression   *regexp,
+                     UErrorCode           *status)  {
+    if (validateRE(regexp, status) == FALSE) {
+        return FALSE;
+    }
+    return regexp->fMatcher->hitEnd();
+}
+
+
+//------------------------------------------------------------------------------
+//
+//    uregex_requireEnd
+//        Returns fMatcher->requireEnd(), or FALSE when validateRE() fails.
+//------------------------------------------------------------------------------
+U_CAPI UBool U_EXPORT2 
+uregex_requireEnd(const  URegularExpression   *regexp,
+                         UErrorCode           *status)  {
+    if (validateRE(regexp, status) == FALSE) {
+        return FALSE;
+    }
+    return regexp->fMatcher->requireEnd();
+}
+
+
 //------------------------------------------------------------------------------
 //
 //    uregex_replaceAll
--- a/icu4c/source/test/cintltst/reapits.c
+++ b/icu4c/source/test/cintltst/reapits.c
@ -1,6 +1,6 @@
 /********************************************************************
 * COPYRIGHT: 
- * Copyright (c) 2004-2006, International Business Machines Corporation and
+ * Copyright (c) 2004-2007, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
 /********************************************************************************
@ -34,6 +34,36 @@ log_err("Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_error
 #define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
 log_err("Test Failure at file %s, line %d\n", __FILE__, __LINE__);}}

+/*
+ *   TEST_SETUP and TEST_TEARDOWN
+ *         macros to handle the boilerplate around setting up regex test cases.
+ *         parameters to setup:
+ *              pattern:     The regex pattern, a (char *) null terminated C string.
+ *              testString:  The string data, also a (char *) C string.
+ *              flags:       Regex flags to set when compiling the pattern
+ *         Both macros use the variables "re" and "status" from the enclosing scope.
+ *         Put arbitrary test code between SETUP and TEARDOWN.
+ *         "re" is the compiled, ready-to-go  regular expression.
+ */
+#define TEST_SETUP(pattern, testString, flags) {  \
+    UChar   *srcString = NULL;  \
+    status = U_ZERO_ERROR; \
+    re = uregex_openC(pattern, flags, NULL, &status);  \
+    TEST_ASSERT_SUCCESS(status);   \
+    srcString = (UChar *)malloc((strlen(testString)+2)*sizeof(UChar)); \
+    u_uastrncpy(srcString, testString,  strlen(testString)+1); \
+    uregex_setText(re, srcString, -1, &status); \
+    TEST_ASSERT_SUCCESS(status);  \
+    if (U_SUCCESS(status)) {
+    
+#define TEST_TEARDOWN  \
+    }  \
+    TEST_ASSERT_SUCCESS(status);  \
+    uregex_close(re);  \
+    free(srcString);   \
+    }
+
+
 static void test_assert_string(const char *expected, const UChar *actual, UBool nulTerm, const char *file, int line) {
     char     buf_inside_macro[120];
     int32_t  len = (int32_t)strlen(expected);
@ -544,6 +574,135 @@ static void TestRegexCAPI(void) {
        uregex_close(re);

    }
+    
+    /*
+     *  Regions
+     */
+        
+        
+        /* SetRegion(), getRegion() do something  */
+        TEST_SETUP(".*", "0123456789ABCDEF", 0)
+        UChar resultString[40];
+        TEST_ASSERT(uregex_regionStart(re, &status) == 0);
+        TEST_ASSERT(uregex_regionEnd(re, &status) == 16);
+        uregex_setRegion(re, 3, 6, &status);
+        TEST_ASSERT(uregex_regionStart(re, &status) == 3);
+        TEST_ASSERT(uregex_regionEnd(re, &status) == 6);
+        TEST_ASSERT(uregex_findNext(re, &status));
+        TEST_ASSERT(uregex_group(re, 0, resultString, sizeof(resultString)/2, &status) == 3)
+        TEST_ASSERT_STRING("345", resultString, TRUE);
+        TEST_TEARDOWN;
+        
+        /* find(start=-1) uses regions   */
+        TEST_SETUP(".*", "0123456789ABCDEF", 0);
+        uregex_setRegion(re, 4, 6, &status);
+        TEST_ASSERT(uregex_find(re, -1, &status) == TRUE);
+        TEST_ASSERT(uregex_start(re, 0, &status) == 4);
+        TEST_ASSERT(uregex_end(re, 0, &status) == 6);
+        TEST_TEARDOWN;
+        
+        /* find (start >=0) does not use regions   */
+        TEST_SETUP(".*", "0123456789ABCDEF", 0);
+        uregex_setRegion(re, 4, 6, &status);
+        TEST_ASSERT(uregex_find(re, 0, &status) == TRUE);
+        TEST_ASSERT(uregex_start(re, 0, &status) == 0);
+        TEST_ASSERT(uregex_end(re, 0, &status) == 16);
+        TEST_TEARDOWN;
+         
+        /* findNext() obeys regions    */
+        TEST_SETUP(".", "0123456789ABCDEF", 0);
+        uregex_setRegion(re, 4, 6, &status);
+        TEST_ASSERT(uregex_findNext(re,&status) == TRUE);
+        TEST_ASSERT(uregex_start(re, 0, &status) == 4);
+        TEST_ASSERT(uregex_findNext(re, &status) == TRUE);
+        TEST_ASSERT(uregex_start(re, 0, &status) == 5);
+        TEST_ASSERT(uregex_findNext(re, &status) == FALSE);
+        TEST_TEARDOWN;
+
+        /* matches(start=-1) uses regions                                           */
+        /*    Also, verify that non-greedy *? succeeds in finding the full match.   */
+        TEST_SETUP(".*?", "0123456789ABCDEF", 0);
+        uregex_setRegion(re, 4, 6, &status);
+        TEST_ASSERT(uregex_matches(re, -1, &status) == TRUE);
+        TEST_ASSERT(uregex_start(re, 0, &status) == 4);
+        TEST_ASSERT(uregex_end(re, 0, &status) == 6);
+        TEST_TEARDOWN;
+        
+        /* matches (start >=0) does not use regions       */
+        TEST_SETUP(".*?", "0123456789ABCDEF", 0);
+        uregex_setRegion(re, 4, 6, &status);
+        TEST_ASSERT(uregex_matches(re, 0, &status) == TRUE);
+        TEST_ASSERT(uregex_start(re, 0, &status) == 0);
+        TEST_ASSERT(uregex_end(re, 0, &status) == 16);
+        TEST_TEARDOWN;
+        
+        /* lookingAt(start=-1) uses regions                                         */
+        /*    Also, verify that non-greedy *? finds the first (shortest) match.     */
+        TEST_SETUP(".*?", "0123456789ABCDEF", 0);
+        uregex_setRegion(re, 4, 6, &status);
+        TEST_ASSERT(uregex_lookingAt(re, -1, &status) == TRUE);
+        TEST_ASSERT(uregex_start(re, 0, &status) == 4);
+        TEST_ASSERT(uregex_end(re, 0, &status) == 4);
+        TEST_TEARDOWN;
+        
+        /* lookingAt (start >=0) does not use regions  */
+        TEST_SETUP(".*?", "0123456789ABCDEF", 0);
+        uregex_setRegion(re, 4, 6, &status);
+        TEST_ASSERT(uregex_lookingAt(re, 0, &status) == TRUE);
+        TEST_ASSERT(uregex_start(re, 0, &status) == 0);
+        TEST_ASSERT(uregex_end(re, 0, &status) == 0);
+        TEST_TEARDOWN;
+
+        /* hitEnd()       */
+        TEST_SETUP("[a-f]*", "abcdefghij", 0);
+        TEST_ASSERT(uregex_find(re, 0, &status) == TRUE);
+        TEST_ASSERT(uregex_hitEnd(re, &status) == FALSE);
+        TEST_TEARDOWN;
+
+        TEST_SETUP("[a-f]*", "abcdef", 0);
+        TEST_ASSERT(uregex_find(re, 0, &status) == TRUE);
+        TEST_ASSERT(uregex_hitEnd(re, &status) == TRUE);
+        TEST_TEARDOWN;
+
+        /* requireEnd   */
+        TEST_SETUP("abcd", "abcd", 0);
+        TEST_ASSERT(uregex_find(re, 0, &status) == TRUE);
+        TEST_ASSERT(uregex_requireEnd(re, &status) == FALSE);
+        TEST_TEARDOWN;
+
+        TEST_SETUP("abcd$", "abcd", 0);
+        TEST_ASSERT(uregex_find(re, 0, &status) == TRUE);
+        TEST_ASSERT(uregex_requireEnd(re, &status) == TRUE);
+        TEST_TEARDOWN;
+        
+        /* anchoringBounds        */
+        TEST_SETUP("abc$", "abcdef", 0);
+        TEST_ASSERT(uregex_hasAnchoringBounds(re, &status) == TRUE);
+        uregex_useAnchoringBounds(re, FALSE, &status);
+        TEST_ASSERT(uregex_hasAnchoringBounds(re, &status) == FALSE);
+        
+        TEST_ASSERT(uregex_find(re, -1, &status) == FALSE);
+        uregex_useAnchoringBounds(re, TRUE, &status);
+        uregex_setRegion(re, 0, 3, &status);
+        TEST_ASSERT(uregex_find(re, -1, &status) == TRUE);
+        TEST_ASSERT(uregex_end(re, 0, &status) == 3);
+        TEST_TEARDOWN;
+        
+        /* Transparent Bounds      */
+        TEST_SETUP("abc(?=def)", "abcdef", 0);
+        TEST_ASSERT(uregex_hasTransparentBounds(re, &status) == FALSE);
+        uregex_useTransparentBounds(re, TRUE, &status);
+        TEST_ASSERT(uregex_hasTransparentBounds(re, &status) == TRUE);
+        
+        uregex_useTransparentBounds(re, FALSE, &status);
+        TEST_ASSERT(uregex_find(re, -1, &status) == TRUE);    /* No Region */
+        uregex_setRegion(re, 0, 3, &status);
+        TEST_ASSERT(uregex_find(re, -1, &status) == FALSE);   /* with region, opaque bounds */
+        uregex_useTransparentBounds(re, TRUE, &status);
+        TEST_ASSERT(uregex_find(re, -1, &status) == TRUE);    /* with region, transparent bounds */
+        TEST_ASSERT(uregex_end(re, 0, &status) == 3);
+        TEST_TEARDOWN;
+        

    /*
     *  replaceFirst()
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@ -175,209 +175,6 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,



-//---------------------------------------------------------------------------
-//
-//    regex_find(pattern, inputString, lineNumber)
-//
-//         function to simplify writing tests regex tests.
-//
-//          The input text is unescaped.  The pattern is not.
-//          The input text is marked with the expected match positions
-//              <0>text  <1> more text </1>   </0>
-//          The <n> </n> tags are removed before trying the match.
-//          The tags mark the start and end of the match and of any capture groups.
-//
-//
-//---------------------------------------------------------------------------
-
-
-//  Set a value into a UVector at position specified by a decimal number in
-//   a UnicodeString.   This is a utility function needed by the actual test function,
-//   which follows.
-static void set(UVector &vec, int32_t val, UnicodeString index) {
-    UErrorCode  status=U_ZERO_ERROR;
-    int32_t  idx = 0;
-    for (int32_t i=0; i<index.length(); i++) {
-        int32_t d=u_charDigitValue(index.charAt(i));
-        if (d<0) {return;}
-        idx = idx*10 + d;
-    }
-    while (vec.size()<idx+1) {vec.addElement(-1, status);}
-    vec.setElementAt(val, idx);
-}
-
-void RegexTest::regex_find(const UnicodeString &pattern,
-                           const UnicodeString &flags,
-                           const UnicodeString &inputString,
-                           int32_t line) {
-    UnicodeString       unEscapedInput;
-    UnicodeString       deTaggedInput;
-
-    UErrorCode          status         = U_ZERO_ERROR;
-    UParseError         pe;
-    RegexPattern        *parsePat      = NULL;
-    RegexMatcher        *parseMatcher  = NULL;
-    RegexPattern        *callerPattern = NULL;
-    RegexMatcher        *matcher       = NULL;
-    UVector             groupStarts(status);
-    UVector             groupEnds(status);
-    UBool               isMatch        = FALSE;
-    UBool               failed         = FALSE;
-    int32_t                 numFinds;
-    int32_t                 i;
-
-    //
-    //  Compile the caller's pattern
-    //
-    uint32_t bflags = 0;
-    if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
-        bflags |= UREGEX_CASE_INSENSITIVE;
-    }
-    if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
-        bflags |= UREGEX_COMMENTS;
-    }
-    if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
-        bflags |= UREGEX_DOTALL;
-    }
-    if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
-        bflags |= UREGEX_MULTILINE;
-    }
-
-
-    callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
-    if (status != U_ZERO_ERROR) {
-        #if UCONFIG_NO_BREAK_ITERATION==1
-        // 'v' test flag means that the test pattern should not compile if ICU was configured
-        //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
-        if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
-            goto cleanupAndReturn;
-        }
-        #endif
-        errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
-        goto cleanupAndReturn;
-    }
-
-    if (flags.indexOf((UChar)'d') >= 0) {
-        RegexPatternDump(callerPattern);
-    }
-
-    //
-    // Number of times find() should be called on the test string, default to 1
-    //
-    numFinds = 1;
-    for (i=2; i<=9; i++) {
-        if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
-            if (numFinds != 1) {
-                errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
-                goto cleanupAndReturn;
-            }
-            numFinds = i;
-        }
-    }
-
-    //
-    //  Find the tags in the input data, remove them, and record the group boundary
-    //    positions.
-    //
-    parsePat = RegexPattern::compile("<(/?)([0-9]+)>", 0, pe, status);
-    REGEX_CHECK_STATUS_L(line);
-
-    unEscapedInput = inputString.unescape();
-    parseMatcher = parsePat->matcher(unEscapedInput, status);
-    REGEX_CHECK_STATUS_L(line);
-    while(parseMatcher->find()) {
-        parseMatcher->appendReplacement(deTaggedInput, "", status);
-        REGEX_CHECK_STATUS;
-        UnicodeString groupNum = parseMatcher->group(2, status);
-        if (parseMatcher->group(1, status) == "/") {
-            // close tag
-            set(groupEnds, deTaggedInput.length(), groupNum);
-        } else {
-            set(groupStarts, deTaggedInput.length(), groupNum);
-        }
-    }
-    parseMatcher->appendTail(deTaggedInput);
-    REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
-
-
-    //
-    // Do a find on the de-tagged input using the caller's pattern
-    //
-    matcher = callerPattern->matcher(deTaggedInput, status);
-    REGEX_CHECK_STATUS_L(line);
-    if (flags.indexOf((UChar)'t') >= 0) {
-        matcher->setTrace(TRUE);
-    }
-
-    for (i=0; i<numFinds; i++) {
-        isMatch = matcher->find();
-    }
-    matcher->setTrace(FALSE);
-
-    //
-    // Match up the groups from the find() with the groups from the tags
-    //
-
-    // number of tags should match number of groups from find operation.
-    // matcher->groupCount does not include group 0, the entire match, hence the +1.
-    //   G option in test means that capture group data is not available in the
-    //     expected results, so the check needs to be suppressed.
-    if (isMatch == FALSE && groupStarts.size() != 0) {
-        errln("Error at line %d:  Match expected, but none found.\n", line);
-        failed = TRUE;
-        goto cleanupAndReturn;
-    }
-
-    if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
-        // Only check for match / no match.  Don't check capture groups.
-        if (isMatch && groupStarts.size() == 0) {
-            errln("Error at line %d:  No match expected, but one found.\n", line);
-            failed = TRUE;
-        }
-        goto cleanupAndReturn;
-    }
-
-    for (i=0; i<=matcher->groupCount(); i++) {
-        int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
-        if (matcher->start(i, status) != expectedStart) {
-            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
-                line, i, expectedStart, matcher->start(i, status));
-            failed = TRUE;
-            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
-        }
-        int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
-        if (matcher->end(i, status) != expectedEnd) {
-            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
-                line, i, expectedEnd, matcher->end(i, status));
-            failed = TRUE;
-            // Error on end position;  keep going; real error is probably yet to come as group
-            //   end positions work from end of the input data towards the front.
-        }
-    }
-    if ( matcher->groupCount()+1 < groupStarts.size()) {
-        errln("Error at line %d: Expected %d capture groups, found %d.",
-            line, groupStarts.size()-1, matcher->groupCount());
-        failed = TRUE;
-        }
-
-cleanupAndReturn:
-    if (failed) {
-        errln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
-            +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
-        // callerPattern->dump();
-    }
-    delete parseMatcher;
-    delete parsePat;
-    delete matcher;
-    delete callerPattern;
-}
-
-
-
-
-
-
-

 //---------------------------------------------------------------------------
 //
@ -938,6 +735,87 @@ void RegexTest::API_Match() {
        delete m;
        delete p;
    }
+    
+    //
+    // Regions
+    //
+    {
+        UErrorCode status = U_ZERO_ERROR;
+        UnicodeString testString("This is test data");
+        RegexMatcher m(".*", testString,  0, status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(m.regionStart() == 0);
+        REGEX_ASSERT(m.regionEnd() == testString.length());
+        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
+        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
+        
+        m.region(2,4, status);
+        REGEX_CHECK_STATUS;
+        REGEX_ASSERT(m.matches(status));
+        REGEX_ASSERT(m.start(status)==2);
+        REGEX_ASSERT(m.end(status)==4);
+        REGEX_CHECK_STATUS;
+        
+        m.reset();
+        REGEX_ASSERT(m.regionStart() == 0);
+        REGEX_ASSERT(m.regionEnd() == testString.length());
+        
+        UnicodeString shorterString("short");
+        m.reset(shorterString);
+        REGEX_ASSERT(m.regionStart() == 0);
+        REGEX_ASSERT(m.regionEnd() == shorterString.length());
+        
+        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
+        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
+        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
+        REGEX_ASSERT(&m == &m.reset());
+        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
+        
+        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
+        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
+        REGEX_ASSERT(&m == &m.reset());
+        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
+    
+        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
+        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
+        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
+        REGEX_ASSERT(&m == &m.reset());
+        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
+
+        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
+        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
+        REGEX_ASSERT(&m == &m.reset());
+        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
+        
+    }
+    
+    //
+    // hitEnd() and requireEnd()
+    //
+    {
+        UErrorCode status = U_ZERO_ERROR;
+        UnicodeString testString("aabb");
+        RegexMatcher m1(".*", testString,  0, status);
+        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
+        REGEX_ASSERT(m1.hitEnd() == TRUE);
+        REGEX_ASSERT(m1.requireEnd() == FALSE);
+        REGEX_CHECK_STATUS;
+        
+        status = U_ZERO_ERROR;
+        RegexMatcher m2("a*", testString, 0, status);
+        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
+        REGEX_ASSERT(m2.hitEnd() == FALSE);
+        REGEX_ASSERT(m2.requireEnd() == FALSE);
+        REGEX_CHECK_STATUS;
+
+        status = U_ZERO_ERROR;
+        RegexMatcher m3(".*$", testString, 0, status);
+        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
+        REGEX_ASSERT(m3.hitEnd() == TRUE);
+        REGEX_ASSERT(m3.requireEnd() == TRUE);
+        REGEX_CHECK_STATUS;
+    }
+

    //
    // Compilation error on reset with UChar *
@ -1470,7 +1348,7 @@ void RegexTest::Extended() {
    }

    int32_t    len;
-    UChar *testData = ReadAndConvertFile(srcPath, len, status);
+    UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
    if (U_FAILURE(status)) {
        return; /* something went wrong, error already output */
    }
@ -1482,7 +1360,7 @@ void RegexTest::Extended() {

    RegexMatcher    quotedStuffMat("\\s*([\\'\\\"/])(.*?)\\1", 0, status);
    RegexMatcher    commentMat    ("\\s*(#.*)?$", 0, status);
-    RegexMatcher    flagsMat      ("\\s*([ixsmdtGv2-9]*)([:letter:]*)", 0, status);
+    RegexMatcher    flagsMat      ("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)", 0, status);

    RegexMatcher    lineMat("(.*?)\\r?\\n", testString, 0, status);
    UnicodeString   testPattern;   // The pattern for test from the test file.
@ -1581,6 +1459,295 @@ void RegexTest::Extended() {



+//---------------------------------------------------------------------------
+//
+//    regex_find(pattern, flags, inputString, lineNumber)
+//
+//         Function to run a single test from the Extended (data driven) tests.
+//         See file test/testdata/regextst.txt for a description of the
+//         pattern and inputString fields, and the allowed flags.
+//         lineNumber is the source line in regextst.txt of the test.
+//
+//---------------------------------------------------------------------------
+
+
+//  Set a value into a UVector at the position specified by a decimal number in
+//   a UnicodeString.   This is a utility function needed by the actual test function,
+//   which follows.
+//     vec    the vector to update; it is padded with -1 entries up to the index.
+//     val    the value to store.
+//     index  decimal digits giving the position.  If any character is not a
+//            decimal digit the vector is left unchanged.
+static void set(UVector &vec, int32_t val, UnicodeString index) {
+    UErrorCode  status=U_ZERO_ERROR;
+    int32_t  idx = 0;
+    for (int32_t i=0; i<index.length(); i++) {
+        int32_t d=u_charDigitValue(index.charAt(i));
+        if (d<0) {return;}
+        idx = idx*10 + d;
+    }
+    // Stop padding as soon as addElement() reports a failure.  If a failed add leaves
+    //   the size unchanged, looping on the size alone would never terminate.
+    while (U_SUCCESS(status) && vec.size()<idx+1) {vec.addElement(-1, status);}
+    if (U_FAILURE(status)) {return;}
+    vec.setElementAt(val, idx);
+}
+
+//  Run a single data-driven test.
+//     pattern      the regular expression to compile.
+//     flags        option letters; the ones acted on here are
+//                  i x s m e D (compile options), v E d (compilation handling),
+//                  2-9 (number of finds), M L (matches() / lookingAt() instead of find()),
+//                  t (trace), a b (bounds), G (skip capture group checks),
+//                  y Y z Z (expected requireEnd() / hitEnd() results).
+//     inputString  the escaped input text, tagged with <n>...</n> for the expected
+//                  capture group boundaries and <r>...</r> for the region.
+//     line         line number of the test, used in the error messages.
+void RegexTest::regex_find(const UnicodeString &pattern,
+                           const UnicodeString &flags,
+                           const UnicodeString &inputString,
+                           int32_t line) {
+    UnicodeString       unEscapedInput;
+    UnicodeString       deTaggedInput;
+
+    UErrorCode          status         = U_ZERO_ERROR;
+    UParseError         pe;
+    RegexPattern        *parsePat      = NULL;
+    RegexMatcher        *parseMatcher  = NULL;
+    RegexPattern        *callerPattern = NULL;
+    RegexMatcher        *matcher       = NULL;
+    UVector             groupStarts(status);
+    UVector             groupEnds(status);
+    UBool               isMatch        = FALSE;
+    UBool               failed         = FALSE;
+    int32_t             numFinds;
+    int32_t             i;
+    UBool               useMatchesFunc   = FALSE;
+    UBool               useLookingAtFunc = FALSE;
+    int32_t             regionStart      = -1;
+    int32_t             regionEnd        = -1;
+
+    //
+    //  Compile the caller's pattern
+    //
+    uint32_t bflags = 0;
+    if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
+        bflags |= UREGEX_CASE_INSENSITIVE;
+    }
+    if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
+        bflags |= UREGEX_COMMENTS;
+    }
+    if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
+        bflags |= UREGEX_DOTALL;
+    }
+    if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
+        bflags |= UREGEX_MULTILINE;
+    }
+
+    if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
+        bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
+    }
+    if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
+        bflags |= UREGEX_UNIX_LINES;
+    }
+
+
+    callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
+    if (status != U_ZERO_ERROR) {
+        #if UCONFIG_NO_BREAK_ITERATION==1
+        // 'v' test flag means that the test pattern should not compile if ICU was configured
+        //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
+        if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
+            goto cleanupAndReturn;
+        }
+        #endif
+        if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
+            // Expected pattern compilation error.
+            if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
+                logln("Pattern Compile returns \"%s\"", u_errorName(status));
+            }
+            goto cleanupAndReturn;
+        } else {
+            // Unexpected pattern compilation error.
+            errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
+            goto cleanupAndReturn;
+        }
+    }
+
+    if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
+        RegexPatternDump(callerPattern);
+    }
+
+    if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
+        // Report the test line, as the other messages from this function do.
+        errln("Line %d: Expected, but did not get, a pattern compilation error.", line);
+        goto cleanupAndReturn;
+    }
+
+
+    //
+    // Number of times find() should be called on the test string, default to 1
+    //
+    numFinds = 1;
+    for (i=2; i<=9; i++) {
+        if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
+            if (numFinds != 1) {
+                errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
+                goto cleanupAndReturn;
+            }
+            numFinds = i;
+        }
+    }
+
+    // 'M' flag.  Use matches() instead of find()
+    if (flags.indexOf((UChar)0x4d) >= 0) {
+        useMatchesFunc = TRUE;
+    }
+    // 'L' flag.  Use lookingAt() instead of find()
+    if (flags.indexOf((UChar)0x4c) >= 0) {
+        useLookingAtFunc = TRUE;
+    }
+
+    //
+    //  Find the tags in the input data, remove them, and record the group boundary
+    //    positions.
+    //
+    parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
+    REGEX_CHECK_STATUS_L(line);
+
+    unEscapedInput = inputString.unescape();
+    parseMatcher = parsePat->matcher(unEscapedInput, status);
+    REGEX_CHECK_STATUS_L(line);
+    while(parseMatcher->find()) {
+        parseMatcher->appendReplacement(deTaggedInput, "", status);
+        REGEX_CHECK_STATUS_L(line);
+        UnicodeString groupNum = parseMatcher->group(2, status);
+        if (groupNum == "r") {
+            // <r> or </r>, a region specification within the string
+            if (parseMatcher->group(1, status) == "/") {
+                regionEnd = deTaggedInput.length();
+            } else {
+                regionStart = deTaggedInput.length();
+            }
+        } else {
+            // <digits> or </digits>, a group match boundary tag.
+            if (parseMatcher->group(1, status) == "/") {
+                set(groupEnds, deTaggedInput.length(), groupNum);
+            } else {
+                set(groupStarts, deTaggedInput.length(), groupNum);
+            }
+        }
+    }
+    parseMatcher->appendTail(deTaggedInput);
+    REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
+    // A region needs both tags, with the start tag not after the end tag.
+    if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
+      errln("Line %d: mismatched <r> tags", line);
+      failed = TRUE;
+      goto cleanupAndReturn;
+    }
+
+
+    //
+    //  Configure the matcher according to the flags specified with this test.
+    //
+    matcher = callerPattern->matcher(deTaggedInput, status);
+    REGEX_CHECK_STATUS_L(line);
+    if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
+        matcher->setTrace(TRUE);
+    }
+    if (regionStart>=0) {
+       matcher->region(regionStart, regionEnd, status);
+       REGEX_CHECK_STATUS_L(line);
+    }
+    if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
+        matcher->useAnchoringBounds(FALSE);
+    }
+    if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
+        matcher->useTransparentBounds(TRUE);
+    }
+
+
+
+    //
+    // Do a find on the de-tagged input using the caller's pattern
+    //     TODO: error on count>1 and not find().
+    //           error on both matches() and lookingAt().
+    //
+    for (i=0; i<numFinds; i++) {
+        if (useMatchesFunc) {
+            isMatch = matcher->matches(status);
+        } else  if (useLookingAtFunc) {
+            isMatch = matcher->lookingAt(status);
+        } else {
+            isMatch = matcher->find();
+        }
+    }
+    matcher->setTrace(FALSE);
+    // matches() and lookingAt() report problems through status.  Without this check
+    //   a failure in the match operation would be indistinguishable from "no match".
+    if (U_FAILURE(status)) {
+        errln("Error at line %d.  ICU ErrorCode is %s", line, u_errorName(status));
+        failed = TRUE;
+        goto cleanupAndReturn;
+    }
+
+    //
+    // Match up the groups from the find() with the groups from the tags
+    //
+
+    // number of tags should match number of groups from find operation.
+    // matcher->groupCount does not include group 0, the entire match, hence the +1.
+    //   G option in test means that capture group data is not available in the
+    //     expected results, so the check needs to be suppressed.
+    if (isMatch == FALSE && groupStarts.size() != 0) {
+        errln("Error at line %d:  Match expected, but none found.\n", line);
+        failed = TRUE;
+        goto cleanupAndReturn;
+    }
+
+    if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
+        // Only check for match / no match.  Don't check capture groups.
+        if (isMatch && groupStarts.size() == 0) {
+            errln("Error at line %d:  No match expected, but one found.\n", line);
+            failed = TRUE;
+        }
+        goto cleanupAndReturn;
+    }
+
+    // Groups without a tag in the expected data are expected to report -1.
+    for (i=0; i<=matcher->groupCount(); i++) {
+        int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
+        if (matcher->start(i, status) != expectedStart) {
+            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
+                line, i, expectedStart, matcher->start(i, status));
+            failed = TRUE;
+            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
+        }
+        int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
+        if (matcher->end(i, status) != expectedEnd) {
+            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
+                line, i, expectedEnd, matcher->end(i, status));
+            failed = TRUE;
+            // Error on end position;  keep going; real error is probably yet to come as group
+            //   end positions work from end of the input data towards the front.
+        }
+    }
+    if ( matcher->groupCount()+1 < groupStarts.size()) {
+        errln("Error at line %d: Expected %d capture groups, found %d.",
+            line, groupStarts.size()-1, matcher->groupCount());
+        failed = TRUE;
+    }
+
+    if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
+        matcher->requireEnd() == TRUE) {
+        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
+        failed = TRUE;
+    }
+    if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
+        matcher->requireEnd() == FALSE) {
+        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
+        failed = TRUE;
+    }
+    if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
+        matcher->hitEnd() == TRUE) {
+        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
+        failed = TRUE;
+    }
+    if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
+        matcher->hitEnd() == FALSE) {
+        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
+        failed = TRUE;
+    }
+
+
+cleanupAndReturn:
+    if (failed) {
+        // Echo the test case so the failing entry can be identified.
+        errln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
+            +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
+        // callerPattern->dump();
+    }
+    delete parseMatcher;
+    delete parsePat;
+    delete matcher;
+    delete callerPattern;
+}
+
+
+
+
 //---------------------------------------------------------------------------
 //
 //      Errors     Check for error handling in patterns.
@ -1633,10 +1800,6 @@ void RegexTest::Errors() {
    REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
    REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);

-
-    // UnicodeSet containing a string
-    REGEX_ERR("abc[{def}]xyz", 1, 10, U_REGEX_SET_CONTAINS_STRING);
-
    // Ticket 5389
    REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);

@ -1649,7 +1812,8 @@ void RegexTest::Errors() {
 //    in one big UChar * buffer, which the caller must delete.
 //
 //--------------------------------------------------------------------------------
-UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen, UErrorCode &status) {
+UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
+                                     const char *defEncoding, UErrorCode &status) {
    UChar       *retPtr  = NULL;
    char        *fileBuf = NULL;
    UConverter* conv     = NULL;
@ -1698,6 +1862,11 @@ UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen, UError
    if(encoding!=NULL ){
        fileBufC  += signatureLength;
        fileSize  -= signatureLength;
+    } else {
+        encoding = defEncoding;
+        if (strcmp(encoding, "utf-8") == 0) {
+            errln("file %s is missing its BOM", fileName);
+        }
    }

    //
@ -1804,7 +1973,7 @@ void RegexTest::PerlTests() {
    }

    int32_t    len;
-    UChar *testData = ReadAndConvertFile(srcPath, len, status);
+    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
    if (U_FAILURE(status)) {
        return; /* something went wrong, error already output */
    }
@ -1981,6 +2150,14 @@ void RegexTest::PerlTests() {
                lineNum, expected?"":"no ", found?"":"no " );
            continue;
        }
+        
+        // Don't try to check expected results if there is no match.
+        //   (Some have stuff in the expected fields)
+        if (!found) {
+            delete testMat;
+            delete testPat;
+            continue;
+        }

        //
        // Interpret the Perl expression from the fourth field of the data file,
--- a/icu4c/source/test/intltest/regextst.h
+++ b/icu4c/source/test/intltest/regextst.h
@ -1,5 +1,5 @@
 /********************************************************************
- * COPYRIGHT: 
+ * COPYRIGHT:
 * Copyright (c) 2002-2007, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
@ -16,7 +16,7 @@

 class RegexTest: public IntlTest {
 public:
-  
+
    RegexTest();
    virtual ~RegexTest();

@ -37,7 +37,7 @@ public:
        const UnicodeString &input, int32_t line);
    virtual void regex_err(const char *pat, int32_t errline, int32_t errcol,
                            UErrorCode expectedStatus, int32_t line);
-    virtual UChar *ReadAndConvertFile(const char *fileName, int32_t &len, UErrorCode &status);
+    virtual UChar *ReadAndConvertFile(const char *fileName, int32_t &len, const char *charset, UErrorCode &status);
    virtual const char *getPath(char buffer[2048], const char *filename);

 };
--- a/icu4c/source/test/testdata/re_tests.txt
+++ b/icu4c/source/test/testdata/re_tests.txt
@ -62,8 +62,8 @@ ab{0,1}c	abc	y	$&	abc
 ^abc$	aabc	n	-	-
 abc$	aabc	y	$&	abc
 abc$	aabcd	n	-	-
-^	abc	y	$&	
-$	abc	y	$&	
+^	abc	y	$&
+$	abc	y	$&
 a.c	abc	y	$&	abc
 a.c	axc	y	$&	axc
 a.*c	axyzc	y	$&	axyzc
@ -79,13 +79,13 @@ a[b-a]	-	c	-	Invalid [] range "b-a"
 a[]b	-	ci	-	Unmatched [
 a[	-	c	-	Unmatched [
 a]	a]	y	$&	a]
-a[]]b	a]b	yi	$&	a]b	ICU makes [] into an empty set.
+a[]]b	a]b	y	$&	a]b
 a[^bc]d	aed	y	$&	aed
 a[^bc]d	abd	n	-	-
-a[^-b]c	adc	yi	$&	adc	ICU [] set rules
-a[^-b]c	a-c	ni	-	-	ICU [] set rules
+a[^-b]c	adc	y	$&	adc
+a[^-b]c	a-c	n	-	-
 a[^]b]c	a]c	n	-	-
-a[^]b]c	adc	yi	$&	adc     ICU [] set rules.
+a[^]b]c	adc	y	$&	adc
 \ba\b	a-	y	-	-
 \ba\b	-a	y	-	-
 \ba\b	-a-	y	-	-
@ -113,18 +113,18 @@ a\Sb	a-b	y	-	-
 \d	-	n	-	-
 \D	1	n	-	-
 \D	-	y	-	-
-[\w]	a	iy	-	-
-[\w]	-	in	-	-
-[\W]	a	in	-	-
-[\W]	-	iy	-	-
-a[\s]b	a b	iy	-	-
-a[\s]b	a-b	in	-	-
-a[\S]b	a b	in	-	-
-a[\S]b	a-b	iy	-	-
-[\d]	1	iy	-	-
-[\d]	-	in	-	-
-[\D]	1	in	-	-
-[\D]	-	iy	-	-
+[\w]	a	y	-	-
+[\w]	-	n	-	-
+[\W]	a	n	-	-
+[\W]	-	y	-	-
+a[\s]b	a b	y	-	-
+a[\s]b	a-b	n	-	-
+a[\S]b	a b	n	-	-
+a[\S]b	a-b	y	-	-
+[\d]	1	y	-	-
+[\d]	-	n	-	-
+[\D]	1	n	-	-
+[\D]	-	y	-	-
 ab|cd	abc	y	$&	ab
 ab|cd	abcd	y	$&	ab
 ()ef	def	y	$&-$1	ef-
@ -167,7 +167,7 @@ a.+?c	abcabc	y	$&	abc
 )(	-	c	-	Unmatched )
 [^ab]*	cde	y	$&	cde
 abc		n	-	-
-a*		y	$&	
+a*		y	$&
 ([abc])*d	abbbcd	y	$&-$1	abbbcd-c
 ([abc])*bcd	abcd	y	$&-$1	abcd-a
 a|b|c|d|e	e	y	$&	e
@ -292,8 +292,8 @@ a[-]?c	ac	y	$&	ac
 '^abc'i	ABCC	y	$&	ABC
 '^abc$'i	AABC	n	-	-
 'abc$'i	AABC	y	$&	ABC
-'^'i	ABC	y	$&	
-'$'i	ABC	y	$&	
+'^'i	ABC	y	$&
+'$'i	ABC	y	$&
 'a.c'i	ABC	y	$&	ABC
 'a.c'i	AXC	y	$&	AXC
 'a.*?c'i	AXYZC	y	$&	AXYZC
@ -309,13 +309,13 @@ a[-]?c	ac	y	$&	ac
 'a[]b'i	-	ci	-	Unmatched [
 'a['i	-	c	-	Unmatched [
 'a]'i	A]	y	$&	A]
-'a[]]b'i	A]B	yi	$&	A]B
+'a[]]b'i	A]B	y	$&	A]B
 'a[^bc]d'i	AED	y	$&	AED
 'a[^bc]d'i	ABD	n	-	-
-'a[^-b]c'i	ADC	yi	$&	ADC	ICU [] set rules
-'a[^-b]c'i	A-C	ni	-	-	ICU [] set rules
+'a[^-b]c'i	ADC	y	$&	ADC
+'a[^-b]c'i	A-C	n	-	-
 'a[^]b]c'i	A]C	n	-	-
-'a[^]b]c'i	ADC	yi	$&	ADC
+'a[^]b]c'i	ADC	y	$&	ADC
 'ab|cd'i	ABC	y	$&	AB
 'ab|cd'i	ABCD	y	$&	AB
 '()ef'i	DEF	y	$&-$1	EF-
@ -347,7 +347,7 @@ a[-]?c	ac	y	$&	ac
 ')('i	-	c	-	Unmatched )
 '[^ab]*'i	CDE	y	$&	CDE
 'abc'i		n	-	-
-'a*'i		y	$&	
+'a*'i		y	$&
 '([abc])*d'i	ABBBCD	y	$&-$1	ABBBCD-C
 '([abc])*bcd'i	ABCD	y	$&-$1	ABCD-A
 'a|b|c|d|e'i	E	y	$&	E
@ -357,7 +357,7 @@ a[-]?c	ac	y	$&	ac
 'ab*'i	XAYABBBZ	y	$&	A
 '(ab|cd)e'i	ABCDE	y	$&-$1	CDE-CD
 '[abhgefdc]ij'i	HIJ	y	$&	HIJ
-'^(ab|cd)e'i	ABCDE	ni	x$1y	XY
+'^(ab|cd)e'i	ABCDE	n	x$1y	XY
 '(abc|)ef'i	ABCDEF	y	$&-$1	EF-
 '(a|b)c*d'i	ABCD	y	$&-$1	BCD-B
 '(ab|ab*)bc'i	ABC	y	$&-$1	ABC-A
@ -486,7 +486,7 @@ foo\w*\d{4}baz	foobar1234baz	y	$&	foobar1234baz
 a(?{})b	cabd	y	$&	ab
 a(?{)b	-	c	-	Sequence (?{...}) not terminated or not {}-balanced
 a(?{{})b	-	c	-	Sequence (?{...}) not terminated or not {}-balanced
-a(?{}})b	-	c	-	
+a(?{}})b	-	c	-
 a(?{"{"})b	-	c	-	Sequence (?{...}) not terminated or not {}-balanced
 a(?{"\{"})b	cabd	y	$&	ab
 a(?{"{"}})b	-	c	-	Unmatched right curly bracket
@ -546,50 +546,50 @@ $(?<=^(a))	a	y	$1	a
 ^(?=(a+?))\1ab	aaab	n	-	-
 ([\w:]+::)?(\w+)$	abcd:	n	-	-
 ([\w:]+::)?(\w+)$	abcd	y	$1-$2	-abcd
-([\w:]+::)?(\w+)$	xy:z:::abcd	iy	$1-$2	xy:z:::-abcd
+([\w:]+::)?(\w+)$	xy:z:::abcd	y	$1-$2	xy:z:::-abcd
 ^[^bcd]*(c+)	aexycd	y	$1	c
 (a*)b+	caab	y	$1	aa
 ([\w:]+::)?(\w+)$	abcd:	n	-	-
 ([\w:]+::)?(\w+)$	abcd	y	$1-$2	-abcd
-([\w:]+::)?(\w+)$	xy:z:::abcd	iy	$1-$2	xy:z:::-abcd
+([\w:]+::)?(\w+)$	xy:z:::abcd	y	$1-$2	xy:z:::-abcd
 ^[^bcd]*(c+)	aexycd	y	$1	c
 (?{$a=2})a*aa(?{local$a=$a+1})k*c(?{$b=$a})	yaaxxaaaacd	y	$b	3
 (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})	yaaxxaaaacd	y	$b	4
 (>a+)ab	aaab	n	-	-
 (?>a+)b	aaab	y	-	-
-([[:]+)	a:[b]:	iy	$1	:[
-([[=]+)	a=[b]=	iy	$1	=[
-([[.]+)	a.[b].	iy	$1	.[
+([[:]+)	a:[b]:	yi	$1	:[	 Java and ICU dont escape [[xyz
+([[=]+)	a=[b]=	yi	$1	=[	 Java and ICU dont escape [[xyz
+([[.]+)	a.[b].	yi	$1	.[	 Java and ICU dont escape [[xyz
 [a[:xyz:	-	c	-	Unmatched [
 [a[:xyz:]	-	c	-	POSIX class [:xyz:] unknown
-[a[:]b[:c]	abc	iy	$&	abc
+[a[:]b[:c]	abc	yi	$&	abc	  Java and ICU embedded [ is nested set
 ([a[:xyz:]b]+)	pbaq	c	-	POSIX class [:xyz:] unknown
-[a[:]b[:c]	abc	iy	$&	abc
+[a[:]b[:c]	abc	iy	$&	abc	  Java and ICU embedded [ is nested set
 ([[:alpha:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	ABcd
-([[:alnum:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yi	$1	ABcd01Xy
+([[:alnum:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	ABcd01Xy
 ([[:ascii:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	ABcd01Xy__--  ${nulnul}
-([[:cntrl:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yi	$1	${nulnul}
-([[:digit:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yi	$1	01
-([[:graph:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yi	$1	ABcd01Xy__--
+([[:cntrl:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	${nulnul}
+([[:digit:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	01
+([[:graph:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	ABcd01Xy__--
 ([[:lower:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	cd
-([[:print:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yi	$1	ABcd01Xy__--  
-([[:punct:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yi	$1	__--
-([[:space:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yi	$1	  
+([[:print:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	ABcd01Xy__--  
+([[:punct:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	__--
+([[:space:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	  
 ([[:word:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yi	$1	ABcd01Xy__
 ([[:upper:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	AB
-([[:xdigit:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yi	$1	ABcd01
+([[:xdigit:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	ABcd01
 ([[:^alpha:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	01
-([[:^alnum:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yi	$1	__--  ${nulnul}${ffff}
+([[:^alnum:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	__--  ${nulnul}${ffff}
 ([[:^ascii:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	${ffff}
-([[:^cntrl:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yi	$1	ABcd01Xy__--  
-([[:^digit:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yi	$1	ABcd
+([[:^cntrl:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	ABcd01Xy__--  
+([[:^digit:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	ABcd
 ([[:^lower:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	AB
-([[:^print:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yi	$1	${nulnul}${ffff}
-([[:^punct:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yi	$1	ABcd01Xy
-([[:^space:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yi	$1	ABcd01Xy__--
+([[:^print:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	${nulnul}${ffff}
+([[:^punct:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	ABcd01Xy
+([[:^space:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	ABcd01Xy__--
 ([[:^word:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yi	$1	--  ${nulnul}${ffff}
 ([[:^upper:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	cd01
-([[:^xdigit:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	yi	$1	Xy__--  ${nulnul}${ffff}
+([[:^xdigit:]]+)	ABcd01Xy__--  ${nulnul}${ffff}	y	$1	Xy__--  ${nulnul}${ffff}
 [[:foo:]]	-	c	-	POSIX class [:foo:] unknown
 [[:^foo:]]	-	c	-	POSIX class [:^foo:] unknown
 ((?>a+)b)	aaab	y	$1	aaab
@ -823,11 +823,11 @@ foo.bart	foo.bart	y	-	-
 .[X][X](.+)+[X]	bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa	ni	-	-
 tt+$	xxxtt	y	-	-
 ([a-\d]+)	za-9z	yi	$1	a-9
-([\d-z]+)	a0-za	yi	$1	0-z
-([\d-\s]+)	a0- z	yi	$1	0- 
-([a-[:digit:]]+)	za-9z	iy	$1	a-9
-([[:digit:]-z]+)	=0-z=	iy	$1	0-z
-([[:digit:]-[:alpha:]]+)	=0-z=	iy	$1	0-z
+([\d-z]+)	a0-za	y	$1	0-z
+([\d-\s]+)	a0- z	y	$1	0- 
+([a-[:digit:]]+)	za-9z	y	$1	a-9
+([[:digit:]-z]+)	=0-z=	y	$1	0-z
+([[:digit:]-[:alpha:]]+)	=0-z=	iy	$1	0-z	 Set difference in ICU
 \GX.*X	aaaXbX	n	-	-
 (\d+\.\d+)	3.1415926	y	$1	3.1415926
 (\ba.{0,10}br)	have a web browser	y	$1	a web br
@ -857,7 +857,7 @@ tt+$	xxxtt	y	-	-
 ^([^,]{0,3},){0,3}d	aaa,b,c,d	y	$1	c,
 (?i)		y	-	-
 '(?!\A)x'm	a\nxb\n	y	-	-
-^(a(b)?)+$	aba	iy	-$1-$2-	-a--
+^(a(b)?)+$	aba	yi	-$1-$2-	-a--	Java disagrees.  Not clear who is right.
 '^.{9}abc.*\n'm	123\nabcabcabcabc\n	y	-	-
 ^(a)?a$	a	y	-$1-	--
 ^(a)?(?(1)a|b)+$	a	n	-	-
--- a/icu4c/source/test/testdata/regextst.txt
+++ b/icu4c/source/test/testdata/regextst.txt
@ -1,7 +1,7 @@
 # Copyright (c) 2001-2007 International Business Machines
 # Corporation and others. All Rights Reserved.
 #
-#  file:  
+#  file:
 #
 #   ICU regular expression test cases.
 #
@ -10,24 +10,161 @@
 #               <pattern>      =  "<regular expression pattern>"
 #               <match string> =  "<tagged string>"
 #                                 the quotes on the pattern and match string can be " or ' or /
-#               <tagged string> = text, with the start and end of each 
+#               <tagged string> = text, with the start and end of each
 #                                 capture group tagged with <n>...</n>.  The overall match,
 #                                 if any, is group 0, as in <0>matched text</0>
-#               <flags>         = any combination of 
+#
+#                                  A region can be specified with <r>...</r> tags.
+#
+#               <flags>         = any combination of
 #                                   i      case insensitive match
 #                                   x      free spacing and comments
 #                                   s      dot-matches-all mode
-#                                   m      multi-line mode.  $ and ^ match at embedded new-lines
+#                                   m      multi-line mode.  
+#                                            ($ and ^ match at embedded new-lines)
+#                                   D      Unix Lines mode (only recognize 0x0a as new-line)
 #                                   v      If icu configured without break iteration, this
 #                                          regex test pattern should not compile.
+#                                   e      set the UREGEX_ERROR_ON_UNKNOWN_ESCAPES flag
 #                                   d      dump the compiled pattern
 #                                   t      trace operation of match engine.
-#                                   2-9    a digit between 2 and 9, specifies the number of 
+#                                   2-9    a digit between 2 and 9, specifies the number of
 #                                          times to execute find().  The expected results are
 #                                          for the last find() in the sequence.
+#                                   G      Only check match / no match.  Do not check capture groups.
+#                                   E      Pattern compilation error expected
+#                                   L      Use LookingAt() rather than find()
+#                                   M      Use matches() rather than find().
+#
+#                                   a      Use non-Anchoring Bounds.
+#                                   b      Use Transparent Bounds.
+#                                          The a and b options only make a difference if
+#                                          a <r>region</r> has been specified in the string.
+#                                   z|Z    hitEnd was expected(z) or not expected (Z).
+#                                          With neither, hitEnd is not checked.
+#                                   y|Y    Require End expected(y) or not expected (Y).
+#
 #                                 White space must be present between the flags and the match string.
 #

+# Look-ahead expressions
+#
+"abc(?=def)"                   "<0>abc</0>def"
+"(.*)(?=c)"                    "<0><1>ab</1></0>cdef"
+
+"(?:.*)(?=c)"                  "<r>ab</r>cdef"
+"(?:.*)(?=c)"             b    "<r><0>ab</0></r>cdef"      # transparent bounds
+"(?:.*)(?=c)"             bM   "<r><0>ab</0></r>cdef"      # transparent bounds
+
+"(?:.*)(?=(c))"           b    "<0>ab</0><1>c</1>def"      # Capture in look-ahead
+"(?=(.)\1\1)\1"                "abcc<0><1>d</1></0>ddefg"  # Backrefs to look-ahead capture
+
+".(?!\p{L})"                   "abc<0>d</0> "              # Negated look-ahead
+".(?!(\p{L}))"                 "abc<0>d</0> "              # Negated look-ahead, no capture
+                                                           #   visible outside of look-ahead
+"and(?=roid)"            L     "<0>and</0>roid"
+"and(?=roid)"            M     "<r>and</r>roid"
+"and(?=roid)"            bM    "<r><0>and</0></r>roid"
+
+"and(?!roid)"            L     "<0>and</0>roix"
+"and(?!roid)"            L     "android"
+
+"and(?!roid)"            M     "<r><0>and</0></r>roid"     # Opaque bounds
+"and(?!roid)"            bM    "<r>and</r>roid"
+"and(?!roid)"            bM    "<r><0>and</0></r>roix"
+
+#
+# Negated Lookahead, various regions and region transparency
+#
+"abc(?!def)"                   "<0>abc</0>xyz"
+"abc(?!def)"                   "abcdef"
+"abc(?!def)"                   "<r><0>abc</0></r>def"
+"abc(?!def)"              b    "<r>abc</r>def"
+"abc(?!def)"              b    "<r><0>abc</0></r>xyz"
+
+#
+#  Anchoring Bounds
+#
+"^def$"                        "abc<r><0>def</0></r>ghi"           # anchoring (default) bounds
+"^def$"                  a     "abc<r>def</r>ghi"                  # non-anchoring bounds
+"^def"                   a     "<r><0>def</0></r>ghi"              # non-anchoring bounds
+"def$"                   a     "abc<r><0>def</0></r>"              # non-anchoring bounds
+
+"^.*$"                   m     "<0>line 1</0>\n line 2"
+"^.*$"                   m2    "line 1\n<0> line 2</0>"
+"^.*$"                   m3    "line 1\n line 2"
+"^.*$"                   m     "li<r><0>ne </0></r>1\n line 2"     # anchoring bounds
+"^.*$"                   m2    "li<r>ne </r>1\n line 2"            # anchoring bounds
+"^.*$"                  am     "li<r>ne </r>1\n line 2"            # non-anchoring bounds
+"^.*$"                  am     "li\n<r><0>ne </0></r>\n1\n line 2" # non-anchoring bounds
+
+#
+#  HitEnd and RequireEnd for new-lines just before end-of-input
+#
+"xyz$"                  yz     "<0>xyz</0>\n"
+"xyz$"                  yz     "<0>xyz</0>\x{d}\x{a}"
+
+"xyz$"                 myz     "<0>xyz</0>"                        # multi-line mode
+"xyz$"                 mYZ     "<0>xyz</0>\n" 
+"xyz$"                 mYZ     "<0>xyz</0>\r\n"
+"xyz$"                 mYZ     "<0>xyz</0>\x{85}abcd"
+
+"xyz$"                  Yz     "xyz\nx"
+"xyz$"                  Yz     "xyza"
+"xyz$"                  yz     "<0>xyz</0>"
+
+#
+#  All Unicode line endings recognized.
+#     0a, 0b, 0c, 0d, 0x85, 0x2028, 0x2029
+#     Multi-line and non-multiline mode take different paths, so repeated tests.
+#
+"^def$"                 mYZ    "abc\x{a}<0>def</0>\x{a}ghi"
+"^def$"                 mYZ    "abc\x{b}<0>def</0>\x{b}ghi"
+"^def$"                 mYZ    "abc\x{c}<0>def</0>\x{c}ghi"
+"^def$"                 mYZ    "abc\x{d}<0>def</0>\x{d}ghi"
+"^def$"                 mYZ    "abc\x{85}<0>def</0>\x{85}ghi"
+"^def$"                 mYZ    "abc\x{2028}<0>def</0>\x{2028}ghi"
+"^def$"                 mYZ    "abc\x{2029}<0>def</0>\x{2029}ghi"
+"^def$"                 mYZ    "abc\r\n<0>def</0>\r\nghi"
+
+"^def$"                 yz     "<0>def</0>\x{a}"
+"^def$"                 yz     "<0>def</0>\x{b}"
+"^def$"                 yz     "<0>def</0>\x{c}"
+"^def$"                 yz     "<0>def</0>\x{d}"
+"^def$"                 yz     "<0>def</0>\x{85}"
+"^def$"                 yz     "<0>def</0>\x{2028}"
+"^def$"                 yz     "<0>def</0>\x{2029}"
+"^def$"                 yz     "<0>def</0>\r\n"
+"^def$"                 yz     "<0>def</0>"
+
+
+"^def$"                       "<0>def</0>\x{2028"    #TODO: should be an error of some sort.
+
+#
+#  UNIX_LINES mode
+#
+"abc$"                 D      "<0>abc</0>\n"
+"abc$"                 D      "abc\r"
+"abc$"                 D      "abc\u0085"
+"a.b"                  D      "<0>a\rb</0>"
+"a.b"                  D      "a\nb"
+"(?d)abc$"                    "<0>abc</0>\n"
+"(?d)abc$"                    "abc\r"
+"abc$"                 mD     "<0>abc</0>\ndef"
+"abc$"                 mD     "abc\rdef"
+
+".*def"                L      "abc\r def xyz"          # Normal mode, LookingAt() stops at \r
+".*def"                DL     "<0>abc\r def</0> xyz"   # Unix Lines mode, \r not line end.
+".*def"                DL     "abc\n def xyz"   
+
+"(?d)a.b"                     "a\nb"
+"(?d)a.b"                     "<0>a\rb</0>"
+
+"^abc"                 m      "xyz\r<0>abc</0>"
+"^abc"                 Dm     "xyz\rabc"
+"^abc"                 Dm     "xyz\n<0>abc</0>"
+
+

 # Capturing parens
 ".(..)."                       "<0>a<1>bc</1>d</0>"
@ -97,6 +234,16 @@
 "(?w:.+?(\b\S.+?\b).*)"          v   "<0>  <1>don't</1>   </0>"
 "(?w:(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?).*)"     v "<0><1>.</1><2> </2><3>,</3><4>:</4><5>$</5><6>37,000.50</6><7> </7>   </0>"

+#
+#  Unicode word boundaries with Regions
+#
+"(?w).*?\b"                      v   "abc<r><0>def</0></r>ghi"
+"(?w).*?\b"                      v2  "abc<r>def<0></0></r>ghi"
+"(?w).*?\b"                      v3  "abc<r>def</r>ghi"
+#"(?w).*?\b"                      vb  "abc<r><0>def</0></r>ghi"    # TODO:  bug.  Ticket 6073
+#"(?w).*?\b"                      vb2 "abc<r>def</r>ghi"
+
+

 # . does not match new-lines
 "."                            "\u000a\u000d\u0085\u000c\u000b\u2028\u2029<0>X</0>\u000aY"
@ -128,20 +275,20 @@
 ".*^(Hello)"                   " Hello Hello Hello Hello Goodbye"# No Match

 # $ matches only at end of line, or before a newline preceding the end of line
-".*?(Goodbye)$"                "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
-".*?(Goodbye)"                 "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye"
-".*?(Goodbye)$"                "Hello Goodbye> Goodbye Goodbye "# No Match
+".*?(Goodbye)$"           zy   "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
+".*?(Goodbye)"            ZY   "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye"
+".*?(Goodbye)$"           z    "Hello Goodbye> Goodbye Goodbye "# No Match

-".*?(Goodbye)$"                "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
-".*?(Goodbye)$"                "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
-".*?(Goodbye)$"                "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\r\n"
-".*?(Goodbye)$"                "Hello Goodbye Goodbye Goodbye\n\n"# No Match
+".*?(Goodbye)$"           zy   "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
+".*?(Goodbye)$"           zy   "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
+".*?(Goodbye)$"           zy   "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\r\n"
+".*?(Goodbye)$"           z    "Hello Goodbye Goodbye Goodbye\n\n"# No Match

 # \Z matches at end of input, like $ with default flags.
-".*?(Goodbye)\Z"               "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
-".*?(Goodbye)"                 "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye"
-".*?(Goodbye)\Z"               "Hello Goodbye> Goodbye Goodbye "# No Match
-"here$"                        "here\nthe end"# No Match
+".*?(Goodbye)\Z"          zy   "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
+".*?(Goodbye)"            ZY   "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye"
+".*?(Goodbye)\Z"          z    "Hello Goodbye> Goodbye Goodbye "# No Match
+"here$"                   z    "here\nthe end"# No Match

 ".*?(Goodbye)\Z"               "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
 ".*?(Goodbye)\Z"               "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
@ -151,12 +298,13 @@
 # \z matches only at the end of string.
 #    no special treatment of new lines.
 #    no dependencies on flag settings.
-".*?(Goodbye)\z"               "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
-".*?(Goodbye)\z"               "Hello Goodbye Goodbye Goodbye "# No Match
-"here$"                        "here\nthe end"# No Match
+".*?(Goodbye)\z"          zy   "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
+".*?(Goodbye)\z"          z    "Hello Goodbye Goodbye Goodbye "# No Match
+"here$"                   z    "here\nthe end"# No Match

-".*?(Goodbye)\z"               "Hello Goodbye Goodbye Goodbye\n"# No Match
-".*?(Goodbye)\n\z"             "<0>Hello Goodbye Goodbye <1>Goodbye</1>\n</0>"
+".*?(Goodbye)\z"          z    "Hello Goodbye Goodbye Goodbye\n"# No Match
+".*?(Goodbye)\n\z"        zy   "<0>Hello Goodbye Goodbye <1>Goodbye</1>\n</0>"
+"abc\z|def"               ZY   "abc<0>def</0>"

 # (?# comment) doesn't muck up pattern
 "Hello (?# this is a comment) world"  "  <0>Hello  world</0>..."
@ -180,6 +328,61 @@
 "(x?)*xyz"                     "<0>xx<1></1>xyz</0>"    # Slightly weird, but correct.  The "last" time through (x?),
                                                        #   it matches the empty string.

+# Set expressions, basic operators and escapes work
+#
+"[\d]+"                        "<0>0123</0>abc/.,"
+"[^\d]+"                       "0123<0>abc/.,</0>"
+"[\D]+"                        "0123<0>abc/.,</0>"
+"[^\D]+"                       "<0>0123</0>abc/.,"
+
+"[\s]+"                        "<0> \t</0>abc/.,"
+"[^\s]+"                       " \t<0>abc/.,</0>"
+"[\S]+"                        " \t<0>abc/.,</0>"
+"[^\S]+"                       "<0> \t</0>abc/.,"
+
+"[\w]+"                        "<0>abc123</0> .,;"
+"[^\w]+"                       "abc123<0> .,;</0>"
+"[\W]+"                        "abc123<0> .,;</0>"
+"[^\W]+"                       "<0>abc123</0> .,;"
+
+"[\z]+"                        "abc<0>zzz</0>def"     # \z has no special meaning
+"[^\z]+"                       "<0>abc</0>zzzdef"
+"[\^]+"                        "abc<0>^^</0>"
+"[^\^]+"                       "<0>abc</0>^^"
+
+"[\u0041c]+"                   "<0>AcAc</0>def"
+"[\U00010002]+"                "<0>\ud800\udc02</0>\U00010003"
+"[^\U00010002]+"               "<0>Hello</0>\x{10002}"
+"[\x61b]+"                     "<0>abab</0>cde"
+#"[\x6z]+"                      "\x06"                  #TODO:  single hex digits should fail
+"[\x{9}\x{75}\x{6d6}\x{6ba6}\x{6146B}\x{10ffe3}]+"  "<0>\u0009\u0075\u06d6\u6ba6\U0006146B\U0010ffe3</0>abc"
+
+"[\N{LATIN CAPITAL LETTER TONE SIX}ab\N{VARIATION SELECTOR-70} ]+"       "x<0> \u0184\U000E0135 ab</0>c"
+"[\N{LATIN SMALL LETTER C}-\N{LATIN SMALL LETTER F}]+"    "ab<0>cdef</0>ghi"
+
+
+
+#
+#  [set expressions], check the precedence of '-', '&', '--', '&&'
+#      '-' and '&', for compatibility with ICU UnicodeSet, have the same
+#                   precedence as the implicit Union between adjacent items.
+#      '--' and '&&', for compatibility with Java, have lower precedence than
+#                   the implicit Union operations.  '--' and '&&' themselves
+#                   have the same precedence, and group left to right.
+#
+"[[a-m]-[f-w]p]+"              "<0>dep</0>fgwxyz"
+"[^[a-m]-[f-w]p]+"             "dep<0>fgwxyz</0>"
+
+"[[a-m]--[f-w]p]+"             "<0>de</0>pfgwxyz"
+"[^[a-m]--[f-w]p]+"            "de<0>pfgwxyz</0>"
+
+"[[a-m]&[e-s]w]+"              "<0>efmw</0>adnst"
+"[^[a-m]&[e-s]w]+"             "efmw<0>adnst</0>"
+
+"[[a-m]&[e-s]]+"              "<0>efm</0>adnst"
+
+
+
 # {min,max} iteration qualifier
 "A{3}BC"                       "<0>AAABC</0>"

@ -247,8 +450,8 @@
 "ab(?:c|(d?))(\1)"             "<0>ab<1></1><2></2></0>"

 # Case Insensitive
-"aBc"                    i      "<0>ABC</0>"      
-"a[^bc]d"                i      "ABD"   
+"aBc"                    i      "<0>ABC</0>"
+"a[^bc]d"                i      "ABD"
 '((((((((((a))))))))))\10' i    "<0><1><2><3><4><5><6><7><8><9><10>A</10></9></8></7></6></5></4></3></2></1>A</0>"

 "(?:(?i)a)b"                    "<0>Ab</0>"
@ -259,15 +462,36 @@
 "a b"                           "ab"
 "abc "                          "abc"
 "abc "                          "<0>abc </0>"
-"ab[cd e]z"                     "<0>ab z</0>" 
+"ab[cd e]z"                     "<0>ab z</0>"
 "ab\ c"                         "<0>ab c</0> "
 "ab c"                          "<0>ab c</0> "
 "ab c"                        x "ab c "
 "ab\ c"                       x "<0>ab c</0> "

+#
+# Pattern Flags
+#
+"(?u)abc"                       "<0>abc</0>"
+"(?-u)abc"                      "<0>abc</0>"
+
+#
+#  \c escapes  (Control-whatever)
+#
+"\cA"                           "<0>\u0001</0>"
+"\ca"                           "<0>\u0001</0>"
+"\c\x"                          "<0>\u001cx</0>"
+

 #Multi-line mode
-'b\s^'                        m  "a\nb\n"
+'b\s^'                        m "a\nb\n"
+"(?m)^abc$"                     "abc \n abc\n<0>abc</0>\nabc"
+"(?m)^abc$"                   2 "abc \n abc\nabc\n<0>abc</0>"
+"^abc$"                       2 "abc \n abc\nabc\nabc"
+
+# Empty and full range
+"[\u0000-\U0010ffff]+"          "<0>abc\u0000\uffff\U00010000\U0010ffffzz</0>"
+"[^\u0000-\U0010ffff]"          "abc\u0000\uffff\U00010000\U0010ffffzz"
+"[^a--a]+"                      "<0>abc\u0000\uffff\U00010000\U0010ffffzz</0>"

 # Free-spacing mode
 "a b c  # this is a comment"  x "<0>abc</0> "
@ -316,8 +540,8 @@
 "abc.*$"                         "<0>abcdef</0>"
 "abc(.*)"                        "<0>abc<1>def</1></0>"
 "abc(.*)"                        "<0>abc<1></1></0>"
-"abc.*"                          "<0>abc</0>\ndef"     
-"abc.*"                     s    "<0>abc\ndef</0>"     
+"abc.*"                          "<0>abc</0>\ndef"
+"abc.*"                     s    "<0>abc\ndef</0>"
 "abc.*$"                    s    "<0>abc\ndef</0>"
 "abc.*$"                         "abc\ndef"
 "abc.*$"                    m    "<0>abc</0>\ndef"
@ -357,9 +581,16 @@
 "ab\x09w"                        "<0>ab\u0009w</0>"
 "ab\xabcdc"                      "<0>ab\u00abcdc</0>"
 "ab\x{abcd}c"                    "<0>ab\uabcdc</0>"
-"ab\x{101234}c"                    "<0>ab\U00101234c</0>"
+"ab\x{101234}c"                  "<0>ab\U00101234c</0>"
 "abα"                            "<0>abα</0>"

+#
+#  Octal Escaping.   This conforms to Java conventions, not Perl.
+"\0101\00\03\073\0154\01442"      "<0>A\u0000\u0003\u003b\u006c\u0064\u0032</0>"
+"\0776"                          "<0>\u003f\u0036</0>"  # overflow, the 6 is literal.
+"\0376xyz"                       "<0>\u00fexyz</0>"
+"\08"                        E   "<0>\u00008</0>"
+"\0"                         E   "x"

 #
 #  \u Surrogate Pairs
@ -369,6 +600,24 @@
 "\ud800\ud800\udc00"              "<0>\ud800\U00010000</0>\U00010000\U00010000\U00010001"
 "(\ud800)(\udc00)"                "\U00010000"

+#
+# hitEnd with find()
+#
+"abc"                        Z    "aa<0>abc</0>  abcab"
+"abc"                       2Z    "aaabc  <0>abc</0>ab"
+"abc"                       3z    "aa>abc  abcab"
+
+#
+# Bug xxxx
+#
+"(?:\-|(\-?\d+\d\d\d))?(?:\-|\-(\d\d))?(?:\-|\-(\d\d))?(T)?(?:(\d\d):(\d\d):(\d\d)(\.\d+)?)?(?:(?:((?:\+|\-)\d\d):(\d\d))|(Z))?"   MG  "<0>-1234-21-31T41:51:61.789+71:81</0>"
+
+
+#
+# A random, complex, meaningless pattern that should at least compile
+#
+"(?![^\<C\f\0146\0270\}&&[|\02-\x3E\}|X-\|]]{7,}+)[|\\\x98\<\?\u4FCFr\,\0025\}\004|\0025-\0521]|(?<![|\01-\u829E])|(?<!\p{Alpha})|^|(?-s:[^\x15\\\x24F\a\,\a\u97D8[\x38\a[\0224-\0306[^\0020-\u6A57]]]]??)(?xix:[^|\{\[\0367\t\e\x8C\{\[\074c\]V[|b\fu\r\0175\<\07f\066s[^D-\x5D]]])(?xx:^{5,}+)(?d)(?=^\D)|(?!\G)(?>\G)(?![^|\]\070\ne\{\t\[\053\?\\\x51\a\075\0023-\[&&[|\022-\xEA\00-\u41C2&&[^|a-\xCC&&[^\037\uECB3\u3D9A\x31\|\<b\0206\uF2EC\01m\,\ak\a\03&&\p{Punct}]]]])(?-dxs:[|\06-\07|\e-\x63&&[|Tp\u18A3\00\|\xE4\05\061\015\0116C|\r\{\}\006\xEA\0367\xC4\01\0042\0267\xBB\01T\}\0100\?[|\[-\u459B|\x23\x91\rF\0376[|\?-\x94\0113-\\\s]]]]{6}?)(?<=[^\t-\x42H\04\f\03\0172\?i\u97B6\e\f\uDAC2])(?=\B)(?>[^\016\r\{\,\uA29D\034\02[\02-\[|\t\056\uF599\x62\e\<\032\uF0AC\0026\0205Q\|\\\06\0164[|\057-\u7A98&&[\061-g|\|\0276\n\042\011\e\xE8\x64B\04\u6D0EDW^\p{Lower}]]]]?)(?<=[^\n\\\t\u8E13\,\0114\u656E\xA5\]&&[\03-\026|\uF39D\01\{i\u3BC2\u14FE]])(?<=[^|\uAE62\054H\|\}&&^\p{Space}])(?sxx)(?<=[\f\006\a\r\xB4]{1,5})|(?x-xd:^{5}+)()"  "<0></0>abc"
+

 #
 # Bug 3225
@ -435,7 +684,7 @@
 "^"                               "<0></0>"
 "^"                           2   ""

-"\Z"				  "<0></0>"
+"\Z"                              "<0></0>"
 "\Z"                          2   ""
 "\Z"                          2   "\u000a<0></0>"
 "\Z"                              "<0></0>\u000d\u000a"
@ -471,6 +720,173 @@
 ".{6}"                            "123\u000a\u000dXYZ"
 ".{6}"                         s  "<0>123\u000a\u000dX</0>Y"

+
+#
+# Ranges
+#
+".*"                              "abc<r><0>def</0></r>ghi"
+"a"                               "aaa<r><0>a</0>aa</r>aaa"
+"a"                           2   "aaa<r>a<0>a</0>a</r>aaa"
+"a"                           3   "aaa<r>aa<0>a</0></r>aaa"
+"a"                           4   "aaa<r>aaa</r>aaa"
+"a"                               "aaa<r><0>a</0>aa</r>aaa"
+
+#
+# [set] parsing, systematically run through all of the parser states.
+#
+#
+"[def]+"                          "abc<0>ddeeff</0>ghi"       # set-open
+"[^def]+"                         "<0>abc</0>defghi"
+"[:digit:]+"                      "abc<0>123</0>def"
+"[:^digit:]+"                     "<0>abc</0>123def"
+"[\u005edef]+"                    "abc<0>de^f</0>ghi"
+
+"[]]+"                            "abc<0>]]]</0>[def"         # set-open2
+"[^]]+"                           "<0>abc</0>]]][def"
+
+"[:Lu:]+"                         "abc<0>ABC</0>def"          # set-posix
+"[:Lu]+"                          "abc<0>uL::Lu</0>"
+"[:^Lu]+"                         "abc<0>uL:^:Lu</0>"
+"[:]+"                            "abc<0>:::</0>def"
+"[:whats this:]"               E  " "
+"[--]+"                       dE  "-------"
+
+"[[nested]]+"                      "xyz[<0>nnetsteed</0>]abc"   #set-start
+"[\x{41}]+"                        "CB<0>AA</0>ZYX"
+"[\[\]\\]+"                        "&*<0>[]\\</0>..."
+"[*({<]+"                          "^&<0>{{(<<*</0>)))"
+
+
+"[-def]+"                          "abc<0>def-ef-d</0>xyz"     # set-start-dash
+"[abc[--def]]"                 E   " "
+
+"[x[&def]]+"                        "abc<0>def&</0>ghi"        # set-start-amp
+"[&& is bad at start]"         E   " "
+
+"[abc"                         E   " "                         # set-after-lit
+"[def]]"                           "abcdef"
+"[def]]"                           "abcde<0>f]</0>]"
+
+"[[def][ghi]]+"                    "abc]<0>defghi</0>[xyz"     # set-after-set
+"[[def]ghi]+"                      "abc]<0>defghi</0>[xyz" 
+"[[[[[[[[[[[abc]"              E   " "
+"[[abc]\p{Lu}]+"                   "def<0>abcABC</0>xyz"
+
+"[d-f]+"                           "abc<0>def</0>ghi"          # set-after-range
+"[d-f[x-z]]+"                      "abc<0>defxyzzz</0>gw"
+"[\s\d]+"                          "abc<0>  123</0>def"
+"[d-f\d]+"                         "abc<0>def123</0>ghi"
+"[d-fr-t]+"                        "abc<0>defrst</0>uvw"
+
+"[abc--]"                      E   " "                         # set-after-op
+"[[def]&&]"                    E   " "
+"[-abcd---]+"                     "<0>abc</0>--"                 #[-abcd]--[-]
+"[&abcd&&&ac]+"                   "b<0>ac&&ca</0>d"              #[&abcd]&&[&ac]
+
+"[[abcd]&[ac]]+"                  "b<0>acac</0>d"              # set-set-amp
+"[[abcd]&&[ac]]+"                 "b<0>acac</0>d"
+"[[abcd]&&ac]+"                   "b<0>acac</0>d"
+"[[abcd]&ac]+"                    "<0>bacacd&&&</0>"
+
+"[abcd&[ac]]+"                    "<0>bacacd&&&</0>"           #set-lit-amp
+"[abcd&&[ac]]+"                   "b<0>acac</0>d"
+"[abcd&&ac]+"                     "b<0>acac</0>d"
+
+"[[abcd]-[ac]]+"                  "a<0>bdbd</0>c"              # set-set-dash
+"[[abcd]--[ac]]+"                 "a<0>bdbd</0>c"
+"[[abcd]--ac]+"                   "a<0>bdbd</0>c"
+"[[abcd]-ac]+"                    "<0>bacacd---</0>"
+
+"[a-d--[b-c]]+"                   "b<0>adad</0>c"              # set-range-dash
+"[a-d--b-c]+"                     "b<0>adad</0>c"   
+"[a-d-[b-c]]+"                    "<0>bad-adc</0>"
+"[a-d-b-c]+"                      "<0>bad-adc</0>"
+"[\w--[b-c]]+"                    "b<0>adad</0>c"  
+"[\w--b-c]+"                      "b<0>adad</0>c"   
+"[\w-[b-c]]+"                     "<0>bad-adc</0>"
+"[\w-b-c]+"                       "<0>bad-adc</0>"
+
+"[a-d&&[b-c]]+"                   "a<0>bcbc</0>d"              # set-range-amp
+"[a-d&&b-c]+"                     "a<0>bcbc</0>d"
+"[a-d&[b-c]]+"                    "<0>abc&bcd</0>"
+"[a-d&b-c]+"                      "<0>abc&bcd</0>"
+
+"[abcd--bc]+"                     "b<0>adda</0>c"              # set-lit-dash
+"[abcd--[bc]]+"                   "b<0>adda</0>c"
+"[abcd-[bc]]+"                    "<0>bad--dac</0>xyz"
+"[abcd-]+"                        "<0>bad--dac</0>xyz"
+
+"[abcd-\s]+"                 E    "xyz<0>abcd  --</0>xyz"      # set-lit-dash-esc
+"[abcd-\N{LATIN SMALL LETTER G}]+"  "xyz-<0>abcdefg</0>hij-"
+"[bcd-\{]+"                       "a<0>bcdefyz{</0>|}"
+
+"[\p{Ll}]+"                       "ABC<0>abc</0>^&*&"          # set-escape
+"[\P{Ll}]+"                       "abc<0>ABC^&*&</0>xyz"
+"[\N{LATIN SMALL LETTER Q}]+"     "mnop<0>qqq</0>rst"
+"[\sa]+"                          "cb<0>a  a  </0>(*&"
+"[\S]+"                           "   <0>hello</0>  "
+"[\w]+"                           "   <0>hello_world</0>!  "
+"[\W]+"                           "a<0>   *$%#,</0>hello "
+"[\d]+"                           "abc<0>123</0>def"
+"[\D]+"                           "123<0>abc</0>567"
+"[\$\#]+"                         "123<0>$#$#</0>\\"
+
+#
+#  Try each of the Java compatibility properties.
+#    These are checked here, while normal Unicode properties aren't, because
+#    these Java compatibility properties are implemented directly by regexp, while other
+#    properties are handled by ICU's Property and UnicodeSet APIs.
+#
+#    These tests are only to verify that the names are recognized and the
+#    implementation isn't dead.  They are not intended to verify that the
+#    function definitions are 100% correct.
+#
+"[:InBasic Latin:]+"               "ΓΔΕΖΗΘ<0>hello, world.</0>ニヌネノハバパ"
+"[:^InBasic Latin:]+"              "<0>ΓΔΕΖΗΘ</0>hello, world.ニヌネノハバパ"
+"\p{InBasicLatin}+"                "ΓΔΕΖΗΘ<0>hello, world.</0>ニヌネノハバパ"
+"\P{InBasicLatin}+"                "<0>ΓΔΕΖΗΘ</0>hello, world.ニヌネノハバパ"
+"\p{InGreek}+"                     "<0>ΓΔΕΖΗΘ</0>hello, world.ニヌネノハバパ"
+"\p{InCombining Marks for Symbols}" "<0>\u20d0</0>"
+"\p{Incombiningmarksforsymbols}"    "<0>\u20d0</0>"
+
+
+"\p{javaDefined}+"                 "\uffff<0>abcd</0>\U00045678"
+"\p{javaDigit}+"                   "abc<0>1234</0>xyz"
+"\p{javaIdentifierIgnorable}+"     "abc<0>\u0000\u000e\u009f</0>xyz"
+"\p{javaISOControl}+"              "abc<0>\u0000\u000d\u0083</0>xyz"
+"\p{javaJavaIdentifierPart}+"      "#@!<0>abc123_$</0>;"
+"\p{javaJavaIdentifierStart}+"     "123\u0301<0>abc$_</0>%^&"
+"\p{javaLetter}+"                  "123<0>abcDEF</0>&*()("
+"\p{javaLetterOrDigit}+"           "$%^&*<0>123abcகஙசஜஞ</0>☺♘♚☔☎♬⚄⚡"
+"\p{javaLowerCase}+"               "ABC<0>def</0>&^%#:="
+"\p{javaMirrored}+"                "ab$%<0>(){}[]</0>xyz"
+"\p{javaSpaceChar}+"               "abc<0> \u00ao\u2028</0>!@#"
+"\p{javaSupplementaryCodePoint}+"  "abc\uffff<0>\U00010000\U0010ffff</0>\u0000"
+"\p{javaTitleCase}+"               "abCE<0>ǅῌᾨ</0>123"
+"\p{javaUnicodeIdentifierStart}+"  "123<0>abcⅣ</0>%^&&*"
+"\p{javaUnicodeIdentifierPart}+"   "%&&^<0>abc123\u0301\u0002</0>..."
+"\p{javaUpperCase}+"               "abc<0>ABC</0>123"
+"\p{javaValidCodePoint}+"          "<0>\u0000abc\ud800 unpaired \udfff |\U0010ffff</0>"
+"\p{javaWhitespace}+"              "abc\u00a0\u2007\u202f<0> \u0009\u001c\u001f\u2028</0>42"
+"\p{all}+"                         "<0>123\u0000\U0010ffff</0>"
+"\P{all}+"                         "123\u0000\U0010ffff"
+
+#
+#  Errors on unrecognized ASCII letter escape sequences.
+#
+"[abc\Y]+"                         "<0>abcY</0>"
+"[abc\Y]+"                     eE  "<0>abcY</0>"
+
+"(?:a|b|c|\Y)+"                    "<0>abcY</0>"
+"(?:a|b|c|\Y)+"                eE  "<0>abcY</0>"
+
+"\Q\Y\E"                       e   "<0>\\Y</0>"
+
+#
+# Reported problem
+#
+"[a-\w]"                       E  "x"
+
 #
 # Bug 4045
 #
@ -485,7 +901,7 @@
 "A*"                           3  ""
 "A*"                           4  ""
 "A*"                           5  ""
-				  
+
 #
 # Bug 4046
 #
@ -512,11 +928,10 @@
 # Bug 4058    ICU Unicode Set patterns have an odd feature -
 #             A $ as the last character before the close bracket means match
 #             a \uffff, which means off the end of the string in transliterators.
-#             Doesn't make much sense for regex, but works that way anyhow.
+#             Didn't make sense for regular expressions, and is now fixed.
 #
 "[\$](P|C|D);"                    "<0>$<1>P</1>;</0>"
-"[$](P|C|D);"                     "<0>\uffff<1>P</1>;</0>"
-"[$](P|C|D);"                     "$P;"
+"[$](P|C|D);"                     "<0>$<1>P</1>;</0>"
 "[$$](P|C|D);"                    "<0>$<1>P</1>;</0>"

 #
@ -537,10 +952,68 @@
 ".+?\b"                       2   " <0>\u0935\u0915\u094D\u200D\u0924\u0947</0> "
 ".+?\b"                       3   " \u0935\u0915\u094D\u200D\u0924\u0947 "

+#
+# bug 5386  "^.*$" should match empty input
+#
+"^.*$"                            "<0></0>"
+"^.*$"                     m      "<0></0>"
+"^.*$"                            "<0></0>\n"
+"(?s)^.*$"                        "<0>\n</0>"
+
+#
+# bug 5386  Empty pattern and empty input should match.
+#
+""                                "<0></0>abc"
+""                                "<0></0>"
+
+#
+# bug 5386   Range upper and lower bounds can be equal
+#
+"[a-a]"                           "<0>a</0>"
+
+#
+# bug 5386  $* should not fail, should match empty string.
+#
+"$*"                              "<0></0>abc"
+
+#
+# bug 5386  \Q ... \E escaping problem
+#
+"[a-z\Q-$\E]+"                    "QE<0>abc-def$</0>."
+
+# More reported 5386 Java compatibility failures
+#
+"[^]*abb]*"                       "<0>kkkk</0>"
+"\xa"                             "huh"              # Java would like to be warned.
+"^.*$"                            "<0></0>"
+
+#
+# bug 5386  Empty left alternation should produce a zero length match.
+#
+"|a"                              "<0></0>a"
+"$|ab"                            "<0>ab</0>"
+"$|ba"                            "ab<0></0>"
+
+#
+# bug 5386  Java compatibility for set expressions
+#
+"[a-z&&[cde]]+"                   "ab<0>cde</0>fg"
+
+#
+# bug 6019  matches() needs to backtrack and check for a longer match if the
+#                     first match(es) found don't match the entire input.
+#
+"a?|b"                            "<0></0>b"
+"a?|b"                         M  "<0>b</0>"
+"a?|.*?u|stuff|d"              M  "<0>stuff</0>"
+"a?|.*?(u)|stuff|d"            M  "<0>stuff<1>u</1></0>"
+"a+?"                             "<0>a</0>aaaaaaaaaaaa"
+"a+?"                          M  "<0>aaaaaaaaaaaaa</0>"
+
 #
 #  Random debugging, Temporary
 #
-#"^(?:a?b?)*$"	                  "a--"	
+#"^(?:a?b?)*$"	                  "a--"
 "^(?:a?b?)*$"	                  "a--"

 "This is a string with (?:one |two |three )endings"   "<0>This is a string with two endings</0>"
@ -681,7 +1154,7 @@
 "^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$"   G "<0>ftp://ftp.blah.co.uk:2828/blah%20blah.gif</0>"
 "^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$"   G "<0>https://blah.gov/blah-blah.as</0>"
 "^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$"     "www.blah.com"
-"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$"     "http://www.blah.com/I have spaces!" 
+"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$"     "http://www.blah.com/I have spaces!"
 "^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$"     "ftp://blah_underscore/[nope]"
 "^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2})$|^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2}\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$"   G "<0>12/01/2002</0>"
 "^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2})$|^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2}\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$"   G "<0>12/01/2002 12:32:10</0>"
@ -959,18 +1432,18 @@
 "^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$"     "10.0.5.4"
 "^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$"     "192.168.0.1"
 "^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$"     "my ip address"
-#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$"   G "<0>foo@foo.com</0>"   # TODO:  \w in pattern
-#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$"   G "<0>foo@foo-foo.com.au</0>"   # TODO:  \w in pattern
-#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$"   G "<0>foo@foo.foo.info</0>"   # TODO:  \w in pattern
-#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$"     "foo@.com"   # TODO:  \w in pattern
-#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$"     "foo@foo..com"   # TODO:  \w in pattern
-#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$"     "foo@me@.com"   # TODO:  \w in pattern
-#"/\*[\d\D]*?\*/"   G "<0>/* my comment */</0>"
-#"/\*[\d\D]*?\*/"   G "<0>/* my multiline comment */</0>"
-#"/\*[\d\D]*?\*/"   G "<0>/* my nested comment */</0>"
-#"/\*[\d\D]*?\*/"     "*/ anything here /*"
-#"/\*[\d\D]*?\*/"     "anything between 2 seperate comments"
-#"/\*[\d\D]*?\*/"     "\* *\"
+"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$"   G "<0>foo@foo.com</0>"
+"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$"   G "<0>foo@foo-foo.com.au</0>"
+"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$"   G "<0>foo@foo.foo.info</0>"
+"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$"     "foo@.com"
+"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$"     "foo@foo..com"
+"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$"     "foo@me@.com"
+"/\*[\d\D]*?\*/"   G "<0>/* my comment */</0>"
+"/\*[\d\D]*?\*/"   G "<0>/* my multiline comment */</0>"
+"/\*[\d\D]*?\*/"   G "<0>/* my nested comment */</0>"
+"/\*[\d\D]*?\*/"     "*/ anything here /*"
+"/\*[\d\D]*?\*/"     "anything between 2 seperate comments"
+"/\*[\d\D]*?\*/"     "\* *\"
 "/\*[\p{N}\P{N}]*?\*/"   G "<0>/* my comment */</0>"
 "/\*[\p{N}\P{N}]*?\*/"   G "<0>/* my multiline comment */</0>"
 "/\*[\p{N}\P{N}]*?\*/"   G "<0>/* my nested comment */</0>"
@ -986,9 +1459,9 @@
 '^(([^<>;()\[\]\\.,;:@"]+(\.[^<>()\[\]\\.,;:@"]+)*)|(".+"))@((([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))\.)*(([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))$'   G "<0>blah@[10.0.0.1]</0>"
 '^(([^<>;()\[\]\\.,;:@"]+(\.[^<>()\[\]\\.,;:@"]+)*)|(".+"))@((([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))\.)*(([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))$'   G "<0>a@b.c</0>"
 '^(([^<>;()\[\]\\.,;:@"]+(\.[^<>()\[\]\\.,;:@"]+)*)|(".+"))@((([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))\.)*(([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))$'     "non@match@."
-#"^\d{9}[\d|X]$"   G "<0>1234123412</0>"
-#"^\d{9}[\d|X]$"   G "<0>123412341X</0>"
-#"^\d{9}[\d|X]$"     "not an isbn"
+"^\d{9}[\d|X]$"   G "<0>1234123412</0>"
+"^\d{9}[\d|X]$"   G "<0>123412341X</0>"
+"^\d{9}[\d|X]$"     "not an isbn"
 "^\d{9}(\d|X)$"   G "<0>1234123412</0>"
 "^\d{9}(\d|X)$"   G "<0>123412341X</0>"
 "^\d{9}(\d|X)$"     "not an isbn"
@ -1056,12 +1529,12 @@
 "\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}"     "12 123 1234"
 "\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}"     "(012) 123/1234"
 "\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}"     "(012) 123 12345"
-#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$"   G "<0>bob-smith@foo.com</0>"   # TODO:  \w in pattern
-#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$"   G "<0>bob.smith@foo.com</0>"   # TODO:  \w in pattern
-#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$"   G "<0>bob_smith@foo.com</0>"   # TODO:  \w in pattern
-#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$"     "-smith@foo.com"   # TODO:  \w in pattern 
-#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$"     ".smith@foo.com"   # TODO:  \w in pattern
-#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$"     "smith@foo_com"   # TODO:  \w in pattern
+"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$"   G "<0>bob-smith@foo.com</0>"
+"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$"   G "<0>bob.smith@foo.com</0>"
+"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$"   G "<0>bob_smith@foo.com</0>"
+"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$"     "-smith@foo.com"
+"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$"     ".smith@foo.com"
+"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$"     "smith@foo_com"
 "^(?=.*\d).{4,8}$"   G "<0>1234</0>"
 "^(?=.*\d).{4,8}$"   G "<0>asdf1234</0>"
 "^(?=.*\d).{4,8}$"   G "<0>asp123</0>"
@ -1175,7 +1648,7 @@
 "^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$"     "$12,3456.01"
 "^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$"     "12345"
 "^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$"     "$1.234"
-"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})"   G "<0>C:\\temp\\this allows spaces\\web.config</0>" 
+"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})"   G "<0>C:\\temp\\this allows spaces\\web.config</0>"
 "([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})"   G "<0>\\\\Andromeda\\share\\file name.123</0>"
 "([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})"     "tz:\temp\ fi*le?na:m<e>.doc"
 "([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})"     "\\Andromeda\share\filename.a"
@ -1206,24 +1679,24 @@
 "^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$"     "qqqBFDB4D31-3E35-4DAB-AFCA-5E6E5C8F61EA"
 "^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$"     "BFDB4D31-3E-4DAB-AFCA-5E6E5C8F61EA"
 "^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$"     "BFDB4D31-3E35-4DAB-AF"
-#"^\d{2}(\x2e)(\d{3})(-\d{3})?$"   G "<0>12.345-678</0>"  # TODO: \x not implemented.
-#"^\d{2}(\x2e)(\d{3})(-\d{3})?$"   G "<0>23.345-123</0>"
-#"^\d{2}(\x2e)(\d{3})(-\d{3})?$"   G "<0>99.999</0>"
-#"^\d{2}(\x2e)(\d{3})(-\d{3})?$"     "41222-222"
-#"^\d{2}(\x2e)(\d{3})(-\d{3})?$"     "3.444-233"
-#"^\d{2}(\x2e)(\d{3})(-\d{3})?$"     "43.324444"
+"^\d{2}(\x2e)(\d{3})(-\d{3})?$"   G "<0>12.345-678</0>"
+"^\d{2}(\x2e)(\d{3})(-\d{3})?$"   G "<0>23.345-123</0>"
+"^\d{2}(\x2e)(\d{3})(-\d{3})?$"   G "<0>99.999</0>"
+"^\d{2}(\x2e)(\d{3})(-\d{3})?$"     "41222-222"
+"^\d{2}(\x2e)(\d{3})(-\d{3})?$"     "3.444-233"
+"^\d{2}(\x2e)(\d{3})(-\d{3})?$"     "43.324444"
 "^\d{2}(\u002e)(\d{3})(-\d{3})?$"   G "<0>12.345-678</0>"
 "^\d{2}(\u002e)(\d{3})(-\d{3})?$"   G "<0>23.345-123</0>"
 "^\d{2}(\u002e)(\d{3})(-\d{3})?$"   G "<0>99.999</0>"
 "^\d{2}(\u002e)(\d{3})(-\d{3})?$"     "41222-222"
 "^\d{2}(\u002e)(\d{3})(-\d{3})?$"     "3.444-233"
 "^\d{2}(\u002e)(\d{3})(-\d{3})?$"     "43.324444"
-#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$"   G "<0>c:\file.txt</0>"
-#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$"   G "<0>c:\folder\sub folder\file.txt</0>"
-#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$"   G "<0>\\network\folder\file.txt</0>"    # TODO:  \w in pattern
-#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$"     "C:"   # TODO:  \w in pattern
-#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$"     "C:\file.xls"   # TODO:  \w in pattern
-#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$"     "folder.txt"   # TODO:  \w in pattern
+#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$"   G "<0>c:\file.txt</0>"   # TODO:  debug
+#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$"   G "<0>c:\folder\sub folder\file.txt</0>"   # TODO:  debug
+#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$"   G "<0>\\network\folder\file.txt</0>"    # TODO:  debug
+"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$"     "C:"
+"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$"     "C:\file.xls"
+"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$"     "folder.txt"
 "^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$"   G "<0>my.domain.com</0>"
 "^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$"   G "<0>regexlib.com</0>"
 "^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$"   G "<0>big-reg.com</0>"
@ -1265,12 +1738,12 @@
 "^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$"     "1-555-5555"
 "^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$"     "15553333"
 "^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$"     "0-561-555-1212"
-#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>"   G "<0><input type = text name = "bob"></0>"    # TODO:  \w in pattern
-#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>"   G "<0><select name = "fred"></0>"   # TODO:  \w in pattern
-#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>"   G "<0><form</0>"   # TODO:  \w in pattern
-#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>"     "<input type = submit>"   # TODO:  \w in pattern
-#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>"     "<font face = "arial">"   # TODO:  \w in pattern
-#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>"     "The drity brown fox stank like"   # TODO:  \w in pattern
+'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>'   G '<0><input type = text name = "bob"></0>'
+'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>'   G '<0><select name = "fred"></0>'
+#'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>'   G '<0><form></0>'    #TODO:  Debug
+'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>'     "<input type = submit>"   # TODO:  \w in pattern
+'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>'     '<font face = "arial">'   # TODO:  \w in pattern
+'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>'      "The drity brown fox stank like"
 "^(1|01|2|02|3|03|4|04|5|05|6|06|7|07|8|08|9|09|10|11|12{1,2}):(([0-5]{1}[0-9]{1}\s{0,1})([AM|PM|am|pm]{2,2}))\W{0}$"   G "<0>1:00 AM</0>"
 "^(1|01|2|02|3|03|4|04|5|05|6|06|7|07|8|08|9|09|10|11|12{1,2}):(([0-5]{1}[0-9]{1}\s{0,1})([AM|PM|am|pm]{2,2}))\W{0}$"   G "<0>12:00 PM</0>"
 "^(1|01|2|02|3|03|4|04|5|05|6|06|7|07|8|08|9|09|10|11|12{1,2}):(([0-5]{1}[0-9]{1}\s{0,1})([AM|PM|am|pm]{2,2}))\W{0}$"   G "<0>1:00am</0>"
@ -1495,9 +1968,9 @@
 "^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$"     "10.57.98.23."
 "<img([^>]*[^/])>"   G '<0><img src="bob"></0>'
 "<img([^>]*[^/])>"     '<img src="bob" />'
-#"<!--[\s\S]*?-->"   G "<0><!-- comments --></0>"
-#"<!--[\s\S]*?-->"   G "<0><!-- x = a > b - 3 --></0>"
-#"<!--[\s\S]*?-->"     "<COMMENTS>this is a comment</COMMENTS>"
+"<!--[\s\S]*?-->"   G "<0><!-- comments --></0>"
+"<!--[\s\S]*?-->"   G "<0><!-- x = a > b - 3 --></0>"
+"<!--[\s\S]*?-->"     "<COMMENTS>this is a comment</COMMENTS>"
 "<!--[\p{Zs}\P{Zs}]*?-->"   G "<0><!-- comments --></0>"
 "<!--[\p{Zs}\P{Zs}]*?-->"   G "<0><!-- x = a > b - 3 --></0>"
 "<!--[\p{Zs}\P{Zs}]*?-->"     "<COMMENTS>this is a comment</COMMENTS>"
@ -1509,8 +1982,8 @@
 "(\{\\f\d*)\\([^;]+;)"   G "<0>{\\f1\\fswiss\\fcharset0\\fprq2{\\*\\panose 020b0604020202020204}Arial;</0>"
 "(\{\\f\d*)\\([^;]+;)"   G "{\\f"
 "(\{\\f\d*)\\([^;]+;)"     "{f0fs20 some text}"
-#"</?([a-zA-Z][-A-Za-z\d\.]{0,71})(\s+(\S+)(\s*=\s*([-\w\.]{1,1024}|"[^"]{0,1024}"|'[^']{0,1024}'))?)*\s*>"   G "<0><IMG src='stars.gif' alt="space" height=1></0>"    # TODO:  \w in pattern
-#"</?([a-zA-Z][-A-Za-z\d\.]{0,71})(\s+(\S+)(\s*=\s*([-\w\.]{1,1024}|"[^"]{0,1024}"|'[^']{0,1024}'))?)*\s*>"     "this is not a tag"   # TODO:  \w in pattern
+#"</?([a-zA-Z][-A-Za-z\d\.]{0,71})(\s+(\S+)(\s*=\s*([-\w\.]{1,1024}|"[^"]{0,1024}"|'[^']{0,1024}'))?)*\s*>"   G '<0><IMG src='stars.gif' alt="space" height=1></0>'    # TODO:  Can't quote this pattern with the test syntax!
+#"</?([a-zA-Z][-A-Za-z\d\.]{0,71})(\s+(\S+)(\s*=\s*([-\w\.]{1,1024}|"[^"]{0,1024}"|'[^']{0,1024}'))?)*\s*>"     "this is not a tag"
 "^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$"   G "<0>12/30/2002</0>"
 "^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$"   G "<0>01/12/1998 13:30</0>"
 "^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$"   G "<0>01/28/2002 22:35:00</0>"
@ -1586,10 +2059,10 @@
 "^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$"     "bad.bad.gif"
 "^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$"     "slash\gif."
 "<[^>\s]*\bauthor\b[^>]*>"   G '<0><author name="Daniel"></0>'
-#"<[^>\s]*\bauthor\b[^>]*>"   G "<0></sch:author></0>"
-#"<[^>\s]*\bauthor\b[^>]*>"   G '<0><pp:author name="Daniel"</0>'
+"<[^>\s]*\bauthor\b[^>]*>"   G "<0></sch:author></0>"
+# "<[^>\s]*\bauthor\b[^>]*>"   G '<0><pp:author name="Daniel"</0>'  #Debug  should work
 "<[^> ]*\bauthor\b[^>]*>"   G "<0></sch:author></0>"
-"<[^> ]*\bauthor\b[^>]*>"   G '<0><pp:author name="Daniel"></0>' 
+"<[^> ]*\bauthor\b[^>]*>"   G '<0><pp:author name="Daniel"></0>'
 "<[^>\s]*\bauthor\b[^>]*>"     "<other>"
 "<[^>\s]*\bauthor\b[^>]*>"     "</authors>"
 "<[^>\s]*\bauthor\b[^>]*>"     "<work>author</work>"
@ -1625,15 +2098,15 @@
 "(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)"     "0"
 "(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)"     "0.0"
 "(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)"     ".0"
-#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$"   G "<0>Sacramento</0>"          #TODO: Octal
-#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$"   G "<0>San Francisco</0>"
-#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$"   G "<0>San Luis Obispo</0>"
-#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$"     "SanFrancisco"
-#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$"     "SanLuisObispo"
-#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$"     "San francisco"
-#"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$"   G "<0>{e02ff0e4-00ad-090A-c030-0d00a0008ba0}</0>"
-#"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$"   G "<0>e02ff0e4-00ad-090A-c030-0d00a0008ba0</0>"
-#"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$"     "0xe02ff0e400ad090Ac0300d00a0008ba0"
+"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$"   G "<0>Sacramento</0>"
+"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$"     "<0><2>San Francisco</2></0>"
+"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$"     "<0><3>San Luis Obispo</3></0>"
+"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$"     "SanFrancisco"
+"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$"     "SanLuisObispo"
+"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$"     "San francisco"
+"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$"   G "<0>{e02ff0e4-00ad-090A-c030-0d00a0008ba0}</0>"
+"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$"   G "<0>e02ff0e4-00ad-090A-c030-0d00a0008ba0</0>"
+"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$"     "0xe02ff0e400ad090Ac0300d00a0008ba0"
 "^\{?[a-fA-F0-9]{8}-([a-fA-F0-9]{4}-){3}[a-fA-F0-9]{12}\}?$"   G "<0>{e02ff0e4-00ad-090A-c030-0d00a0008ba0}</0>"
 "^\{?[a-fA-F0-9]{8}-([a-fA-F0-9]{4}-){3}[a-fA-F0-9]{12}\}?$"   G "<0>e02ff0e4-00ad-090A-c030-0d00a0008ba0</0>"
 "^\{?[a-fA-F0-9]{8}-([a-fA-F0-9]{4}-){3}[a-fA-F0-9]{12}\}?$"     "0xe02ff0e400ad090Ac0300d00a0008ba0"
@ -1682,15 +2155,15 @@
 "^((0[1-9])|(1[0-2]))\/(\d{2})$"   G "<0>01/04</0>"
 "^((0[1-9])|(1[0-2]))\/(\d{2})$"     "13/03"
 "^((0[1-9])|(1[0-2]))\/(\d{2})$"     "10/2003"
-#"<script[^>]*>[\w|\t|\r|\W]*</script>"   G "<0><script language=javascript>document.write("one");</script></0>"    # TODO:  \w in pattern
-#"<script[^>]*>[\w|\t|\r|\W]*</script>"     "--"   # TODO:  \w in pattern
-#"<script[^>]*>[\w|\t|\r|\W]*</script>"     "A-Z][a-z]+"   # TODO:  \w in pattern
-#"<script[^>]*>[\w|\t|\r|\W]*</script>"   G "<0>strFirstName</0>"   # TODO:  \w in pattern
-#"<script[^>]*>[\w|\t|\r|\W]*</script>"   G "<0>intAgeInYears</0>"   # TODO:  \w in pattern
-#"<script[^>]*>[\w|\t|\r|\W]*</script>"   G "<0>Where the Wild Things Are</0>"   # TODO:  \w in pattern
-#"<script[^>]*>[\w|\t|\r|\W]*</script>"     "123"   # TODO:  \w in pattern
-#"<script[^>]*>[\w|\t|\r|\W]*</script>"     "abc"   # TODO:  \w in pattern
-#"<script[^>]*>[\w|\t|\r|\W]*</script>"     "this has no caps in it"   # TODO:  \w in pattern
+"<script[^>]*>[\w|\t|\r|\W]*</script>"   G '<0><script language=javascript>document.write("one");</script></0>'
+"<script[^>]*>[\w|\t|\r|\W]*</script>"     "--"
+"<script[^>]*>[\w|\t|\r|\W]*</script>"     "A-Z][a-z]+"
+#"<script[^>]*>[\w|\t|\r|\W]*</script>"   G "<0>strFirstName</0>"   # Test Case damaged?
+#"<script[^>]*>[\w|\t|\r|\W]*</script>"   G "<0>intAgeInYears</0>"   # Test Case damaged?
+#"<script[^>]*>[\w|\t|\r|\W]*</script>"   G "<0>Where the Wild Things Are</0>"   #  Test Case damaged?
+"<script[^>]*>[\w|\t|\r|\W]*</script>"     "123"
+"<script[^>]*>[\w|\t|\r|\W]*</script>"     "abc"
+"<script[^>]*>[\w|\t|\r|\W]*</script>"     "this has no caps in it"
 "(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)"   G "<0>-0.050</0>"
 "(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)"   G "<0>-5.000</0>"
 "(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)"   G "<0>-5</0>"
@ -1725,12 +2198,12 @@
 "^.{4,8}$"     "asd"
 "^.{4,8}$"     "123"
 "^.{4,8}$"     "asdfe12345"
-#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$"   G "<0>a@a.com</0>"    # TODO:  \w in pattern
-#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$"   G "<0>a@a.com.au</   # TODO:  \w in pattern0>"
-#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$"   G "<0>a@a.au</0>"   # TODO:  \w in pattern
-#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$"     "word"   # TODO:  \w in pattern
-#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$"     "word@"   # TODO:  \w in pattern
-#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$"     "@word"   # TODO:  \w in pattern
+"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$"   G "<0>a@a.com</0>"
+"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$"   G "<0>a@a.com.au</0>"
+"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$"   G "<0>a@a.au</0>"
+"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$"     "word"
+"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$"     "word@"
+"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$"     "@word"
 "^\d{5}-\d{4}$"   G "<0>22222-3333</0>"
 "^\d{5}-\d{4}$"   G "<0>34545-2367</0>"
 "^\d{5}-\d{4}$"   G "<0>56334-2343</0>"
@ -1795,22 +2268,22 @@
 "^[12345]$"     "6"
 "^[12345]$"     "-1"
 "^[12345]$"     "abc"
-#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$"   G "<0>joe@aol.com</0>"    # TODO:  \w in pattern
-#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$"   G "<0>joe@wrox.co.uk</0>"   # TODO:  \w in pattern
-#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$"   G "<0>joe@domain.info</0>"   # TODO:  \w in pattern
-#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$"     "a@b"   # TODO:  \w in pattern
-#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$"     "notanemail"   # TODO:  \w in pattern
-#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$"     "joe@@."   # TODO:  \w in pattern
+"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$"   G "<0>joe@aol.com</0>"
+"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$"   G "<0>joe@wrox.co.uk</0>"
+"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$"   G "<0>joe@domain.info</0>"
+"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$"     "a@b"
+"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$"     "notanemail"
+"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$"     "joe@@."
 "^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$"   G "<0>joe@aol.com</0>"
 "^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$"   G "<0>ssmith@aspalliance.com</0>"
 "^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$"   G "<0>a@b.cc</0>"
 "^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$"     "joe@123aspx.com"
 "^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$"     "joe@web.info"
 "^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$"     "joe@company.co.uk"
-#"[\w-]+@([\w-]+\.)+[\w-]+"   G "<0>joe@aol.com</0>"   # TODO:  \w in pattern
-#"[\w-]+@([\w-]+\.)+[\w-]+"   G "<0>a@b.c</0>"   # TODO:  \w in pattern
-#"[\w-]+@([\w-]+\.)+[\w-]+"     "asdf"   # TODO:  \w in pattern
-#"[\w-]+@([\w-]+\.)+[\w-]+"     "1234"   # TODO:  \w in pattern
+"[\w-]+@([\w-]+\.)+[\w-]+"   G "<0>joe@aol.com</0>"
+"[\w-]+@([\w-]+\.)+[\w-]+"   G "<0>a@b.c</0>"
+"[\w-]+@([\w-]+\.)+[\w-]+"     "asdf"
+"[\w-]+@([\w-]+\.)+[\w-]+"     "1234"
 "\d{4}-?\d{4}-?\d{4}-?\d{4}"   G "<0>1234-1234-1234-1234</0>"
 "\d{4}-?\d{4}-?\d{4}-?\d{4}"   G "<0>1234123412341234</0>"
 "\d{4}-?\d{4}-?\d{4}-?\d{4}"     "1234123412345"