ICU-5386 Regular Expressions update, improved Java 1.5 compatibility. svn merge -r 22747:23061 from branches/andy/regex

X-SVN-Rev: 23063
This commit is contained in:
Andy Heninger 2007-12-11 21:30:10 +00:00
parent 0d216c877d
commit 67e296e813
21 changed files with 3974 additions and 1300 deletions

View file

@ -16,10 +16,10 @@
#include "unicode/uset.h"
/**
* \file
* \file
* \brief C++ API: Unicode Set
*/
U_NAMESPACE_BEGIN
class BMPSet;
@ -1213,6 +1213,14 @@ public:
*/
UnicodeSet& closeOver(int32_t attribute);
/**
* Remove all strings from this set.
* Only the set's collection of strings is emptied; the code points
* and ranges contained in the set are not modified by this call.
*
* @return a reference to this set.
* @internal
*/
virtual UnicodeSet &removeAllStrings();
/**
* Iteration method that returns the number of ranges contained in
* this set.

View file

@ -718,6 +718,9 @@ typedef enum UErrorCode {
U_REGEX_INVALID_FLAG, /**< Invalid value for match mode flags. */
U_REGEX_LOOK_BEHIND_LIMIT, /**< Look-Behind pattern matches must have a bounded maximum length. */
U_REGEX_SET_CONTAINS_STRING, /**< Regexps cannot have UnicodeSets containing strings.*/
U_REGEX_OCTAL_TOO_BIG, /**< Octal character constants must be <= 0377. */
U_REGEX_MISSING_CLOSE_BRACKET, /**< Missing closing bracket on a bracket expression. */
U_REGEX_INVALID_RANGE, /**< In a character range [x-y], x is greater than y. */
U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
/*

View file

@ -1037,6 +1037,12 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
return *this;
}
/**
 * Remove all strings from this set. The code points and ranges held by
 * the set are not touched; only the string collection is emptied.
 *
 * @return a reference to this set, to allow call chaining.
 */
UnicodeSet& UnicodeSet::removeAllStrings() {
    // Defensive guard: if the string vector is absent (presumably possible
    // when its allocation failed -- verify against the constructors), there
    // is nothing to remove, so treat the call as a no-op instead of
    // dereferencing a null pointer.
    // NOTE(review): this mutator does not consult any "frozen" state of the
    // set; confirm whether it should, for consistency with other mutators.
    if (strings != NULL) {
        strings->removeAllElements();
    }
    return *this;
}
/**
* Makes a set from a multicharacter string. Thus "ch" => {"ch"}
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>

View file

@ -152,7 +152,10 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
"U_REGEX_INVALID_BACK_REF",
"U_REGEX_INVALID_FLAG",
"U_REGEX_LOOK_BEHIND_LIMIT",
"U_REGEX_SET_CONTAINS_STRING"
"U_REGEX_SET_CONTAINS_STRING",
"U_REGEX_OCTAL_TOO_BIG",
"U_REGEX_MISSING_CLOSE_BRACKET",
"U_REGEX_INVALID_RANGE"
};
static const char * const

File diff suppressed because it is too large Load diff

View file

@ -51,7 +51,7 @@ public:
};
RegexCompile(RegexPattern *rp, UErrorCode &e);
void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);
@ -68,7 +68,7 @@ public:
// determines the code to be generated when the matching close ) is encountered.
enum EParenClass {
plain = -1, // No special handling
capturing = -2,
capturing = -2,
atomic = -3,
lookAhead = -4,
negLookAhead = -5,
@ -85,8 +85,8 @@ private:
UChar32 nextCharLL();
UChar32 peekCharLL();
UnicodeSet *scanSet();
UnicodeSet *scanProp();
UnicodeSet *scanPosixProp();
void handleCloseParen();
int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern
// at the top of the just completed block
@ -109,7 +109,11 @@ private:
int32_t end);
void matchStartType();
void stripNOPs();
void OptDotStar();
void setEval(int32_t op);
void setPushOp(int32_t op);
UChar32 scanNamedChar();
UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated);
UErrorCode *fStatus;
@ -125,7 +129,7 @@ private:
// is the first character not yet scanned.
UBool fQuoteMode; // Scan is in a \Q...\E quoted region
UBool fInBackslashQuote; // Scan is between a '\' and the following char.
UBool fEOLComments; // When scan is just after '(?', inhibit #... to
UBool fEOLComments; // When scan is just after '(?', inhibit #... to
// end of line comments, in favor of (?#...) comments.
int32_t fLineNum; // Line number in input file.
int32_t fCharNum; // Char position within the line.
@ -167,7 +171,7 @@ private:
UVector32 fParenStack; // parentheses stack. Each frame consists of
// the positions of compiled pattern operations
// needing fixup, followed by negative value. The
// needing fixup, followed by negative value. The
// first entry in each frame is the position of the
// spot reserved for use when a quantifier
// needs to add a SAVE at the start of a (block)
@ -194,8 +198,33 @@ private:
int32_t fNameStartPos; // Starting position of a \N{NAME} name in a
// pattern, valid while remainder of name is
// scanned.
UStack fSetStack; // Stack of UnicodeSets, used while evaluating
// (at compile time) set expressions within
// the pattern.
UStack fSetOpStack; // Stack of pending set operators (&&, --, union)
UChar32 fLastSetLiteral; // The last single code point added to a set.
// needed when "-y" is scanned, and we need
// to turn "x-y" into a range.
};
// Operator codes pushed onto fSetOpStack while scanning and evaluating
// [set expressions]. Every value packs two fields into one integer:
//     high 16 bits  - the precedence of the operator
//     low  16 bits  - a code identifying the operation itself
enum SetOperations {
    setStart         = (0 << 16) | 1,
    setEnd           = (1 << 16) | 2,
    setNegation      = (2 << 16) | 3,
    setCaseClose     = (2 << 16) | 9,
    setDifference2   = (3 << 16) | 4,    // '--'  set difference operator
    setIntersection2 = (3 << 16) | 5,    // '&&'  set intersection operator
    setUnion         = (4 << 16) | 6,    // implicit union of adjacent items
    setDifference1   = (4 << 16) | 7,    // '-'   single dash difference op, kept for
                                         //       compatibility with old UnicodeSet.
    setIntersection1 = (4 << 16) | 8     // '&'   single amp intersection op, kept for
                                         //       compatibility with old UnicodeSet.
};
U_NAMESPACE_END
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
#endif // RBBISCAN_H

View file

@ -5,7 +5,7 @@
// It is generated by the Perl script "regexcst.pl" from
// the rule parser state definitions file "regexcst.txt".
//
// Copyright (C) 2002-2003 International Business Machines Corporation
// Copyright (C) 2002-2007 International Business Machines Corporation
// and others. All rights reserved.
//
//---------------------------------------------------------------------------------
@ -17,74 +17,100 @@ U_NAMESPACE_BEGIN
// Character classes for regex pattern scanning.
//
static const uint8_t kRuleSet_digit_char = 128;
static const uint8_t kRuleSet_white_space = 129;
static const uint8_t kRuleSet_rule_char = 130;
static const uint8_t kRuleSet_rule_char = 129;
enum Regex_PatternParseAction {
doPossessivePlus,
doCloseParen,
doProperty,
doBeginMatchMode,
doOrOperator,
doOpenCaptureParen,
doBadOpenParenType,
doRuleError,
doIntevalLowerDigit,
doBackslashs,
doNGOpt,
doBackslashw,
doMismatchedParenErr,
doOpenLookBehind,
doBackslashz,
doIntervalError,
doStar,
doCaret,
doEnterQuoteMode,
doNGStar,
doMatchMode,
doIntervalUpperDigit,
doOpenLookAheadNeg,
doPlus,
doOpenNonCaptureParen,
doBackslashA,
doBackslashB,
doNGPlus,
doSetMatchMode,
doPatFinish,
doBackslashD,
doPossessiveInterval,
doEscapeError,
doBackslashG,
doSuppressComments,
doMatchModeParen,
doOpt,
doInterval,
doLiteralChar,
doIntervalInit,
doOpenAtomicParen,
doBackslashS,
doOpenLookAhead,
doBackRef,
doDollar,
doDotAny,
doBackslashW,
doBackslashX,
doScanUnicodeSet,
doBackslashZ,
doPerlInline,
doPossessiveOpt,
doSetEnd,
doBackslashA,
doSetBeginUnion,
doNOP,
doConditionalExpr,
doExit,
doNGInterval,
doPatStart,
doBadModeFlag,
doBackslashb,
doPossessiveStar,
doBackslashd,
doIntervalSame,
doSetBackslash_w,
doSetRange,
doBackslashG,
doPerlInline,
doSetAddDash,
doIntevalLowerDigit,
doProperty,
doBackslashX,
doOpenAtomicParen,
doSetLiteralEscaped,
doPatFinish,
doSetBackslash_D,
doSetDifference2,
doNamedChar,
doNGPlus,
doOpenLookBehindNeg,
doIntervalError,
doIntervalSame,
doBackRef,
doPlus,
doOpenCaptureParen,
doMismatchedParenErr,
doBeginMatchMode,
doEscapeError,
doOpenNonCaptureParen,
doDollar,
doSetProp,
doIntervalUpperDigit,
doSetBegin,
doBackslashs,
doOpenLookBehind,
doSetMatchMode,
doOrOperator,
doCaret,
doMatchModeParen,
doStar,
doOpt,
doMatchMode,
doSuppressComments,
doPossessiveInterval,
doOpenLookAheadNeg,
doBackslashW,
doCloseParen,
doSetOpError,
doIntervalInit,
doSetFinish,
doSetIntersection2,
doNGStar,
doEnterQuoteMode,
doSetAddAmp,
doBackslashB,
doBackslashw,
doPossessiveOpt,
doSetNegate,
doRuleError,
doBackslashb,
doConditionalExpr,
doPossessivePlus,
doBadOpenParenType,
doNGInterval,
doSetLiteral,
doSetNamedChar,
doBackslashd,
doSetBeginDifference1,
doBackslashD,
doExit,
doSetBackslash_S,
doInterval,
doSetNoCloseError,
doNGOpt,
doSetPosixProp,
doBackslashS,
doBackslashZ,
doSetBeginIntersection1,
doSetBackslash_W,
doSetBackslash_d,
doOpenLookAhead,
doBadModeFlag,
doPatStart,
doSetNamedRange,
doPossessiveStar,
doEscapedLiteralChar,
doSetBackslash_s,
doBackslashz,
doDotAny,
rbbiLastAction};
//-------------------------------------------------------------------------------
@ -106,21 +132,21 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
{doNOP, 0, 0, 0, TRUE}
, {doPatStart, 255, 2,0, FALSE} // 1 start
, {doLiteralChar, 254, 14,0, TRUE} // 2 term
, {doLiteralChar, 130, 14,0, TRUE} // 3
, {doScanUnicodeSet, 91 /* [ */, 14,0, TRUE} // 4
, {doLiteralChar, 129, 14,0, TRUE} // 3
, {doSetBegin, 91 /* [ */, 104, 182, TRUE} // 4
, {doNOP, 40 /* ( */, 27,0, TRUE} // 5
, {doDotAny, 46 /* . */, 14,0, TRUE} // 6
, {doCaret, 94 /* ^ */, 2,0, TRUE} // 7
, {doDollar, 36 /* $ */, 2,0, TRUE} // 8
, {doNOP, 92 /* \ */, 81,0, TRUE} // 9
, {doCaret, 94 /* ^ */, 14,0, TRUE} // 7
, {doDollar, 36 /* $ */, 14,0, TRUE} // 8
, {doNOP, 92 /* \ */, 84,0, TRUE} // 9
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 10
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11
, {doPatFinish, 253, 2,0, FALSE} // 12
, {doRuleError, 255, 101,0, FALSE} // 13
, {doNOP, 42 /* * */, 59,0, TRUE} // 14 expr-quant
, {doNOP, 43 /* + */, 62,0, TRUE} // 15
, {doNOP, 63 /* ? */, 65,0, TRUE} // 16
, {doIntervalInit, 123 /* { */, 68,0, TRUE} // 17
, {doRuleError, 255, 183,0, FALSE} // 13
, {doNOP, 42 /* * */, 63,0, TRUE} // 14 expr-quant
, {doNOP, 43 /* + */, 66,0, TRUE} // 15
, {doNOP, 63 /* ? */, 69,0, TRUE} // 16
, {doIntervalInit, 123 /* { */, 72,0, TRUE} // 17
, {doNOP, 40 /* ( */, 23,0, TRUE} // 18
, {doNOP, 255, 20,0, FALSE} // 19
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 20 expr-cont
@ -128,7 +154,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doNOP, 255, 2,0, FALSE} // 22
, {doSuppressComments, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant
, {doNOP, 255, 27,0, FALSE} // 24
, {doNOP, 35 /* # */, 47, 14, TRUE} // 25 open-paren-quant2
, {doNOP, 35 /* # */, 49, 14, TRUE} // 25 open-paren-quant2
, {doNOP, 255, 29,0, FALSE} // 26
, {doSuppressComments, 63 /* ? */, 29,0, TRUE} // 27 open-paren
, {doOpenCaptureParen, 255, 2, 14, FALSE} // 28
@ -136,75 +162,157 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doOpenAtomicParen, 62 /* > */, 2, 14, TRUE} // 30
, {doOpenLookAhead, 61 /* = */, 2, 20, TRUE} // 31
, {doOpenLookAheadNeg, 33 /* ! */, 2, 20, TRUE} // 32
, {doNOP, 60 /* < */, 44,0, TRUE} // 33
, {doNOP, 35 /* # */, 47, 2, TRUE} // 34
, {doBeginMatchMode, 105 /* i */, 50,0, FALSE} // 35
, {doBeginMatchMode, 109 /* m */, 50,0, FALSE} // 36
, {doBeginMatchMode, 115 /* s */, 50,0, FALSE} // 37
, {doBeginMatchMode, 119 /* w */, 50,0, FALSE} // 38
, {doBeginMatchMode, 120 /* x */, 50,0, FALSE} // 39
, {doBeginMatchMode, 45 /* - */, 50,0, FALSE} // 40
, {doConditionalExpr, 40 /* ( */, 101,0, TRUE} // 41
, {doPerlInline, 123 /* { */, 101,0, TRUE} // 42
, {doBadOpenParenType, 255, 101,0, FALSE} // 43
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 44 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 45
, {doBadOpenParenType, 255, 101,0, FALSE} // 46
, {doNOP, 41 /* ) */, 255,0, TRUE} // 47 paren-comment
, {doMismatchedParenErr, 253, 101,0, FALSE} // 48
, {doNOP, 255, 47,0, TRUE} // 49
, {doMatchMode, 105 /* i */, 50,0, TRUE} // 50 paren-flag
, {doMatchMode, 109 /* m */, 50,0, TRUE} // 51
, {doMatchMode, 115 /* s */, 50,0, TRUE} // 52
, {doMatchMode, 119 /* w */, 50,0, TRUE} // 53
, {doMatchMode, 120 /* x */, 50,0, TRUE} // 54
, {doMatchMode, 45 /* - */, 50,0, TRUE} // 55
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 56
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 57
, {doBadModeFlag, 255, 101,0, FALSE} // 58
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 59 quant-star
, {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 60
, {doStar, 255, 20,0, FALSE} // 61
, {doNGPlus, 63 /* ? */, 20,0, TRUE} // 62 quant-plus
, {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 63
, {doPlus, 255, 20,0, FALSE} // 64
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 65 quant-opt
, {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 66
, {doOpt, 255, 20,0, FALSE} // 67
, {doNOP, 129, 68,0, TRUE} // 68 interval-open
, {doNOP, 128, 71,0, FALSE} // 69
, {doIntervalError, 255, 101,0, FALSE} // 70
, {doIntevalLowerDigit, 128, 71,0, TRUE} // 71 interval-lower
, {doNOP, 44 /* , */, 75,0, TRUE} // 72
, {doIntervalSame, 125 /* } */, 78,0, TRUE} // 73
, {doIntervalError, 255, 101,0, FALSE} // 74
, {doIntervalUpperDigit, 128, 75,0, TRUE} // 75 interval-upper
, {doNOP, 125 /* } */, 78,0, TRUE} // 76
, {doIntervalError, 255, 101,0, FALSE} // 77
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 78 interval-type
, {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 79
, {doInterval, 255, 20,0, FALSE} // 80
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 81 backslash
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 82
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 83
, {doBackslashd, 100 /* d */, 14,0, TRUE} // 84
, {doBackslashD, 68 /* D */, 14,0, TRUE} // 85
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 86
, {doProperty, 78 /* N */, 14,0, FALSE} // 87
, {doProperty, 112 /* p */, 14,0, FALSE} // 88
, {doProperty, 80 /* P */, 14,0, FALSE} // 89
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 90
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 91
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 92
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 93
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 94
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 95
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 96
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 97
, {doBackRef, 128, 14,0, TRUE} // 98
, {doEscapeError, 253, 101,0, FALSE} // 99
, {doLiteralChar, 255, 14,0, TRUE} // 100
, {doExit, 255, 101,0, TRUE} // 101 errorDeath
, {doNOP, 60 /* < */, 46,0, TRUE} // 33
, {doNOP, 35 /* # */, 49, 2, TRUE} // 34
, {doBeginMatchMode, 105 /* i */, 52,0, FALSE} // 35
, {doBeginMatchMode, 100 /* d */, 52,0, FALSE} // 36
, {doBeginMatchMode, 109 /* m */, 52,0, FALSE} // 37
, {doBeginMatchMode, 115 /* s */, 52,0, FALSE} // 38
, {doBeginMatchMode, 117 /* u */, 52,0, FALSE} // 39
, {doBeginMatchMode, 119 /* w */, 52,0, FALSE} // 40
, {doBeginMatchMode, 120 /* x */, 52,0, FALSE} // 41
, {doBeginMatchMode, 45 /* - */, 52,0, FALSE} // 42
, {doConditionalExpr, 40 /* ( */, 183,0, TRUE} // 43
, {doPerlInline, 123 /* { */, 183,0, TRUE} // 44
, {doBadOpenParenType, 255, 183,0, FALSE} // 45
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 46 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 47
, {doBadOpenParenType, 255, 183,0, FALSE} // 48
, {doNOP, 41 /* ) */, 255,0, TRUE} // 49 paren-comment
, {doMismatchedParenErr, 253, 183,0, FALSE} // 50
, {doNOP, 255, 49,0, TRUE} // 51
, {doMatchMode, 105 /* i */, 52,0, TRUE} // 52 paren-flag
, {doMatchMode, 100 /* d */, 52,0, TRUE} // 53
, {doMatchMode, 109 /* m */, 52,0, TRUE} // 54
, {doMatchMode, 115 /* s */, 52,0, TRUE} // 55
, {doMatchMode, 117 /* u */, 52,0, TRUE} // 56
, {doMatchMode, 119 /* w */, 52,0, TRUE} // 57
, {doMatchMode, 120 /* x */, 52,0, TRUE} // 58
, {doMatchMode, 45 /* - */, 52,0, TRUE} // 59
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 60
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 61
, {doBadModeFlag, 255, 183,0, FALSE} // 62
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 63 quant-star
, {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 64
, {doStar, 255, 20,0, FALSE} // 65
, {doNGPlus, 63 /* ? */, 20,0, TRUE} // 66 quant-plus
, {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 67
, {doPlus, 255, 20,0, FALSE} // 68
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 69 quant-opt
, {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 70
, {doOpt, 255, 20,0, FALSE} // 71
, {doNOP, 128, 74,0, FALSE} // 72 interval-open
, {doIntervalError, 255, 183,0, FALSE} // 73
, {doIntevalLowerDigit, 128, 74,0, TRUE} // 74 interval-lower
, {doNOP, 44 /* , */, 78,0, TRUE} // 75
, {doIntervalSame, 125 /* } */, 81,0, TRUE} // 76
, {doIntervalError, 255, 183,0, FALSE} // 77
, {doIntervalUpperDigit, 128, 78,0, TRUE} // 78 interval-upper
, {doNOP, 125 /* } */, 81,0, TRUE} // 79
, {doIntervalError, 255, 183,0, FALSE} // 80
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 81 interval-type
, {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 82
, {doInterval, 255, 20,0, FALSE} // 83
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 84 backslash
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 85
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 86
, {doBackslashd, 100 /* d */, 14,0, TRUE} // 87
, {doBackslashD, 68 /* D */, 14,0, TRUE} // 88
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 89
, {doNamedChar, 78 /* N */, 14,0, FALSE} // 90
, {doProperty, 112 /* p */, 14,0, FALSE} // 91
, {doProperty, 80 /* P */, 14,0, FALSE} // 92
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 93
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 94
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 95
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 96
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 97
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 98
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 99
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 100
, {doBackRef, 128, 14,0, TRUE} // 101
, {doEscapeError, 253, 183,0, FALSE} // 102
, {doEscapedLiteralChar, 255, 14,0, TRUE} // 103
, {doSetNegate, 94 /* ^ */, 107,0, TRUE} // 104 set-open
, {doSetPosixProp, 58 /* : */, 109,0, FALSE} // 105
, {doNOP, 255, 107,0, FALSE} // 106
, {doSetLiteral, 93 /* ] */, 122,0, TRUE} // 107 set-open2
, {doNOP, 255, 112,0, FALSE} // 108
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 109 set-posix
, {doNOP, 58 /* : */, 112,0, FALSE} // 110
, {doRuleError, 255, 183,0, FALSE} // 111
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 112 set-start
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 113
, {doNOP, 92 /* \ */, 172,0, TRUE} // 114
, {doNOP, 45 /* - */, 118,0, TRUE} // 115
, {doNOP, 38 /* & */, 120,0, TRUE} // 116
, {doSetLiteral, 255, 122,0, TRUE} // 117
, {doRuleError, 45 /* - */, 183,0, FALSE} // 118 set-start-dash
, {doSetAddDash, 255, 122,0, FALSE} // 119
, {doRuleError, 38 /* & */, 183,0, FALSE} // 120 set-start-amp
, {doSetAddAmp, 255, 122,0, FALSE} // 121
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 122 set-after-lit
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 123
, {doNOP, 45 /* - */, 159,0, TRUE} // 124
, {doNOP, 38 /* & */, 150,0, TRUE} // 125
, {doNOP, 92 /* \ */, 172,0, TRUE} // 126
, {doSetNoCloseError, 253, 183,0, FALSE} // 127
, {doSetLiteral, 255, 122,0, TRUE} // 128
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 129 set-after-set
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 130
, {doNOP, 45 /* - */, 152,0, TRUE} // 131
, {doNOP, 38 /* & */, 147,0, TRUE} // 132
, {doNOP, 92 /* \ */, 172,0, TRUE} // 133
, {doSetNoCloseError, 253, 183,0, FALSE} // 134
, {doSetLiteral, 255, 122,0, TRUE} // 135
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 136 set-after-range
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 137
, {doNOP, 45 /* - */, 155,0, TRUE} // 138
, {doNOP, 38 /* & */, 157,0, TRUE} // 139
, {doNOP, 92 /* \ */, 172,0, TRUE} // 140
, {doSetNoCloseError, 253, 183,0, FALSE} // 141
, {doSetLiteral, 255, 122,0, TRUE} // 142
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 143 set-after-op
, {doSetOpError, 93 /* ] */, 183,0, FALSE} // 144
, {doNOP, 92 /* \ */, 172,0, TRUE} // 145
, {doSetLiteral, 255, 122,0, TRUE} // 146
, {doSetBeginIntersection1, 91 /* [ */, 104, 129, TRUE} // 147 set-set-amp
, {doSetIntersection2, 38 /* & */, 143,0, TRUE} // 148
, {doSetAddAmp, 255, 122,0, FALSE} // 149
, {doSetIntersection2, 38 /* & */, 143,0, TRUE} // 150 set-lit-amp
, {doSetAddAmp, 255, 122,0, FALSE} // 151
, {doSetBeginDifference1, 91 /* [ */, 104, 129, TRUE} // 152 set-set-dash
, {doSetDifference2, 45 /* - */, 143,0, TRUE} // 153
, {doSetAddDash, 255, 122,0, FALSE} // 154
, {doSetDifference2, 45 /* - */, 143,0, TRUE} // 155 set-range-dash
, {doSetAddDash, 255, 122,0, FALSE} // 156
, {doSetIntersection2, 38 /* & */, 143,0, TRUE} // 157 set-range-amp
, {doSetAddAmp, 255, 122,0, FALSE} // 158
, {doSetDifference2, 45 /* - */, 143,0, TRUE} // 159 set-lit-dash
, {doSetAddDash, 91 /* [ */, 122,0, FALSE} // 160
, {doSetAddDash, 93 /* ] */, 122,0, FALSE} // 161
, {doNOP, 92 /* \ */, 164,0, TRUE} // 162
, {doSetRange, 255, 136,0, TRUE} // 163
, {doSetOpError, 115 /* s */, 183,0, FALSE} // 164 set-lit-dash-escape
, {doSetOpError, 83 /* S */, 183,0, FALSE} // 165
, {doSetOpError, 119 /* w */, 183,0, FALSE} // 166
, {doSetOpError, 87 /* W */, 183,0, FALSE} // 167
, {doSetOpError, 100 /* d */, 183,0, FALSE} // 168
, {doSetOpError, 68 /* D */, 183,0, FALSE} // 169
, {doSetNamedRange, 78 /* N */, 136,0, FALSE} // 170
, {doSetRange, 255, 136,0, TRUE} // 171
, {doSetProp, 112 /* p */, 129,0, FALSE} // 172 set-escape
, {doSetProp, 80 /* P */, 129,0, FALSE} // 173
, {doSetNamedChar, 78 /* N */, 122,0, FALSE} // 174
, {doSetBackslash_s, 115 /* s */, 136,0, TRUE} // 175
, {doSetBackslash_S, 83 /* S */, 136,0, TRUE} // 176
, {doSetBackslash_w, 119 /* w */, 136,0, TRUE} // 177
, {doSetBackslash_W, 87 /* W */, 136,0, TRUE} // 178
, {doSetBackslash_d, 100 /* d */, 136,0, TRUE} // 179
, {doSetBackslash_D, 68 /* D */, 136,0, TRUE} // 180
, {doSetLiteralEscaped, 255, 122,0, TRUE} // 181
, {doSetFinish, 255, 14,0, FALSE} // 182 set-finish
, {doExit, 255, 183,0, TRUE} // 183 errorDeath
};
static const char * const RegexStateNames[] = { 0,
"start",
@ -249,6 +357,8 @@ static const char * const RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
0,
"open-paren-lookbehind",
0,
@ -264,6 +374,8 @@ static const char * const RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
0,
"quant-star",
0,
@ -275,7 +387,6 @@ static const char * const RegexStateNames[] = { 0,
0,
0,
"interval-open",
0,
0,
"interval-lower",
0,
@ -307,6 +418,85 @@ static const char * const RegexStateNames[] = { 0,
0,
0,
0,
"set-open",
0,
0,
"set-open2",
0,
"set-posix",
0,
0,
"set-start",
0,
0,
0,
0,
0,
"set-start-dash",
0,
"set-start-amp",
0,
"set-after-lit",
0,
0,
0,
0,
0,
0,
"set-after-set",
0,
0,
0,
0,
0,
0,
"set-after-range",
0,
0,
0,
0,
0,
0,
"set-after-op",
0,
0,
0,
"set-set-amp",
0,
0,
"set-lit-amp",
0,
"set-set-dash",
0,
0,
"set-range-dash",
0,
"set-range-amp",
0,
"set-lit-dash",
0,
0,
0,
0,
"set-lit-dash-escape",
0,
0,
0,
0,
0,
0,
0,
"set-escape",
0,
0,
0,
0,
0,
0,
0,
0,
0,
"set-finish",
"errorDeath",
0};

View file

@ -1,7 +1,7 @@
#!/usr/bin/perl
# ********************************************************************
# * COPYRIGHT:
# * Copyright (c) 2002-2003, International Business Machines Corporation and
# * Copyright (c) 2002-2007, International Business Machines Corporation and
# * others. All Rights Reserved.
# ********************************************************************
#
@ -22,10 +22,6 @@
# for the Rule Based Break Iterator Rule Parser. Perhaps they could be
# merged?
#
#*********************************************************************
# Copyright (C) 2002 International Business Machines Corporation *
# and others. All rights reserved. *
#*********************************************************************
$num_states = 1; # Always the state number for the line being compiled.
@ -210,7 +206,7 @@ print "// This file contains the state table for the ICU Regular Expression P
print "// It is generated by the Perl script \"regexcst.pl\" from\n";
print "// the rule parser state definitions file \"regexcst.txt\".\n";
print "//\n";
print "// Copyright (C) 2002-2003 International Business Machines Corporation \n";
print "// Copyright (C) 2002-2007 International Business Machines Corporation \n";
print "// and others. All rights reserved. \n";
print "//\n";
print "//---------------------------------------------------------------------------------\n";

View file

@ -1,7 +1,7 @@
#*****************************************************************************
#
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
# Copyright (C) 2002-2007, International Business Machines Corporation and others.
# All Rights Reserved.
#
#*****************************************************************************
@ -25,8 +25,8 @@
#
#
#StateName:
# input-char n next-state ^push-state action
# input-char n next-state ^push-state action
# input-char n next-state ^push-state action
# input-char n next-state ^push-state action
# | | | | |
# | | | | |--- action to be performed by state machine
# | | | | See function RBBIRuleScanner::doParseActions()
@ -46,7 +46,7 @@
# matches, perform the actions and go to the state specified on this line.
# The input character is tested sequentially, in the order written. The characters and
# character classes tested for do not need to be mutually exclusive. The first match wins.
#
#
@ -56,27 +56,27 @@
#
start:
default term doPatStart
#
# term. At a position where we can accept the start of most items in a pattern.
#
term:
quoted n expr-quant doLiteralChar
rule_char n expr-quant doLiteralChar
'[' n expr-quant doScanUnicodeSet
'(' n open-paren
'[' n set-open ^set-finish doSetBegin
'(' n open-paren
'.' n expr-quant doDotAny
'^' n term doCaret
'$' n term doDollar
'^' n expr-quant doCaret
'$' n expr-quant doDollar
'\' n backslash
'|' n term doOrOperator
')' n pop doCloseParen
eof term doPatFinish
default errorDeath doRuleError
#
@ -84,14 +84,14 @@ term:
# trailing quantifier - *, +, ?, *?, etc.
#
expr-quant:
'*' n quant-star
'+' n quant-plus
'?' n quant-opt
'*' n quant-star
'+' n quant-plus
'?' n quant-opt
'{' n interval-open doIntervalInit
'(' n open-paren-quant
default expr-cont
default expr-cont
#
# expr-cont Expression, continuation. At a point where additional terms are
# allowed, but not required. No Quantifiers
@ -99,8 +99,8 @@ expr-quant:
expr-cont:
'|' n term doOrOperator
')' n pop doCloseParen
default term
default term
#
# open-paren-quant Special case handling for comments appearing before a quantifier,
@ -111,12 +111,12 @@ expr-cont:
open-paren-quant:
'?' n open-paren-quant2 doSuppressComments
default open-paren
open-paren-quant2:
'#' n paren-comment ^expr-quant
default open-paren-extended
#
# open-paren We've got an open paren. We need to scan further to
# determine what kind of open paren it is - plain (, (?:, (?>, or whatever.
@ -124,7 +124,7 @@ open-paren-quant2:
open-paren:
'?' n open-paren-extended doSuppressComments
default term ^expr-quant doOpenCaptureParen
open-paren-extended:
':' n term ^expr-quant doOpenNonCaptureParen # (?:
'>' n term ^expr-quant doOpenAtomicParen # (?>
@ -133,24 +133,25 @@ open-paren-extended:
'<' n open-paren-lookbehind
'#' n paren-comment ^term
'i' paren-flag doBeginMatchMode
'd' paren-flag doBeginMatchMode
'm' paren-flag doBeginMatchMode
's' paren-flag doBeginMatchMode
'u' paren-flag doBeginMatchMode
'w' paren-flag doBeginMatchMode
'x' paren-flag doBeginMatchMode
'-' paren-flag doBeginMatchMode
'(' n errorDeath doConditionalExpr
'{' n errorDeath doPerlInline
default errorDeath doBadOpenParenType
open-paren-lookbehind:
'=' n term ^expr-cont doOpenLookBehind # (?<=
'!' n term ^expr-cont doOpenLookBehindNeg # (?<!
default errorDeath doBadOpenParenType
#
# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
# TODO: should parens nest here? Check what perl does.
#
paren-comment:
')' n pop
@ -158,20 +159,22 @@ paren-comment:
default n paren-comment
#
# paren-flag Scanned a (?ismx-ismx flag setting
#
# paren-flag Scanned a (?ismx-ismx flag setting
#
paren-flag:
'i' n paren-flag doMatchMode
'd' n paren-flag doMatchMode
'm' n paren-flag doMatchMode
's' n paren-flag doMatchMode
'u' n paren-flag doMatchMode
'w' n paren-flag doMatchMode
'x' n paren-flag doMatchMode
'-' n paren-flag doMatchMode
')' n term doSetMatchMode
':' n term ^expr-quant doMatchModeParen
default errorDeath doBadModeFlag
#
# quant-star Scanning a '*' quantifier. Need to look ahead to decide
# between plain '*', '*?', '*+'
@ -204,13 +207,12 @@ quant-opt:
#
# Interval scanning a '{', the opening delimiter for an interval specification
# {number} or {min, max} or {min, }
# {number} or {min, max} or {min,}
#
interval-open:
white_space n interval-open # TODO: is white space allowed here in non-free mode?
digit_char interval-lower
digit_char interval-lower
default errorDeath doIntervalError
interval-lower:
digit_char n interval-lower doIntevalLowerDigit
',' n interval-upper
@ -221,13 +223,13 @@ interval-upper:
digit_char n interval-upper doIntervalUpperDigit
'}' n interval-type
default errorDeath doIntervalError
interval-type:
'?' n expr-cont doNGInterval # {n,m}?
'+' n expr-cont doPossessiveInterval # {n,m}+
default expr-cont doInterval # {m,n}
#
# backslash # Backslash. Figure out which of the \thingies we have encountered.
# The low level next-char function will have preprocessed
@ -239,7 +241,7 @@ backslash:
'd' n expr-quant doBackslashd
'D' n expr-quant doBackslashD
'G' n term doBackslashG
'N' expr-quant doProperty # \N{NAME} named char
'N' expr-quant doNamedChar # \N{NAME} named char
'p' expr-quant doProperty # \p{Lu} style property
'P' expr-quant doProperty
'Q' n term doEnterQuoteMode
@ -250,11 +252,210 @@ backslash:
'X' n expr-quant doBackslashX
'Z' n term doBackslashZ
'z' n term doBackslashz
digit_char n expr-quant doBackRef # Will scan multiple digits
digit_char n expr-quant doBackRef # Will scan multiple digits
eof errorDeath doEscapeError
default n expr-quant doLiteralChar # Escaped literal char.
default n expr-quant doEscapedLiteralChar
#
# [set expression] parsing,
# All states involved in parsing set expressions have names beginning with "set-"
#
set-open:
'^' n set-open2 doSetNegate
':' set-posix doSetPosixProp
default set-open2
set-open2:
']' n set-after-lit doSetLiteral
default set-start
# set-posix:
# scanned a '[:' If it really is a [:property:], doSetPosixProp will have
# moved the scan to the closing ']'. If it wasn't a property
# expression, the scan will still be at the opening ':', which should
# be interpreted as a normal set expression.
set-posix:
']' n pop doSetEnd
':' set-start
default errorDeath doRuleError # should not be possible.
#
# set-start after the [ and special case leading characters (^ and/or ]) but before
# everything else. A '-' is literal at this point.
#
set-start:
']' n pop doSetEnd
'[' n set-open ^set-after-set doSetBeginUnion
'\' n set-escape
'-' n set-start-dash
'&' n set-start-amp
default n set-after-lit doSetLiteral
# set-start-dash Turn "[--" into a syntax error.
# "[-x" is good, - and x are literals.
#
set-start-dash:
'-' errorDeath doRuleError
default set-after-lit doSetAddDash
# set-start-amp Turn "[&&" into a syntax error.
# "[&x" is good, & and x are literals.
#
set-start-amp:
'&' errorDeath doRuleError
default set-after-lit doSetAddAmp
#
# set-after-lit The last thing scanned was a literal character within a set.
# Can be followed by anything. Single '-' or '&' are
# literals in this context, not operators.
set-after-lit:
']' n pop doSetEnd
'[' n set-open ^set-after-set doSetBeginUnion
'-' n set-lit-dash
'&' n set-lit-amp
'\' n set-escape
eof errorDeath doSetNoCloseError
default n set-after-lit doSetLiteral
set-after-set:
']' n pop doSetEnd
'[' n set-open ^set-after-set doSetBeginUnion
'-' n set-set-dash
'&' n set-set-amp
'\' n set-escape
eof errorDeath doSetNoCloseError
default n set-after-lit doSetLiteral
set-after-range:
']' n pop doSetEnd
'[' n set-open ^set-after-set doSetBeginUnion
'-' n set-range-dash
'&' n set-range-amp
'\' n set-escape
eof errorDeath doSetNoCloseError
default n set-after-lit doSetLiteral
# set-after-op
# After a -- or &&
# It is an error to close a set at this point.
#
set-after-op:
'[' n set-open ^set-after-set doSetBeginUnion
']' errorDeath doSetOpError
'\' n set-escape
default n set-after-lit doSetLiteral
#
# set-set-amp
# Have scanned [[set]&
# Could be a '&' intersection operator, if a set follows.
# Could be the start of a '&&' operator.
# Otherwise is a literal.
set-set-amp:
'[' n set-open ^set-after-set doSetBeginIntersection1
'&' n set-after-op doSetIntersection2
default set-after-lit doSetAddAmp
# set-lit-amp Have scanned "[literals&"
# Could be a start of "&&" operator or a literal
# In [abc&[def]], the '&' is a literal
#
set-lit-amp:
'&' n set-after-op doSetIntersection2
default set-after-lit doSetAddAmp
#
# set-set-dash
# Have scanned [set]-
# Could be a '-' difference operator, if a [set] follows.
# Could be the start of a '--' operator.
# Otherwise is a literal.
set-set-dash:
'[' n set-open ^set-after-set doSetBeginDifference1
'-' n set-after-op doSetDifference2
default set-after-lit doSetAddDash
#
# set-range-dash
# scanned a-b- or \w-
# any set or range like item where the trailing single '-' should
# be literal, not a set difference operation.
# A trailing "--" is still a difference operator.
set-range-dash:
'-' n set-after-op doSetDifference2
default set-after-lit doSetAddDash
set-range-amp:
'&' n set-after-op doSetIntersection2
default set-after-lit doSetAddAmp
# set-lit-dash
# Have scanned "[literals-" Could be a range or a -- operator or a literal
# In [abc-[def]], the '-' is a literal (confirmed with a Java test)
# [abc-\p{xx} the '-' is an error
# [abc-] the '-' is a literal
# [ab-xy] the '-' is a range
#
set-lit-dash:
'-' n set-after-op doSetDifference2
'[' set-after-lit doSetAddDash
']' set-after-lit doSetAddDash
'\' n set-lit-dash-escape
default n set-after-range doSetRange
# set-lit-dash-escape
#
# scanned "[literal-\"
# Could be a range, if the \ introduces an escaped literal char or a named char.
# Otherwise it is an error.
#
set-lit-dash-escape:
's' errorDeath doSetOpError
'S' errorDeath doSetOpError
'w' errorDeath doSetOpError
'W' errorDeath doSetOpError
'd' errorDeath doSetOpError
'D' errorDeath doSetOpError
'N' set-after-range doSetNamedRange
default n set-after-range doSetRange
#
# set-escape
# Common back-slash escape processing within set expressions
#
set-escape:
'p' set-after-set doSetProp
'P' set-after-set doSetProp
'N' set-after-lit doSetNamedChar
's' n set-after-range doSetBackslash_s
'S' n set-after-range doSetBackslash_S
'w' n set-after-range doSetBackslash_w
'W' n set-after-range doSetBackslash_W
'd' n set-after-range doSetBackslash_d
'D' n set-after-range doSetBackslash_D
default n set-after-lit doSetLiteralEscaped
#
# set-finish
# Have just encountered the final ']' that completes a [set], and
# arrived here via a pop. From here, we exit the set parsing world, and go
# back to generic regular expression parsing.
#
set-finish:
default expr-quant doSetFinish
#
# errorDeath. This state is specified as the next state whenever a syntax error
# in the source rules is detected. Barring bugs, the state machine will never

View file

@ -1,6 +1,6 @@
//
// Copyright (C) 2002-2005 International Business Machines Corporation
// and others. All rights reserved.
//
// Copyright (C) 2002-2007 International Business Machines Corporation
// and others. All rights reserved.
//
// file: regeximp.h
//
@ -57,7 +57,7 @@ U_NAMESPACE_BEGIN
enum {
URX_RESERVED_OP = 0, // For multi-operand ops, most non-first words.
URX_RESERVED_OP_N = 255, // For multi-operand ops, negative operand values.
URX_BACKTRACK = 1,
URX_BACKTRACK = 1, // Force a backtrack, as if a match test had failed.
URX_END = 2,
URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match
URX_STRING = 4, // Value field is index of string start
@ -66,16 +66,16 @@ enum {
URX_NOP = 7,
URX_START_CAPTURE = 8, // Value field is capture group number.
URX_END_CAPTURE = 9, // Value field is capture group number
URX_STATIC_SETREF = 10, // Value field is index of set in array of sets.
URX_STATIC_SETREF = 10, // Value field is index of set in array of sets.
URX_SETREF = 11, // Value field is index of set in array of sets.
URX_DOTANY = 12,
URX_DOTANY = 12,
URX_JMP = 13, // Value field is destination position in
// the pattern.
URX_FAIL = 14, // Stop match operation, No match.
URX_JMP_SAV = 15, // Operand: JMP destination location
URX_BACKSLASH_B = 16, // Value field: 0: \b 1: \B
URX_BACKSLASH_G = 17,
URX_BACKSLASH_G = 17,
URX_JMP_SAV_X = 18, // Conditional JMP_SAV,
// Used in (x)+, breaks loop on zero length match.
// Operand: Jmp destination.
@ -88,21 +88,22 @@ enum {
URX_DOLLAR = 24, // Also for \Z
URX_CTR_INIT = 25, // Counter Inits for {Interval} loops.
URX_CTR_INIT_NG = 26, // 3 kinds, normal, non-greedy, and possessive.
URX_CTR_INIT_NG = 26, // 2 kinds, normal and non-greedy.
// These are 4 word opcodes. See description.
// First Operand: Data loc of counter variable
// 2nd Operand: Pat loc of the URX_CTR_LOOPx
// 2nd Operand: Pat loc of the URX_CTR_LOOPx
// at the end of the loop.
// 3rd Operand: Minimum count.
// 4th Operand: Max count, -1 for unbounded.
URX_DOTANY_PL = 27, // .+, match rest of the line. Fail already at end.
URX_DOTANY_UNIX = 27, // '.' operator in UNIX_LINES mode, only \n marks end of line.
URX_CTR_LOOP = 28, // Loop Ops for {interval} loops.
URX_CTR_LOOP_NG = 29, // Also in two flavors, normal and non-greedy.
// Operand is loc of corresponding CTR_INIT.
URX_DOTANY_ALL_PL = 30, // .+, match rest of the Input. Fail if already at end
URX_CARET_M_UNIX = 30, // '^' operator, test for start of line in multi-line
// plus UNIX_LINES mode.
URX_RELOC_OPRND = 31, // Operand value in multi-operand ops that refers
// back into compiled pattern code, and thus must
@ -118,7 +119,7 @@ enum {
// within the matcher stack frame.
URX_JMPX = 36, // Conditional JMP.
// First Operand: JMP target location.
// Second Operand: Data location containing an
// Second Operand: Data location containing an
// input position. If current input position ==
// saved input position, FAIL rather than taking
// the JMP
@ -157,7 +158,7 @@ enum {
URX_LBN_END = 48, // Negative LookBehind end
// Parameter is the data location.
// Check that the match ended at the right spot.
URX_STAT_SETREF_N = 49, // Reference to a prebuilt set (e.g. \w), negated
URX_STAT_SETREF_N = 49, // Reference to a prebuilt set (e.g. \w), negated
// Operand is index of set in array of sets.
URX_LOOP_SR_I = 50, // Init a [set]* loop.
// Operand is the sets index in array of user sets.
@ -166,12 +167,18 @@ enum {
// Must always immediately follow LOOP_x_I instruction.
URX_LOOP_DOT_I = 52, // .*, initialization of the optimized loop.
// Operand value:
// 0: Normal (. doesn't match new-line) mode.
// 1: . matches new-line mode.
URX_BACKSLASH_BU = 53 // \b or \B in UREGEX_UWORD mode, using Unicode style
// bit 0:
// 0: Normal (. doesn't match new-line) mode.
// 1: . matches new-line mode.
// bit 1: controls what new-lines are recognized by this operation.
// 0: All Unicode New-lines
// 1: UNIX_LINES, \u000a only.
URX_BACKSLASH_BU = 53, // \b or \B in UREGEX_UWORD mode, using Unicode style
// word boundaries.
URX_DOLLAR_D = 54, // $ end of input test, in UNIX_LINES mode.
URX_DOLLAR_MD = 55 // $ end of input test, in MULTI_LINE and UNIX_LINES mode.
};
};
// Keep this list of opcode names in sync with the above enum
// Used for debug printing only.
@ -203,10 +210,10 @@ enum {
"DOLLAR", \
"CTR_INIT", \
"CTR_INIT_NG", \
"DOTANY_PL", \
"DOTANY_UNIX", \
"CTR_LOOP", \
"CTR_LOOP_NG", \
"DOTANY_ALL_PL", \
"URX_CARET_M_UNIX", \
"RELOC_OPRND", \
"STO_SP", \
"LD_SP", \
@ -229,21 +236,23 @@ enum {
"LOOP_SR_I", \
"LOOP_C", \
"LOOP_DOT_I", \
"BACKSLASH_BU"
"BACKSLASH_BU", \
"DOLLAR_D", \
"DOLLAR_MD"
//
// Convenience macros for assembling and disassembling a compiled operation.
//
#define URX_BUILD(type, val) (int32_t)((type << 24) | (val))
#define URX_TYPE(x) ((uint32_t)(x) >> 24)
#define URX_TYPE(x) ((uint32_t)(x) >> 24)
#define URX_VAL(x) ((x) & 0xffffff)
//
// Access to Unicode Sets composite character properties
// The sets are accessed by the match engine for things like \w (word character)
//
//
enum {
URX_ISWORD_SET = 1,
URX_ISALNUM_SET = 2,
@ -297,7 +306,7 @@ enum StartOfMatch {
(v)==START_LINE? "START_LINE" : \
(v)==START_STRING? "START_STRING" : \
"ILLEGAL")
//
// 8 bit set, to fast-path latin-1 set membership tests.

View file

@ -59,9 +59,6 @@ static const UChar gRuleSet_rule_char_pattern[] = {
static const UChar gRuleSet_digit_char_pattern[] = {
// [ 0 - 9 ]
0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
//static const UnicodeSet *gRuleDigits = NULL;
//
// Here are the backslash escape characters that ICU's unescape() function
@ -72,16 +69,6 @@ static const UChar gUnescapeCharPattern[] = {
0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x78, 0x5d, 0};
//
// White space characters that may appear within a pattern in free-form mode
//
static const UChar gRuleWhiteSpacePattern[] = {
/* "[[:Cf:][:WSpace:]]" */
91, 91, 58, 67, 102, 58, 93, 91, 58, 87,
83, 112, 97, 99, 101, 58, 93, 93, 0 };
//
// Unicode Set Definitions for Regular Expression \w
//
@ -89,7 +76,7 @@ static const UChar gIsWordPattern[] = {
// [ \ p { A l p h a b e t i c }
0x5b, 0x5c, 0x70, 0x7b, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x62, 0x65, 0x74, 0x69, 0x63, 0x7d,
// \ p { M } Mark
0x5c, 0x70, 0x7b, 0x4d, 0x7d,
0x5c, 0x70, 0x7b, 0x4d, 0x7d,
// \ p { N d } Digit_Numeric
0x5c, 0x70, 0x7b, 0x4e, 0x64, 0x7d,
// \ p { P c } ] Connector_Punctuation
@ -108,8 +95,8 @@ static const UChar gIsSpacePattern[] = {
// UnicodeSets used in implementation of Grapheme Cluster detection, \X
//
static const UChar gGC_ControlPattern[] = {
// [ [ : Z l : ] [ : Z p : ]
0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d,
// [ [ : Z l : ] [ : Z p : ]
0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d,
// [ : C c : ] [ : C f : ] -
0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x2d,
// [ : G r a p h e m e _
@ -124,34 +111,35 @@ static const UChar gGC_ExtendPattern[] = {
0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x5d, 0};
static const UChar gGC_LPattern[] = {
// [ \ p { H a n g u l _ S y l
// [ \ p { H a n g u l _ S y l
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
// l a b l e _ T y p e = L } ]
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d, 0x5d, 0};
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d, 0x5d, 0};
static const UChar gGC_VPattern[] = {
// [ \ p { H a n g u l _ S y l
// [ \ p { H a n g u l _ S y l
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
// l a b l e _ T y p e = V } ]
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d, 0x5d, 0};
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d, 0x5d, 0};
static const UChar gGC_TPattern[] = {
// [ \ p { H a n g u l _ S y l
// [ \ p { H a n g u l _ S y l
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
// l a b l e _ T y p e = T } ]
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0};
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0};
static const UChar gGC_LVPattern[] = {
// [ \ p { H a n g u l _ S y l
// [ \ p { H a n g u l _ S y l
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
// l a b l e _ T y p e = L V } ]
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0};
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0};
static const UChar gGC_LVTPattern[] = {
// [ \ p { H a n g u l _ S y l
// [ \ p { H a n g u l _ S y l
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
// l a b l e _ T y p e = L V T } ]
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x54, 0x7d, 0x5d, 0};
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x54, 0x7d, 0x5d, 0};
RegexStaticSets *RegexStaticSets::gStaticSets = NULL;
@ -160,7 +148,7 @@ RegexStaticSets::RegexStaticSets(UErrorCode *status)
fUnescapeCharSet(UnicodeString(TRUE, gUnescapeCharPattern, -1), *status),
fRuleDigitsAlias(NULL)
{
// First zero out everything
// First zero out everything
int i;
for (i=0; i<URX_LAST_SET; i++) {
fPropSets[i] = NULL;
@ -171,7 +159,7 @@ fRuleDigitsAlias(NULL)
// Then init the sets to their correct values.
fPropSets[URX_ISWORD_SET] = new UnicodeSet(UnicodeString(TRUE, gIsWordPattern, -1), *status);
fPropSets[URX_ISSPACE_SET] = new UnicodeSet(UnicodeString(TRUE, gIsSpacePattern, -1), *status);
fPropSets[URX_ISSPACE_SET] = new UnicodeSet(UnicodeString(TRUE, gIsSpacePattern, -1), *status);
fPropSets[URX_GC_EXTEND] = new UnicodeSet(UnicodeString(TRUE, gGC_ExtendPattern, -1), *status);
fPropSets[URX_GC_CONTROL] = new UnicodeSet(UnicodeString(TRUE, gGC_ControlPattern, -1), *status);
fPropSets[URX_GC_L] = new UnicodeSet(UnicodeString(TRUE, gGC_LPattern, -1), *status);
@ -184,14 +172,14 @@ fRuleDigitsAlias(NULL)
// The rest of the initialization needs them, so we cannot proceed.
return;
}
//
// The following sets are dynamically constructed, because their
// initialization strings would be unreasonable.
//
//
// "Normal" is the set of characters that don't need special handling
// when finding grapheme cluster boundaries.
@ -202,7 +190,7 @@ fRuleDigitsAlias(NULL)
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_L]);
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_V]);
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_T]);
// Initialize the 8-bit fast bit sets from the parallel full
// UnicodeSets.
for (i=0; i<URX_LAST_SET; i++) {
@ -213,9 +201,8 @@ fRuleDigitsAlias(NULL)
}
// Sets used while parsing rules, but not referenced from the parse state table
fRuleSets[kRuleSet_rule_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1), *status);
fRuleSets[kRuleSet_white_space-128] = new UnicodeSet(UnicodeString(TRUE, gRuleWhiteSpacePattern, -1), *status);
fRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_digit_char_pattern, -1), *status);
fRuleSets[kRuleSet_rule_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1), *status);
fRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_digit_char_pattern, -1), *status);
fRuleDigitsAlias = fRuleSets[kRuleSet_digit_char-128];
for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) {
if (fRuleSets[i]) {
@ -281,7 +268,7 @@ void RegexStaticSets::initGlobals(UErrorCode *status) {
ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup);
}
}
U_NAMESPACE_END
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS

File diff suppressed because it is too large Load diff

View file

@ -1,9 +1,9 @@
//
// file: repattrn.cpp
// file: repattrn.cpp
//
/*
***************************************************************************
* Copyright (C) 2002-2006 International Business Machines Corporation *
* Copyright (C) 2002-2007 International Business Machines Corporation *
* and others. All rights reserved. *
***************************************************************************
*/
@ -46,7 +46,7 @@ RegexPattern::RegexPattern() {
//
//--------------------------------------------------------------------------
RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) {
init();
init();
*this = other;
}
@ -78,9 +78,9 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
fFrameSize = other.fFrameSize;
fDataSize = other.fDataSize;
fMaxCaptureDigits = other.fMaxCaptureDigits;
fStaticSets = other.fStaticSets;
fStaticSets = other.fStaticSets;
fStaticSets8 = other.fStaticSets8;
fStartType = other.fStartType;
fInitialStringIdx = other.fInitialStringIdx;
fInitialStringLen = other.fInitialStringLen;
@ -92,9 +92,9 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
fGroupMap->assign(*other.fGroupMap, fDeferredStatus);
// Copy the Unicode Sets.
// Copy the Unicode Sets.
// Could be made more efficient if the sets were reference counted and shared,
// but I doubt that pattern copying will be particularly common.
// but I doubt that pattern copying will be particularly common.
// Note: init() already added an empty element zero to fSets
int32_t i;
int32_t numSets = other.fSets->size();
@ -135,7 +135,7 @@ void RegexPattern::init() {
fFrameSize = 0;
fDataSize = 0;
fGroupMap = NULL;
fMaxCaptureDigits = 1;
fMaxCaptureDigits = 1;
fStaticSets = NULL;
fStaticSets8 = NULL;
fStartType = START_NO_INFO;
@ -144,7 +144,7 @@ void RegexPattern::init() {
fInitialChars = NULL;
fInitialChar = 0;
fInitialChars8 = NULL;
fCompiledPat = new UVector32(fDeferredStatus);
fGroupMap = new UVector32(fDeferredStatus);
fSets = new UVector(fDeferredStatus);
@ -166,7 +166,7 @@ void RegexPattern::init() {
//--------------------------------------------------------------------------
//
// zap Delete everything owned by this RegexPattern.
// zap Delete everything owned by this RegexPattern.
//
//--------------------------------------------------------------------------
void RegexPattern::zap() {
@ -208,7 +208,7 @@ RegexPattern::~RegexPattern() {
// Clone
//
//--------------------------------------------------------------------------
RegexPattern *RegexPattern::clone() const {
RegexPattern *RegexPattern::clone() const {
RegexPattern *copy = new RegexPattern(*this);
return copy;
}
@ -229,7 +229,7 @@ UBool RegexPattern::operator ==(const RegexPattern &other) const {
//---------------------------------------------------------------------
//
// compile
// compile
//
//---------------------------------------------------------------------
RegexPattern * U_EXPORT2
@ -244,7 +244,8 @@ RegexPattern::compile(const UnicodeString &regex,
}
const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD;
UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD |
UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES;
if ((flags & ~allFlags) != 0) {
status = U_REGEX_INVALID_FLAG;
@ -269,19 +270,24 @@ RegexPattern::compile(const UnicodeString &regex,
RegexCompile compiler(This, status);
compiler.compile(regex, pe, status);
if (U_FAILURE(status)) {
delete This;
This = NULL;
}
return This;
}
//
// compile with default flags.
//
RegexPattern * U_EXPORT2
RegexPattern::compile(const UnicodeString &regex,
UParseError &pe,
UErrorCode &err)
UErrorCode &err)
{
return compile(regex, 0, pe, err);
return compile(regex, 0, pe, err);
}
@ -292,10 +298,10 @@ RegexPattern::compile(const UnicodeString &regex,
RegexPattern * U_EXPORT2
RegexPattern::compile( const UnicodeString &regex,
uint32_t flags,
UErrorCode &err)
UErrorCode &err)
{
UParseError pe;
return compile(regex, flags, pe, err);
return compile(regex, flags, pe, err);
}
@ -326,7 +332,7 @@ RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
#if 0
RegexMatcher *RegexPattern::matcher(const UChar * /*input*/,
UErrorCode &status) const
UErrorCode &status) const
{
/* This should never get called. The API with UnicodeString should be called instead. */
if (U_SUCCESS(status)) {
@ -352,7 +358,7 @@ RegexMatcher *RegexPattern::matcher(UErrorCode &status) const {
return NULL;
}
retMatcher = new RegexMatcher(this);
retMatcher = new RegexMatcher(this);
if (retMatcher == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
@ -437,17 +443,15 @@ void RegexPattern::dumpOp(int32_t index) const {
int32_t val = URX_VAL(op);
int32_t type = URX_TYPE(op);
int32_t pinnedType = type;
if (pinnedType >= sizeof(opNames)/sizeof(char *)) {
if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) {
pinnedType = 0;
}
REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index, op, opNames[pinnedType]));
switch (type) {
case URX_NOP:
case URX_DOTANY:
case URX_DOTANY_ALL:
case URX_DOTANY_PL:
case URX_DOTANY_ALL_PL:
case URX_FAIL:
case URX_CARET:
case URX_DOLLAR:
@ -458,7 +462,7 @@ void RegexPattern::dumpOp(int32_t index) const {
case URX_CARET_M:
// Types with no operand field of interest.
break;
case URX_RESERVED_OP:
case URX_START_CAPTURE:
case URX_END_CAPTURE:
@ -494,12 +498,12 @@ void RegexPattern::dumpOp(int32_t index) const {
// types with an integer operand field.
REGEX_DUMP_DEBUG_PRINTF(("%d", val));
break;
case URX_ONECHAR:
case URX_ONECHAR_I:
REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?'));
break;
case URX_STRING:
case URX_STRING_I:
{
@ -543,7 +547,7 @@ void RegexPattern::dumpOp(int32_t index) const {
}
break;
default:
REGEX_DUMP_DEBUG_PRINTF(("??????"));
break;
@ -554,7 +558,7 @@ void RegexPattern::dumpOp(int32_t index) const {
#if defined(REGEX_DEBUG)
U_CAPI void U_EXPORT2
U_CAPI void U_EXPORT2
RegexPatternDump(const RegexPattern *This) {
int index;
int i;
@ -565,7 +569,7 @@ RegexPatternDump(const RegexPattern *This) {
}
REGEX_DUMP_DEBUG_PRINTF(("\n"));
REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen));
REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType)));
REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType)));
if (This->fStartType == START_STRING) {
REGEX_DUMP_DEBUG_PRINTF((" Initial match sting: \""));
for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) {
@ -580,7 +584,7 @@ RegexPatternDump(const RegexPattern *This) {
REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : "));
for (i=0; i<numSetChars; i++) {
UChar32 c = This->fInitialChars->charAt(i);
if (0x20<c && c <0x7e) {
if (0x20<c && c <0x7e) {
REGEX_DUMP_DEBUG_PRINTF(("%c ", c));
} else {
REGEX_DUMP_DEBUG_PRINTF(("%#x ", c));
@ -606,7 +610,7 @@ RegexPatternDump(const RegexPattern *This) {
This->dumpOp(index);
}
REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
};
}
#endif

View file

@ -16,7 +16,7 @@
#ifndef REGEX_H
#define REGEX_H
//#define REGEX_DEBUG
// REGEX_DEBUG enables the debug-only code guarded by "#if defined(REGEX_DEBUG)"
// (for example RegexPatternDump). It must stay disabled in this public header
// by default; uncomment locally when debugging the regex engine.
//#define REGEX_DEBUG
/**
* \file
@ -36,7 +36,7 @@
* operations, for search and replace operations, and for obtaining detailed
* information about bounds of a match. </p>
*
* <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular
* <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular
* expression pattern strings application code can be simplified and the explicit
* need for <code>RegexPattern</code> objects can usually be eliminated.
* </p>
@ -480,7 +480,7 @@ public:
* critical that the string not be altered or deleted before use by the regular
* expression operations is complete.
*
* @param regexp The Regular Expression to be compiled.
* @param regexp The Regular Expression to be compiled.
* @param input The string to match. The matcher retains a reference to the
* caller's string; no copy is made.
* @param flags Regular expression options, such as case insensitive matching.
@ -517,7 +517,7 @@ public:
/**
* Attempts to match the entire input string against the pattern.
* Attempts to match the entire input region against the pattern.
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if there is a match
* @stable ICU 2.4
@ -525,8 +525,10 @@ public:
virtual UBool matches(UErrorCode &status);
/**
* Attempts to match the input string, beginning at startIndex, against the pattern.
* The match must extend to the end of the input string.
* Resets the matcher, then attempts to match the input beginning
* at the specified startIndex, and extending to the end of the input.
* The input region is reset to include the entire input string.
* A successful match must extend to the end of the input.
* @param startIndex The input string index at which to begin matching.
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if there is a match
@ -538,9 +540,10 @@ public:
/**
* Attempts to match the input string, starting from the beginning, against the pattern.
* Like the matches() method, this function always starts at the beginning of the input string;
* unlike that function, it does not require that the entire input string be matched.
* Attempts to match the input string, starting from the beginning of the region,
* against the pattern. Like the matches() method, this function
* always starts at the beginning of the input region;
* unlike that function, it does not require that the entire region be matched.
*
* <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
* <code>end()</code>, and <code>group()</code> functions.</p>
@ -699,6 +702,12 @@ public:
* The effect is to remove any memory of previous matches,
* and to cause subsequent find() operations to begin at
* the specified position in the input string.
* <p>
* The matcher's region is reset to its default, which is the entire
* input string.
* <p>
* An alternative to this function is to set a match region
* beginning at the desired index.
*
* @return this RegexMatcher.
* @stable ICU 2.8
@ -709,13 +718,13 @@ public:
/**
* Resets this matcher with a new input string. This allows instances of RegexMatcher
* to be reused, which is more efficient than creating a new RegexMatcher for
* each input string to be processed.
* each input string to be processed.
* @param input The new string on which subsequent pattern matches will operate.
* The matcher retains a reference to the callers string, and operates
* directly on that. Ownership of the string remains with the caller.
* Because no copy of the string is made, it is essential that the
* caller not delete the string until after regexp operations on it
* are done.
* are done.
* @return this RegexMatcher.
* @stable ICU 2.4
*/
@ -743,6 +752,132 @@ public:
* @stable ICU 2.4
*/
virtual const UnicodeString &input() const;
/** Sets the limits of this matcher's region.
* The region is the part of the input string that will be searched to find a match.
* Invoking this method resets the matcher, and then sets the region to start
* at the index specified by the start parameter and end at the index specified
* by the limit parameter.
*
* Depending on the transparency and anchoring being used (see useTransparentBounds
* and useAnchoringBounds), certain constructs such as anchors may behave differently
* at or around the boundaries of the region.
*
* The function will fail if start is greater than limit, or if either index
* is less than zero or greater than the length of the string being matched.
*
* @param start The index to begin searches at.
* @param limit The index to end searches at (exclusive).
* @param status A reference to a UErrorCode to receive any errors.
* @draft ICU 4.0
*/
virtual RegexMatcher &region(int32_t start, int32_t limit, UErrorCode &status);
/**
* Reports the start index of this matcher's region. The searches this matcher
* conducts are limited to finding matches within regionStart (inclusive) and
* regionEnd (exclusive).
*
* @return The starting index of this matcher's region.
* @draft ICU 4.0
*/
virtual int32_t regionStart() const;
/**
* Reports the end (limit) index (exclusive) of this matcher's region. The searches
* this matcher conducts are limited to finding matches within regionStart
* (inclusive) and regionEnd (exclusive).
*
* @return The ending point of this matcher's region.
* @draft ICU 4.0
*/
virtual int32_t regionEnd() const;
/**
* Queries the transparency of region bounds for this matcher.
* See useTransparentBounds for a description of transparent and opaque bounds.
* By default, a matcher uses opaque region boundaries.
*
* @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds.
* @draft ICU 4.0
*/
virtual UBool hasTransparentBounds() const;
/**
* Sets the transparency of region bounds for this matcher.
* Invoking this function with an argument of true will set this matcher to use transparent bounds.
* If the boolean argument is false, then opaque bounds will be used.
*
* Using transparent bounds, the boundaries of this matcher's region are transparent
* to lookahead, lookbehind, and boundary matching constructs. Those constructs can
* see text beyond the boundaries of the region while checking for a match.
*
* With opaque bounds, no text outside of the matcher's region is visible to lookahead,
* lookbehind, and boundary matching constructs.
*
* By default, a matcher uses opaque bounds.
*
* @param b TRUE for transparent bounds; FALSE for opaque bounds
* @return This Matcher;
* @draft ICU 4.0
**/
virtual RegexMatcher &useTransparentBounds(UBool b);
/**
* Return true if this matcher is using anchoring bounds.
* By default, matchers use anchoring region bounds.
*
* @return TRUE if this matcher is using anchoring bounds.
* @draft ICU 4.0
*/
virtual UBool hasAnchoringBounds() const;
/**
* Set whether this matcher is using Anchoring Bounds for its region.
* With anchoring bounds, pattern anchors such as ^ and $ will match at the start
* and end of the region. Without Anchoring Bounds, anchors will only match at
* the positions they would in the complete text.
*
* Anchoring Bounds are the default for regions.
*
* @param b TRUE if to enable anchoring bounds; FALSE to disable them.
* @return This Matcher
* @draft ICU 4.0
*/
virtual RegexMatcher &useAnchoringBounds(UBool b);
/**
* Return TRUE if the most recent matching operation touched the
* end of the text being processed. In this case, additional input text could
* change the results of that match.
*
* hitEnd() is defined for both successful and unsuccessful matches.
* In either case hitEnd() will return TRUE if the end of the text was
* reached at any point during the matching process.
*
* @return TRUE if the most recent match hit the end of input
* @draft ICU 4.0
*/
virtual UBool hitEnd() const;
/**
* Return TRUE if the most recent match succeeded and additional input could cause
* it to fail. If this method returns false and a match was found, then more input
* might change the match but the match won't be lost. If a match was not found,
* then requireEnd has no meaning.
*
* @return TRUE if more input could cause the most recent match to no longer match.
* @draft ICU 4.0
*/
virtual UBool requireEnd() const;
/**
@ -901,12 +1036,16 @@ private:
RegexMatcher &operator =(const RegexMatcher &rhs);
friend class RegexPattern;
friend class RegexCImpl;
public:
/** @internal */
void resetPreserveRegion(); // Reset matcher state, but preserve any region.
private:
//
// MatchAt This is the internal interface to the match engine itself.
// Match status comes back in matcher member variables.
//
void MatchAt(int32_t startIdx, UErrorCode &status);
void MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
UBool isWordBoundary(int32_t pos); // perform Perl-like \b test
UBool isUWordBoundary(int32_t pos); // perform RBBI based \b test
@ -918,18 +1057,45 @@ private:
const RegexPattern *fPattern;
RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
// should delete it when through.
const UnicodeString *fInput;
UBool fMatch; // True if the last match was successful.
const UnicodeString *fInput; // The text being matched. Is never NULL.
int32_t fRegionStart; // Start of the input region, default = 0.
int32_t fRegionLimit; // End of input region, default to input.length.
int32_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
int32_t fAnchorLimit; // See useAnchoringBounds
int32_t fLookStart; // Region bounds for look-ahead/behind and
int32_t fLookLimit; // other boundary tests. See
// useTransparentBounds
int32_t fActiveStart; // Currently active bounds for matching.
int32_t fActiveLimit; // Usually is the same as region, but
// is changed to fLookStart/Limit when
// entering look around regions.
UBool fTransparentBounds; // True if using transparent bounds.
UBool fAnchoringBounds; // True if using anchoring bounds.
UBool fMatch; // True if the last attempted match was successful.
int32_t fMatchStart; // Position of the start of the most recent match
int32_t fMatchEnd; // First position after the end of the most recent match
// Zero if no previous match, even when a region
// is active.
int32_t fLastMatchEnd; // First position after the end of the previous match,
// or -1 if there was no previous match.
int32_t fLastReplaceEnd; // First position after the end of the previous appendReplacement();
int32_t fAppendPosition; // First position after the end of the previous
// appendReplacement(). As described by the
// JavaDoc for Java Matcher, where it is called
// "append position"
UBool fHitEnd; // True if the last match touched the end of input.
UBool fRequireEnd; // True if the last match required end-of-input
// (matched $ or \Z)
UVector32 *fStack;
REStackFrame *fFrame; // After finding a match, the last active stack
// frame, which will contain the capture group results.
REStackFrame *fFrame; // After finding a match, the last active stack frame,
// which will contain the capture group results.
// NOT valid while match engine is running.
int32_t *fData; // Data area for use by the compiled pattern.

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2004-2006, International Business Machines
* Copyright (C) 2004-2007, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: regex.h
@ -59,12 +59,32 @@ typedef enum URegexpFlag{
/** If set, '.' matches line terminators, otherwise '.' matching stops at line end.
* @stable ICU 2.4 */
UREGEX_DOTALL = 32,
/** If set, treat the entire pattern as a literal string.
* Metacharacters or escape sequences in the input sequence will be given
* no special meaning.
*
* The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact
* on matching when used in conjunction with this flag.
* The other flags become superfluous.
* TODO: say which escapes are still handled; anything Java does
* early (\u) we should still do.
* @draft ICU 4.0
*/
UREGEX_LITERAL = 16,
/** Control behavior of "$" and "^"
* If set, recognize line terminators within string,
* otherwise, match only at start and end of input string.
* @stable ICU 2.4 */
UREGEX_MULTILINE = 8,
/** Unix-only line endings.
* When this mode is enabled, only \u000a is recognized as a line ending
* in the behavior of ., ^, and $.
* @draft ICU 4.0
*/
UREGEX_UNIX_LINES = 1,
/** Unicode word boundaries.
* If set, \b uses the Unicode TR 29 definition of word boundaries.
@ -73,7 +93,17 @@ typedef enum URegexpFlag{
* http://unicode.org/reports/tr29/#Word_Boundaries
* @stable ICU 2.8
*/
UREGEX_UWORD = 256
UREGEX_UWORD = 256,
/** Error on Unrecognized backslash escapes.
* If set, fail with an error on patterns that contain
* backslash-escaped ASCII letters without a known special
* meaning. If this flag is not set, these
* escaped letters represent themselves.
* @draft ICU 4.0
*/
UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512
} URegexpFlag;
/**
@ -251,11 +281,21 @@ uregex_getText(URegularExpression *regexp,
UErrorCode *status);
/**
* Attempts to match the input string, beginning at startIndex, against the pattern.
* To succeed, the match must extend to the end of the input string.
* Attempts to match the input string against the pattern.
* To succeed, the match must extend to the end of the string,
* or cover the complete match region.
*
* If startIndex >= zero the match operation starts at the specified
* index and must extend to the end of the input string. Any region
* that has been specified is reset.
*
* If startIndex == -1 the match must cover the input region, or the entire
* input string if no region has been set. This directly corresponds to
* Matcher.matches() in Java
*
* @param regexp The compiled regular expression.
* @param startIndex The input string index at which to begin matching.
* @param startIndex The input string index at which to begin matching, or -1
* to match the input Region.
* @param status Receives errors detected by this function.
* @return TRUE if there is a match
* @stable ICU 3.0
@ -270,12 +310,20 @@ uregex_matches(URegularExpression *regexp,
* The match may be of any length, and is not required to extend to the end
* of the input string. Contrast with uregex_matches().
*
* <p>If startIndex is >= 0 any input region that was set for this
* URegularExpression is reset before the operation begins.
*
* <p>If the specified starting index == -1 the match begins at the start of the input
* region, or at the start of the full string if no region has been specified.
* This corresponds directly with Matcher.lookingAt() in Java.
*
* <p>If the match succeeds then more information can be obtained via the
* <code>uregexp_start()</code>, <code>uregexp_end()</code>,
* and <code>uregexp_group()</code> functions.</p>
*
* @param regexp The compiled regular expression.
* @param startIndex The input string index at which to begin matching.
* @param startIndex The input string index at which to begin matching, or
* -1 to match the Input Region
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if there is a match.
* @stable ICU 3.0
@ -287,12 +335,19 @@ uregex_lookingAt(URegularExpression *regexp,
/**
* Find the first matching substring of the input string that matches the pattern.
* The search for a match begins at the specified index.
* If startIndex is >= zero the search for a match begins at the specified index,
* and any match region is reset. This corresponds directly with
* Matcher.find(startIndex) in Java.
*
* If startIndex == -1 the search begins at the start of the input region,
* or at the start of the full string if no region has been specified.
*
* If a match is found, <code>uregex_start(), uregex_end()</code>, and
* <code>uregex_group()</code> will provide more information regarding the match.
*
* @param regexp The compiled regular expression.
* @param startIndex The position in the input string to begin the search
* @param startIndex The position in the input string to begin the search, or
* -1 to search within the Input Region.
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if a match is found.
* @stable ICU 3.0
@ -303,10 +358,10 @@ uregex_find(URegularExpression *regexp,
UErrorCode *status);
/**
* Find the next pattern match in the input string.
* Begin searching the input at the location following the end of
* the previous match, or at the start of the string if there is no previous match.
* If a match is found, <code>uregex_start(), uregex_end()</code>, and
* Find the next pattern match in the input string. Begin searching
* the input at the location following the end of the previous match,
* or at the start of the string (or region) if there is no
* previous match. If a match is found, <code>uregex_start(), uregex_end()</code>, and
* <code>uregex_group()</code> will provide more information regarding the match.
*
* @param regexp The compiled regular expression.
@ -395,7 +450,8 @@ uregex_end(URegularExpression *regexp,
* Reset any saved state from the previous match. Has the effect of
* causing uregex_findNext to begin at the specified index, and causing
* uregex_start(), uregex_end() and uregex_group() to return an error
* indicating that there is no match information available.
* indicating that there is no match information available. Clears any
* match region that may have been set.
*
* @param regexp The compiled regular expression.
* @param index The position in the text at which a
@ -407,6 +463,166 @@ U_STABLE void U_EXPORT2
uregex_reset(URegularExpression *regexp,
int32_t index,
UErrorCode *status);
/** Sets the limits of the matching region for this URegularExpression.
* The region is the part of the input string that will be considered when matching.
* Invoking this method resets any saved state from the previous match,
* then sets the region to start at the index specified by the regionStart parameter
* and end at the index specified by the regionLimit parameter.
*
* Depending on the transparency and anchoring being used (see useTransparentBounds
* and useAnchoringBounds), certain constructs such as anchors may behave differently
* at or around the boundaries of the region
*
* The function will fail if regionStart is greater than regionLimit, or if either index
* is less than zero or greater than the length of the string being matched.
*
* @param regexp The compiled regular expression.
* @param regionStart The index to begin searches at.
* @param regionLimit The index to end searches at (exclusive).
* @param status A pointer to a UErrorCode to receive any errors.
* @draft ICU 4.0
*/
U_DRAFT void U_EXPORT2
uregex_setRegion(URegularExpression *regexp,
int32_t regionStart,
int32_t regionLimit,
UErrorCode *status);
/**
* Reports the start index of the matching region. Any matches found are limited to
* the region bounded by regionStart (inclusive) and regionEnd (exclusive).
*
* @param regexp The compiled regular expression.
* @param status A pointer to a UErrorCode to receive any errors.
* @return The starting index of this matcher's region.
* @draft ICU 4.0
*/
U_DRAFT int32_t U_EXPORT2
uregex_regionStart(const URegularExpression *regexp,
UErrorCode *status);
/**
* Reports the end index (exclusive) of the matching region for this URegularExpression.
* Any matches found are limited to the region bounded by regionStart (inclusive)
* and regionEnd (exclusive).
*
* @param regexp The compiled regular expression.
* @param status A pointer to a UErrorCode to receive any errors.
* @return The ending point of this matcher's region.
* @draft ICU 4.0
*/
U_DRAFT int32_t U_EXPORT2
uregex_regionEnd(const URegularExpression *regexp,
UErrorCode *status);
/**
* Queries the transparency of region bounds for this URegularExpression.
* See useTransparentBounds for a description of transparent and opaque bounds.
* By default, matching boundaries are opaque.
*
* @param regexp The compiled regular expression.
* @param status A pointer to a UErrorCode to receive any errors.
* @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds.
* @draft ICU 4.0
*/
U_DRAFT UBool U_EXPORT2
uregex_hasTransparentBounds(const URegularExpression *regexp,
UErrorCode *status);
/**
* Sets the transparency of region bounds for this URegularExpression.
* Invoking this function with an argument of TRUE will set matches to use transparent bounds.
* If the boolean argument is FALSE, then opaque bounds will be used.
*
* Using transparent bounds, the boundaries of the matching region are transparent
* to lookahead, lookbehind, and boundary matching constructs. Those constructs can
* see text beyond the boundaries of the region while checking for a match.
*
* With opaque bounds, no text outside of the matching region is visible to lookahead,
* lookbehind, and boundary matching constructs.
*
* By default, opaque bounds are used.
*
* @param regexp The compiled regular expression.
* @param b TRUE for transparent bounds; FALSE for opaque bounds
* @param status A pointer to a UErrorCode to receive any errors.
* @draft ICU 4.0
**/
U_DRAFT void U_EXPORT2
uregex_useTransparentBounds(URegularExpression *regexp,
UBool b,
UErrorCode *status);
/**
* Return true if this URegularExpression is using anchoring bounds.
* By default, anchoring region bounds are used.
*
* @param regexp The compiled regular expression.
* @param status A pointer to a UErrorCode to receive any errors.
* @return TRUE if this matcher is using anchoring bounds.
* @draft ICU 4.0
*/
U_DRAFT UBool U_EXPORT2
uregex_hasAnchoringBounds(const URegularExpression *regexp,
UErrorCode *status);
/**
* Set whether this URegularExpression is using Anchoring Bounds for its region.
* With anchoring bounds, pattern anchors such as ^ and $ will match at the start
* and end of the region. Without Anchoring Bounds, anchors will only match at
* the positions they would in the complete text.
*
* Anchoring Bounds are the default for regions.
*
* @param regexp The compiled regular expression.
* @param b TRUE to enable anchoring bounds; FALSE to disable them.
* @param status A pointer to a UErrorCode to receive any errors.
* @draft ICU 4.0
*/
U_DRAFT void U_EXPORT2
uregex_useAnchoringBounds(URegularExpression *regexp,
UBool b,
UErrorCode *status);
/**
* Return TRUE if the most recent matching operation touched the
* end of the text being processed. In this case, additional input text could
* change the results of that match.
*
* @param regexp The compiled regular expression.
* @param status A pointer to a UErrorCode to receive any errors.
* @return TRUE if the most recent match hit the end of input
* @draft ICU 4.0
*/
U_DRAFT UBool U_EXPORT2
uregex_hitEnd(const URegularExpression *regexp,
UErrorCode *status);
/**
* Return TRUE if the most recent match succeeded and additional input could cause
* it to fail. If this function returns false and a match was found, then more input
* might change the match but the match won't be lost. If a match was not found,
* then requireEnd has no meaning.
*
* @param regexp The compiled regular expression.
* @param status A pointer to a UErrorCode to receive any errors.
* @return TRUE if more input could cause the most recent match to no longer match.
* @draft ICU 4.0
*/
U_DRAFT UBool U_EXPORT2
uregex_requireEnd(const URegularExpression *regexp,
UErrorCode *status);
/**
* Replaces every substring of the input that matches the pattern

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2004-2006, International Business Machines
* Copyright (C) 2004-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: regex.cpp
@ -304,10 +304,15 @@ U_CAPI UBool U_EXPORT2
uregex_matches(URegularExpression *regexp,
int32_t startIndex,
UErrorCode *status) {
UBool result = FALSE;
if (validateRE(regexp, status) == FALSE) {
return FALSE;
return result;
}
if (startIndex == -1) {
result = regexp->fMatcher->matches(*status);
} else {
result = regexp->fMatcher->matches(startIndex, *status);
}
UBool result = regexp->fMatcher->matches(startIndex, *status);
return result;
}
@ -322,10 +327,15 @@ U_CAPI UBool U_EXPORT2
uregex_lookingAt(URegularExpression *regexp,
int32_t startIndex,
UErrorCode *status) {
UBool result = FALSE;
if (validateRE(regexp, status) == FALSE) {
return FALSE;
return result;
}
if (startIndex == -1) {
result = regexp->fMatcher->lookingAt(*status);
} else {
result = regexp->fMatcher->lookingAt(startIndex, *status);
}
UBool result = regexp->fMatcher->lookingAt(startIndex, *status);
return result;
}
@ -340,10 +350,16 @@ U_CAPI UBool U_EXPORT2
uregex_find(URegularExpression *regexp,
int32_t startIndex,
UErrorCode *status) {
UBool result = FALSE;
if (validateRE(regexp, status) == FALSE) {
return FALSE;
return result;
}
if (startIndex == -1) {
regexp->fMatcher->resetPreserveRegion();
result = regexp->fMatcher->find();
} else {
result = regexp->fMatcher->find(startIndex, *status);
}
UBool result = regexp->fMatcher->find(startIndex, *status);
return result;
}
@ -479,6 +495,145 @@ uregex_reset(URegularExpression *regexp,
}
//------------------------------------------------------------------------------
//
// uregex_setRegion
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2
uregex_setRegion(URegularExpression *regexp,
                 int32_t             regionStart,
                 int32_t             regionLimit,
                 UErrorCode         *status) {
    // Forward to the underlying matcher only when validateRE() accepts the
    // handle and incoming status; otherwise this call is a no-op.
    // Errors detected by RegexMatcher::region() are reported through *status.
    if (validateRE(regexp, status)) {
        regexp->fMatcher->region(regionStart, regionLimit, *status);
    }
}
//------------------------------------------------------------------------------
//
// uregex_regionStart
//
//------------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2
uregex_regionStart(const URegularExpression *regexp,
                   UErrorCode               *status) {
    // 0 is returned when validateRE() rejects the handle or the incoming status.
    int32_t start = 0;
    if (validateRE(regexp, status)) {
        start = regexp->fMatcher->regionStart();
    }
    return start;
}
//------------------------------------------------------------------------------
//
// uregex_regionEnd
//
//------------------------------------------------------------------------------
U_CAPI int32_t U_EXPORT2
uregex_regionEnd(const URegularExpression *regexp,
                 UErrorCode               *status) {
    // 0 is returned when validateRE() rejects the handle or the incoming status.
    int32_t limit = 0;
    if (validateRE(regexp, status)) {
        limit = regexp->fMatcher->regionEnd();
    }
    return limit;
}
//------------------------------------------------------------------------------
//
// uregex_hasTransparentBounds
//
//------------------------------------------------------------------------------
U_CAPI UBool U_EXPORT2
uregex_hasTransparentBounds(const URegularExpression *regexp,
                            UErrorCode               *status) {
    // FALSE is returned when validateRE() rejects the handle or the incoming
    // status; otherwise the matcher's own setting is reported.
    UBool transparent = FALSE;
    if (validateRE(regexp, status)) {
        transparent = regexp->fMatcher->hasTransparentBounds();
    }
    return transparent;
}
//------------------------------------------------------------------------------
//
// uregex_useTransparentBounds
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2
uregex_useTransparentBounds(URegularExpression *regexp,
                            UBool               b,
                            UErrorCode         *status) {
    // No-op when validateRE() rejects the handle or the incoming status.
    if (validateRE(regexp, status)) {
        regexp->fMatcher->useTransparentBounds(b);
    }
}
//------------------------------------------------------------------------------
//
// uregex_hasAnchoringBounds
//
//------------------------------------------------------------------------------
U_CAPI UBool U_EXPORT2
uregex_hasAnchoringBounds(const URegularExpression *regexp,
                          UErrorCode               *status) {
    // FALSE is returned when validateRE() rejects the handle or the incoming
    // status; otherwise the matcher's own setting is reported.
    UBool anchoring = FALSE;
    if (validateRE(regexp, status)) {
        anchoring = regexp->fMatcher->hasAnchoringBounds();
    }
    return anchoring;
}
//------------------------------------------------------------------------------
//
// uregex_useAnchoringBounds
//
//------------------------------------------------------------------------------
U_CAPI void U_EXPORT2
uregex_useAnchoringBounds(URegularExpression *regexp,
                          UBool               b,
                          UErrorCode         *status) {
    // No-op when validateRE() rejects the handle or the incoming status.
    if (validateRE(regexp, status)) {
        regexp->fMatcher->useAnchoringBounds(b);
    }
}
//------------------------------------------------------------------------------
//
// uregex_hitEnd
//
//------------------------------------------------------------------------------
U_CAPI UBool U_EXPORT2
uregex_hitEnd(const URegularExpression *regexp,
              UErrorCode               *status) {
    // FALSE is returned when validateRE() rejects the handle or the incoming
    // status; otherwise the matcher's hitEnd() result is passed through.
    UBool reachedEnd = FALSE;
    if (validateRE(regexp, status)) {
        reachedEnd = regexp->fMatcher->hitEnd();
    }
    return reachedEnd;
}
//------------------------------------------------------------------------------
//
// uregex_requireEnd
//
//------------------------------------------------------------------------------
U_CAPI UBool U_EXPORT2
uregex_requireEnd(const URegularExpression *regexp,
                  UErrorCode               *status) {
    // FALSE is returned when validateRE() rejects the handle or the incoming
    // status; otherwise the matcher's requireEnd() result is passed through.
    UBool needsEnd = FALSE;
    if (validateRE(regexp, status)) {
        needsEnd = regexp->fMatcher->requireEnd();
    }
    return needsEnd;
}
//------------------------------------------------------------------------------
//
// uregex_replaceAll

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2004-2006, International Business Machines Corporation and
* Copyright (c) 2004-2007, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
@ -34,6 +34,36 @@ log_err("Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_error
#define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
log_err("Test Failure at file %s, line %d\n", __FILE__, __LINE__);}}
/*
* TEST_SETUP and TEST_TEARDOWN
* macros to handle the boilerplate around setting up regex test cases.
* parameters to setup:
* pattern: The regex pattern, a (char *) null terminated C string.
* testString: The string data, also a (char *) C string.
* flags: Regex flags to set when compiling the pattern
*
* Put arbitrary test code between SETUP and TEARDOWN.
* "re" is the compiled, ready-to-go regular expression.
*/
/* TEST_SETUP opens a brace scope, declares srcString, resets status, compiles
 * the pattern into re, copies testString into a freshly malloc'ed UChar buffer
 * (strlen+2 UChars allocated, strlen+1 copied) and sets it as the text of re.
 * It ends by opening an "if (U_SUCCESS(status)) {" block, so the test code
 * that follows only runs when setup succeeded.
 * The macro does not declare "status" or "re"; both must already exist in the
 * enclosing scope. The malloc result is not checked.
 * Every TEST_SETUP must be paired with a TEST_TEARDOWN, which closes the
 * two blocks opened here.
 */
#define TEST_SETUP(pattern, testString, flags) { \
UChar *srcString = NULL; \
status = U_ZERO_ERROR; \
re = uregex_openC(pattern, flags, NULL, &status); \
TEST_ASSERT_SUCCESS(status); \
srcString = (UChar *)malloc((strlen(testString)+2)*sizeof(UChar)); \
u_uastrncpy(srcString, testString, strlen(testString)+1); \
uregex_setText(re, srcString, -1, &status); \
TEST_ASSERT_SUCCESS(status); \
if (U_SUCCESS(status)) {
/* TEST_TEARDOWN closes the "if (U_SUCCESS(status))" block opened by
 * TEST_SETUP, reports any failure left in status, closes re, frees the
 * srcString buffer, and finally closes the scope opened by TEST_SETUP.
 */
#define TEST_TEARDOWN \
} \
TEST_ASSERT_SUCCESS(status); \
uregex_close(re); \
free(srcString); \
}
static void test_assert_string(const char *expected, const UChar *actual, UBool nulTerm, const char *file, int line) {
char buf_inside_macro[120];
int32_t len = (int32_t)strlen(expected);
@ -544,6 +574,135 @@ static void TestRegexCAPI(void) {
uregex_close(re);
}
/*
* Regions
*/
/* SetRegion(), getRegion() do something */
TEST_SETUP(".*", "0123456789ABCDEF", 0)
UChar resultString[40];
TEST_ASSERT(uregex_regionStart(re, &status) == 0);
TEST_ASSERT(uregex_regionEnd(re, &status) == 16);
uregex_setRegion(re, 3, 6, &status);
TEST_ASSERT(uregex_regionStart(re, &status) == 3);
TEST_ASSERT(uregex_regionEnd(re, &status) == 6);
TEST_ASSERT(uregex_findNext(re, &status));
TEST_ASSERT(uregex_group(re, 0, resultString, sizeof(resultString)/2, &status) == 3)
TEST_ASSERT_STRING("345", resultString, TRUE);
TEST_TEARDOWN;
/* find(start=-1) uses regions */
TEST_SETUP(".*", "0123456789ABCDEF", 0);
uregex_setRegion(re, 4, 6, &status);
TEST_ASSERT(uregex_find(re, -1, &status) == TRUE);
TEST_ASSERT(uregex_start(re, 0, &status) == 4);
TEST_ASSERT(uregex_end(re, 0, &status) == 6);
TEST_TEARDOWN;
/* find (start >=0) does not use regions */
TEST_SETUP(".*", "0123456789ABCDEF", 0);
uregex_setRegion(re, 4, 6, &status);
TEST_ASSERT(uregex_find(re, 0, &status) == TRUE);
TEST_ASSERT(uregex_start(re, 0, &status) == 0);
TEST_ASSERT(uregex_end(re, 0, &status) == 16);
TEST_TEARDOWN;
/* findNext() obeys regions */
TEST_SETUP(".", "0123456789ABCDEF", 0);
uregex_setRegion(re, 4, 6, &status);
TEST_ASSERT(uregex_findNext(re,&status) == TRUE);
TEST_ASSERT(uregex_start(re, 0, &status) == 4);
TEST_ASSERT(uregex_findNext(re, &status) == TRUE);
TEST_ASSERT(uregex_start(re, 0, &status) == 5);
TEST_ASSERT(uregex_findNext(re, &status) == FALSE);
TEST_TEARDOWN;
/* matches(start=-1) uses regions */
/* Also, verify that non-greedy *? succeeds in finding the full match. */
TEST_SETUP(".*?", "0123456789ABCDEF", 0);
uregex_setRegion(re, 4, 6, &status);
TEST_ASSERT(uregex_matches(re, -1, &status) == TRUE);
TEST_ASSERT(uregex_start(re, 0, &status) == 4);
TEST_ASSERT(uregex_end(re, 0, &status) == 6);
TEST_TEARDOWN;
/* matches (start >=0) does not use regions */
TEST_SETUP(".*?", "0123456789ABCDEF", 0);
uregex_setRegion(re, 4, 6, &status);
TEST_ASSERT(uregex_matches(re, 0, &status) == TRUE);
TEST_ASSERT(uregex_start(re, 0, &status) == 0);
TEST_ASSERT(uregex_end(re, 0, &status) == 16);
TEST_TEARDOWN;
/* lookingAt(start=-1) uses regions */
/* Also, verify that non-greedy *? finds the first (shortest) match. */
TEST_SETUP(".*?", "0123456789ABCDEF", 0);
uregex_setRegion(re, 4, 6, &status);
TEST_ASSERT(uregex_lookingAt(re, -1, &status) == TRUE);
TEST_ASSERT(uregex_start(re, 0, &status) == 4);
TEST_ASSERT(uregex_end(re, 0, &status) == 4);
TEST_TEARDOWN;
/* lookingAt (start >=0) does not use regions */
TEST_SETUP(".*?", "0123456789ABCDEF", 0);
uregex_setRegion(re, 4, 6, &status);
TEST_ASSERT(uregex_lookingAt(re, 0, &status) == TRUE);
TEST_ASSERT(uregex_start(re, 0, &status) == 0);
TEST_ASSERT(uregex_end(re, 0, &status) == 0);
TEST_TEARDOWN;
/* hitEnd() */
TEST_SETUP("[a-f]*", "abcdefghij", 0);
TEST_ASSERT(uregex_find(re, 0, &status) == TRUE);
TEST_ASSERT(uregex_hitEnd(re, &status) == FALSE);
TEST_TEARDOWN;
TEST_SETUP("[a-f]*", "abcdef", 0);
TEST_ASSERT(uregex_find(re, 0, &status) == TRUE);
TEST_ASSERT(uregex_hitEnd(re, &status) == TRUE);
TEST_TEARDOWN;
/* requireEnd */
TEST_SETUP("abcd", "abcd", 0);
TEST_ASSERT(uregex_find(re, 0, &status) == TRUE);
TEST_ASSERT(uregex_requireEnd(re, &status) == FALSE);
TEST_TEARDOWN;
TEST_SETUP("abcd$", "abcd", 0);
TEST_ASSERT(uregex_find(re, 0, &status) == TRUE);
TEST_ASSERT(uregex_requireEnd(re, &status) == TRUE);
TEST_TEARDOWN;
/* anchoringBounds */
TEST_SETUP("abc$", "abcdef", 0);
TEST_ASSERT(uregex_hasAnchoringBounds(re, &status) == TRUE);
uregex_useAnchoringBounds(re, FALSE, &status);
TEST_ASSERT(uregex_hasAnchoringBounds(re, &status) == FALSE);
TEST_ASSERT(uregex_find(re, -1, &status) == FALSE);
uregex_useAnchoringBounds(re, TRUE, &status);
uregex_setRegion(re, 0, 3, &status);
TEST_ASSERT(uregex_find(re, -1, &status) == TRUE);
TEST_ASSERT(uregex_end(re, 0, &status) == 3);
TEST_TEARDOWN;
/* Transparent Bounds */
TEST_SETUP("abc(?=def)", "abcdef", 0);
TEST_ASSERT(uregex_hasTransparentBounds(re, &status) == FALSE);
uregex_useTransparentBounds(re, TRUE, &status);
TEST_ASSERT(uregex_hasTransparentBounds(re, &status) == TRUE);
uregex_useTransparentBounds(re, FALSE, &status);
TEST_ASSERT(uregex_find(re, -1, &status) == TRUE); /* No Region */
uregex_setRegion(re, 0, 3, &status);
TEST_ASSERT(uregex_find(re, -1, &status) == FALSE); /* with region, opaque bounds */
uregex_useTransparentBounds(re, TRUE, &status);
TEST_ASSERT(uregex_find(re, -1, &status) == TRUE); /* with region, transparent bounds */
TEST_ASSERT(uregex_end(re, 0, &status) == 3);
TEST_TEARDOWN;
/*
* replaceFirst()

View file

@ -175,209 +175,6 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
//---------------------------------------------------------------------------
//
// regex_find(pattern, inputString, lineNumber)
//
// function to simplify writing tests regex tests.
//
// The input text is unescaped. The pattern is not.
// The input text is marked with the expected match positions
// <0>text <1> more text </1> </0>
// The <n> </n> tags are removed before trying the match.
// The tags mark the start and end of the match and of any capture groups.
//
//
//---------------------------------------------------------------------------
// Set a value into a UVector at position specified by a decimal number in
// a UnicodeString. This is a utility function needed by the actual test function,
// which follows.
static void set(UVector &vec, int32_t val, UnicodeString index) {
UErrorCode status=U_ZERO_ERROR;
int32_t idx = 0;
for (int32_t i=0; i<index.length(); i++) {
int32_t d=u_charDigitValue(index.charAt(i));
if (d<0) {return;}
idx = idx*10 + d;
}
while (vec.size()<idx+1) {vec.addElement(-1, status);}
vec.setElementAt(val, idx);
}
void RegexTest::regex_find(const UnicodeString &pattern,
const UnicodeString &flags,
const UnicodeString &inputString,
int32_t line) {
UnicodeString unEscapedInput;
UnicodeString deTaggedInput;
UErrorCode status = U_ZERO_ERROR;
UParseError pe;
RegexPattern *parsePat = NULL;
RegexMatcher *parseMatcher = NULL;
RegexPattern *callerPattern = NULL;
RegexMatcher *matcher = NULL;
UVector groupStarts(status);
UVector groupEnds(status);
UBool isMatch = FALSE;
UBool failed = FALSE;
int32_t numFinds;
int32_t i;
//
// Compile the caller's pattern
//
uint32_t bflags = 0;
if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
bflags |= UREGEX_CASE_INSENSITIVE;
}
if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
bflags |= UREGEX_COMMENTS;
}
if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
bflags |= UREGEX_DOTALL;
}
if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
bflags |= UREGEX_MULTILINE;
}
callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
if (status != U_ZERO_ERROR) {
#if UCONFIG_NO_BREAK_ITERATION==1
// 'v' test flag means that the test pattern should not compile if ICU was configured
// to not include break iteration. RBBI is needed for Unicode word boundaries.
if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
goto cleanupAndReturn;
}
#endif
errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
goto cleanupAndReturn;
}
if (flags.indexOf((UChar)'d') >= 0) {
RegexPatternDump(callerPattern);
}
//
// Number of times find() should be called on the test string, default to 1
//
numFinds = 1;
for (i=2; i<=9; i++) {
if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
if (numFinds != 1) {
errln("Line %d: more than one digit flag. Scanning %d.", line, i);
goto cleanupAndReturn;
}
numFinds = i;
}
}
//
// Find the tags in the input data, remove them, and record the group boundary
// positions.
//
parsePat = RegexPattern::compile("<(/?)([0-9]+)>", 0, pe, status);
REGEX_CHECK_STATUS_L(line);
unEscapedInput = inputString.unescape();
parseMatcher = parsePat->matcher(unEscapedInput, status);
REGEX_CHECK_STATUS_L(line);
while(parseMatcher->find()) {
parseMatcher->appendReplacement(deTaggedInput, "", status);
REGEX_CHECK_STATUS;
UnicodeString groupNum = parseMatcher->group(2, status);
if (parseMatcher->group(1, status) == "/") {
// close tag
set(groupEnds, deTaggedInput.length(), groupNum);
} else {
set(groupStarts, deTaggedInput.length(), groupNum);
}
}
parseMatcher->appendTail(deTaggedInput);
REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
//
// Do a find on the de-tagged input using the caller's pattern
//
matcher = callerPattern->matcher(deTaggedInput, status);
REGEX_CHECK_STATUS_L(line);
if (flags.indexOf((UChar)'t') >= 0) {
matcher->setTrace(TRUE);
}
for (i=0; i<numFinds; i++) {
isMatch = matcher->find();
}
matcher->setTrace(FALSE);
//
// Match up the groups from the find() with the groups from the tags
//
// number of tags should match number of groups from find operation.
// matcher->groupCount does not include group 0, the entire match, hence the +1.
// G option in test means that capture group data is not available in the
// expected results, so the check needs to be suppressed.
if (isMatch == FALSE && groupStarts.size() != 0) {
errln("Error at line %d: Match expected, but none found.\n", line);
failed = TRUE;
goto cleanupAndReturn;
}
if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
// Only check for match / no match. Don't check capture groups.
if (isMatch && groupStarts.size() == 0) {
errln("Error at line %d: No match expected, but one found.\n", line);
failed = TRUE;
}
goto cleanupAndReturn;
}
for (i=0; i<=matcher->groupCount(); i++) {
int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
if (matcher->start(i, status) != expectedStart) {
errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
line, i, expectedStart, matcher->start(i, status));
failed = TRUE;
goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
}
int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
if (matcher->end(i, status) != expectedEnd) {
errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
line, i, expectedEnd, matcher->end(i, status));
failed = TRUE;
// Error on end position; keep going; real error is probably yet to come as group
// end positions work from end of the input data towards the front.
}
}
if ( matcher->groupCount()+1 < groupStarts.size()) {
errln("Error at line %d: Expected %d capture groups, found %d.",
line, groupStarts.size()-1, matcher->groupCount());
failed = TRUE;
}
cleanupAndReturn:
if (failed) {
errln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
+flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
// callerPattern->dump();
}
delete parseMatcher;
delete parsePat;
delete matcher;
delete callerPattern;
}
//---------------------------------------------------------------------------
//
@ -938,6 +735,87 @@ void RegexTest::API_Match() {
delete m;
delete p;
}
//
// Regions
//
{
UErrorCode status = U_ZERO_ERROR;
UnicodeString testString("This is test data");
RegexMatcher m(".*", testString, 0, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(m.regionStart() == 0);
REGEX_ASSERT(m.regionEnd() == testString.length());
REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
m.region(2,4, status);
REGEX_CHECK_STATUS;
REGEX_ASSERT(m.matches(status));
REGEX_ASSERT(m.start(status)==2);
REGEX_ASSERT(m.end(status)==4);
REGEX_CHECK_STATUS;
m.reset();
REGEX_ASSERT(m.regionStart() == 0);
REGEX_ASSERT(m.regionEnd() == testString.length());
UnicodeString shorterString("short");
m.reset(shorterString);
REGEX_ASSERT(m.regionStart() == 0);
REGEX_ASSERT(m.regionEnd() == shorterString.length());
REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
REGEX_ASSERT(&m == &m.reset());
REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
REGEX_ASSERT(&m == &m.reset());
REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
REGEX_ASSERT(&m == &m.reset());
REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
REGEX_ASSERT(&m == &m.reset());
REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
}
//
// hitEnd() and requireEnd()
//
{
UErrorCode status = U_ZERO_ERROR;
UnicodeString testString("aabb");
RegexMatcher m1(".*", testString, 0, status);
REGEX_ASSERT(m1.lookingAt(status) == TRUE);
REGEX_ASSERT(m1.hitEnd() == TRUE);
REGEX_ASSERT(m1.requireEnd() == FALSE);
REGEX_CHECK_STATUS;
status = U_ZERO_ERROR;
RegexMatcher m2("a*", testString, 0, status);
REGEX_ASSERT(m2.lookingAt(status) == TRUE);
REGEX_ASSERT(m2.hitEnd() == FALSE);
REGEX_ASSERT(m2.requireEnd() == FALSE);
REGEX_CHECK_STATUS;
status = U_ZERO_ERROR;
RegexMatcher m3(".*$", testString, 0, status);
REGEX_ASSERT(m3.lookingAt(status) == TRUE);
REGEX_ASSERT(m3.hitEnd() == TRUE);
REGEX_ASSERT(m3.requireEnd() == TRUE);
REGEX_CHECK_STATUS;
}
//
// Compilation error on reset with UChar *
@ -1470,7 +1348,7 @@ void RegexTest::Extended() {
}
int32_t len;
UChar *testData = ReadAndConvertFile(srcPath, len, status);
UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
if (U_FAILURE(status)) {
return; /* something went wrong, error already output */
}
@ -1482,7 +1360,7 @@ void RegexTest::Extended() {
RegexMatcher quotedStuffMat("\\s*([\\'\\\"/])(.*?)\\1", 0, status);
RegexMatcher commentMat ("\\s*(#.*)?$", 0, status);
RegexMatcher flagsMat ("\\s*([ixsmdtGv2-9]*)([:letter:]*)", 0, status);
RegexMatcher flagsMat ("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)", 0, status);
RegexMatcher lineMat("(.*?)\\r?\\n", testString, 0, status);
UnicodeString testPattern; // The pattern for test from the test file.
@ -1581,6 +1459,295 @@ void RegexTest::Extended() {
//---------------------------------------------------------------------------
//
// regex_find(pattern, flags, inputString, lineNumber)
//
// Function to run a single test from the Extended (data driven) tests.
// See file test/testdata/regextst.txt for a description of the
// pattern and inputString fields, and the allowed flags.
// lineNumber is the source line in regextst.txt of the test.
//
//---------------------------------------------------------------------------
// Set a value into a UVector at position specified by a decimal number in
// a UnicodeString. This is a utility function needed by the actual test function,
// which follows.
static void set(UVector &vec, int32_t val, UnicodeString index) {
UErrorCode status=U_ZERO_ERROR;
int32_t idx = 0;
for (int32_t i=0; i<index.length(); i++) {
int32_t d=u_charDigitValue(index.charAt(i));
if (d<0) {return;}
idx = idx*10 + d;
}
while (vec.size()<idx+1) {vec.addElement(-1, status);}
vec.setElementAt(val, idx);
}
void RegexTest::regex_find(const UnicodeString &pattern,
const UnicodeString &flags,
const UnicodeString &inputString,
int32_t line) {
UnicodeString unEscapedInput;
UnicodeString deTaggedInput;
UErrorCode status = U_ZERO_ERROR;
UParseError pe;
RegexPattern *parsePat = NULL;
RegexMatcher *parseMatcher = NULL;
RegexPattern *callerPattern = NULL;
RegexMatcher *matcher = NULL;
UVector groupStarts(status);
UVector groupEnds(status);
UBool isMatch = FALSE;
UBool failed = FALSE;
int32_t numFinds;
int32_t i;
UBool useMatchesFunc = FALSE;
UBool useLookingAtFunc = FALSE;
int32_t regionStart = -1;
int32_t regionEnd = -1;
//
// Compile the caller's pattern
//
uint32_t bflags = 0;
if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
bflags |= UREGEX_CASE_INSENSITIVE;
}
if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
bflags |= UREGEX_COMMENTS;
}
if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
bflags |= UREGEX_DOTALL;
}
if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
bflags |= UREGEX_MULTILINE;
}
if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
}
if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
bflags |= UREGEX_UNIX_LINES;
}
callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
if (status != U_ZERO_ERROR) {
#if UCONFIG_NO_BREAK_ITERATION==1
// 'v' test flag means that the test pattern should not compile if ICU was configured
// to not include break iteration. RBBI is needed for Unicode word boundaries.
if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
goto cleanupAndReturn;
}
#endif
if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
// Expected pattern compilation error.
if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
logln("Pattern Compile returns \"%s\"", u_errorName(status));
}
goto cleanupAndReturn;
} else {
// Unexpected pattern compilation error.
errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
goto cleanupAndReturn;
}
}
if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
RegexPatternDump(callerPattern);
}
if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
errln("Expected, but did not get, a pattern compilation error.");
goto cleanupAndReturn;
}
//
// Number of times find() should be called on the test string, default to 1
//
numFinds = 1;
for (i=2; i<=9; i++) {
if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
if (numFinds != 1) {
errln("Line %d: more than one digit flag. Scanning %d.", line, i);
goto cleanupAndReturn;
}
numFinds = i;
}
}
// 'M' flag. Use matches() instead of find()
if (flags.indexOf((UChar)0x4d) >= 0) {
useMatchesFunc = TRUE;
}
if (flags.indexOf((UChar)0x4c) >= 0) {
useLookingAtFunc = TRUE;
}
//
// Find the tags in the input data, remove them, and record the group boundary
// positions.
//
parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
REGEX_CHECK_STATUS_L(line);
unEscapedInput = inputString.unescape();
parseMatcher = parsePat->matcher(unEscapedInput, status);
REGEX_CHECK_STATUS_L(line);
while(parseMatcher->find()) {
parseMatcher->appendReplacement(deTaggedInput, "", status);
REGEX_CHECK_STATUS;
UnicodeString groupNum = parseMatcher->group(2, status);
if (groupNum == "r") {
// <r> or </r>, a region specification within the string
if (parseMatcher->group(1, status) == "/") {
regionEnd = deTaggedInput.length();
} else {
regionStart = deTaggedInput.length();
}
} else {
// <digits> or </digits>, a group match boundary tag.
if (parseMatcher->group(1, status) == "/") {
set(groupEnds, deTaggedInput.length(), groupNum);
} else {
set(groupStarts, deTaggedInput.length(), groupNum);
}
}
}
parseMatcher->appendTail(deTaggedInput);
REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
errln("mismatched <r> tags");
failed = TRUE;
goto cleanupAndReturn;
}
//
// Configure the matcher according to the flags specified with this test.
//
matcher = callerPattern->matcher(deTaggedInput, status);
REGEX_CHECK_STATUS_L(line);
if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
matcher->setTrace(TRUE);
}
if (regionStart>=0) {
matcher->region(regionStart, regionEnd, status);
REGEX_CHECK_STATUS_L(line);
}
if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
matcher->useAnchoringBounds(FALSE);
}
if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
matcher->useTransparentBounds(TRUE);
}
//
// Do a find on the de-tagged input using the caller's pattern
// TODO: error on count>1 and not find().
// error on both matches() and lookingAt().
//
for (i=0; i<numFinds; i++) {
if (useMatchesFunc) {
isMatch = matcher->matches(status);
} else if (useLookingAtFunc) {
isMatch = matcher->lookingAt(status);
} else {
isMatch = matcher->find();
}
}
matcher->setTrace(FALSE);
//
// Match up the groups from the find() with the groups from the tags
//
// number of tags should match number of groups from find operation.
// matcher->groupCount does not include group 0, the entire match, hence the +1.
// G option in test means that capture group data is not available in the
// expected results, so the check needs to be suppressed.
if (isMatch == FALSE && groupStarts.size() != 0) {
errln("Error at line %d: Match expected, but none found.\n", line);
failed = TRUE;
goto cleanupAndReturn;
}
if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
// Only check for match / no match. Don't check capture groups.
if (isMatch && groupStarts.size() == 0) {
errln("Error at line %d: No match expected, but one found.\n", line);
failed = TRUE;
}
goto cleanupAndReturn;
}
for (i=0; i<=matcher->groupCount(); i++) {
int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
if (matcher->start(i, status) != expectedStart) {
errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
line, i, expectedStart, matcher->start(i, status));
failed = TRUE;
goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
}
int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
if (matcher->end(i, status) != expectedEnd) {
errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
line, i, expectedEnd, matcher->end(i, status));
failed = TRUE;
// Error on end position; keep going; real error is probably yet to come as group
// end positions work from end of the input data towards the front.
}
}
if ( matcher->groupCount()+1 < groupStarts.size()) {
errln("Error at line %d: Expected %d capture groups, found %d.",
line, groupStarts.size()-1, matcher->groupCount());
failed = TRUE;
}
if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
matcher->requireEnd() == TRUE) {
errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
failed = TRUE;
}
if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
matcher->requireEnd() == FALSE) {
errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
failed = TRUE;
}
if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
matcher->hitEnd() == TRUE) {
errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
failed = TRUE;
}
if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
matcher->hitEnd() == FALSE) {
errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
failed = TRUE;
}
cleanupAndReturn:
if (failed) {
errln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
+flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
// callerPattern->dump();
}
delete parseMatcher;
delete parsePat;
delete matcher;
delete callerPattern;
}
//---------------------------------------------------------------------------
//
// Errors Check for error handling in patterns.
@ -1633,10 +1800,6 @@ void RegexTest::Errors() {
REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
// UnicodeSet containing a string
REGEX_ERR("abc[{def}]xyz", 1, 10, U_REGEX_SET_CONTAINS_STRING);
// Ticket 5389
REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
@ -1649,7 +1812,8 @@ void RegexTest::Errors() {
// in one big UChar * buffer, which the caller must delete.
//
//--------------------------------------------------------------------------------
UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen, UErrorCode &status) {
UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
const char *defEncoding, UErrorCode &status) {
UChar *retPtr = NULL;
char *fileBuf = NULL;
UConverter* conv = NULL;
@ -1698,6 +1862,11 @@ UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen, UError
if(encoding!=NULL ){
fileBufC += signatureLength;
fileSize -= signatureLength;
} else {
encoding = defEncoding;
if (strcmp(encoding, "utf-8") == 0) {
errln("file %s is missing its BOM", fileName);
}
}
//
@ -1804,7 +1973,7 @@ void RegexTest::PerlTests() {
}
int32_t len;
UChar *testData = ReadAndConvertFile(srcPath, len, status);
UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
if (U_FAILURE(status)) {
return; /* something went wrong, error already output */
}
@ -1981,6 +2150,14 @@ void RegexTest::PerlTests() {
lineNum, expected?"":"no ", found?"":"no " );
continue;
}
// Don't try to check expected results if there is no match.
// (Some have stuff in the expected fields)
if (!found) {
delete testMat;
delete testPat;
continue;
}
//
// Interpret the Perl expression from the fourth field of the data file,

View file

@ -1,5 +1,5 @@
/********************************************************************
* COPYRIGHT:
* COPYRIGHT:
* Copyright (c) 2002-2007, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -16,7 +16,7 @@
class RegexTest: public IntlTest {
public:
RegexTest();
virtual ~RegexTest();
@ -37,7 +37,7 @@ public:
const UnicodeString &input, int32_t line);
virtual void regex_err(const char *pat, int32_t errline, int32_t errcol,
UErrorCode expectedStatus, int32_t line);
virtual UChar *ReadAndConvertFile(const char *fileName, int32_t &len, UErrorCode &status);
virtual UChar *ReadAndConvertFile(const char *fileName, int32_t &len, const char *charset, UErrorCode &status);
virtual const char *getPath(char buffer[2048], const char *filename);
};

View file

@ -62,8 +62,8 @@ ab{0,1}c abc y $& abc
^abc$ aabc n - -
abc$ aabc y $& abc
abc$ aabcd n - -
^ abc y $&
$ abc y $&
^ abc y $&
$ abc y $&
a.c abc y $& abc
a.c axc y $& axc
a.*c axyzc y $& axyzc
@ -79,13 +79,13 @@ a[b-a] - c - Invalid [] range "b-a"
a[]b - ci - Unmatched [
a[ - c - Unmatched [
a] a] y $& a]
a[]]b a]b yi $& a]b ICU makes [] into an empty set.
a[]]b a]b y $& a]b
a[^bc]d aed y $& aed
a[^bc]d abd n - -
a[^-b]c adc yi $& adc ICU [] set rules
a[^-b]c a-c ni - - ICU [] set rules
a[^-b]c adc y $& adc
a[^-b]c a-c n - -
a[^]b]c a]c n - -
a[^]b]c adc yi $& adc ICU [] set rules.
a[^]b]c adc y $& adc
\ba\b a- y - -
\ba\b -a y - -
\ba\b -a- y - -
@ -113,18 +113,18 @@ a\Sb a-b y - -
\d - n - -
\D 1 n - -
\D - y - -
[\w] a iy - -
[\w] - in - -
[\W] a in - -
[\W] - iy - -
a[\s]b a b iy - -
a[\s]b a-b in - -
a[\S]b a b in - -
a[\S]b a-b iy - -
[\d] 1 iy - -
[\d] - in - -
[\D] 1 in - -
[\D] - iy - -
[\w] a y - -
[\w] - n - -
[\W] a n - -
[\W] - y - -
a[\s]b a b y - -
a[\s]b a-b n - -
a[\S]b a b n - -
a[\S]b a-b y - -
[\d] 1 y - -
[\d] - n - -
[\D] 1 n - -
[\D] - y - -
ab|cd abc y $& ab
ab|cd abcd y $& ab
()ef def y $&-$1 ef-
@ -167,7 +167,7 @@ a.+?c abcabc y $& abc
)( - c - Unmatched )
[^ab]* cde y $& cde
abc n - -
a* y $&
a* y $&
([abc])*d abbbcd y $&-$1 abbbcd-c
([abc])*bcd abcd y $&-$1 abcd-a
a|b|c|d|e e y $& e
@ -292,8 +292,8 @@ a[-]?c ac y $& ac
'^abc'i ABCC y $& ABC
'^abc$'i AABC n - -
'abc$'i AABC y $& ABC
'^'i ABC y $&
'$'i ABC y $&
'^'i ABC y $&
'$'i ABC y $&
'a.c'i ABC y $& ABC
'a.c'i AXC y $& AXC
'a.*?c'i AXYZC y $& AXYZC
@ -309,13 +309,13 @@ a[-]?c ac y $& ac
'a[]b'i - ci - Unmatched [
'a['i - c - Unmatched [
'a]'i A] y $& A]
'a[]]b'i A]B yi $& A]B
'a[]]b'i A]B y $& A]B
'a[^bc]d'i AED y $& AED
'a[^bc]d'i ABD n - -
'a[^-b]c'i ADC yi $& ADC ICU [] set rules
'a[^-b]c'i A-C ni - - ICU [] set rules
'a[^-b]c'i ADC y $& ADC
'a[^-b]c'i A-C n - -
'a[^]b]c'i A]C n - -
'a[^]b]c'i ADC yi $& ADC
'a[^]b]c'i ADC y $& ADC
'ab|cd'i ABC y $& AB
'ab|cd'i ABCD y $& AB
'()ef'i DEF y $&-$1 EF-
@ -347,7 +347,7 @@ a[-]?c ac y $& ac
')('i - c - Unmatched )
'[^ab]*'i CDE y $& CDE
'abc'i n - -
'a*'i y $&
'a*'i y $&
'([abc])*d'i ABBBCD y $&-$1 ABBBCD-C
'([abc])*bcd'i ABCD y $&-$1 ABCD-A
'a|b|c|d|e'i E y $& E
@ -357,7 +357,7 @@ a[-]?c ac y $& ac
'ab*'i XAYABBBZ y $& A
'(ab|cd)e'i ABCDE y $&-$1 CDE-CD
'[abhgefdc]ij'i HIJ y $& HIJ
'^(ab|cd)e'i ABCDE ni x$1y XY
'^(ab|cd)e'i ABCDE n x$1y XY
'(abc|)ef'i ABCDEF y $&-$1 EF-
'(a|b)c*d'i ABCD y $&-$1 BCD-B
'(ab|ab*)bc'i ABC y $&-$1 ABC-A
@ -486,7 +486,7 @@ foo\w*\d{4}baz foobar1234baz y $& foobar1234baz
a(?{})b cabd y $& ab
a(?{)b - c - Sequence (?{...}) not terminated or not {}-balanced
a(?{{})b - c - Sequence (?{...}) not terminated or not {}-balanced
a(?{}})b - c -
a(?{}})b - c -
a(?{"{"})b - c - Sequence (?{...}) not terminated or not {}-balanced
a(?{"\{"})b cabd y $& ab
a(?{"{"}})b - c - Unmatched right curly bracket
@ -546,50 +546,50 @@ $(?<=^(a)) a y $1 a
^(?=(a+?))\1ab aaab n - -
([\w:]+::)?(\w+)$ abcd: n - -
([\w:]+::)?(\w+)$ abcd y $1-$2 -abcd
([\w:]+::)?(\w+)$ xy:z:::abcd iy $1-$2 xy:z:::-abcd
([\w:]+::)?(\w+)$ xy:z:::abcd y $1-$2 xy:z:::-abcd
^[^bcd]*(c+) aexycd y $1 c
(a*)b+ caab y $1 aa
([\w:]+::)?(\w+)$ abcd: n - -
([\w:]+::)?(\w+)$ abcd y $1-$2 -abcd
([\w:]+::)?(\w+)$ xy:z:::abcd iy $1-$2 xy:z:::-abcd
([\w:]+::)?(\w+)$ xy:z:::abcd y $1-$2 xy:z:::-abcd
^[^bcd]*(c+) aexycd y $1 c
(?{$a=2})a*aa(?{local$a=$a+1})k*c(?{$b=$a}) yaaxxaaaacd y $b 3
(?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a}) yaaxxaaaacd y $b 4
(>a+)ab aaab n - -
(?>a+)b aaab y - -
([[:]+) a:[b]: iy $1 :[
([[=]+) a=[b]= iy $1 =[
([[.]+) a.[b]. iy $1 .[
([[:]+) a:[b]: yi $1 :[ Java and ICU dont escape [[xyz
([[=]+) a=[b]= yi $1 =[ Java and ICU dont escape [[xyz
([[.]+) a.[b]. yi $1 .[ Java and ICU dont escape [[xyz
[a[:xyz: - c - Unmatched [
[a[:xyz:] - c - POSIX class [:xyz:] unknown
[a[:]b[:c] abc iy $& abc
[a[:]b[:c] abc yi $& abc Java and ICU embedded [ is nested set
([a[:xyz:]b]+) pbaq c - POSIX class [:xyz:] unknown
[a[:]b[:c] abc iy $& abc
[a[:]b[:c] abc iy $& abc Java and ICU embedded [ is nested set
([[:alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd
([[:alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy
([[:alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy
([[:ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- ${nulnul}
([[:cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ${nulnul}
([[:digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 01
([[:graph:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy__--
([[:cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${nulnul}
([[:digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 01
([[:graph:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__--
([[:lower:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 cd
([[:print:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy__--
([[:punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 __--
([[:space:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1
([[:print:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__--
([[:punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 __--
([[:space:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1
([[:word:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy__
([[:upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB
([[:xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01
([[:xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01
([[:^alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 01
([[:^alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 __-- ${nulnul}${ffff}
([[:^alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 __-- ${nulnul}${ffff}
([[:^ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${ffff}
([[:^cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy__--
([[:^digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd
([[:^cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__--
([[:^digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd
([[:^lower:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB
([[:^print:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ${nulnul}${ffff}
([[:^punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy
([[:^space:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy__--
([[:^print:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${nulnul}${ffff}
([[:^punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy
([[:^space:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__--
([[:^word:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 -- ${nulnul}${ffff}
([[:^upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 cd01
([[:^xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 Xy__-- ${nulnul}${ffff}
([[:^xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 Xy__-- ${nulnul}${ffff}
[[:foo:]] - c - POSIX class [:foo:] unknown
[[:^foo:]] - c - POSIX class [:^foo:] unknown
((?>a+)b) aaab y $1 aaab
@ -823,11 +823,11 @@ foo.bart foo.bart y - -
.[X][X](.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
tt+$ xxxtt y - -
([a-\d]+) za-9z yi $1 a-9
([\d-z]+) a0-za yi $1 0-z
([\d-\s]+) a0- z yi $1 0-
([a-[:digit:]]+) za-9z iy $1 a-9
([[:digit:]-z]+) =0-z= iy $1 0-z
([[:digit:]-[:alpha:]]+) =0-z= iy $1 0-z
([\d-z]+) a0-za y $1 0-z
([\d-\s]+) a0- z y $1 0-
([a-[:digit:]]+) za-9z y $1 a-9
([[:digit:]-z]+) =0-z= y $1 0-z
([[:digit:]-[:alpha:]]+) =0-z= iy $1 0-z Set difference in ICU
\GX.*X aaaXbX n - -
(\d+\.\d+) 3.1415926 y $1 3.1415926
(\ba.{0,10}br) have a web browser y $1 a web br
@ -857,7 +857,7 @@ tt+$ xxxtt y - -
^([^,]{0,3},){0,3}d aaa,b,c,d y $1 c,
(?i) y - -
'(?!\A)x'm a\nxb\n y - -
^(a(b)?)+$ aba iy -$1-$2- -a--
^(a(b)?)+$ aba yi -$1-$2- -a-- Java disagrees. Not clear who is right.
'^.{9}abc.*\n'm 123\nabcabcabcabc\n y - -
^(a)?a$ a y -$1- --
^(a)?(?(1)a|b)+$ a n - -

View file

@ -1,7 +1,7 @@
# Copyright (c) 2001-2007 International Business Machines
# Corporation and others. All Rights Reserved.
#
# file:
# file:
#
# ICU regular expression test cases.
#
@ -10,24 +10,161 @@
# <pattern> = "<regular expression pattern>"
# <match string> = "<tagged string>"
# the quotes on the pattern and match string can be " or ' or /
# <tagged string> = text, with the start and end of each
# <tagged string> = text, with the start and end of each
# capture group tagged with <n>...</n>. The overall match,
# if any, is group 0, as in <0>matched text</0>
# <flags> = any combination of
#
# A region can be specified with <r>...</r> tags.
#
# <flags> = any combination of
# i case insensitive match
# x free spacing and comments
# s dot-matches-all mode
# m multi-line mode. $ and ^ match at embedded new-lines
# m multi-line mode.
# ($ and ^ match at embedded new-lines)
# D Unix Lines mode (only recognize 0x0a as new-line)
# v If icu configured without break iteration, this
# regex test pattern should not compile.
# e set the UREGEX_ERROR_ON_UNKNOWN_ESCAPES flag
# d dump the compiled pattern
# t trace operation of match engine.
# 2-9 a digit between 2 and 9, specifies the number of
# 2-9 a digit between 2 and 9, specifies the number of
# times to execute find(). The expected results are
# for the last find() in the sequence.
# G Only check match / no match. Do not check capture groups.
# E Pattern compilation error expected
# L Use LookingAt() rather than find()
# M Use matches() rather than find().
#
# a Use non-Anchoring Bounds.
# b Use Transparent Bounds.
# The a and b options only make a difference if
# a <r>region</r> has been specified in the string.
# z|Z hitEnd was expected(z) or not expected (Z).
# With neither, hitEnd is not checked.
# y|Y Require End expected(y) or not expected (Y).
#
# White space must be present between the flags and the match string.
#
# Look-ahead expressions
#
"abc(?=def)" "<0>abc</0>def"
"(.*)(?=c)" "<0><1>ab</1></0>cdef"
"(?:.*)(?=c)" "<r>ab</r>cdef"
"(?:.*)(?=c)" b "<r><0>ab</0></r>cdef" # transparent bounds
"(?:.*)(?=c)" bM "<r><0>ab</0></r>cdef" # transparent bounds
"(?:.*)(?=(c))" b "<0>ab</0><1>c</1>def" # Capture in look-ahead
"(?=(.)\1\1)\1" "abcc<0><1>d</1></0>ddefg" # Backrefs to look-ahead capture
".(?!\p{L})" "abc<0>d</0> " # Negated look-ahead
".(?!(\p{L}))" "abc<0>d</0> " # Negated look-ahead, no capture
# visible outside of look-ahead
"and(?=roid)" L "<0>and</0>roid"
"and(?=roid)" M "<r>and</r>roid"
"and(?=roid)" bM "<r><0>and</0></r>roid"
"and(?!roid)" L "<0>and</0>roix"
"and(?!roid)" L "android"
"and(?!roid)" M "<r><0>and</0></r>roid" # Opaque bounds
"and(?!roid)" bM "<r>and</r>roid"
"and(?!roid)" bM "<r><0>and</0></r>roix"
#
# Negated Lookahead, various regions and region transparency
#
"abc(?!def)" "<0>abc</0>xyz"
"abc(?!def)" "abcdef"
"abc(?!def)" "<r><0>abc</0></r>def"
"abc(?!def)" b "<r>abc</r>def"
"abc(?!def)" b "<r><0>abc</0></r>xyz"
#
# Anchoring Bounds
#
"^def$" "abc<r><0>def</0></r>ghi" # anchoring (default) bounds
"^def$" a "abc<r>def</r>ghi" # non-anchoring bounds
"^def" a "<r><0>def</0></r>ghi" # non-anchoring bounds
"def$" a "abc<r><0>def</0></r>" # non-anchoring bounds
"^.*$" m "<0>line 1</0>\n line 2"
"^.*$" m2 "line 1\n<0> line 2</0>"
"^.*$" m3 "line 1\n line 2"
"^.*$" m "li<r><0>ne </0></r>1\n line 2" # anchoring bounds
"^.*$" m2 "li<r>ne </r>1\n line 2" # anchoring bounds
"^.*$" am "li<r>ne </r>1\n line 2" # non-anchoring bounds
"^.*$" am "li\n<r><0>ne </0></r>\n1\n line 2" # non-anchoring bounds
#
# HitEnd and RequireEnd for new-lines just before end-of-input
#
"xyz$" yz "<0>xyz</0>\n"
"xyz$" yz "<0>xyz</0>\x{d}\x{a}"
"xyz$" myz "<0>xyz</0>" # multi-line mode
"xyz$" mYZ "<0>xyz</0>\n"
"xyz$" mYZ "<0>xyz</0>\r\n"
"xyz$" mYZ "<0>xyz</0>\x{85}abcd"
"xyz$" Yz "xyz\nx"
"xyz$" Yz "xyza"
"xyz$" yz "<0>xyz</0>"
#
# All Unicode line endings recognized.
# 0a, 0b, 0c, 0d, 0x85, 0x2028, 0x2029
# Multi-line and non-multiline mode take different paths, so repeated tests.
#
"^def$" mYZ "abc\x{a}<0>def</0>\x{a}ghi"
"^def$" mYZ "abc\x{b}<0>def</0>\x{b}ghi"
"^def$" mYZ "abc\x{c}<0>def</0>\x{c}ghi"
"^def$" mYZ "abc\x{d}<0>def</0>\x{d}ghi"
"^def$" mYZ "abc\x{85}<0>def</0>\x{85}ghi"
"^def$" mYZ "abc\x{2028}<0>def</0>\x{2028}ghi"
"^def$" mYZ "abc\x{2029}<0>def</0>\x{2029}ghi"
"^def$" mYZ "abc\r\n<0>def</0>\r\nghi"
"^def$" yz "<0>def</0>\x{a}"
"^def$" yz "<0>def</0>\x{b}"
"^def$" yz "<0>def</0>\x{c}"
"^def$" yz "<0>def</0>\x{d}"
"^def$" yz "<0>def</0>\x{85}"
"^def$" yz "<0>def</0>\x{2028}"
"^def$" yz "<0>def</0>\x{2029}"
"^def$" yz "<0>def</0>\r\n"
"^def$" yz "<0>def</0>"
"^def$" "<0>def</0>\x{2028" #TODO: should be an error of some sort.
#
# UNIX_LINES mode
#
"abc$" D "<0>abc</0>\n"
"abc$" D "abc\r"
"abc$" D "abc\u0085"
"a.b" D "<0>a\rb</0>"
"a.b" D "a\nb"
"(?d)abc$" "<0>abc</0>\n"
"(?d)abc$" "abc\r"
"abc$" mD "<0>abc</0>\ndef"
"abc$" mD "abc\rdef"
".*def" L "abc\r def xyz" # Normal mode, LookingAt() stops at \r
".*def" DL "<0>abc\r def</0> xyz" # Unix Lines mode, \r not line end.
".*def" DL "abc\n def xyz"
"(?d)a.b" "a\nb"
"(?d)a.b" "<0>a\rb</0>"
"^abc" m "xyz\r<0>abc</0>"
"^abc" Dm "xyz\rabc"
"^abc" Dm "xyz\n<0>abc</0>"
# Capturing parens
".(..)." "<0>a<1>bc</1>d</0>"
@ -97,6 +234,16 @@
"(?w:.+?(\b\S.+?\b).*)" v "<0> <1>don't</1> </0>"
"(?w:(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?).*)" v "<0><1>.</1><2> </2><3>,</3><4>:</4><5>$</5><6>37,000.50</6><7> </7> </0>"
#
# Unicode word boundaries with Regions
#
"(?w).*?\b" v "abc<r><0>def</0></r>ghi"
"(?w).*?\b" v2 "abc<r>def<0></0></r>ghi"
"(?w).*?\b" v3 "abc<r>def</r>ghi"
#"(?w).*?\b" vb "abc<r><0>def</0></r>ghi" # TODO: bug. Ticket 6073
#"(?w).*?\b" vb2 "abc<r>def</r>ghi"
# . does not match new-lines
"." "\u000a\u000d\u0085\u000c\u000b\u2028\u2029<0>X</0>\u000aY"
@ -128,20 +275,20 @@
".*^(Hello)" " Hello Hello Hello Hello Goodbye"# No Match
# $ matches only at end of line, or before a newline preceding the end of line
".*?(Goodbye)$" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
".*?(Goodbye)" "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye"
".*?(Goodbye)$" "Hello Goodbye> Goodbye Goodbye "# No Match
".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
".*?(Goodbye)" ZY "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye"
".*?(Goodbye)$" z "Hello Goodbye> Goodbye Goodbye "# No Match
".*?(Goodbye)$" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
".*?(Goodbye)$" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
".*?(Goodbye)$" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\r\n"
".*?(Goodbye)$" "Hello Goodbye Goodbye Goodbye\n\n"# No Match
".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\r\n"
".*?(Goodbye)$" z "Hello Goodbye Goodbye Goodbye\n\n"# No Match
# \Z matches at end of input, like $ with default flags.
".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
".*?(Goodbye)" "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye"
".*?(Goodbye)\Z" "Hello Goodbye> Goodbye Goodbye "# No Match
"here$" "here\nthe end"# No Match
".*?(Goodbye)\Z" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
".*?(Goodbye)" ZY "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye"
".*?(Goodbye)\Z" z "Hello Goodbye> Goodbye Goodbye "# No Match
"here$" z "here\nthe end"# No Match
".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
@ -151,12 +298,13 @@
# \z matches only at the end of string.
# no special treatment of new lines.
# no dependencies on flag settings.
".*?(Goodbye)\z" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
".*?(Goodbye)\z" "Hello Goodbye Goodbye Goodbye "# No Match
"here$" "here\nthe end"# No Match
".*?(Goodbye)\z" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
".*?(Goodbye)\z" z "Hello Goodbye Goodbye Goodbye "# No Match
"here$" z "here\nthe end"# No Match
".*?(Goodbye)\z" "Hello Goodbye Goodbye Goodbye\n"# No Match
".*?(Goodbye)\n\z" "<0>Hello Goodbye Goodbye <1>Goodbye</1>\n</0>"
".*?(Goodbye)\z" z "Hello Goodbye Goodbye Goodbye\n"# No Match
".*?(Goodbye)\n\z" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1>\n</0>"
"abc\z|def" ZY "abc<0>def</0>"
# (?# comment) doesn't muck up pattern
"Hello (?# this is a comment) world" " <0>Hello world</0>..."
@ -180,6 +328,61 @@
"(x?)*xyz" "<0>xx<1></1>xyz</0>" # Slightly weird, but correct. The "last" time through (x?),
# it matches the empty string.
# Set expressions, basic operators and escapes work
#
"[\d]+" "<0>0123</0>abc/.,"
"[^\d]+" "0123<0>abc/.,</0>"
"[\D]+" "0123<0>abc/.,</0>"
"[^\D]+" "<0>0123</0>abc/.,"
"[\s]+" "<0> \t</0>abc/.,"
"[^\s]+" " \t<0>abc/.,</0>"
"[\S]+" " \t<0>abc/.,</0>"
"[^\S]+" "<0> \t</0>abc/.,"
"[\w]+" "<0>abc123</0> .,;"
"[^\w]+" "abc123<0> .,;</0>"
"[\W]+" "abc123<0> .,;</0>"
"[^\W]+" "<0>abc123</0> .,;"
"[\z]+" "abc<0>zzz</0>def" # \z has no special meaning
"[^\z]+" "<0>abc</0>zzzdef"
"[\^]+" "abc<0>^^</0>"
"[^\^]+" "<0>abc</0>^^"
"[\u0041c]+" "<0>AcAc</0>def"
"[\U00010002]+" "<0>\ud800\udc02</0>\U00010003"
"[^\U00010002]+" "<0>Hello</0>\x{10002}"
"[\x61b]+" "<0>abab</0>cde"
#"[\x6z]+" "\x06" #TODO: single hex digits should fail
"[\x{9}\x{75}\x{6d6}\x{6ba6}\x{6146B}\x{10ffe3}]+" "<0>\u0009\u0075\u06d6\u6ba6\U0006146B\U0010ffe3</0>abc"
"[\N{LATIN CAPITAL LETTER TONE SIX}ab\N{VARIATION SELECTOR-70} ]+" "x<0> \u0184\U000E0135 ab</0>c"
"[\N{LATIN SMALL LETTER C}-\N{LATIN SMALL LETTER F}]+" "ab<0>cdef</0>ghi"
#
# [set expressions], check the precedence of '-', '&', '--', '&&'
# '-' and '&', for compatibility with ICU UnicodeSet, have the same
# precedence as the implicit Union between adjacent items.
# '--' and '&&', for compatibility with Java, have lower precedence than
# the implicit Union operations. '--' and '&&' themselves
# have the same precedence, and group left to right.
#
"[[a-m]-[f-w]p]+" "<0>dep</0>fgwxyz"
"[^[a-m]-[f-w]p]+" "dep<0>fgwxyz</0>"
"[[a-m]--[f-w]p]+" "<0>de</0>pfgwxyz"
"[^[a-m]--[f-w]p]+" "de<0>pfgwxyz</0>"
"[[a-m]&[e-s]w]+" "<0>efmw</0>adnst"
"[^[a-m]&[e-s]w]+" "efmw<0>adnst</0>"
"[[a-m]&[e-s]]+" "<0>efm</0>adnst"
# {min,max} iteration qualifier
"A{3}BC" "<0>AAABC</0>"
@ -247,8 +450,8 @@
"ab(?:c|(d?))(\1)" "<0>ab<1></1><2></2></0>"
# Case Insensitive
"aBc" i "<0>ABC</0>"
"a[^bc]d" i "ABD"
"aBc" i "<0>ABC</0>"
"a[^bc]d" i "ABD"
'((((((((((a))))))))))\10' i "<0><1><2><3><4><5><6><7><8><9><10>A</10></9></8></7></6></5></4></3></2></1>A</0>"
"(?:(?i)a)b" "<0>Ab</0>"
@ -259,15 +462,36 @@
"a b" "ab"
"abc " "abc"
"abc " "<0>abc </0>"
"ab[cd e]z" "<0>ab z</0>"
"ab[cd e]z" "<0>ab z</0>"
"ab\ c" "<0>ab c</0> "
"ab c" "<0>ab c</0> "
"ab c" x "ab c "
"ab\ c" x "<0>ab c</0> "
#
# Pattern Flags
#
"(?u)abc" "<0>abc</0>"
"(?-u)abc" "<0>abc</0>"
#
# \c escapes (Control-whatever)
#
"\cA" "<0>\u0001</0>"
"\ca" "<0>\u0001</0>"
"\c\x" "<0>\u001cx</0>"
#Multi-line mode
'b\s^' m "a\nb\n"
'b\s^' m "a\nb\n"
"(?m)^abc$" "abc \n abc\n<0>abc</0>\nabc"
"(?m)^abc$" 2 "abc \n abc\nabc\n<0>abc</0>"
"^abc$" 2 "abc \n abc\nabc\nabc"
# Empty and full range
"[\u0000-\U0010ffff]+" "<0>abc\u0000\uffff\U00010000\U0010ffffzz</0>"
"[^\u0000-\U0010ffff]" "abc\u0000\uffff\U00010000\U0010ffffzz"
"[^a--a]+" "<0>abc\u0000\uffff\U00010000\U0010ffffzz</0>"
# Free-spacing mode
"a b c # this is a comment" x "<0>abc</0> "
@ -316,8 +540,8 @@
"abc.*$" "<0>abcdef</0>"
"abc(.*)" "<0>abc<1>def</1></0>"
"abc(.*)" "<0>abc<1></1></0>"
"abc.*" "<0>abc</0>\ndef"
"abc.*" s "<0>abc\ndef</0>"
"abc.*" "<0>abc</0>\ndef"
"abc.*" s "<0>abc\ndef</0>"
"abc.*$" s "<0>abc\ndef</0>"
"abc.*$" "abc\ndef"
"abc.*$" m "<0>abc</0>\ndef"
@ -357,9 +581,16 @@
"ab\x09w" "<0>ab\u0009w</0>"
"ab\xabcdc" "<0>ab\u00abcdc</0>"
"ab\x{abcd}c" "<0>ab\uabcdc</0>"
"ab\x{101234}c" "<0>ab\U00101234c</0>"
"ab\x{101234}c" "<0>ab\U00101234c</0>"
"abα" "<0>abα</0>"
#
# Octal Escaping. This conforms to Java conventions, not Perl.
"\0101\00\03\073\0154\01442" "<0>A\u0000\u0003\u003b\u006c\u0064\u0032</0>"
"\0776" "<0>\u003f\u0036</0>" # overflow, the 6 is literal.
"\0376xyz" "<0>\u00fexyz</0>"
"\08" E "<0>\u00008</0>"
"\0" E "x"
#
# \u Surrogate Pairs
@ -369,6 +600,24 @@
"\ud800\ud800\udc00" "<0>\ud800\U00010000</0>\U00010000\U00010000\U00010001"
"(\ud800)(\udc00)" "\U00010000"
#
# hitEnd with find()
#
"abc" Z "aa<0>abc</0> abcab"
"abc" 2Z "aaabc <0>abc</0>ab"
"abc" 3z "aa>abc abcab"
#
# Bug xxxx
#
"(?:\-|(\-?\d+\d\d\d))?(?:\-|\-(\d\d))?(?:\-|\-(\d\d))?(T)?(?:(\d\d):(\d\d):(\d\d)(\.\d+)?)?(?:(?:((?:\+|\-)\d\d):(\d\d))|(Z))?" MG "<0>-1234-21-31T41:51:61.789+71:81</0>"
#
# A random, complex, meaningless pattern that should at least compile
#
"(?![^\<C\f\0146\0270\}&&[|\02-\x3E\}|X-\|]]{7,}+)[|\\\x98\<\?\u4FCFr\,\0025\}\004|\0025-\0521]|(?<![|\01-\u829E])|(?<!\p{Alpha})|^|(?-s:[^\x15\\\x24F\a\,\a\u97D8[\x38\a[\0224-\0306[^\0020-\u6A57]]]]??)(?xix:[^|\{\[\0367\t\e\x8C\{\[\074c\]V[|b\fu\r\0175\<\07f\066s[^D-\x5D]]])(?xx:^{5,}+)(?d)(?=^\D)|(?!\G)(?>\G)(?![^|\]\070\ne\{\t\[\053\?\\\x51\a\075\0023-\[&&[|\022-\xEA\00-\u41C2&&[^|a-\xCC&&[^\037\uECB3\u3D9A\x31\|\<b\0206\uF2EC\01m\,\ak\a\03&&\p{Punct}]]]])(?-dxs:[|\06-\07|\e-\x63&&[|Tp\u18A3\00\|\xE4\05\061\015\0116C|\r\{\}\006\xEA\0367\xC4\01\0042\0267\xBB\01T\}\0100\?[|\[-\u459B|\x23\x91\rF\0376[|\?-\x94\0113-\\\s]]]]{6}?)(?<=[^\t-\x42H\04\f\03\0172\?i\u97B6\e\f\uDAC2])(?=\B)(?>[^\016\r\{\,\uA29D\034\02[\02-\[|\t\056\uF599\x62\e\<\032\uF0AC\0026\0205Q\|\\\06\0164[|\057-\u7A98&&[\061-g|\|\0276\n\042\011\e\xE8\x64B\04\u6D0EDW^\p{Lower}]]]]?)(?<=[^\n\\\t\u8E13\,\0114\u656E\xA5\]&&[\03-\026|\uF39D\01\{i\u3BC2\u14FE]])(?<=[^|\uAE62\054H\|\}&&^\p{Space}])(?sxx)(?<=[\f\006\a\r\xB4]{1,5})|(?x-xd:^{5}+)()" "<0></0>abc"
#
# Bug 3225
@ -435,7 +684,7 @@
"^" "<0></0>"
"^" 2 ""
"\Z" "<0></0>"
"\Z" "<0></0>"
"\Z" 2 ""
"\Z" 2 "\u000a<0></0>"
"\Z" "<0></0>\u000d\u000a"
@ -471,6 +720,173 @@
".{6}" "123\u000a\u000dXYZ"
".{6}" s "<0>123\u000a\u000dX</0>Y"
#
# Ranges
#
".*" "abc<r><0>def</0></r>ghi"
"a" "aaa<r><0>a</0>aa</r>aaa"
"a" 2 "aaa<r>a<0>a</0>a</r>aaa"
"a" 3 "aaa<r>aa<0>a</0></r>aaa"
"a" 4 "aaa<r>aaa</r>aaa"
"a" "aaa<r><0>a</0>aa</r>aaa"
#
# [set] parsing, systematically run through all of the parser states.
#
#
"[def]+" "abc<0>ddeeff</0>ghi" # set-open
"[^def]+" "<0>abc</0>defghi"
"[:digit:]+" "abc<0>123</0>def"
"[:^digit:]+" "<0>abc</0>123def"
"[\u005edef]+" "abc<0>de^f</0>ghi"
"[]]+" "abc<0>]]]</0>[def" # set-open2
"[^]]+" "<0>abc</0>]]][def"
"[:Lu:]+" "abc<0>ABC</0>def" # set-posix
"[:Lu]+" "abc<0>uL::Lu</0>"
"[:^Lu]+" "abc<0>uL:^:Lu</0>"
"[:]+" "abc<0>:::</0>def"
"[:whats this:]" E " "
"[--]+" dE "-------"
"[[nested]]+" "xyz[<0>nnetsteed</0>]abc" #set-start
"[\x{41}]+" "CB<0>AA</0>ZYX"
"[\[\]\\]+" "&*<0>[]\\</0>..."
"[*({<]+" "^&<0>{{(<<*</0>)))"
"[-def]+" "abc<0>def-ef-d</0>xyz" # set-start-dash
"[abc[--def]]" E " "
"[x[&def]]+" "abc<0>def&</0>ghi" # set-start-amp
"[&& is bad at start]" E " "
"[abc" E " " # set-after-lit
"[def]]" "abcdef"
"[def]]" "abcde<0>f]</0>]"
"[[def][ghi]]+" "abc]<0>defghi</0>[xyz" # set-after-set
"[[def]ghi]+" "abc]<0>defghi</0>[xyz"
"[[[[[[[[[[[abc]" E " "
"[[abc]\p{Lu}]+" "def<0>abcABC</0>xyz"
"[d-f]+" "abc<0>def</0>ghi" # set-after-range
"[d-f[x-z]]+" "abc<0>defxyzzz</0>gw"
"[\s\d]+" "abc<0> 123</0>def"
"[d-f\d]+" "abc<0>def123</0>ghi"
"[d-fr-t]+" "abc<0>defrst</0>uvw"
"[abc--]" E " " # set-after-op
"[[def]&&]" E " "
"[-abcd---]+" "<0>abc</0>--" #[-abcd]--[-]
"[&abcd&&&ac]+" "b<0>ac&&ca</0>d" #[&abcd]&&[&ac]
"[[abcd]&[ac]]+" "b<0>acac</0>d" # set-set-amp
"[[abcd]&&[ac]]+" "b<0>acac</0>d"
"[[abcd]&&ac]+" "b<0>acac</0>d"
"[[abcd]&ac]+" "<0>bacacd&&&</0>"
"[abcd&[ac]]+" "<0>bacacd&&&</0>" #set-lit-amp
"[abcd&&[ac]]+" "b<0>acac</0>d"
"[abcd&&ac]+" "b<0>acac</0>d"
"[[abcd]-[ac]]+" "a<0>bdbd</0>c" # set-set-dash
"[[abcd]--[ac]]+" "a<0>bdbd</0>c"
"[[abcd]--ac]+" "a<0>bdbd</0>c"
"[[abcd]-ac]+" "<0>bacacd---</0>"
"[a-d--[b-c]]+" "b<0>adad</0>c" # set-range-dash
"[a-d--b-c]+" "b<0>adad</0>c"
"[a-d-[b-c]]+" "<0>bad-adc</0>"
"[a-d-b-c]+" "<0>bad-adc</0>"
"[\w--[b-c]]+" "b<0>adad</0>c"
"[\w--b-c]+" "b<0>adad</0>c"
"[\w-[b-c]]+" "<0>bad-adc</0>"
"[\w-b-c]+" "<0>bad-adc</0>"
"[a-d&&[b-c]]+" "a<0>bcbc</0>d" # set-range-amp
"[a-d&&b-c]+" "a<0>bcbc</0>d"
"[a-d&[b-c]]+" "<0>abc&bcd</0>"
"[a-d&b-c]+" "<0>abc&bcd</0>"
"[abcd--bc]+" "b<0>adda</0>c" # set-lit-dash
"[abcd--[bc]]+" "b<0>adda</0>c"
"[abcd-[bc]]+" "<0>bad--dac</0>xyz"
"[abcd-]+" "<0>bad--dac</0>xyz"
"[abcd-\s]+" E "xyz<0>abcd --</0>xyz" # set-lit-dash-esc
"[abcd-\N{LATIN SMALL LETTER G}]+" "xyz-<0>abcdefg</0>hij-"
"[bcd-\{]+" "a<0>bcdefyz{</0>|}"
"[\p{Ll}]+" "ABC<0>abc</0>^&*&" # set-escape
"[\P{Ll}]+" "abc<0>ABC^&*&</0>xyz"
"[\N{LATIN SMALL LETTER Q}]+" "mnop<0>qqq</0>rst"
"[\sa]+" "cb<0>a a </0>(*&"
"[\S]+" " <0>hello</0> "
"[\w]+" " <0>hello_world</0>! "
"[\W]+" "a<0> *$%#,</0>hello "
"[\d]+" "abc<0>123</0>def"
"[\D]+" "123<0>abc</0>567"
"[\$\#]+" "123<0>$#$#</0>\\"
#
# Try each of the Java compatibility properties.
# These are checked here, while normal Unicode properties aren't, because
# these Java compatibility properties are implemented directly by regexp, while other
# properties are handled by ICU's Property and UnicodeSet APIs.
#
# These tests are only to verify that the names are recognized and the
# implementation isn't dead. They are not intended to verify that the
# function definitions are 100% correct.
#
"[:InBasic Latin:]+" "ΓΔΕΖΗΘ<0>hello, world.</0>ニヌネノハバパ"
"[:^InBasic Latin:]+" "<0>ΓΔΕΖΗΘ</0>hello, world.ニヌネノハバパ"
"\p{InBasicLatin}+" "ΓΔΕΖΗΘ<0>hello, world.</0>ニヌネノハバパ"
"\P{InBasicLatin}+" "<0>ΓΔΕΖΗΘ</0>hello, world.ニヌネノハバパ"
"\p{InGreek}+" "<0>ΓΔΕΖΗΘ</0>hello, world.ニヌネノハバパ"
"\p{InCombining Marks for Symbols}" "<0>\u20d0</0>"
"\p{Incombiningmarksforsymbols}" "<0>\u20d0</0>"
"\p{javaDefined}+" "\uffff<0>abcd</0>\U00045678"
"\p{javaDigit}+" "abc<0>1234</0>xyz"
"\p{javaIdentifierIgnorable}+" "abc<0>\u0000\u000e\u009f</0>xyz"
"\p{javaISOControl}+" "abc<0>\u0000\u000d\u0083</0>xyz"
"\p{javaJavaIdentifierPart}+" "#@!<0>abc123_$</0>;"
"\p{javaJavaIdentifierStart}+" "123\u0301<0>abc$_</0>%^&"
"\p{javaLetter}+" "123<0>abcDEF</0>&*()("
"\p{javaLetterOrDigit}+" "$%^&*<0>123abcகஙசஜஞ</0>☺♘♚☔☎♬⚄⚡"
"\p{javaLowerCase}+" "ABC<0>def</0>&^%#:="
"\p{javaMirrored}+" "ab$%<0>(){}[]</0>xyz"
"\p{javaSpaceChar}+" "abc<0> \u00ao\u2028</0>!@#"
"\p{javaSupplementaryCodePoint}+" "abc\uffff<0>\U00010000\U0010ffff</0>\u0000"
"\p{javaTitleCase}+" "abCE<0>Džῌᾨ</0>123"
"\p{javaUnicodeIdentifierStart}+" "123<0>abcⅣ</0>%^&&*"
"\p{javaUnicodeIdentifierPart}+" "%&&^<0>abc123\u0301\u0002</0>..."
"\p{javaUpperCase}+" "abc<0>ABC</0>123"
"\p{javaValidCodePoint}+" "<0>\u0000abc\ud800 unpaired \udfff |\U0010ffff</0>"
"\p{javaWhitespace}+" "abc\u00a0\u2007\u202f<0> \u0009\u001c\u001f\u2028</0>42"
"\p{all}+" "<0>123\u0000\U0010ffff</0>"
"\P{all}+" "123\u0000\U0010ffff"
#
# Errors on unrecognized ASCII letter escape sequences.
#
"[abc\Y]+" "<0>abcY</0>"
"[abc\Y]+" eE "<0>abcY</0>"
"(?:a|b|c|\Y)+" "<0>abcY</0>"
"(?:a|b|c|\Y)+" eE "<0>abcY</0>"
"\Q\Y\E" e "<0>\\Y</0>"
#
# Reported problem
#
"[a-\w]" E "x"
#
# Bug 4045
#
@ -485,7 +901,7 @@
"A*" 3 ""
"A*" 4 ""
"A*" 5 ""
#
# Bug 4046
#
@ -512,11 +928,10 @@
# Bug 4058 ICU Unicode Set patterns have an odd feature -
# A $ as the last character before the close bracket means match
# a \uffff, which means off the end of the string in transliterators.
# Doesn't make much sense for regex, but works that way anyhow.
# Didn't make sense for regular expressions, and is now fixed.
#
"[\$](P|C|D);" "<0>$<1>P</1>;</0>"
"[$](P|C|D);" "<0>\uffff<1>P</1>;</0>"
"[$](P|C|D);" "$P;"
"[$](P|C|D);" "<0>$<1>P</1>;</0>"
"[$$](P|C|D);" "<0>$<1>P</1>;</0>"
#
@ -537,10 +952,68 @@
".+?\b" 2 " <0>\u0935\u0915\u094D\u200D\u0924\u0947</0> "
".+?\b" 3 " \u0935\u0915\u094D\u200D\u0924\u0947 "
#
# bug 5386 "^.*$" should match empty input
#
"^.*$" "<0></0>"
"^.*$" m "<0></0>"
"^.*$" "<0></0>\n"
"(?s)^.*$" "<0>\n</0>"
#
# bug 5386 Empty pattern and empty input should match.
#
"" "<0></0>abc"
"" "<0></0>"
#
# bug 5386 Range upper and lower bounds can be equal
#
"[a-a]" "<0>a</0>"
#
# bug 5386 $* should not fail, should match empty string.
#
"$*" "<0></0>abc"
#
# bug 5386 \Q ... \E escaping problem
#
"[a-z\Q-$\E]+" "QE<0>abc-def$</0>."
# More reported 5386 Java compatibility failures
#
"[^]*abb]*" "<0>kkkk</0>"
"\xa" "huh" # Java would like to be warned.
"^.*$" "<0></0>"
#
# bug 5386 Empty left alternation should produce a zero length match.
#
"|a" "<0></0>a"
"$|ab" "<0>ab</0>"
"$|ba" "ab<0></0>"
#
# bug 5386 Java compatibility for set expressions
#
"[a-z&&[cde]]+" "ab<0>cde</0>fg"
#
# bug 6019 matches() needs to backtrack and check for a longer match if the
# first match(es) found don't match the entire input.
#
"a?|b" "<0></0>b"
"a?|b" M "<0>b</0>"
"a?|.*?u|stuff|d" M "<0>stuff</0>"
"a?|.*?(u)|stuff|d" M "<0>stuff<1>u</1></0>"
"a+?" "<0>a</0>aaaaaaaaaaaa"
"a+?" M "<0>aaaaaaaaaaaaa</0>"
#
# Random debugging, Temporary
#
#"^(?:a?b?)*$" "a--"
#"^(?:a?b?)*$" "a--"
"^(?:a?b?)*$" "a--"
"This is a string with (?:one |two |three )endings" "<0>This is a string with two endings</0>"
@ -681,7 +1154,7 @@
"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" G "<0>ftp://ftp.blah.co.uk:2828/blah%20blah.gif</0>"
"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" G "<0>https://blah.gov/blah-blah.as</0>"
"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" "www.blah.com"
"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" "http://www.blah.com/I have spaces!"
"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" "http://www.blah.com/I have spaces!"
"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" "ftp://blah_underscore/[nope]"
"^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2})$|^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2}\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" G "<0>12/01/2002</0>"
"^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2})$|^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2}\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" G "<0>12/01/2002 12:32:10</0>"
@ -959,18 +1432,18 @@
"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" "10.0.5.4"
"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" "192.168.0.1"
"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" "my ip address"
#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo.com</0>" # TODO: \w in pattern
#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo-foo.com.au</0>" # TODO: \w in pattern
#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo.foo.info</0>" # TODO: \w in pattern
#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@.com" # TODO: \w in pattern
#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@foo..com" # TODO: \w in pattern
#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@me@.com" # TODO: \w in pattern
#"/\*[\d\D]*?\*/" G "<0>/* my comment */</0>"
#"/\*[\d\D]*?\*/" G "<0>/* my multiline comment */</0>"
#"/\*[\d\D]*?\*/" G "<0>/* my nested comment */</0>"
#"/\*[\d\D]*?\*/" "*/ anything here /*"
#"/\*[\d\D]*?\*/" "anything between 2 seperate comments"
#"/\*[\d\D]*?\*/" "\* *\"
"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo.com</0>"
"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo-foo.com.au</0>"
"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo.foo.info</0>"
"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@.com"
"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@foo..com"
"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@me@.com"
"/\*[\d\D]*?\*/" G "<0>/* my comment */</0>"
"/\*[\d\D]*?\*/" G "<0>/* my multiline comment */</0>"
"/\*[\d\D]*?\*/" G "<0>/* my nested comment */</0>"
"/\*[\d\D]*?\*/" "*/ anything here /*"
"/\*[\d\D]*?\*/" "anything between 2 seperate comments"
"/\*[\d\D]*?\*/" "\* *\"
"/\*[\p{N}\P{N}]*?\*/" G "<0>/* my comment */</0>"
"/\*[\p{N}\P{N}]*?\*/" G "<0>/* my multiline comment */</0>"
"/\*[\p{N}\P{N}]*?\*/" G "<0>/* my nested comment */</0>"
@ -986,9 +1459,9 @@
'^(([^<>;()\[\]\\.,;:@"]+(\.[^<>()\[\]\\.,;:@"]+)*)|(".+"))@((([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))\.)*(([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))$' G "<0>blah@[10.0.0.1]</0>"
'^(([^<>;()\[\]\\.,;:@"]+(\.[^<>()\[\]\\.,;:@"]+)*)|(".+"))@((([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))\.)*(([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))$' G "<0>a@b.c</0>"
'^(([^<>;()\[\]\\.,;:@"]+(\.[^<>()\[\]\\.,;:@"]+)*)|(".+"))@((([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))\.)*(([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))$' "non@match@."
#"^\d{9}[\d|X]$" G "<0>1234123412</0>"
#"^\d{9}[\d|X]$" G "<0>123412341X</0>"
#"^\d{9}[\d|X]$" "not an isbn"
"^\d{9}[\d|X]$" G "<0>1234123412</0>"
"^\d{9}[\d|X]$" G "<0>123412341X</0>"
"^\d{9}[\d|X]$" "not an isbn"
"^\d{9}(\d|X)$" G "<0>1234123412</0>"
"^\d{9}(\d|X)$" G "<0>123412341X</0>"
"^\d{9}(\d|X)$" "not an isbn"
@ -1056,12 +1529,12 @@
"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" "12 123 1234"
"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" "(012) 123/1234"
"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" "(012) 123 12345"
#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob-smith@foo.com</0>" # TODO: \w in pattern
#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob.smith@foo.com</0>" # TODO: \w in pattern
#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob_smith@foo.com</0>" # TODO: \w in pattern
#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" "-smith@foo.com" # TODO: \w in pattern
#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" ".smith@foo.com" # TODO: \w in pattern
#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" "smith@foo_com" # TODO: \w in pattern
"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob-smith@foo.com</0>"
"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob.smith@foo.com</0>"
"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob_smith@foo.com</0>"
"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" "-smith@foo.com"
"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" ".smith@foo.com"
"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" "smith@foo_com"
"^(?=.*\d).{4,8}$" G "<0>1234</0>"
"^(?=.*\d).{4,8}$" G "<0>asdf1234</0>"
"^(?=.*\d).{4,8}$" G "<0>asp123</0>"
@ -1175,7 +1648,7 @@
"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" "$12,3456.01"
"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" "12345"
"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" "$1.234"
"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" G "<0>C:\\temp\\this allows spaces\\web.config</0>"
"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" G "<0>C:\\temp\\this allows spaces\\web.config</0>"
"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" G "<0>\\\\Andromeda\\share\\file name.123</0>"
"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" "tz:\temp\ fi*le?na:m<e>.doc"
"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" "\\Andromeda\share\filename.a"
@ -1206,24 +1679,24 @@
"^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$" "qqqBFDB4D31-3E35-4DAB-AFCA-5E6E5C8F61EA"
"^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$" "BFDB4D31-3E-4DAB-AFCA-5E6E5C8F61EA"
"^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$" "BFDB4D31-3E35-4DAB-AF"
#"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G "<0>12.345-678</0>" # TODO: \x not implemented.
#"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G "<0>23.345-123</0>"
#"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G "<0>99.999</0>"
#"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "41222-222"
#"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "3.444-233"
#"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "43.324444"
"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G "<0>12.345-678</0>"
"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G "<0>23.345-123</0>"
"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G "<0>99.999</0>"
"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "41222-222"
"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "3.444-233"
"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "43.324444"
"^\d{2}(\u002e)(\d{3})(-\d{3})?$" G "<0>12.345-678</0>"
"^\d{2}(\u002e)(\d{3})(-\d{3})?$" G "<0>23.345-123</0>"
"^\d{2}(\u002e)(\d{3})(-\d{3})?$" G "<0>99.999</0>"
"^\d{2}(\u002e)(\d{3})(-\d{3})?$" "41222-222"
"^\d{2}(\u002e)(\d{3})(-\d{3})?$" "3.444-233"
"^\d{2}(\u002e)(\d{3})(-\d{3})?$" "43.324444"
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>c:\file.txt</0>"
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>c:\folder\sub folder\file.txt</0>"
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>\\network\folder\file.txt</0>" # TODO: \w in pattern
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "C:" # TODO: \w in pattern
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "C:\file.xls" # TODO: \w in pattern
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "folder.txt" # TODO: \w in pattern
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>c:\file.txt</0>" # TODO: debug
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>c:\folder\sub folder\file.txt</0>" # TODO: debug
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>\\network\folder\file.txt</0>" # TODO: debug
"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "C:"
"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "C:\file.xls"
"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "folder.txt"
"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G "<0>my.domain.com</0>"
"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G "<0>regexlib.com</0>"
"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G "<0>big-reg.com</0>"
@ -1265,12 +1738,12 @@
"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" "1-555-5555"
"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" "15553333"
"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" "0-561-555-1212"
#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>" G "<0><input type = text name = "bob"></0>" # TODO: \w in pattern
#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>" G "<0><select name = "fred"></0>" # TODO: \w in pattern
#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>" G "<0><form</0>" # TODO: \w in pattern
#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>" "<input type = submit>" # TODO: \w in pattern
#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>" "<font face = "arial">" # TODO: \w in pattern
#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>" "The drity brown fox stank like" # TODO: \w in pattern
'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' G '<0><input type = text name = "bob"></0>'
'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' G '<0><select name = "fred"></0>'
#'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' G '<0><form></0>' #TODO: Debug
'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' "<input type = submit>" # TODO: \w in pattern
'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' '<font face = "arial">' # TODO: \w in pattern
'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' "The drity brown fox stank like"
"^(1|01|2|02|3|03|4|04|5|05|6|06|7|07|8|08|9|09|10|11|12{1,2}):(([0-5]{1}[0-9]{1}\s{0,1})([AM|PM|am|pm]{2,2}))\W{0}$" G "<0>1:00 AM</0>"
"^(1|01|2|02|3|03|4|04|5|05|6|06|7|07|8|08|9|09|10|11|12{1,2}):(([0-5]{1}[0-9]{1}\s{0,1})([AM|PM|am|pm]{2,2}))\W{0}$" G "<0>12:00 PM</0>"
"^(1|01|2|02|3|03|4|04|5|05|6|06|7|07|8|08|9|09|10|11|12{1,2}):(([0-5]{1}[0-9]{1}\s{0,1})([AM|PM|am|pm]{2,2}))\W{0}$" G "<0>1:00am</0>"
@ -1495,9 +1968,9 @@
"^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$" "10.57.98.23."
"<img([^>]*[^/])>" G '<0><img src="bob"></0>'
"<img([^>]*[^/])>" '<img src="bob" />'
#"<!--[\s\S]*?-->" G "<0><!-- comments --></0>"
#"<!--[\s\S]*?-->" G "<0><!-- x = a > b - 3 --></0>"
#"<!--[\s\S]*?-->" "<COMMENTS>this is a comment</COMMENTS>"
"<!--[\s\S]*?-->" G "<0><!-- comments --></0>"
"<!--[\s\S]*?-->" G "<0><!-- x = a > b - 3 --></0>"
"<!--[\s\S]*?-->" "<COMMENTS>this is a comment</COMMENTS>"
"<!--[\p{Zs}\P{Zs}]*?-->" G "<0><!-- comments --></0>"
"<!--[\p{Zs}\P{Zs}]*?-->" G "<0><!-- x = a > b - 3 --></0>"
"<!--[\p{Zs}\P{Zs}]*?-->" "<COMMENTS>this is a comment</COMMENTS>"
@ -1509,8 +1982,8 @@
"(\{\\f\d*)\\([^;]+;)" G "<0>{\\f1\\fswiss\\fcharset0\\fprq2{\\*\\panose 020b0604020202020204}Arial;</0>"
"(\{\\f\d*)\\([^;]+;)" G "{\\f"
"(\{\\f\d*)\\([^;]+;)" "{f0fs20 some text}"
#"</?([a-zA-Z][-A-Za-z\d\.]{0,71})(\s+(\S+)(\s*=\s*([-\w\.]{1,1024}|"[^"]{0,1024}"|'[^']{0,1024}'))?)*\s*>" G "<0><IMG src='stars.gif' alt="space" height=1></0>" # TODO: \w in pattern
#"</?([a-zA-Z][-A-Za-z\d\.]{0,71})(\s+(\S+)(\s*=\s*([-\w\.]{1,1024}|"[^"]{0,1024}"|'[^']{0,1024}'))?)*\s*>" "this is not a tag" # TODO: \w in pattern
#"</?([a-zA-Z][-A-Za-z\d\.]{0,71})(\s+(\S+)(\s*=\s*([-\w\.]{1,1024}|"[^"]{0,1024}"|'[^']{0,1024}'))?)*\s*>" G '<0><IMG src='stars.gif' alt="space" height=1></0>' # TODO: Can't quote this pattern with the test syntax!
#"</?([a-zA-Z][-A-Za-z\d\.]{0,71})(\s+(\S+)(\s*=\s*([-\w\.]{1,1024}|"[^"]{0,1024}"|'[^']{0,1024}'))?)*\s*>" "this is not a tag"
"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" G "<0>12/30/2002</0>"
"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" G "<0>01/12/1998 13:30</0>"
"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" G "<0>01/28/2002 22:35:00</0>"
@ -1586,10 +2059,10 @@
"^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$" "bad.bad.gif"
"^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$" "slash\gif."
"<[^>\s]*\bauthor\b[^>]*>" G '<0><author name="Daniel"></0>'
#"<[^>\s]*\bauthor\b[^>]*>" G "<0></sch:author></0>"
#"<[^>\s]*\bauthor\b[^>]*>" G '<0><pp:author name="Daniel"</0>'
"<[^>\s]*\bauthor\b[^>]*>" G "<0></sch:author></0>"
# "<[^>\s]*\bauthor\b[^>]*>" G '<0><pp:author name="Daniel"</0>' #Debug should work
"<[^> ]*\bauthor\b[^>]*>" G "<0></sch:author></0>"
"<[^> ]*\bauthor\b[^>]*>" G '<0><pp:author name="Daniel"></0>'
"<[^> ]*\bauthor\b[^>]*>" G '<0><pp:author name="Daniel"></0>'
"<[^>\s]*\bauthor\b[^>]*>" "<other>"
"<[^>\s]*\bauthor\b[^>]*>" "</authors>"
"<[^>\s]*\bauthor\b[^>]*>" "<work>author</work>"
@ -1625,15 +2098,15 @@
"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" "0"
"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" "0.0"
"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" ".0"
#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" G "<0>Sacramento</0>" #TODO: Octal
#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" G "<0>San Francisco</0>"
#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" G "<0>San Luis Obispo</0>"
#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "SanFrancisco"
#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "SanLuisObispo"
#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "San francisco"
#"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" G "<0>{e02ff0e4-00ad-090A-c030-0d00a0008ba0}</0>"
#"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" G "<0>e02ff0e4-00ad-090A-c030-0d00a0008ba0</0>"
#"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" "0xe02ff0e400ad090Ac0300d00a0008ba0"
"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" G "<0>Sacramento</0>"
"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "<0><2>San Francisco</2></0>"
"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "<0><3>San Luis Obispo</3></0>"
"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "SanFrancisco"
"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "SanLuisObispo"
"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "San francisco"
"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" G "<0>{e02ff0e4-00ad-090A-c030-0d00a0008ba0}</0>"
"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" G "<0>e02ff0e4-00ad-090A-c030-0d00a0008ba0</0>"
"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" "0xe02ff0e400ad090Ac0300d00a0008ba0"
"^\{?[a-fA-F0-9]{8}-([a-fA-F0-9]{4}-){3}[a-fA-F0-9]{12}\}?$" G "<0>{e02ff0e4-00ad-090A-c030-0d00a0008ba0}</0>"
"^\{?[a-fA-F0-9]{8}-([a-fA-F0-9]{4}-){3}[a-fA-F0-9]{12}\}?$" G "<0>e02ff0e4-00ad-090A-c030-0d00a0008ba0</0>"
"^\{?[a-fA-F0-9]{8}-([a-fA-F0-9]{4}-){3}[a-fA-F0-9]{12}\}?$" "0xe02ff0e400ad090Ac0300d00a0008ba0"
@ -1682,15 +2155,15 @@
"^((0[1-9])|(1[0-2]))\/(\d{2})$" G "<0>01/04</0>"
"^((0[1-9])|(1[0-2]))\/(\d{2})$" "13/03"
"^((0[1-9])|(1[0-2]))\/(\d{2})$" "10/2003"
#"<script[^>]*>[\w|\t|\r|\W]*</script>" G "<0><script language=javascript>document.write("one");</script></0>" # TODO: \w in pattern
#"<script[^>]*>[\w|\t|\r|\W]*</script>" "--" # TODO: \w in pattern
#"<script[^>]*>[\w|\t|\r|\W]*</script>" "A-Z][a-z]+" # TODO: \w in pattern
#"<script[^>]*>[\w|\t|\r|\W]*</script>" G "<0>strFirstName</0>" # TODO: \w in pattern
#"<script[^>]*>[\w|\t|\r|\W]*</script>" G "<0>intAgeInYears</0>" # TODO: \w in pattern
#"<script[^>]*>[\w|\t|\r|\W]*</script>" G "<0>Where the Wild Things Are</0>" # TODO: \w in pattern
#"<script[^>]*>[\w|\t|\r|\W]*</script>" "123" # TODO: \w in pattern
#"<script[^>]*>[\w|\t|\r|\W]*</script>" "abc" # TODO: \w in pattern
#"<script[^>]*>[\w|\t|\r|\W]*</script>" "this has no caps in it" # TODO: \w in pattern
"<script[^>]*>[\w|\t|\r|\W]*</script>" G '<0><script language=javascript>document.write("one");</script></0>'
"<script[^>]*>[\w|\t|\r|\W]*</script>" "--"
"<script[^>]*>[\w|\t|\r|\W]*</script>" "A-Z][a-z]+"
#"<script[^>]*>[\w|\t|\r|\W]*</script>" G "<0>strFirstName</0>" # Test Case damaged?
#"<script[^>]*>[\w|\t|\r|\W]*</script>" G "<0>intAgeInYears</0>" # Test Case damaged?
#"<script[^>]*>[\w|\t|\r|\W]*</script>" G "<0>Where the Wild Things Are</0>" # Test Case damaged?
"<script[^>]*>[\w|\t|\r|\W]*</script>" "123"
"<script[^>]*>[\w|\t|\r|\W]*</script>" "abc"
"<script[^>]*>[\w|\t|\r|\W]*</script>" "this has no caps in it"
"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" G "<0>-0.050</0>"
"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" G "<0>-5.000</0>"
"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" G "<0>-5</0>"
@ -1725,12 +2198,12 @@
"^.{4,8}$" "asd"
"^.{4,8}$" "123"
"^.{4,8}$" "asdfe12345"
#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.com</0>" # TODO: \w in pattern
#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.com.au</ # TODO: \w in pattern0>"
#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.au</0>" # TODO: \w in pattern
#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "word" # TODO: \w in pattern
#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "word@" # TODO: \w in pattern
#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "@word" # TODO: \w in pattern
"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.com</0>"
"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.com.au</0>"
"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.au</0>"
"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "word"
"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "word@"
"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "@word"
"^\d{5}-\d{4}$" G "<0>22222-3333</0>"
"^\d{5}-\d{4}$" G "<0>34545-2367</0>"
"^\d{5}-\d{4}$" G "<0>56334-2343</0>"
@ -1795,22 +2268,22 @@
"^[12345]$" "6"
"^[12345]$" "-1"
"^[12345]$" "abc"
#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@aol.com</0>" # TODO: \w in pattern
#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@wrox.co.uk</0>" # TODO: \w in pattern
#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@domain.info</0>" # TODO: \w in pattern
#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "a@b" # TODO: \w in pattern
#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "notanemail" # TODO: \w in pattern
#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "joe@@." # TODO: \w in pattern
"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@aol.com</0>"
"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@wrox.co.uk</0>"
"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@domain.info</0>"
"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "a@b"
"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "notanemail"
"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "joe@@."
"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" G "<0>joe@aol.com</0>"
"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" G "<0>ssmith@aspalliance.com</0>"
"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" G "<0>a@b.cc</0>"
"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" "joe@123aspx.com"
"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" "joe@web.info"
"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" "joe@company.co.uk"
#"[\w-]+@([\w-]+\.)+[\w-]+" G "<0>joe@aol.com</0>" # TODO: \w in pattern
#"[\w-]+@([\w-]+\.)+[\w-]+" G "<0>a@b.c</0>" # TODO: \w in pattern
#"[\w-]+@([\w-]+\.)+[\w-]+" "asdf" # TODO: \w in pattern
#"[\w-]+@([\w-]+\.)+[\w-]+" "1234" # TODO: \w in pattern
"[\w-]+@([\w-]+\.)+[\w-]+" G "<0>joe@aol.com</0>"
"[\w-]+@([\w-]+\.)+[\w-]+" G "<0>a@b.c</0>"
"[\w-]+@([\w-]+\.)+[\w-]+" "asdf"
"[\w-]+@([\w-]+\.)+[\w-]+" "1234"
"\d{4}-?\d{4}-?\d{4}-?\d{4}" G "<0>1234-1234-1234-1234</0>"
"\d{4}-?\d{4}-?\d{4}-?\d{4}" G "<0>1234123412341234</0>"
"\d{4}-?\d{4}-?\d{4}-?\d{4}" "1234123412345"