mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-5386 Regular Expressions update, improved Java 1.5 compatibility. svn merge -r 22747:23061 from branches/andy/regex
X-SVN-Rev: 23063
This commit is contained in:
parent
0d216c877d
commit
67e296e813
21 changed files with 3974 additions and 1300 deletions
|
@ -16,10 +16,10 @@
|
|||
#include "unicode/uset.h"
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \file
|
||||
* \brief C++ API: Unicode Set
|
||||
*/
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class BMPSet;
|
||||
|
@ -1213,6 +1213,14 @@ public:
|
|||
*/
|
||||
UnicodeSet& closeOver(int32_t attribute);
|
||||
|
||||
/**
|
||||
* Remove all strings from this set.
|
||||
*
|
||||
* @return a reference to this set.
|
||||
* @internal
|
||||
*/
|
||||
virtual UnicodeSet &removeAllStrings();
|
||||
|
||||
/**
|
||||
* Iteration method that returns the number of ranges contained in
|
||||
* this set.
|
||||
|
|
|
@ -718,6 +718,9 @@ typedef enum UErrorCode {
|
|||
U_REGEX_INVALID_FLAG, /**< Invalid value for match mode flags. */
|
||||
U_REGEX_LOOK_BEHIND_LIMIT, /**< Look-Behind pattern matches must have a bounded maximum length. */
|
||||
U_REGEX_SET_CONTAINS_STRING, /**< Regexps cannot have UnicodeSets containing strings.*/
|
||||
U_REGEX_OCTAL_TOO_BIG, /**< Octal character constants must be <= 0377. */
|
||||
U_REGEX_MISSING_CLOSE_BRACKET, /**< Missing closing bracket on a bracket expression. */
|
||||
U_REGEX_INVALID_RANGE, /**< In a character range [x-y], x is greater than y. */
|
||||
U_REGEX_ERROR_LIMIT, /**< This must always be the last value to indicate the limit for regexp errors */
|
||||
|
||||
/*
|
||||
|
|
|
@ -1037,6 +1037,12 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
|
|||
return *this;
|
||||
}
|
||||
|
||||
/**
 * Remove all strings from this set.
 *
 * Only the `strings` collection is modified by this function; nothing else
 * in the set is touched here.
 * NOTE(review): assumes `strings` is non-null at this point -- confirm
 * against the constructors / allocation path.
 *
 * @return a reference to this set, so calls can be chained.
 */
UnicodeSet& UnicodeSet::removeAllStrings() {
|
||||
// Empty the container that holds the set's string elements.
strings->removeAllElements();
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Makes a set from a multicharacter string. Thus "ch" => {"ch"}
|
||||
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
|
||||
|
|
|
@ -152,7 +152,10 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
|
|||
"U_REGEX_INVALID_BACK_REF",
|
||||
"U_REGEX_INVALID_FLAG",
|
||||
"U_REGEX_LOOK_BEHIND_LIMIT",
|
||||
"U_REGEX_SET_CONTAINS_STRING"
|
||||
"U_REGEX_SET_CONTAINS_STRING",
|
||||
"U_REGEX_OCTAL_TOO_BIG",
|
||||
"U_REGEX_MISSING_CLOSE_BRACKET",
|
||||
"U_REGEX_INVALID_RANGE"
|
||||
};
|
||||
|
||||
static const char * const
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -51,7 +51,7 @@ public:
|
|||
};
|
||||
|
||||
RegexCompile(RegexPattern *rp, UErrorCode &e);
|
||||
|
||||
|
||||
void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);
|
||||
|
||||
|
||||
|
@ -68,7 +68,7 @@ public:
|
|||
// determines the code to be generated when the matching close ) is encountered.
|
||||
enum EParenClass {
|
||||
plain = -1, // No special handling
|
||||
capturing = -2,
|
||||
capturing = -2,
|
||||
atomic = -3,
|
||||
lookAhead = -4,
|
||||
negLookAhead = -5,
|
||||
|
@ -85,8 +85,8 @@ private:
|
|||
|
||||
UChar32 nextCharLL();
|
||||
UChar32 peekCharLL();
|
||||
UnicodeSet *scanSet();
|
||||
UnicodeSet *scanProp();
|
||||
UnicodeSet *scanPosixProp();
|
||||
void handleCloseParen();
|
||||
int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern
|
||||
// at the top of the just completed block
|
||||
|
@ -109,7 +109,11 @@ private:
|
|||
int32_t end);
|
||||
void matchStartType();
|
||||
void stripNOPs();
|
||||
void OptDotStar();
|
||||
|
||||
void setEval(int32_t op);
|
||||
void setPushOp(int32_t op);
|
||||
UChar32 scanNamedChar();
|
||||
UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated);
|
||||
|
||||
|
||||
UErrorCode *fStatus;
|
||||
|
@ -125,7 +129,7 @@ private:
|
|||
// is the first character not yet scanned.
|
||||
UBool fQuoteMode; // Scan is in a \Q...\E quoted region
|
||||
UBool fInBackslashQuote; // Scan is between a '\' and the following char.
|
||||
UBool fEOLComments; // When scan is just after '(?', inhibit #... to
|
||||
UBool fEOLComments; // When scan is just after '(?', inhibit #... to
|
||||
// end of line comments, in favor of (?#...) comments.
|
||||
int32_t fLineNum; // Line number in input file.
|
||||
int32_t fCharNum; // Char position within the line.
|
||||
|
@ -167,7 +171,7 @@ private:
|
|||
|
||||
UVector32 fParenStack; // parentheses stack. Each frame consists of
|
||||
// the positions of compiled pattern operations
|
||||
// needing fixup, followed by negative value. The
|
||||
// needing fixup, followed by negative value. The
|
||||
// first entry in each frame is the position of the
|
||||
// spot reserved for use when a quantifier
|
||||
// needs to add a SAVE at the start of a (block)
|
||||
|
@ -194,8 +198,33 @@ private:
|
|||
int32_t fNameStartPos; // Starting position of a \N{NAME} name in a
|
||||
// pattern, valid while remainder of name is
|
||||
// scanned.
|
||||
|
||||
UStack fSetStack; // Stack of UnicodeSets, used while evaluating
|
||||
// (at compile time) set expressions within
|
||||
// the pattern.
|
||||
UStack fSetOpStack; // Stack of pending set operators (&&, --, union)
|
||||
|
||||
UChar32 fLastSetLiteral; // The last single code point added to a set.
|
||||
// needed when "-y" is scanned, and we need
|
||||
// to turn "x-y" into a range.
|
||||
|
||||
};
|
||||
|
||||
// Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions]
|
||||
// The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself.
|
||||
|
||||
// Operator codes for set-expression evaluation. Each value is built as
// (precedence << 16) | operationCode, so the precedence can be read from the
// high 16 bits and the operation code from the low 16 bits.
// NOTE(review): whether a larger precedence number binds more or less tightly
// is decided by the code that consumes these values -- confirm there.
enum SetOperations {
|
||||
setStart = 0 << 16 | 1, // precedence 0, operation code 1
|
||||
setEnd = 1 << 16 | 2, // precedence 1, operation code 2
|
||||
setNegation = 2 << 16 | 3, // precedence 2, operation code 3
|
||||
setCaseClose = 2 << 16 | 9, // precedence 2, operation code 9
|
||||
setDifference2 = 3 << 16 | 4, // '--' set difference operator
|
||||
setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator
|
||||
setUnion = 4 << 16 | 6, // implicit union of adjacent items
|
||||
setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet.
|
||||
setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet.
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
#endif // RBBISCAN_H
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
// It is generated by the Perl script "regexcst.pl" from
|
||||
// the rule parser state definitions file "regexcst.txt".
|
||||
//
|
||||
// Copyright (C) 2002-2003 International Business Machines Corporation
|
||||
// Copyright (C) 2002-2007 International Business Machines Corporation
|
||||
// and others. All rights reserved.
|
||||
//
|
||||
//---------------------------------------------------------------------------------
|
||||
|
@ -17,74 +17,100 @@ U_NAMESPACE_BEGIN
|
|||
// Character classes for regex pattern scanning.
|
||||
//
|
||||
static const uint8_t kRuleSet_digit_char = 128;
|
||||
static const uint8_t kRuleSet_white_space = 129;
|
||||
static const uint8_t kRuleSet_rule_char = 130;
|
||||
static const uint8_t kRuleSet_rule_char = 129;
|
||||
|
||||
|
||||
enum Regex_PatternParseAction {
|
||||
doPossessivePlus,
|
||||
doCloseParen,
|
||||
doProperty,
|
||||
doBeginMatchMode,
|
||||
doOrOperator,
|
||||
doOpenCaptureParen,
|
||||
doBadOpenParenType,
|
||||
doRuleError,
|
||||
doIntevalLowerDigit,
|
||||
doBackslashs,
|
||||
doNGOpt,
|
||||
doBackslashw,
|
||||
doMismatchedParenErr,
|
||||
doOpenLookBehind,
|
||||
doBackslashz,
|
||||
doIntervalError,
|
||||
doStar,
|
||||
doCaret,
|
||||
doEnterQuoteMode,
|
||||
doNGStar,
|
||||
doMatchMode,
|
||||
doIntervalUpperDigit,
|
||||
doOpenLookAheadNeg,
|
||||
doPlus,
|
||||
doOpenNonCaptureParen,
|
||||
doBackslashA,
|
||||
doBackslashB,
|
||||
doNGPlus,
|
||||
doSetMatchMode,
|
||||
doPatFinish,
|
||||
doBackslashD,
|
||||
doPossessiveInterval,
|
||||
doEscapeError,
|
||||
doBackslashG,
|
||||
doSuppressComments,
|
||||
doMatchModeParen,
|
||||
doOpt,
|
||||
doInterval,
|
||||
doLiteralChar,
|
||||
doIntervalInit,
|
||||
doOpenAtomicParen,
|
||||
doBackslashS,
|
||||
doOpenLookAhead,
|
||||
doBackRef,
|
||||
doDollar,
|
||||
doDotAny,
|
||||
doBackslashW,
|
||||
doBackslashX,
|
||||
doScanUnicodeSet,
|
||||
doBackslashZ,
|
||||
doPerlInline,
|
||||
doPossessiveOpt,
|
||||
doSetEnd,
|
||||
doBackslashA,
|
||||
doSetBeginUnion,
|
||||
doNOP,
|
||||
doConditionalExpr,
|
||||
doExit,
|
||||
doNGInterval,
|
||||
doPatStart,
|
||||
doBadModeFlag,
|
||||
doBackslashb,
|
||||
doPossessiveStar,
|
||||
doBackslashd,
|
||||
doIntervalSame,
|
||||
doSetBackslash_w,
|
||||
doSetRange,
|
||||
doBackslashG,
|
||||
doPerlInline,
|
||||
doSetAddDash,
|
||||
doIntevalLowerDigit,
|
||||
doProperty,
|
||||
doBackslashX,
|
||||
doOpenAtomicParen,
|
||||
doSetLiteralEscaped,
|
||||
doPatFinish,
|
||||
doSetBackslash_D,
|
||||
doSetDifference2,
|
||||
doNamedChar,
|
||||
doNGPlus,
|
||||
doOpenLookBehindNeg,
|
||||
doIntervalError,
|
||||
doIntervalSame,
|
||||
doBackRef,
|
||||
doPlus,
|
||||
doOpenCaptureParen,
|
||||
doMismatchedParenErr,
|
||||
doBeginMatchMode,
|
||||
doEscapeError,
|
||||
doOpenNonCaptureParen,
|
||||
doDollar,
|
||||
doSetProp,
|
||||
doIntervalUpperDigit,
|
||||
doSetBegin,
|
||||
doBackslashs,
|
||||
doOpenLookBehind,
|
||||
doSetMatchMode,
|
||||
doOrOperator,
|
||||
doCaret,
|
||||
doMatchModeParen,
|
||||
doStar,
|
||||
doOpt,
|
||||
doMatchMode,
|
||||
doSuppressComments,
|
||||
doPossessiveInterval,
|
||||
doOpenLookAheadNeg,
|
||||
doBackslashW,
|
||||
doCloseParen,
|
||||
doSetOpError,
|
||||
doIntervalInit,
|
||||
doSetFinish,
|
||||
doSetIntersection2,
|
||||
doNGStar,
|
||||
doEnterQuoteMode,
|
||||
doSetAddAmp,
|
||||
doBackslashB,
|
||||
doBackslashw,
|
||||
doPossessiveOpt,
|
||||
doSetNegate,
|
||||
doRuleError,
|
||||
doBackslashb,
|
||||
doConditionalExpr,
|
||||
doPossessivePlus,
|
||||
doBadOpenParenType,
|
||||
doNGInterval,
|
||||
doSetLiteral,
|
||||
doSetNamedChar,
|
||||
doBackslashd,
|
||||
doSetBeginDifference1,
|
||||
doBackslashD,
|
||||
doExit,
|
||||
doSetBackslash_S,
|
||||
doInterval,
|
||||
doSetNoCloseError,
|
||||
doNGOpt,
|
||||
doSetPosixProp,
|
||||
doBackslashS,
|
||||
doBackslashZ,
|
||||
doSetBeginIntersection1,
|
||||
doSetBackslash_W,
|
||||
doSetBackslash_d,
|
||||
doOpenLookAhead,
|
||||
doBadModeFlag,
|
||||
doPatStart,
|
||||
doSetNamedRange,
|
||||
doPossessiveStar,
|
||||
doEscapedLiteralChar,
|
||||
doSetBackslash_s,
|
||||
doBackslashz,
|
||||
doDotAny,
|
||||
rbbiLastAction};
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
|
@ -106,21 +132,21 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
{doNOP, 0, 0, 0, TRUE}
|
||||
, {doPatStart, 255, 2,0, FALSE} // 1 start
|
||||
, {doLiteralChar, 254, 14,0, TRUE} // 2 term
|
||||
, {doLiteralChar, 130, 14,0, TRUE} // 3
|
||||
, {doScanUnicodeSet, 91 /* [ */, 14,0, TRUE} // 4
|
||||
, {doLiteralChar, 129, 14,0, TRUE} // 3
|
||||
, {doSetBegin, 91 /* [ */, 104, 182, TRUE} // 4
|
||||
, {doNOP, 40 /* ( */, 27,0, TRUE} // 5
|
||||
, {doDotAny, 46 /* . */, 14,0, TRUE} // 6
|
||||
, {doCaret, 94 /* ^ */, 2,0, TRUE} // 7
|
||||
, {doDollar, 36 /* $ */, 2,0, TRUE} // 8
|
||||
, {doNOP, 92 /* \ */, 81,0, TRUE} // 9
|
||||
, {doCaret, 94 /* ^ */, 14,0, TRUE} // 7
|
||||
, {doDollar, 36 /* $ */, 14,0, TRUE} // 8
|
||||
, {doNOP, 92 /* \ */, 84,0, TRUE} // 9
|
||||
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 10
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11
|
||||
, {doPatFinish, 253, 2,0, FALSE} // 12
|
||||
, {doRuleError, 255, 101,0, FALSE} // 13
|
||||
, {doNOP, 42 /* * */, 59,0, TRUE} // 14 expr-quant
|
||||
, {doNOP, 43 /* + */, 62,0, TRUE} // 15
|
||||
, {doNOP, 63 /* ? */, 65,0, TRUE} // 16
|
||||
, {doIntervalInit, 123 /* { */, 68,0, TRUE} // 17
|
||||
, {doRuleError, 255, 183,0, FALSE} // 13
|
||||
, {doNOP, 42 /* * */, 63,0, TRUE} // 14 expr-quant
|
||||
, {doNOP, 43 /* + */, 66,0, TRUE} // 15
|
||||
, {doNOP, 63 /* ? */, 69,0, TRUE} // 16
|
||||
, {doIntervalInit, 123 /* { */, 72,0, TRUE} // 17
|
||||
, {doNOP, 40 /* ( */, 23,0, TRUE} // 18
|
||||
, {doNOP, 255, 20,0, FALSE} // 19
|
||||
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 20 expr-cont
|
||||
|
@ -128,7 +154,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doNOP, 255, 2,0, FALSE} // 22
|
||||
, {doSuppressComments, 63 /* ? */, 25,0, TRUE} // 23 open-paren-quant
|
||||
, {doNOP, 255, 27,0, FALSE} // 24
|
||||
, {doNOP, 35 /* # */, 47, 14, TRUE} // 25 open-paren-quant2
|
||||
, {doNOP, 35 /* # */, 49, 14, TRUE} // 25 open-paren-quant2
|
||||
, {doNOP, 255, 29,0, FALSE} // 26
|
||||
, {doSuppressComments, 63 /* ? */, 29,0, TRUE} // 27 open-paren
|
||||
, {doOpenCaptureParen, 255, 2, 14, FALSE} // 28
|
||||
|
@ -136,75 +162,157 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doOpenAtomicParen, 62 /* > */, 2, 14, TRUE} // 30
|
||||
, {doOpenLookAhead, 61 /* = */, 2, 20, TRUE} // 31
|
||||
, {doOpenLookAheadNeg, 33 /* ! */, 2, 20, TRUE} // 32
|
||||
, {doNOP, 60 /* < */, 44,0, TRUE} // 33
|
||||
, {doNOP, 35 /* # */, 47, 2, TRUE} // 34
|
||||
, {doBeginMatchMode, 105 /* i */, 50,0, FALSE} // 35
|
||||
, {doBeginMatchMode, 109 /* m */, 50,0, FALSE} // 36
|
||||
, {doBeginMatchMode, 115 /* s */, 50,0, FALSE} // 37
|
||||
, {doBeginMatchMode, 119 /* w */, 50,0, FALSE} // 38
|
||||
, {doBeginMatchMode, 120 /* x */, 50,0, FALSE} // 39
|
||||
, {doBeginMatchMode, 45 /* - */, 50,0, FALSE} // 40
|
||||
, {doConditionalExpr, 40 /* ( */, 101,0, TRUE} // 41
|
||||
, {doPerlInline, 123 /* { */, 101,0, TRUE} // 42
|
||||
, {doBadOpenParenType, 255, 101,0, FALSE} // 43
|
||||
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 44 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 45
|
||||
, {doBadOpenParenType, 255, 101,0, FALSE} // 46
|
||||
, {doNOP, 41 /* ) */, 255,0, TRUE} // 47 paren-comment
|
||||
, {doMismatchedParenErr, 253, 101,0, FALSE} // 48
|
||||
, {doNOP, 255, 47,0, TRUE} // 49
|
||||
, {doMatchMode, 105 /* i */, 50,0, TRUE} // 50 paren-flag
|
||||
, {doMatchMode, 109 /* m */, 50,0, TRUE} // 51
|
||||
, {doMatchMode, 115 /* s */, 50,0, TRUE} // 52
|
||||
, {doMatchMode, 119 /* w */, 50,0, TRUE} // 53
|
||||
, {doMatchMode, 120 /* x */, 50,0, TRUE} // 54
|
||||
, {doMatchMode, 45 /* - */, 50,0, TRUE} // 55
|
||||
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 56
|
||||
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 57
|
||||
, {doBadModeFlag, 255, 101,0, FALSE} // 58
|
||||
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 59 quant-star
|
||||
, {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 60
|
||||
, {doStar, 255, 20,0, FALSE} // 61
|
||||
, {doNGPlus, 63 /* ? */, 20,0, TRUE} // 62 quant-plus
|
||||
, {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 63
|
||||
, {doPlus, 255, 20,0, FALSE} // 64
|
||||
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 65 quant-opt
|
||||
, {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 66
|
||||
, {doOpt, 255, 20,0, FALSE} // 67
|
||||
, {doNOP, 129, 68,0, TRUE} // 68 interval-open
|
||||
, {doNOP, 128, 71,0, FALSE} // 69
|
||||
, {doIntervalError, 255, 101,0, FALSE} // 70
|
||||
, {doIntevalLowerDigit, 128, 71,0, TRUE} // 71 interval-lower
|
||||
, {doNOP, 44 /* , */, 75,0, TRUE} // 72
|
||||
, {doIntervalSame, 125 /* } */, 78,0, TRUE} // 73
|
||||
, {doIntervalError, 255, 101,0, FALSE} // 74
|
||||
, {doIntervalUpperDigit, 128, 75,0, TRUE} // 75 interval-upper
|
||||
, {doNOP, 125 /* } */, 78,0, TRUE} // 76
|
||||
, {doIntervalError, 255, 101,0, FALSE} // 77
|
||||
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 78 interval-type
|
||||
, {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 79
|
||||
, {doInterval, 255, 20,0, FALSE} // 80
|
||||
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 81 backslash
|
||||
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 82
|
||||
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 83
|
||||
, {doBackslashd, 100 /* d */, 14,0, TRUE} // 84
|
||||
, {doBackslashD, 68 /* D */, 14,0, TRUE} // 85
|
||||
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 86
|
||||
, {doProperty, 78 /* N */, 14,0, FALSE} // 87
|
||||
, {doProperty, 112 /* p */, 14,0, FALSE} // 88
|
||||
, {doProperty, 80 /* P */, 14,0, FALSE} // 89
|
||||
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 90
|
||||
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 91
|
||||
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 92
|
||||
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 93
|
||||
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 94
|
||||
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 95
|
||||
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 96
|
||||
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 97
|
||||
, {doBackRef, 128, 14,0, TRUE} // 98
|
||||
, {doEscapeError, 253, 101,0, FALSE} // 99
|
||||
, {doLiteralChar, 255, 14,0, TRUE} // 100
|
||||
, {doExit, 255, 101,0, TRUE} // 101 errorDeath
|
||||
, {doNOP, 60 /* < */, 46,0, TRUE} // 33
|
||||
, {doNOP, 35 /* # */, 49, 2, TRUE} // 34
|
||||
, {doBeginMatchMode, 105 /* i */, 52,0, FALSE} // 35
|
||||
, {doBeginMatchMode, 100 /* d */, 52,0, FALSE} // 36
|
||||
, {doBeginMatchMode, 109 /* m */, 52,0, FALSE} // 37
|
||||
, {doBeginMatchMode, 115 /* s */, 52,0, FALSE} // 38
|
||||
, {doBeginMatchMode, 117 /* u */, 52,0, FALSE} // 39
|
||||
, {doBeginMatchMode, 119 /* w */, 52,0, FALSE} // 40
|
||||
, {doBeginMatchMode, 120 /* x */, 52,0, FALSE} // 41
|
||||
, {doBeginMatchMode, 45 /* - */, 52,0, FALSE} // 42
|
||||
, {doConditionalExpr, 40 /* ( */, 183,0, TRUE} // 43
|
||||
, {doPerlInline, 123 /* { */, 183,0, TRUE} // 44
|
||||
, {doBadOpenParenType, 255, 183,0, FALSE} // 45
|
||||
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 46 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 47
|
||||
, {doBadOpenParenType, 255, 183,0, FALSE} // 48
|
||||
, {doNOP, 41 /* ) */, 255,0, TRUE} // 49 paren-comment
|
||||
, {doMismatchedParenErr, 253, 183,0, FALSE} // 50
|
||||
, {doNOP, 255, 49,0, TRUE} // 51
|
||||
, {doMatchMode, 105 /* i */, 52,0, TRUE} // 52 paren-flag
|
||||
, {doMatchMode, 100 /* d */, 52,0, TRUE} // 53
|
||||
, {doMatchMode, 109 /* m */, 52,0, TRUE} // 54
|
||||
, {doMatchMode, 115 /* s */, 52,0, TRUE} // 55
|
||||
, {doMatchMode, 117 /* u */, 52,0, TRUE} // 56
|
||||
, {doMatchMode, 119 /* w */, 52,0, TRUE} // 57
|
||||
, {doMatchMode, 120 /* x */, 52,0, TRUE} // 58
|
||||
, {doMatchMode, 45 /* - */, 52,0, TRUE} // 59
|
||||
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 60
|
||||
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 61
|
||||
, {doBadModeFlag, 255, 183,0, FALSE} // 62
|
||||
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 63 quant-star
|
||||
, {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 64
|
||||
, {doStar, 255, 20,0, FALSE} // 65
|
||||
, {doNGPlus, 63 /* ? */, 20,0, TRUE} // 66 quant-plus
|
||||
, {doPossessivePlus, 43 /* + */, 20,0, TRUE} // 67
|
||||
, {doPlus, 255, 20,0, FALSE} // 68
|
||||
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 69 quant-opt
|
||||
, {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 70
|
||||
, {doOpt, 255, 20,0, FALSE} // 71
|
||||
, {doNOP, 128, 74,0, FALSE} // 72 interval-open
|
||||
, {doIntervalError, 255, 183,0, FALSE} // 73
|
||||
, {doIntevalLowerDigit, 128, 74,0, TRUE} // 74 interval-lower
|
||||
, {doNOP, 44 /* , */, 78,0, TRUE} // 75
|
||||
, {doIntervalSame, 125 /* } */, 81,0, TRUE} // 76
|
||||
, {doIntervalError, 255, 183,0, FALSE} // 77
|
||||
, {doIntervalUpperDigit, 128, 78,0, TRUE} // 78 interval-upper
|
||||
, {doNOP, 125 /* } */, 81,0, TRUE} // 79
|
||||
, {doIntervalError, 255, 183,0, FALSE} // 80
|
||||
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 81 interval-type
|
||||
, {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 82
|
||||
, {doInterval, 255, 20,0, FALSE} // 83
|
||||
, {doBackslashA, 65 /* A */, 2,0, TRUE} // 84 backslash
|
||||
, {doBackslashB, 66 /* B */, 2,0, TRUE} // 85
|
||||
, {doBackslashb, 98 /* b */, 2,0, TRUE} // 86
|
||||
, {doBackslashd, 100 /* d */, 14,0, TRUE} // 87
|
||||
, {doBackslashD, 68 /* D */, 14,0, TRUE} // 88
|
||||
, {doBackslashG, 71 /* G */, 2,0, TRUE} // 89
|
||||
, {doNamedChar, 78 /* N */, 14,0, FALSE} // 90
|
||||
, {doProperty, 112 /* p */, 14,0, FALSE} // 91
|
||||
, {doProperty, 80 /* P */, 14,0, FALSE} // 92
|
||||
, {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 93
|
||||
, {doBackslashS, 83 /* S */, 14,0, TRUE} // 94
|
||||
, {doBackslashs, 115 /* s */, 14,0, TRUE} // 95
|
||||
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 96
|
||||
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 97
|
||||
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 98
|
||||
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 99
|
||||
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 100
|
||||
, {doBackRef, 128, 14,0, TRUE} // 101
|
||||
, {doEscapeError, 253, 183,0, FALSE} // 102
|
||||
, {doEscapedLiteralChar, 255, 14,0, TRUE} // 103
|
||||
, {doSetNegate, 94 /* ^ */, 107,0, TRUE} // 104 set-open
|
||||
, {doSetPosixProp, 58 /* : */, 109,0, FALSE} // 105
|
||||
, {doNOP, 255, 107,0, FALSE} // 106
|
||||
, {doSetLiteral, 93 /* ] */, 122,0, TRUE} // 107 set-open2
|
||||
, {doNOP, 255, 112,0, FALSE} // 108
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 109 set-posix
|
||||
, {doNOP, 58 /* : */, 112,0, FALSE} // 110
|
||||
, {doRuleError, 255, 183,0, FALSE} // 111
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 112 set-start
|
||||
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 113
|
||||
, {doNOP, 92 /* \ */, 172,0, TRUE} // 114
|
||||
, {doNOP, 45 /* - */, 118,0, TRUE} // 115
|
||||
, {doNOP, 38 /* & */, 120,0, TRUE} // 116
|
||||
, {doSetLiteral, 255, 122,0, TRUE} // 117
|
||||
, {doRuleError, 45 /* - */, 183,0, FALSE} // 118 set-start-dash
|
||||
, {doSetAddDash, 255, 122,0, FALSE} // 119
|
||||
, {doRuleError, 38 /* & */, 183,0, FALSE} // 120 set-start-amp
|
||||
, {doSetAddAmp, 255, 122,0, FALSE} // 121
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 122 set-after-lit
|
||||
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 123
|
||||
, {doNOP, 45 /* - */, 159,0, TRUE} // 124
|
||||
, {doNOP, 38 /* & */, 150,0, TRUE} // 125
|
||||
, {doNOP, 92 /* \ */, 172,0, TRUE} // 126
|
||||
, {doSetNoCloseError, 253, 183,0, FALSE} // 127
|
||||
, {doSetLiteral, 255, 122,0, TRUE} // 128
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 129 set-after-set
|
||||
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 130
|
||||
, {doNOP, 45 /* - */, 152,0, TRUE} // 131
|
||||
, {doNOP, 38 /* & */, 147,0, TRUE} // 132
|
||||
, {doNOP, 92 /* \ */, 172,0, TRUE} // 133
|
||||
, {doSetNoCloseError, 253, 183,0, FALSE} // 134
|
||||
, {doSetLiteral, 255, 122,0, TRUE} // 135
|
||||
, {doSetEnd, 93 /* ] */, 255,0, TRUE} // 136 set-after-range
|
||||
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 137
|
||||
, {doNOP, 45 /* - */, 155,0, TRUE} // 138
|
||||
, {doNOP, 38 /* & */, 157,0, TRUE} // 139
|
||||
, {doNOP, 92 /* \ */, 172,0, TRUE} // 140
|
||||
, {doSetNoCloseError, 253, 183,0, FALSE} // 141
|
||||
, {doSetLiteral, 255, 122,0, TRUE} // 142
|
||||
, {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE} // 143 set-after-op
|
||||
, {doSetOpError, 93 /* ] */, 183,0, FALSE} // 144
|
||||
, {doNOP, 92 /* \ */, 172,0, TRUE} // 145
|
||||
, {doSetLiteral, 255, 122,0, TRUE} // 146
|
||||
, {doSetBeginIntersection1, 91 /* [ */, 104, 129, TRUE} // 147 set-set-amp
|
||||
, {doSetIntersection2, 38 /* & */, 143,0, TRUE} // 148
|
||||
, {doSetAddAmp, 255, 122,0, FALSE} // 149
|
||||
, {doSetIntersection2, 38 /* & */, 143,0, TRUE} // 150 set-lit-amp
|
||||
, {doSetAddAmp, 255, 122,0, FALSE} // 151
|
||||
, {doSetBeginDifference1, 91 /* [ */, 104, 129, TRUE} // 152 set-set-dash
|
||||
, {doSetDifference2, 45 /* - */, 143,0, TRUE} // 153
|
||||
, {doSetAddDash, 255, 122,0, FALSE} // 154
|
||||
, {doSetDifference2, 45 /* - */, 143,0, TRUE} // 155 set-range-dash
|
||||
, {doSetAddDash, 255, 122,0, FALSE} // 156
|
||||
, {doSetIntersection2, 38 /* & */, 143,0, TRUE} // 157 set-range-amp
|
||||
, {doSetAddAmp, 255, 122,0, FALSE} // 158
|
||||
, {doSetDifference2, 45 /* - */, 143,0, TRUE} // 159 set-lit-dash
|
||||
, {doSetAddDash, 91 /* [ */, 122,0, FALSE} // 160
|
||||
, {doSetAddDash, 93 /* ] */, 122,0, FALSE} // 161
|
||||
, {doNOP, 92 /* \ */, 164,0, TRUE} // 162
|
||||
, {doSetRange, 255, 136,0, TRUE} // 163
|
||||
, {doSetOpError, 115 /* s */, 183,0, FALSE} // 164 set-lit-dash-escape
|
||||
, {doSetOpError, 83 /* S */, 183,0, FALSE} // 165
|
||||
, {doSetOpError, 119 /* w */, 183,0, FALSE} // 166
|
||||
, {doSetOpError, 87 /* W */, 183,0, FALSE} // 167
|
||||
, {doSetOpError, 100 /* d */, 183,0, FALSE} // 168
|
||||
, {doSetOpError, 68 /* D */, 183,0, FALSE} // 169
|
||||
, {doSetNamedRange, 78 /* N */, 136,0, FALSE} // 170
|
||||
, {doSetRange, 255, 136,0, TRUE} // 171
|
||||
, {doSetProp, 112 /* p */, 129,0, FALSE} // 172 set-escape
|
||||
, {doSetProp, 80 /* P */, 129,0, FALSE} // 173
|
||||
, {doSetNamedChar, 78 /* N */, 122,0, FALSE} // 174
|
||||
, {doSetBackslash_s, 115 /* s */, 136,0, TRUE} // 175
|
||||
, {doSetBackslash_S, 83 /* S */, 136,0, TRUE} // 176
|
||||
, {doSetBackslash_w, 119 /* w */, 136,0, TRUE} // 177
|
||||
, {doSetBackslash_W, 87 /* W */, 136,0, TRUE} // 178
|
||||
, {doSetBackslash_d, 100 /* d */, 136,0, TRUE} // 179
|
||||
, {doSetBackslash_D, 68 /* D */, 136,0, TRUE} // 180
|
||||
, {doSetLiteralEscaped, 255, 122,0, TRUE} // 181
|
||||
, {doSetFinish, 255, 14,0, FALSE} // 182 set-finish
|
||||
, {doExit, 255, 183,0, TRUE} // 183 errorDeath
|
||||
};
|
||||
static const char * const RegexStateNames[] = { 0,
|
||||
"start",
|
||||
|
@ -249,6 +357,8 @@ static const char * const RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"open-paren-lookbehind",
|
||||
0,
|
||||
|
@ -264,6 +374,8 @@ static const char * const RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"quant-star",
|
||||
0,
|
||||
|
@ -275,7 +387,6 @@ static const char * const RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
"interval-open",
|
||||
0,
|
||||
0,
|
||||
"interval-lower",
|
||||
0,
|
||||
|
@ -307,6 +418,85 @@ static const char * const RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
0,
|
||||
"set-open",
|
||||
0,
|
||||
0,
|
||||
"set-open2",
|
||||
0,
|
||||
"set-posix",
|
||||
0,
|
||||
0,
|
||||
"set-start",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"set-start-dash",
|
||||
0,
|
||||
"set-start-amp",
|
||||
0,
|
||||
"set-after-lit",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"set-after-set",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"set-after-range",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"set-after-op",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"set-set-amp",
|
||||
0,
|
||||
0,
|
||||
"set-lit-amp",
|
||||
0,
|
||||
"set-set-dash",
|
||||
0,
|
||||
0,
|
||||
"set-range-dash",
|
||||
0,
|
||||
"set-range-amp",
|
||||
0,
|
||||
"set-lit-dash",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"set-lit-dash-escape",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"set-escape",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"set-finish",
|
||||
"errorDeath",
|
||||
0};
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#!/usr/bin/perl
|
||||
# ********************************************************************
|
||||
# * COPYRIGHT:
|
||||
# * Copyright (c) 2002-2003, International Business Machines Corporation and
|
||||
# * Copyright (c) 2002-2007, International Business Machines Corporation and
|
||||
# * others. All Rights Reserved.
|
||||
# ********************************************************************
|
||||
#
|
||||
|
@ -22,10 +22,6 @@
|
|||
# for the Rule Based Break Iterator Rule Parser. Perhaps they could be
|
||||
# merged?
|
||||
#
|
||||
#*********************************************************************
|
||||
# Copyright (C) 2002 International Business Machines Corporation *
|
||||
# and others. All rights reserved. *
|
||||
#*********************************************************************
|
||||
|
||||
|
||||
$num_states = 1; # Always the state number for the line being compiled.
|
||||
|
@ -210,7 +206,7 @@ print "// This file contains the state table for the ICU Regular Expression P
|
|||
print "// It is generated by the Perl script \"regexcst.pl\" from\n";
|
||||
print "// the rule parser state definitions file \"regexcst.txt\".\n";
|
||||
print "//\n";
|
||||
print "// Copyright (C) 2002-2003 International Business Machines Corporation \n";
|
||||
print "// Copyright (C) 2002-2007 International Business Machines Corporation \n";
|
||||
print "// and others. All rights reserved. \n";
|
||||
print "//\n";
|
||||
print "//---------------------------------------------------------------------------------\n";
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
|
||||
#*****************************************************************************
|
||||
#
|
||||
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
|
||||
# Copyright (C) 2002-2007, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
#*****************************************************************************
|
||||
|
@ -25,8 +25,8 @@
|
|||
#
|
||||
#
|
||||
#StateName:
|
||||
# input-char n next-state ^push-state action
|
||||
# input-char n next-state ^push-state action
|
||||
# input-char n next-state ^push-state action
|
||||
# input-char n next-state ^push-state action
|
||||
# | | | | |
|
||||
# | | | | |--- action to be performed by state machine
|
||||
# | | | | See function RBBIRuleScanner::doParseActions()
|
||||
|
@ -46,7 +46,7 @@
|
|||
# matches, perform the actions and go to the state specified on this line.
|
||||
# The input character is tested sequentially, in the order written. The characters and
|
||||
# character classes tested for do not need to be mutually exclusive. The first match wins.
|
||||
#
|
||||
#
|
||||
|
||||
|
||||
|
||||
|
@ -56,27 +56,27 @@
|
|||
#
|
||||
start:
|
||||
default term doPatStart
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#
|
||||
# term. At a position where we can accept the start of most items in a pattern.
|
||||
#
|
||||
term:
|
||||
quoted n expr-quant doLiteralChar
|
||||
rule_char n expr-quant doLiteralChar
|
||||
'[' n expr-quant doScanUnicodeSet
|
||||
'(' n open-paren
|
||||
'[' n set-open ^set-finish doSetBegin
|
||||
'(' n open-paren
|
||||
'.' n expr-quant doDotAny
|
||||
'^' n term doCaret
|
||||
'$' n term doDollar
|
||||
'^' n expr-quant doCaret
|
||||
'$' n expr-quant doDollar
|
||||
'\' n backslash
|
||||
'|' n term doOrOperator
|
||||
')' n pop doCloseParen
|
||||
eof term doPatFinish
|
||||
default errorDeath doRuleError
|
||||
|
||||
|
||||
|
||||
|
||||
#
|
||||
|
@ -84,14 +84,14 @@ term:
|
|||
# trailing quantifier - *, +, ?, *?, etc.
|
||||
#
|
||||
expr-quant:
|
||||
'*' n quant-star
|
||||
'+' n quant-plus
|
||||
'?' n quant-opt
|
||||
'*' n quant-star
|
||||
'+' n quant-plus
|
||||
'?' n quant-opt
|
||||
'{' n interval-open doIntervalInit
|
||||
'(' n open-paren-quant
|
||||
default expr-cont
|
||||
|
||||
|
||||
default expr-cont
|
||||
|
||||
|
||||
#
|
||||
# expr-cont Expression, continuation. At a point where additional terms are
|
||||
# allowed, but not required. No Quantifiers
|
||||
|
@ -99,8 +99,8 @@ expr-quant:
|
|||
expr-cont:
|
||||
'|' n term doOrOperator
|
||||
')' n pop doCloseParen
|
||||
default term
|
||||
|
||||
default term
|
||||
|
||||
|
||||
#
|
||||
# open-paren-quant Special case handling for comments appearing before a quantifier,
|
||||
|
@ -111,12 +111,12 @@ expr-cont:
|
|||
open-paren-quant:
|
||||
'?' n open-paren-quant2 doSuppressComments
|
||||
default open-paren
|
||||
|
||||
|
||||
open-paren-quant2:
|
||||
'#' n paren-comment ^expr-quant
|
||||
default open-paren-extended
|
||||
|
||||
|
||||
|
||||
|
||||
#
|
||||
# open-paren We've got an open paren. We need to scan further to
|
||||
# determine what kind of quantifier it is - plain (, (?:, (?>, or whatever.
|
||||
|
@ -124,7 +124,7 @@ open-paren-quant2:
|
|||
open-paren:
|
||||
'?' n open-paren-extended doSuppressComments
|
||||
default term ^expr-quant doOpenCaptureParen
|
||||
|
||||
|
||||
open-paren-extended:
|
||||
':' n term ^expr-quant doOpenNonCaptureParen # (?:
|
||||
'>' n term ^expr-quant doOpenAtomicParen # (?>
|
||||
|
@ -133,24 +133,25 @@ open-paren-extended:
|
|||
'<' n open-paren-lookbehind
|
||||
'#' n paren-comment ^term
|
||||
'i' paren-flag doBeginMatchMode
|
||||
'd' paren-flag doBeginMatchMode
|
||||
'm' paren-flag doBeginMatchMode
|
||||
's' paren-flag doBeginMatchMode
|
||||
'u' paren-flag doBeginMatchMode
|
||||
'w' paren-flag doBeginMatchMode
|
||||
'x' paren-flag doBeginMatchMode
|
||||
'-' paren-flag doBeginMatchMode
|
||||
'(' n errorDeath doConditionalExpr
|
||||
'{' n errorDeath doPerlInline
|
||||
default errorDeath doBadOpenParenType
|
||||
|
||||
|
||||
open-paren-lookbehind:
|
||||
'=' n term ^expr-cont doOpenLookBehind # (?<=
|
||||
'!' n term ^expr-cont doOpenLookBehindNeg # (?<!
|
||||
default errorDeath doBadOpenParenType
|
||||
|
||||
|
||||
|
||||
#
|
||||
# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
|
||||
# TODO: should parens nest here? Check what perl does.
|
||||
#
|
||||
paren-comment:
|
||||
')' n pop
|
||||
|
@ -158,20 +159,22 @@ paren-comment:
|
|||
default n paren-comment
|
||||
|
||||
#
|
||||
# paren-flag Scanned a (?ismx-ismx flag setting
|
||||
#
|
||||
# paren-flag Scanned a (?ismx-ismx flag setting
|
||||
#
|
||||
paren-flag:
|
||||
'i' n paren-flag doMatchMode
|
||||
'd' n paren-flag doMatchMode
|
||||
'm' n paren-flag doMatchMode
|
||||
's' n paren-flag doMatchMode
|
||||
'u' n paren-flag doMatchMode
|
||||
'w' n paren-flag doMatchMode
|
||||
'x' n paren-flag doMatchMode
|
||||
'-' n paren-flag doMatchMode
|
||||
')' n term doSetMatchMode
|
||||
':' n term ^expr-quant doMatchModeParen
|
||||
default errorDeath doBadModeFlag
|
||||
|
||||
|
||||
|
||||
|
||||
#
|
||||
# quant-star Scanning a '*' quantifier. Need to look ahead to decide
|
||||
# between plain '*', '*?', '*+'
|
||||
|
@ -204,13 +207,12 @@ quant-opt:
|
|||
|
||||
#
|
||||
# Interval scanning a '{', the opening delimiter for an interval specification
|
||||
# {number} or {min, max} or {min, }
|
||||
# {number} or {min, max} or {min,}
|
||||
#
|
||||
interval-open:
|
||||
white_space n interval-open # TODO: is white space allowed here in non-free mode?
|
||||
digit_char interval-lower
|
||||
digit_char interval-lower
|
||||
default errorDeath doIntervalError
|
||||
|
||||
|
||||
interval-lower:
|
||||
digit_char n interval-lower doIntevalLowerDigit
|
||||
',' n interval-upper
|
||||
|
@ -221,13 +223,13 @@ interval-upper:
|
|||
digit_char n interval-upper doIntervalUpperDigit
|
||||
'}' n interval-type
|
||||
default errorDeath doIntervalError
|
||||
|
||||
|
||||
interval-type:
|
||||
'?' n expr-cont doNGInterval # {n,m}?
|
||||
'+' n expr-cont doPossessiveInterval # {n,m}+
|
||||
default expr-cont doInterval # {m,n}
|
||||
|
||||
|
||||
|
||||
|
||||
#
|
||||
# backslash # Backslash. Figure out which of the \thingies we have encountered.
|
||||
# The low level next-char function will have preprocessed
|
||||
|
@ -239,7 +241,7 @@ backslash:
|
|||
'd' n expr-quant doBackslashd
|
||||
'D' n expr-quant doBackslashD
|
||||
'G' n term doBackslashG
|
||||
'N' expr-quant doProperty # \N{NAME} named char
|
||||
'N' expr-quant doNamedChar # \N{NAME} named char
|
||||
'p' expr-quant doProperty # \p{Lu} style property
|
||||
'P' expr-quant doProperty
|
||||
'Q' n term doEnterQuoteMode
|
||||
|
@ -250,11 +252,210 @@ backslash:
|
|||
'X' n expr-quant doBackslashX
|
||||
'Z' n term doBackslashZ
|
||||
'z' n term doBackslashz
|
||||
digit_char n expr-quant doBackRef # Will scan multiple digits
|
||||
digit_char n expr-quant doBackRef # Will scan multiple digits
|
||||
eof errorDeath doEscapeError
|
||||
default n expr-quant doLiteralChar # Escaped literal char.
|
||||
default n expr-quant doEscapedLiteralChar
|
||||
|
||||
|
||||
|
||||
#
|
||||
# [set expression] parsing,
|
||||
# All states involved in parsing set expressions have names beginning with "set-"
|
||||
#
|
||||
|
||||
set-open:
|
||||
'^' n set-open2 doSetNegate
|
||||
':' set-posix doSetPosixProp
|
||||
default set-open2
|
||||
|
||||
set-open2:
|
||||
']' n set-after-lit doSetLiteral
|
||||
default set-start
|
||||
|
||||
# set-posix:
|
||||
# scanned a '[:' If it really is a [:property:], doSetPosixProp will have
|
||||
# moved the scan to the closing ']'. If it wasn't a property
|
||||
# expression, the scan will still be at the opening ':', which should
|
||||
# be interpreted as a normal set expression.
|
||||
set-posix:
|
||||
']' n pop doSetEnd
|
||||
':' set-start
|
||||
default errorDeath doRuleError # should not be possible.
|
||||
|
||||
#
|
||||
# set-start after the [ and special case leading characters (^ and/or ]) but before
|
||||
# everything else. A '-' is literal at this point.
|
||||
#
|
||||
set-start:
|
||||
']' n pop doSetEnd
|
||||
'[' n set-open ^set-after-set doSetBeginUnion
|
||||
'\' n set-escape
|
||||
'-' n set-start-dash
|
||||
'&' n set-start-amp
|
||||
default n set-after-lit doSetLiteral
|
||||
|
||||
# set-start-dash Turn "[--" into a syntax error.
|
||||
# "[-x" is good, - and x are literals.
|
||||
#
|
||||
set-start-dash:
|
||||
'-' errorDeath doRuleError
|
||||
default set-after-lit doSetAddDash
|
||||
|
||||
# set-start-amp Turn "[&&" into a syntax error.
|
||||
# "[&x" is good, & and x are literals.
|
||||
#
|
||||
set-start-amp:
|
||||
'&' errorDeath doRuleError
|
||||
default set-after-lit doSetAddAmp
|
||||
|
||||
#
|
||||
# set-after-lit The last thing scanned was a literal character within a set.
|
||||
# Can be followed by anything. Single '-' or '&' are
|
||||
# literals in this context, not operators.
|
||||
set-after-lit:
|
||||
']' n pop doSetEnd
|
||||
'[' n set-open ^set-after-set doSetBeginUnion
|
||||
'-' n set-lit-dash
|
||||
'&' n set-lit-amp
|
||||
'\' n set-escape
|
||||
eof errorDeath doSetNoCloseError
|
||||
default n set-after-lit doSetLiteral
|
||||
|
||||
set-after-set:
|
||||
']' n pop doSetEnd
|
||||
'[' n set-open ^set-after-set doSetBeginUnion
|
||||
'-' n set-set-dash
|
||||
'&' n set-set-amp
|
||||
'\' n set-escape
|
||||
eof errorDeath doSetNoCloseError
|
||||
default n set-after-lit doSetLiteral
|
||||
|
||||
set-after-range:
|
||||
']' n pop doSetEnd
|
||||
'[' n set-open ^set-after-set doSetBeginUnion
|
||||
'-' n set-range-dash
|
||||
'&' n set-range-amp
|
||||
'\' n set-escape
|
||||
eof errorDeath doSetNoCloseError
|
||||
default n set-after-lit doSetLiteral
|
||||
|
||||
|
||||
# set-after-op
|
||||
# After a -- or &&
|
||||
# It is an error to close a set at this point.
|
||||
#
|
||||
set-after-op:
|
||||
'[' n set-open ^set-after-set doSetBeginUnion
|
||||
']' errorDeath doSetOpError
|
||||
'\' n set-escape
|
||||
default n set-after-lit doSetLiteral
|
||||
|
||||
#
|
||||
# set-set-amp
|
||||
# Have scanned [[set]&
|
||||
# Could be a '&' intersection operator, if a set follows.
|
||||
# Could be the start of a '&&' operator.
|
||||
# Otherwise it is a literal.
|
||||
set-set-amp:
|
||||
'[' n set-open ^set-after-set doSetBeginIntersection1
|
||||
'&' n set-after-op doSetIntersection2
|
||||
default set-after-lit doSetAddAmp
|
||||
|
||||
|
||||
# set-lit-amp Have scanned "[literals&"
|
||||
# Could be a start of "&&" operator or a literal
|
||||
# In [abc&[def]], the '&' is a literal
|
||||
#
|
||||
set-lit-amp:
|
||||
'&' n set-after-op doSetIntersection2
|
||||
default set-after-lit doSetAddAmp
|
||||
|
||||
|
||||
#
|
||||
# set-set-dash
|
||||
# Have scanned [set]-
|
||||
# Could be a '-' difference operator, if a [set] follows.
|
||||
# Could be the start of a '--' operator.
|
||||
# Otherwise it is a literal.
|
||||
set-set-dash:
|
||||
'[' n set-open ^set-after-set doSetBeginDifference1
|
||||
'-' n set-after-op doSetDifference2
|
||||
default set-after-lit doSetAddDash
|
||||
|
||||
|
||||
#
|
||||
# set-range-dash
|
||||
# scanned a-b- or \w-
|
||||
# any set or range like item where the trailing single '-' should
|
||||
# be literal, not a set difference operation.
|
||||
# A trailing "--" is still a difference operator.
|
||||
set-range-dash:
|
||||
'-' n set-after-op doSetDifference2
|
||||
default set-after-lit doSetAddDash
|
||||
|
||||
|
||||
set-range-amp:
|
||||
'&' n set-after-op doSetIntersection2
|
||||
default set-after-lit doSetAddAmp
|
||||
|
||||
|
||||
# set-lit-dash
|
||||
# Have scanned "[literals-" Could be a range or a -- operator or a literal
|
||||
# In [abc-[def]], the '-' is a literal (confirmed with a Java test)
|
||||
# [abc-\p{xx} the '-' is an error
|
||||
# [abc-] the '-' is a literal
|
||||
# [ab-xy] the '-' is a range
|
||||
#
|
||||
set-lit-dash:
|
||||
'-' n set-after-op doSetDifference2
|
||||
'[' set-after-lit doSetAddDash
|
||||
']' set-after-lit doSetAddDash
|
||||
'\' n set-lit-dash-escape
|
||||
default n set-after-range doSetRange
|
||||
|
||||
# set-lit-dash-escape
|
||||
#
|
||||
# scanned "[literal-\"
|
||||
# Could be a range, if the \ introduces an escaped literal char or a named char.
|
||||
# Otherwise it is an error.
|
||||
#
|
||||
set-lit-dash-escape:
|
||||
's' errorDeath doSetOpError
|
||||
'S' errorDeath doSetOpError
|
||||
'w' errorDeath doSetOpError
|
||||
'W' errorDeath doSetOpError
|
||||
'd' errorDeath doSetOpError
|
||||
'D' errorDeath doSetOpError
|
||||
'N' set-after-range doSetNamedRange
|
||||
default n set-after-range doSetRange
|
||||
|
||||
|
||||
#
|
||||
# set-escape
|
||||
# Common back-slash escape processing within set expressions
|
||||
#
|
||||
set-escape:
|
||||
'p' set-after-set doSetProp
|
||||
'P' set-after-set doSetProp
|
||||
'N' set-after-lit doSetNamedChar
|
||||
's' n set-after-range doSetBackslash_s
|
||||
'S' n set-after-range doSetBackslash_S
|
||||
'w' n set-after-range doSetBackslash_w
|
||||
'W' n set-after-range doSetBackslash_W
|
||||
'd' n set-after-range doSetBackslash_d
|
||||
'D' n set-after-range doSetBackslash_D
|
||||
default n set-after-lit doSetLiteralEscaped
|
||||
|
||||
#
|
||||
# set-finish
|
||||
# Have just encountered the final ']' that completes a [set], and
|
||||
# arrived here via a pop. From here, we exit the set parsing world, and go
|
||||
# back to generic regular expression parsing.
|
||||
#
|
||||
set-finish:
|
||||
default expr-quant doSetFinish
|
||||
|
||||
|
||||
#
|
||||
# errorDeath. This state is specified as the next state whenever a syntax error
|
||||
# in the source rules is detected. Barring bugs, the state machine will never
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
//
|
||||
// Copyright (C) 2002-2005 International Business Machines Corporation
|
||||
// and others. All rights reserved.
|
||||
//
|
||||
// Copyright (C) 2002-2007 International Business Machines Corporation
|
||||
// and others. All rights reserved.
|
||||
//
|
||||
// file: regeximp.h
|
||||
//
|
||||
|
@ -57,7 +57,7 @@ U_NAMESPACE_BEGIN
|
|||
enum {
|
||||
URX_RESERVED_OP = 0, // For multi-operand ops, most non-first words.
|
||||
URX_RESERVED_OP_N = 255, // For multi-operand ops, negative operand values.
|
||||
URX_BACKTRACK = 1,
|
||||
URX_BACKTRACK = 1, // Force a backtrack, as if a match test had failed.
|
||||
URX_END = 2,
|
||||
URX_ONECHAR = 3, // Value field is the 21 bit unicode char to match
|
||||
URX_STRING = 4, // Value field is index of string start
|
||||
|
@ -66,16 +66,16 @@ enum {
|
|||
URX_NOP = 7,
|
||||
URX_START_CAPTURE = 8, // Value field is capture group number.
|
||||
URX_END_CAPTURE = 9, // Value field is capture group number
|
||||
URX_STATIC_SETREF = 10, // Value field is index of set in array of sets.
|
||||
URX_STATIC_SETREF = 10, // Value field is index of set in array of sets.
|
||||
URX_SETREF = 11, // Value field is index of set in array of sets.
|
||||
URX_DOTANY = 12,
|
||||
URX_DOTANY = 12,
|
||||
URX_JMP = 13, // Value field is destination position in
|
||||
// the pattern.
|
||||
URX_FAIL = 14, // Stop match operation, No match.
|
||||
|
||||
URX_JMP_SAV = 15, // Operand: JMP destination location
|
||||
URX_BACKSLASH_B = 16, // Value field: 0: \b 1: \B
|
||||
URX_BACKSLASH_G = 17,
|
||||
URX_BACKSLASH_G = 17,
|
||||
URX_JMP_SAV_X = 18, // Conditional JMP_SAV,
|
||||
// Used in (x)+, breaks loop on zero length match.
|
||||
// Operand: Jmp destination.
|
||||
|
@ -88,21 +88,22 @@ enum {
|
|||
URX_DOLLAR = 24, // Also for \Z
|
||||
|
||||
URX_CTR_INIT = 25, // Counter Inits for {Interval} loops.
|
||||
URX_CTR_INIT_NG = 26, // 3 kinds, normal, non-greedy, and possessive.
|
||||
URX_CTR_INIT_NG = 26, // 2 kinds, normal and non-greedy.
|
||||
// These are 4 word opcodes. See description.
|
||||
// First Operand: Data loc of counter variable
|
||||
// 2nd Operand: Pat loc of the URX_CTR_LOOPx
|
||||
// 2nd Operand: Pat loc of the URX_CTR_LOOPx
|
||||
// at the end of the loop.
|
||||
// 3rd Operand: Minimum count.
|
||||
// 4th Operand: Max count, -1 for unbounded.
|
||||
|
||||
URX_DOTANY_PL = 27, // .+, match rest of the line. Fail already at end.
|
||||
URX_DOTANY_UNIX = 27, // '.' operator in UNIX_LINES mode, only \n marks end of line.
|
||||
|
||||
URX_CTR_LOOP = 28, // Loop Ops for {interval} loops.
|
||||
URX_CTR_LOOP_NG = 29, // Also in three flavors.
|
||||
// Operand is loc of corresponding CTR_INIT.
|
||||
|
||||
URX_DOTANY_ALL_PL = 30, // .+, match rest of the Input. Fail if already at end
|
||||
URX_CARET_M_UNIX = 30, // '^' operator, test for start of line in multi-line
|
||||
// plus UNIX_LINES mode.
|
||||
|
||||
URX_RELOC_OPRND = 31, // Operand value in multi-operand ops that refers
|
||||
// back into compiled pattern code, and thus must
|
||||
|
@ -118,7 +119,7 @@ enum {
|
|||
// within the matcher stack frame.
|
||||
URX_JMPX = 36, // Conditional JMP.
|
||||
// First Operand: JMP target location.
|
||||
// Second Operand: Data location containing an
|
||||
// Second Operand: Data location containing an
|
||||
// input position. If current input position ==
|
||||
// saved input position, FAIL rather than taking
|
||||
// the JMP
|
||||
|
@ -157,7 +158,7 @@ enum {
|
|||
URX_LBN_END = 48, // Negative LookBehind end
|
||||
// Parameter is the data location.
|
||||
// Check that the match ended at the right spot.
|
||||
URX_STAT_SETREF_N = 49, // Reference to a prebuilt set (e.g. \w), negated
|
||||
URX_STAT_SETREF_N = 49, // Reference to a prebuilt set (e.g. \w), negated
|
||||
// Operand is index of set in array of sets.
|
||||
URX_LOOP_SR_I = 50, // Init a [set]* loop.
|
||||
// Operand is the sets index in array of user sets.
|
||||
|
@ -166,12 +167,18 @@ enum {
|
|||
// Must always immediately follow LOOP_x_I instruction.
|
||||
URX_LOOP_DOT_I = 52, // .*, initialization of the optimized loop.
|
||||
// Operand value:
|
||||
// 0: Normal (. doesn't match new-line) mode.
|
||||
// 1: . matches new-line mode.
|
||||
URX_BACKSLASH_BU = 53 // \b or \B in UREGEX_UWORD mode, using Unicode style
|
||||
// bit 0:
|
||||
// 0: Normal (. doesn't match new-line) mode.
|
||||
// 1: . matches new-line mode.
|
||||
// bit 1: controls what new-lines are recognized by this operation.
|
||||
// 0: All Unicode New-lines
|
||||
// 1: UNIX_LINES, \u000a only.
|
||||
URX_BACKSLASH_BU = 53, // \b or \B in UREGEX_UWORD mode, using Unicode style
|
||||
// word boundaries.
|
||||
URX_DOLLAR_D = 54, // $ end of input test, in UNIX_LINES mode.
|
||||
URX_DOLLAR_MD = 55 // $ end of input test, in MULTI_LINE and UNIX_LINES mode.
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
// Keep this list of opcode names in sync with the above enum
|
||||
// Used for debug printing only.
|
||||
|
@ -203,10 +210,10 @@ enum {
|
|||
"DOLLAR", \
|
||||
"CTR_INIT", \
|
||||
"CTR_INIT_NG", \
|
||||
"DOTANY_PL", \
|
||||
"DOTANY_UNIX", \
|
||||
"CTR_LOOP", \
|
||||
"CTR_LOOP_NG", \
|
||||
"DOTANY_ALL_PL", \
|
||||
"URX_CARET_M_UNIX", \
|
||||
"RELOC_OPRND", \
|
||||
"STO_SP", \
|
||||
"LD_SP", \
|
||||
|
@ -229,21 +236,23 @@ enum {
|
|||
"LOOP_SR_I", \
|
||||
"LOOP_C", \
|
||||
"LOOP_DOT_I", \
|
||||
"BACKSLASH_BU"
|
||||
"BACKSLASH_BU", \
|
||||
"DOLLAR_D", \
|
||||
"DOLLAR_MD"
|
||||
|
||||
|
||||
//
|
||||
// Convenience macros for assembling and disassembling a compiled operation.
|
||||
//
|
||||
#define URX_BUILD(type, val) (int32_t)((type << 24) | (val))
|
||||
#define URX_TYPE(x) ((uint32_t)(x) >> 24)
|
||||
#define URX_TYPE(x) ((uint32_t)(x) >> 24)
|
||||
#define URX_VAL(x) ((x) & 0xffffff)
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Access to Unicode Sets composite character properties
|
||||
// The sets are accessed by the match engine for things like \w (word characters)
|
||||
//
|
||||
//
|
||||
enum {
|
||||
URX_ISWORD_SET = 1,
|
||||
URX_ISALNUM_SET = 2,
|
||||
|
@ -297,7 +306,7 @@ enum StartOfMatch {
|
|||
(v)==START_LINE? "START_LINE" : \
|
||||
(v)==START_STRING? "START_STRING" : \
|
||||
"ILLEGAL")
|
||||
|
||||
|
||||
|
||||
//
|
||||
// 8 bit set, to fast-path latin-1 set membership tests.
|
||||
|
|
|
@ -59,9 +59,6 @@ static const UChar gRuleSet_rule_char_pattern[] = {
|
|||
static const UChar gRuleSet_digit_char_pattern[] = {
|
||||
// [ 0 - 9 ]
|
||||
0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
|
||||
//static const UnicodeSet *gRuleDigits = NULL;
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Here are the backslash escape characters that ICU's unescape() function
|
||||
|
@ -72,16 +69,6 @@ static const UChar gUnescapeCharPattern[] = {
|
|||
0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x78, 0x5d, 0};
|
||||
|
||||
|
||||
//
|
||||
// White space characters that may appear within a pattern in free-form mode
|
||||
//
|
||||
static const UChar gRuleWhiteSpacePattern[] = {
|
||||
/* "[[:Cf:][:WSpace:]]" */
|
||||
91, 91, 58, 67, 102, 58, 93, 91, 58, 87,
|
||||
83, 112, 97, 99, 101, 58, 93, 93, 0 };
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Unicode Set Definitions for Regular Expression \w
|
||||
//
|
||||
|
@ -89,7 +76,7 @@ static const UChar gIsWordPattern[] = {
|
|||
// [ \ p { A l p h a b e t i c }
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x62, 0x65, 0x74, 0x69, 0x63, 0x7d,
|
||||
// \ p { M } Mark
|
||||
0x5c, 0x70, 0x7b, 0x4d, 0x7d,
|
||||
0x5c, 0x70, 0x7b, 0x4d, 0x7d,
|
||||
// \ p { N d } Digit_Numeric
|
||||
0x5c, 0x70, 0x7b, 0x4e, 0x64, 0x7d,
|
||||
// \ p { P c } ] Connector_Punctuation
|
||||
|
@ -108,8 +95,8 @@ static const UChar gIsSpacePattern[] = {
|
|||
// UnicodeSets used in implementation of Grapheme Cluster detection, \X
|
||||
//
|
||||
static const UChar gGC_ControlPattern[] = {
|
||||
// [ [ : Z l : ] [ : Z p : ]
|
||||
0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d,
|
||||
// [ [ : Z l : ] [ : Z p : ]
|
||||
0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d,
|
||||
// [ : C c : ] [ : C f : ] -
|
||||
0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x2d,
|
||||
// [ : G r a p h e m e _
|
||||
|
@ -124,34 +111,35 @@ static const UChar gGC_ExtendPattern[] = {
|
|||
0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_LPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = L } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d, 0x5d, 0};
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_VPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = V } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d, 0x5d, 0};
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_TPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = T } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0};
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_LVPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = L V } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0};
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0};
|
||||
|
||||
static const UChar gGC_LVTPattern[] = {
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
// [ \ p { H a n g u l _ S y l
|
||||
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
|
||||
// l a b l e _ T y p e = L V T } ]
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x54, 0x7d, 0x5d, 0};
|
||||
0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x54, 0x7d, 0x5d, 0};
|
||||
|
||||
|
||||
RegexStaticSets *RegexStaticSets::gStaticSets = NULL;
|
||||
|
||||
|
@ -160,7 +148,7 @@ RegexStaticSets::RegexStaticSets(UErrorCode *status)
|
|||
fUnescapeCharSet(UnicodeString(TRUE, gUnescapeCharPattern, -1), *status),
|
||||
fRuleDigitsAlias(NULL)
|
||||
{
|
||||
// First zero out everything
|
||||
// First zero out everything
|
||||
int i;
|
||||
for (i=0; i<URX_LAST_SET; i++) {
|
||||
fPropSets[i] = NULL;
|
||||
|
@ -171,7 +159,7 @@ fRuleDigitsAlias(NULL)
|
|||
|
||||
// Then init the sets to their correct values.
|
||||
fPropSets[URX_ISWORD_SET] = new UnicodeSet(UnicodeString(TRUE, gIsWordPattern, -1), *status);
|
||||
fPropSets[URX_ISSPACE_SET] = new UnicodeSet(UnicodeString(TRUE, gIsSpacePattern, -1), *status);
|
||||
fPropSets[URX_ISSPACE_SET] = new UnicodeSet(UnicodeString(TRUE, gIsSpacePattern, -1), *status);
|
||||
fPropSets[URX_GC_EXTEND] = new UnicodeSet(UnicodeString(TRUE, gGC_ExtendPattern, -1), *status);
|
||||
fPropSets[URX_GC_CONTROL] = new UnicodeSet(UnicodeString(TRUE, gGC_ControlPattern, -1), *status);
|
||||
fPropSets[URX_GC_L] = new UnicodeSet(UnicodeString(TRUE, gGC_LPattern, -1), *status);
|
||||
|
@ -184,14 +172,14 @@ fRuleDigitsAlias(NULL)
|
|||
// The rest of the initialization needs them, so we cannot proceed.
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//
|
||||
// The following sets are dynamically constructed, because their
|
||||
// initialization strings would be unreasonable.
|
||||
//
|
||||
|
||||
|
||||
|
||||
|
||||
//
|
||||
// "Normal" is the set of characters that don't need special handling
|
||||
// when finding grapheme cluster boundaries.
|
||||
|
@ -202,7 +190,7 @@ fRuleDigitsAlias(NULL)
|
|||
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_L]);
|
||||
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_V]);
|
||||
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_T]);
|
||||
|
||||
|
||||
// Initialize the 8-bit fast bit sets from the parallel full
|
||||
// UnicodeSets.
|
||||
for (i=0; i<URX_LAST_SET; i++) {
|
||||
|
@ -213,9 +201,8 @@ fRuleDigitsAlias(NULL)
|
|||
}
|
||||
|
||||
// Sets used while parsing rules, but not referenced from the parse state table
|
||||
fRuleSets[kRuleSet_rule_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1), *status);
|
||||
fRuleSets[kRuleSet_white_space-128] = new UnicodeSet(UnicodeString(TRUE, gRuleWhiteSpacePattern, -1), *status);
|
||||
fRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_digit_char_pattern, -1), *status);
|
||||
fRuleSets[kRuleSet_rule_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1), *status);
|
||||
fRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_digit_char_pattern, -1), *status);
|
||||
fRuleDigitsAlias = fRuleSets[kRuleSet_digit_char-128];
|
||||
for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) {
|
||||
if (fRuleSets[i]) {
|
||||
|
@ -281,7 +268,7 @@ void RegexStaticSets::initGlobals(UErrorCode *status) {
|
|||
ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,9 +1,9 @@
|
|||
//
|
||||
// file: repattrn.cpp
|
||||
// file: repattrn.cpp
|
||||
//
|
||||
/*
|
||||
***************************************************************************
|
||||
* Copyright (C) 2002-2006 International Business Machines Corporation *
|
||||
* Copyright (C) 2002-2007 International Business Machines Corporation *
|
||||
* and others. All rights reserved. *
|
||||
***************************************************************************
|
||||
*/
|
||||
|
@ -46,7 +46,7 @@ RegexPattern::RegexPattern() {
|
|||
//
|
||||
//--------------------------------------------------------------------------
|
||||
RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) {
|
||||
init();
|
||||
init();
|
||||
*this = other;
|
||||
}
|
||||
|
||||
|
@ -78,9 +78,9 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||
fFrameSize = other.fFrameSize;
|
||||
fDataSize = other.fDataSize;
|
||||
fMaxCaptureDigits = other.fMaxCaptureDigits;
|
||||
fStaticSets = other.fStaticSets;
|
||||
fStaticSets = other.fStaticSets;
|
||||
fStaticSets8 = other.fStaticSets8;
|
||||
|
||||
|
||||
fStartType = other.fStartType;
|
||||
fInitialStringIdx = other.fInitialStringIdx;
|
||||
fInitialStringLen = other.fInitialStringLen;
|
||||
|
@ -92,9 +92,9 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||
fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
|
||||
fGroupMap->assign(*other.fGroupMap, fDeferredStatus);
|
||||
|
||||
// Copy the Unicode Sets.
|
||||
// Copy the Unicode Sets.
|
||||
// Could be made more efficient if the sets were reference counted and shared,
|
||||
// but I doubt that pattern copying will be particularly common.
|
||||
// but I doubt that pattern copying will be particularly common.
|
||||
// Note: init() already added an empty element zero to fSets
|
||||
int32_t i;
|
||||
int32_t numSets = other.fSets->size();
|
||||
|
@ -135,7 +135,7 @@ void RegexPattern::init() {
|
|||
fFrameSize = 0;
|
||||
fDataSize = 0;
|
||||
fGroupMap = NULL;
|
||||
fMaxCaptureDigits = 1;
|
||||
fMaxCaptureDigits = 1;
|
||||
fStaticSets = NULL;
|
||||
fStaticSets8 = NULL;
|
||||
fStartType = START_NO_INFO;
|
||||
|
@ -144,7 +144,7 @@ void RegexPattern::init() {
|
|||
fInitialChars = NULL;
|
||||
fInitialChar = 0;
|
||||
fInitialChars8 = NULL;
|
||||
|
||||
|
||||
fCompiledPat = new UVector32(fDeferredStatus);
|
||||
fGroupMap = new UVector32(fDeferredStatus);
|
||||
fSets = new UVector(fDeferredStatus);
|
||||
|
@ -166,7 +166,7 @@ void RegexPattern::init() {
|
|||
|
||||
//--------------------------------------------------------------------------
|
||||
//
|
||||
// zap Delete everything owned by this RegexPattern.
|
||||
// zap Delete everything owned by this RegexPattern.
|
||||
//
|
||||
//--------------------------------------------------------------------------
|
||||
void RegexPattern::zap() {
|
||||
|
@ -208,7 +208,7 @@ RegexPattern::~RegexPattern() {
|
|||
// Clone
|
||||
//
|
||||
//--------------------------------------------------------------------------
|
||||
RegexPattern *RegexPattern::clone() const {
|
||||
RegexPattern *RegexPattern::clone() const {
|
||||
RegexPattern *copy = new RegexPattern(*this);
|
||||
return copy;
|
||||
}
|
||||
|
@ -229,7 +229,7 @@ UBool RegexPattern::operator ==(const RegexPattern &other) const {
|
|||
|
||||
//---------------------------------------------------------------------
|
||||
//
|
||||
// compile
|
||||
// compile
|
||||
//
|
||||
//---------------------------------------------------------------------
|
||||
RegexPattern * U_EXPORT2
|
||||
|
@ -244,7 +244,8 @@ RegexPattern::compile(const UnicodeString ®ex,
|
|||
}
|
||||
|
||||
const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
|
||||
UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD;
|
||||
UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD |
|
||||
UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES;
|
||||
|
||||
if ((flags & ~allFlags) != 0) {
|
||||
status = U_REGEX_INVALID_FLAG;
|
||||
|
@ -269,19 +270,24 @@ RegexPattern::compile(const UnicodeString ®ex,
|
|||
|
||||
RegexCompile compiler(This, status);
|
||||
compiler.compile(regex, pe, status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
delete This;
|
||||
This = NULL;
|
||||
}
|
||||
|
||||
return This;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// compile with default flags.
|
||||
//
|
||||
RegexPattern * U_EXPORT2
|
||||
RegexPattern::compile(const UnicodeString ®ex,
|
||||
UParseError &pe,
|
||||
UErrorCode &err)
|
||||
UErrorCode &err)
|
||||
{
|
||||
return compile(regex, 0, pe, err);
|
||||
return compile(regex, 0, pe, err);
|
||||
}
|
||||
|
||||
|
||||
|
@ -292,10 +298,10 @@ RegexPattern::compile(const UnicodeString ®ex,
|
|||
RegexPattern * U_EXPORT2
|
||||
RegexPattern::compile( const UnicodeString ®ex,
|
||||
uint32_t flags,
|
||||
UErrorCode &err)
|
||||
UErrorCode &err)
|
||||
{
|
||||
UParseError pe;
|
||||
return compile(regex, flags, pe, err);
|
||||
return compile(regex, flags, pe, err);
|
||||
}
|
||||
|
||||
|
||||
|
@ -326,7 +332,7 @@ RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
|
|||
|
||||
#if 0
|
||||
RegexMatcher *RegexPattern::matcher(const UChar * /*input*/,
|
||||
UErrorCode &status) const
|
||||
UErrorCode &status) const
|
||||
{
|
||||
/* This should never get called. The API with UnicodeString should be called instead. */
|
||||
if (U_SUCCESS(status)) {
|
||||
|
@ -352,7 +358,7 @@ RegexMatcher *RegexPattern::matcher(UErrorCode &status) const {
|
|||
return NULL;
|
||||
}
|
||||
|
||||
retMatcher = new RegexMatcher(this);
|
||||
retMatcher = new RegexMatcher(this);
|
||||
if (retMatcher == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
|
@ -437,17 +443,15 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
int32_t val = URX_VAL(op);
|
||||
int32_t type = URX_TYPE(op);
|
||||
int32_t pinnedType = type;
|
||||
if (pinnedType >= sizeof(opNames)/sizeof(char *)) {
|
||||
if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) {
|
||||
pinnedType = 0;
|
||||
}
|
||||
|
||||
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index, op, opNames[pinnedType]));
|
||||
switch (type) {
|
||||
case URX_NOP:
|
||||
case URX_DOTANY:
|
||||
case URX_DOTANY_ALL:
|
||||
case URX_DOTANY_PL:
|
||||
case URX_DOTANY_ALL_PL:
|
||||
case URX_FAIL:
|
||||
case URX_CARET:
|
||||
case URX_DOLLAR:
|
||||
|
@ -458,7 +462,7 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
case URX_CARET_M:
|
||||
// Types with no operand field of interest.
|
||||
break;
|
||||
|
||||
|
||||
case URX_RESERVED_OP:
|
||||
case URX_START_CAPTURE:
|
||||
case URX_END_CAPTURE:
|
||||
|
@ -494,12 +498,12 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
// types with an integer operand field.
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%d", val));
|
||||
break;
|
||||
|
||||
|
||||
case URX_ONECHAR:
|
||||
case URX_ONECHAR_I:
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?'));
|
||||
break;
|
||||
|
||||
|
||||
case URX_STRING:
|
||||
case URX_STRING_I:
|
||||
{
|
||||
|
@ -543,7 +547,7 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
}
|
||||
break;
|
||||
|
||||
|
||||
|
||||
default:
|
||||
REGEX_DUMP_DEBUG_PRINTF(("??????"));
|
||||
break;
|
||||
|
@ -554,7 +558,7 @@ void RegexPattern::dumpOp(int32_t index) const {
|
|||
|
||||
|
||||
#if defined(REGEX_DEBUG)
|
||||
U_CAPI void U_EXPORT2
|
||||
U_CAPI void U_EXPORT2
|
||||
RegexPatternDump(const RegexPattern *This) {
|
||||
int index;
|
||||
int i;
|
||||
|
@ -565,7 +569,7 @@ RegexPatternDump(const RegexPattern *This) {
|
|||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF(("\n"));
|
||||
REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen));
|
||||
REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType)));
|
||||
REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType)));
|
||||
if (This->fStartType == START_STRING) {
|
||||
REGEX_DUMP_DEBUG_PRINTF((" Initial match sting: \""));
|
||||
for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) {
|
||||
|
@ -580,7 +584,7 @@ RegexPatternDump(const RegexPattern *This) {
|
|||
REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : "));
|
||||
for (i=0; i<numSetChars; i++) {
|
||||
UChar32 c = This->fInitialChars->charAt(i);
|
||||
if (0x20<c && c <0x7e) {
|
||||
if (0x20<c && c <0x7e) {
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%c ", c));
|
||||
} else {
|
||||
REGEX_DUMP_DEBUG_PRINTF(("%#x ", c));
|
||||
|
@ -606,7 +610,7 @@ RegexPatternDump(const RegexPattern *This) {
|
|||
This->dumpOp(index);
|
||||
}
|
||||
REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
|
||||
};
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
#ifndef REGEX_H
|
||||
#define REGEX_H
|
||||
|
||||
//#define REGEX_DEBUG
|
||||
#define REGEX_DEBUG
|
||||
|
||||
/**
|
||||
* \file
|
||||
|
@ -36,7 +36,7 @@
|
|||
* operations, for search and replace operations, and for obtaining detailed
|
||||
* information about bounds of a match. </p>
|
||||
*
|
||||
* <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular
|
||||
* <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular
|
||||
* expression pattern strings application code can be simplified and the explicit
|
||||
* need for <code>RegexPattern</code> objects can usually be eliminated.
|
||||
* </p>
|
||||
|
@ -480,7 +480,7 @@ public:
|
|||
* critical that the string not be altered or deleted before use by the regular
|
||||
* expression operations is complete.
|
||||
*
|
||||
* @param regexp The Regular Expression to be compiled.
|
||||
* @param regexp The Regular Expression to be compiled.
|
||||
* @param input The string to match. The matcher retains a reference to the
|
||||
* caller's string; no copy is made.
|
||||
* @param flags Regular expression options, such as case insensitive matching.
|
||||
|
@ -517,7 +517,7 @@ public:
|
|||
|
||||
|
||||
/**
|
||||
* Attempts to match the entire input string against the pattern.
|
||||
* Attempts to match the entire input region against the pattern.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return TRUE if there is a match
|
||||
* @stable ICU 2.4
|
||||
|
@ -525,8 +525,10 @@ public:
|
|||
virtual UBool matches(UErrorCode &status);
|
||||
|
||||
/**
|
||||
* Attempts to match the input string, beginning at startIndex, against the pattern.
|
||||
* The match must extend to the end of the input string.
|
||||
* Resets the matcher, then attempts to match the input beginning
|
||||
* at the specified startIndex, and extending to the end of the input.
|
||||
* The input region is reset to include the entire input string.
|
||||
* A successful match must extend to the end of the input.
|
||||
* @param startIndex The input string index at which to begin matching.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return TRUE if there is a match
|
||||
|
@ -538,9 +540,10 @@ public:
|
|||
|
||||
|
||||
/**
|
||||
* Attempts to match the input string, starting from the beginning, against the pattern.
|
||||
* Like the matches() method, this function always starts at the beginning of the input string;
|
||||
* unlike that function, it does not require that the entire input string be matched.
|
||||
* Attempts to match the input string, starting from the beginning of the region,
|
||||
* against the pattern. Like the matches() method, this function
|
||||
* always starts at the beginning of the input region;
|
||||
* unlike that function, it does not require that the entire region be matched.
|
||||
*
|
||||
* <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
|
||||
* <code>end()</code>, and <code>group()</code> functions.</p>
|
||||
|
@ -699,6 +702,12 @@ public:
|
|||
* The effect is to remove any memory of previous matches,
|
||||
* and to cause subsequent find() operations to begin at
|
||||
* the specified position in the input string.
|
||||
* <p>
|
||||
* The matcher's region is reset to its default, which is the entire
|
||||
* input string.
|
||||
* <p>
|
||||
* An alternative to this function is to set a match region
|
||||
* beginning at the desired index.
|
||||
*
|
||||
* @return this RegexMatcher.
|
||||
* @stable ICU 2.8
|
||||
|
@ -709,13 +718,13 @@ public:
|
|||
/**
|
||||
* Resets this matcher with a new input string. This allows instances of RegexMatcher
|
||||
* to be reused, which is more efficient than creating a new RegexMatcher for
|
||||
* each input string to be processed.
|
||||
* each input string to be processed.
|
||||
* @param input The new string on which subsequent pattern matches will operate.
|
||||
* The matcher retains a reference to the callers string, and operates
|
||||
* directly on that. Ownership of the string remains with the caller.
|
||||
* Because no copy of the string is made, it is essential that the
|
||||
* caller not delete the string until after regexp operations on it
|
||||
* are done.
|
||||
* are done.
|
||||
* @return this RegexMatcher.
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
|
@ -743,6 +752,132 @@ public:
|
|||
* @stable ICU 2.4
|
||||
*/
|
||||
virtual const UnicodeString &input() const;
|
||||
|
||||
|
||||
|
||||
/** Sets the limits of this matcher's region.
|
||||
* The region is the part of the input string that will be searched to find a match.
|
||||
* Invoking this method resets the matcher, and then sets the region to start
|
||||
* at the index specified by the start parameter and end at the index specified
|
||||
* by the end parameter.
|
||||
*
|
||||
* Depending on the transparency and anchoring being used (see useTransparentBounds
|
||||
* and useAnchoringBounds), certain constructs such as anchors may behave differently
|
||||
* at or around the boundaries of the region
|
||||
*
|
||||
* The function will fail if start is greater than limit, or if either index
|
||||
* is less than zero or greater than the length of the string being matched.
|
||||
*
|
||||
* @param start The index to begin searches at.
|
||||
* @param limit The index to end searches at (exclusive).
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
virtual RegexMatcher &region(int32_t start, int32_t limit, UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Reports the start index of this matcher's region. The searches this matcher
|
||||
* conducts are limited to finding matches within regionStart (inclusive) and
|
||||
* regionEnd (exclusive).
|
||||
*
|
||||
* @return The starting index of this matcher's region.
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
virtual int32_t regionStart() const;
|
||||
|
||||
|
||||
/**
|
||||
* Reports the end (limit) index (exclusive) of this matcher's region. The searches
|
||||
* this matcher conducts are limited to finding matches within regionStart
|
||||
* (inclusive) and regionEnd (exclusive).
|
||||
*
|
||||
* @return The ending point of this matcher's region.
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
virtual int32_t regionEnd() const;
|
||||
|
||||
/**
|
||||
* Queries the transparency of region bounds for this matcher.
|
||||
* See useTransparentBounds for a description of transparent and opaque bounds.
|
||||
* By default, a matcher uses opaque region boundaries.
|
||||
*
|
||||
* @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds.
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
virtual UBool hasTransparentBounds() const;
|
||||
|
||||
/**
|
||||
* Sets the transparency of region bounds for this matcher.
|
||||
* Invoking this function with an argument of true will set this matcher to use transparent bounds.
|
||||
* If the boolean argument is false, then opaque bounds will be used.
|
||||
*
|
||||
* Using transparent bounds, the boundaries of this matcher's region are transparent
|
||||
* to lookahead, lookbehind, and boundary matching constructs. Those constructs can
|
||||
* see text beyond the boundaries of the region while checking for a match.
|
||||
*
|
||||
* With opaque bounds, no text outside of the matcher's region is visible to lookahead,
|
||||
* lookbehind, and boundary matching constructs.
|
||||
*
|
||||
* By default, a matcher uses opaque bounds.
|
||||
*
|
||||
* @param b TRUE for transparent bounds; FALSE for opaque bounds
|
||||
* @return This Matcher;
|
||||
* @draft ICU 4.0
|
||||
**/
|
||||
virtual RegexMatcher &useTransparentBounds(UBool b);
|
||||
|
||||
|
||||
/**
|
||||
* Return true if this matcher is using anchoring bounds.
|
||||
* By default, matchers use anchoring region bounds.
|
||||
*
|
||||
* @return TRUE if this matcher is using anchoring bounds.
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
virtual UBool hasAnchoringBounds() const;
|
||||
|
||||
/**
|
||||
* Set whether this matcher is using Anchoring Bounds for its region.
|
||||
* With anchoring bounds, pattern anchors such as ^ and $ will match at the start
|
||||
* and end of the region. Without Anchoring Bounds, anchors will only match at
|
||||
* the positions they would in the complete text.
|
||||
*
|
||||
* Anchoring Bounds are the default for regions.
|
||||
*
|
||||
* @param b TRUE to enable anchoring bounds; FALSE to disable them.
|
||||
* @return This Matcher
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
virtual RegexMatcher &useAnchoringBounds(UBool b);
|
||||
|
||||
/**
|
||||
* Return TRUE if the most recent matching operation touched the
|
||||
* end of the text being processed. In this case, additional input text could
|
||||
* change the results of that match.
|
||||
*
|
||||
* hitEnd() is defined for both successful and unsuccessful matches.
|
||||
* In either case hitEnd() will return TRUE if the end of the text was
|
||||
* reached at any point during the matching process.
|
||||
*
|
||||
* @return TRUE if the most recent match hit the end of input
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
virtual UBool hitEnd() const;
|
||||
|
||||
/**
|
||||
* Return TRUE if the most recent match succeeded and additional input could cause
|
||||
* it to fail. If this method returns false and a match was found, then more input
|
||||
* might change the match but the match won't be lost. If a match was not found,
|
||||
* then requireEnd has no meaning.
|
||||
*
|
||||
* @return TRUE if more input could cause the most recent match to no longer match.
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
virtual UBool requireEnd() const;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
|
@ -901,12 +1036,16 @@ private:
|
|||
RegexMatcher &operator =(const RegexMatcher &rhs);
|
||||
friend class RegexPattern;
|
||||
friend class RegexCImpl;
|
||||
public:
|
||||
/** @internal */
|
||||
void resetPreserveRegion(); // Reset matcher state, but preserve any region.
|
||||
private:
|
||||
|
||||
//
|
||||
// MatchAt This is the internal interface to the match engine itself.
|
||||
// Match status comes back in matcher member variables.
|
||||
//
|
||||
void MatchAt(int32_t startIdx, UErrorCode &status);
|
||||
void MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
|
||||
inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
|
||||
UBool isWordBoundary(int32_t pos); // perform Perl-like \b test
|
||||
UBool isUWordBoundary(int32_t pos); // perform RBBI based \b test
|
||||
|
@ -918,18 +1057,45 @@ private:
|
|||
const RegexPattern *fPattern;
|
||||
RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
|
||||
// should delete it when through.
|
||||
const UnicodeString *fInput;
|
||||
|
||||
UBool fMatch; // True if the last match was successful.
|
||||
const UnicodeString *fInput; // The text being matched. Is never NULL.
|
||||
|
||||
int32_t fRegionStart; // Start of the input region, default = 0.
|
||||
int32_t fRegionLimit; // End of input region, default to input.length.
|
||||
|
||||
int32_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
|
||||
int32_t fAnchorLimit; // See useAnchoringBounds
|
||||
|
||||
int32_t fLookStart; // Region bounds for look-ahead/behind and
|
||||
int32_t fLookLimit; // and other boundary tests. See
|
||||
// useTransparentBounds
|
||||
|
||||
int32_t fActiveStart; // Currently active bounds for matching.
|
||||
int32_t fActiveLimit; // Usually is the same as region, but
|
||||
// is changed to fLookStart/Limit when
|
||||
// entering look around regions.
|
||||
|
||||
UBool fTransparentBounds; // True if using transparent bounds.
|
||||
UBool fAnchoringBounds; // True if using anchoring bounds.
|
||||
|
||||
UBool fMatch; // True if the last attempted match was successful.
|
||||
int32_t fMatchStart; // Position of the start of the most recent match
|
||||
int32_t fMatchEnd; // First position after the end of the most recent match
|
||||
// Zero if no previous match, even when a region
|
||||
// is active.
|
||||
int32_t fLastMatchEnd; // First position after the end of the previous match,
|
||||
// or -1 if there was no previous match.
|
||||
int32_t fLastReplaceEnd; // First position after the end of the previous appendReplacement();
|
||||
int32_t fAppendPosition; // First position after the end of the previous
|
||||
// appendReplacement(). As described by the
|
||||
// JavaDoc for Java Matcher, where it is called
|
||||
// "append position"
|
||||
UBool fHitEnd; // True if the last match touched the end of input.
|
||||
UBool fRequireEnd; // True if the last match required end-of-input
|
||||
// (matched $ or Z)
|
||||
|
||||
UVector32 *fStack;
|
||||
REStackFrame *fFrame; // After finding a match, the last active stack
|
||||
// frame, which will contain the capture group results.
|
||||
REStackFrame *fFrame; // After finding a match, the last active stack frame,
|
||||
// which will contain the capture group results.
|
||||
// NOT valid while match engine is running.
|
||||
|
||||
int32_t *fData; // Data area for use by the compiled pattern.
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2004-2006, International Business Machines
|
||||
* Copyright (C) 2004-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* file name: regex.h
|
||||
|
@ -59,12 +59,32 @@ typedef enum URegexpFlag{
|
|||
/** If set, '.' matches line terminators, otherwise '.' matching stops at line end.
|
||||
* @stable ICU 2.4 */
|
||||
UREGEX_DOTALL = 32,
|
||||
|
||||
/** If set, treat the entire pattern as a literal string.
|
||||
* Metacharacters or escape sequences in the input sequence will be given
|
||||
* no special meaning.
|
||||
*
|
||||
* The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact
|
||||
* on matching when used in conjunction with this flag.
|
||||
* The other flags become superfluous.
|
||||
* TODO: say which escapes are still handled; anything Java does
|
||||
* early (\u) we should still do.
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
UREGEX_LITERAL = 16,
|
||||
|
||||
/** Control behavior of "$" and "^"
|
||||
* If set, recognize line terminators within string,
|
||||
* otherwise, match only at start and end of input string.
|
||||
* @stable ICU 2.4 */
|
||||
UREGEX_MULTILINE = 8,
|
||||
|
||||
/** Unix-only line endings.
|
||||
* When this mode is enabled, only \u000a is recognized as a line ending
|
||||
* in the behavior of ., ^, and $.
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
UREGEX_UNIX_LINES = 1,
|
||||
|
||||
/** Unicode word boundaries.
|
||||
* If set, \b uses the Unicode TR 29 definition of word boundaries.
|
||||
|
@ -73,7 +93,17 @@ typedef enum URegexpFlag{
|
|||
* http://unicode.org/reports/tr29/#Word_Boundaries
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
UREGEX_UWORD = 256
|
||||
UREGEX_UWORD = 256,
|
||||
|
||||
/** Error on Unrecognized backslash escapes.
|
||||
* If set, fail with an error on patterns that contain
|
||||
* backslash-escaped ASCII letters without a known special
|
||||
* meaning. If this flag is not set, these
|
||||
* escaped letters represent themselves.
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512
|
||||
|
||||
} URegexpFlag;
|
||||
|
||||
/**
|
||||
|
@ -251,11 +281,21 @@ uregex_getText(URegularExpression *regexp,
|
|||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Attempts to match the input string, beginning at startIndex, against the pattern.
|
||||
* To succeed, the match must extend to the end of the input string.
|
||||
* Attempts to match the input string against the pattern.
|
||||
* To succeed, the match must extend to the end of the string,
|
||||
* or cover the complete match region.
|
||||
*
|
||||
* If startIndex >= zero the match operation starts at the specified
|
||||
* index and must extend to the end of the input string. Any region
|
||||
* that has been specified is reset.
|
||||
*
|
||||
* If startIndex == -1 the match must cover the input region, or the entire
|
||||
* input string if no region has been set. This directly corresponds to
|
||||
* Matcher.matches() in Java
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param startIndex The input string index at which to begin matching.
|
||||
* @param startIndex The input string index at which to begin matching, or -1
|
||||
* to match the input Region.
|
||||
* @param status Receives errors detected by this function.
|
||||
* @return TRUE if there is a match
|
||||
* @stable ICU 3.0
|
||||
|
@ -270,12 +310,20 @@ uregex_matches(URegularExpression *regexp,
|
|||
* The match may be of any length, and is not required to extend to the end
|
||||
* of the input string. Contrast with uregex_matches().
|
||||
*
|
||||
* <p>If startIndex is >= 0 any input region that was set for this
|
||||
* URegularExpression is reset before the operation begins.
|
||||
*
|
||||
* <p>If the specified starting index == -1 the match begins at the start of the input
|
||||
* region, or at the start of the full string if no region has been specified.
|
||||
* This corresponds directly with Matcher.lookingAt() in Java.
|
||||
*
|
||||
* <p>If the match succeeds then more information can be obtained via the
|
||||
* <code>uregexp_start()</code>, <code>uregexp_end()</code>,
|
||||
* and <code>uregexp_group()</code> functions.</p>
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param startIndex The input string index at which to begin matching.
|
||||
* @param startIndex The input string index at which to begin matching, or
|
||||
* -1 to match the Input Region
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return TRUE if there is a match.
|
||||
* @stable ICU 3.0
|
||||
|
@ -287,12 +335,19 @@ uregex_lookingAt(URegularExpression *regexp,
|
|||
|
||||
/**
|
||||
* Find the first matching substring of the input string that matches the pattern.
|
||||
* The search for a match begins at the specified index.
|
||||
* If startIndex is >= zero the search for a match begins at the specified index,
|
||||
* and any match region is reset. This corresponds directly with
|
||||
* Matcher.find(startIndex) in Java.
|
||||
*
|
||||
* If startIndex == -1 the search begins at the start of the input region,
|
||||
* or at the start of the full string if no region has been specified.
|
||||
*
|
||||
* If a match is found, <code>uregex_start(), uregex_end()</code>, and
|
||||
* <code>uregex_group()</code> will provide more information regarding the match.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param startIndex The position in the input string to begin the search
|
||||
* @param startIndex The position in the input string to begin the search, or
|
||||
* -1 to search within the Input Region.
|
||||
* @param status A reference to a UErrorCode to receive any errors.
|
||||
* @return TRUE if a match is found.
|
||||
* @stable ICU 3.0
|
||||
|
@ -303,10 +358,10 @@ uregex_find(URegularExpression *regexp,
|
|||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Find the next pattern match in the input string.
|
||||
* Begin searching the input at the location following the end of
|
||||
* the previous match, or at the start of the string if there is no previous match.
|
||||
* If a match is found, <code>uregex_start(), uregex_end()</code>, and
|
||||
* Find the next pattern match in the input string. Begin searching
|
||||
* the input at the location following the end of the previous match,
|
||||
* or at the start of the string (or region) if there is no
|
||||
* previous match. If a match is found, <code>uregex_start(), uregex_end()</code>, and
|
||||
* <code>uregex_group()</code> will provide more information regarding the match.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
|
@ -395,7 +450,8 @@ uregex_end(URegularExpression *regexp,
|
|||
* Reset any saved state from the previous match. Has the effect of
|
||||
* causing uregex_findNext to begin at the specified index, and causing
|
||||
* uregex_start(), uregex_end() and uregex_group() to return an error
|
||||
* indicating that there is no match information available.
|
||||
* indicating that there is no match information available. Clears any
|
||||
* match region that may have been set.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param index The position in the text at which a
|
||||
|
@ -407,6 +463,166 @@ U_STABLE void U_EXPORT2
|
|||
uregex_reset(URegularExpression *regexp,
|
||||
int32_t index,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
/** Sets the limits of the matching region for this URegularExpression.
|
||||
* The region is the part of the input string that will be considered when matching.
|
||||
* Invoking this method resets any saved state from the previous match,
|
||||
* then sets the region to start at the index specified by the start parameter
|
||||
* and end at the index specified by the end parameter.
|
||||
*
|
||||
* Depending on the transparency and anchoring being used (see useTransparentBounds
|
||||
* and useAnchoringBounds), certain constructs such as anchors may behave differently
|
||||
* at or around the boundaries of the region
|
||||
*
|
||||
* The function will fail if start is greater than limit, or if either index
|
||||
* is less than zero or greater than the length of the string being matched.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param regionStart The index to begin searches at.
|
||||
* @param regionLimit The index to end searches at (exclusive).
|
||||
* @param status A pointer to a UErrorCode to receive any errors.
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
U_DRAFT void U_EXPORT2
|
||||
uregex_setRegion(URegularExpression *regexp,
|
||||
int32_t regionStart,
|
||||
int32_t regionLimit,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Reports the start index of the matching region. Any matches found are limited to
|
||||
* the region bounded by regionStart (inclusive) and regionEnd (exclusive).
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param status A pointer to a UErrorCode to receive any errors.
|
||||
* @return The starting index of this matcher's region.
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
U_DRAFT int32_t U_EXPORT2
|
||||
uregex_regionStart(const URegularExpression *regexp,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Reports the end index (exclusive) of the matching region for this URegularExpression.
|
||||
* Any matches found are limited to the region bounded by regionStart (inclusive)
|
||||
* and regionEnd (exclusive).
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param status A pointer to a UErrorCode to receive any errors.
|
||||
* @return The ending point of this matcher's region.
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
U_DRAFT int32_t U_EXPORT2
|
||||
uregex_regionEnd(const URegularExpression *regexp,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Queries the transparency of region bounds for this URegularExpression.
|
||||
* See useTransparentBounds for a description of transparent and opaque bounds.
|
||||
* By default, matching boundaries are opaque.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param status A pointer to a UErrorCode to receive any errors.
|
||||
* @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds.
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
U_DRAFT UBool U_EXPORT2
|
||||
uregex_hasTransparentBounds(const URegularExpression *regexp,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
/**
|
||||
* Sets the transparency of region bounds for this URegularExpression.
|
||||
* Invoking this function with an argument of TRUE will set matches to use transparent bounds.
|
||||
* If the boolean argument is FALSE, then opaque bounds will be used.
|
||||
*
|
||||
* Using transparent bounds, the boundaries of the matching region are transparent
|
||||
* to lookahead, lookbehind, and boundary matching constructs. Those constructs can
|
||||
* see text beyond the boundaries of the region while checking for a match.
|
||||
*
|
||||
* With opaque bounds, no text outside of the matching region is visible to lookahead,
|
||||
* lookbehind, and boundary matching constructs.
|
||||
*
|
||||
* By default, opaque bounds are used.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param b TRUE for transparent bounds; FALSE for opaque bounds
|
||||
* @param status A pointer to a UErrorCode to receive any errors.
|
||||
* @draft ICU 4.0
|
||||
**/
|
||||
U_DRAFT void U_EXPORT2
|
||||
uregex_useTransparentBounds(URegularExpression *regexp,
|
||||
UBool b,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
/**
|
||||
* Return true if this URegularExpression is using anchoring bounds.
|
||||
* By default, anchoring region bounds are used.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param status A pointer to a UErrorCode to receive any errors.
|
||||
* @return TRUE if this matcher is using anchoring bounds.
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
U_DRAFT UBool U_EXPORT2
|
||||
uregex_hasAnchoringBounds(const URegularExpression *regexp,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
/**
|
||||
* Set whether this URegularExpression is using Anchoring Bounds for its region.
|
||||
* With anchoring bounds, pattern anchors such as ^ and $ will match at the start
|
||||
* and end of the region. Without Anchoring Bounds, anchors will only match at
|
||||
* the positions they would in the complete text.
|
||||
*
|
||||
* Anchoring Bounds are the default for regions.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param b TRUE to enable anchoring bounds; FALSE to disable them.
|
||||
* @param status A pointer to a UErrorCode to receive any errors.
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
U_DRAFT void U_EXPORT2
|
||||
uregex_useAnchoringBounds(URegularExpression *regexp,
|
||||
UBool b,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Return TRUE if the most recent matching operation touched the
|
||||
* end of the text being processed. In this case, additional input text could
|
||||
* change the results of that match.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param status A pointer to a UErrorCode to receive any errors.
|
||||
* @return TRUE if the most recent match hit the end of input
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
U_DRAFT UBool U_EXPORT2
|
||||
uregex_hitEnd(const URegularExpression *regexp,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Return TRUE if the most recent match succeeded and additional input could cause
|
||||
* it to fail. If this function returns false and a match was found, then more input
|
||||
* might change the match but the match won't be lost. If a match was not found,
|
||||
* then requireEnd has no meaning.
|
||||
*
|
||||
* @param regexp The compiled regular expression.
|
||||
* @param status A pointer to a UErrorCode to receive any errors.
|
||||
* @return TRUE if more input could cause the most recent match to no longer match.
|
||||
* @draft ICU 4.0
|
||||
*/
|
||||
U_DRAFT UBool U_EXPORT2
|
||||
uregex_requireEnd(const URegularExpression *regexp,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Replaces every substring of the input that matches the pattern
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2004-2006, International Business Machines
|
||||
* Copyright (C) 2004-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: regex.cpp
|
||||
|
@ -304,10 +304,15 @@ U_CAPI UBool U_EXPORT2
|
|||
uregex_matches(URegularExpression *regexp,
|
||||
int32_t startIndex,
|
||||
UErrorCode *status) {
|
||||
UBool result = FALSE;
|
||||
if (validateRE(regexp, status) == FALSE) {
|
||||
return FALSE;
|
||||
return result;
|
||||
}
|
||||
if (startIndex == -1) {
|
||||
result = regexp->fMatcher->matches(*status);
|
||||
} else {
|
||||
result = regexp->fMatcher->matches(startIndex, *status);
|
||||
}
|
||||
UBool result = regexp->fMatcher->matches(startIndex, *status);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -322,10 +327,15 @@ U_CAPI UBool U_EXPORT2
|
|||
uregex_lookingAt(URegularExpression *regexp,
|
||||
int32_t startIndex,
|
||||
UErrorCode *status) {
|
||||
UBool result = FALSE;
|
||||
if (validateRE(regexp, status) == FALSE) {
|
||||
return FALSE;
|
||||
return result;
|
||||
}
|
||||
if (startIndex == -1) {
|
||||
result = regexp->fMatcher->lookingAt(*status);
|
||||
} else {
|
||||
result = regexp->fMatcher->lookingAt(startIndex, *status);
|
||||
}
|
||||
UBool result = regexp->fMatcher->lookingAt(startIndex, *status);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -340,10 +350,16 @@ U_CAPI UBool U_EXPORT2
|
|||
uregex_find(URegularExpression *regexp,
|
||||
int32_t startIndex,
|
||||
UErrorCode *status) {
|
||||
UBool result = FALSE;
|
||||
if (validateRE(regexp, status) == FALSE) {
|
||||
return FALSE;
|
||||
return result;
|
||||
}
|
||||
if (startIndex == -1) {
|
||||
regexp->fMatcher->resetPreserveRegion();
|
||||
result = regexp->fMatcher->find();
|
||||
} else {
|
||||
result = regexp->fMatcher->find(startIndex, *status);
|
||||
}
|
||||
UBool result = regexp->fMatcher->find(startIndex, *status);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -479,6 +495,145 @@ uregex_reset(URegularExpression *regexp,
|
|||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_setRegion
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
U_CAPI void U_EXPORT2
|
||||
uregex_setRegion(URegularExpression *regexp,
|
||||
int32_t regionStart,
|
||||
int32_t regionLimit,
|
||||
UErrorCode *status) {
|
||||
if (validateRE(regexp, status) == FALSE) {
|
||||
return;
|
||||
}
|
||||
regexp->fMatcher->region(regionStart, regionLimit, *status);
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_regionStart
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
// Returns the matcher's current regionStart() value, or 0 when
// validateRE() returns FALSE for regexp/status.
U_CAPI int32_t U_EXPORT2
uregex_regionStart(const  URegularExpression   *regexp,
                          UErrorCode           *status) {
    int32_t start = 0;
    if (validateRE(regexp, status)) {
        start = regexp->fMatcher->regionStart();
    }
    return start;
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_regionEnd
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
// Returns the matcher's current regionEnd() value, or 0 when
// validateRE() returns FALSE for regexp/status.
U_CAPI int32_t U_EXPORT2
uregex_regionEnd(const  URegularExpression   *regexp,
                        UErrorCode           *status) {
    int32_t limit = 0;
    if (validateRE(regexp, status)) {
        limit = regexp->fMatcher->regionEnd();
    }
    return limit;
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_hasTransparentBounds
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
// Reports the matcher's hasTransparentBounds() setting.
// Returns FALSE when validateRE() returns FALSE for regexp/status.
U_CAPI UBool U_EXPORT2
uregex_hasTransparentBounds(const  URegularExpression   *regexp,
                                   UErrorCode           *status) {
    UBool transparent = FALSE;
    if (validateRE(regexp, status)) {
        transparent = regexp->fMatcher->hasTransparentBounds();
    }
    return transparent;
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_useTransparentBounds
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
// Passes b through to the matcher's useTransparentBounds().
// Nothing is done when validateRE() returns FALSE for regexp/status.
U_CAPI void U_EXPORT2
uregex_useTransparentBounds(URegularExpression    *regexp,
                            UBool                  b,
                            UErrorCode            *status) {
    if (validateRE(regexp, status)) {
        regexp->fMatcher->useTransparentBounds(b);
    }
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_hasAnchoringBounds
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
// Reports the matcher's hasAnchoringBounds() setting.
// Returns FALSE when validateRE() returns FALSE for regexp/status.
U_CAPI UBool U_EXPORT2
uregex_hasAnchoringBounds(const  URegularExpression   *regexp,
                                 UErrorCode           *status) {
    UBool anchoring = FALSE;
    if (validateRE(regexp, status)) {
        anchoring = regexp->fMatcher->hasAnchoringBounds();
    }
    return anchoring;
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_useAnchoringBounds
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
// Passes b through to the matcher's useAnchoringBounds().
// Nothing is done when validateRE() returns FALSE for regexp/status.
U_CAPI void U_EXPORT2
uregex_useAnchoringBounds(URegularExpression    *regexp,
                          UBool                  b,
                          UErrorCode            *status) {
    if (validateRE(regexp, status)) {
        regexp->fMatcher->useAnchoringBounds(b);
    }
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_hitEnd
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
// Returns the matcher's hitEnd() result, or FALSE when validateRE()
// returns FALSE for regexp/status.
U_CAPI UBool U_EXPORT2
uregex_hitEnd(const  URegularExpression   *regexp,
                     UErrorCode           *status) {
    UBool hit = FALSE;
    if (validateRE(regexp, status)) {
        hit = regexp->fMatcher->hitEnd();
    }
    return hit;
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_requireEnd
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
// Returns the matcher's requireEnd() result, or FALSE when validateRE()
// returns FALSE for regexp/status.
U_CAPI UBool U_EXPORT2
uregex_requireEnd(const  URegularExpression   *regexp,
                         UErrorCode           *status) {
    UBool required = FALSE;
    if (validateRE(regexp, status)) {
        required = regexp->fMatcher->requireEnd();
    }
    return required;
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// uregex_replaceAll
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2004-2006, International Business Machines Corporation and
|
||||
* Copyright (c) 2004-2007, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/********************************************************************************
|
||||
|
@ -34,6 +34,36 @@ log_err("Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_error
|
|||
#define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
|
||||
log_err("Test Failure at file %s, line %d\n", __FILE__, __LINE__);}}
|
||||
|
||||
/*
|
||||
* TEST_SETUP and TEST_TEARDOWN
|
||||
* macros to handle the boilerplate around setting up regex test cases.
|
||||
* parameters to setup:
|
||||
* pattern: The regex pattern, a (char *) null terminated C string.
|
||||
* testString: The string data, also a (char *) C string.
|
||||
* flags: Regex flags to set when compiling the pattern
|
||||
*
|
||||
* Put arbitrary test code between SETUP and TEARDOWN.
|
||||
* "re" is the compiled, ready-to-go regular expression.
|
||||
*/
|
||||
/* TEST_SETUP opens two brace scopes (an outer block and an "if (U_SUCCESS(status))" body)
 * that are only closed by a following TEST_TEARDOWN, so the two must always be used as a pair.
 * It assigns to "status" and "re" without declaring them; both must already exist in the
 * enclosing scope.  srcString is malloc'ed here and released by TEST_TEARDOWN.
 * The arguments are expanded more than once (testString appears in two strlen() calls
 * and in the copy). */
#define TEST_SETUP(pattern, testString, flags) { \
|
||||
UChar *srcString = NULL; \
|
||||
status = U_ZERO_ERROR; \
|
||||
re = uregex_openC(pattern, flags, NULL, &status); \
|
||||
TEST_ASSERT_SUCCESS(status); \
|
||||
srcString = (UChar *)malloc((strlen(testString)+2)*sizeof(UChar)); \
|
||||
u_uastrncpy(srcString, testString, strlen(testString)+1); \
|
||||
uregex_setText(re, srcString, -1, &status); \
|
||||
TEST_ASSERT_SUCCESS(status); \
|
||||
if (U_SUCCESS(status)) {
|
||||
|
||||
/* TEST_TEARDOWN closes the "if" body opened by TEST_SETUP, re-checks status so that a
 * failure left behind by the test code is reported, then closes the regular expression,
 * frees the text buffer and closes the outer block. */
#define TEST_TEARDOWN \
|
||||
} \
|
||||
TEST_ASSERT_SUCCESS(status); \
|
||||
uregex_close(re); \
|
||||
free(srcString); \
|
||||
}
|
||||
|
||||
|
||||
static void test_assert_string(const char *expected, const UChar *actual, UBool nulTerm, const char *file, int line) {
|
||||
char buf_inside_macro[120];
|
||||
int32_t len = (int32_t)strlen(expected);
|
||||
|
@ -544,6 +574,135 @@ static void TestRegexCAPI(void) {
|
|||
uregex_close(re);
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* Regions
|
||||
*/
|
||||
|
||||
|
||||
/* SetRegion(), getRegion() do something */
|
||||
TEST_SETUP(".*", "0123456789ABCDEF", 0)
|
||||
UChar resultString[40];
|
||||
TEST_ASSERT(uregex_regionStart(re, &status) == 0);
|
||||
TEST_ASSERT(uregex_regionEnd(re, &status) == 16);
|
||||
uregex_setRegion(re, 3, 6, &status);
|
||||
TEST_ASSERT(uregex_regionStart(re, &status) == 3);
|
||||
TEST_ASSERT(uregex_regionEnd(re, &status) == 6);
|
||||
TEST_ASSERT(uregex_findNext(re, &status));
|
||||
TEST_ASSERT(uregex_group(re, 0, resultString, sizeof(resultString)/2, &status) == 3)
|
||||
TEST_ASSERT_STRING("345", resultString, TRUE);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/* find(start=-1) uses regions */
|
||||
TEST_SETUP(".*", "0123456789ABCDEF", 0);
|
||||
uregex_setRegion(re, 4, 6, &status);
|
||||
TEST_ASSERT(uregex_find(re, -1, &status) == TRUE);
|
||||
TEST_ASSERT(uregex_start(re, 0, &status) == 4);
|
||||
TEST_ASSERT(uregex_end(re, 0, &status) == 6);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/* find (start >=0) does not use regions */
|
||||
TEST_SETUP(".*", "0123456789ABCDEF", 0);
|
||||
uregex_setRegion(re, 4, 6, &status);
|
||||
TEST_ASSERT(uregex_find(re, 0, &status) == TRUE);
|
||||
TEST_ASSERT(uregex_start(re, 0, &status) == 0);
|
||||
TEST_ASSERT(uregex_end(re, 0, &status) == 16);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/* findNext() obeys regions */
|
||||
TEST_SETUP(".", "0123456789ABCDEF", 0);
|
||||
uregex_setRegion(re, 4, 6, &status);
|
||||
TEST_ASSERT(uregex_findNext(re,&status) == TRUE);
|
||||
TEST_ASSERT(uregex_start(re, 0, &status) == 4);
|
||||
TEST_ASSERT(uregex_findNext(re, &status) == TRUE);
|
||||
TEST_ASSERT(uregex_start(re, 0, &status) == 5);
|
||||
TEST_ASSERT(uregex_findNext(re, &status) == FALSE);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/* matches(start=-1) uses regions */
|
||||
/* Also, verify that non-greedy *? succeeds in finding the full match. */
|
||||
TEST_SETUP(".*?", "0123456789ABCDEF", 0);
|
||||
uregex_setRegion(re, 4, 6, &status);
|
||||
TEST_ASSERT(uregex_matches(re, -1, &status) == TRUE);
|
||||
TEST_ASSERT(uregex_start(re, 0, &status) == 4);
|
||||
TEST_ASSERT(uregex_end(re, 0, &status) == 6);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/* matches (start >=0) does not use regions */
|
||||
TEST_SETUP(".*?", "0123456789ABCDEF", 0);
|
||||
uregex_setRegion(re, 4, 6, &status);
|
||||
TEST_ASSERT(uregex_matches(re, 0, &status) == TRUE);
|
||||
TEST_ASSERT(uregex_start(re, 0, &status) == 0);
|
||||
TEST_ASSERT(uregex_end(re, 0, &status) == 16);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/* lookingAt(start=-1) uses regions */
|
||||
/* Also, verify that non-greedy *? finds the first (shortest) match. */
|
||||
TEST_SETUP(".*?", "0123456789ABCDEF", 0);
|
||||
uregex_setRegion(re, 4, 6, &status);
|
||||
TEST_ASSERT(uregex_lookingAt(re, -1, &status) == TRUE);
|
||||
TEST_ASSERT(uregex_start(re, 0, &status) == 4);
|
||||
TEST_ASSERT(uregex_end(re, 0, &status) == 4);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/* lookingAt (start >=0) does not use regions */
|
||||
TEST_SETUP(".*?", "0123456789ABCDEF", 0);
|
||||
uregex_setRegion(re, 4, 6, &status);
|
||||
TEST_ASSERT(uregex_lookingAt(re, 0, &status) == TRUE);
|
||||
TEST_ASSERT(uregex_start(re, 0, &status) == 0);
|
||||
TEST_ASSERT(uregex_end(re, 0, &status) == 0);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/* hitEnd() */
|
||||
TEST_SETUP("[a-f]*", "abcdefghij", 0);
|
||||
TEST_ASSERT(uregex_find(re, 0, &status) == TRUE);
|
||||
TEST_ASSERT(uregex_hitEnd(re, &status) == FALSE);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
TEST_SETUP("[a-f]*", "abcdef", 0);
|
||||
TEST_ASSERT(uregex_find(re, 0, &status) == TRUE);
|
||||
TEST_ASSERT(uregex_hitEnd(re, &status) == TRUE);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/* requireEnd */
|
||||
TEST_SETUP("abcd", "abcd", 0);
|
||||
TEST_ASSERT(uregex_find(re, 0, &status) == TRUE);
|
||||
TEST_ASSERT(uregex_requireEnd(re, &status) == FALSE);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
TEST_SETUP("abcd$", "abcd", 0);
|
||||
TEST_ASSERT(uregex_find(re, 0, &status) == TRUE);
|
||||
TEST_ASSERT(uregex_requireEnd(re, &status) == TRUE);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/* anchoringBounds */
|
||||
TEST_SETUP("abc$", "abcdef", 0);
|
||||
TEST_ASSERT(uregex_hasAnchoringBounds(re, &status) == TRUE);
|
||||
uregex_useAnchoringBounds(re, FALSE, &status);
|
||||
TEST_ASSERT(uregex_hasAnchoringBounds(re, &status) == FALSE);
|
||||
|
||||
TEST_ASSERT(uregex_find(re, -1, &status) == FALSE);
|
||||
uregex_useAnchoringBounds(re, TRUE, &status);
|
||||
uregex_setRegion(re, 0, 3, &status);
|
||||
TEST_ASSERT(uregex_find(re, -1, &status) == TRUE);
|
||||
TEST_ASSERT(uregex_end(re, 0, &status) == 3);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
/* Transparent Bounds */
|
||||
TEST_SETUP("abc(?=def)", "abcdef", 0);
|
||||
TEST_ASSERT(uregex_hasTransparentBounds(re, &status) == FALSE);
|
||||
uregex_useTransparentBounds(re, TRUE, &status);
|
||||
TEST_ASSERT(uregex_hasTransparentBounds(re, &status) == TRUE);
|
||||
|
||||
uregex_useTransparentBounds(re, FALSE, &status);
|
||||
TEST_ASSERT(uregex_find(re, -1, &status) == TRUE); /* No Region */
|
||||
uregex_setRegion(re, 0, 3, &status);
|
||||
TEST_ASSERT(uregex_find(re, -1, &status) == FALSE); /* with region, opaque bounds */
|
||||
uregex_useTransparentBounds(re, TRUE, &status);
|
||||
TEST_ASSERT(uregex_find(re, -1, &status) == TRUE); /* with region, transparent bounds */
|
||||
TEST_ASSERT(uregex_end(re, 0, &status) == 3);
|
||||
TEST_TEARDOWN;
|
||||
|
||||
|
||||
/*
|
||||
* replaceFirst()
|
||||
|
|
|
@ -175,209 +175,6 @@ UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking,
|
|||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//
|
||||
// regex_find(pattern, inputString, lineNumber)
|
||||
//
|
||||
// function to simplify writing tests regex tests.
|
||||
//
|
||||
// The input text is unescaped. The pattern is not.
|
||||
// The input text is marked with the expected match positions
|
||||
// <0>text <1> more text </1> </0>
|
||||
// The <n> </n> tags are removed before trying the match.
|
||||
// The tags mark the start and end of the match and of any capture groups.
|
||||
//
|
||||
//
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
|
||||
// Set a value into a UVector at position specified by a decimal number in
|
||||
// a UnicodeString. This is a utility function needed by the actual test function,
|
||||
// which follows.
|
||||
static void set(UVector &vec, int32_t val, UnicodeString index) {
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
int32_t idx = 0;
|
||||
for (int32_t i=0; i<index.length(); i++) {
|
||||
int32_t d=u_charDigitValue(index.charAt(i));
|
||||
if (d<0) {return;}
|
||||
idx = idx*10 + d;
|
||||
}
|
||||
while (vec.size()<idx+1) {vec.addElement(-1, status);}
|
||||
vec.setElementAt(val, idx);
|
||||
}
|
||||
|
||||
void RegexTest::regex_find(const UnicodeString &pattern,
|
||||
const UnicodeString &flags,
|
||||
const UnicodeString &inputString,
|
||||
int32_t line) {
|
||||
UnicodeString unEscapedInput;
|
||||
UnicodeString deTaggedInput;
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UParseError pe;
|
||||
RegexPattern *parsePat = NULL;
|
||||
RegexMatcher *parseMatcher = NULL;
|
||||
RegexPattern *callerPattern = NULL;
|
||||
RegexMatcher *matcher = NULL;
|
||||
UVector groupStarts(status);
|
||||
UVector groupEnds(status);
|
||||
UBool isMatch = FALSE;
|
||||
UBool failed = FALSE;
|
||||
int32_t numFinds;
|
||||
int32_t i;
|
||||
|
||||
//
|
||||
// Compile the caller's pattern
|
||||
//
|
||||
uint32_t bflags = 0;
|
||||
if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
|
||||
bflags |= UREGEX_CASE_INSENSITIVE;
|
||||
}
|
||||
if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
|
||||
bflags |= UREGEX_COMMENTS;
|
||||
}
|
||||
if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
|
||||
bflags |= UREGEX_DOTALL;
|
||||
}
|
||||
if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
|
||||
bflags |= UREGEX_MULTILINE;
|
||||
}
|
||||
|
||||
|
||||
callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
|
||||
if (status != U_ZERO_ERROR) {
|
||||
#if UCONFIG_NO_BREAK_ITERATION==1
|
||||
// 'v' test flag means that the test pattern should not compile if ICU was configured
|
||||
// to not include break iteration. RBBI is needed for Unicode word boundaries.
|
||||
if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
#endif
|
||||
errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
|
||||
if (flags.indexOf((UChar)'d') >= 0) {
|
||||
RegexPatternDump(callerPattern);
|
||||
}
|
||||
|
||||
//
|
||||
// Number of times find() should be called on the test string, default to 1
|
||||
//
|
||||
numFinds = 1;
|
||||
for (i=2; i<=9; i++) {
|
||||
if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
|
||||
if (numFinds != 1) {
|
||||
errln("Line %d: more than one digit flag. Scanning %d.", line, i);
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
numFinds = i;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Find the tags in the input data, remove them, and record the group boundary
|
||||
// positions.
|
||||
//
|
||||
parsePat = RegexPattern::compile("<(/?)([0-9]+)>", 0, pe, status);
|
||||
REGEX_CHECK_STATUS_L(line);
|
||||
|
||||
unEscapedInput = inputString.unescape();
|
||||
parseMatcher = parsePat->matcher(unEscapedInput, status);
|
||||
REGEX_CHECK_STATUS_L(line);
|
||||
while(parseMatcher->find()) {
|
||||
parseMatcher->appendReplacement(deTaggedInput, "", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
UnicodeString groupNum = parseMatcher->group(2, status);
|
||||
if (parseMatcher->group(1, status) == "/") {
|
||||
// close tag
|
||||
set(groupEnds, deTaggedInput.length(), groupNum);
|
||||
} else {
|
||||
set(groupStarts, deTaggedInput.length(), groupNum);
|
||||
}
|
||||
}
|
||||
parseMatcher->appendTail(deTaggedInput);
|
||||
REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
|
||||
|
||||
|
||||
//
|
||||
// Do a find on the de-tagged input using the caller's pattern
|
||||
//
|
||||
matcher = callerPattern->matcher(deTaggedInput, status);
|
||||
REGEX_CHECK_STATUS_L(line);
|
||||
if (flags.indexOf((UChar)'t') >= 0) {
|
||||
matcher->setTrace(TRUE);
|
||||
}
|
||||
|
||||
for (i=0; i<numFinds; i++) {
|
||||
isMatch = matcher->find();
|
||||
}
|
||||
matcher->setTrace(FALSE);
|
||||
|
||||
//
|
||||
// Match up the groups from the find() with the groups from the tags
|
||||
//
|
||||
|
||||
// number of tags should match number of groups from find operation.
|
||||
// matcher->groupCount does not include group 0, the entire match, hence the +1.
|
||||
// G option in test means that capture group data is not available in the
|
||||
// expected results, so the check needs to be suppressed.
|
||||
if (isMatch == FALSE && groupStarts.size() != 0) {
|
||||
errln("Error at line %d: Match expected, but none found.\n", line);
|
||||
failed = TRUE;
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
|
||||
if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
|
||||
// Only check for match / no match. Don't check capture groups.
|
||||
if (isMatch && groupStarts.size() == 0) {
|
||||
errln("Error at line %d: No match expected, but one found.\n", line);
|
||||
failed = TRUE;
|
||||
}
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
|
||||
for (i=0; i<=matcher->groupCount(); i++) {
|
||||
int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
|
||||
if (matcher->start(i, status) != expectedStart) {
|
||||
errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
|
||||
line, i, expectedStart, matcher->start(i, status));
|
||||
failed = TRUE;
|
||||
goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
|
||||
}
|
||||
int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
|
||||
if (matcher->end(i, status) != expectedEnd) {
|
||||
errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
|
||||
line, i, expectedEnd, matcher->end(i, status));
|
||||
failed = TRUE;
|
||||
// Error on end position; keep going; real error is probably yet to come as group
|
||||
// end positions work from end of the input data towards the front.
|
||||
}
|
||||
}
|
||||
if ( matcher->groupCount()+1 < groupStarts.size()) {
|
||||
errln("Error at line %d: Expected %d capture groups, found %d.",
|
||||
line, groupStarts.size()-1, matcher->groupCount());
|
||||
failed = TRUE;
|
||||
}
|
||||
|
||||
cleanupAndReturn:
|
||||
if (failed) {
|
||||
errln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
|
||||
+flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
|
||||
// callerPattern->dump();
|
||||
}
|
||||
delete parseMatcher;
|
||||
delete parsePat;
|
||||
delete matcher;
|
||||
delete callerPattern;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//
|
||||
|
@ -938,6 +735,87 @@ void RegexTest::API_Match() {
|
|||
delete m;
|
||||
delete p;
|
||||
}
|
||||
|
||||
//
|
||||
// Regions
|
||||
//
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeString testString("This is test data");
|
||||
RegexMatcher m(".*", testString, 0, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(m.regionStart() == 0);
|
||||
REGEX_ASSERT(m.regionEnd() == testString.length());
|
||||
REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
|
||||
REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
|
||||
|
||||
m.region(2,4, status);
|
||||
REGEX_CHECK_STATUS;
|
||||
REGEX_ASSERT(m.matches(status));
|
||||
REGEX_ASSERT(m.start(status)==2);
|
||||
REGEX_ASSERT(m.end(status)==4);
|
||||
REGEX_CHECK_STATUS;
|
||||
|
||||
m.reset();
|
||||
REGEX_ASSERT(m.regionStart() == 0);
|
||||
REGEX_ASSERT(m.regionEnd() == testString.length());
|
||||
|
||||
UnicodeString shorterString("short");
|
||||
m.reset(shorterString);
|
||||
REGEX_ASSERT(m.regionStart() == 0);
|
||||
REGEX_ASSERT(m.regionEnd() == shorterString.length());
|
||||
|
||||
REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
|
||||
REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
|
||||
REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
|
||||
REGEX_ASSERT(&m == &m.reset());
|
||||
REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
|
||||
|
||||
REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
|
||||
REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
|
||||
REGEX_ASSERT(&m == &m.reset());
|
||||
REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
|
||||
|
||||
REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
|
||||
REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
|
||||
REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
|
||||
REGEX_ASSERT(&m == &m.reset());
|
||||
REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
|
||||
|
||||
REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
|
||||
REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
|
||||
REGEX_ASSERT(&m == &m.reset());
|
||||
REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
|
||||
|
||||
}
|
||||
|
||||
//
|
||||
// hitEnd() and requireEnd()
|
||||
//
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeString testString("aabb");
|
||||
RegexMatcher m1(".*", testString, 0, status);
|
||||
REGEX_ASSERT(m1.lookingAt(status) == TRUE);
|
||||
REGEX_ASSERT(m1.hitEnd() == TRUE);
|
||||
REGEX_ASSERT(m1.requireEnd() == FALSE);
|
||||
REGEX_CHECK_STATUS;
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
RegexMatcher m2("a*", testString, 0, status);
|
||||
REGEX_ASSERT(m2.lookingAt(status) == TRUE);
|
||||
REGEX_ASSERT(m2.hitEnd() == FALSE);
|
||||
REGEX_ASSERT(m2.requireEnd() == FALSE);
|
||||
REGEX_CHECK_STATUS;
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
RegexMatcher m3(".*$", testString, 0, status);
|
||||
REGEX_ASSERT(m3.lookingAt(status) == TRUE);
|
||||
REGEX_ASSERT(m3.hitEnd() == TRUE);
|
||||
REGEX_ASSERT(m3.requireEnd() == TRUE);
|
||||
REGEX_CHECK_STATUS;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Compilation error on reset with UChar *
|
||||
|
@ -1470,7 +1348,7 @@ void RegexTest::Extended() {
|
|||
}
|
||||
|
||||
int32_t len;
|
||||
UChar *testData = ReadAndConvertFile(srcPath, len, status);
|
||||
UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
|
||||
if (U_FAILURE(status)) {
|
||||
return; /* something went wrong, error already output */
|
||||
}
|
||||
|
@ -1482,7 +1360,7 @@ void RegexTest::Extended() {
|
|||
|
||||
RegexMatcher quotedStuffMat("\\s*([\\'\\\"/])(.*?)\\1", 0, status);
|
||||
RegexMatcher commentMat ("\\s*(#.*)?$", 0, status);
|
||||
RegexMatcher flagsMat ("\\s*([ixsmdtGv2-9]*)([:letter:]*)", 0, status);
|
||||
RegexMatcher flagsMat ("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)", 0, status);
|
||||
|
||||
RegexMatcher lineMat("(.*?)\\r?\\n", testString, 0, status);
|
||||
UnicodeString testPattern; // The pattern for test from the test file.
|
||||
|
@ -1581,6 +1459,295 @@ void RegexTest::Extended() {
|
|||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//
|
||||
// regex_find(pattern, flags, inputString, lineNumber)
|
||||
//
|
||||
// Function to run a single test from the Extended (data driven) tests.
|
||||
// See file test/testdata/regextst.txt for a description of the
|
||||
// pattern and inputString fields, and the allowed flags.
|
||||
// lineNumber is the source line in regextst.txt of the test.
|
||||
//
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
|
||||
// Set a value into a UVector at position specified by a decimal number in
|
||||
// a UnicodeString. This is a utility function needed by the actual test function,
|
||||
// which follows.
|
||||
// Store val in vec at the slot whose number is written, in decimal digits,
// in index.  If index contains any character that is not a digit the call
// does nothing.  vec is grown as needed, with new slots filled with -1,
// so that the target slot exists before it is written.
static void set(UVector &vec, int32_t val, UnicodeString index) {
    UErrorCode  status = U_ZERO_ERROR;
    int32_t     slot   = 0;

    // Accumulate the decimal value of the index string, one digit at a time.
    for (int32_t pos=0; pos<index.length(); pos++) {
        int32_t digit = u_charDigitValue(index.charAt(pos));
        if (digit < 0) {
            return;
        }
        slot = slot*10 + digit;
    }

    // Pad with -1 until the vector is long enough to hold the target slot.
    while (slot+1 > vec.size()) {
        vec.addElement(-1, status);
    }
    vec.setElementAt(val, slot);
}
|
||||
|
||||
void RegexTest::regex_find(const UnicodeString &pattern,
|
||||
const UnicodeString &flags,
|
||||
const UnicodeString &inputString,
|
||||
int32_t line) {
|
||||
UnicodeString unEscapedInput;
|
||||
UnicodeString deTaggedInput;
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UParseError pe;
|
||||
RegexPattern *parsePat = NULL;
|
||||
RegexMatcher *parseMatcher = NULL;
|
||||
RegexPattern *callerPattern = NULL;
|
||||
RegexMatcher *matcher = NULL;
|
||||
UVector groupStarts(status);
|
||||
UVector groupEnds(status);
|
||||
UBool isMatch = FALSE;
|
||||
UBool failed = FALSE;
|
||||
int32_t numFinds;
|
||||
int32_t i;
|
||||
UBool useMatchesFunc = FALSE;
|
||||
UBool useLookingAtFunc = FALSE;
|
||||
int32_t regionStart = -1;
|
||||
int32_t regionEnd = -1;
|
||||
|
||||
//
|
||||
// Compile the caller's pattern
|
||||
//
|
||||
uint32_t bflags = 0;
|
||||
if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
|
||||
bflags |= UREGEX_CASE_INSENSITIVE;
|
||||
}
|
||||
if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
|
||||
bflags |= UREGEX_COMMENTS;
|
||||
}
|
||||
if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
|
||||
bflags |= UREGEX_DOTALL;
|
||||
}
|
||||
if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
|
||||
bflags |= UREGEX_MULTILINE;
|
||||
}
|
||||
|
||||
if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
|
||||
bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
|
||||
}
|
||||
if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
|
||||
bflags |= UREGEX_UNIX_LINES;
|
||||
}
|
||||
|
||||
|
||||
callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
|
||||
if (status != U_ZERO_ERROR) {
|
||||
#if UCONFIG_NO_BREAK_ITERATION==1
|
||||
// 'v' test flag means that the test pattern should not compile if ICU was configured
|
||||
// to not include break iteration. RBBI is needed for Unicode word boundaries.
|
||||
if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
#endif
|
||||
if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
|
||||
// Expected pattern compilation error.
|
||||
if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
|
||||
logln("Pattern Compile returns \"%s\"", u_errorName(status));
|
||||
}
|
||||
goto cleanupAndReturn;
|
||||
} else {
|
||||
// Unexpected pattern compilation error.
|
||||
errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
}
|
||||
|
||||
if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
|
||||
RegexPatternDump(callerPattern);
|
||||
}
|
||||
|
||||
if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
|
||||
errln("Expected, but did not get, a pattern compilation error.");
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Number of times find() should be called on the test string, default to 1
|
||||
//
|
||||
numFinds = 1;
|
||||
for (i=2; i<=9; i++) {
|
||||
if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
|
||||
if (numFinds != 1) {
|
||||
errln("Line %d: more than one digit flag. Scanning %d.", line, i);
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
numFinds = i;
|
||||
}
|
||||
}
|
||||
|
||||
// 'M' flag. Use matches() instead of find()
|
||||
if (flags.indexOf((UChar)0x4d) >= 0) {
|
||||
useMatchesFunc = TRUE;
|
||||
}
|
||||
if (flags.indexOf((UChar)0x4c) >= 0) {
|
||||
useLookingAtFunc = TRUE;
|
||||
}
|
||||
|
||||
//
|
||||
// Find the tags in the input data, remove them, and record the group boundary
|
||||
// positions.
|
||||
//
|
||||
parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
|
||||
REGEX_CHECK_STATUS_L(line);
|
||||
|
||||
unEscapedInput = inputString.unescape();
|
||||
parseMatcher = parsePat->matcher(unEscapedInput, status);
|
||||
REGEX_CHECK_STATUS_L(line);
|
||||
while(parseMatcher->find()) {
|
||||
parseMatcher->appendReplacement(deTaggedInput, "", status);
|
||||
REGEX_CHECK_STATUS;
|
||||
UnicodeString groupNum = parseMatcher->group(2, status);
|
||||
if (groupNum == "r") {
|
||||
// <r> or </r>, a region specification within the string
|
||||
if (parseMatcher->group(1, status) == "/") {
|
||||
regionEnd = deTaggedInput.length();
|
||||
} else {
|
||||
regionStart = deTaggedInput.length();
|
||||
}
|
||||
} else {
|
||||
// <digits> or </digits>, a group match boundary tag.
|
||||
if (parseMatcher->group(1, status) == "/") {
|
||||
set(groupEnds, deTaggedInput.length(), groupNum);
|
||||
} else {
|
||||
set(groupStarts, deTaggedInput.length(), groupNum);
|
||||
}
|
||||
}
|
||||
}
|
||||
parseMatcher->appendTail(deTaggedInput);
|
||||
REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
|
||||
if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
|
||||
errln("mismatched <r> tags");
|
||||
failed = TRUE;
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Configure the matcher according to the flags specified with this test.
|
||||
//
|
||||
matcher = callerPattern->matcher(deTaggedInput, status);
|
||||
REGEX_CHECK_STATUS_L(line);
|
||||
if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
|
||||
matcher->setTrace(TRUE);
|
||||
}
|
||||
if (regionStart>=0) {
|
||||
matcher->region(regionStart, regionEnd, status);
|
||||
REGEX_CHECK_STATUS_L(line);
|
||||
}
|
||||
if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
|
||||
matcher->useAnchoringBounds(FALSE);
|
||||
}
|
||||
if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
|
||||
matcher->useTransparentBounds(TRUE);
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Do a find on the de-tagged input using the caller's pattern
|
||||
// TODO: error on count>1 and not find().
|
||||
// error on both matches() and lookingAt().
|
||||
//
|
||||
for (i=0; i<numFinds; i++) {
|
||||
if (useMatchesFunc) {
|
||||
isMatch = matcher->matches(status);
|
||||
} else if (useLookingAtFunc) {
|
||||
isMatch = matcher->lookingAt(status);
|
||||
} else {
|
||||
isMatch = matcher->find();
|
||||
}
|
||||
}
|
||||
matcher->setTrace(FALSE);
|
||||
|
||||
//
|
||||
// Match up the groups from the find() with the groups from the tags
|
||||
//
|
||||
|
||||
// number of tags should match number of groups from find operation.
|
||||
// matcher->groupCount does not include group 0, the entire match, hence the +1.
|
||||
// G option in test means that capture group data is not available in the
|
||||
// expected results, so the check needs to be suppressed.
|
||||
if (isMatch == FALSE && groupStarts.size() != 0) {
|
||||
errln("Error at line %d: Match expected, but none found.\n", line);
|
||||
failed = TRUE;
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
|
||||
if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
|
||||
// Only check for match / no match. Don't check capture groups.
|
||||
if (isMatch && groupStarts.size() == 0) {
|
||||
errln("Error at line %d: No match expected, but one found.\n", line);
|
||||
failed = TRUE;
|
||||
}
|
||||
goto cleanupAndReturn;
|
||||
}
|
||||
|
||||
for (i=0; i<=matcher->groupCount(); i++) {
|
||||
int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
|
||||
if (matcher->start(i, status) != expectedStart) {
|
||||
errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
|
||||
line, i, expectedStart, matcher->start(i, status));
|
||||
failed = TRUE;
|
||||
goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
|
||||
}
|
||||
int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
|
||||
if (matcher->end(i, status) != expectedEnd) {
|
||||
errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
|
||||
line, i, expectedEnd, matcher->end(i, status));
|
||||
failed = TRUE;
|
||||
// Error on end position; keep going; real error is probably yet to come as group
|
||||
// end positions work from end of the input data towards the front.
|
||||
}
|
||||
}
|
||||
if ( matcher->groupCount()+1 < groupStarts.size()) {
|
||||
errln("Error at line %d: Expected %d capture groups, found %d.",
|
||||
line, groupStarts.size()-1, matcher->groupCount());
|
||||
failed = TRUE;
|
||||
}
|
||||
|
||||
if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
|
||||
matcher->requireEnd() == TRUE) {
|
||||
errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
|
||||
failed = TRUE;
|
||||
}
|
||||
if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
|
||||
matcher->requireEnd() == FALSE) {
|
||||
errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
|
||||
failed = TRUE;
|
||||
}
|
||||
if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
|
||||
matcher->hitEnd() == TRUE) {
|
||||
errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
|
||||
failed = TRUE;
|
||||
}
|
||||
if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
|
||||
matcher->hitEnd() == FALSE) {
|
||||
errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
|
||||
failed = TRUE;
|
||||
}
|
||||
|
||||
|
||||
cleanupAndReturn:
|
||||
if (failed) {
|
||||
errln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
|
||||
+flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
|
||||
// callerPattern->dump();
|
||||
}
|
||||
delete parseMatcher;
|
||||
delete parsePat;
|
||||
delete matcher;
|
||||
delete callerPattern;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//
|
||||
// Errors Check for error handling in patterns.
|
||||
|
@ -1633,10 +1800,6 @@ void RegexTest::Errors() {
|
|||
REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
|
||||
REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
|
||||
|
||||
|
||||
// UnicodeSet containing a string
|
||||
REGEX_ERR("abc[{def}]xyz", 1, 10, U_REGEX_SET_CONTAINS_STRING);
|
||||
|
||||
// Ticket 5389
|
||||
REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
|
||||
|
||||
|
@ -1649,7 +1812,8 @@ void RegexTest::Errors() {
|
|||
// in one big UChar * buffer, which the caller must delete.
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen, UErrorCode &status) {
|
||||
UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
|
||||
const char *defEncoding, UErrorCode &status) {
|
||||
UChar *retPtr = NULL;
|
||||
char *fileBuf = NULL;
|
||||
UConverter* conv = NULL;
|
||||
|
@ -1698,6 +1862,11 @@ UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen, UError
|
|||
if(encoding!=NULL ){
|
||||
fileBufC += signatureLength;
|
||||
fileSize -= signatureLength;
|
||||
} else {
|
||||
encoding = defEncoding;
|
||||
if (strcmp(encoding, "utf-8") == 0) {
|
||||
errln("file %s is missing its BOM", fileName);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
|
@ -1804,7 +1973,7 @@ void RegexTest::PerlTests() {
|
|||
}
|
||||
|
||||
int32_t len;
|
||||
UChar *testData = ReadAndConvertFile(srcPath, len, status);
|
||||
UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
|
||||
if (U_FAILURE(status)) {
|
||||
return; /* something went wrong, error already output */
|
||||
}
|
||||
|
@ -1981,6 +2150,14 @@ void RegexTest::PerlTests() {
|
|||
lineNum, expected?"":"no ", found?"":"no " );
|
||||
continue;
|
||||
}
|
||||
|
||||
// Don't try to check expected results if there is no match.
|
||||
// (Some have stuff in the expected fields)
|
||||
if (!found) {
|
||||
delete testMat;
|
||||
delete testPat;
|
||||
continue;
|
||||
}
|
||||
|
||||
//
|
||||
// Interpret the Perl expression from the fourth field of the data file,
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2002-2007, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
@ -16,7 +16,7 @@
|
|||
|
||||
class RegexTest: public IntlTest {
|
||||
public:
|
||||
|
||||
|
||||
RegexTest();
|
||||
virtual ~RegexTest();
|
||||
|
||||
|
@ -37,7 +37,7 @@ public:
|
|||
const UnicodeString &input, int32_t line);
|
||||
virtual void regex_err(const char *pat, int32_t errline, int32_t errcol,
|
||||
UErrorCode expectedStatus, int32_t line);
|
||||
virtual UChar *ReadAndConvertFile(const char *fileName, int32_t &len, UErrorCode &status);
|
||||
virtual UChar *ReadAndConvertFile(const char *fileName, int32_t &len, const char *charset, UErrorCode &status);
|
||||
virtual const char *getPath(char buffer[2048], const char *filename);
|
||||
|
||||
};
|
||||
|
|
112
icu4c/source/test/testdata/re_tests.txt
vendored
112
icu4c/source/test/testdata/re_tests.txt
vendored
|
@ -62,8 +62,8 @@ ab{0,1}c abc y $& abc
|
|||
^abc$ aabc n - -
|
||||
abc$ aabc y $& abc
|
||||
abc$ aabcd n - -
|
||||
^ abc y $&
|
||||
$ abc y $&
|
||||
^ abc y $&
|
||||
$ abc y $&
|
||||
a.c abc y $& abc
|
||||
a.c axc y $& axc
|
||||
a.*c axyzc y $& axyzc
|
||||
|
@ -79,13 +79,13 @@ a[b-a] - c - Invalid [] range "b-a"
|
|||
a[]b - ci - Unmatched [
|
||||
a[ - c - Unmatched [
|
||||
a] a] y $& a]
|
||||
a[]]b a]b yi $& a]b ICU makes [] into an empty set.
|
||||
a[]]b a]b y $& a]b
|
||||
a[^bc]d aed y $& aed
|
||||
a[^bc]d abd n - -
|
||||
a[^-b]c adc yi $& adc ICU [] set rules
|
||||
a[^-b]c a-c ni - - ICU [] set rules
|
||||
a[^-b]c adc y $& adc
|
||||
a[^-b]c a-c n - -
|
||||
a[^]b]c a]c n - -
|
||||
a[^]b]c adc yi $& adc ICU [] set rules.
|
||||
a[^]b]c adc y $& adc
|
||||
\ba\b a- y - -
|
||||
\ba\b -a y - -
|
||||
\ba\b -a- y - -
|
||||
|
@ -113,18 +113,18 @@ a\Sb a-b y - -
|
|||
\d - n - -
|
||||
\D 1 n - -
|
||||
\D - y - -
|
||||
[\w] a iy - -
|
||||
[\w] - in - -
|
||||
[\W] a in - -
|
||||
[\W] - iy - -
|
||||
a[\s]b a b iy - -
|
||||
a[\s]b a-b in - -
|
||||
a[\S]b a b in - -
|
||||
a[\S]b a-b iy - -
|
||||
[\d] 1 iy - -
|
||||
[\d] - in - -
|
||||
[\D] 1 in - -
|
||||
[\D] - iy - -
|
||||
[\w] a y - -
|
||||
[\w] - n - -
|
||||
[\W] a n - -
|
||||
[\W] - y - -
|
||||
a[\s]b a b y - -
|
||||
a[\s]b a-b n - -
|
||||
a[\S]b a b n - -
|
||||
a[\S]b a-b y - -
|
||||
[\d] 1 y - -
|
||||
[\d] - n - -
|
||||
[\D] 1 n - -
|
||||
[\D] - y - -
|
||||
ab|cd abc y $& ab
|
||||
ab|cd abcd y $& ab
|
||||
()ef def y $&-$1 ef-
|
||||
|
@ -167,7 +167,7 @@ a.+?c abcabc y $& abc
|
|||
)( - c - Unmatched )
|
||||
[^ab]* cde y $& cde
|
||||
abc n - -
|
||||
a* y $&
|
||||
a* y $&
|
||||
([abc])*d abbbcd y $&-$1 abbbcd-c
|
||||
([abc])*bcd abcd y $&-$1 abcd-a
|
||||
a|b|c|d|e e y $& e
|
||||
|
@ -292,8 +292,8 @@ a[-]?c ac y $& ac
|
|||
'^abc'i ABCC y $& ABC
|
||||
'^abc$'i AABC n - -
|
||||
'abc$'i AABC y $& ABC
|
||||
'^'i ABC y $&
|
||||
'$'i ABC y $&
|
||||
'^'i ABC y $&
|
||||
'$'i ABC y $&
|
||||
'a.c'i ABC y $& ABC
|
||||
'a.c'i AXC y $& AXC
|
||||
'a.*?c'i AXYZC y $& AXYZC
|
||||
|
@ -309,13 +309,13 @@ a[-]?c ac y $& ac
|
|||
'a[]b'i - ci - Unmatched [
|
||||
'a['i - c - Unmatched [
|
||||
'a]'i A] y $& A]
|
||||
'a[]]b'i A]B yi $& A]B
|
||||
'a[]]b'i A]B y $& A]B
|
||||
'a[^bc]d'i AED y $& AED
|
||||
'a[^bc]d'i ABD n - -
|
||||
'a[^-b]c'i ADC yi $& ADC ICU [] set rules
|
||||
'a[^-b]c'i A-C ni - - ICU [] set rules
|
||||
'a[^-b]c'i ADC y $& ADC
|
||||
'a[^-b]c'i A-C n - -
|
||||
'a[^]b]c'i A]C n - -
|
||||
'a[^]b]c'i ADC yi $& ADC
|
||||
'a[^]b]c'i ADC y $& ADC
|
||||
'ab|cd'i ABC y $& AB
|
||||
'ab|cd'i ABCD y $& AB
|
||||
'()ef'i DEF y $&-$1 EF-
|
||||
|
@ -347,7 +347,7 @@ a[-]?c ac y $& ac
|
|||
')('i - c - Unmatched )
|
||||
'[^ab]*'i CDE y $& CDE
|
||||
'abc'i n - -
|
||||
'a*'i y $&
|
||||
'a*'i y $&
|
||||
'([abc])*d'i ABBBCD y $&-$1 ABBBCD-C
|
||||
'([abc])*bcd'i ABCD y $&-$1 ABCD-A
|
||||
'a|b|c|d|e'i E y $& E
|
||||
|
@ -357,7 +357,7 @@ a[-]?c ac y $& ac
|
|||
'ab*'i XAYABBBZ y $& A
|
||||
'(ab|cd)e'i ABCDE y $&-$1 CDE-CD
|
||||
'[abhgefdc]ij'i HIJ y $& HIJ
|
||||
'^(ab|cd)e'i ABCDE ni x$1y XY
|
||||
'^(ab|cd)e'i ABCDE n x$1y XY
|
||||
'(abc|)ef'i ABCDEF y $&-$1 EF-
|
||||
'(a|b)c*d'i ABCD y $&-$1 BCD-B
|
||||
'(ab|ab*)bc'i ABC y $&-$1 ABC-A
|
||||
|
@ -486,7 +486,7 @@ foo\w*\d{4}baz foobar1234baz y $& foobar1234baz
|
|||
a(?{})b cabd y $& ab
|
||||
a(?{)b - c - Sequence (?{...}) not terminated or not {}-balanced
|
||||
a(?{{})b - c - Sequence (?{...}) not terminated or not {}-balanced
|
||||
a(?{}})b - c -
|
||||
a(?{}})b - c -
|
||||
a(?{"{"})b - c - Sequence (?{...}) not terminated or not {}-balanced
|
||||
a(?{"\{"})b cabd y $& ab
|
||||
a(?{"{"}})b - c - Unmatched right curly bracket
|
||||
|
@ -546,50 +546,50 @@ $(?<=^(a)) a y $1 a
|
|||
^(?=(a+?))\1ab aaab n - -
|
||||
([\w:]+::)?(\w+)$ abcd: n - -
|
||||
([\w:]+::)?(\w+)$ abcd y $1-$2 -abcd
|
||||
([\w:]+::)?(\w+)$ xy:z:::abcd iy $1-$2 xy:z:::-abcd
|
||||
([\w:]+::)?(\w+)$ xy:z:::abcd y $1-$2 xy:z:::-abcd
|
||||
^[^bcd]*(c+) aexycd y $1 c
|
||||
(a*)b+ caab y $1 aa
|
||||
([\w:]+::)?(\w+)$ abcd: n - -
|
||||
([\w:]+::)?(\w+)$ abcd y $1-$2 -abcd
|
||||
([\w:]+::)?(\w+)$ xy:z:::abcd iy $1-$2 xy:z:::-abcd
|
||||
([\w:]+::)?(\w+)$ xy:z:::abcd y $1-$2 xy:z:::-abcd
|
||||
^[^bcd]*(c+) aexycd y $1 c
|
||||
(?{$a=2})a*aa(?{local$a=$a+1})k*c(?{$b=$a}) yaaxxaaaacd y $b 3
|
||||
(?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a}) yaaxxaaaacd y $b 4
|
||||
(>a+)ab aaab n - -
|
||||
(?>a+)b aaab y - -
|
||||
([[:]+) a:[b]: iy $1 :[
|
||||
([[=]+) a=[b]= iy $1 =[
|
||||
([[.]+) a.[b]. iy $1 .[
|
||||
([[:]+) a:[b]: yi $1 :[ Java and ICU dont escape [[xyz
|
||||
([[=]+) a=[b]= yi $1 =[ Java and ICU dont escape [[xyz
|
||||
([[.]+) a.[b]. yi $1 .[ Java and ICU dont escape [[xyz
|
||||
[a[:xyz: - c - Unmatched [
|
||||
[a[:xyz:] - c - POSIX class [:xyz:] unknown
|
||||
[a[:]b[:c] abc iy $& abc
|
||||
[a[:]b[:c] abc yi $& abc Java and ICU embedded [ is nested set
|
||||
([a[:xyz:]b]+) pbaq c - POSIX class [:xyz:] unknown
|
||||
[a[:]b[:c] abc iy $& abc
|
||||
[a[:]b[:c] abc iy $& abc Java and ICU embedded [ is nested set
|
||||
([[:alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd
|
||||
([[:alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy
|
||||
([[:alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy
|
||||
([[:ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__-- ${nulnul}
|
||||
([[:cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ${nulnul}
|
||||
([[:digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 01
|
||||
([[:graph:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy__--
|
||||
([[:cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${nulnul}
|
||||
([[:digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 01
|
||||
([[:graph:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__--
|
||||
([[:lower:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 cd
|
||||
([[:print:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy__--
|
||||
([[:punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 __--
|
||||
([[:space:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1
|
||||
([[:print:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__--
|
||||
([[:punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 __--
|
||||
([[:space:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1
|
||||
([[:word:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy__
|
||||
([[:upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB
|
||||
([[:xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01
|
||||
([[:xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01
|
||||
([[:^alpha:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 01
|
||||
([[:^alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 __-- ${nulnul}${ffff}
|
||||
([[:^alnum:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 __-- ${nulnul}${ffff}
|
||||
([[:^ascii:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${ffff}
|
||||
([[:^cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy__--
|
||||
([[:^digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd
|
||||
([[:^cntrl:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__--
|
||||
([[:^digit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd
|
||||
([[:^lower:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 AB
|
||||
([[:^print:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ${nulnul}${ffff}
|
||||
([[:^punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy
|
||||
([[:^space:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 ABcd01Xy__--
|
||||
([[:^print:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ${nulnul}${ffff}
|
||||
([[:^punct:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy
|
||||
([[:^space:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 ABcd01Xy__--
|
||||
([[:^word:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 -- ${nulnul}${ffff}
|
||||
([[:^upper:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 cd01
|
||||
([[:^xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} yi $1 Xy__-- ${nulnul}${ffff}
|
||||
([[:^xdigit:]]+) ABcd01Xy__-- ${nulnul}${ffff} y $1 Xy__-- ${nulnul}${ffff}
|
||||
[[:foo:]] - c - POSIX class [:foo:] unknown
|
||||
[[:^foo:]] - c - POSIX class [:^foo:] unknown
|
||||
((?>a+)b) aaab y $1 aaab
|
||||
|
@ -823,11 +823,11 @@ foo.bart foo.bart y - -
|
|||
.[X][X](.+)+[X] bbbbXXXaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ni - -
|
||||
tt+$ xxxtt y - -
|
||||
([a-\d]+) za-9z yi $1 a-9
|
||||
([\d-z]+) a0-za yi $1 0-z
|
||||
([\d-\s]+) a0- z yi $1 0-
|
||||
([a-[:digit:]]+) za-9z iy $1 a-9
|
||||
([[:digit:]-z]+) =0-z= iy $1 0-z
|
||||
([[:digit:]-[:alpha:]]+) =0-z= iy $1 0-z
|
||||
([\d-z]+) a0-za y $1 0-z
|
||||
([\d-\s]+) a0- z y $1 0-
|
||||
([a-[:digit:]]+) za-9z y $1 a-9
|
||||
([[:digit:]-z]+) =0-z= y $1 0-z
|
||||
([[:digit:]-[:alpha:]]+) =0-z= iy $1 0-z Set difference in ICU
|
||||
\GX.*X aaaXbX n - -
|
||||
(\d+\.\d+) 3.1415926 y $1 3.1415926
|
||||
(\ba.{0,10}br) have a web browser y $1 a web br
|
||||
|
@ -857,7 +857,7 @@ tt+$ xxxtt y - -
|
|||
^([^,]{0,3},){0,3}d aaa,b,c,d y $1 c,
|
||||
(?i) y - -
|
||||
'(?!\A)x'm a\nxb\n y - -
|
||||
^(a(b)?)+$ aba iy -$1-$2- -a--
|
||||
^(a(b)?)+$ aba yi -$1-$2- -a-- Java disagrees. Not clear who is right.
|
||||
'^.{9}abc.*\n'm 123\nabcabcabcabc\n y - -
|
||||
^(a)?a$ a y -$1- --
|
||||
^(a)?(?(1)a|b)+$ a n - -
|
||||
|
|
707
icu4c/source/test/testdata/regextst.txt
vendored
707
icu4c/source/test/testdata/regextst.txt
vendored
|
@ -1,7 +1,7 @@
|
|||
# Copyright (c) 2001-2007 International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file:
|
||||
# file:
|
||||
#
|
||||
# ICU regular expression test cases.
|
||||
#
|
||||
|
@ -10,24 +10,161 @@
|
|||
# <pattern> = "<regular expression pattern>"
|
||||
# <match string> = "<tagged string>"
|
||||
# the quotes on the pattern and match string can be " or ' or /
|
||||
# <tagged string> = text, with the start and end of each
|
||||
# <tagged string> = text, with the start and end of each
|
||||
# capture group tagged with <n>...</n>. The overall match,
|
||||
# if any, is group 0, as in <0>matched text</0>
|
||||
# <flags> = any combination of
|
||||
#
|
||||
# A region can be specified with <r>...</r> tags.
|
||||
#
|
||||
# <flags> = any combination of
|
||||
# i case insensitive match
|
||||
# x free spacing and comments
|
||||
# s dot-matches-all mode
|
||||
# m multi-line mode. $ and ^ match at embedded new-lines
|
||||
# m multi-line mode.
|
||||
# ($ and ^ match at embedded new-lines)
|
||||
# D Unix Lines mode (only recognize 0x0a as new-line)
|
||||
# v If icu configured without break iteration, this
|
||||
# regex test pattern should not compile.
|
||||
# e set the UREGEX_ERROR_ON_UNKNOWN_ESCAPES flag
|
||||
# d dump the compiled pattern
|
||||
# t trace operation of match engine.
|
||||
# 2-9 a digit between 2 and 9, specifies the number of
|
||||
# 2-9 a digit between 2 and 9, specifies the number of
|
||||
# times to execute find(). The expected results are
|
||||
# for the last find() in the sequence.
|
||||
# G Only check match / no match. Do not check capture groups.
|
||||
# E Pattern compilation error expected
|
||||
# L Use LookingAt() rather than find()
|
||||
# M Use matches() rather than find().
|
||||
#
|
||||
# a Use non-Anchoring Bounds.
|
||||
# b Use Transparent Bounds.
|
||||
# The a and b options only make a difference if
|
||||
# a <r>region</r> has been specified in the string.
|
||||
# z|Z hitEnd was expected(z) or not expected (Z).
|
||||
# With neither, hitEnd is not checked.
|
||||
# y|Y Require End expected(y) or not expected (Y).
|
||||
#
|
||||
# White space must be present between the flags and the match string.
|
||||
#
|
||||
|
||||
# Look-ahead expressions
|
||||
#
|
||||
"abc(?=def)" "<0>abc</0>def"
|
||||
"(.*)(?=c)" "<0><1>ab</1></0>cdef"
|
||||
|
||||
"(?:.*)(?=c)" "<r>ab</r>cdef"
|
||||
"(?:.*)(?=c)" b "<r><0>ab</0></r>cdef" # transparent bounds
|
||||
"(?:.*)(?=c)" bM "<r><0>ab</0></r>cdef" # transparent bounds
|
||||
|
||||
"(?:.*)(?=(c))" b "<0>ab</0><1>c</1>def" # Capture in look-ahead
|
||||
"(?=(.)\1\1)\1" "abcc<0><1>d</1></0>ddefg" # Backrefs to look-ahead capture
|
||||
|
||||
".(?!\p{L})" "abc<0>d</0> " # Negated look-ahead
|
||||
".(?!(\p{L}))" "abc<0>d</0> " # Negated look-ahead, no capture
|
||||
# visible outside of look-ahead
|
||||
"and(?=roid)" L "<0>and</0>roid"
|
||||
"and(?=roid)" M "<r>and</r>roid"
|
||||
"and(?=roid)" bM "<r><0>and</0></r>roid"
|
||||
|
||||
"and(?!roid)" L "<0>and</0>roix"
|
||||
"and(?!roid)" L "android"
|
||||
|
||||
"and(?!roid)" M "<r><0>and</0></r>roid" # Opaque bounds
|
||||
"and(?!roid)" bM "<r>and</r>roid"
|
||||
"and(?!roid)" bM "<r><0>and</0></r>roix"
|
||||
|
||||
#
|
||||
# Negated Lookahead, various regions and region transparency
|
||||
#
|
||||
"abc(?!def)" "<0>abc</0>xyz"
|
||||
"abc(?!def)" "abcdef"
|
||||
"abc(?!def)" "<r><0>abc</0></r>def"
|
||||
"abc(?!def)" b "<r>abc</r>def"
|
||||
"abc(?!def)" b "<r><0>abc</0></r>xyz"
|
||||
|
||||
#
|
||||
# Anchoring Bounds
|
||||
#
|
||||
"^def$" "abc<r><0>def</0></r>ghi" # anchoring (default) bounds
|
||||
"^def$" a "abc<r>def</r>ghi" # non-anchoring bounds
|
||||
"^def" a "<r><0>def</0></r>ghi" # non-anchoring bounds
|
||||
"def$" a "abc<r><0>def</0></r>" # non-anchoring bounds
|
||||
|
||||
"^.*$" m "<0>line 1</0>\n line 2"
|
||||
"^.*$" m2 "line 1\n<0> line 2</0>"
|
||||
"^.*$" m3 "line 1\n line 2"
|
||||
"^.*$" m "li<r><0>ne </0></r>1\n line 2" # anchoring bounds
|
||||
"^.*$" m2 "li<r>ne </r>1\n line 2" # anchoring bounds
|
||||
"^.*$" am "li<r>ne </r>1\n line 2" # non-anchoring bounds
|
||||
"^.*$" am "li\n<r><0>ne </0></r>\n1\n line 2" # non-anchoring bounds
|
||||
|
||||
#
|
||||
# HitEnd and RequireEnd for new-lines just before end-of-input
|
||||
#
|
||||
"xyz$" yz "<0>xyz</0>\n"
|
||||
"xyz$" yz "<0>xyz</0>\x{d}\x{a}"
|
||||
|
||||
"xyz$" myz "<0>xyz</0>" # multi-line mode
|
||||
"xyz$" mYZ "<0>xyz</0>\n"
|
||||
"xyz$" mYZ "<0>xyz</0>\r\n"
|
||||
"xyz$" mYZ "<0>xyz</0>\x{85}abcd"
|
||||
|
||||
"xyz$" Yz "xyz\nx"
|
||||
"xyz$" Yz "xyza"
|
||||
"xyz$" yz "<0>xyz</0>"
|
||||
|
||||
#
|
||||
# All Unicode line endings recognized.
|
||||
# 0a, 0b, 0c, 0d, 0x85, 0x2028, 0x2029
|
||||
# Multi-line and non-multiline mode take different paths, so repeated tests.
|
||||
#
|
||||
"^def$" mYZ "abc\x{a}<0>def</0>\x{a}ghi"
|
||||
"^def$" mYZ "abc\x{b}<0>def</0>\x{b}ghi"
|
||||
"^def$" mYZ "abc\x{c}<0>def</0>\x{c}ghi"
|
||||
"^def$" mYZ "abc\x{d}<0>def</0>\x{d}ghi"
|
||||
"^def$" mYZ "abc\x{85}<0>def</0>\x{85}ghi"
|
||||
"^def$" mYZ "abc\x{2028}<0>def</0>\x{2028}ghi"
|
||||
"^def$" mYZ "abc\x{2029}<0>def</0>\x{2029}ghi"
|
||||
"^def$" mYZ "abc\r\n<0>def</0>\r\nghi"
|
||||
|
||||
"^def$" yz "<0>def</0>\x{a}"
|
||||
"^def$" yz "<0>def</0>\x{b}"
|
||||
"^def$" yz "<0>def</0>\x{c}"
|
||||
"^def$" yz "<0>def</0>\x{d}"
|
||||
"^def$" yz "<0>def</0>\x{85}"
|
||||
"^def$" yz "<0>def</0>\x{2028}"
|
||||
"^def$" yz "<0>def</0>\x{2029}"
|
||||
"^def$" yz "<0>def</0>\r\n"
|
||||
"^def$" yz "<0>def</0>"
|
||||
|
||||
|
||||
"^def$" "<0>def</0>\x{2028" #TODO: should be an error of some sort.
|
||||
|
||||
#
|
||||
# UNIX_LINES mode
|
||||
#
|
||||
"abc$" D "<0>abc</0>\n"
|
||||
"abc$" D "abc\r"
|
||||
"abc$" D "abc\u0085"
|
||||
"a.b" D "<0>a\rb</0>"
|
||||
"a.b" D "a\nb"
|
||||
"(?d)abc$" "<0>abc</0>\n"
|
||||
"(?d)abc$" "abc\r"
|
||||
"abc$" mD "<0>abc</0>\ndef"
|
||||
"abc$" mD "abc\rdef"
|
||||
|
||||
".*def" L "abc\r def xyz" # Normal mode, LookingAt() stops at \r
|
||||
".*def" DL "<0>abc\r def</0> xyz" # Unix Lines mode, \r not line end.
|
||||
".*def" DL "abc\n def xyz"
|
||||
|
||||
"(?d)a.b" "a\nb"
|
||||
"(?d)a.b" "<0>a\rb</0>"
|
||||
|
||||
"^abc" m "xyz\r<0>abc</0>"
|
||||
"^abc" Dm "xyz\rabc"
|
||||
"^abc" Dm "xyz\n<0>abc</0>"
|
||||
|
||||
|
||||
|
||||
# Capturing parens
|
||||
".(..)." "<0>a<1>bc</1>d</0>"
|
||||
|
@ -97,6 +234,16 @@
|
|||
"(?w:.+?(\b\S.+?\b).*)" v "<0> <1>don't</1> </0>"
|
||||
"(?w:(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?)(\b.+?).*)" v "<0><1>.</1><2> </2><3>,</3><4>:</4><5>$</5><6>37,000.50</6><7> </7> </0>"
|
||||
|
||||
#
|
||||
# Unicode word boundaries with Regions
|
||||
#
|
||||
"(?w).*?\b" v "abc<r><0>def</0></r>ghi"
|
||||
"(?w).*?\b" v2 "abc<r>def<0></0></r>ghi"
|
||||
"(?w).*?\b" v3 "abc<r>def</r>ghi"
|
||||
#"(?w).*?\b" vb "abc<r><0>def</0></r>ghi" # TODO: bug. Ticket 6073
|
||||
#"(?w).*?\b" vb2 "abc<r>def</r>ghi"
|
||||
|
||||
|
||||
|
||||
# . does not match new-lines
|
||||
"." "\u000a\u000d\u0085\u000c\u000b\u2028\u2029<0>X</0>\u000aY"
|
||||
|
@ -128,20 +275,20 @@
|
|||
".*^(Hello)" " Hello Hello Hello Hello Goodbye"# No Match
|
||||
|
||||
# $ matches only at end of line, or before a newline preceding the end of line
|
||||
".*?(Goodbye)$" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
|
||||
".*?(Goodbye)" "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye"
|
||||
".*?(Goodbye)$" "Hello Goodbye> Goodbye Goodbye "# No Match
|
||||
".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
|
||||
".*?(Goodbye)" ZY "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye"
|
||||
".*?(Goodbye)$" z "Hello Goodbye> Goodbye Goodbye "# No Match
|
||||
|
||||
".*?(Goodbye)$" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
|
||||
".*?(Goodbye)$" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
|
||||
".*?(Goodbye)$" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\r\n"
|
||||
".*?(Goodbye)$" "Hello Goodbye Goodbye Goodbye\n\n"# No Match
|
||||
".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
|
||||
".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
|
||||
".*?(Goodbye)$" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\r\n"
|
||||
".*?(Goodbye)$" z "Hello Goodbye Goodbye Goodbye\n\n"# No Match
|
||||
|
||||
# \Z matches at end of input, like $ with default flags.
|
||||
".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
|
||||
".*?(Goodbye)" "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye"
|
||||
".*?(Goodbye)\Z" "Hello Goodbye> Goodbye Goodbye "# No Match
|
||||
"here$" "here\nthe end"# No Match
|
||||
".*?(Goodbye)\Z" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
|
||||
".*?(Goodbye)" ZY "<0>Hello <1>Goodbye</1></0> Goodbye Goodbye"
|
||||
".*?(Goodbye)\Z" z "Hello Goodbye> Goodbye Goodbye "# No Match
|
||||
"here$" z "here\nthe end"# No Match
|
||||
|
||||
".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
|
||||
".*?(Goodbye)\Z" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>\n"
|
||||
|
@ -151,12 +298,13 @@
|
|||
# \z matches only at the end of string.
|
||||
# no special treatment of new lines.
|
||||
# no dependencies on flag settings.
|
||||
".*?(Goodbye)\z" "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
|
||||
".*?(Goodbye)\z" "Hello Goodbye Goodbye Goodbye "# No Match
|
||||
"here$" "here\nthe end"# No Match
|
||||
".*?(Goodbye)\z" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1></0>"
|
||||
".*?(Goodbye)\z" z "Hello Goodbye Goodbye Goodbye "# No Match
|
||||
"here$" z "here\nthe end"# No Match
|
||||
|
||||
".*?(Goodbye)\z" "Hello Goodbye Goodbye Goodbye\n"# No Match
|
||||
".*?(Goodbye)\n\z" "<0>Hello Goodbye Goodbye <1>Goodbye</1>\n</0>"
|
||||
".*?(Goodbye)\z" z "Hello Goodbye Goodbye Goodbye\n"# No Match
|
||||
".*?(Goodbye)\n\z" zy "<0>Hello Goodbye Goodbye <1>Goodbye</1>\n</0>"
|
||||
"abc\z|def" ZY "abc<0>def</0>"
|
||||
|
||||
# (?# comment) doesn't muck up pattern
|
||||
"Hello (?# this is a comment) world" " <0>Hello world</0>..."
|
||||
|
@ -180,6 +328,61 @@
|
|||
"(x?)*xyz" "<0>xx<1></1>xyz</0>" # Slightly weird, but correct. The "last" time through (x?),
|
||||
# it matches the empty string.
|
||||
|
||||
# Set expressions, basic operators and escapes work
|
||||
#
|
||||
"[\d]+" "<0>0123</0>abc/.,"
|
||||
"[^\d]+" "0123<0>abc/.,</0>"
|
||||
"[\D]+" "0123<0>abc/.,</0>"
|
||||
"[^\D]+" "<0>0123</0>abc/.,"
|
||||
|
||||
"[\s]+" "<0> \t</0>abc/.,"
|
||||
"[^\s]+" " \t<0>abc/.,</0>"
|
||||
"[\S]+" " \t<0>abc/.,</0>"
|
||||
"[^\S]+" "<0> \t</0>abc/.,"
|
||||
|
||||
"[\w]+" "<0>abc123</0> .,;"
|
||||
"[^\w]+" "abc123<0> .,;</0>"
|
||||
"[\W]+" "abc123<0> .,;</0>"
|
||||
"[^\W]+" "<0>abc123</0> .,;"
|
||||
|
||||
"[\z]+" "abc<0>zzz</0>def" # \z has no special meaning
|
||||
"[^\z]+" "<0>abc</0>zzzdef"
|
||||
"[\^]+" "abc<0>^^</0>"
|
||||
"[^\^]+" "<0>abc</0>^^"
|
||||
|
||||
"[\u0041c]+" "<0>AcAc</0>def"
|
||||
"[\U00010002]+" "<0>\ud800\udc02</0>\U00010003"
|
||||
"[^\U00010002]+" "<0>Hello</0>\x{10002}"
|
||||
"[\x61b]+" "<0>abab</0>cde"
|
||||
#"[\x6z]+" "\x06" #TODO: single hex digits should fail
|
||||
"[\x{9}\x{75}\x{6d6}\x{6ba6}\x{6146B}\x{10ffe3}]+" "<0>\u0009\u0075\u06d6\u6ba6\U0006146B\U0010ffe3</0>abc"
|
||||
|
||||
"[\N{LATIN CAPITAL LETTER TONE SIX}ab\N{VARIATION SELECTOR-70} ]+" "x<0> \u0184\U000E0135 ab</0>c"
|
||||
"[\N{LATIN SMALL LETTER C}-\N{LATIN SMALL LETTER F}]+" "ab<0>cdef</0>ghi"
|
||||
|
||||
|
||||
|
||||
#
|
||||
# [set expressions], check the precedence of '-', '&', '--', '&&'
|
||||
# '-' and '&', for compatibility with ICU UnicodeSet, have the same
|
||||
# precedence as the implicit Union between adjacent items.
|
||||
# '--' and '&&', for compatibility with Java, have lower precedence than
|
||||
# the implicit Union operations. '--' and '&&' themselves
|
||||
# have the same precedence, and group left to right.
|
||||
#
|
||||
"[[a-m]-[f-w]p]+" "<0>dep</0>fgwxyz"
|
||||
"[^[a-m]-[f-w]p]+" "dep<0>fgwxyz</0>"
|
||||
|
||||
"[[a-m]--[f-w]p]+" "<0>de</0>pfgwxyz"
|
||||
"[^[a-m]--[f-w]p]+" "de<0>pfgwxyz</0>"
|
||||
|
||||
"[[a-m]&[e-s]w]+" "<0>efmw</0>adnst"
|
||||
"[^[a-m]&[e-s]w]+" "efmw<0>adnst</0>"
|
||||
|
||||
"[[a-m]&[e-s]]+" "<0>efm</0>adnst"
|
||||
|
||||
|
||||
|
||||
# {min,max} iteration qualifier
|
||||
"A{3}BC" "<0>AAABC</0>"
|
||||
|
||||
|
@ -247,8 +450,8 @@
|
|||
"ab(?:c|(d?))(\1)" "<0>ab<1></1><2></2></0>"
|
||||
|
||||
# Case Insensitive
|
||||
"aBc" i "<0>ABC</0>"
|
||||
"a[^bc]d" i "ABD"
|
||||
"aBc" i "<0>ABC</0>"
|
||||
"a[^bc]d" i "ABD"
|
||||
'((((((((((a))))))))))\10' i "<0><1><2><3><4><5><6><7><8><9><10>A</10></9></8></7></6></5></4></3></2></1>A</0>"
|
||||
|
||||
"(?:(?i)a)b" "<0>Ab</0>"
|
||||
|
@ -259,15 +462,36 @@
|
|||
"a b" "ab"
|
||||
"abc " "abc"
|
||||
"abc " "<0>abc </0>"
|
||||
"ab[cd e]z" "<0>ab z</0>"
|
||||
"ab[cd e]z" "<0>ab z</0>"
|
||||
"ab\ c" "<0>ab c</0> "
|
||||
"ab c" "<0>ab c</0> "
|
||||
"ab c" x "ab c "
|
||||
"ab\ c" x "<0>ab c</0> "
|
||||
|
||||
#
|
||||
# Pattern Flags
|
||||
#
|
||||
"(?u)abc" "<0>abc</0>"
|
||||
"(?-u)abc" "<0>abc</0>"
|
||||
|
||||
#
|
||||
# \c escapes (Control-whatever)
|
||||
#
|
||||
"\cA" "<0>\u0001</0>"
|
||||
"\ca" "<0>\u0001</0>"
|
||||
"\c\x" "<0>\u001cx</0>"
|
||||
|
||||
|
||||
#Multi-line mode
|
||||
'b\s^' m "a\nb\n"
|
||||
'b\s^' m "a\nb\n"
|
||||
"(?m)^abc$" "abc \n abc\n<0>abc</0>\nabc"
|
||||
"(?m)^abc$" 2 "abc \n abc\nabc\n<0>abc</0>"
|
||||
"^abc$" 2 "abc \n abc\nabc\nabc"
|
||||
|
||||
# Empty and full range
|
||||
"[\u0000-\U0010ffff]+" "<0>abc\u0000\uffff\U00010000\U0010ffffzz</0>"
|
||||
"[^\u0000-\U0010ffff]" "abc\u0000\uffff\U00010000\U0010ffffzz"
|
||||
"[^a--a]+" "<0>abc\u0000\uffff\U00010000\U0010ffffzz</0>"
|
||||
|
||||
# Free-spacing mode
|
||||
"a b c # this is a comment" x "<0>abc</0> "
|
||||
|
@ -316,8 +540,8 @@
|
|||
"abc.*$" "<0>abcdef</0>"
|
||||
"abc(.*)" "<0>abc<1>def</1></0>"
|
||||
"abc(.*)" "<0>abc<1></1></0>"
|
||||
"abc.*" "<0>abc</0>\ndef"
|
||||
"abc.*" s "<0>abc\ndef</0>"
|
||||
"abc.*" "<0>abc</0>\ndef"
|
||||
"abc.*" s "<0>abc\ndef</0>"
|
||||
"abc.*$" s "<0>abc\ndef</0>"
|
||||
"abc.*$" "abc\ndef"
|
||||
"abc.*$" m "<0>abc</0>\ndef"
|
||||
|
@ -357,9 +581,16 @@
|
|||
"ab\x09w" "<0>ab\u0009w</0>"
|
||||
"ab\xabcdc" "<0>ab\u00abcdc</0>"
|
||||
"ab\x{abcd}c" "<0>ab\uabcdc</0>"
|
||||
"ab\x{101234}c" "<0>ab\U00101234c</0>"
|
||||
"ab\x{101234}c" "<0>ab\U00101234c</0>"
|
||||
"abα" "<0>abα</0>"
|
||||
|
||||
#
|
||||
# Octal Escaping. This conforms to Java conventions, not Perl.
|
||||
"\0101\00\03\073\0154\01442" "<0>A\u0000\u0003\u003b\u006c\u0064\u0032</0>"
|
||||
"\0776" "<0>\u003f\u0036</0>" # overflow, the 6 is literal.
|
||||
"\0376xyz" "<0>\u00fexyz</0>"
|
||||
"\08" E "<0>\u00008</0>"
|
||||
"\0" E "x"
|
||||
|
||||
#
|
||||
# \u Surrogate Pairs
|
||||
|
@ -369,6 +600,24 @@
|
|||
"\ud800\ud800\udc00" "<0>\ud800\U00010000</0>\U00010000\U00010000\U00010001"
|
||||
"(\ud800)(\udc00)" "\U00010000"
|
||||
|
||||
#
|
||||
# hitEnd with find()
|
||||
#
|
||||
"abc" Z "aa<0>abc</0> abcab"
|
||||
"abc" 2Z "aaabc <0>abc</0>ab"
|
||||
"abc" 3z "aa>abc abcab"
|
||||
|
||||
#
|
||||
# Bug xxxx
|
||||
#
|
||||
"(?:\-|(\-?\d+\d\d\d))?(?:\-|\-(\d\d))?(?:\-|\-(\d\d))?(T)?(?:(\d\d):(\d\d):(\d\d)(\.\d+)?)?(?:(?:((?:\+|\-)\d\d):(\d\d))|(Z))?" MG "<0>-1234-21-31T41:51:61.789+71:81</0>"
|
||||
|
||||
|
||||
#
|
||||
# A random, complex, meaningless pattern that should at least compile
|
||||
#
|
||||
"(?![^\<C\f\0146\0270\}&&[|\02-\x3E\}|X-\|]]{7,}+)[|\\\x98\<\?\u4FCFr\,\0025\}\004|\0025-\0521]|(?<![|\01-\u829E])|(?<!\p{Alpha})|^|(?-s:[^\x15\\\x24F\a\,\a\u97D8[\x38\a[\0224-\0306[^\0020-\u6A57]]]]??)(?xix:[^|\{\[\0367\t\e\x8C\{\[\074c\]V[|b\fu\r\0175\<\07f\066s[^D-\x5D]]])(?xx:^{5,}+)(?d)(?=^\D)|(?!\G)(?>\G)(?![^|\]\070\ne\{\t\[\053\?\\\x51\a\075\0023-\[&&[|\022-\xEA\00-\u41C2&&[^|a-\xCC&&[^\037\uECB3\u3D9A\x31\|\<b\0206\uF2EC\01m\,\ak\a\03&&\p{Punct}]]]])(?-dxs:[|\06-\07|\e-\x63&&[|Tp\u18A3\00\|\xE4\05\061\015\0116C|\r\{\}\006\xEA\0367\xC4\01\0042\0267\xBB\01T\}\0100\?[|\[-\u459B|\x23\x91\rF\0376[|\?-\x94\0113-\\\s]]]]{6}?)(?<=[^\t-\x42H\04\f\03\0172\?i\u97B6\e\f\uDAC2])(?=\B)(?>[^\016\r\{\,\uA29D\034\02[\02-\[|\t\056\uF599\x62\e\<\032\uF0AC\0026\0205Q\|\\\06\0164[|\057-\u7A98&&[\061-g|\|\0276\n\042\011\e\xE8\x64B\04\u6D0EDW^\p{Lower}]]]]?)(?<=[^\n\\\t\u8E13\,\0114\u656E\xA5\]&&[\03-\026|\uF39D\01\{i\u3BC2\u14FE]])(?<=[^|\uAE62\054H\|\}&&^\p{Space}])(?sxx)(?<=[\f\006\a\r\xB4]{1,5})|(?x-xd:^{5}+)()" "<0></0>abc"
|
||||
|
||||
|
||||
#
|
||||
# Bug 3225
|
||||
|
@ -435,7 +684,7 @@
|
|||
"^" "<0></0>"
|
||||
"^" 2 ""
|
||||
|
||||
"\Z" "<0></0>"
|
||||
"\Z" "<0></0>"
|
||||
"\Z" 2 ""
|
||||
"\Z" 2 "\u000a<0></0>"
|
||||
"\Z" "<0></0>\u000d\u000a"
|
||||
|
@ -471,6 +720,173 @@
|
|||
".{6}" "123\u000a\u000dXYZ"
|
||||
".{6}" s "<0>123\u000a\u000dX</0>Y"
|
||||
|
||||
|
||||
#
|
||||
# Ranges
|
||||
#
|
||||
".*" "abc<r><0>def</0></r>ghi"
|
||||
"a" "aaa<r><0>a</0>aa</r>aaa"
|
||||
"a" 2 "aaa<r>a<0>a</0>a</r>aaa"
|
||||
"a" 3 "aaa<r>aa<0>a</0></r>aaa"
|
||||
"a" 4 "aaa<r>aaa</r>aaa"
|
||||
"a" "aaa<r><0>a</0>aa</r>aaa"
|
||||
|
||||
#
|
||||
# [set] parsing, systematically run through all of the parser states.
|
||||
#
|
||||
#
|
||||
"[def]+" "abc<0>ddeeff</0>ghi" # set-open
|
||||
"[^def]+" "<0>abc</0>defghi"
|
||||
"[:digit:]+" "abc<0>123</0>def"
|
||||
"[:^digit:]+" "<0>abc</0>123def"
|
||||
"[\u005edef]+" "abc<0>de^f</0>ghi"
|
||||
|
||||
"[]]+" "abc<0>]]]</0>[def" # set-open2
|
||||
"[^]]+" "<0>abc</0>]]][def"
|
||||
|
||||
"[:Lu:]+" "abc<0>ABC</0>def" # set-posix
|
||||
"[:Lu]+" "abc<0>uL::Lu</0>"
|
||||
"[:^Lu]+" "abc<0>uL:^:Lu</0>"
|
||||
"[:]+" "abc<0>:::</0>def"
|
||||
"[:whats this:]" E " "
|
||||
"[--]+" dE "-------"
|
||||
|
||||
"[[nested]]+" "xyz[<0>nnetsteed</0>]abc" #set-start
|
||||
"[\x{41}]+" "CB<0>AA</0>ZYX"
|
||||
"[\[\]\\]+" "&*<0>[]\\</0>..."
|
||||
"[*({<]+" "^&<0>{{(<<*</0>)))"
|
||||
|
||||
|
||||
"[-def]+" "abc<0>def-ef-d</0>xyz" # set-start-dash
|
||||
"[abc[--def]]" E " "
|
||||
|
||||
"[x[&def]]+" "abc<0>def&</0>ghi" # set-start-amp
|
||||
"[&& is bad at start]" E " "
|
||||
|
||||
"[abc" E " " # set-after-lit
|
||||
"[def]]" "abcdef"
|
||||
"[def]]" "abcde<0>f]</0>]"
|
||||
|
||||
"[[def][ghi]]+" "abc]<0>defghi</0>[xyz" # set-after-set
|
||||
"[[def]ghi]+" "abc]<0>defghi</0>[xyz"
|
||||
"[[[[[[[[[[[abc]" E " "
|
||||
"[[abc]\p{Lu}]+" "def<0>abcABC</0>xyz"
|
||||
|
||||
"[d-f]+" "abc<0>def</0>ghi" # set-after-range
|
||||
"[d-f[x-z]]+" "abc<0>defxyzzz</0>gw"
|
||||
"[\s\d]+" "abc<0> 123</0>def"
|
||||
"[d-f\d]+" "abc<0>def123</0>ghi"
|
||||
"[d-fr-t]+" "abc<0>defrst</0>uvw"
|
||||
|
||||
"[abc--]" E " " # set-after-op
|
||||
"[[def]&&]" E " "
|
||||
"[-abcd---]+" "<0>abc</0>--" #[-abcd]--[-]
|
||||
"[&abcd&&&ac]+" "b<0>ac&&ca</0>d" #[&abcd]&&[&ac]
|
||||
|
||||
"[[abcd]&[ac]]+" "b<0>acac</0>d" # set-set-amp
|
||||
"[[abcd]&&[ac]]+" "b<0>acac</0>d"
|
||||
"[[abcd]&&ac]+" "b<0>acac</0>d"
|
||||
"[[abcd]&ac]+" "<0>bacacd&&&</0>"
|
||||
|
||||
"[abcd&[ac]]+" "<0>bacacd&&&</0>" #set-lit-amp
|
||||
"[abcd&&[ac]]+" "b<0>acac</0>d"
|
||||
"[abcd&&ac]+" "b<0>acac</0>d"
|
||||
|
||||
"[[abcd]-[ac]]+" "a<0>bdbd</0>c" # set-set-dash
|
||||
"[[abcd]--[ac]]+" "a<0>bdbd</0>c"
|
||||
"[[abcd]--ac]+" "a<0>bdbd</0>c"
|
||||
"[[abcd]-ac]+" "<0>bacacd---</0>"
|
||||
|
||||
"[a-d--[b-c]]+" "b<0>adad</0>c" # set-range-dash
|
||||
"[a-d--b-c]+" "b<0>adad</0>c"
|
||||
"[a-d-[b-c]]+" "<0>bad-adc</0>"
|
||||
"[a-d-b-c]+" "<0>bad-adc</0>"
|
||||
"[\w--[b-c]]+" "b<0>adad</0>c"
|
||||
"[\w--b-c]+" "b<0>adad</0>c"
|
||||
"[\w-[b-c]]+" "<0>bad-adc</0>"
|
||||
"[\w-b-c]+" "<0>bad-adc</0>"
|
||||
|
||||
"[a-d&&[b-c]]+" "a<0>bcbc</0>d" # set-range-amp
|
||||
"[a-d&&b-c]+" "a<0>bcbc</0>d"
|
||||
"[a-d&[b-c]]+" "<0>abc&bcd</0>"
|
||||
"[a-d&b-c]+" "<0>abc&bcd</0>"
|
||||
|
||||
"[abcd--bc]+" "b<0>adda</0>c" # set-lit-dash
|
||||
"[abcd--[bc]]+" "b<0>adda</0>c"
|
||||
"[abcd-[bc]]+" "<0>bad--dac</0>xyz"
|
||||
"[abcd-]+" "<0>bad--dac</0>xyz"
|
||||
|
||||
"[abcd-\s]+" E "xyz<0>abcd --</0>xyz" # set-lit-dash-esc
|
||||
"[abcd-\N{LATIN SMALL LETTER G}]+" "xyz-<0>abcdefg</0>hij-"
|
||||
"[bcd-\{]+" "a<0>bcdefyz{</0>|}"
|
||||
|
||||
"[\p{Ll}]+" "ABC<0>abc</0>^&*&" # set-escape
|
||||
"[\P{Ll}]+" "abc<0>ABC^&*&</0>xyz"
|
||||
"[\N{LATIN SMALL LETTER Q}]+" "mnop<0>qqq</0>rst"
|
||||
"[\sa]+" "cb<0>a a </0>(*&"
|
||||
"[\S]+" " <0>hello</0> "
|
||||
"[\w]+" " <0>hello_world</0>! "
|
||||
"[\W]+" "a<0> *$%#,</0>hello "
|
||||
"[\d]+" "abc<0>123</0>def"
|
||||
"[\D]+" "123<0>abc</0>567"
|
||||
"[\$\#]+" "123<0>$#$#</0>\\"
|
||||
|
||||
#
|
||||
# Try each of the Java compatibility properties.
|
||||
# These are checked here, while normal Unicode properties aren't, because
|
||||
# these Java compatibility properties are implemented directly by regexp, while other
|
||||
# properties are handled by ICU's Property and UnicodeSet APIs.
|
||||
#
|
||||
# These tests are only to verify that the names are recognized and the
|
||||
# implementation isn't dead. They are not intended to verify that the
|
||||
# function definitions are 100% correct.
|
||||
#
|
||||
"[:InBasic Latin:]+" "ΓΔΕΖΗΘ<0>hello, world.</0>ニヌネノハバパ"
|
||||
"[:^InBasic Latin:]+" "<0>ΓΔΕΖΗΘ</0>hello, world.ニヌネノハバパ"
|
||||
"\p{InBasicLatin}+" "ΓΔΕΖΗΘ<0>hello, world.</0>ニヌネノハバパ"
|
||||
"\P{InBasicLatin}+" "<0>ΓΔΕΖΗΘ</0>hello, world.ニヌネノハバパ"
|
||||
"\p{InGreek}+" "<0>ΓΔΕΖΗΘ</0>hello, world.ニヌネノハバパ"
|
||||
"\p{InCombining Marks for Symbols}" "<0>\u20d0</0>"
|
||||
"\p{Incombiningmarksforsymbols}" "<0>\u20d0</0>"
|
||||
|
||||
|
||||
"\p{javaDefined}+" "\uffff<0>abcd</0>\U00045678"
|
||||
"\p{javaDigit}+" "abc<0>1234</0>xyz"
|
||||
"\p{javaIdentifierIgnorable}+" "abc<0>\u0000\u000e\u009f</0>xyz"
|
||||
"\p{javaISOControl}+" "abc<0>\u0000\u000d\u0083</0>xyz"
|
||||
"\p{javaJavaIdentifierPart}+" "#@!<0>abc123_$</0>;"
|
||||
"\p{javaJavaIdentifierStart}+" "123\u0301<0>abc$_</0>%^&"
|
||||
"\p{javaLetter}+" "123<0>abcDEF</0>&*()("
|
||||
"\p{javaLetterOrDigit}+" "$%^&*<0>123abcகஙசஜஞ</0>☺♘♚☔☎♬⚄⚡"
|
||||
"\p{javaLowerCase}+" "ABC<0>def</0>&^%#:="
|
||||
"\p{javaMirrored}+" "ab$%<0>(){}[]</0>xyz"
|
||||
"\p{javaSpaceChar}+" "abc<0> \u00a0\u2028</0>!@#"
|
||||
"\p{javaSupplementaryCodePoint}+" "abc\uffff<0>\U00010000\U0010ffff</0>\u0000"
|
||||
"\p{javaTitleCase}+" "abCE<0>Džῌᾨ</0>123"
|
||||
"\p{javaUnicodeIdentifierStart}+" "123<0>abcⅣ</0>%^&&*"
|
||||
"\p{javaUnicodeIdentifierPart}+" "%&&^<0>abc123\u0301\u0002</0>..."
|
||||
"\p{javaUpperCase}+" "abc<0>ABC</0>123"
|
||||
"\p{javaValidCodePoint}+" "<0>\u0000abc\ud800 unpaired \udfff |\U0010ffff</0>"
|
||||
"\p{javaWhitespace}+" "abc\u00a0\u2007\u202f<0> \u0009\u001c\u001f\u2028</0>42"
|
||||
"\p{all}+" "<0>123\u0000\U0010ffff</0>"
|
||||
"\P{all}+" "123\u0000\U0010ffff"
|
||||
|
||||
#
|
||||
# Errors on unrecognized ASCII letter escape sequences.
|
||||
#
|
||||
"[abc\Y]+" "<0>abcY</0>"
|
||||
"[abc\Y]+" eE "<0>abcY</0>"
|
||||
|
||||
"(?:a|b|c|\Y)+" "<0>abcY</0>"
|
||||
"(?:a|b|c|\Y)+" eE "<0>abcY</0>"
|
||||
|
||||
"\Q\Y\E" e "<0>\\Y</0>"
|
||||
|
||||
#
|
||||
# Reported problem
|
||||
#
|
||||
"[a-\w]" E "x"
|
||||
|
||||
#
|
||||
# Bug 4045
|
||||
#
|
||||
|
@ -485,7 +901,7 @@
|
|||
"A*" 3 ""
|
||||
"A*" 4 ""
|
||||
"A*" 5 ""
|
||||
|
||||
|
||||
#
|
||||
# Bug 4046
|
||||
#
|
||||
|
@ -512,11 +928,10 @@
|
|||
# Bug 4058 ICU Unicode Set patterns have an odd feature -
|
||||
# A $ as the last character before the close bracket means match
|
||||
# a \uffff, which means off the end of the string in transliterators.
|
||||
# Doesn't make much sense for regex, but works that way anyhow.
|
||||
# Didn't make sense for regular expressions, and is now fixed.
|
||||
#
|
||||
"[\$](P|C|D);" "<0>$<1>P</1>;</0>"
|
||||
"[$](P|C|D);" "<0>\uffff<1>P</1>;</0>"
|
||||
"[$](P|C|D);" "$P;"
|
||||
"[$](P|C|D);" "<0>$<1>P</1>;</0>"
|
||||
"[$$](P|C|D);" "<0>$<1>P</1>;</0>"
|
||||
|
||||
#
|
||||
|
@ -537,10 +952,68 @@
|
|||
".+?\b" 2 " <0>\u0935\u0915\u094D\u200D\u0924\u0947</0> "
|
||||
".+?\b" 3 " \u0935\u0915\u094D\u200D\u0924\u0947 "
|
||||
|
||||
#
|
||||
# bug 5386 "^.*$" should match empty input
|
||||
#
|
||||
"^.*$" "<0></0>"
|
||||
"^.*$" m "<0></0>"
|
||||
"^.*$" "<0></0>\n"
|
||||
"(?s)^.*$" "<0>\n</0>"
|
||||
|
||||
#
|
||||
# bug 5386 Empty pattern and empty input should match.
|
||||
#
|
||||
"" "<0></0>abc"
|
||||
"" "<0></0>"
|
||||
|
||||
#
|
||||
# bug 5386 Range upper and lower bounds can be equal
|
||||
#
|
||||
"[a-a]" "<0>a</0>"
|
||||
|
||||
#
|
||||
# bug 5386 $* should not fail, should match empty string.
|
||||
#
|
||||
"$*" "<0></0>abc"
|
||||
|
||||
#
|
||||
# bug 5386 \Q ... \E escaping problem
|
||||
#
|
||||
"[a-z\Q-$\E]+" "QE<0>abc-def$</0>."
|
||||
|
||||
# More reported 5386 Java compatibility failures
|
||||
#
|
||||
"[^]*abb]*" "<0>kkkk</0>"
|
||||
"\xa" "huh" # Java would like to be warned.
|
||||
"^.*$" "<0></0>"
|
||||
|
||||
#
|
||||
# bug 5386 Empty left alternation should produce a zero length match.
|
||||
#
|
||||
"|a" "<0></0>a"
|
||||
"$|ab" "<0>ab</0>"
|
||||
"$|ba" "ab<0></0>"
|
||||
|
||||
#
|
||||
# bug 5386 Java compatibility for set expressions
|
||||
#
|
||||
"[a-z&&[cde]]+" "ab<0>cde</0>fg"
|
||||
|
||||
#
|
||||
# bug 6019 matches() needs to backtrack and check for a longer match if the
|
||||
# first match(es) found don't match the entire input.
|
||||
#
|
||||
"a?|b" "<0></0>b"
|
||||
"a?|b" M "<0>b</0>"
|
||||
"a?|.*?u|stuff|d" M "<0>stuff</0>"
|
||||
"a?|.*?(u)|stuff|d" M "<0>stuff<1>u</1></0>"
|
||||
"a+?" "<0>a</0>aaaaaaaaaaaa"
|
||||
"a+?" M "<0>aaaaaaaaaaaaa</0>"
|
||||
|
||||
#
|
||||
# Random debugging, Temporary
|
||||
#
|
||||
#"^(?:a?b?)*$" "a--"
|
||||
#"^(?:a?b?)*$" "a--"
|
||||
"^(?:a?b?)*$" "a--"
|
||||
|
||||
"This is a string with (?:one |two |three )endings" "<0>This is a string with two endings</0>"
|
||||
|
@ -681,7 +1154,7 @@
|
|||
"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" G "<0>ftp://ftp.blah.co.uk:2828/blah%20blah.gif</0>"
|
||||
"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" G "<0>https://blah.gov/blah-blah.as</0>"
|
||||
"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" "www.blah.com"
|
||||
"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" "http://www.blah.com/I have spaces!"
|
||||
"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" "http://www.blah.com/I have spaces!"
|
||||
"^(http|https|ftp)\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(:[a-zA-Z0-9]*)?/?([a-zA-Z0-9\-\._\?\,\'/\\\+\&%\$#\=~])*$" "ftp://blah_underscore/[nope]"
|
||||
"^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2})$|^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2}\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" G "<0>12/01/2002</0>"
|
||||
"^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2})$|^(([0-2]\d|[3][0-1])\/([0]\d|[1][0-2])\/[2][0]\d{2}\s([0-1]\d|[2][0-3])\:[0-5]\d\:[0-5]\d)$" G "<0>12/01/2002 12:32:10</0>"
|
||||
|
@ -959,18 +1432,18 @@
|
|||
"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" "10.0.5.4"
|
||||
"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" "192.168.0.1"
|
||||
"^(((25[0-5]|2[0-4][0-9]|19[0-1]|19[3-9]|18[0-9]|17[0-1]|17[3-9]|1[0-6][0-9]|1[1-9]|[2-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9]))|(192\.(25[0-5]|2[0-4][0-9]|16[0-7]|169|1[0-5][0-9]|1[7-9][0-9]|[1-9][0-9]|[0-9]))|(172\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|1[0-5]|3[2-9]|[4-9][0-9]|[0-9])))\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$" "my ip address"
|
||||
#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo.com</0>" # TODO: \w in pattern
|
||||
#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo-foo.com.au</0>" # TODO: \w in pattern
|
||||
#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo.foo.info</0>" # TODO: \w in pattern
|
||||
#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@.com" # TODO: \w in pattern
|
||||
#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@foo..com" # TODO: \w in pattern
|
||||
#"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@me@.com" # TODO: \w in pattern
|
||||
#"/\*[\d\D]*?\*/" G "<0>/* my comment */</0>"
|
||||
#"/\*[\d\D]*?\*/" G "<0>/* my multiline comment */</0>"
|
||||
#"/\*[\d\D]*?\*/" G "<0>/* my nested comment */</0>"
|
||||
#"/\*[\d\D]*?\*/" "*/ anything here /*"
|
||||
#"/\*[\d\D]*?\*/" "anything between 2 seperate comments"
|
||||
#"/\*[\d\D]*?\*/" "\* *\"
|
||||
"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo.com</0>"
|
||||
"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo-foo.com.au</0>"
|
||||
"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" G "<0>foo@foo.foo.info</0>"
|
||||
"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@.com"
|
||||
"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@foo..com"
|
||||
"^([\w\d\-\.]+)@{1}(([\w\d\-]{1,67})|([\w\d\-]+\.[\w\d\-]{1,67}))\.(([a-zA-Z\d]{2,4})(\.[a-zA-Z\d]{2})?)$" "foo@me@.com"
|
||||
"/\*[\d\D]*?\*/" G "<0>/* my comment */</0>"
|
||||
"/\*[\d\D]*?\*/" G "<0>/* my multiline comment */</0>"
|
||||
"/\*[\d\D]*?\*/" G "<0>/* my nested comment */</0>"
|
||||
"/\*[\d\D]*?\*/" "*/ anything here /*"
|
||||
"/\*[\d\D]*?\*/" "anything between 2 seperate comments"
|
||||
"/\*[\d\D]*?\*/" "\* *\"
|
||||
"/\*[\p{N}\P{N}]*?\*/" G "<0>/* my comment */</0>"
|
||||
"/\*[\p{N}\P{N}]*?\*/" G "<0>/* my multiline comment */</0>"
|
||||
"/\*[\p{N}\P{N}]*?\*/" G "<0>/* my nested comment */</0>"
|
||||
|
@ -986,9 +1459,9 @@
|
|||
'^(([^<>;()\[\]\\.,;:@"]+(\.[^<>()\[\]\\.,;:@"]+)*)|(".+"))@((([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))\.)*(([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))$' G "<0>blah@[10.0.0.1]</0>"
|
||||
'^(([^<>;()\[\]\\.,;:@"]+(\.[^<>()\[\]\\.,;:@"]+)*)|(".+"))@((([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))\.)*(([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))$' G "<0>a@b.c</0>"
|
||||
'^(([^<>;()\[\]\\.,;:@"]+(\.[^<>()\[\]\\.,;:@"]+)*)|(".+"))@((([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))\.)*(([a-z]([-a-z0-9]*[a-z0-9])?)|(#[0-9]+)|(\[((([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\.){3}(([01]?[0-9]{0,2})|(2(([0-4][0-9])|(5[0-5]))))\]))$' "non@match@."
|
||||
#"^\d{9}[\d|X]$" G "<0>1234123412</0>"
|
||||
#"^\d{9}[\d|X]$" G "<0>123412341X</0>"
|
||||
#"^\d{9}[\d|X]$" "not an isbn"
|
||||
"^\d{9}[\d|X]$" G "<0>1234123412</0>"
|
||||
"^\d{9}[\d|X]$" G "<0>123412341X</0>"
|
||||
"^\d{9}[\d|X]$" "not an isbn"
|
||||
"^\d{9}(\d|X)$" G "<0>1234123412</0>"
|
||||
"^\d{9}(\d|X)$" G "<0>123412341X</0>"
|
||||
"^\d{9}(\d|X)$" "not an isbn"
|
||||
|
@ -1056,12 +1529,12 @@
|
|||
"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" "12 123 1234"
|
||||
"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" "(012) 123/1234"
|
||||
"\(([0-9]{2}|0{1}((x|[0-9]){2}[0-9]{2}))\)\s*[0-9]{3,4}[- ]*[0-9]{4}" "(012) 123 12345"
|
||||
#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob-smith@foo.com</0>" # TODO: \w in pattern
|
||||
#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob.smith@foo.com</0>" # TODO: \w in pattern
|
||||
#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob_smith@foo.com</0>" # TODO: \w in pattern
|
||||
#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" "-smith@foo.com" # TODO: \w in pattern
|
||||
#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" ".smith@foo.com" # TODO: \w in pattern
|
||||
#"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" "smith@foo_com" # TODO: \w in pattern
|
||||
"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob-smith@foo.com</0>"
|
||||
"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob.smith@foo.com</0>"
|
||||
"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" G "<0>bob_smith@foo.com</0>"
|
||||
"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" "-smith@foo.com"
|
||||
"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" ".smith@foo.com"
|
||||
"^\w+[\w-\.]*\@\w+((-\w+)|(\w*))\.[a-z]{2,3}$" "smith@foo_com"
|
||||
"^(?=.*\d).{4,8}$" G "<0>1234</0>"
|
||||
"^(?=.*\d).{4,8}$" G "<0>asdf1234</0>"
|
||||
"^(?=.*\d).{4,8}$" G "<0>asp123</0>"
|
||||
|
@ -1175,7 +1648,7 @@
|
|||
"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" "$12,3456.01"
|
||||
"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" "12345"
|
||||
"^\$(\d{1,3}(\,\d{3})*|(\d+))(\.\d{2})?$" "$1.234"
|
||||
"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" G "<0>C:\\temp\\this allows spaces\\web.config</0>"
|
||||
"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" G "<0>C:\\temp\\this allows spaces\\web.config</0>"
|
||||
"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" G "<0>\\\\Andromeda\\share\\file name.123</0>"
|
||||
"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" "tz:\temp\ fi*le?na:m<e>.doc"
|
||||
"([A-Z]:\\[^/:\*\?<>\|]+\.\w{2,6})|(\\{2}[^/:\*\?<>\|]+\.\w{2,6})" "\\Andromeda\share\filename.a"
|
||||
|
@ -1206,24 +1679,24 @@
|
|||
"^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$" "qqqBFDB4D31-3E35-4DAB-AFCA-5E6E5C8F61EA"
|
||||
"^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$" "BFDB4D31-3E-4DAB-AFCA-5E6E5C8F61EA"
|
||||
"^[A-Za-z0-9]{8}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{4}-[A-Za-z0-9]{12}$" "BFDB4D31-3E35-4DAB-AF"
|
||||
#"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G "<0>12.345-678</0>" # TODO: \x not implemented.
|
||||
#"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G "<0>23.345-123</0>"
|
||||
#"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G "<0>99.999</0>"
|
||||
#"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "41222-222"
|
||||
#"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "3.444-233"
|
||||
#"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "43.324444"
|
||||
"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G "<0>12.345-678</0>"
|
||||
"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G "<0>23.345-123</0>"
|
||||
"^\d{2}(\x2e)(\d{3})(-\d{3})?$" G "<0>99.999</0>"
|
||||
"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "41222-222"
|
||||
"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "3.444-233"
|
||||
"^\d{2}(\x2e)(\d{3})(-\d{3})?$" "43.324444"
|
||||
"^\d{2}(\u002e)(\d{3})(-\d{3})?$" G "<0>12.345-678</0>"
|
||||
"^\d{2}(\u002e)(\d{3})(-\d{3})?$" G "<0>23.345-123</0>"
|
||||
"^\d{2}(\u002e)(\d{3})(-\d{3})?$" G "<0>99.999</0>"
|
||||
"^\d{2}(\u002e)(\d{3})(-\d{3})?$" "41222-222"
|
||||
"^\d{2}(\u002e)(\d{3})(-\d{3})?$" "3.444-233"
|
||||
"^\d{2}(\u002e)(\d{3})(-\d{3})?$" "43.324444"
|
||||
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>c:\file.txt</0>"
|
||||
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>c:\folder\sub folder\file.txt</0>"
|
||||
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>\\network\folder\file.txt</0>" # TODO: \w in pattern
|
||||
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "C:" # TODO: \w in pattern
|
||||
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "C:\file.xls" # TODO: \w in pattern
|
||||
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "folder.txt" # TODO: \w in pattern
|
||||
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>c:\file.txt</0>" # TODO: debug
|
||||
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>c:\folder\sub folder\file.txt</0>" # TODO: debug
|
||||
#"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" G "<0>\\network\folder\file.txt</0>" # TODO: debug
|
||||
"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "C:"
|
||||
"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "C:\file.xls"
|
||||
"^(([a-zA-Z]:)|(\\{2}\w+)\$?)(\\(\w[\w ]*))+\.(txt|TXT)$" "folder.txt"
|
||||
"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G "<0>my.domain.com</0>"
|
||||
"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G "<0>regexlib.com</0>"
|
||||
"^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$" G "<0>big-reg.com</0>"
|
||||
|
@ -1265,12 +1738,12 @@
|
|||
"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" "1-555-5555"
|
||||
"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" "15553333"
|
||||
"^(1?(-?\d{3})-?)?(\d{3})(-?\d{4})$" "0-561-555-1212"
|
||||
#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>" G "<0><input type = text name = "bob"></0>" # TODO: \w in pattern
|
||||
#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>" G "<0><select name = "fred"></0>" # TODO: \w in pattern
|
||||
#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>" G "<0><form</0>" # TODO: \w in pattern
|
||||
#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>" "<input type = submit>" # TODO: \w in pattern
|
||||
#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>" "<font face = "arial">" # TODO: \w in pattern
|
||||
#"<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>" "The drity brown fox stank like" # TODO: \w in pattern
|
||||
'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' G '<0><input type = text name = "bob"></0>'
|
||||
'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' G '<0><select name = "fred"></0>'
|
||||
#'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' G '<0><form></0>' #TODO: Debug
|
||||
'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' "<input type = submit>" # TODO: \w in pattern
|
||||
'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' '<font face = "arial">' # TODO: \w in pattern
|
||||
'<[^>]*name[\s]*=[\s]*"?[^\w_]*"?[^>]*>' "The drity brown fox stank like"
|
||||
"^(1|01|2|02|3|03|4|04|5|05|6|06|7|07|8|08|9|09|10|11|12{1,2}):(([0-5]{1}[0-9]{1}\s{0,1})([AM|PM|am|pm]{2,2}))\W{0}$" G "<0>1:00 AM</0>"
|
||||
"^(1|01|2|02|3|03|4|04|5|05|6|06|7|07|8|08|9|09|10|11|12{1,2}):(([0-5]{1}[0-9]{1}\s{0,1})([AM|PM|am|pm]{2,2}))\W{0}$" G "<0>12:00 PM</0>"
|
||||
"^(1|01|2|02|3|03|4|04|5|05|6|06|7|07|8|08|9|09|10|11|12{1,2}):(([0-5]{1}[0-9]{1}\s{0,1})([AM|PM|am|pm]{2,2}))\W{0}$" G "<0>1:00am</0>"
|
||||
|
@ -1495,9 +1968,9 @@
|
|||
"^(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])$" "10.57.98.23."
|
||||
"<img([^>]*[^/])>" G '<0><img src="bob"></0>'
|
||||
"<img([^>]*[^/])>" '<img src="bob" />'
|
||||
#"<!--[\s\S]*?-->" G "<0><!-- comments --></0>"
|
||||
#"<!--[\s\S]*?-->" G "<0><!-- x = a > b - 3 --></0>"
|
||||
#"<!--[\s\S]*?-->" "<COMMENTS>this is a comment</COMMENTS>"
|
||||
"<!--[\s\S]*?-->" G "<0><!-- comments --></0>"
|
||||
"<!--[\s\S]*?-->" G "<0><!-- x = a > b - 3 --></0>"
|
||||
"<!--[\s\S]*?-->" "<COMMENTS>this is a comment</COMMENTS>"
|
||||
"<!--[\p{Zs}\P{Zs}]*?-->" G "<0><!-- comments --></0>"
|
||||
"<!--[\p{Zs}\P{Zs}]*?-->" G "<0><!-- x = a > b - 3 --></0>"
|
||||
"<!--[\p{Zs}\P{Zs}]*?-->" "<COMMENTS>this is a comment</COMMENTS>"
|
||||
|
@ -1509,8 +1982,8 @@
|
|||
"(\{\\f\d*)\\([^;]+;)" G "<0>{\\f1\\fswiss\\fcharset0\\fprq2{\\*\\panose 020b0604020202020204}Arial;</0>"
|
||||
"(\{\\f\d*)\\([^;]+;)" G "{\\f"
|
||||
"(\{\\f\d*)\\([^;]+;)" "{f0fs20 some text}"
|
||||
#"</?([a-zA-Z][-A-Za-z\d\.]{0,71})(\s+(\S+)(\s*=\s*([-\w\.]{1,1024}|"[^"]{0,1024}"|'[^']{0,1024}'))?)*\s*>" G "<0><IMG src='stars.gif' alt="space" height=1></0>" # TODO: \w in pattern
|
||||
#"</?([a-zA-Z][-A-Za-z\d\.]{0,71})(\s+(\S+)(\s*=\s*([-\w\.]{1,1024}|"[^"]{0,1024}"|'[^']{0,1024}'))?)*\s*>" "this is not a tag" # TODO: \w in pattern
|
||||
#"</?([a-zA-Z][-A-Za-z\d\.]{0,71})(\s+(\S+)(\s*=\s*([-\w\.]{1,1024}|"[^"]{0,1024}"|'[^']{0,1024}'))?)*\s*>" G '<0><IMG src='stars.gif' alt="space" height=1></0>' # TODO: Can't quote this pattern with the test syntax!
|
||||
#"</?([a-zA-Z][-A-Za-z\d\.]{0,71})(\s+(\S+)(\s*=\s*([-\w\.]{1,1024}|"[^"]{0,1024}"|'[^']{0,1024}'))?)*\s*>" "this is not a tag"
|
||||
"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" G "<0>12/30/2002</0>"
|
||||
"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" G "<0>01/12/1998 13:30</0>"
|
||||
"^([0]\d|[1][0-2])\/([0-2]\d|[3][0-1])\/([2][01]|[1][6-9])\d{2}(\s([0-1]\d|[2][0-3])(\:[0-5]\d){1,2})?$" G "<0>01/28/2002 22:35:00</0>"
|
||||
|
@ -1586,10 +2059,10 @@
|
|||
"^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$" "bad.bad.gif"
|
||||
"^[0-9A-Za-z_ ]+(.[jJ][pP][gG]|.[gG][iI][fF])$" "slash\gif."
|
||||
"<[^>\s]*\bauthor\b[^>]*>" G '<0><author name="Daniel"></0>'
|
||||
#"<[^>\s]*\bauthor\b[^>]*>" G "<0></sch:author></0>"
|
||||
#"<[^>\s]*\bauthor\b[^>]*>" G '<0><pp:author name="Daniel"</0>'
|
||||
"<[^>\s]*\bauthor\b[^>]*>" G "<0></sch:author></0>"
|
||||
# "<[^>\s]*\bauthor\b[^>]*>" G '<0><pp:author name="Daniel"</0>' #Debug should work
|
||||
"<[^> ]*\bauthor\b[^>]*>" G "<0></sch:author></0>"
|
||||
"<[^> ]*\bauthor\b[^>]*>" G '<0><pp:author name="Daniel"></0>'
|
||||
"<[^> ]*\bauthor\b[^>]*>" G '<0><pp:author name="Daniel"></0>'
|
||||
"<[^>\s]*\bauthor\b[^>]*>" "<other>"
|
||||
"<[^>\s]*\bauthor\b[^>]*>" "</authors>"
|
||||
"<[^>\s]*\bauthor\b[^>]*>" "<work>author</work>"
|
||||
|
@ -1625,15 +2098,15 @@
|
|||
"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" "0"
|
||||
"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" "0.0"
|
||||
"(^\d*\.?\d*[1-9]+\d*$)|(^[1-9]+\d*\.\d*$)" ".0"
|
||||
#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" G "<0>Sacramento</0>" #TODO: Octal
|
||||
#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" G "<0>San Francisco</0>"
|
||||
#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" G "<0>San Luis Obispo</0>"
|
||||
#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "SanFrancisco"
|
||||
#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "SanLuisObispo"
|
||||
#"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "San francisco"
|
||||
#"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" G "<0>{e02ff0e4-00ad-090A-c030-0d00a0008ba0}</0>"
|
||||
#"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" G "<0>e02ff0e4-00ad-090A-c030-0d00a0008ba0</0>"
|
||||
#"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" "0xe02ff0e400ad090Ac0300d00a0008ba0"
|
||||
"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" G "<0>Sacramento</0>"
|
||||
"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "<0><2>San Francisco</2></0>"
|
||||
"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "<0><3>San Luis Obispo</3></0>"
|
||||
"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "SanFrancisco"
|
||||
"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "SanLuisObispo"
|
||||
"^([A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^([A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,}\040[A-Z]{1}[a-z]{1,})$|^$" "San francisco"
|
||||
"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" G "<0>{e02ff0e4-00ad-090A-c030-0d00a0008ba0}</0>"
|
||||
"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" G "<0>e02ff0e4-00ad-090A-c030-0d00a0008ba0</0>"
|
||||
"^\{?[a-fA-F\d]{8}-([a-fA-F\d]{4}-){3}[a-fA-F\d]{12}\}?$" "0xe02ff0e400ad090Ac0300d00a0008ba0"
|
||||
"^\{?[a-fA-F0-9]{8}-([a-fA-F0-9]{4}-){3}[a-fA-F0-9]{12}\}?$" G "<0>{e02ff0e4-00ad-090A-c030-0d00a0008ba0}</0>"
|
||||
"^\{?[a-fA-F0-9]{8}-([a-fA-F0-9]{4}-){3}[a-fA-F0-9]{12}\}?$" G "<0>e02ff0e4-00ad-090A-c030-0d00a0008ba0</0>"
|
||||
"^\{?[a-fA-F0-9]{8}-([a-fA-F0-9]{4}-){3}[a-fA-F0-9]{12}\}?$" "0xe02ff0e400ad090Ac0300d00a0008ba0"
|
||||
|
@ -1682,15 +2155,15 @@
|
|||
"^((0[1-9])|(1[0-2]))\/(\d{2})$" G "<0>01/04</0>"
|
||||
"^((0[1-9])|(1[0-2]))\/(\d{2})$" "13/03"
|
||||
"^((0[1-9])|(1[0-2]))\/(\d{2})$" "10/2003"
|
||||
#"<script[^>]*>[\w|\t|\r|\W]*</script>" G "<0><script language=javascript>document.write("one");</script></0>" # TODO: \w in pattern
|
||||
#"<script[^>]*>[\w|\t|\r|\W]*</script>" "--" # TODO: \w in pattern
|
||||
#"<script[^>]*>[\w|\t|\r|\W]*</script>" "A-Z][a-z]+" # TODO: \w in pattern
|
||||
#"<script[^>]*>[\w|\t|\r|\W]*</script>" G "<0>strFirstName</0>" # TODO: \w in pattern
|
||||
#"<script[^>]*>[\w|\t|\r|\W]*</script>" G "<0>intAgeInYears</0>" # TODO: \w in pattern
|
||||
#"<script[^>]*>[\w|\t|\r|\W]*</script>" G "<0>Where the Wild Things Are</0>" # TODO: \w in pattern
|
||||
#"<script[^>]*>[\w|\t|\r|\W]*</script>" "123" # TODO: \w in pattern
|
||||
#"<script[^>]*>[\w|\t|\r|\W]*</script>" "abc" # TODO: \w in pattern
|
||||
#"<script[^>]*>[\w|\t|\r|\W]*</script>" "this has no caps in it" # TODO: \w in pattern
|
||||
"<script[^>]*>[\w|\t|\r|\W]*</script>" G '<0><script language=javascript>document.write("one");</script></0>'
|
||||
"<script[^>]*>[\w|\t|\r|\W]*</script>" "--"
|
||||
"<script[^>]*>[\w|\t|\r|\W]*</script>" "A-Z][a-z]+"
|
||||
#"<script[^>]*>[\w|\t|\r|\W]*</script>" G "<0>strFirstName</0>" # Test Case damaged?
|
||||
#"<script[^>]*>[\w|\t|\r|\W]*</script>" G "<0>intAgeInYears</0>" # Test Case damaged?
|
||||
#"<script[^>]*>[\w|\t|\r|\W]*</script>" G "<0>Where the Wild Things Are</0>" # Test Case damaged?
|
||||
"<script[^>]*>[\w|\t|\r|\W]*</script>" "123"
|
||||
"<script[^>]*>[\w|\t|\r|\W]*</script>" "abc"
|
||||
"<script[^>]*>[\w|\t|\r|\W]*</script>" "this has no caps in it"
|
||||
"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" G "<0>-0.050</0>"
|
||||
"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" G "<0>-5.000</0>"
|
||||
"(^-\d*\.?\d*[1-9]+\d*$)|(^-[1-9]+\d*\.\d*$)" G "<0>-5</0>"
|
||||
|
@ -1725,12 +2198,12 @@
|
|||
"^.{4,8}$" "asd"
|
||||
"^.{4,8}$" "123"
|
||||
"^.{4,8}$" "asdfe12345"
|
||||
#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.com</0>" # TODO: \w in pattern
|
||||
#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.com.au</ # TODO: \w in pattern0>"
|
||||
#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.au</0>" # TODO: \w in pattern
|
||||
#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "word" # TODO: \w in pattern
|
||||
#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "word@" # TODO: \w in pattern
|
||||
#"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "@word" # TODO: \w in pattern
|
||||
"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.com</0>"
|
||||
"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.com.au</0>"
|
||||
"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" G "<0>a@a.au</0>"
|
||||
"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "word"
|
||||
"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "word@"
|
||||
"^[\w\.=-]+@[\w\.-]+\.[\w]{2,3}$" "@word"
|
||||
"^\d{5}-\d{4}$" G "<0>22222-3333</0>"
|
||||
"^\d{5}-\d{4}$" G "<0>34545-2367</0>"
|
||||
"^\d{5}-\d{4}$" G "<0>56334-2343</0>"
|
||||
|
@ -1795,22 +2268,22 @@
|
|||
"^[12345]$" "6"
|
||||
"^[12345]$" "-1"
|
||||
"^[12345]$" "abc"
|
||||
#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@aol.com</0>" # TODO: \w in pattern
|
||||
#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@wrox.co.uk</0>" # TODO: \w in pattern
|
||||
#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@domain.info</0>" # TODO: \w in pattern
|
||||
#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "a@b" # TODO: \w in pattern
|
||||
#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "notanemail" # TODO: \w in pattern
|
||||
#"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "joe@@." # TODO: \w in pattern
|
||||
"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@aol.com</0>"
|
||||
"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@wrox.co.uk</0>"
|
||||
"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" G "<0>joe@domain.info</0>"
|
||||
"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "a@b"
|
||||
"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "notanemail"
|
||||
"^[\w-\.]+@([\w-]+\.)+[\w-]{2,4}$" "joe@@."
|
||||
"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" G "<0>joe@aol.com</0>"
|
||||
"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" G "<0>ssmith@aspalliance.com</0>"
|
||||
"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" G "<0>a@b.cc</0>"
|
||||
"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" "joe@123aspx.com"
|
||||
"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" "joe@web.info"
|
||||
"^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$" "joe@company.co.uk"
|
||||
#"[\w-]+@([\w-]+\.)+[\w-]+" G "<0>joe@aol.com</0>" # TODO: \w in pattern
|
||||
#"[\w-]+@([\w-]+\.)+[\w-]+" G "<0>a@b.c</0>" # TODO: \w in pattern
|
||||
#"[\w-]+@([\w-]+\.)+[\w-]+" "asdf" # TODO: \w in pattern
|
||||
#"[\w-]+@([\w-]+\.)+[\w-]+" "1234" # TODO: \w in pattern
|
||||
"[\w-]+@([\w-]+\.)+[\w-]+" G "<0>joe@aol.com</0>"
|
||||
"[\w-]+@([\w-]+\.)+[\w-]+" G "<0>a@b.c</0>"
|
||||
"[\w-]+@([\w-]+\.)+[\w-]+" "asdf"
|
||||
"[\w-]+@([\w-]+\.)+[\w-]+" "1234"
|
||||
"\d{4}-?\d{4}-?\d{4}-?\d{4}" G "<0>1234-1234-1234-1234</0>"
|
||||
"\d{4}-?\d{4}-?\d{4}-?\d{4}" G "<0>1234123412341234</0>"
|
||||
"\d{4}-?\d{4}-?\d{4}-?\d{4}" "1234123412345"
|
||||
|
|
Loading…
Add table
Reference in a new issue