ICU-2411 Regexp: added support for \x hex escapes in patterns.

X-SVN-Rev: 11862
This commit is contained in:
Andy Heninger 2003-05-09 16:55:08 +00:00
parent 800f4020cb
commit efc59e36a9
7 changed files with 30 additions and 31 deletions

View file

@ -1131,12 +1131,6 @@ UBool RegexCompile::doParseActions(EParseAction action)
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
break;
case doBackslashx: // \x{abcd} alternate hex format
// TODO: this is waiting for a decision on adding \x to unescape.
error(U_REGEX_UNIMPLEMENTED);
break;
case doBackslashZ:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);

View file

@ -37,7 +37,6 @@ enum Regex_PatternParseAction {
doPossesiveStar,
doMismatchedParenErr,
doOpenLookBehind,
doBackslashx,
doBackslashz,
doIntervalError,
doStar,
@ -117,7 +116,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 10
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11
, {doPatFinish, 253, 2,0, FALSE} // 12
, {doRuleError, 255, 101,0, FALSE} // 13
, {doRuleError, 255, 100,0, FALSE} // 13
, {doNOP, 42 /* * */, 57,0, TRUE} // 14 expr-quant
, {doNOP, 43 /* + */, 60,0, TRUE} // 15
, {doNOP, 63 /* ? */, 63,0, TRUE} // 16
@ -144,14 +143,14 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doBeginMatchMode, 115 /* s */, 49,0, FALSE} // 37
, {doBeginMatchMode, 120 /* x */, 49,0, FALSE} // 38
, {doBeginMatchMode, 45 /* - */, 49,0, FALSE} // 39
, {doConditionalExpr, 40 /* ( */, 101,0, TRUE} // 40
, {doPerlInline, 123 /* { */, 101,0, TRUE} // 41
, {doBadOpenParenType, 255, 101,0, FALSE} // 42
, {doConditionalExpr, 40 /* ( */, 100,0, TRUE} // 40
, {doPerlInline, 123 /* { */, 100,0, TRUE} // 41
, {doBadOpenParenType, 255, 100,0, FALSE} // 42
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 43 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 44
, {doBadOpenParenType, 255, 101,0, FALSE} // 45
, {doBadOpenParenType, 255, 100,0, FALSE} // 45
, {doNOP, 41 /* ) */, 255,0, TRUE} // 46 paren-comment
, {doMismatchedParenErr, 253, 101,0, FALSE} // 47
, {doMismatchedParenErr, 253, 100,0, FALSE} // 47
, {doNOP, 255, 46,0, TRUE} // 48
, {doMatchMode, 105 /* i */, 49,0, TRUE} // 49 paren-flag
, {doMatchMode, 109 /* m */, 49,0, TRUE} // 50
@ -160,7 +159,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doMatchMode, 45 /* - */, 49,0, TRUE} // 53
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 54
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 55
, {doNOP, 255, 101,0, FALSE} // 56
, {doNOP, 255, 100,0, FALSE} // 56
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 57 quant-star
, {doPossesiveStar, 43 /* + */, 20,0, TRUE} // 58
, {doStar, 255, 20,0, FALSE} // 59
@ -172,14 +171,14 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doOpt, 255, 20,0, FALSE} // 65
, {doNOP, 129, 66,0, TRUE} // 66 interval-open
, {doNOP, 128, 69,0, FALSE} // 67
, {doIntervalError, 255, 101,0, FALSE} // 68
, {doIntervalError, 255, 100,0, FALSE} // 68
, {doIntevalLowerDigit, 128, 69,0, TRUE} // 69 interval-lower
, {doNOP, 44 /* , */, 73,0, TRUE} // 70
, {doIntervalSame, 125 /* } */, 76,0, TRUE} // 71
, {doIntervalError, 255, 101,0, FALSE} // 72
, {doIntervalError, 255, 100,0, FALSE} // 72
, {doIntervalUpperDigit, 128, 73,0, TRUE} // 73 interval-upper
, {doNOP, 125 /* } */, 76,0, TRUE} // 74
, {doIntervalError, 255, 101,0, FALSE} // 75
, {doIntervalError, 255, 100,0, FALSE} // 75
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 76 interval-type
, {doPossesiveInterval, 43 /* + */, 20,0, TRUE} // 77
, {doInterval, 255, 20,0, FALSE} // 78
@ -198,14 +197,13 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doBackslashW, 87 /* W */, 14,0, TRUE} // 91
, {doBackslashw, 119 /* w */, 14,0, TRUE} // 92
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 93
, {doBackslashx, 120 /* x */, 14,0, TRUE} // 94
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 95
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 96
, {doOctal, 48 /* 0 */, 14,0, TRUE} // 97
, {doBackRef, 128, 14,0, TRUE} // 98
, {doEscapeError, 253, 101,0, FALSE} // 99
, {doLiteralChar, 255, 14,0, TRUE} // 100
, {doExit, 255, 101,0, TRUE} // 101 errorDeath
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 94
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 95
, {doOctal, 48 /* 0 */, 14,0, TRUE} // 96
, {doBackRef, 128, 14,0, TRUE} // 97
, {doEscapeError, 253, 100,0, FALSE} // 98
, {doLiteralChar, 255, 14,0, TRUE} // 99
, {doExit, 255, 100,0, TRUE} // 100 errorDeath
};
static const char * const RegexStateNames[] = { 0,
"start",
@ -306,7 +304,6 @@ static const char * const RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
"errorDeath",
0};

View file

@ -246,7 +246,6 @@ backslash:
'W' n expr-quant doBackslashW
'w' n expr-quant doBackslashw
'X' n expr-quant doBackslashX
'x' n expr-quant doBackslashx
'Z' n term doBackslashZ
'z' n term doBackslashz
'0' n expr-quant doOctal

View file

@ -68,8 +68,8 @@ static const UnicodeSet *gRuleDigits = NULL;
// will handle.
//
static const UChar gUnescapeCharPattern[] = {
// [ a c e f n r t u U ]
0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x5d, 0};
// [ a c e f n r t u U x ]
0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x78, 0x5d, 0};
//

View file

@ -768,7 +768,7 @@ public:
* field delimiters, is placed in the last destination string.
* @param status A reference to a UErrorCode to receive any errors.
* @return The number of fields into which the input string was split.
* @draft ICU 2.4
* @draft ICU 2.6
*/
virtual int32_t split(const UnicodeString &input,
UnicodeString dest[],

View file

@ -1232,7 +1232,7 @@ void RegexTest::Extended() {
//---------------------------------------------------------------------------
void RegexTest::Errors() {
// \escape sequences that aren't implemented yet.
REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
//REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
// Missing close parentheses
REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);

View file

@ -328,6 +328,15 @@
"ab([0-9]+|[A-Z]+)yz" "---<0>ab<1>AB</1>yz</0>+++"
"ab([0-9]+|[A-Z]+)yz" "---<0>ab<1>ABCDE</1>yz</0>+++"
#
# Hex format \x escaping
#
"ab\x63" "<0>abc</0>"
"ab\x09w" "<0>ab\u0009w</0>"
"ab\xabcdc" "<0>ab\u00abcdc</0>"
"ab\x{abcd}c" "<0>ab\uabcdc</0>"
"ab\x{101234}c" "<0>ab\U00101234c</0>"
"abα" "<0>abα</0>"
#
# Random debugging, Temporary