ICU-105 Regular Expressions, ongoing development

X-SVN-Rev: 10180
This commit is contained in:
Andy Heninger 2002-11-07 02:34:46 +00:00
parent f8f62de907
commit 2d39fda4e3
14 changed files with 286 additions and 205 deletions

View file

@ -1833,10 +1833,13 @@ static const char * const
_uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
"U_REGEX_ERROR_START",
"U_REGEX_INTERNAL_ERROR",
"U_REGEX_RULE_SYNTAX",
"U_REGEX_INVALID_STATE",
"U_REGEX_BAD_ESCAPE_SEQUENCE",
"U_REGEX_PROPERTY_SYNTAX",
"U_REGEX_UNIMPLEMENTED"
"U_REGEX_UNIMPLEMENTED",
"U_REGEX_MISMATCHED_PAREN",
"U_REGEX_MATCH_MODE_ERROR"
};
U_CAPI const char * U_EXPORT2
@ -1852,7 +1855,7 @@ u_errorName(UErrorCode code) {
} else if (U_BRK_ERROR_START <= code && code < U_BRK_ERROR_LIMIT){
return _uBrkErrorName[code - U_BRK_ERROR_START];
} else if (U_REGEX_ERROR_START <= code && code < U_REGEX_ERROR_LIMIT) {
return _uBrkErrorName[code - U_REGEX_ERROR_START];
return _uRegexErrorName[code - U_REGEX_ERROR_START];
} else {
return "[BOGUS UErrorCode]";
}

View file

@ -52,6 +52,7 @@
# endif
# define UCONFIG_NO_FORMATTING 1
# define UCONFIG_NO_TRANSLITERATION 1
# define UCONFIG_NO_REGULAR_EXPRESSIONS 1
#endif
/* common library switches -------------------------------------------------- */
@ -114,5 +115,16 @@
# define UCONFIG_NO_TRANSLITERATION 0
#endif
/**
* \def UCONFIG_NO_REGULAR_EXPRESSIONS
* This switch turns off regular expressions.
*
* @draft ICU 2.6
*/
#ifndef UCONFIG_NO_REGULAR_EXPRESSIONS
# define UCONFIG_NO_REGULAR_EXPRESSIONS 0
#endif
#endif

View file

@ -502,10 +502,13 @@ typedef enum UErrorCode {
*/
U_REGEX_ERROR_START=0x10300,
U_REGEX_INTERNAL_ERROR,
U_REGEX_RULE_SYNTAX,
U_REGEX_INVALID_STATE,
U_REGEX_BAD_ESCAPE_SEQUENCE,
U_REGEX_PROPERTY_SYNTAX,
U_REGEX_UNIMPLEMENTED,
U_REGEX_MISMATCHED_PAREN,
U_REGEX_MATCH_MODE_ERROR,
U_REGEX_ERROR_LIMIT,
U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */

View file

@ -5,11 +5,14 @@
// Copyright (C) 2002, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains the ICU regular expression scanner, which is responsible
// for preprocessing a regular expression pattern into the tokenized form that
// This file contains the ICU regular expression compiler, which is responsible
// for processing a regular expression pattern into the compiled form that
// is used by the match finding engine.
//
#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "unicode/unistr.h"
#include "unicode/uniset.h"
@ -18,18 +21,18 @@
#include "unicode/parsepos.h"
#include "unicode/parseerr.h"
#include "unicode/regex.h"
#include "regeximp.h"
#include "uprops.h"
#include "cmemory.h"
#include "cstring.h"
#include "uassert.h"
#include "stdio.h" // TODO: Get rid of this
#include "regeximp.h"
#include "regexcst.h" // Contains state table for the regex pattern parser.
// generated by a Perl script.
#include "regexcmp.h"
#include "uassert.h"
U_NAMESPACE_BEGIN
@ -52,10 +55,10 @@ static const int RESCAN_DEBUG = 0;
// Characters that have no special meaning, and thus do not need to be escaped. Expressed
// as the inverse of those needing escaping -- [^\*\?\+\[\(\)\{\}\^\$\|\\\.]
static const UChar gRuleSet_rule_char_pattern[] = {
static const UChar gRuleSet_rule_char_pattern[] = {
// [ ^ \ * \ ? \ + \ [ \ ( / )
0x5b, 0x5e, 0x5c, 0x2a, 0x5c, 0x3f, 0x5c, 0x2b, 0x5c, 0x5b, 0x5c, 0x28, 0x5c, 0x29,
// \ { \ } \ ^ \ $ \ | \ \ \ . ]
0x5b, 0x5e, 0x5c, 0x2a, 0x5c, 0x3f, 0x5c, 0x2b, 0x5c, 0x5b, 0x5c, 0x28, 0x5c, 0x29,
// \ { \ } \ ^ \ $ \ | \ \ \ . ]
0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0};
@ -72,7 +75,7 @@ static UnicodeSet *gUnescapeCharSet;
// will handle.
//
static const UChar gUnescapeCharPattern[] = {
// [ a c e f n r t u U ]
// [ a c e f n r t u U ]
0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x5d, 0};
@ -123,7 +126,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
//
// Set up the constant (static) Unicode Sets.
//
//
if (gRuleSets[kRuleSet_rule_char-128] == NULL) {
// TODO: Make thread safe.
// TODO: Memory Cleanup on ICU shutdown.
@ -131,8 +134,8 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
gRuleSets[kRuleSet_white_space-128] = (UnicodeSet*) uprv_openRuleWhiteSpaceSet(&status);
gRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(gRuleSet_digit_char_pattern, status);
gUnescapeCharSet = new UnicodeSet(gUnescapeCharPattern, status);
gPropSets[URX_ISWORD_SET] = new UnicodeSet(gIsWordPattern, status);
gPropSets[URX_ISSPACE_SET] = new UnicodeSet(gIsSpacePattern, status);
gPropSets[URX_ISWORD_SET] = new UnicodeSet(gIsWordPattern, status);
gPropSets[URX_ISSPACE_SET] = new UnicodeSet(gIsSpacePattern, status);
if (U_FAILURE(status)) {
delete gRuleSets[kRuleSet_rule_char-128];
@ -171,7 +174,7 @@ RegexCompile::~RegexCompile() {
// script regexcst.pl
//
//---------------------------------------------------------------------------------
void RegexCompile::compile(
void RegexCompile::compile(
RegexPattern &rxp, // User level pattern object to receive
// the compiled pattern.
const UnicodeString &pat, // Source pat to be compiled.
@ -285,7 +288,7 @@ void RegexCompile::compile(
if (tableEl->fPushState != 0) {
fStackPtr++;
if (fStackPtr >= kStackSize) {
error(U_BRK_INTERNAL_ERROR);
error(U_REGEX_INTERNAL_ERROR);
printf("RegexCompile::parse() - state stack overflow.\n");
fStackPtr--;
}
@ -304,7 +307,7 @@ void RegexCompile::compile(
state = fStack[fStackPtr];
fStackPtr--;
if (fStackPtr < 0) {
error(U_BRK_INTERNAL_ERROR);
error(U_REGEX_INTERNAL_ERROR);
printf("RegexCompile::compile() - state stack underflow.\n");
fStackPtr++;
}
@ -358,12 +361,16 @@ UBool RegexCompile::doParseActions(EParseAction action)
// Encountering end of pattern also behaves like a close paren,
// and forces fixups of the State Save at the beginning of the compiled pattern
// and of any OR operations at the top level.
//
//
handleCloseParen();
if (fParenStack.size() > 0) {
// Missing close paren in pattern.
error(U_REGEX_MISMATCHED_PAREN);
}
// add the END operation to the compiled pattern.
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus);
// Terminate the pattern compilation state machine.
returnVal = FALSE;
break;
@ -405,7 +412,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
case doOpenCaptureParen:
// Open Paren.
// Compile to a
// Compile to a
// - NOP, which later may be replaced by a save-state if the
// parenthesized group gets a * quantifier, followed by
// - START_CAPTURE
@ -430,7 +437,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
case doOpenNonCaptureParen:
// Open non-capturing (grouping only) Paren.
// Compile to a
// Compile to a
// - NOP, which later may be replaced by a save-state if the
// parenthesized group gets a * quantifier, followed by
// - NOP, which may later be replaced by a save-state if there
@ -440,7 +447,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
// On the Parentheses stack, start a new frame and add the positions
// of the two NOPs.
// of the two NOPs.
fParenStack.push(-1, *fStatus); // Begin a new frame.
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
@ -475,6 +482,10 @@ UBool RegexCompile::doParseActions(EParseAction action)
case doCloseParen:
handleCloseParen();
if (fParenStack.size() <= 0) {
// Extra close paren, or missing open paren.
error(U_REGEX_MISMATCHED_PAREN);
}
break;
case doNOP:
@ -483,11 +494,16 @@ UBool RegexCompile::doParseActions(EParseAction action)
case doBadOpenParenType:
case doRuleError:
error(U_BRK_RULE_SYNTAX);
error(U_REGEX_RULE_SYNTAX);
returnVal = FALSE;
break;
case doMismatchedParenErr:
error(U_REGEX_MISMATCHED_PAREN);
returnVal = FALSE;
break;
case doPlus:
// Normal '+' compiles to
// 1. stuff to be repeated (already built)
@ -532,7 +548,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
// 3. ...
// Insert the state save into the compiled pattern, and we're done.
{
int32_t saveStateLoc = blockTopLoc(TRUE);
int32_t saveStateLoc = blockTopLoc(TRUE);
int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
}
@ -572,9 +588,9 @@ UBool RegexCompile::doParseActions(EParseAction action)
// 3. JMP 0
// 4. ...
//
{
{
// location of item #1, the STATE_SAVE
int32_t saveStateLoc = blockTopLoc(TRUE);
int32_t saveStateLoc = blockTopLoc(TRUE);
// Locate the position in the compiled pattern where the match will continue
// after completing the *. (4 in the comment above)
@ -599,7 +615,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
// 3. STATE_SAVE 2
// 4 ...
{
int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc);
int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
@ -607,7 +623,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
}
break;
case doStartString:
// We've just scanned a single "normal" character from the pattern,
@ -678,7 +694,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
// has only one character, emit the single character token instead.
{
int32_t strLength = fRXPat->fLiteralText.length() - fStringOpStart;
U_ASSERT(strLength > 0);
U_ASSERT(strLength > 0);
int32_t lastCharIdx = fRXPat->fLiteralText.length()-1;
lastCharIdx = fRXPat->fLiteralText.getChar32Start(lastCharIdx);
if (lastCharIdx == fStringOpStart) {
@ -735,7 +751,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
case doBackslashG:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
break;
break;
case doBackslashS:
fRXPat->fCompiledPat->addElement(
@ -750,31 +766,31 @@ UBool RegexCompile::doParseActions(EParseAction action)
case doBackslashW:
fRXPat->fCompiledPat->addElement(
URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET | URX_NEG_SET), *fStatus);
break;
break;
case doBackslashw:
fRXPat->fCompiledPat->addElement(
URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
break;
break;
case doBackslashX:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
break;
break;
case doBackslashx: // \x{abcd} alternate hex format
// TODO: implement
// TODO: implement
error(U_REGEX_UNIMPLEMENTED);
break;
case doBackslashZ:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
break;
break;
case doBackslashz:
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
break;
break;
case doExit:
returnVal = FALSE;
@ -806,12 +822,23 @@ UBool RegexCompile::doParseActions(EParseAction action)
break;
case doNamedChar: // \N{NAMED_CHAR}
// TODO: implement
// TODO: implement
error(U_REGEX_UNIMPLEMENTED);
break;
case doMatchMode: // (?i) and similar
// TODO: implement
error(U_REGEX_UNIMPLEMENTED);
break;
case doNotImplementedError:
// TODO: get rid of this once everything is implemented.
error(U_REGEX_UNIMPLEMENTED);
break;
default:
error(U_BRK_INTERNAL_ERROR);
error(U_REGEX_INTERNAL_ERROR);
returnVal = FALSE;
break;
}
@ -838,7 +865,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
//------------------------------------------------------------------------------
int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
int32_t theLoc;
if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
{
// The item just processed is a parenthesized block.
theLoc = fMatchOpenParen; // A slot is already reserved for us.
@ -878,8 +905,11 @@ int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
void RegexCompile::handleCloseParen() {
int32_t patIdx;
int32_t patOp;
U_ASSERT(fParenStack.size() >= 1);
if (fParenStack.size() <= 0) {
error(U_REGEX_MISMATCHED_PAREN);
return;
}
// Fixup any operations within the just-closed parenthesized group
// that need to reference the end of the (block).
// (The first one on popped from the stack is an unused slot for
@ -896,17 +926,17 @@ void RegexCompile::handleCloseParen() {
fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
fMatchOpenParen = patIdx;
}
// Do any additional fixups, depending on the specific kind of
// parenthesized grouping this is
switch (patIdx) {
case -1:
// No additional fixups required.
// This is the case with most kinds of groupings.
break;
case -2:
// Capturing Parentheses.
// Capturing Parentheses.
// Insert a End Capture op into the pattern.
// Grab the group number from the start capture op
// and put it into the end-capture op.
@ -1039,7 +1069,7 @@ UChar32 RegexCompile::nextCharLL() {
fLineNum++;
fCharNum=0;
if (fQuoteMode) {
error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
error(U_REGEX_RULE_SYNTAX);
fQuoteMode = FALSE;
}
}
@ -1120,7 +1150,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
// Use UnicodeString::unescapeAt() to handle those that it can.
// Otherwise just return the '\', and let the pattern parser deal with it.
//
int32_t startX = fNextIndex; // start and end positions of the
int32_t startX = fNextIndex; // start and end positions of the
int32_t endX = fNextIndex; // sequence following the '\'
if (c.fChar == chBackSlash) {
if (gUnescapeCharSet->contains(peekCharLL())) {
@ -1148,7 +1178,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
//
// The scan position is normally under the control of the state machine
// that controls pattern parsing. UnicodeSets, however, are parsed by
// the UnicodeSet constructor, not by the Regex pattern parser.
// the UnicodeSet constructor, not by the Regex pattern parser.
//
//---------------------------------------------------------------------------------
UnicodeSet *RegexCompile::scanSet() {
@ -1193,7 +1223,7 @@ UnicodeSet *RegexCompile::scanSet() {
//---------------------------------------------------------------------------------
//
// scanProp Construct a UnicodeSet from the text at the current scan
// position, which will be of the form \p{whaterver}
// position, which will be of the form \p{whatever}
//
// The scan position will be at the 'p' or 'P'. On return
// the scan position should be just after the '}'
@ -1240,6 +1270,5 @@ UnicodeSet *RegexCompile::scanProp() {
return uset;
};
U_NAMESPACE_END
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS

View file

@ -4,8 +4,10 @@
// Copyright (C) 2002, International Business Machines Corporation and others.
// All Rights Reserved.
//
// This file contains declarations for the class RegexCompile and for compiled
// regular expression data format
// This file contains declarations for the class RegexCompile
//
// This class is internal to the regular expression implementation.
// For the public Regular Expression API, see the file "unicode/regex.h"
//
@ -13,6 +15,8 @@
#define RBBISCAN_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "unicode/uobject.h"
#include "unicode/uniset.h"
#include "unicode/parseerr.h"
@ -28,12 +32,7 @@ static const UBool REGEX_DEBUG = TRUE;
//--------------------------------------------------------------------------------
//
// class RegexCompile does the lowest level, character-at-a-time
// scanning of a regular expression.
//
// The output of the scanner is a tokenized form
// of the RE, plus prebuilt UnicodeSet objects for each
// set of charcters that is referenced.
// class RegexCompile Contains the regular expression compiler.
//
//--------------------------------------------------------------------------------
static const int kStackSize = 100; // The size of the state stack for
@ -161,5 +160,5 @@ private:
};
U_NAMESPACE_END
#endif
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
#endif // RBBISCAN_H

View file

@ -24,7 +24,6 @@ U_NAMESPACE_BEGIN
enum Regex_PatternParseAction {
doCloseParen,
doProperty,
doTagValue,
doOrOperator,
doOpenCaptureParen,
doBadOpenParenType,
@ -35,6 +34,7 @@ enum Regex_PatternParseAction {
doNamedChar,
doBackslashw,
doPossesiveStar,
doMismatchedParenErr,
doOpenLookBehind,
doBackslashx,
doBackslashz,
@ -43,6 +43,7 @@ enum Regex_PatternParseAction {
doEnterQuoteMode,
doPossesivePlus,
doNGStar,
doMatchMode,
doOpenLookAheadNeg,
doPlus,
doOpenNonCaptureParen,
@ -51,14 +52,11 @@ enum Regex_PatternParseAction {
doNGPlus,
doPatFinish,
doBackslashD,
doIntervalMinValue,
doIntervalDigit,
doPossesiveOpt,
doBackslashG,
doOpt,
doOpenAtomicParen,
doBackslashS,
doNumberExpectedError,
doStringChar,
doOpenLookAhead,
doBackRef,
@ -74,6 +72,7 @@ enum Regex_PatternParseAction {
doBackslashb,
doEndString,
doBackslashd,
doNotImplementedError,
doOpenLookBehindNeg,
doSplitString,
rbbiLastAction};
@ -100,13 +99,13 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doStartString, 254, 13,0, TRUE} // 3 term
, {doStartString, 130, 13,0, TRUE} // 4
, {doScanUnicodeSet, 91 /* [ */, 20,0, TRUE} // 5
, {doNOP, 40 /* ( */, 27, 20, TRUE} // 6
, {doNOP, 40 /* ( */, 28, 20, TRUE} // 6
, {doDotAny, 46 /* . */, 20,0, TRUE} // 7
, {doCaret, 94 /* ^ */, 3,0, TRUE} // 8
, {doDollar, 36 /* $ */, 3,0, TRUE} // 9
, {doNOP, 92 /* \ */, 60,0, TRUE} // 10
, {doNOP, 92 /* \ */, 67,0, TRUE} // 10
, {doNOP, 253, 2,0, FALSE} // 11
, {doRuleError, 255, 80,0, FALSE} // 12
, {doRuleError, 255, 87,0, FALSE} // 12
, {doStringChar, 254, 13,0, TRUE} // 13 string
, {doStringChar, 130, 13,0, TRUE} // 14
, {doSplitString, 63 /* ? */, 20,0, FALSE} // 15
@ -114,67 +113,74 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doSplitString, 42 /* * */, 20,0, FALSE} // 17
, {doSplitString, 123 /* { */, 20,0, FALSE} // 18
, {doEndString, 255, 20,0, FALSE} // 19
, {doNOP, 42 /* * */, 41,0, TRUE} // 20 expr-quant
, {doNOP, 43 /* + */, 44,0, TRUE} // 21
, {doNOP, 63 /* ? */, 47,0, TRUE} // 22
, {doNOP, 255, 24,0, FALSE} // 23
, {doOrOperator, 124 /* | */, 3,0, TRUE} // 24 expr-cont
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 25
, {doNOP, 255, 3,0, FALSE} // 26
, {doNOP, 63 /* ? */, 29,0, TRUE} // 27 open-paren
, {doOpenCaptureParen, 255, 3, 20, FALSE} // 28
, {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE} // 29 open-paren-extended
, {doOpenAtomicParen, 62 /* > */, 3, 20, TRUE} // 30
, {doOpenLookAhead, 61 /* = */, 3, 24, TRUE} // 31
, {doOpenLookAheadNeg, 33 /* ! */, 3, 24, TRUE} // 32
, {doNOP, 60 /* < */, 36,0, TRUE} // 33
, {doNOP, 35 /* # */, 39,0, TRUE} // 34
, {doBadOpenParenType, 255, 80,0, FALSE} // 35
, {doOpenLookBehind, 61 /* = */, 3, 24, TRUE} // 36 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 3, 24, TRUE} // 37
, {doBadOpenParenType, 255, 80,0, FALSE} // 38
, {doNOP, 41 /* ) */, 3,0, TRUE} // 39 paren-comment
, {doNOP, 255, 39,0, TRUE} // 40
, {doNGStar, 63 /* ? */, 24,0, TRUE} // 41 quant-star
, {doPossesiveStar, 43 /* + */, 24,0, TRUE} // 42
, {doStar, 255, 24,0, FALSE} // 43
, {doNGPlus, 63 /* ? */, 24,0, TRUE} // 44 quant-plus
, {doPossesivePlus, 43 /* + */, 24,0, TRUE} // 45
, {doPlus, 255, 24,0, FALSE} // 46
, {doNGOpt, 63 /* ? */, 24,0, TRUE} // 47 quant-opt
, {doPossesiveOpt, 43 /* + */, 24,0, TRUE} // 48
, {doOpt, 255, 24,0, FALSE} // 49
, {doNOP, 129, 50,0, TRUE} // 50 interval-open
, {doIntervalMinValue, 128, 53,0, FALSE} // 51
, {doNumberExpectedError, 255, 80,0, FALSE} // 52
, {doNOP, 129, 57,0, TRUE} // 53 interval-value
, {doNOP, 125 /* } */, 57,0, FALSE} // 54
, {doIntervalDigit, 128, 53,0, TRUE} // 55
, {doNumberExpectedError, 255, 80,0, FALSE} // 56
, {doNOP, 129, 57,0, TRUE} // 57 interval-close
, {doTagValue, 125 /* } */, 24,0, TRUE} // 58
, {doNumberExpectedError, 255, 80,0, FALSE} // 59
, {doBackslashA, 65 /* A */, 3,0, TRUE} // 60 backslash
, {doBackslashB, 66 /* B */, 3,0, TRUE} // 61
, {doBackslashb, 98 /* b */, 3,0, TRUE} // 62
, {doBackslashd, 100 /* d */, 20,0, TRUE} // 63
, {doBackslashD, 68 /* D */, 20,0, TRUE} // 64
, {doBackslashG, 71 /* G */, 3,0, TRUE} // 65
, {doNamedChar, 78 /* N */, 20,0, TRUE} // 66
, {doProperty, 112 /* p */, 20,0, FALSE} // 67
, {doProperty, 80 /* P */, 20,0, FALSE} // 68
, {doEnterQuoteMode, 81 /* Q */, 3,0, TRUE} // 69
, {doBackslashS, 83 /* S */, 20,0, TRUE} // 70
, {doBackslashs, 115 /* s */, 20,0, TRUE} // 71
, {doBackslashW, 87 /* W */, 20,0, TRUE} // 72
, {doBackslashw, 119 /* w */, 20,0, TRUE} // 73
, {doBackslashX, 88 /* X */, 20,0, TRUE} // 74
, {doBackslashx, 120 /* x */, 20,0, TRUE} // 75
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 76
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 77
, {doBackRef, 128, 20,0, TRUE} // 78
, {doStartString, 255, 13,0, TRUE} // 79
, {doExit, 255, 80,0, TRUE} // 80 errorDeath
, {doNOP, 42 /* * */, 56,0, TRUE} // 20 expr-quant
, {doNOP, 43 /* + */, 59,0, TRUE} // 21
, {doNOP, 63 /* ? */, 62,0, TRUE} // 22
, {doNOP, 123 /* { */, 65,0, TRUE} // 23
, {doNOP, 255, 25,0, FALSE} // 24
, {doOrOperator, 124 /* | */, 3,0, TRUE} // 25 expr-cont
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 26
, {doNOP, 255, 3,0, FALSE} // 27
, {doNOP, 63 /* ? */, 30,0, TRUE} // 28 open-paren
, {doOpenCaptureParen, 255, 3, 20, FALSE} // 29
, {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE} // 30 open-paren-extended
, {doOpenAtomicParen, 62 /* > */, 3, 20, TRUE} // 31
, {doOpenLookAhead, 61 /* = */, 3, 25, TRUE} // 32
, {doOpenLookAheadNeg, 33 /* ! */, 3, 25, TRUE} // 33
, {doNOP, 60 /* < */, 42,0, TRUE} // 34
, {doNOP, 35 /* # */, 45,0, TRUE} // 35
, {doMatchMode, 105 /* i */, 48,0, TRUE} // 36
, {doMatchMode, 120 /* x */, 48,0, TRUE} // 37
, {doMatchMode, 115 /* s */, 48,0, TRUE} // 38
, {doMatchMode, 109 /* m */, 48,0, TRUE} // 39
, {doMatchMode, 45 /* - */, 48,0, TRUE} // 40
, {doBadOpenParenType, 255, 87,0, FALSE} // 41
, {doOpenLookBehind, 61 /* = */, 3, 25, TRUE} // 42 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 3, 25, TRUE} // 43
, {doBadOpenParenType, 255, 87,0, FALSE} // 44
, {doNOP, 41 /* ) */, 3,0, TRUE} // 45 paren-comment
, {doMismatchedParenErr, 253, 87,0, FALSE} // 46
, {doNOP, 255, 45,0, TRUE} // 47
, {doMatchMode, 105 /* i */, 48,0, TRUE} // 48 paren-flag
, {doMatchMode, 115 /* s */, 48,0, TRUE} // 49
, {doMatchMode, 109 /* m */, 48,0, TRUE} // 50
, {doMatchMode, 120 /* x */, 48,0, TRUE} // 51
, {doMatchMode, 45 /* - */, 48,0, TRUE} // 52
, {doNOP, 41 /* ) */, 3,0, TRUE} // 53
, {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE} // 54
, {doNOP, 255, 87,0, FALSE} // 55
, {doNGStar, 63 /* ? */, 25,0, TRUE} // 56 quant-star
, {doPossesiveStar, 43 /* + */, 25,0, TRUE} // 57
, {doStar, 255, 25,0, FALSE} // 58
, {doNGPlus, 63 /* ? */, 25,0, TRUE} // 59 quant-plus
, {doPossesivePlus, 43 /* + */, 25,0, TRUE} // 60
, {doPlus, 255, 25,0, FALSE} // 61
, {doNGOpt, 63 /* ? */, 25,0, TRUE} // 62 quant-opt
, {doPossesiveOpt, 43 /* + */, 25,0, TRUE} // 63
, {doOpt, 255, 25,0, FALSE} // 64
, {doNOP, 129, 65,0, TRUE} // 65 interval-open
, {doNotImplementedError, 255, 87,0, FALSE} // 66
, {doBackslashA, 65 /* A */, 3,0, TRUE} // 67 backslash
, {doBackslashB, 66 /* B */, 3,0, TRUE} // 68
, {doBackslashb, 98 /* b */, 3,0, TRUE} // 69
, {doBackslashd, 100 /* d */, 20,0, TRUE} // 70
, {doBackslashD, 68 /* D */, 20,0, TRUE} // 71
, {doBackslashG, 71 /* G */, 3,0, TRUE} // 72
, {doNamedChar, 78 /* N */, 20,0, TRUE} // 73
, {doProperty, 112 /* p */, 20,0, FALSE} // 74
, {doProperty, 80 /* P */, 20,0, FALSE} // 75
, {doEnterQuoteMode, 81 /* Q */, 3,0, TRUE} // 76
, {doBackslashS, 83 /* S */, 20,0, TRUE} // 77
, {doBackslashs, 115 /* s */, 20,0, TRUE} // 78
, {doBackslashW, 87 /* W */, 20,0, TRUE} // 79
, {doBackslashw, 119 /* w */, 20,0, TRUE} // 80
, {doBackslashX, 88 /* X */, 20,0, TRUE} // 81
, {doBackslashx, 120 /* x */, 20,0, TRUE} // 82
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 83
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 84
, {doBackRef, 128, 20,0, TRUE} // 85
, {doStartString, 255, 13,0, TRUE} // 86
, {doExit, 255, 87,0, TRUE} // 87 errorDeath
};
static const char *RegexStateNames[] = { 0,
"start",
@ -199,6 +205,7 @@ static const char *RegexStateNames[] = { 0,
"expr-quant",
0,
0,
0,
0,
"expr-cont",
0,
@ -211,11 +218,25 @@ static const char *RegexStateNames[] = { 0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
"open-paren-lookbehind",
0,
0,
"paren-comment",
0,
0,
"paren-flag",
0,
0,
0,
0,
0,
0,
0,
"quant-star",
0,
@ -227,14 +248,6 @@ static const char *RegexStateNames[] = { 0,
0,
0,
"interval-open",
0,
0,
"interval-value",
0,
0,
0,
"interval-close",
0,
0,
"backslash",
0,

View file

@ -107,7 +107,8 @@ string:
expr-quant:
'*' n quant-star
'+' n quant-plus
'?' n quant-opt
'?' n quant-opt
'{' n interval-open
default expr-cont
@ -136,6 +137,11 @@ open-paren-extended:
'!' n term ^expr-cont doOpenLookAheadNeg # (?!
'<' n open-paren-lookbehind
'#' n paren-comment
'i' n paren-flag doMatchMode
'x' n paren-flag doMatchMode
's' n paren-flag doMatchMode
'm' n paren-flag doMatchMode
'-' n paren-flag doMatchMode
default errorDeath doBadOpenParenType
open-paren-lookbehind:
@ -150,7 +156,21 @@ open-paren-lookbehind:
#
paren-comment:
')' n term
eof errorDeath doMismatchedParenErr
default n paren-comment
#
# paren-flag Scanned a (?ismx-ismx flag setting thing
# TODO: this is not fully implemented yet.
paren-flag:
'i' n paren-flag doMatchMode
's' n paren-flag doMatchMode
'm' n paren-flag doMatchMode
'x' n paren-flag doMatchMode
'-' n paren-flag doMatchMode
')' n term
':' n term ^expr-quant doOpenNonCaptureParen
default errorDeath
#
@ -189,19 +209,8 @@ quant-opt:
#
interval-open:
white_space n interval-open
digit_char interval-value doIntervalMinValue
default errorDeath doNumberExpectedError
default errorDeath doNotImplementedError
interval-value:
white_space n interval-close
'}' interval-close
digit_char n interval-value doIntervalDigit
default errorDeath doNumberExpectedError
interval-close:
white_space n interval-close
'}' n expr-cont doTagValue
default errorDeath doNumberExpectedError

View file

@ -4,9 +4,9 @@
//
// file: regeximp.h
//
// ICU Regular Expressions, declarations of internal implementation types
// and constants that are common between the pattern compiler and the
// runtime execution engine.
// ICU Regular Expressions,
// Definitions of constant values used in the compiled form of
// a regular expression pattern.
//
#ifndef _REGEXIMP_H
@ -19,7 +19,7 @@
//
static const uint32_t URX_UNUSED1 = 1;
static const uint32_t URX_END = 2;
static const uint32_t URX_ONECHAR = 3;
static const uint32_t URX_ONECHAR = 3; // Value field is the 21 bit unicode char to match
static const uint32_t URX_STRING = 4; // Value field is index of string start
static const uint32_t URX_STRING_LEN = 5; // Value field is string length (code units)
static const uint32_t URX_STATE_SAVE = 6; // Value field is pattern position to push
@ -55,7 +55,7 @@ static const uint32_t URX_DOLLAR = 24; // Also for \Z
//
// Access to Unicode Sets for composite properties
// Access to Unicode Sets for Perl-like composite character properties
// The sets are accessed by the match engine for things like \w (word characters)
//
static const uint32_t URX_ISWORD_SET = 1;

View file

@ -1,6 +1,9 @@
//
// file: rematch.cpp
//
// Contains the implementation of class RegexMatcher,
// which is one of the main API classes for the ICU regular expression package.
//
/*
**********************************************************************
* Copyright (C) 2002 International Business Machines Corporation *
@ -9,6 +12,8 @@
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "unicode/regex.h"
#include "unicode/uniset.h"
#include "unicode/uchar.h"
@ -443,10 +448,11 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
// isWordBoundary
// in perl, "xab..cd..", \b is true at positions 0,3,5,7
// For us,
// If the current char is a combining mark, \b is FALSE
// Scan backwards to the first non-combining char
// Pos is a boundary if the current and previous chars are
// opposite in membership in \w set
// If the current char is a combining mark,
// \b is FALSE.
// Else Scan backwards to the first non-combining char.
// We are at a boundary if this char and the original chars are
// opposite in membership in \w set
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::isWordBoundary(int32_t pos) {
@ -486,27 +492,6 @@ UBool RegexMatcher::isWordBoundary(int32_t pos) {
}
//--------------------------------------------------------------------------------
//
// getCaptureText We have encountered a '\' that might preceed a
// capture group specification.
// If a valid capture group number follows the '\',
// return the indicies to the start & end of the captured
// text, and update the patIdx to the position following the
// \n sequence.
//
// This function is used during find and replace operations when
// processing caputure references in the replacement text.
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::getCaptureText(const UnicodeString &rep,
int32_t &repIdx,
int32_t &textStart,
int32_t &textEnd)
{
return FALSE;
}
//--------------------------------------------------------------------------------
//
// backTrack Within the match engine, this function is called when
@ -915,10 +900,9 @@ breakFromLoop:
const char RegexMatcher::fgClassID = 0;
U_NAMESPACE_END
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS

View file

@ -9,6 +9,9 @@
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "unicode/regex.h"
#include "uassert.h"
#include "uvector.h"
@ -66,6 +69,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
fBadState = other.fBadState;
fNumCaptureGroups = other.fNumCaptureGroups;
fMaxCaptureDigits = other.fMaxCaptureDigits;
fStaticSets = other.fStaticSets;
if (fBadState) {
return *this;
}
@ -110,6 +114,7 @@ void RegexPattern::init() {
fBadState = FALSE;
fNumCaptureGroups = 0;
fMaxCaptureDigits = 1; // TODO: calculate for real.
fStaticSets = NULL;
fMatcher = NULL;
UErrorCode status=U_ZERO_ERROR;
@ -384,15 +389,6 @@ int32_t RegexPattern::split(const UnicodeString &input,
//---------------------------------------------------------------------
//
// hashcode
//
//---------------------------------------------------------------------
int32_t RegexPattern::hashCode(void) const {
return 0; // TODO: Do something better here
};
//---------------------------------------------------------------------
//
@ -512,8 +508,8 @@ breakFromLoop:
printf("\n\n");
};
const char RegexPattern::fgClassID = 0;
U_NAMESPACE_END
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS

View file

@ -9,6 +9,9 @@
#define REGEX_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "unicode/uobject.h"
#include "unicode/unistr.h"
#include "unicode/parseerr.h"
@ -69,9 +72,6 @@ public:
RegexPattern &operator =(const RegexPattern &other);
virtual RegexPattern *clone() const;
// TODO: Do we really want a hashCode function on this class?
virtual int32_t hashCode(void) const;
/**
* Compiles the given regular expression into a pattern
@ -428,10 +428,6 @@ private:
//
void MatchAt(int32_t startIdx, UErrorCode &status);
inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
UBool getCaptureText(const UnicodeString &rep,
int32_t &repIdx,
int32_t &textStart,
int32_t &textEnd);
UBool isWordBoundary(int32_t pos); // perform the \b test
@ -448,7 +444,6 @@ private:
};
U_NAMESPACE_END
#endif // UCONFIG_NO_REGULAR_EXPRESSIONS
#endif

View file

@ -70,11 +70,13 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam
break;
case 3: name = "regex";
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
if (exec) {
logln("TestSuite Regex---"); logln();
RegexTest test;
callTest( test, par );
}
#endif
break;
case 4: name = "format";

View file

@ -11,6 +11,8 @@
//
#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "unicode/uchar.h"
#include "intltest.h"
#include "regextst.h"
@ -1195,8 +1197,38 @@ void RegexTest::Errors() {
REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
// Missing close parentheses
//REGEX_ERR("Comment (?# with no close", 1, 0, U_REGEX_INTERNAL_ERROR);
REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
// Extra close paren
REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
REGEX_ERR(")))))))", 1, 1, U_REGEX_RULE_SYNTAX);
REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
// Flag settings not yet implemented
REGEX_ERR("(?i:stuff*)", 1, 3, U_REGEX_UNIMPLEMENTED);
REGEX_ERR("(?-si) stuff", 1, 3, U_REGEX_UNIMPLEMENTED);
// Look-ahead, Look-behind
REGEX_ERR("abc(?=xyz).*", 1, 6, U_REGEX_UNIMPLEMENTED); // look-ahead
REGEX_ERR("abc(?!xyz).*", 1, 6, U_REGEX_UNIMPLEMENTED); // negated look-ahead
REGEX_ERR("abc(?<=xyz).*", 1, 7, U_REGEX_UNIMPLEMENTED); // look-behind
REGEX_ERR("abc(?<!xyz).*", 1, 7, U_REGEX_UNIMPLEMENTED); // negated look-behind
REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
// Atomic Grouping
REGEX_ERR("abc(?>xyz)", 1, 6, U_REGEX_UNIMPLEMENTED);
// {Numeric Quantifiers}
REGEX_ERR("abc{4}", 1, 5, U_REGEX_UNIMPLEMENTED);
// Quantifiers are allowed only after something that can be quantified.
REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
}
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */

View file

@ -8,6 +8,8 @@
#ifndef REGEXTST_H
#define REGEXTST_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
#include "intltest.h"
#include "unicode/regex.h"
@ -35,4 +37,6 @@ public:
virtual void regex_err(const char *pat, int32_t errline, int32_t errcol,
UErrorCode expectedStatus, int line);
};
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
#endif