mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-105 Regular Expressions, ongoing development
X-SVN-Rev: 10180
This commit is contained in:
parent
f8f62de907
commit
2d39fda4e3
14 changed files with 286 additions and 205 deletions
|
@ -1833,10 +1833,13 @@ static const char * const
|
|||
_uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
|
||||
"U_REGEX_ERROR_START",
|
||||
"U_REGEX_INTERNAL_ERROR",
|
||||
"U_REGEX_RULE_SYNTAX",
|
||||
"U_REGEX_INVALID_STATE",
|
||||
"U_REGEX_BAD_ESCAPE_SEQUENCE",
|
||||
"U_REGEX_PROPERTY_SYNTAX",
|
||||
"U_REGEX_UNIMPLEMENTED"
|
||||
"U_REGEX_UNIMPLEMENTED",
|
||||
"U_REGEX_MISMATCHED_PAREN",
|
||||
"U_REGEX_MATCH_MODE_ERROR"
|
||||
};
|
||||
|
||||
U_CAPI const char * U_EXPORT2
|
||||
|
@ -1852,7 +1855,7 @@ u_errorName(UErrorCode code) {
|
|||
} else if (U_BRK_ERROR_START <= code && code < U_BRK_ERROR_LIMIT){
|
||||
return _uBrkErrorName[code - U_BRK_ERROR_START];
|
||||
} else if (U_REGEX_ERROR_START <= code && code < U_REGEX_ERROR_LIMIT) {
|
||||
return _uBrkErrorName[code - U_REGEX_ERROR_START];
|
||||
return _uRegexErrorName[code - U_REGEX_ERROR_START];
|
||||
} else {
|
||||
return "[BOGUS UErrorCode]";
|
||||
}
|
||||
|
|
|
@ -52,6 +52,7 @@
|
|||
# endif
|
||||
# define UCONFIG_NO_FORMATTING 1
|
||||
# define UCONFIG_NO_TRANSLITERATION 1
|
||||
# define UCONFIG_NO_REGULAR_EXPRESSIONS 1
|
||||
#endif
|
||||
|
||||
/* common library switches -------------------------------------------------- */
|
||||
|
@ -114,5 +115,16 @@
|
|||
# define UCONFIG_NO_TRANSLITERATION 0
|
||||
#endif
|
||||
|
||||
/**
|
||||
* \def UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
* This switch turns off regular expressions.
|
||||
*
|
||||
* @draft ICU 2.6
|
||||
*/
|
||||
#ifndef UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
# define UCONFIG_NO_REGULAR_EXPRESSIONS 0
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
|
|
@ -502,10 +502,13 @@ typedef enum UErrorCode {
|
|||
*/
|
||||
U_REGEX_ERROR_START=0x10300,
|
||||
U_REGEX_INTERNAL_ERROR,
|
||||
U_REGEX_RULE_SYNTAX,
|
||||
U_REGEX_INVALID_STATE,
|
||||
U_REGEX_BAD_ESCAPE_SEQUENCE,
|
||||
U_REGEX_PROPERTY_SYNTAX,
|
||||
U_REGEX_UNIMPLEMENTED,
|
||||
U_REGEX_MISMATCHED_PAREN,
|
||||
U_REGEX_MATCH_MODE_ERROR,
|
||||
U_REGEX_ERROR_LIMIT,
|
||||
|
||||
U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */
|
||||
|
|
|
@ -5,11 +5,14 @@
|
|||
// Copyright (C) 2002, International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains the ICU regular expression scanner, which is responsible
|
||||
// for preprocessing a regular expression pattern into the tokenized form that
|
||||
// This file contains the ICU regular expression compiler, which is responsible
|
||||
// for processing a regular expression pattern into the compiled form that
|
||||
// is used by the match finding engine.
|
||||
//
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
@ -18,18 +21,18 @@
|
|||
#include "unicode/parsepos.h"
|
||||
#include "unicode/parseerr.h"
|
||||
#include "unicode/regex.h"
|
||||
#include "regeximp.h"
|
||||
#include "uprops.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "uassert.h"
|
||||
|
||||
#include "stdio.h" // TODO: Get rid of this
|
||||
|
||||
#include "regeximp.h"
|
||||
#include "regexcst.h" // Contains state table for the regex pattern parser.
|
||||
// generated by a Perl script.
|
||||
#include "regexcmp.h"
|
||||
|
||||
#include "uassert.h"
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
@ -52,10 +55,10 @@ static const int RESCAN_DEBUG = 0;
|
|||
|
||||
// Characters that have no special meaning, and thus do not need to be escaped. Expressed
|
||||
// as the inverse of those needing escaping -- [^\*\?\+\[\(\)\{\}\^\$\|\\\.]
|
||||
static const UChar gRuleSet_rule_char_pattern[] = {
|
||||
static const UChar gRuleSet_rule_char_pattern[] = {
|
||||
// [ ^ \ * \ ? \ + \ [ \ ( \ )
|
||||
0x5b, 0x5e, 0x5c, 0x2a, 0x5c, 0x3f, 0x5c, 0x2b, 0x5c, 0x5b, 0x5c, 0x28, 0x5c, 0x29,
|
||||
// \ { \ } \ ^ \ $ \ | \ \ \ . ]
|
||||
0x5b, 0x5e, 0x5c, 0x2a, 0x5c, 0x3f, 0x5c, 0x2b, 0x5c, 0x5b, 0x5c, 0x28, 0x5c, 0x29,
|
||||
// \ { \ } \ ^ \ $ \ | \ \ \ . ]
|
||||
0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0};
|
||||
|
||||
|
||||
|
@ -72,7 +75,7 @@ static UnicodeSet *gUnescapeCharSet;
|
|||
// will handle.
|
||||
//
|
||||
static const UChar gUnescapeCharPattern[] = {
|
||||
// [ a c e f n r t u U ]
|
||||
// [ a c e f n r t u U ]
|
||||
0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x5d, 0};
|
||||
|
||||
|
||||
|
@ -123,7 +126,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
|
|||
|
||||
//
|
||||
// Set up the constant (static) Unicode Sets.
|
||||
//
|
||||
//
|
||||
if (gRuleSets[kRuleSet_rule_char-128] == NULL) {
|
||||
// TODO: Make thread safe.
|
||||
// TODO: Memory Cleanup on ICU shutdown.
|
||||
|
@ -131,8 +134,8 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status)
|
|||
gRuleSets[kRuleSet_white_space-128] = (UnicodeSet*) uprv_openRuleWhiteSpaceSet(&status);
|
||||
gRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(gRuleSet_digit_char_pattern, status);
|
||||
gUnescapeCharSet = new UnicodeSet(gUnescapeCharPattern, status);
|
||||
gPropSets[URX_ISWORD_SET] = new UnicodeSet(gIsWordPattern, status);
|
||||
gPropSets[URX_ISSPACE_SET] = new UnicodeSet(gIsSpacePattern, status);
|
||||
gPropSets[URX_ISWORD_SET] = new UnicodeSet(gIsWordPattern, status);
|
||||
gPropSets[URX_ISSPACE_SET] = new UnicodeSet(gIsSpacePattern, status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
delete gRuleSets[kRuleSet_rule_char-128];
|
||||
|
@ -171,7 +174,7 @@ RegexCompile::~RegexCompile() {
|
|||
// script regexcst.pl
|
||||
//
|
||||
//---------------------------------------------------------------------------------
|
||||
void RegexCompile::compile(
|
||||
void RegexCompile::compile(
|
||||
RegexPattern &rxp, // User level pattern object to receive
|
||||
// the compiled pattern.
|
||||
const UnicodeString &pat, // Source pat to be compiled.
|
||||
|
@ -285,7 +288,7 @@ void RegexCompile::compile(
|
|||
if (tableEl->fPushState != 0) {
|
||||
fStackPtr++;
|
||||
if (fStackPtr >= kStackSize) {
|
||||
error(U_BRK_INTERNAL_ERROR);
|
||||
error(U_REGEX_INTERNAL_ERROR);
|
||||
printf("RegexCompile::parse() - state stack overflow.\n");
|
||||
fStackPtr--;
|
||||
}
|
||||
|
@ -304,7 +307,7 @@ void RegexCompile::compile(
|
|||
state = fStack[fStackPtr];
|
||||
fStackPtr--;
|
||||
if (fStackPtr < 0) {
|
||||
error(U_BRK_INTERNAL_ERROR);
|
||||
error(U_REGEX_INTERNAL_ERROR);
|
||||
printf("RegexCompile::compile() - state stack underflow.\n");
|
||||
fStackPtr++;
|
||||
}
|
||||
|
@ -358,12 +361,16 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
// Encountering end of pattern also behaves like a close paren,
|
||||
// and forces fixups of the State Save at the beginning of the compiled pattern
|
||||
// and of any OR operations at the top level.
|
||||
//
|
||||
//
|
||||
handleCloseParen();
|
||||
|
||||
if (fParenStack.size() > 0) {
|
||||
// Missing close paren in pattern.
|
||||
error(U_REGEX_MISMATCHED_PAREN);
|
||||
}
|
||||
|
||||
// add the END operation to the compiled pattern.
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus);
|
||||
|
||||
|
||||
// Terminate the pattern compilation state machine.
|
||||
returnVal = FALSE;
|
||||
break;
|
||||
|
@ -405,7 +412,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
|
||||
case doOpenCaptureParen:
|
||||
// Open Paren.
|
||||
// Compile to a
|
||||
// Compile to a
|
||||
// - NOP, which later may be replaced by a save-state if the
|
||||
// parenthesized group gets a * quantifier, followed by
|
||||
// - START_CAPTURE
|
||||
|
@ -430,7 +437,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
|
||||
case doOpenNonCaptureParen:
|
||||
// Open non-capturing (grouping only) Paren.
|
||||
// Compile to a
|
||||
// Compile to a
|
||||
// - NOP, which later may be replaced by a save-state if the
|
||||
// parenthesized group gets a * quantifier, followed by
|
||||
// - NOP, which may later be replaced by a save-state if there
|
||||
|
@ -440,7 +447,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus);
|
||||
|
||||
// On the Parentheses stack, start a new frame and add the positions
|
||||
// of the two NOPs.
|
||||
// of the two NOPs.
|
||||
fParenStack.push(-1, *fStatus); // Begin a new frame.
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
|
||||
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
|
||||
|
@ -475,6 +482,10 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
|
||||
case doCloseParen:
|
||||
handleCloseParen();
|
||||
if (fParenStack.size() <= 0) {
|
||||
// Extra close paren, or missing open paren.
|
||||
error(U_REGEX_MISMATCHED_PAREN);
|
||||
}
|
||||
break;
|
||||
|
||||
case doNOP:
|
||||
|
@ -483,11 +494,16 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
|
||||
case doBadOpenParenType:
|
||||
case doRuleError:
|
||||
error(U_BRK_RULE_SYNTAX);
|
||||
error(U_REGEX_RULE_SYNTAX);
|
||||
returnVal = FALSE;
|
||||
break;
|
||||
|
||||
|
||||
case doMismatchedParenErr:
|
||||
error(U_REGEX_MISMATCHED_PAREN);
|
||||
returnVal = FALSE;
|
||||
break;
|
||||
|
||||
case doPlus:
|
||||
// Normal '+' compiles to
|
||||
// 1. stuff to be repeated (already built)
|
||||
|
@ -532,7 +548,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
// 3. ...
|
||||
// Insert the state save into the compiled pattern, and we're done.
|
||||
{
|
||||
int32_t saveStateLoc = blockTopLoc(TRUE);
|
||||
int32_t saveStateLoc = blockTopLoc(TRUE);
|
||||
int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
|
||||
fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
|
||||
}
|
||||
|
@ -572,9 +588,9 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
// 3. JMP 0
|
||||
// 4. ...
|
||||
//
|
||||
{
|
||||
{
|
||||
// location of item #1, the STATE_SAVE
|
||||
int32_t saveStateLoc = blockTopLoc(TRUE);
|
||||
int32_t saveStateLoc = blockTopLoc(TRUE);
|
||||
|
||||
// Locate the position in the compiled pattern where the match will continue
|
||||
// after completing the *. (4 in the comment above)
|
||||
|
@ -599,7 +615,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
// 3. STATE_SAVE 2
|
||||
// 4 ...
|
||||
{
|
||||
int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
|
||||
int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
|
||||
int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
|
||||
int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc);
|
||||
int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1);
|
||||
|
@ -607,7 +623,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus);
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
|
||||
case doStartString:
|
||||
// We've just scanned a single "normal" character from the pattern,
|
||||
|
@ -678,7 +694,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
// has only one character, emit the single character token instead.
|
||||
{
|
||||
int32_t strLength = fRXPat->fLiteralText.length() - fStringOpStart;
|
||||
U_ASSERT(strLength > 0);
|
||||
U_ASSERT(strLength > 0);
|
||||
int32_t lastCharIdx = fRXPat->fLiteralText.length()-1;
|
||||
lastCharIdx = fRXPat->fLiteralText.getChar32Start(lastCharIdx);
|
||||
if (lastCharIdx == fStringOpStart) {
|
||||
|
@ -735,7 +751,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
|
||||
case doBackslashG:
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus);
|
||||
break;
|
||||
break;
|
||||
|
||||
case doBackslashS:
|
||||
fRXPat->fCompiledPat->addElement(
|
||||
|
@ -750,31 +766,31 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
case doBackslashW:
|
||||
fRXPat->fCompiledPat->addElement(
|
||||
URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET | URX_NEG_SET), *fStatus);
|
||||
break;
|
||||
break;
|
||||
|
||||
case doBackslashw:
|
||||
fRXPat->fCompiledPat->addElement(
|
||||
URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus);
|
||||
break;
|
||||
break;
|
||||
|
||||
case doBackslashX:
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus);
|
||||
break;
|
||||
break;
|
||||
|
||||
case doBackslashx: // \x{abcd} alternate hex format
|
||||
// TODO: implement
|
||||
// TODO: implement
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
|
||||
|
||||
|
||||
case doBackslashZ:
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus);
|
||||
break;
|
||||
break;
|
||||
|
||||
case doBackslashz:
|
||||
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus);
|
||||
break;
|
||||
break;
|
||||
|
||||
case doExit:
|
||||
returnVal = FALSE;
|
||||
|
@ -806,12 +822,23 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
break;
|
||||
|
||||
case doNamedChar: // \N{NAMED_CHAR}
|
||||
// TODO: implement
|
||||
// TODO: implement
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
|
||||
case doMatchMode: // (?i) and similar
|
||||
// TODO: implement
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
case doNotImplementedError:
|
||||
// TODO: get rid of this once everything is implemented.
|
||||
error(U_REGEX_UNIMPLEMENTED);
|
||||
break;
|
||||
|
||||
|
||||
default:
|
||||
error(U_BRK_INTERNAL_ERROR);
|
||||
error(U_REGEX_INTERNAL_ERROR);
|
||||
returnVal = FALSE;
|
||||
break;
|
||||
}
|
||||
|
@ -838,7 +865,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
|
|||
//------------------------------------------------------------------------------
|
||||
int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
|
||||
int32_t theLoc;
|
||||
if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
|
||||
if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
|
||||
{
|
||||
// The item just processed is a parenthesized block.
|
||||
theLoc = fMatchOpenParen; // A slot is already reserved for us.
|
||||
|
@ -878,8 +905,11 @@ int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
|
|||
void RegexCompile::handleCloseParen() {
|
||||
int32_t patIdx;
|
||||
int32_t patOp;
|
||||
U_ASSERT(fParenStack.size() >= 1);
|
||||
|
||||
if (fParenStack.size() <= 0) {
|
||||
error(U_REGEX_MISMATCHED_PAREN);
|
||||
return;
|
||||
}
|
||||
|
||||
// Fixup any operations within the just-closed parenthesized group
|
||||
// that need to reference the end of the (block).
|
||||
// (The first one popped from the stack is an unused slot for
|
||||
|
@ -896,17 +926,17 @@ void RegexCompile::handleCloseParen() {
|
|||
fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
|
||||
fMatchOpenParen = patIdx;
|
||||
}
|
||||
|
||||
|
||||
// Do any additional fixups, depending on the specific kind of
|
||||
// parenthesized grouping this is
|
||||
|
||||
|
||||
switch (patIdx) {
|
||||
case -1:
|
||||
// No additional fixups required.
|
||||
// This is the case with most kinds of groupings.
|
||||
break;
|
||||
case -2:
|
||||
// Capturing Parentheses.
|
||||
// Capturing Parentheses.
|
||||
// Insert a End Capture op into the pattern.
|
||||
// Grab the group number from the start capture op
|
||||
// and put it into the end-capture op.
|
||||
|
@ -1039,7 +1069,7 @@ UChar32 RegexCompile::nextCharLL() {
|
|||
fLineNum++;
|
||||
fCharNum=0;
|
||||
if (fQuoteMode) {
|
||||
error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
|
||||
error(U_REGEX_RULE_SYNTAX);
|
||||
fQuoteMode = FALSE;
|
||||
}
|
||||
}
|
||||
|
@ -1120,7 +1150,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
|
|||
// Use UnicodeString::unescapeAt() to handle those that it can.
|
||||
// Otherwise just return the '\', and let the pattern parser deal with it.
|
||||
//
|
||||
int32_t startX = fNextIndex; // start and end positions of the
|
||||
int32_t startX = fNextIndex; // start and end positions of the
|
||||
int32_t endX = fNextIndex; // sequence following the '\'
|
||||
if (c.fChar == chBackSlash) {
|
||||
if (gUnescapeCharSet->contains(peekCharLL())) {
|
||||
|
@ -1148,7 +1178,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
|
|||
//
|
||||
// The scan position is normally under the control of the state machine
|
||||
// that controls pattern parsing. UnicodeSets, however, are parsed by
|
||||
// the UnicodeSet constructor, not by the Regex pattern parser.
|
||||
// the UnicodeSet constructor, not by the Regex pattern parser.
|
||||
//
|
||||
//---------------------------------------------------------------------------------
|
||||
UnicodeSet *RegexCompile::scanSet() {
|
||||
|
@ -1193,7 +1223,7 @@ UnicodeSet *RegexCompile::scanSet() {
|
|||
//---------------------------------------------------------------------------------
|
||||
//
|
||||
// scanProp Construct a UnicodeSet from the text at the current scan
|
||||
// position, which will be of the form \p{whatever}
|
||||
// position, which will be of the form \p{whatever}
|
||||
//
|
||||
// The scan position will be at the 'p' or 'P'. On return
|
||||
// the scan position should be just after the '}'
|
||||
|
@ -1240,6 +1270,5 @@ UnicodeSet *RegexCompile::scanProp() {
|
|||
return uset;
|
||||
};
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
|
|
@ -4,8 +4,10 @@
|
|||
// Copyright (C) 2002, International Business Machines Corporation and others.
|
||||
// All Rights Reserved.
|
||||
//
|
||||
// This file contains declarations for the class RegexCompile and for compiled
|
||||
// regular expression data format
|
||||
// This file contains declarations for the class RegexCompile
|
||||
//
|
||||
// This class is internal to the regular expression implementation.
|
||||
// For the public Regular Expression API, see the file "unicode/regex.h"
|
||||
//
|
||||
|
||||
|
||||
|
@ -13,6 +15,8 @@
|
|||
#define RBBISCAN_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/parseerr.h"
|
||||
|
@ -28,12 +32,7 @@ static const UBool REGEX_DEBUG = TRUE;
|
|||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// class RegexCompile does the lowest level, character-at-a-time
|
||||
// scanning of a regular expression.
|
||||
//
|
||||
// The output of the scanner is a tokenized form
|
||||
// of the RE, plus prebuilt UnicodeSet objects for each
|
||||
// set of charcters that is referenced.
|
||||
// class RegexCompile Contains the regular expression compiler.
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
static const int kStackSize = 100; // The size of the state stack for
|
||||
|
@ -161,5 +160,5 @@ private:
|
|||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
#endif // RBBISCAN_H
|
||||
|
|
|
@ -24,7 +24,6 @@ U_NAMESPACE_BEGIN
|
|||
enum Regex_PatternParseAction {
|
||||
doCloseParen,
|
||||
doProperty,
|
||||
doTagValue,
|
||||
doOrOperator,
|
||||
doOpenCaptureParen,
|
||||
doBadOpenParenType,
|
||||
|
@ -35,6 +34,7 @@ enum Regex_PatternParseAction {
|
|||
doNamedChar,
|
||||
doBackslashw,
|
||||
doPossesiveStar,
|
||||
doMismatchedParenErr,
|
||||
doOpenLookBehind,
|
||||
doBackslashx,
|
||||
doBackslashz,
|
||||
|
@ -43,6 +43,7 @@ enum Regex_PatternParseAction {
|
|||
doEnterQuoteMode,
|
||||
doPossesivePlus,
|
||||
doNGStar,
|
||||
doMatchMode,
|
||||
doOpenLookAheadNeg,
|
||||
doPlus,
|
||||
doOpenNonCaptureParen,
|
||||
|
@ -51,14 +52,11 @@ enum Regex_PatternParseAction {
|
|||
doNGPlus,
|
||||
doPatFinish,
|
||||
doBackslashD,
|
||||
doIntervalMinValue,
|
||||
doIntervalDigit,
|
||||
doPossesiveOpt,
|
||||
doBackslashG,
|
||||
doOpt,
|
||||
doOpenAtomicParen,
|
||||
doBackslashS,
|
||||
doNumberExpectedError,
|
||||
doStringChar,
|
||||
doOpenLookAhead,
|
||||
doBackRef,
|
||||
|
@ -74,6 +72,7 @@ enum Regex_PatternParseAction {
|
|||
doBackslashb,
|
||||
doEndString,
|
||||
doBackslashd,
|
||||
doNotImplementedError,
|
||||
doOpenLookBehindNeg,
|
||||
doSplitString,
|
||||
rbbiLastAction};
|
||||
|
@ -100,13 +99,13 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doStartString, 254, 13,0, TRUE} // 3 term
|
||||
, {doStartString, 130, 13,0, TRUE} // 4
|
||||
, {doScanUnicodeSet, 91 /* [ */, 20,0, TRUE} // 5
|
||||
, {doNOP, 40 /* ( */, 27, 20, TRUE} // 6
|
||||
, {doNOP, 40 /* ( */, 28, 20, TRUE} // 6
|
||||
, {doDotAny, 46 /* . */, 20,0, TRUE} // 7
|
||||
, {doCaret, 94 /* ^ */, 3,0, TRUE} // 8
|
||||
, {doDollar, 36 /* $ */, 3,0, TRUE} // 9
|
||||
, {doNOP, 92 /* \ */, 60,0, TRUE} // 10
|
||||
, {doNOP, 92 /* \ */, 67,0, TRUE} // 10
|
||||
, {doNOP, 253, 2,0, FALSE} // 11
|
||||
, {doRuleError, 255, 80,0, FALSE} // 12
|
||||
, {doRuleError, 255, 87,0, FALSE} // 12
|
||||
, {doStringChar, 254, 13,0, TRUE} // 13 string
|
||||
, {doStringChar, 130, 13,0, TRUE} // 14
|
||||
, {doSplitString, 63 /* ? */, 20,0, FALSE} // 15
|
||||
|
@ -114,67 +113,74 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
|
|||
, {doSplitString, 42 /* * */, 20,0, FALSE} // 17
|
||||
, {doSplitString, 123 /* { */, 20,0, FALSE} // 18
|
||||
, {doEndString, 255, 20,0, FALSE} // 19
|
||||
, {doNOP, 42 /* * */, 41,0, TRUE} // 20 expr-quant
|
||||
, {doNOP, 43 /* + */, 44,0, TRUE} // 21
|
||||
, {doNOP, 63 /* ? */, 47,0, TRUE} // 22
|
||||
, {doNOP, 255, 24,0, FALSE} // 23
|
||||
, {doOrOperator, 124 /* | */, 3,0, TRUE} // 24 expr-cont
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 25
|
||||
, {doNOP, 255, 3,0, FALSE} // 26
|
||||
, {doNOP, 63 /* ? */, 29,0, TRUE} // 27 open-paren
|
||||
, {doOpenCaptureParen, 255, 3, 20, FALSE} // 28
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE} // 29 open-paren-extended
|
||||
, {doOpenAtomicParen, 62 /* > */, 3, 20, TRUE} // 30
|
||||
, {doOpenLookAhead, 61 /* = */, 3, 24, TRUE} // 31
|
||||
, {doOpenLookAheadNeg, 33 /* ! */, 3, 24, TRUE} // 32
|
||||
, {doNOP, 60 /* < */, 36,0, TRUE} // 33
|
||||
, {doNOP, 35 /* # */, 39,0, TRUE} // 34
|
||||
, {doBadOpenParenType, 255, 80,0, FALSE} // 35
|
||||
, {doOpenLookBehind, 61 /* = */, 3, 24, TRUE} // 36 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 3, 24, TRUE} // 37
|
||||
, {doBadOpenParenType, 255, 80,0, FALSE} // 38
|
||||
, {doNOP, 41 /* ) */, 3,0, TRUE} // 39 paren-comment
|
||||
, {doNOP, 255, 39,0, TRUE} // 40
|
||||
, {doNGStar, 63 /* ? */, 24,0, TRUE} // 41 quant-star
|
||||
, {doPossesiveStar, 43 /* + */, 24,0, TRUE} // 42
|
||||
, {doStar, 255, 24,0, FALSE} // 43
|
||||
, {doNGPlus, 63 /* ? */, 24,0, TRUE} // 44 quant-plus
|
||||
, {doPossesivePlus, 43 /* + */, 24,0, TRUE} // 45
|
||||
, {doPlus, 255, 24,0, FALSE} // 46
|
||||
, {doNGOpt, 63 /* ? */, 24,0, TRUE} // 47 quant-opt
|
||||
, {doPossesiveOpt, 43 /* + */, 24,0, TRUE} // 48
|
||||
, {doOpt, 255, 24,0, FALSE} // 49
|
||||
, {doNOP, 129, 50,0, TRUE} // 50 interval-open
|
||||
, {doIntervalMinValue, 128, 53,0, FALSE} // 51
|
||||
, {doNumberExpectedError, 255, 80,0, FALSE} // 52
|
||||
, {doNOP, 129, 57,0, TRUE} // 53 interval-value
|
||||
, {doNOP, 125 /* } */, 57,0, FALSE} // 54
|
||||
, {doIntervalDigit, 128, 53,0, TRUE} // 55
|
||||
, {doNumberExpectedError, 255, 80,0, FALSE} // 56
|
||||
, {doNOP, 129, 57,0, TRUE} // 57 interval-close
|
||||
, {doTagValue, 125 /* } */, 24,0, TRUE} // 58
|
||||
, {doNumberExpectedError, 255, 80,0, FALSE} // 59
|
||||
, {doBackslashA, 65 /* A */, 3,0, TRUE} // 60 backslash
|
||||
, {doBackslashB, 66 /* B */, 3,0, TRUE} // 61
|
||||
, {doBackslashb, 98 /* b */, 3,0, TRUE} // 62
|
||||
, {doBackslashd, 100 /* d */, 20,0, TRUE} // 63
|
||||
, {doBackslashD, 68 /* D */, 20,0, TRUE} // 64
|
||||
, {doBackslashG, 71 /* G */, 3,0, TRUE} // 65
|
||||
, {doNamedChar, 78 /* N */, 20,0, TRUE} // 66
|
||||
, {doProperty, 112 /* p */, 20,0, FALSE} // 67
|
||||
, {doProperty, 80 /* P */, 20,0, FALSE} // 68
|
||||
, {doEnterQuoteMode, 81 /* Q */, 3,0, TRUE} // 69
|
||||
, {doBackslashS, 83 /* S */, 20,0, TRUE} // 70
|
||||
, {doBackslashs, 115 /* s */, 20,0, TRUE} // 71
|
||||
, {doBackslashW, 87 /* W */, 20,0, TRUE} // 72
|
||||
, {doBackslashw, 119 /* w */, 20,0, TRUE} // 73
|
||||
, {doBackslashX, 88 /* X */, 20,0, TRUE} // 74
|
||||
, {doBackslashx, 120 /* x */, 20,0, TRUE} // 75
|
||||
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 76
|
||||
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 77
|
||||
, {doBackRef, 128, 20,0, TRUE} // 78
|
||||
, {doStartString, 255, 13,0, TRUE} // 79
|
||||
, {doExit, 255, 80,0, TRUE} // 80 errorDeath
|
||||
, {doNOP, 42 /* * */, 56,0, TRUE} // 20 expr-quant
|
||||
, {doNOP, 43 /* + */, 59,0, TRUE} // 21
|
||||
, {doNOP, 63 /* ? */, 62,0, TRUE} // 22
|
||||
, {doNOP, 123 /* { */, 65,0, TRUE} // 23
|
||||
, {doNOP, 255, 25,0, FALSE} // 24
|
||||
, {doOrOperator, 124 /* | */, 3,0, TRUE} // 25 expr-cont
|
||||
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 26
|
||||
, {doNOP, 255, 3,0, FALSE} // 27
|
||||
, {doNOP, 63 /* ? */, 30,0, TRUE} // 28 open-paren
|
||||
, {doOpenCaptureParen, 255, 3, 20, FALSE} // 29
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE} // 30 open-paren-extended
|
||||
, {doOpenAtomicParen, 62 /* > */, 3, 20, TRUE} // 31
|
||||
, {doOpenLookAhead, 61 /* = */, 3, 25, TRUE} // 32
|
||||
, {doOpenLookAheadNeg, 33 /* ! */, 3, 25, TRUE} // 33
|
||||
, {doNOP, 60 /* < */, 42,0, TRUE} // 34
|
||||
, {doNOP, 35 /* # */, 45,0, TRUE} // 35
|
||||
, {doMatchMode, 105 /* i */, 48,0, TRUE} // 36
|
||||
, {doMatchMode, 120 /* x */, 48,0, TRUE} // 37
|
||||
, {doMatchMode, 115 /* s */, 48,0, TRUE} // 38
|
||||
, {doMatchMode, 109 /* m */, 48,0, TRUE} // 39
|
||||
, {doMatchMode, 45 /* - */, 48,0, TRUE} // 40
|
||||
, {doBadOpenParenType, 255, 87,0, FALSE} // 41
|
||||
, {doOpenLookBehind, 61 /* = */, 3, 25, TRUE} // 42 open-paren-lookbehind
|
||||
, {doOpenLookBehindNeg, 33 /* ! */, 3, 25, TRUE} // 43
|
||||
, {doBadOpenParenType, 255, 87,0, FALSE} // 44
|
||||
, {doNOP, 41 /* ) */, 3,0, TRUE} // 45 paren-comment
|
||||
, {doMismatchedParenErr, 253, 87,0, FALSE} // 46
|
||||
, {doNOP, 255, 45,0, TRUE} // 47
|
||||
, {doMatchMode, 105 /* i */, 48,0, TRUE} // 48 paren-flag
|
||||
, {doMatchMode, 115 /* s */, 48,0, TRUE} // 49
|
||||
, {doMatchMode, 109 /* m */, 48,0, TRUE} // 50
|
||||
, {doMatchMode, 120 /* x */, 48,0, TRUE} // 51
|
||||
, {doMatchMode, 45 /* - */, 48,0, TRUE} // 52
|
||||
, {doNOP, 41 /* ) */, 3,0, TRUE} // 53
|
||||
, {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE} // 54
|
||||
, {doNOP, 255, 87,0, FALSE} // 55
|
||||
, {doNGStar, 63 /* ? */, 25,0, TRUE} // 56 quant-star
|
||||
, {doPossesiveStar, 43 /* + */, 25,0, TRUE} // 57
|
||||
, {doStar, 255, 25,0, FALSE} // 58
|
||||
, {doNGPlus, 63 /* ? */, 25,0, TRUE} // 59 quant-plus
|
||||
, {doPossesivePlus, 43 /* + */, 25,0, TRUE} // 60
|
||||
, {doPlus, 255, 25,0, FALSE} // 61
|
||||
, {doNGOpt, 63 /* ? */, 25,0, TRUE} // 62 quant-opt
|
||||
, {doPossesiveOpt, 43 /* + */, 25,0, TRUE} // 63
|
||||
, {doOpt, 255, 25,0, FALSE} // 64
|
||||
, {doNOP, 129, 65,0, TRUE} // 65 interval-open
|
||||
, {doNotImplementedError, 255, 87,0, FALSE} // 66
|
||||
, {doBackslashA, 65 /* A */, 3,0, TRUE} // 67 backslash
|
||||
, {doBackslashB, 66 /* B */, 3,0, TRUE} // 68
|
||||
, {doBackslashb, 98 /* b */, 3,0, TRUE} // 69
|
||||
, {doBackslashd, 100 /* d */, 20,0, TRUE} // 70
|
||||
, {doBackslashD, 68 /* D */, 20,0, TRUE} // 71
|
||||
, {doBackslashG, 71 /* G */, 3,0, TRUE} // 72
|
||||
, {doNamedChar, 78 /* N */, 20,0, TRUE} // 73
|
||||
, {doProperty, 112 /* p */, 20,0, FALSE} // 74
|
||||
, {doProperty, 80 /* P */, 20,0, FALSE} // 75
|
||||
, {doEnterQuoteMode, 81 /* Q */, 3,0, TRUE} // 76
|
||||
, {doBackslashS, 83 /* S */, 20,0, TRUE} // 77
|
||||
, {doBackslashs, 115 /* s */, 20,0, TRUE} // 78
|
||||
, {doBackslashW, 87 /* W */, 20,0, TRUE} // 79
|
||||
, {doBackslashw, 119 /* w */, 20,0, TRUE} // 80
|
||||
, {doBackslashX, 88 /* X */, 20,0, TRUE} // 81
|
||||
, {doBackslashx, 120 /* x */, 20,0, TRUE} // 82
|
||||
, {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 83
|
||||
, {doBackslashz, 122 /* z */, 3,0, TRUE} // 84
|
||||
, {doBackRef, 128, 20,0, TRUE} // 85
|
||||
, {doStartString, 255, 13,0, TRUE} // 86
|
||||
, {doExit, 255, 87,0, TRUE} // 87 errorDeath
|
||||
};
|
||||
static const char *RegexStateNames[] = { 0,
|
||||
"start",
|
||||
|
@ -199,6 +205,7 @@ static const char *RegexStateNames[] = { 0,
|
|||
"expr-quant",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"expr-cont",
|
||||
0,
|
||||
|
@ -211,11 +218,25 @@ static const char *RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"open-paren-lookbehind",
|
||||
0,
|
||||
0,
|
||||
"paren-comment",
|
||||
0,
|
||||
0,
|
||||
"paren-flag",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"quant-star",
|
||||
0,
|
||||
|
@ -227,14 +248,6 @@ static const char *RegexStateNames[] = { 0,
|
|||
0,
|
||||
0,
|
||||
"interval-open",
|
||||
0,
|
||||
0,
|
||||
"interval-value",
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
"interval-close",
|
||||
0,
|
||||
0,
|
||||
"backslash",
|
||||
0,
|
||||
|
|
|
@ -107,7 +107,8 @@ string:
|
|||
expr-quant:
|
||||
'*' n quant-star
|
||||
'+' n quant-plus
|
||||
'?' n quant-opt
|
||||
'?' n quant-opt
|
||||
'{' n interval-open
|
||||
default expr-cont
|
||||
|
||||
|
||||
|
@ -136,6 +137,11 @@ open-paren-extended:
|
|||
'!' n term ^expr-cont doOpenLookAheadNeg # (?!
|
||||
'<' n open-paren-lookbehind
|
||||
'#' n paren-comment
|
||||
'i' n paren-flag doMatchMode
|
||||
'x' n paren-flag doMatchMode
|
||||
's' n paren-flag doMatchMode
|
||||
'm' n paren-flag doMatchMode
|
||||
'-' n paren-flag doMatchMode
|
||||
default errorDeath doBadOpenParenType
|
||||
|
||||
open-paren-lookbehind:
|
||||
|
@ -150,7 +156,21 @@ open-paren-lookbehind:
|
|||
#
|
||||
paren-comment:
|
||||
')' n term
|
||||
eof errorDeath doMismatchedParenErr
|
||||
default n paren-comment
|
||||
|
||||
#
|
||||
# paren-flag Scanned a (?ismx-ismx flag setting thing
|
||||
# TODO: this is not fully implemented yet.
|
||||
paren-flag:
|
||||
'i' n paren-flag doMatchMode
|
||||
's' n paren-flag doMatchMode
|
||||
'm' n paren-flag doMatchMode
|
||||
'x' n paren-flag doMatchMode
|
||||
'-' n paren-flag doMatchMode
|
||||
')' n term
|
||||
':' n term ^expr-quant doOpenNonCaptureParen
|
||||
default errorDeath
|
||||
|
||||
|
||||
#
|
||||
|
@ -189,19 +209,8 @@ quant-opt:
|
|||
#
|
||||
interval-open:
|
||||
white_space n interval-open
|
||||
digit_char interval-value doIntervalMinValue
|
||||
default errorDeath doNumberExpectedError
|
||||
default errorDeath doNotImplementedError
|
||||
|
||||
interval-value:
|
||||
white_space n interval-close
|
||||
'}' interval-close
|
||||
digit_char n interval-value doIntervalDigit
|
||||
default errorDeath doNumberExpectedError
|
||||
|
||||
interval-close:
|
||||
white_space n interval-close
|
||||
'}' n expr-cont doTagValue
|
||||
default errorDeath doNumberExpectedError
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -4,9 +4,9 @@
|
|||
//
|
||||
// file: regeximp.h
|
||||
//
|
||||
// ICU Regular Expressions, declarations of internal implementation types
|
||||
// and constants that are common between the pattern compiler and the
|
||||
// runtime execution engine.
|
||||
// ICU Regular Expressions,
|
||||
// Definitions of constant values used in the compiled form of
|
||||
// a regular expression pattern.
|
||||
//
|
||||
|
||||
#ifndef _REGEXIMP_H
|
||||
|
@ -19,7 +19,7 @@
|
|||
//
|
||||
static const uint32_t URX_UNUSED1 = 1;
|
||||
static const uint32_t URX_END = 2;
|
||||
static const uint32_t URX_ONECHAR = 3;
|
||||
static const uint32_t URX_ONECHAR = 3; // Value field is the 21 bit unicode char to match
|
||||
static const uint32_t URX_STRING = 4; // Value field is index of string start
|
||||
static const uint32_t URX_STRING_LEN = 5; // Value field is string length (code units)
|
||||
static const uint32_t URX_STATE_SAVE = 6; // Value field is pattern position to push
|
||||
|
@ -55,7 +55,7 @@ static const uint32_t URX_DOLLAR = 24; // Also for \Z
|
|||
|
||||
|
||||
//
|
||||
// Access to Unicode Sets for composite properties
|
||||
// Access to Unicode Sets for Perl-like composite character properties
|
||||
// The sets are accessed by the match engine for things like \b (word boundary), which tests membership in the \w set
|
||||
//
|
||||
static const uint32_t URX_ISWORD_SET = 1;
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
//
|
||||
// file: rematch.cpp
|
||||
//
|
||||
// Contains the implementation of class RegexMatcher,
|
||||
// which is one of the main API classes for the ICU regular expression package.
|
||||
//
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2002 International Business Machines Corporation *
|
||||
|
@ -9,6 +12,8 @@
|
|||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
#include "unicode/regex.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/uchar.h"
|
||||
|
@ -443,10 +448,11 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const {
|
|||
// isWordBoundary
|
||||
// in perl, "xab..cd..", \b is true at positions 0,3,5,7
|
||||
// For us,
|
||||
// If the current char is a combining mark, \b is FALSE
|
||||
// Scan backwards to the first non-combining char
|
||||
// Pos is a boundary if the current and previous chars are
|
||||
// opposite in membership in \w set
|
||||
// If the current char is a combining mark,
|
||||
// \b is FALSE.
|
||||
// Else Scan backwards to the first non-combining char.
|
||||
// We are at a boundary if this char and the original char are
|
||||
// opposite in membership in \w set
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
UBool RegexMatcher::isWordBoundary(int32_t pos) {
|
||||
|
@ -486,27 +492,6 @@ UBool RegexMatcher::isWordBoundary(int32_t pos) {
|
|||
}
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// getCaptureText We have encountered a '\' that might precede a
|
||||
// capture group specification.
|
||||
// If a valid capture group number follows the '\',
|
||||
// return the indices to the start & end of the captured
|
||||
// text, and update the patIdx to the position following the
|
||||
// \n sequence.
|
||||
//
|
||||
// This function is used during find and replace operations when
|
||||
// processing capture references in the replacement text.
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
UBool RegexMatcher::getCaptureText(const UnicodeString &rep,
|
||||
int32_t &repIdx,
|
||||
int32_t &textStart,
|
||||
int32_t &textEnd)
|
||||
{
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------------
|
||||
//
|
||||
// backTrack Within the match engine, this function is called when
|
||||
|
@ -915,10 +900,9 @@ breakFromLoop:
|
|||
|
||||
|
||||
|
||||
|
||||
|
||||
const char RegexMatcher::fgClassID = 0;
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
|
|
|
@ -9,6 +9,9 @@
|
|||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
#include "unicode/regex.h"
|
||||
#include "uassert.h"
|
||||
#include "uvector.h"
|
||||
|
@ -66,6 +69,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
|
|||
fBadState = other.fBadState;
|
||||
fNumCaptureGroups = other.fNumCaptureGroups;
|
||||
fMaxCaptureDigits = other.fMaxCaptureDigits;
|
||||
fStaticSets = other.fStaticSets;
|
||||
if (fBadState) {
|
||||
return *this;
|
||||
}
|
||||
|
@ -110,6 +114,7 @@ void RegexPattern::init() {
|
|||
fBadState = FALSE;
|
||||
fNumCaptureGroups = 0;
|
||||
fMaxCaptureDigits = 1; // TODO: calculate for real.
|
||||
fStaticSets = NULL;
|
||||
fMatcher = NULL;
|
||||
|
||||
UErrorCode status=U_ZERO_ERROR;
|
||||
|
@ -384,15 +389,6 @@ int32_t RegexPattern::split(const UnicodeString &input,
|
|||
|
||||
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
//
|
||||
// hashcode
|
||||
//
|
||||
//---------------------------------------------------------------------
|
||||
int32_t RegexPattern::hashCode(void) const {
|
||||
return 0; // TODO: Do something better here
|
||||
};
|
||||
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
//
|
||||
|
@ -512,8 +508,8 @@ breakFromLoop:
|
|||
printf("\n\n");
|
||||
};
|
||||
|
||||
|
||||
|
||||
const char RegexPattern::fgClassID = 0;
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
|
|
@ -9,6 +9,9 @@
|
|||
#define REGEX_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/parseerr.h"
|
||||
|
@ -69,9 +72,6 @@ public:
|
|||
RegexPattern &operator =(const RegexPattern &other);
|
||||
virtual RegexPattern *clone() const;
|
||||
|
||||
// TODO: Do we really want a hashCode function on this class?
|
||||
virtual int32_t hashCode(void) const;
|
||||
|
||||
|
||||
/**
|
||||
* Compiles the given regular expression into a pattern
|
||||
|
@ -428,10 +428,6 @@ private:
|
|||
//
|
||||
void MatchAt(int32_t startIdx, UErrorCode &status);
|
||||
inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
|
||||
UBool getCaptureText(const UnicodeString &rep,
|
||||
int32_t &repIdx,
|
||||
int32_t &textStart,
|
||||
int32_t &textEnd);
|
||||
UBool isWordBoundary(int32_t pos); // perform the \b test
|
||||
|
||||
|
||||
|
@ -448,7 +444,6 @@ private:
|
|||
|
||||
};
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
#endif
|
||||
|
|
|
@ -70,11 +70,13 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam
|
|||
break;
|
||||
|
||||
case 3: name = "regex";
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
if (exec) {
|
||||
logln("TestSuite Regex---"); logln();
|
||||
RegexTest test;
|
||||
callTest( test, par );
|
||||
}
|
||||
#endif
|
||||
break;
|
||||
|
||||
case 4: name = "format";
|
||||
|
|
|
@ -11,6 +11,8 @@
|
|||
//
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
#include "unicode/uchar.h"
|
||||
#include "intltest.h"
|
||||
#include "regextst.h"
|
||||
|
@ -1195,8 +1197,38 @@ void RegexTest::Errors() {
|
|||
REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
|
||||
|
||||
// Missing close parentheses
|
||||
//REGEX_ERR("Comment (?# with no close", 1, 0, U_REGEX_INTERNAL_ERROR);
|
||||
REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
|
||||
REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
|
||||
REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
|
||||
|
||||
// Extra close paren
|
||||
REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
|
||||
REGEX_ERR(")))))))", 1, 1, U_REGEX_RULE_SYNTAX);
|
||||
REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
|
||||
|
||||
// Flag settings not yet implemented
|
||||
REGEX_ERR("(?i:stuff*)", 1, 3, U_REGEX_UNIMPLEMENTED);
|
||||
REGEX_ERR("(?-si) stuff", 1, 3, U_REGEX_UNIMPLEMENTED);
|
||||
|
||||
// Look-ahead, Look-behind
|
||||
REGEX_ERR("abc(?=xyz).*", 1, 6, U_REGEX_UNIMPLEMENTED); // look-ahead
|
||||
REGEX_ERR("abc(?!xyz).*", 1, 6, U_REGEX_UNIMPLEMENTED); // negated look-ahead
|
||||
REGEX_ERR("abc(?<=xyz).*", 1, 7, U_REGEX_UNIMPLEMENTED); // look-behind
|
||||
REGEX_ERR("abc(?<!xyz).*", 1, 7, U_REGEX_UNIMPLEMENTED); // negated look-behind
|
||||
REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
|
||||
|
||||
// Atomic Grouping
|
||||
REGEX_ERR("abc(?>xyz)", 1, 6, U_REGEX_UNIMPLEMENTED);
|
||||
|
||||
// {Numeric Quantifiers}
|
||||
REGEX_ERR("abc{4}", 1, 5, U_REGEX_UNIMPLEMENTED);
|
||||
|
||||
|
||||
// Quantifiers are allowed only after something that can be quantified.
|
||||
REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
|
||||
REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
|
||||
REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
|
||||
}
|
||||
|
||||
|
||||
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
|
||||
|
||||
|
|
|
@ -8,6 +8,8 @@
|
|||
#ifndef REGEXTST_H
|
||||
#define REGEXTST_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
|
||||
#include "intltest.h"
|
||||
#include "unicode/regex.h"
|
||||
|
@ -35,4 +37,6 @@ public:
|
|||
virtual void regex_err(const char *pat, int32_t errline, int32_t errcol,
|
||||
UErrorCode expectedStatus, int line);
|
||||
};
|
||||
|
||||
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
|
||||
#endif
|
||||
|
|
Loading…
Add table
Reference in a new issue