From 2d39fda4e32bcbb4f5cf5e1c562f7561107ee369 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Thu, 7 Nov 2002 02:34:46 +0000 Subject: [PATCH] ICU-105 Regular Expressions, ongoing development X-SVN-Rev: 10180 --- icu4c/source/common/putil.c | 7 +- icu4c/source/common/unicode/uconfig.h | 12 ++ icu4c/source/common/unicode/utypes.h | 3 + icu4c/source/i18n/regexcmp.cpp | 129 +++++++++++------- icu4c/source/i18n/regexcmp.h | 19 ++- icu4c/source/i18n/regexcst.h | 165 +++++++++++++----------- icu4c/source/i18n/regexcst.txt | 35 +++-- icu4c/source/i18n/regeximp.h | 10 +- icu4c/source/i18n/rematch.cpp | 38 ++---- icu4c/source/i18n/repattrn.cpp | 18 +-- icu4c/source/i18n/unicode/regex.h | 13 +- icu4c/source/test/intltest/itmajor.cpp | 2 + icu4c/source/test/intltest/regextst.cpp | 36 +++++- icu4c/source/test/intltest/regextst.h | 4 + 14 files changed, 286 insertions(+), 205 deletions(-) diff --git a/icu4c/source/common/putil.c b/icu4c/source/common/putil.c index c570d6b0b19..17bc5b4e93b 100644 --- a/icu4c/source/common/putil.c +++ b/icu4c/source/common/putil.c @@ -1833,10 +1833,13 @@ static const char * const _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = { "U_REGEX_ERROR_START", "U_REGEX_INTERNAL_ERROR", + "U_REGEX_RULE_SYNTAX", "U_REGEX_INVALID_STATE", "U_REGEX_BAD_ESCAPE_SEQUENCE", "U_REGEX_PROPERTY_SYNTAX", - "U_REGEX_UNIMPLEMENTED" + "U_REGEX_UNIMPLEMENTED", + "U_REGEX_MISMATCHED_PAREN", + "U_REGEX_MATCH_MODE_ERROR" }; U_CAPI const char * U_EXPORT2 @@ -1852,7 +1855,7 @@ u_errorName(UErrorCode code) { } else if (U_BRK_ERROR_START <= code && code < U_BRK_ERROR_LIMIT){ return _uBrkErrorName[code - U_BRK_ERROR_START]; } else if (U_REGEX_ERROR_START <= code && code < U_REGEX_ERROR_LIMIT) { - return _uBrkErrorName[code - U_REGEX_ERROR_START]; + return _uRegexErrorName[code - U_REGEX_ERROR_START]; } else { return "[BOGUS UErrorCode]"; } diff --git a/icu4c/source/common/unicode/uconfig.h b/icu4c/source/common/unicode/uconfig.h index b6af2ea93a0..334ffe2667b 
100644 --- a/icu4c/source/common/unicode/uconfig.h +++ b/icu4c/source/common/unicode/uconfig.h @@ -52,6 +52,7 @@ # endif # define UCONFIG_NO_FORMATTING 1 # define UCONFIG_NO_TRANSLITERATION 1 +# define UCONFIG_NO_REGULAR_EXPRESSIONS 1 #endif /* common library switches -------------------------------------------------- */ @@ -114,5 +115,16 @@ # define UCONFIG_NO_TRANSLITERATION 0 #endif +/** + * \def UCONFIG_NO_REGULAR_EXPRESSIONS + * This switch turns off regular expressions. + * + * @draft ICU 2.6 + */ +#ifndef UCONFIG_NO_REGULAR_EXPRESSIONS +# define UCONFIG_NO_REGULAR_EXPRESSIONS 0 +#endif + + #endif diff --git a/icu4c/source/common/unicode/utypes.h b/icu4c/source/common/unicode/utypes.h index 9c9372e215a..ffb3bc334af 100644 --- a/icu4c/source/common/unicode/utypes.h +++ b/icu4c/source/common/unicode/utypes.h @@ -502,10 +502,13 @@ typedef enum UErrorCode { */ U_REGEX_ERROR_START=0x10300, U_REGEX_INTERNAL_ERROR, + U_REGEX_RULE_SYNTAX, U_REGEX_INVALID_STATE, U_REGEX_BAD_ESCAPE_SEQUENCE, U_REGEX_PROPERTY_SYNTAX, U_REGEX_UNIMPLEMENTED, + U_REGEX_MISMATCHED_PAREN, + U_REGEX_MATCH_MODE_ERROR, U_REGEX_ERROR_LIMIT, U_ERROR_LIMIT=U_BRK_ERROR_LIMIT /**< This must always be the last value to indicate the limit for UErrorCode (last error code +1) */ diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp index 8eb4debccc8..f899837320e 100644 --- a/icu4c/source/i18n/regexcmp.cpp +++ b/icu4c/source/i18n/regexcmp.cpp @@ -5,11 +5,14 @@ // Copyright (C) 2002, International Business Machines Corporation and others. // All Rights Reserved. // -// This file contains the ICU regular expression scanner, which is responsible -// for preprocessing a regular expression pattern into the tokenized form that +// This file contains the ICU regular expression compiler, which is responsible +// for processing a regular expression pattern into the compiled form that // is used by the match finding engine. 
// +#include "unicode/utypes.h" + +#if !UCONFIG_NO_REGULAR_EXPRESSIONS #include "unicode/unistr.h" #include "unicode/uniset.h" @@ -18,18 +21,18 @@ #include "unicode/parsepos.h" #include "unicode/parseerr.h" #include "unicode/regex.h" -#include "regeximp.h" #include "uprops.h" #include "cmemory.h" #include "cstring.h" +#include "uassert.h" #include "stdio.h" // TODO: Get rid of this +#include "regeximp.h" #include "regexcst.h" // Contains state table for the regex pattern parser. // generated by a Perl script. #include "regexcmp.h" -#include "uassert.h" U_NAMESPACE_BEGIN @@ -52,10 +55,10 @@ static const int RESCAN_DEBUG = 0; // Characters that have no special meaning, and thus do not need to be escaped. Expressed // as the inverse of those needing escaping -- [^\*\?\+\[\(\)\{\}\^\$\|\\\.] -static const UChar gRuleSet_rule_char_pattern[] = { +static const UChar gRuleSet_rule_char_pattern[] = { // [ ^ \ * \ ? \ + \ [ \ ( / ) - 0x5b, 0x5e, 0x5c, 0x2a, 0x5c, 0x3f, 0x5c, 0x2b, 0x5c, 0x5b, 0x5c, 0x28, 0x5c, 0x29, - // \ { \ } \ ^ \ $ \ | \ \ \ . ] + 0x5b, 0x5e, 0x5c, 0x2a, 0x5c, 0x3f, 0x5c, 0x2b, 0x5c, 0x5b, 0x5c, 0x28, 0x5c, 0x29, + // \ { \ } \ ^ \ $ \ | \ \ \ . ] 0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0}; @@ -72,7 +75,7 @@ static UnicodeSet *gUnescapeCharSet; // will handle. // static const UChar gUnescapeCharPattern[] = { -// [ a c e f n r t u U ] +// [ a c e f n r t u U ] 0x5b, 0x61, 0x63, 0x65, 0x66, 0x6e, 0x72, 0x74, 0x75, 0x55, 0x5d, 0}; @@ -123,7 +126,7 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status) // // Set up the constant (static) Unicode Sets. - // + // if (gRuleSets[kRuleSet_rule_char-128] == NULL) { // TODO: Make thread safe. // TODO: Memory Cleanup on ICU shutdown. 
@@ -131,8 +134,8 @@ RegexCompile::RegexCompile(UErrorCode &status) : fParenStack(status) gRuleSets[kRuleSet_white_space-128] = (UnicodeSet*) uprv_openRuleWhiteSpaceSet(&status); gRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(gRuleSet_digit_char_pattern, status); gUnescapeCharSet = new UnicodeSet(gUnescapeCharPattern, status); - gPropSets[URX_ISWORD_SET] = new UnicodeSet(gIsWordPattern, status); - gPropSets[URX_ISSPACE_SET] = new UnicodeSet(gIsSpacePattern, status); + gPropSets[URX_ISWORD_SET] = new UnicodeSet(gIsWordPattern, status); + gPropSets[URX_ISSPACE_SET] = new UnicodeSet(gIsSpacePattern, status); if (U_FAILURE(status)) { delete gRuleSets[kRuleSet_rule_char-128]; @@ -171,7 +174,7 @@ RegexCompile::~RegexCompile() { // script regexcst.pl // //--------------------------------------------------------------------------------- -void RegexCompile::compile( +void RegexCompile::compile( RegexPattern &rxp, // User level pattern object to receive // the compiled pattern. const UnicodeString &pat, // Source pat to be compiled. @@ -285,7 +288,7 @@ void RegexCompile::compile( if (tableEl->fPushState != 0) { fStackPtr++; if (fStackPtr >= kStackSize) { - error(U_BRK_INTERNAL_ERROR); + error(U_REGEX_INTERNAL_ERROR); printf("RegexCompile::parse() - state stack overflow.\n"); fStackPtr--; } @@ -304,7 +307,7 @@ void RegexCompile::compile( state = fStack[fStackPtr]; fStackPtr--; if (fStackPtr < 0) { - error(U_BRK_INTERNAL_ERROR); + error(U_REGEX_INTERNAL_ERROR); printf("RegexCompile::compile() - state stack underflow.\n"); fStackPtr++; } @@ -358,12 +361,16 @@ UBool RegexCompile::doParseActions(EParseAction action) // Encountering end of pattern also behaves like a close paren, // and forces fixups of the State Save at the beginning of the compiled pattern // and of any OR operations at the top level. - // + // handleCloseParen(); - + if (fParenStack.size() > 0) { + // Missing close paren in pattern. 
+ error(U_REGEX_MISMATCHED_PAREN); + } + // add the END operation to the compiled pattern. fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus); - + // Terminate the pattern compilation state machine. returnVal = FALSE; break; @@ -405,7 +412,7 @@ UBool RegexCompile::doParseActions(EParseAction action) case doOpenCaptureParen: // Open Paren. - // Compile to a + // Compile to a // - NOP, which later may be replaced by a save-state if the // parenthesized group gets a * quantifier, followed by // - START_CAPTURE @@ -430,7 +437,7 @@ UBool RegexCompile::doParseActions(EParseAction action) case doOpenNonCaptureParen: // Open non-caputuring (grouping only) Paren. - // Compile to a + // Compile to a // - NOP, which later may be replaced by a save-state if the // parenthesized group gets a * quantifier, followed by // - NOP, which may later be replaced by a save-state if there @@ -440,7 +447,7 @@ UBool RegexCompile::doParseActions(EParseAction action) fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); // On the Parentheses stack, start a new frame and add the postions - // of the two NOPs. + // of the two NOPs. fParenStack.push(-1, *fStatus); // Begin a new frame. fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP @@ -475,6 +482,10 @@ UBool RegexCompile::doParseActions(EParseAction action) case doCloseParen: handleCloseParen(); + if (fParenStack.size() <= 0) { + // Extra close paren, or missing open paren. + error(U_REGEX_MISMATCHED_PAREN); + } break; case doNOP: @@ -483,11 +494,16 @@ UBool RegexCompile::doParseActions(EParseAction action) case doBadOpenParenType: case doRuleError: - error(U_BRK_RULE_SYNTAX); + error(U_REGEX_RULE_SYNTAX); returnVal = FALSE; break; + case doMismatchedParenErr: + error(U_REGEX_MISMATCHED_PAREN); + returnVal = FALSE; + break; + case doPlus: // Normal '+' compiles to // 1. 
stuff to be repeated (already built) @@ -532,7 +548,7 @@ UBool RegexCompile::doParseActions(EParseAction action) // 3. ... // Insert the state save into the compiled pattern, and we're done. { - int32_t saveStateLoc = blockTopLoc(TRUE); + int32_t saveStateLoc = blockTopLoc(TRUE); int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()); fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); } @@ -572,9 +588,9 @@ UBool RegexCompile::doParseActions(EParseAction action) // 3. JMP 0 // 4. ... // - { + { // location of item #1, the STATE_SAVE - int32_t saveStateLoc = blockTopLoc(TRUE); + int32_t saveStateLoc = blockTopLoc(TRUE); // Locate the position in the compiled pattern where the match will continue // after completing the *. (4 in the comment above) @@ -599,7 +615,7 @@ UBool RegexCompile::doParseActions(EParseAction action) // 3. STATE_SAVE 2 // 4 ... { - int32_t jmpLoc = blockTopLoc(TRUE); // loc 1. + int32_t jmpLoc = blockTopLoc(TRUE); // loc 1. int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3. int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); @@ -607,7 +623,7 @@ UBool RegexCompile::doParseActions(EParseAction action) fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus); } break; - + case doStartString: // We've just scanned a single "normal" character from the pattern, @@ -678,7 +694,7 @@ UBool RegexCompile::doParseActions(EParseAction action) // has only one character, emit the single character token instead. 
{ int32_t strLength = fRXPat->fLiteralText.length() - fStringOpStart; - U_ASSERT(strLength > 0); + U_ASSERT(strLength > 0); int32_t lastCharIdx = fRXPat->fLiteralText.length()-1; lastCharIdx = fRXPat->fLiteralText.getChar32Start(lastCharIdx); if (lastCharIdx == fStringOpStart) { @@ -735,7 +751,7 @@ UBool RegexCompile::doParseActions(EParseAction action) case doBackslashG: fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus); - break; + break; case doBackslashS: fRXPat->fCompiledPat->addElement( @@ -750,31 +766,31 @@ UBool RegexCompile::doParseActions(EParseAction action) case doBackslashW: fRXPat->fCompiledPat->addElement( URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET | URX_NEG_SET), *fStatus); - break; + break; case doBackslashw: fRXPat->fCompiledPat->addElement( URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus); - break; + break; case doBackslashX: fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus); - break; + break; case doBackslashx: // \x{abcd} alternate hex format - // TODO: implement + // TODO: implement error(U_REGEX_UNIMPLEMENTED); break; - + case doBackslashZ: fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus); - break; + break; case doBackslashz: fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus); - break; + break; case doExit: returnVal = FALSE; @@ -806,12 +822,23 @@ UBool RegexCompile::doParseActions(EParseAction action) break; case doNamedChar: // \N{NAMED_CHAR} - // TODO: implement + // TODO: implement error(U_REGEX_UNIMPLEMENTED); break; - + + case doMatchMode: // (?i) and similar + // TODO: implement + error(U_REGEX_UNIMPLEMENTED); + break; + + case doNotImplementedError: + // TODO: get rid of this once everything is implemented. 
+ error(U_REGEX_UNIMPLEMENTED); + break; + + default: - error(U_BRK_INTERNAL_ERROR); + error(U_REGEX_INTERNAL_ERROR); returnVal = FALSE; break; } @@ -838,7 +865,7 @@ UBool RegexCompile::doParseActions(EParseAction action) //------------------------------------------------------------------------------ int32_t RegexCompile::blockTopLoc(UBool reserveLoc) { int32_t theLoc; - if (fRXPat->fCompiledPat->size() == fMatchCloseParen) + if (fRXPat->fCompiledPat->size() == fMatchCloseParen) { // The item just processed is a parenthesized block. theLoc = fMatchOpenParen; // A slot is already reserved for us. @@ -878,8 +905,11 @@ int32_t RegexCompile::blockTopLoc(UBool reserveLoc) { void RegexCompile::handleCloseParen() { int32_t patIdx; int32_t patOp; - U_ASSERT(fParenStack.size() >= 1); - + if (fParenStack.size() <= 0) { + error(U_REGEX_MISMATCHED_PAREN); + return; + } + // Fixup any operations within the just-closed parenthesized group // that need to reference the end of the (block). // (The first one on popped from the stack is an unused slot for @@ -896,17 +926,17 @@ void RegexCompile::handleCloseParen() { fRXPat->fCompiledPat->setElementAt(patOp, patIdx); fMatchOpenParen = patIdx; } - + // DO any additional fixups, depending on the specific kind of // parentesized grouping this is - + switch (patIdx) { case -1: // No additional fixups required. // This is the case with most kinds of groupings. break; case -2: - // Capturing Parentheses. + // Capturing Parentheses. // Insert a End Capture op into the pattern. // Grab the group number from the start capture op // and put it into the end-capture op. @@ -1039,7 +1069,7 @@ UChar32 RegexCompile::nextCharLL() { fLineNum++; fCharNum=0; if (fQuoteMode) { - error(U_BRK_NEW_LINE_IN_QUOTED_STRING); + error(U_REGEX_RULE_SYNTAX); fQuoteMode = FALSE; } } @@ -1120,7 +1150,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) { // Use UnicodeString::unescapeAt() to handle those that it can. 
// Otherwise just return the '\', and let the pattern parser deal with it. // - int32_t startX = fNextIndex; // start and end positions of the + int32_t startX = fNextIndex; // start and end positions of the int32_t endX = fNextIndex; // sequence following the '\' if (c.fChar == chBackSlash) { if (gUnescapeCharSet->contains(peekCharLL())) { @@ -1148,7 +1178,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) { // // The scan position is normally under the control of the state machine // that controls pattern parsing. UnicodeSets, however, are parsed by -// the UnicodeSet constructor, not by the Regex pattern parser. +// the UnicodeSet constructor, not by the Regex pattern parser. // //--------------------------------------------------------------------------------- UnicodeSet *RegexCompile::scanSet() { @@ -1193,7 +1223,7 @@ UnicodeSet *RegexCompile::scanSet() { //--------------------------------------------------------------------------------- // // scanProp Construct a UnicodeSet from the text at the current scan -// position, which will be of the form \p{whaterver} +// position, which will be of the form \p{whaterver} // // The scan position will be at the 'p' or 'P'. On return // the scan position should be just after the '}' @@ -1240,6 +1270,5 @@ UnicodeSet *RegexCompile::scanProp() { return uset; }; - U_NAMESPACE_END - +#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS diff --git a/icu4c/source/i18n/regexcmp.h b/icu4c/source/i18n/regexcmp.h index b023d885b18..fe096b72400 100644 --- a/icu4c/source/i18n/regexcmp.h +++ b/icu4c/source/i18n/regexcmp.h @@ -4,8 +4,10 @@ // Copyright (C) 2002, International Business Machines Corporation and others. // All Rights Reserved. // -// This file contains declarations for the class RegexCompile and for compiled -// regular expression data format +// This file contains declarations for the class RegexCompile +// +// This class is internal to the regular expression implementation. 
+// For the public Regular Expression API, see the file "unicode/regex.h" // @@ -13,6 +15,8 @@ #define RBBISCAN_H #include "unicode/utypes.h" +#if !UCONFIG_NO_REGULAR_EXPRESSIONS + #include "unicode/uobject.h" #include "unicode/uniset.h" #include "unicode/parseerr.h" @@ -28,12 +32,7 @@ static const UBool REGEX_DEBUG = TRUE; //-------------------------------------------------------------------------------- // -// class RegexCompile does the lowest level, character-at-a-time -// scanning of a regular expression. -// -// The output of the scanner is a tokenized form -// of the RE, plus prebuilt UnicodeSet objects for each -// set of charcters that is referenced. +// class RegexCompile Contains the regular expression compiler. // //-------------------------------------------------------------------------------- static const int kStackSize = 100; // The size of the state stack for @@ -161,5 +160,5 @@ private: }; U_NAMESPACE_END - -#endif +#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS +#endif // RBBISCAN_H diff --git a/icu4c/source/i18n/regexcst.h b/icu4c/source/i18n/regexcst.h index 4776f63e5c1..9946301cbce 100644 --- a/icu4c/source/i18n/regexcst.h +++ b/icu4c/source/i18n/regexcst.h @@ -24,7 +24,6 @@ U_NAMESPACE_BEGIN enum Regex_PatternParseAction { doCloseParen, doProperty, - doTagValue, doOrOperator, doOpenCaptureParen, doBadOpenParenType, @@ -35,6 +34,7 @@ enum Regex_PatternParseAction { doNamedChar, doBackslashw, doPossesiveStar, + doMismatchedParenErr, doOpenLookBehind, doBackslashx, doBackslashz, @@ -43,6 +43,7 @@ enum Regex_PatternParseAction { doEnterQuoteMode, doPossesivePlus, doNGStar, + doMatchMode, doOpenLookAheadNeg, doPlus, doOpenNonCaptureParen, @@ -51,14 +52,11 @@ enum Regex_PatternParseAction { doNGPlus, doPatFinish, doBackslashD, - doIntervalMinValue, - doIntervalDigit, doPossesiveOpt, doBackslashG, doOpt, doOpenAtomicParen, doBackslashS, - doNumberExpectedError, doStringChar, doOpenLookAhead, doBackRef, @@ -74,6 +72,7 @@ enum Regex_PatternParseAction { 
doBackslashb, doEndString, doBackslashd, + doNotImplementedError, doOpenLookBehindNeg, doSplitString, rbbiLastAction}; @@ -100,13 +99,13 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doStartString, 254, 13,0, TRUE} // 3 term , {doStartString, 130, 13,0, TRUE} // 4 , {doScanUnicodeSet, 91 /* [ */, 20,0, TRUE} // 5 - , {doNOP, 40 /* ( */, 27, 20, TRUE} // 6 + , {doNOP, 40 /* ( */, 28, 20, TRUE} // 6 , {doDotAny, 46 /* . */, 20,0, TRUE} // 7 , {doCaret, 94 /* ^ */, 3,0, TRUE} // 8 , {doDollar, 36 /* $ */, 3,0, TRUE} // 9 - , {doNOP, 92 /* \ */, 60,0, TRUE} // 10 + , {doNOP, 92 /* \ */, 67,0, TRUE} // 10 , {doNOP, 253, 2,0, FALSE} // 11 - , {doRuleError, 255, 80,0, FALSE} // 12 + , {doRuleError, 255, 87,0, FALSE} // 12 , {doStringChar, 254, 13,0, TRUE} // 13 string , {doStringChar, 130, 13,0, TRUE} // 14 , {doSplitString, 63 /* ? */, 20,0, FALSE} // 15 @@ -114,67 +113,74 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doSplitString, 42 /* * */, 20,0, FALSE} // 17 , {doSplitString, 123 /* { */, 20,0, FALSE} // 18 , {doEndString, 255, 20,0, FALSE} // 19 - , {doNOP, 42 /* * */, 41,0, TRUE} // 20 expr-quant - , {doNOP, 43 /* + */, 44,0, TRUE} // 21 - , {doNOP, 63 /* ? */, 47,0, TRUE} // 22 - , {doNOP, 255, 24,0, FALSE} // 23 - , {doOrOperator, 124 /* | */, 3,0, TRUE} // 24 expr-cont - , {doCloseParen, 41 /* ) */, 255,0, TRUE} // 25 - , {doNOP, 255, 3,0, FALSE} // 26 - , {doNOP, 63 /* ? */, 29,0, TRUE} // 27 open-paren - , {doOpenCaptureParen, 255, 3, 20, FALSE} // 28 - , {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE} // 29 open-paren-extended - , {doOpenAtomicParen, 62 /* > */, 3, 20, TRUE} // 30 - , {doOpenLookAhead, 61 /* = */, 3, 24, TRUE} // 31 - , {doOpenLookAheadNeg, 33 /* ! 
*/, 3, 24, TRUE} // 32 - , {doNOP, 60 /* < */, 36,0, TRUE} // 33 - , {doNOP, 35 /* # */, 39,0, TRUE} // 34 - , {doBadOpenParenType, 255, 80,0, FALSE} // 35 - , {doOpenLookBehind, 61 /* = */, 3, 24, TRUE} // 36 open-paren-lookbehind - , {doOpenLookBehindNeg, 33 /* ! */, 3, 24, TRUE} // 37 - , {doBadOpenParenType, 255, 80,0, FALSE} // 38 - , {doNOP, 41 /* ) */, 3,0, TRUE} // 39 paren-comment - , {doNOP, 255, 39,0, TRUE} // 40 - , {doNGStar, 63 /* ? */, 24,0, TRUE} // 41 quant-star - , {doPossesiveStar, 43 /* + */, 24,0, TRUE} // 42 - , {doStar, 255, 24,0, FALSE} // 43 - , {doNGPlus, 63 /* ? */, 24,0, TRUE} // 44 quant-plus - , {doPossesivePlus, 43 /* + */, 24,0, TRUE} // 45 - , {doPlus, 255, 24,0, FALSE} // 46 - , {doNGOpt, 63 /* ? */, 24,0, TRUE} // 47 quant-opt - , {doPossesiveOpt, 43 /* + */, 24,0, TRUE} // 48 - , {doOpt, 255, 24,0, FALSE} // 49 - , {doNOP, 129, 50,0, TRUE} // 50 interval-open - , {doIntervalMinValue, 128, 53,0, FALSE} // 51 - , {doNumberExpectedError, 255, 80,0, FALSE} // 52 - , {doNOP, 129, 57,0, TRUE} // 53 interval-value - , {doNOP, 125 /* } */, 57,0, FALSE} // 54 - , {doIntervalDigit, 128, 53,0, TRUE} // 55 - , {doNumberExpectedError, 255, 80,0, FALSE} // 56 - , {doNOP, 129, 57,0, TRUE} // 57 interval-close - , {doTagValue, 125 /* } */, 24,0, TRUE} // 58 - , {doNumberExpectedError, 255, 80,0, FALSE} // 59 - , {doBackslashA, 65 /* A */, 3,0, TRUE} // 60 backslash - , {doBackslashB, 66 /* B */, 3,0, TRUE} // 61 - , {doBackslashb, 98 /* b */, 3,0, TRUE} // 62 - , {doBackslashd, 100 /* d */, 20,0, TRUE} // 63 - , {doBackslashD, 68 /* D */, 20,0, TRUE} // 64 - , {doBackslashG, 71 /* G */, 3,0, TRUE} // 65 - , {doNamedChar, 78 /* N */, 20,0, TRUE} // 66 - , {doProperty, 112 /* p */, 20,0, FALSE} // 67 - , {doProperty, 80 /* P */, 20,0, FALSE} // 68 - , {doEnterQuoteMode, 81 /* Q */, 3,0, TRUE} // 69 - , {doBackslashS, 83 /* S */, 20,0, TRUE} // 70 - , {doBackslashs, 115 /* s */, 20,0, TRUE} // 71 - , {doBackslashW, 87 /* W */, 20,0, TRUE} // 72 - , 
{doBackslashw, 119 /* w */, 20,0, TRUE} // 73 - , {doBackslashX, 88 /* X */, 20,0, TRUE} // 74 - , {doBackslashx, 120 /* x */, 20,0, TRUE} // 75 - , {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 76 - , {doBackslashz, 122 /* z */, 3,0, TRUE} // 77 - , {doBackRef, 128, 20,0, TRUE} // 78 - , {doStartString, 255, 13,0, TRUE} // 79 - , {doExit, 255, 80,0, TRUE} // 80 errorDeath + , {doNOP, 42 /* * */, 56,0, TRUE} // 20 expr-quant + , {doNOP, 43 /* + */, 59,0, TRUE} // 21 + , {doNOP, 63 /* ? */, 62,0, TRUE} // 22 + , {doNOP, 123 /* { */, 65,0, TRUE} // 23 + , {doNOP, 255, 25,0, FALSE} // 24 + , {doOrOperator, 124 /* | */, 3,0, TRUE} // 25 expr-cont + , {doCloseParen, 41 /* ) */, 255,0, TRUE} // 26 + , {doNOP, 255, 3,0, FALSE} // 27 + , {doNOP, 63 /* ? */, 30,0, TRUE} // 28 open-paren + , {doOpenCaptureParen, 255, 3, 20, FALSE} // 29 + , {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE} // 30 open-paren-extended + , {doOpenAtomicParen, 62 /* > */, 3, 20, TRUE} // 31 + , {doOpenLookAhead, 61 /* = */, 3, 25, TRUE} // 32 + , {doOpenLookAheadNeg, 33 /* ! */, 3, 25, TRUE} // 33 + , {doNOP, 60 /* < */, 42,0, TRUE} // 34 + , {doNOP, 35 /* # */, 45,0, TRUE} // 35 + , {doMatchMode, 105 /* i */, 48,0, TRUE} // 36 + , {doMatchMode, 120 /* x */, 48,0, TRUE} // 37 + , {doMatchMode, 115 /* s */, 48,0, TRUE} // 38 + , {doMatchMode, 109 /* m */, 48,0, TRUE} // 39 + , {doMatchMode, 45 /* - */, 48,0, TRUE} // 40 + , {doBadOpenParenType, 255, 87,0, FALSE} // 41 + , {doOpenLookBehind, 61 /* = */, 3, 25, TRUE} // 42 open-paren-lookbehind + , {doOpenLookBehindNeg, 33 /* ! 
*/, 3, 25, TRUE} // 43 + , {doBadOpenParenType, 255, 87,0, FALSE} // 44 + , {doNOP, 41 /* ) */, 3,0, TRUE} // 45 paren-comment + , {doMismatchedParenErr, 253, 87,0, FALSE} // 46 + , {doNOP, 255, 45,0, TRUE} // 47 + , {doMatchMode, 105 /* i */, 48,0, TRUE} // 48 paren-flag + , {doMatchMode, 115 /* s */, 48,0, TRUE} // 49 + , {doMatchMode, 109 /* m */, 48,0, TRUE} // 50 + , {doMatchMode, 120 /* x */, 48,0, TRUE} // 51 + , {doMatchMode, 45 /* - */, 48,0, TRUE} // 52 + , {doNOP, 41 /* ) */, 3,0, TRUE} // 53 + , {doOpenNonCaptureParen, 58 /* : */, 3, 20, TRUE} // 54 + , {doNOP, 255, 87,0, FALSE} // 55 + , {doNGStar, 63 /* ? */, 25,0, TRUE} // 56 quant-star + , {doPossesiveStar, 43 /* + */, 25,0, TRUE} // 57 + , {doStar, 255, 25,0, FALSE} // 58 + , {doNGPlus, 63 /* ? */, 25,0, TRUE} // 59 quant-plus + , {doPossesivePlus, 43 /* + */, 25,0, TRUE} // 60 + , {doPlus, 255, 25,0, FALSE} // 61 + , {doNGOpt, 63 /* ? */, 25,0, TRUE} // 62 quant-opt + , {doPossesiveOpt, 43 /* + */, 25,0, TRUE} // 63 + , {doOpt, 255, 25,0, FALSE} // 64 + , {doNOP, 129, 65,0, TRUE} // 65 interval-open + , {doNotImplementedError, 255, 87,0, FALSE} // 66 + , {doBackslashA, 65 /* A */, 3,0, TRUE} // 67 backslash + , {doBackslashB, 66 /* B */, 3,0, TRUE} // 68 + , {doBackslashb, 98 /* b */, 3,0, TRUE} // 69 + , {doBackslashd, 100 /* d */, 20,0, TRUE} // 70 + , {doBackslashD, 68 /* D */, 20,0, TRUE} // 71 + , {doBackslashG, 71 /* G */, 3,0, TRUE} // 72 + , {doNamedChar, 78 /* N */, 20,0, TRUE} // 73 + , {doProperty, 112 /* p */, 20,0, FALSE} // 74 + , {doProperty, 80 /* P */, 20,0, FALSE} // 75 + , {doEnterQuoteMode, 81 /* Q */, 3,0, TRUE} // 76 + , {doBackslashS, 83 /* S */, 20,0, TRUE} // 77 + , {doBackslashs, 115 /* s */, 20,0, TRUE} // 78 + , {doBackslashW, 87 /* W */, 20,0, TRUE} // 79 + , {doBackslashw, 119 /* w */, 20,0, TRUE} // 80 + , {doBackslashX, 88 /* X */, 20,0, TRUE} // 81 + , {doBackslashx, 120 /* x */, 20,0, TRUE} // 82 + , {doBackslashZ, 90 /* Z */, 3,0, TRUE} // 83 + , {doBackslashz, 
122 /* z */, 3,0, TRUE} // 84 + , {doBackRef, 128, 20,0, TRUE} // 85 + , {doStartString, 255, 13,0, TRUE} // 86 + , {doExit, 255, 87,0, TRUE} // 87 errorDeath }; static const char *RegexStateNames[] = { 0, "start", @@ -199,6 +205,7 @@ static const char *RegexStateNames[] = { 0, "expr-quant", 0, 0, + 0, 0, "expr-cont", 0, @@ -211,11 +218,25 @@ static const char *RegexStateNames[] = { 0, 0, 0, 0, + 0, + 0, + 0, + 0, + 0, 0, "open-paren-lookbehind", 0, 0, "paren-comment", + 0, + 0, + "paren-flag", + 0, + 0, + 0, + 0, + 0, + 0, 0, "quant-star", 0, @@ -227,14 +248,6 @@ static const char *RegexStateNames[] = { 0, 0, 0, "interval-open", - 0, - 0, - "interval-value", - 0, - 0, - 0, - "interval-close", - 0, 0, "backslash", 0, diff --git a/icu4c/source/i18n/regexcst.txt b/icu4c/source/i18n/regexcst.txt index 5438cdacf67..1a39d5b4335 100644 --- a/icu4c/source/i18n/regexcst.txt +++ b/icu4c/source/i18n/regexcst.txt @@ -107,7 +107,8 @@ string: expr-quant: '*' n quant-star '+' n quant-plus - '?' n quant-opt + '?' n quant-opt + '{' n interval-open default expr-cont @@ -136,6 +137,11 @@ open-paren-extended: '!' n term ^expr-cont doOpenLookAheadNeg # (?! '<' n open-paren-lookbehind '#' n paren-comment + 'i' n paren-flag doMatchMode + 'x' n paren-flag doMatchMode + 's' n paren-flag doMatchMode + 'm' n paren-flag doMatchMode + '-' n paren-flag doMatchMode default errorDeath doBadOpenParenType open-paren-lookbehind: @@ -150,7 +156,21 @@ open-paren-lookbehind: # paren-comment: ')' n term + eof errorDeath doMismatchedParenErr default n paren-comment + +# +# paren-flag Scanned a (?ismx-ismx flag setting thing +# TODO: this is not fully implemented yet. 
+paren-flag: + 'i' n paren-flag doMatchMode + 's' n paren-flag doMatchMode + 'm' n paren-flag doMatchMode + 'x' n paren-flag doMatchMode + '-' n paren-flag doMatchMode + ')' n term + ':' n term ^expr-quant doOpenNonCaptureParen + default errorDeath # @@ -189,19 +209,8 @@ quant-opt: # interval-open: white_space n interval-open - digit_char interval-value doIntervalMinValue - default errorDeath doNumberExpectedError + default errorDeath doNotImplementedError -interval-value: - white_space n interval-close - '}' interval-close - digit_char n interval-value doIntervalDigit - default errorDeath doNumberExpectedError - -interval-close: - white_space n interval-close - '}' n expr-cont doTagValue - default errorDeath doNumberExpectedError diff --git a/icu4c/source/i18n/regeximp.h b/icu4c/source/i18n/regeximp.h index c21bd05a9f2..5b8f1e0df43 100644 --- a/icu4c/source/i18n/regeximp.h +++ b/icu4c/source/i18n/regeximp.h @@ -4,9 +4,9 @@ // // file: regeximp.h // -// ICU Regular Expressions, declarations of internal implementation types -// and constants that are common between the pattern compiler and the -// runtime execution engine. +// ICU Regular Expressions, +// Definitions of constant values used in the compiled form of +// a regular expression pattern. 
// #ifndef _REGEXIMP_H @@ -19,7 +19,7 @@ // static const uint32_t URX_UNUSED1 = 1; static const uint32_t URX_END = 2; -static const uint32_t URX_ONECHAR = 3; +static const uint32_t URX_ONECHAR = 3; // Value field is the 21 bit unicode char to match static const uint32_t URX_STRING = 4; // Value field is index of string start static const uint32_t URX_STRING_LEN = 5; // Value field is string length (code units) static const uint32_t URX_STATE_SAVE = 6; // Value field is pattern position to push @@ -55,7 +55,7 @@ static const uint32_t URX_DOLLAR = 24; // Also for \Z // -// Access to Unicode Sets for composite properties +// Access to Unicode Sets for Perl-like composite character properties // The sets are accessed by the match engine for things like \w (word boundary) // static const uint32_t URX_ISWORD_SET = 1; diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp index f5d9ec1bfc8..25193e7c3f1 100644 --- a/icu4c/source/i18n/rematch.cpp +++ b/icu4c/source/i18n/rematch.cpp @@ -1,6 +1,9 @@ // // file: rematch.cpp // +// Contains the implementation of class RegexMatcher, +// which is one of the main API classes for the ICU regular expression package. +// /* ********************************************************************** * Copyright (C) 2002 International Business Machines Corporation * @@ -9,6 +12,8 @@ */ #include "unicode/utypes.h" +#if !UCONFIG_NO_REGULAR_EXPRESSIONS + #include "unicode/regex.h" #include "unicode/uniset.h" #include "unicode/uchar.h" @@ -443,10 +448,11 @@ int32_t RegexMatcher::start(int group, UErrorCode &err) const { // isWordBoundary // in perl, "xab..cd..", \b is true at positions 0,3,5,7 // For us, -// If the current char is a combining mark, \b is FALSE -// Scan backwards to the first non-combining char -// Pos is a boundary if the current and previous chars are -// opposite in membership in \w set +// If the current char is a combining mark, +// \b is FALSE. +// Else Scan backwards to the first non-combining char. 
+// We are at a boundary if this char and the original char are +// opposite in membership in \w set // //-------------------------------------------------------------------------------- UBool RegexMatcher::isWordBoundary(int32_t pos) { @@ -486,27 +492,6 @@ UBool RegexMatcher::isWordBoundary(int32_t pos) { } -//-------------------------------------------------------------------------------- -// -// getCaptureText We have encountered a '\' that might preceed a -// capture group specification. -// If a valid capture group number follows the '\', -// return the indicies to the start & end of the captured -// text, and update the patIdx to the position following the -// \n sequence. -// -// This function is used during find and replace operations when -// processing caputure references in the replacement text. -// -//-------------------------------------------------------------------------------- -UBool RegexMatcher::getCaptureText(const UnicodeString &rep, - int32_t &repIdx, - int32_t &textStart, - int32_t &textEnd) -{ - return FALSE; -} - //-------------------------------------------------------------------------------- // // backTrack Within the match engine, this function is called when @@ -915,10 +900,9 @@ breakFromLoop: - - const char RegexMatcher::fgClassID = 0; U_NAMESPACE_END +#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS diff --git a/icu4c/source/i18n/repattrn.cpp b/icu4c/source/i18n/repattrn.cpp index b28c0a90a75..1fdf37f778c 100644 --- a/icu4c/source/i18n/repattrn.cpp +++ b/icu4c/source/i18n/repattrn.cpp @@ -9,6 +9,9 @@ */ #include "unicode/utypes.h" + +#if !UCONFIG_NO_REGULAR_EXPRESSIONS + #include "unicode/regex.h" #include "uassert.h" #include "uvector.h" @@ -66,6 +69,7 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) { fBadState = other.fBadState; fNumCaptureGroups = other.fNumCaptureGroups; fMaxCaptureDigits = other.fMaxCaptureDigits; + fStaticSets = other.fStaticSets; if (fBadState) { return *this; } @@ -110,6 +114,7 @@ void 
RegexPattern::init() { fBadState = FALSE; fNumCaptureGroups = 0; fMaxCaptureDigits = 1; // TODO: calculate for real. + fStaticSets = NULL; fMatcher = NULL; UErrorCode status=U_ZERO_ERROR; @@ -384,15 +389,6 @@ int32_t RegexPattern::split(const UnicodeString &input, -//--------------------------------------------------------------------- -// -// hashcode -// -//--------------------------------------------------------------------- -int32_t RegexPattern::hashCode(void) const { - return 0; // TODO: Do something better here -}; - //--------------------------------------------------------------------- // @@ -512,8 +508,8 @@ breakFromLoop: printf("\n\n"); }; - - const char RegexPattern::fgClassID = 0; + U_NAMESPACE_END +#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS diff --git a/icu4c/source/i18n/unicode/regex.h b/icu4c/source/i18n/unicode/regex.h index 412210f0ed4..0b10329dfb6 100644 --- a/icu4c/source/i18n/unicode/regex.h +++ b/icu4c/source/i18n/unicode/regex.h @@ -9,6 +9,9 @@ #define REGEX_H #include "unicode/utypes.h" + +#if !UCONFIG_NO_REGULAR_EXPRESSIONS + #include "unicode/uobject.h" #include "unicode/unistr.h" #include "unicode/parseerr.h" @@ -69,9 +72,6 @@ public: RegexPattern &operator =(const RegexPattern &other); virtual RegexPattern *clone() const; - // TODO: Do we really want a hashCode function on this class? 
- virtual int32_t hashCode(void) const; - /** * Compiles the given regular expression into a pattern @@ -428,10 +428,6 @@ private: // void MatchAt(int32_t startIdx, UErrorCode &status); inline void backTrack(int32_t &inputIdx, int32_t &patIdx); - UBool getCaptureText(const UnicodeString &rep, - int32_t &repIdx, - int32_t &textStart, - int32_t &textEnd); UBool isWordBoundary(int32_t pos); // perform the \b test @@ -448,7 +444,6 @@ private: }; - - U_NAMESPACE_END +#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS #endif diff --git a/icu4c/source/test/intltest/itmajor.cpp b/icu4c/source/test/intltest/itmajor.cpp index 08e009bb878..eb9b2e1d7c2 100644 --- a/icu4c/source/test/intltest/itmajor.cpp +++ b/icu4c/source/test/intltest/itmajor.cpp @@ -70,11 +70,13 @@ void MajorTestLevel::runIndexedTest( int32_t index, UBool exec, const char* &nam break; case 3: name = "regex"; +#if !UCONFIG_NO_REGULAR_EXPRESSIONS if (exec) { logln("TestSuite Regex---"); logln(); RegexTest test; callTest( test, par ); } +#endif break; case 4: name = "format"; diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp index 492ac16a46a..f551d764de1 100644 --- a/icu4c/source/test/intltest/regextst.cpp +++ b/icu4c/source/test/intltest/regextst.cpp @@ -11,6 +11,8 @@ // #include "unicode/utypes.h" +#if !UCONFIG_NO_REGULAR_EXPRESSIONS + #include "unicode/uchar.h" #include "intltest.h" #include "regextst.h" @@ -1195,8 +1197,38 @@ void RegexTest::Errors() { REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED); // Missing close parentheses - //REGEX_ERR("Comment (?# with no close", 1, 0, U_REGEX_INTERNAL_ERROR); + REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN); + REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN); + REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN); + + // Extra close paren + REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN); 
+ REGEX_ERR(")))))))", 1, 1, U_REGEX_RULE_SYNTAX); + REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN); + + // Flag settings not yet implemented + REGEX_ERR("(?i:stuff*)", 1, 3, U_REGEX_UNIMPLEMENTED); + REGEX_ERR("(?-si) stuff", 1, 3, U_REGEX_UNIMPLEMENTED); + + // Look-ahead, Look-behind + REGEX_ERR("abc(?=xyz).*", 1, 6, U_REGEX_UNIMPLEMENTED); // look-ahead + REGEX_ERR("abc(?!xyz).*", 1, 6, U_REGEX_UNIMPLEMENTED); // negated look-ahead + REGEX_ERR("abc(?<=xyz).*", 1, 7, U_REGEX_UNIMPLEMENTED); // look-behind + REGEX_ERR("abc(?xyz)", 1, 6, U_REGEX_UNIMPLEMENTED); + + // {Numeric Quantifiers} + REGEX_ERR("abc{4}", 1, 5, U_REGEX_UNIMPLEMENTED); + + + // Quantifiers are allowed only after something that can be quantified. + REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX); + REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX); + REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX); } - +#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ diff --git a/icu4c/source/test/intltest/regextst.h b/icu4c/source/test/intltest/regextst.h index 0718535afb0..fb249b6b95f 100644 --- a/icu4c/source/test/intltest/regextst.h +++ b/icu4c/source/test/intltest/regextst.h @@ -8,6 +8,8 @@ #ifndef REGEXTST_H #define REGEXTST_H +#include "unicode/utypes.h" +#if !UCONFIG_NO_REGULAR_EXPRESSIONS #include "intltest.h" #include "unicode/regex.h" @@ -35,4 +37,6 @@ public: virtual void regex_err(const char *pat, int32_t errline, int32_t errcol, UErrorCode expectedStatus, int line); }; + +#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS #endif