ICU-2925 API extensions, work in process.

X-SVN-Rev: 13491
This commit is contained in:
Andy Heninger 2003-10-25 00:52:03 +00:00
parent cea34629f2
commit 14325e430b
3 changed files with 206 additions and 33 deletions

View file

@ -1197,7 +1197,7 @@ UBool RegexCompile::doParseActions(EParseAction action)
// the compiled code for it.
// Because capture groups can be forward-referenced by back-references,
// we fill the operand with the capture group number. At the end
// of compilation, it will be changed to the variables location.
// of compilation, it will be changed to the variable's location.
U_ASSERT(groupNum > 0);
int32_t op;
if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
@ -2498,7 +2498,7 @@ void RegexCompile::matchStartType() {
fRXPat->fStartType = START_STRING;
fRXPat->fInitialChar = c;
} else if (fRXPat->fStartType == START_LINE) {
// Match at start of line in Mulit-Line mode.
// Match at start of line in Multi-Line mode.
// Nothing to do here; everything is already set.
} else if (fRXPat->fMinMatchLen == 0) {
// Zero length match possible. We could start anywhere.

View file

@ -516,6 +516,11 @@ const UnicodeString &RegexMatcher::input() const {
//--------------------------------------------------------------------------------
//
// lookingAt()
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::lookingAt(UErrorCode &status) {
if (U_FAILURE(status)) {
return FALSE;
@ -530,7 +535,30 @@ UBool RegexMatcher::lookingAt(UErrorCode &status) {
}
// Attempt a match that begins exactly at input index `start`.  The match may
// end anywhere; it is not required to consume the rest of the input.
//
//   start   index in the input string at which the match must begin.  Valid
//           values are 0 through the input length inclusive; the end-of-input
//           position is permitted so that patterns able to match the empty
//           string can be tested there (and so that empty input is usable).
//   status  receives fDeferredStatus if the matcher is already in error, or
//           U_INDEX_OUTOFBOUNDS_ERROR if start is outside the valid range.
//
// Returns TRUE if the pattern matched starting at `start`, FALSE on no match
// or on any error.
UBool RegexMatcher::lookingAt(int32_t start, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return FALSE;
    }
    if (U_FAILURE(fDeferredStatus)) {
        // A previously recorded failure disables this matcher; report it now.
        status = fDeferredStatus;
        return FALSE;
    }
    // Note '>' rather than '>=': start == length() is a legal place to try a
    // (necessarily zero-length) match.
    if (start < 0 || start > fInput->length()) {
        status = U_INDEX_OUTOFBOUNDS_ERROR;
        return FALSE;
    }
    // Forget any earlier match state before running the engine at `start`.
    reset();
    MatchAt(start, status);
    return fMatch;
}
//--------------------------------------------------------------------------------
//
// matches()
//
//--------------------------------------------------------------------------------
UBool RegexMatcher::matches(UErrorCode &status) {
if (U_FAILURE(status)) {
return FALSE;
@ -546,6 +574,24 @@ UBool RegexMatcher::matches(UErrorCode &status) {
}
// Attempt to match the pattern against the input from index `start` through
// the end of the input.  Unlike lookingAt(), the match only counts if it
// extends all the way to the end of the input string.
//
//   start   index in the input string at which the match must begin.  Valid
//           values are 0 through the input length inclusive; the end-of-input
//           position is permitted so that patterns able to match the empty
//           string can be tested there (and so that empty input is usable).
//   status  receives fDeferredStatus if the matcher is already in error, or
//           U_INDEX_OUTOFBOUNDS_ERROR if start is outside the valid range.
//
// Returns TRUE only when a match was found and it ended at the end of input.
UBool RegexMatcher::matches(int32_t start, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return FALSE;
    }
    if (U_FAILURE(fDeferredStatus)) {
        // A previously recorded failure disables this matcher; report it now.
        status = fDeferredStatus;
        return FALSE;
    }
    // Note '>' rather than '>=': start == length() is a legal place to try a
    // (necessarily zero-length) match.
    if (start < 0 || start > fInput->length()) {
        status = U_INDEX_OUTOFBOUNDS_ERROR;
        return FALSE;
    }
    // Forget any earlier match state before running the engine at `start`.
    reset();
    MatchAt(start, status);
    // A match that stops short of the end of the input does not qualify.
    UBool success = (fMatch && fMatchEnd==fInput->length());
    return success;
}
const RegexPattern &RegexMatcher::pattern() const {
@ -618,6 +664,7 @@ RegexMatcher &RegexMatcher::reset() {
fMatchEnd = 0;
fLastMatchEnd = 0;
fMatch = FALSE;
fTouchedEnd = FALSE;
resetStack();
return *this;
}
@ -631,23 +678,23 @@ RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
}
REStackFrame *RegexMatcher::resetStack() {
// Discard any previous contents of the state save stack, and initialize a
// new stack frame to all -1. The -1s are needed for capture group limits, where
// they indicate that a group has not yet matched anything.
fStack->removeAllElements();
int32_t *iFrame = fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
int i;
for (i=0; i<fPattern->fFrameSize; i++) {
iFrame[i] = -1;
RegexMatcher &RegexMatcher::reset(int32_t position, UErrorCode &status) {
if (U_FAILURE(status)) {
return *this;
}
return (REStackFrame *)iFrame;
reset();
if (position < 0 || position >= fInput->length()) {
status = U_INDEX_OUTOFBOUNDS_ERROR;
return *this;
}
fMatchEnd = position;
return *this;
}
//--------------------------------------------------------------------------------
//
// setTrace
@ -791,6 +838,48 @@ int32_t RegexMatcher::start(int group, UErrorCode &status) const {
}
//--------------------------------------------------------------------------------
//
// touchedEnd
//
//--------------------------------------------------------------------------------
// Report whether the match engine ran into the end of the input during the
// most recent match attempt.  The flag is cleared by reset() and at the start
// of every MatchAt() call, and is set by the engine when it needs input
// beyond the end of the string.
UBool RegexMatcher::touchedEnd() {
    const UBool reachedEndOfInput = fTouchedEnd;
    return reachedEndOfInput;
}
//================================================================================
//
// Code following this point in this file is the internal
// Match Engine Implementation.
//
//================================================================================
//--------------------------------------------------------------------------------
//
// resetStack
// Discard any previous contents of the state save stack, and initialize a
// new stack frame to all -1. The -1s are needed for capture group limits,
// where they indicate that a group has not yet matched anything.
//--------------------------------------------------------------------------------
// Empty the backtracking state stack and push one fresh frame whose slots
// are all -1.  A value of -1 in a capture group limit means that the group
// has not matched anything yet.  Any failure from reserving the frame is
// recorded in fDeferredStatus.  Returns the new frame.
REStackFrame *RegexMatcher::resetStack() {
    fStack->removeAllElements();

    int32_t *frameSlots = fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
    int slot = 0;
    while (slot < fPattern->fFrameSize) {
        frameSlots[slot] = -1;
        ++slot;
    }
    return (REStackFrame *)frameSlots;
}
//--------------------------------------------------------------------------------
//
@ -915,6 +1004,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
if (U_FAILURE(status)) {
return;
}
fTouchedEnd = FALSE;
// Cache frequently referenced items from the compiled pattern
// in local variables.
@ -982,6 +1072,8 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
if (c == opValue) {
break;
}
} else {
fTouchedEnd = TRUE;
}
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
@ -1004,6 +1096,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
if (fp->fInputIdx + stringLen > inputLen) {
// No match. String is longer than the remaining input text.
fTouchedEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
@ -1188,6 +1281,7 @@ void RegexMatcher::MatchAt(int32_t startIdx, UErrorCode &status) {
// Fail if at end of input
if (fp->fInputIdx >= inputLen) {
fTouchedEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
@ -1277,6 +1371,7 @@ GC_Done:
// 0: success if input char is in set.
// 1: success if input char is not in set.
if (fp->fInputIdx >= inputLen) {
fTouchedEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
@ -1309,6 +1404,7 @@ GC_Done:
// Test input character for NOT being a member of one of
// the predefined sets (Word Characters, for example)
if (fp->fInputIdx >= inputLen) {
fTouchedEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
@ -1352,6 +1448,8 @@ GC_Done:
break;
}
}
} else {
fTouchedEnd = TRUE;
}
// Either at end of input, or the character wasn't in the set.
// Either way, we need to back track out.
@ -1364,6 +1462,7 @@ GC_Done:
// . matches anything, but stops at end-of-line.
if (fp->fInputIdx >= inputLen) {
// At end of input. Match failed. Backtrack out.
fTouchedEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
@ -1385,6 +1484,7 @@ GC_Done:
// ., in dot-matches-all (including new lines) mode
if (fp->fInputIdx >= inputLen) {
// At end of input. Match failed. Backtrack out.
fTouchedEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
@ -1407,6 +1507,7 @@ GC_Done:
{
// Fail if input already exhausted.
if (fp->fInputIdx >= inputLen) {
fTouchedEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
}
@ -1439,6 +1540,7 @@ GC_Done:
{
// Match up to end of input. Fail if already at end of input.
if (fp->fInputIdx >= inputLen) {
fTouchedEnd = TRUE;
fp = (REStackFrame *)fStack->popFrame(frameSize);
} else {
fp->fInputIdx = inputLen;
@ -1639,14 +1741,7 @@ GC_Done:
// we do too.
break;
}
/*
if ((fp->fInputIdx + len > inputLen) ||
u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, len) != 0) {
fp = (REStackFrame *)fStack->popFrame(frameSize); // FAIL, no match.
} else {
fp->fInputIdx += len; // Match. Advance current input position.
}
*/
UBool haveMatch = FALSE;
if (fp->fInputIdx + len <= inputLen) {
if (opType == URX_BACKREF) {
@ -1659,6 +1754,8 @@ GC_Done:
haveMatch = TRUE;
}
}
} else {
fTouchedEnd = TRUE;
}
if (haveMatch) {
fp->fInputIdx += len; // Match. Advance current input position.
@ -1729,7 +1826,10 @@ GC_Done:
if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
break;
}
} else {
fTouchedEnd = TRUE;
}
fp = (REStackFrame *)fStack->popFrame(frameSize);
break;
@ -1747,17 +1847,21 @@ GC_Done:
opValue = URX_VAL(op);
U_ASSERT(opType == URX_STRING_LEN);
stringLen = opValue;
int32_t stringEndIndex = fp->fInputIdx + stringLen;
if (stringEndIndex <= inputLen &&
u_strncasecmp(inputBuf+fp->fInputIdx, litText+stringStartIdx,
stringLen, U_FOLD_CASE_DEFAULT) == 0) {
// Success. Advance the current input position.
fp->fInputIdx = stringEndIndex;
if (stringEndIndex <= inputLen) {
if (u_strncasecmp(inputBuf+fp->fInputIdx, litText+stringStartIdx,
stringLen, U_FOLD_CASE_DEFAULT) == 0) {
// Success. Advance the current input position.
fp->fInputIdx = stringEndIndex;
break;
}
} else {
// No match. Back up matching to a saved state
fp = (REStackFrame *)fStack->popFrame(frameSize);
fTouchedEnd = TRUE;
}
// No match. Back up matching to a saved state
fp = (REStackFrame *)fStack->popFrame(frameSize);
}
break;
@ -1953,6 +2057,7 @@ GC_Done:
int32_t ix = fp->fInputIdx;
for (;;) {
if (ix >= inputLen) {
fTouchedEnd = TRUE;
break;
}
UChar32 c;
@ -2003,13 +2108,19 @@ GC_Done:
{
// Loop through input until the input is exhausted (we reach an end-of-line)
// In multi-line mode, we can just go straight to the end of the input.
int32_t ix = inputLen;
if (opValue == 0) {
int32_t ix;
if (opValue == 1) {
// Multi-line mode.
ix = inputLen;
fTouchedEnd = TRUE;
} else {
// NOT multi-line mode. Line endings do not match '.'
// Scan forward until a line ending or end of input.
ix = fp->fInputIdx;
for (;;) {
if (ix >= inputLen) {
ix = inputLen;
fTouchedEnd = TRUE;
break;
}
UChar32 c;

View file

@ -79,7 +79,16 @@ enum {
* If set, recognize line terminators within string,
* otherwise, match only at start and end of input string.
* @draft ICU 2.4 */
UREGEX_MULTILINE = 8
UREGEX_MULTILINE = 8,
/** Unicode word boundaries.
* If set, \b uses the Unicode TR 29 definition of word boundaries.
* Warning: Unicode word boundaries are quite different from
* traditional regular expression word boundaries. See
* http://unicode.org/reports/tr29/#Word_Boundaries
* @draft ICU 2.8
*/
UREGEX_UWORD = 256
};
@ -479,6 +488,17 @@ public:
*/
virtual UBool matches(UErrorCode &status);
/**
* Attempts to match the input string, beginning at startIndex, against the pattern.
* The match must extend to the end of the input string.
* @param startIndex The input string index at which to begin matching.
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if there is a match
* @draft ICU 2.8
*/
virtual UBool matches(int32_t startIndex, UErrorCode &status);
/**
@ -496,6 +516,21 @@ public:
virtual UBool lookingAt(UErrorCode &status);
/**
* Attempts to match the input string, starting from the specified index, against the pattern.
* The match may be of any length, and is not required to extend to the end
* of the input string. Contrast with matches().
*
* <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
* <code>end()</code>, and <code>group()</code> functions.</p>
*
* @param startIndex The input string index at which to begin matching.
* @param status A reference to a UErrorCode to receive any errors.
* @return TRUE if there is a match.
* @draft ICU 2.8
*/
virtual UBool lookingAt(int32_t startIndex, UErrorCode &status);
/**
* Find the next pattern match in the input string.
* The find begins searching the input at the location following the end of
@ -612,6 +647,18 @@ public:
virtual int32_t end(int group, UErrorCode &status) const;
/**
* Return TRUE if the most recent match, or attempted match, touched
* the end of the input string. For failed matches, this normally
* means that some amount of additional input, appended to the
* existing input string, could have resulted in a match.
* @return True if the most recently attempted match reached the
* end of the input string.
* @draft ICU 2.8
*/
virtual UBool touchedEnd();
/**
* Resets this matcher. The effect is to remove any memory of previous matches,
* and to cause subsequent find() operations to begin at the beginning of
@ -623,6 +670,18 @@ public:
virtual RegexMatcher &reset();
/**
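* Resets this matcher, and sets the current input position.
* The effect is to remove any memory of previous matches,
* and to cause subsequent find() operations to begin at
* the specified position in the input string.
*
* @param index The position in the input string at which subsequent
* find() operations will begin.
* @param status A reference to a UErrorCode to receive any errors.
* Set to U_INDEX_OUTOFBOUNDS_ERROR if index is out of range.
* @return this RegexMatcher.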
* @draft ICU 2.8
*/
virtual RegexMatcher &reset(int32_t index, UErrorCode &status);
/**
* Resets this matcher with a new input string. This allows instances of RegexMatcher
* to be reused, which is more efficient than creating a new RegexMatcher for
@ -834,6 +893,9 @@ private:
UErrorCode fDeferredStatus; // Save error state if that cannot be immediately
// reported, or that permanently disables this matcher.
UBool fTouchedEnd; // Set true if match engine reaches eof on input
// while attempting a match.
};
U_NAMESPACE_END