ICU-2421 regexp C API

X-SVN-Rev: 14777
This commit is contained in:
Andy Heninger 2004-03-26 20:20:21 +00:00
parent 6ffcca54d2
commit c685748636
3 changed files with 59 additions and 70 deletions

View file

@ -40,7 +40,7 @@ typedef struct URegularExpression URegularExpression;
* @stable ICU 2.4
*/
typedef enum URegexpFlag{
/** Forces normalization of pattern and strings. @draft ICU 3.0 */
/** Forces normalization of pattern and strings. @draft ICU 2.4 */
UREGEX_CANON_EQ = 128,
/** Enable case insensitive matching. @stable ICU 2.4 */
@ -87,9 +87,10 @@ typedef enum URegexpFlag{
* error within the source regular expression string. If this
* information is not wanted, pass NULL for this parameter.
* @param status Receives error detected by this function.
* @draft ICU 3.0
*
*/
U_STABLE URegularExpression * U_EXPORT2
U_DRAFT URegularExpression * U_EXPORT2
uregex_open( const UChar *pattern,
int32_t patternLength,
uint32_t flags,
@ -117,8 +118,9 @@ uregex_open( const UChar *pattern,
* @return The URegularExpression object representing the compiled
* pattern.
*
* @draft ICU 3.0
*/
U_STABLE URegularExpression * U_EXPORT2
U_DRAFT URegularExpression * U_EXPORT2
uregex_openC( const char *pattern,
uint32_t flags,
UParseError *pe,
@ -132,8 +134,9 @@ uregex_openC( const char *pattern,
* was holding.
*
* @param regexp The regular expression to be closed.
* @draft ICU 3.0
*/
U_STABLE void U_EXPORT2
U_DRAFT void U_EXPORT2
uregex_close(URegularExpression *regexp);
/**
@ -152,8 +155,9 @@ uregex_close(URegularExpression *regexp);
* @param regexp The compiled regular expression to be cloned.
* @param status Receives indication of any errors encountered
* @return the cloned copy of the compiled regular expression.
* @draft ICU 3.0
*/
U_STABLE URegularExpression * U_EXPORT2
U_DRAFT URegularExpression * U_EXPORT2
uregex_clone(const URegularExpression *regexp, UErrorCode *status);
/**
@ -170,8 +174,9 @@ uregex_clone(const URegularExpression *regexp, UErrorCode *status);
* owned by the regular expression object, and must not be
* altered or deleted by the application. The returned string
* will remain valid until the regular expression is closed.
* @draft ICU 3.0
*/
U_STABLE const UChar * U_EXPORT2
U_DRAFT const UChar * U_EXPORT2
uregex_pattern(const URegularExpression *regexp,
int32_t *patLength,
UErrorCode *status);
@ -183,8 +188,9 @@ uregex_pattern(const URegularExpression *regexp,
* @param regexp The compiled regular expression.
* @return The match mode flags
* @see URegexpFlag
* @draft ICU 3.0
*/
U_STABLE int32_t U_EXPORT2
U_DRAFT int32_t U_EXPORT2
uregex_flags(const URegularExpression *regexp,
UErrorCode *status);
@ -207,8 +213,9 @@ uregex_flags(const URegularExpression *regexp,
* @param textLength The length of the subject text, or -1 if the string
* is NUL terminated.
* @param status Receives errors detected by this function.
* @draft ICU 3.0
*/
U_STABLE void U_EXPORT2
U_DRAFT void U_EXPORT2
uregex_setText(URegularExpression *regexp,
const UChar *text,
int32_t textLength,
@ -228,6 +235,7 @@ uregex_setText(URegularExpression *regexp,
* @param status Receives errors detected by this function.
* @return Pointer to the subject text string currently associated with
* this regular expression.
* @draft ICU 3.0
*/
U_DRAFT const UChar * U_EXPORT2
uregex_getText(URegularExpression *regexp,

View file

@ -620,6 +620,19 @@ unescape_charAt(int32_t offset, void *context) {
static const UChar BACKSLASH = 0x5c;
static const UChar DOLLARSIGN = 0x24;
//
// Move a character to an output buffer, with bounds checking on the index.
// Index advances even if capacity is exceeded, for preflight size computations.
// This little sequence is used a LOT.
//
static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
    // Snapshot the current output position, then advance the caller's index
    // unconditionally so that it keeps counting past the end of the buffer.
    // Callers rely on the final index value to learn the full (untruncated)
    // output length when preflighting.
    const int32_t pos = *idx;
    *idx = pos + 1;

    // Only store the character when the slot actually lies inside the buffer.
    if (pos < bufCapacity) {
        buf[pos] = c;
    }
}
//
// appendReplacement, the actual implementation.
//
@ -659,9 +672,10 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
return 0;
}
int32_t resultLen = 0;
int32_t capacityRemaining = *destCapacity;
UChar *dest = *destBuf;
UChar *dest = *destBuf;
int32_t capacity = *destCapacity;
int32_t destIdx = 0;
int32_t i;
// If it wasn't supplied by the caller, get the length of the replacement text.
// TODO: slightly smarter logic in the copy loop could watch for the NUL on
@ -671,19 +685,10 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
}
// Copy input string from the end of previous match to start of current match
int32_t startIdx = m->fLastMatchEnd;
int32_t len = m->fMatchStart - startIdx;
if (len > 0) {
if (len < capacityRemaining) {
// TODO: replace memcpy with inline loop
u_memcpy(&dest[resultLen], &regexp->fText[startIdx], len);
capacityRemaining -= len;
} else if (capacityRemaining > 0) {
u_memcpy(&dest[resultLen], &regexp->fText[startIdx], capacityRemaining);
capacityRemaining = 0;
}
resultLen += len;
for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) {
appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
}
// scan the replacement text, looking for substitutions ($n) and \escapes.
@ -694,11 +699,7 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
if (c != DOLLARSIGN && c != BACKSLASH) {
// Common case, no substitution, no escaping,
// just copy the char to the dest buf.
if (capacityRemaining > 0) {
dest[resultLen] = c;
capacityRemaining--;
}
resultLen++;
appendToBuf(c, &destIdx, dest, capacity);
continue;
}
@ -718,41 +719,25 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
UChar32 escapedChar =
u_unescapeAt(unescape_charAt,
&replIdx, // Index is updated by unescapeAt
replacementLength-replIdx, // Remaining length of replacement text
replacementLength, // Length of replacement text
replacementText);
if (escapedChar != (UChar32)0xFFFFFFFF) {
if (escapedChar <= 0xffff) {
if (capacityRemaining > 0) {
dest[resultLen] = (UChar)escapedChar;
capacityRemaining--;
}
resultLen++;
appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
} else {
if (capacityRemaining > 0) {
dest[resultLen] = U16_LEAD(escapedChar);
capacityRemaining--;
}
resultLen++;
if (capacityRemaining > 0) {
dest[resultLen] = U16_TRAIL(escapedChar);
capacityRemaining--;
}
resultLen++;
appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
}
continue;
continue;
}
// Note: if the \u escape was invalid, just fall through and
// treat it as a plain \<anything> escape.
}
// Plain backslash escape. Just put out the escaped character.
if (capacityRemaining > 0) {
dest[resultLen] = c;
capacityRemaining--;
}
resultLen++;
appendToBuf(c, &destIdx, dest, capacity);
replIdx++;
continue;
}
@ -787,20 +772,16 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
if (numDigits == 0) {
// The $ didn't introduce a group number at all.
// Treat it as just part of the substitution text.
if (capacityRemaining > 0) {
dest[resultLen] = DOLLARSIGN;
capacityRemaining--;
}
resultLen++;
appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
continue;
}
// Finally, append the capture group data to the destination.
resultLen += uregex_group(regexp, groupNum, dest+resultLen, capacityRemaining, status);
capacityRemaining = *destCapacity - resultLen;
int32_t capacityRemaining = capacity - destIdx;
if (capacityRemaining < 0) {
capacityRemaining = 0;
}
destIdx += uregex_group(regexp, groupNum, dest+destIdx, capacityRemaining, status);
if (*status == U_BUFFER_OVERFLOW_ERROR) {
// Ignore buffer overflow when extracting the group. We need to
// continue on to get full size of the untruncated result. We will
@ -819,9 +800,9 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
// Nul Terminate the dest buffer if possible.
// Set the appropriate buffer overflow or not terminated error, if needed.
//
if (resultLen < *destCapacity) {
dest[resultLen] = 0;
} else if (resultLen == *destCapacity) {
if (destIdx < capacity) {
dest[destIdx] = 0;
} else if (destIdx == *destCapacity) {
*status = U_STRING_NOT_TERMINATED_WARNING;
} else {
*status = U_BUFFER_OVERFLOW_ERROR;
@ -830,13 +811,13 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
//
// Return an updated dest buffer and capacity to the caller.
//
if (resultLen > 0 && *destCapacity > 0) {
if (capacityRemaining == 0) {
*destBuf += *destCapacity;
*destCapacity = 0;
if (destIdx > 0 && *destCapacity > 0) {
if (destIdx < capacity) {
*destBuf += destIdx;
*destCapacity -= destIdx;
} else {
*destBuf += resultLen;
*destCapacity -= resultLen;
*destBuf += capacity;
*destCapacity = 0;
}
}
@ -847,7 +828,7 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
*status = U_BUFFER_OVERFLOW_ERROR;
}
return resultLen;
return destIdx;
}
//

View file

@ -682,10 +682,10 @@ void TestRegexCAPI(void) {
TEST_ASSERT_SUCCESS(status);
bufPtr = buf;
bufCap = sizeof(buf) / 2;
u_uastrncpy(repl, "abc\\u0041 \\U00000042 \\\\ \\abc", sizeof(repl)/2);
u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ $ \\abc", sizeof(repl)/2);
uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
TEST_ASSERT_SUCCESS(status);
/* TEST_ASSERT_STRING("abcAB \\ abc", buf, TRUE); TODO: */
TEST_ASSERT_STRING("abcAB \\ $ abc", buf, TRUE);
}