From c685748636ca9b7861fa7b76bfc0977a5d54edaf Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Fri, 26 Mar 2004 20:20:21 +0000 Subject: [PATCH] ICU-2421 regexp C API X-SVN-Rev: 14777 --- icu4c/source/i18n/unicode/uregex.h | 24 ++++--- icu4c/source/i18n/uregex.cpp | 101 +++++++++++---------------- icu4c/source/test/cintltst/reapits.c | 4 +- 3 files changed, 59 insertions(+), 70 deletions(-) diff --git a/icu4c/source/i18n/unicode/uregex.h b/icu4c/source/i18n/unicode/uregex.h index ed968413c32..51c2206fbaf 100644 --- a/icu4c/source/i18n/unicode/uregex.h +++ b/icu4c/source/i18n/unicode/uregex.h @@ -40,7 +40,7 @@ typedef struct URegularExpression URegularExpression; * @stable ICU 2.4 */ typedef enum URegexpFlag{ - /** Forces normalization of pattern and strings. @draft ICU 3.0 */ + /** Forces normalization of pattern and strings. @draft ICU 2.4 */ UREGEX_CANON_EQ = 128, /** Enable case insensitive matching. @stable ICU 2.4 */ @@ -87,9 +87,10 @@ typedef enum URegexpFlag{ * error within the source regular expression string. If this * information is not wanted, pass NULL for this parameter. * @param status Receives error detected by this function. + * @draft ICU 3.0 * */ -U_STABLE URegularExpression * U_EXPORT2 +U_DRAFT URegularExpression * U_EXPORT2 uregex_open( const UChar *pattern, int32_t patternLength, uint32_t flags, @@ -117,8 +118,9 @@ uregex_open( const UChar *pattern, * @return The URegularExpression object representing the compiled * pattern. * + * @draft ICU 3.0 */ -U_STABLE URegularExpression * U_EXPORT2 +U_DRAFT URegularExpression * U_EXPORT2 uregex_openC( const char *pattern, uint32_t flags, UParseError *pe, @@ -132,8 +134,9 @@ uregex_openC( const char *pattern, * was holding. * * @param regexp The regular expression to be closed. 
+ * @draft ICU 3.0 */ -U_STABLE void U_EXPORT2 +U_DRAFT void U_EXPORT2 uregex_close(URegularExpression *regexp); /** @@ -152,8 +155,9 @@ uregex_close(URegularExpression *regexp); * @param regexp The compiled regular expression to be cloned. * @param status Receives indication of any errors encountered * @return the cloned copy of the compiled regular expression. + * @draft ICU 3.0 */ -U_STABLE URegularExpression * U_EXPORT2 +U_DRAFT URegularExpression * U_EXPORT2 uregex_clone(const URegularExpression *regexp, UErrorCode *status); /** @@ -170,8 +174,9 @@ uregex_clone(const URegularExpression *regexp, UErrorCode *status); * owned by the regular expression object, and must not be * altered or deleted by the application. The returned string * will remain valid until the regular expression is closed. + * @draft ICU 3.0 */ -U_STABLE const UChar * U_EXPORT2 +U_DRAFT const UChar * U_EXPORT2 uregex_pattern(const URegularExpression *regexp, int32_t *patLength, UErrorCode *status); @@ -183,8 +188,9 @@ uregex_pattern(const URegularExpression *regexp, * @param regexp The compiled regular expression. * @param return The match mode flags * @see URegexpFlag + * @draft ICU 3.0 */ -U_STABLE int32_t U_EXPORT2 +U_DRAFT int32_t U_EXPORT2 uregex_flags(const URegularExpression *regexp, UErrorCode *status); @@ -207,8 +213,9 @@ uregex_flags(const URegularExpression *regexp, * @param textLength The length of the subject text, or -1 if the string * is NUL terminated. * @param status Receives errors detected by this function. + * @draft ICU 3.0 */ -U_STABLE void U_EXPORT2 +U_DRAFT void U_EXPORT2 uregex_setText(URegularExpression *regexp, const UChar *text, int32_t textLength, @@ -228,6 +235,7 @@ uregex_setText(URegularExpression *regexp, * @param status Receives errors detected by this function. * @return Poiner to the subject text string currently associated with * this regular expression. 
+ * @draft ICU 3.0 */ U_DRAFT const UChar * U_EXPORT2 uregex_getText(URegularExpression *regexp, diff --git a/icu4c/source/i18n/uregex.cpp b/icu4c/source/i18n/uregex.cpp index 4f38e64b6d3..efcf46a3fd0 100644 --- a/icu4c/source/i18n/uregex.cpp +++ b/icu4c/source/i18n/uregex.cpp @@ -620,6 +620,19 @@ unescape_charAt(int32_t offset, void *context) { static const UChar BACKSLASH = 0x5c; static const UChar DOLLARSIGN = 0x24; +// +// Move a character to an output buffer, with bounds checking on the index. +// Index advances even if capacity is exceeded, for preflight size computations. +// This little sequence is used a LOT. +// +static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) { + if (*idx < bufCapacity) { + buf[*idx] = c; + } + (*idx)++; +} + + // // appendReplacement, the actual implementation. // @@ -659,9 +672,10 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, return 0; } - int32_t resultLen = 0; - int32_t capacityRemaining = *destCapacity; - UChar *dest = *destBuf; + UChar *dest = *destBuf; + int32_t capacity = *destCapacity; + int32_t destIdx = 0; + int32_t i; // If it wasn't supplied by the caller, get the length of the replacement text. 
+ // TODO: slightly smarter logic in the copy loop could watch for the NUL on @@ -671,19 +685,10 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, } // Copy input string from the end of previous match to start of current match - int32_t startIdx = m->fLastMatchEnd; - int32_t len = m->fMatchStart - startIdx; - if (len > 0) { - if (len < capacityRemaining) { - // TODO: replace memcpy with inline loop - u_memcpy(&dest[resultLen], &regexp->fText[startIdx], len); - capacityRemaining -= len; - } else if (capacityRemaining > 0) { - u_memcpy(&dest[resultLen], &regexp->fText[startIdx], capacityRemaining); - capacityRemaining = 0; - } - resultLen += len; + for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) { + appendToBuf(regexp->fText[i], &destIdx, dest, capacity); } + // scan the replacement text, looking for substitutions ($n) and \escapes. @@ -694,11 +699,7 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, if (c != DOLLARSIGN && c != BACKSLASH) { // Common case, no substitution, no escaping, // just copy the char to the dest buf. 
- if (capacityRemaining > 0) { - dest[resultLen] = c; - capacityRemaining--; - } - resultLen++; + appendToBuf(c, &destIdx, dest, capacity); continue; } @@ -718,41 +719,25 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, UChar32 escapedChar = u_unescapeAt(unescape_charAt, &replIdx, // Index is updated by unescapeAt - replacementLength-replIdx, // Remaining length of replacement text + replacementLength, // Length of replacement text replacementText); if (escapedChar != (UChar32)0xFFFFFFFF) { if (escapedChar <= 0xffff) { - if (capacityRemaining > 0) { - dest[resultLen] = (UChar)escapedChar; - capacityRemaining--; - } - resultLen++; + appendToBuf((UChar)escapedChar, &destIdx, dest, capacity); } else { - if (capacityRemaining > 0) { - dest[resultLen] = U16_LEAD(escapedChar); - capacityRemaining--; - } - resultLen++; - if (capacityRemaining > 0) { - dest[resultLen] = U16_TRAIL(escapedChar); - capacityRemaining--; - } - resultLen++; + appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity); + appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity); } - - continue; + continue; } // Note: if the \u escape was invalid, just fall through and // treat it as a plain \ escape. } // Plain backslash escape. Just put out the escaped character. - if (capacityRemaining > 0) { - dest[resultLen] = c; - capacityRemaining--; - } - resultLen++; + appendToBuf(c, &destIdx, dest, capacity); + replIdx++; continue; } @@ -787,20 +772,16 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, if (numDigits == 0) { // The $ didn't introduce a group number at all. // Treat it as just part of the substitution text. - if (capacityRemaining > 0) { - dest[resultLen] = DOLLARSIGN; - capacityRemaining--; - } - resultLen++; + appendToBuf(DOLLARSIGN, &destIdx, dest, capacity); continue; } // Finally, append the capture group data to the destination. 
- resultLen += uregex_group(regexp, groupNum, dest+resultLen, capacityRemaining, status); - capacityRemaining = *destCapacity - resultLen; + int32_t capacityRemaining = capacity - destIdx; if (capacityRemaining < 0) { capacityRemaining = 0; } + destIdx += uregex_group(regexp, groupNum, dest+destIdx, capacityRemaining, status); if (*status == U_BUFFER_OVERFLOW_ERROR) { // Ignore buffer overflow when extracting the group. We need to // continue on to get full size of the untruncated result. We will @@ -819,9 +800,9 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, // Nul Terminate the dest buffer if possible. // Set the appropriate buffer overflow or not terminated error, if needed. // - if (resultLen < *destCapacity) { - dest[resultLen] = 0; - } else if (resultLen == *destCapacity) { + if (destIdx < capacity) { + dest[destIdx] = 0; + } else if (destIdx == *destCapacity) { *status = U_STRING_NOT_TERMINATED_WARNING; } else { *status = U_BUFFER_OVERFLOW_ERROR; @@ -830,13 +811,13 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, // // Return an updated dest buffer and capacity to the caller. 
// - if (resultLen > 0 && *destCapacity > 0) { - if (capacityRemaining == 0) { - *destBuf += *destCapacity; - *destCapacity = 0; + if (destIdx > 0 && *destCapacity > 0) { + if (destIdx < capacity) { + *destBuf += destIdx; + *destCapacity -= destIdx; } else { - *destBuf += resultLen; - *destCapacity -= resultLen; + *destBuf += capacity; + *destCapacity = 0; } } @@ -847,7 +828,7 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, *status = U_BUFFER_OVERFLOW_ERROR; } - return resultLen; + return destIdx; } // diff --git a/icu4c/source/test/cintltst/reapits.c b/icu4c/source/test/cintltst/reapits.c index eeabba085d4..c3486861c19 100644 --- a/icu4c/source/test/cintltst/reapits.c +++ b/icu4c/source/test/cintltst/reapits.c @@ -682,10 +682,10 @@ void TestRegexCAPI(void) { TEST_ASSERT_SUCCESS(status); bufPtr = buf; bufCap = sizeof(buf) / 2; - u_uastrncpy(repl, "abc\\u0041 \\U00000042 \\\\ \\abc", sizeof(repl)/2); + u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ $ \\abc", sizeof(repl)/2); uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status); TEST_ASSERT_SUCCESS(status); - /* TEST_ASSERT_STRING("abcAB \\ abc", buf, TRUE); TODO: */ + TEST_ASSERT_STRING("abcAB \\ $ abc", buf, TRUE); }