mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-2421 regexp C API
X-SVN-Rev: 14777
This commit is contained in:
parent
6ffcca54d2
commit
c685748636
3 changed files with 59 additions and 70 deletions
|
@ -40,7 +40,7 @@ typedef struct URegularExpression URegularExpression;
|
|||
* @stable ICU 2.4
|
||||
*/
|
||||
typedef enum URegexpFlag{
|
||||
/** Forces normalization of pattern and strings. @draft ICU 3.0 */
|
||||
/** Forces normalization of pattern and strings. @draft ICU 2.4 */
|
||||
UREGEX_CANON_EQ = 128,
|
||||
|
||||
/** Enable case insensitive matching. @stable ICU 2.4 */
|
||||
|
@ -87,9 +87,10 @@ typedef enum URegexpFlag{
|
|||
* error within the source regular expression string. If this
|
||||
* information is not wanted, pass NULL for this parameter.
|
||||
* @param status Receives error detected by this function.
|
||||
* @draft ICU 3.0
|
||||
*
|
||||
*/
|
||||
U_STABLE URegularExpression * U_EXPORT2
|
||||
U_DRAFT URegularExpression * U_EXPORT2
|
||||
uregex_open( const UChar *pattern,
|
||||
int32_t patternLength,
|
||||
uint32_t flags,
|
||||
|
@ -117,8 +118,9 @@ uregex_open( const UChar *pattern,
|
|||
* @return The URegularExpression object representing the compiled
|
||||
* pattern.
|
||||
*
|
||||
* @draft ICU 3.0
|
||||
*/
|
||||
U_STABLE URegularExpression * U_EXPORT2
|
||||
U_DRAFT URegularExpression * U_EXPORT2
|
||||
uregex_openC( const char *pattern,
|
||||
uint32_t flags,
|
||||
UParseError *pe,
|
||||
|
@ -132,8 +134,9 @@ uregex_openC( const char *pattern,
|
|||
* was holding.
|
||||
*
|
||||
* @param regexp The regular expression to be closed.
|
||||
* @draft ICU 3.0
|
||||
*/
|
||||
U_STABLE void U_EXPORT2
|
||||
U_DRAFT void U_EXPORT2
|
||||
uregex_close(URegularExpression *regexp);
|
||||
|
||||
/**
|
||||
|
@ -152,8 +155,9 @@ uregex_close(URegularExpression *regexp);
|
|||
* @param regexp The compiled regular expression to be cloned.
|
||||
* @param status Receives indication of any errors encountered
|
||||
* @return the cloned copy of the compiled regular expression.
|
||||
* @draft ICU 3.0
|
||||
*/
|
||||
U_STABLE URegularExpression * U_EXPORT2
|
||||
U_DRAFT URegularExpression * U_EXPORT2
|
||||
uregex_clone(const URegularExpression *regexp, UErrorCode *status);
|
||||
|
||||
/**
|
||||
|
@ -170,8 +174,9 @@ uregex_clone(const URegularExpression *regexp, UErrorCode *status);
|
|||
* owned by the regular expression object, and must not be
|
||||
* altered or deleted by the application. The returned string
|
||||
* will remain valid until the regular expression is closed.
|
||||
* @draft ICU 3.0
|
||||
*/
|
||||
U_STABLE const UChar * U_EXPORT2
|
||||
U_DRAFT const UChar * U_EXPORT2
|
||||
uregex_pattern(const URegularExpression *regexp,
|
||||
int32_t *patLength,
|
||||
UErrorCode *status);
|
||||
|
@ -183,8 +188,9 @@ uregex_pattern(const URegularExpression *regexp,
|
|||
* @param regexp The compiled regular expression.
|
||||
* @return The match mode flags
|
||||
* @see URegexpFlag
|
||||
* @draft ICU 3.0
|
||||
*/
|
||||
U_STABLE int32_t U_EXPORT2
|
||||
U_DRAFT int32_t U_EXPORT2
|
||||
uregex_flags(const URegularExpression *regexp,
|
||||
UErrorCode *status);
|
||||
|
||||
|
@ -207,8 +213,9 @@ uregex_flags(const URegularExpression *regexp,
|
|||
* @param textLength The length of the subject text, or -1 if the string
|
||||
* is NUL terminated.
|
||||
* @param status Receives errors detected by this function.
|
||||
* @draft ICU 3.0
|
||||
*/
|
||||
U_STABLE void U_EXPORT2
|
||||
U_DRAFT void U_EXPORT2
|
||||
uregex_setText(URegularExpression *regexp,
|
||||
const UChar *text,
|
||||
int32_t textLength,
|
||||
|
@ -228,6 +235,7 @@ uregex_setText(URegularExpression *regexp,
|
|||
* @param status Receives errors detected by this function.
|
||||
* @return Pointer to the subject text string currently associated with
|
||||
* this regular expression.
|
||||
* @draft ICU 3.0
|
||||
*/
|
||||
U_DRAFT const UChar * U_EXPORT2
|
||||
uregex_getText(URegularExpression *regexp,
|
||||
|
|
|
@ -620,6 +620,19 @@ unescape_charAt(int32_t offset, void *context) {
|
|||
static const UChar BACKSLASH = 0x5c;
|
||||
static const UChar DOLLARSIGN = 0x24;
|
||||
|
||||
//
|
||||
// Move a character to an output buffer, with bounds checking on the index.
|
||||
// Index advances even if capacity is exceeded, for preflight size computations.
|
||||
// This little sequence is used a LOT.
|
||||
//
|
||||
static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
|
||||
if (*idx < bufCapacity) {
|
||||
buf[*idx] = c;
|
||||
}
|
||||
(*idx)++;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// appendReplacement, the actual implementation.
|
||||
//
|
||||
|
@ -659,9 +672,10 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
|
|||
return 0;
|
||||
}
|
||||
|
||||
int32_t resultLen = 0;
|
||||
int32_t capacityRemaining = *destCapacity;
|
||||
UChar *dest = *destBuf;
|
||||
UChar *dest = *destBuf;
|
||||
int32_t capacity = *destCapacity;
|
||||
int32_t destIdx = 0;
|
||||
int32_t i;
|
||||
|
||||
// If it wasn't supplied by the caller, get the length of the replacement text.
|
||||
// TODO: slightly smarter logic in the copy loop could watch for the NUL on
|
||||
|
@ -671,19 +685,10 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
|
|||
}
|
||||
|
||||
// Copy input string from the end of previous match to start of current match
|
||||
int32_t startIdx = m->fLastMatchEnd;
|
||||
int32_t len = m->fMatchStart - startIdx;
|
||||
if (len > 0) {
|
||||
if (len < capacityRemaining) {
|
||||
// TODO: replace memcpy with inline loop
|
||||
u_memcpy(&dest[resultLen], ®exp->fText[startIdx], len);
|
||||
capacityRemaining -= len;
|
||||
} else if (capacityRemaining > 0) {
|
||||
u_memcpy(&dest[resultLen], ®exp->fText[startIdx], capacityRemaining);
|
||||
capacityRemaining = 0;
|
||||
}
|
||||
resultLen += len;
|
||||
for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) {
|
||||
appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// scan the replacement text, looking for substitutions ($n) and \escapes.
|
||||
|
@ -694,11 +699,7 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
|
|||
if (c != DOLLARSIGN && c != BACKSLASH) {
|
||||
// Common case, no substitution, no escaping,
|
||||
// just copy the char to the dest buf.
|
||||
if (capacityRemaining > 0) {
|
||||
dest[resultLen] = c;
|
||||
capacityRemaining--;
|
||||
}
|
||||
resultLen++;
|
||||
appendToBuf(c, &destIdx, dest, capacity);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -718,41 +719,25 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
|
|||
UChar32 escapedChar =
|
||||
u_unescapeAt(unescape_charAt,
|
||||
&replIdx, // Index is updated by unescapeAt
|
||||
replacementLength-replIdx, // Remaining length of replacement text
|
||||
replacementLength, // Length of replacement text
|
||||
replacementText);
|
||||
|
||||
if (escapedChar != (UChar32)0xFFFFFFFF) {
|
||||
if (escapedChar <= 0xffff) {
|
||||
if (capacityRemaining > 0) {
|
||||
dest[resultLen] = (UChar)escapedChar;
|
||||
capacityRemaining--;
|
||||
}
|
||||
resultLen++;
|
||||
appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
|
||||
} else {
|
||||
if (capacityRemaining > 0) {
|
||||
dest[resultLen] = U16_LEAD(escapedChar);
|
||||
capacityRemaining--;
|
||||
}
|
||||
resultLen++;
|
||||
if (capacityRemaining > 0) {
|
||||
dest[resultLen] = U16_TRAIL(escapedChar);
|
||||
capacityRemaining--;
|
||||
}
|
||||
resultLen++;
|
||||
appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
|
||||
appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
|
||||
}
|
||||
|
||||
continue;
|
||||
continue;
|
||||
}
|
||||
// Note: if the \u escape was invalid, just fall through and
|
||||
// treat it as a plain \<anything> escape.
|
||||
}
|
||||
|
||||
// Plain backslash escape. Just put out the escaped character.
|
||||
if (capacityRemaining > 0) {
|
||||
dest[resultLen] = c;
|
||||
capacityRemaining--;
|
||||
}
|
||||
resultLen++;
|
||||
appendToBuf(c, &destIdx, dest, capacity);
|
||||
|
||||
replIdx++;
|
||||
continue;
|
||||
}
|
||||
|
@ -787,20 +772,16 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
|
|||
if (numDigits == 0) {
|
||||
// The $ didn't introduce a group number at all.
|
||||
// Treat it as just part of the substitution text.
|
||||
if (capacityRemaining > 0) {
|
||||
dest[resultLen] = DOLLARSIGN;
|
||||
capacityRemaining--;
|
||||
}
|
||||
resultLen++;
|
||||
appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Finally, append the capture group data to the destination.
|
||||
resultLen += uregex_group(regexp, groupNum, dest+resultLen, capacityRemaining, status);
|
||||
capacityRemaining = *destCapacity - resultLen;
|
||||
int32_t capacityRemaining = capacity - destIdx;
|
||||
if (capacityRemaining < 0) {
|
||||
capacityRemaining = 0;
|
||||
}
|
||||
destIdx += uregex_group(regexp, groupNum, dest+destIdx, capacityRemaining, status);
|
||||
if (*status == U_BUFFER_OVERFLOW_ERROR) {
|
||||
// Ignore buffer overflow when extracting the group. We need to
|
||||
// continue on to get full size of the untruncated result. We will
|
||||
|
@ -819,9 +800,9 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
|
|||
// Nul Terminate the dest buffer if possible.
|
||||
// Set the appropriate buffer overflow or not terminated error, if needed.
|
||||
//
|
||||
if (resultLen < *destCapacity) {
|
||||
dest[resultLen] = 0;
|
||||
} else if (resultLen == *destCapacity) {
|
||||
if (destIdx < capacity) {
|
||||
dest[destIdx] = 0;
|
||||
} else if (destIdx == *destCapacity) {
|
||||
*status = U_STRING_NOT_TERMINATED_WARNING;
|
||||
} else {
|
||||
*status = U_BUFFER_OVERFLOW_ERROR;
|
||||
|
@ -830,13 +811,13 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
|
|||
//
|
||||
// Return an updated dest buffer and capacity to the caller.
|
||||
//
|
||||
if (resultLen > 0 && *destCapacity > 0) {
|
||||
if (capacityRemaining == 0) {
|
||||
*destBuf += *destCapacity;
|
||||
*destCapacity = 0;
|
||||
if (destIdx > 0 && *destCapacity > 0) {
|
||||
if (destIdx < capacity) {
|
||||
*destBuf += destIdx;
|
||||
*destCapacity -= destIdx;
|
||||
} else {
|
||||
*destBuf += resultLen;
|
||||
*destCapacity -= resultLen;
|
||||
*destBuf += capacity;
|
||||
*destCapacity = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -847,7 +828,7 @@ int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
|
|||
*status = U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
|
||||
return resultLen;
|
||||
return destIdx;
|
||||
}
|
||||
|
||||
//
|
||||
|
|
|
@ -682,10 +682,10 @@ void TestRegexCAPI(void) {
|
|||
TEST_ASSERT_SUCCESS(status);
|
||||
bufPtr = buf;
|
||||
bufCap = sizeof(buf) / 2;
|
||||
u_uastrncpy(repl, "abc\\u0041 \\U00000042 \\\\ \\abc", sizeof(repl)/2);
|
||||
u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ $ \\abc", sizeof(repl)/2);
|
||||
uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
/* TEST_ASSERT_STRING("abcAB \\ abc", buf, TRUE); TODO: */
|
||||
TEST_ASSERT_STRING("abcAB \\ $ abc", buf, TRUE);
|
||||
|
||||
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue